From ddac81ae44bc6ae0f07e0951bcd4059226ad86af Mon Sep 17 00:00:00 2001 From: Austin Cullar Date: Fri, 11 Oct 2024 13:45:09 -0600 Subject: [PATCH 1/8] raise ValueError if video URL is invalid --- src/astro.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/astro.py b/src/astro.py index 06f412d..1c0b2a5 100644 --- a/src/astro.py +++ b/src/astro.py @@ -20,6 +20,9 @@ def extract_video_id_from_url(url: str) -> str: """ video_id = url.split('v=')[1] + if not YouTubeDataAPI.valid_video_id(video_id): + raise ValueError('Invalid video URL provided') + return video_id From 1b75c063fb06fc26b32662867324aadbcdc03e13 Mon Sep 17 00:00:00 2001 From: Austin Cullar Date: Fri, 11 Oct 2024 13:45:36 -0600 Subject: [PATCH 2/8] remove extra dataframe print --- src/astro_db.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/astro_db.py b/src/astro_db.py index b1d7497..dce36fe 100644 --- a/src/astro_db.py +++ b/src/astro_db.py @@ -178,7 +178,6 @@ def insert_comment_dataframe(self, video_data, dataframe: pd.DataFrame): self.logger.debug(f'Comment table for video id {video_data.video_id} did not exist - creating it now') comment_table = self.create_comment_table_for_video(video_data) - self.logger.debug(f'Appending comment dataframe to database:\n{dataframe}') dataframe.to_sql(comment_table, self.conn, index=False, if_exists='append') self.conn.commit() From 8694dee44a9ba69b185e06ae4713a55ecfdd0bfe Mon Sep 17 00:00:00 2001 From: Austin Cullar Date: Fri, 11 Oct 2024 13:46:26 -0600 Subject: [PATCH 3/8] first pass at adding progress bar, added comment counting --- src/data_collection/yt_data_api.py | 75 +++++++++++++++++++----------- 1 file changed, 47 insertions(+), 28 deletions(-) diff --git a/src/data_collection/yt_data_api.py b/src/data_collection/yt_data_api.py index b24bd5e..e14dfe8 100644 --- a/src/data_collection/yt_data_api.py +++ b/src/data_collection/yt_data_api.py @@ -8,6 +8,7 @@ from src.data_collection.data_structures import VideoData from googleapiclient.discovery 
import build +from rich.progress import Progress class YouTubeDataAPI: @@ -45,6 +46,8 @@ def parse_comment_api_response(self, response, comment_dataframe) -> pd.DataFram # if the dataframe is non-null and not empty, we're appending data to the dataframe append_dataframe = comment_dataframe is not None and not comment_dataframe.empty + comment_count = 0 + if append_dataframe: df_index = len(comment_dataframe.index) # last index in dataframe df = comment_dataframe @@ -62,6 +65,7 @@ def parse_comment_api_response(self, response, comment_dataframe) -> pd.DataFram df.loc[df_index] = [comment, user, date] df_index += 1 + comment_count += 1 if has_replies: for reply in item['replies']['comments']: @@ -73,8 +77,9 @@ def parse_comment_api_response(self, response, comment_dataframe) -> pd.DataFram df.loc[df_index] = [comment, user, date] df_index += 1 + comment_count += 1 - return df + return df, comment_count def get_comments(self, video_data) -> pd.DataFrame: """ @@ -91,33 +96,47 @@ def get_comments(self, video_data) -> pd.DataFrame: comment_count = video_data.comment_count unfetched_comments = True - while unfetched_comments: - # The API limits comment requests to 100 records - max_comments = min(100, comment_count) - - self.logger.debug('collecting {} comments'.format(max_comments)) - - request = self.youtube.commentThreads().list( - part='snippet,replies', - videoId=video_data.video_id, - pageToken=page_token, - maxResults=max_comments, - textFormat='plainText') - - comment_count -= max_comments - unfetched_comments = True if comment_count > 0 else False - - try: - response = request.execute() - comment_dataframe = self.parse_comment_api_response(response, comment_dataframe) - if 'nextPageToken' in response: # there are more comments to fetch - page_token = response['nextPageToken'] - else: - unfetched_comments = False - - except Exception as e: - self.logger.error(str(e)) - self.logger.error(traceback.format_exc()) + with Progress() as progress: + progress_steps = 
comment_count + download_task = progress.add_task("[green]Downloading comments", total=progress_steps) + + while not progress.finished: + while unfetched_comments: + # The API limits comment requests to 100 records + max_comments = min(100, comment_count) + + self.logger.debug('collecting {} comments'.format(max_comments)) + + request = self.youtube.commentThreads().list( + part='snippet,replies', + videoId=video_data.video_id, + pageToken=page_token, + maxResults=max_comments, + textFormat='plainText') + + comment_count -= max_comments + unfetched_comments = True if comment_count > 0 else False + + try: + response = request.execute() + #import json + #print('response: {}'.format(json.dumps(response, indent=4))) + #collected_comments = response['pageInfo']['resultsPerPage'] + comment_dataframe, num_comments = self.parse_comment_api_response(response, comment_dataframe) + if 'nextPageToken' in response: # there are more comments to fetch + page_token = response['nextPageToken'] + else: + self.logger.debug("comment collection complete") + unfetched_comments = False + + progress.update(download_task, advance=num_comments) + + except Exception as e: + self.logger.error(str(e)) + self.logger.error(traceback.format_exc()) + + # unexpectedly finished collecting comments, set progress bar to complete + progress.update(download_task, completed=progress_steps) return comment_dataframe From 637fcb0868806614a7bc8595324b03bc9c3769fd Mon Sep 17 00:00:00 2001 From: Austin Cullar Date: Fri, 11 Oct 2024 16:02:30 -0600 Subject: [PATCH 4/8] - Progress bar now working - Added new field to the VideoData class to capture the video title for display purposes in a later change - Added progress.py, which implements AstroProgress that acts as a wrapper for rich.Progress - Added log method to allow objects to check log level - might use this to conditionally enable certain features based on log level --- src/data_collection/data_structures.py | 3 ++ src/data_collection/sentiment.py | 3 ++ 
src/data_collection/yt_data_api.py | 63 ++++++++++++-------------- src/log.py | 7 +++ src/progress.py | 37 +++++++++++++++ 5 files changed, 78 insertions(+), 35 deletions(-) create mode 100644 src/progress.py diff --git a/src/data_collection/data_structures.py b/src/data_collection/data_structures.py index bff9e72..d6e8317 100644 --- a/src/data_collection/data_structures.py +++ b/src/data_collection/data_structures.py @@ -5,6 +5,7 @@ class VideoData: video_id: str + video_title: str channel_id: str channel_title: str view_count: int @@ -14,6 +15,7 @@ class VideoData: def __init__( self, video_id='', + video_title='', channel_id='', channel_title='', view_count=0, @@ -21,6 +23,7 @@ def __init__( comment_count=0): self.video_id = video_id + self.video_title = video_title self.channel_id = channel_id self.channel_title = channel_title self.view_count = view_count diff --git a/src/data_collection/sentiment.py b/src/data_collection/sentiment.py index 36b0dbf..4e95f4f 100644 --- a/src/data_collection/sentiment.py +++ b/src/data_collection/sentiment.py @@ -29,6 +29,9 @@ def nltk_init(self): nltk.download(pkg, quiet=True, raise_on_error=True) def add_sentiment_to_dataframe(self, df): + if df is None: + raise ValueError('received null dataframe') + if not df.empty: df['PSentiment'] = '' df['NSentiment'] = '' diff --git a/src/data_collection/yt_data_api.py b/src/data_collection/yt_data_api.py index e14dfe8..f4af067 100644 --- a/src/data_collection/yt_data_api.py +++ b/src/data_collection/yt_data_api.py @@ -8,7 +8,7 @@ from src.data_collection.data_structures import VideoData from googleapiclient.discovery import build -from rich.progress import Progress +from progress import AstroProgress class YouTubeDataAPI: @@ -96,47 +96,39 @@ def get_comments(self, video_data) -> pd.DataFrame: comment_count = video_data.comment_count unfetched_comments = True - with Progress() as progress: - progress_steps = comment_count - download_task = progress.add_task("[green]Downloading comments", 
total=progress_steps) + with AstroProgress('Downloading comments', comment_count) as progress: + while unfetched_comments: + # The API limits comment requests to 100 records + max_comments = min(100, comment_count) - while not progress.finished: - while unfetched_comments: - # The API limits comment requests to 100 records - max_comments = min(100, comment_count) + self.logger.debug('collecting {} comments'.format(max_comments)) - self.logger.debug('collecting {} comments'.format(max_comments)) + request = self.youtube.commentThreads().list( + part='snippet,replies', + videoId=video_data.video_id, + pageToken=page_token, + maxResults=max_comments, + textFormat='plainText') - request = self.youtube.commentThreads().list( - part='snippet,replies', - videoId=video_data.video_id, - pageToken=page_token, - maxResults=max_comments, - textFormat='plainText') + comment_count -= max_comments + unfetched_comments = True if comment_count > 0 else False - comment_count -= max_comments - unfetched_comments = True if comment_count > 0 else False + try: + response = request.execute() + comment_dataframe, num_comments = self.parse_comment_api_response(response, comment_dataframe) + if 'nextPageToken' in response: # there are more comments to fetch + page_token = response['nextPageToken'] + else: + self.logger.debug("comment collection complete") + unfetched_comments = False - try: - response = request.execute() - #import json - #print('response: {}'.format(json.dumps(response, indent=4))) - #collected_comments = response['pageInfo']['resultsPerPage'] - comment_dataframe, num_comments = self.parse_comment_api_response(response, comment_dataframe) - if 'nextPageToken' in response: # there are more comments to fetch - page_token = response['nextPageToken'] - else: - self.logger.debug("comment collection complete") - unfetched_comments = False + progress.advance(num_comments) - progress.update(download_task, advance=num_comments) + except Exception as e: + self.logger.error(str(e)) + 
self.logger.error(traceback.format_exc()) -            except Exception as e: -                self.logger.error(str(e)) -                self.logger.error(traceback.format_exc()) - -            # unexpectedly finished collecting comments, set progress bar to complete -            progress.update(download_task, completed=progress_steps) +            progress.complete() return comment_dataframe @@ -157,6 +149,7 @@ def get_video_metadata(self, video_id: str) -> VideoData: video_stats = response['items'][0]['statistics'] return_data.video_id = video_id +        return_data.video_title = video_data['title'] return_data.channel_id = video_data['channelId'] return_data.channel_title = video_data['channelTitle'] return_data.like_count = int(video_stats['likeCount']) diff --git a/src/log.py b/src/log.py index c21e444..86cbdf6 100644 --- a/src/log.py +++ b/src/log.py @@ -5,8 +5,12 @@ class Logger: +    log_level_str: str +    log_level: int +    logger: logging.Logger def __init__(self, log_level_str: str): +        self.log_level_str = log_level_str self.log_level = self.get_log_level(log_level_str) self.logger = self.initialize_logging() @@ -32,3 +36,6 @@ def initialize_logging(self) -> logging.Logger: def get_logger(self) -> logging.Logger: return self.logger + +    def level(log_level_str: str) -> bool: +        return self.get_log_level(log_level_str) == self.log_level diff --git a/src/progress.py b/src/progress.py new file mode 100644 index 0000000..54441e9 --- /dev/null +++ b/src/progress.py @@ -0,0 +1,37 @@ +""" +Progress bar utilities.
+""" +from rich.progress import ( + BarColumn, + MofNCompleteColumn, + Progress, + TextColumn, + TimeElapsedColumn, + TimeRemainingColumn, +) + + +class AstroProgress(Progress): + total_steps: int + task: int + + def __init__(self, task_str: str, steps: int): + self.task_str = task_str + self.total_steps = steps + + super().__init__( + TextColumn("{task.description} [progress.percentage]{task.percentage:>3.0f}%"), + BarColumn(), + MofNCompleteColumn(), + TextColumn("•"), + TimeElapsedColumn(), + TextColumn("•"), + TimeRemainingColumn()) + + self.task = super().add_task('[green]' + task_str, total=self.total_steps) + + def advance(self, steps: int): + super().update(self.task, advance=steps) + + def complete(self): + return super().update(self.task, completed=self.total_steps) From 8c732a5375b3de9c189259a7bf2f1db037459eb9 Mon Sep 17 00:00:00 2001 From: Austin Cullar Date: Fri, 11 Oct 2024 16:09:56 -0600 Subject: [PATCH 5/8] fix error in log.py --- src/log.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/log.py b/src/log.py index 86cbdf6..8662c93 100644 --- a/src/log.py +++ b/src/log.py @@ -37,5 +37,5 @@ def initialize_logging(self) -> logging.Logger: def get_logger(self) -> logging.Logger: return self.logger - def level(log_level_str: str) -> bool: + def level(self, log_level_str: str) -> bool: return self.get_log_level(log_level_str) == self.log_level From f67f31a0aba801d0a5f675dce7b0362a3d7e7e4d Mon Sep 17 00:00:00 2001 From: Austin Cullar Date: Fri, 11 Oct 2024 16:13:18 -0600 Subject: [PATCH 6/8] update requirements.txt and setup.py to account for new module and packages --- requirements.txt | 4 ++++ setup.py | 3 ++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index e23def5..3abf55c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,7 +13,9 @@ httplib2==0.22.0 idna==3.10 iniconfig==2.0.0 joblib==1.4.2 +markdown-it-py==3.0.0 mccabe==0.7.0 +mdurl==0.1.2 nltk==3.9.1 numpy==2.1.1 
packaging==24.1 @@ -25,6 +27,7 @@ pyasn1==0.6.1 pyasn1_modules==0.4.1 pycodestyle==2.12.1 pyflakes==3.2.0 +Pygments==2.18.0 pyparsing==3.1.4 pytest==8.3.3 python-dateutil==2.9.0.post0 @@ -32,6 +35,7 @@ python-dotenv==1.0.1 pytz==2024.2 regex==2024.9.11 requests==2.32.3 +rich==13.9.2 rsa==4.9 six==1.16.0 tqdm==4.66.5 diff --git a/setup.py b/setup.py index b11a2ea..0158830 100644 --- a/setup.py +++ b/setup.py @@ -9,7 +9,8 @@ # Top level, single-file modules py_modules=[ "src/log", - "src/astro_db"], + "src/astro_db", + "src/progress"], # Packages required to run the app install_requires=[ From 8d33469ecce6d2bb678b620afd79baff53b0205a Mon Sep 17 00:00:00 2001 From: Austin Cullar Date: Fri, 11 Oct 2024 16:24:30 -0600 Subject: [PATCH 7/8] fix import path in yt_data_api.py --- src/data_collection/yt_data_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/data_collection/yt_data_api.py b/src/data_collection/yt_data_api.py index f4af067..6fea9d5 100644 --- a/src/data_collection/yt_data_api.py +++ b/src/data_collection/yt_data_api.py @@ -7,8 +7,8 @@ import string from src.data_collection.data_structures import VideoData +from src.progress import AstroProgress from googleapiclient.discovery import build -from progress import AstroProgress class YouTubeDataAPI: From a105bc7f0b27f054e377a0baff96672a57c0664f Mon Sep 17 00:00:00 2001 From: Austin Cullar Date: Sat, 12 Oct 2024 13:39:19 -0600 Subject: [PATCH 8/8] add progress bar to sentiment analysis, change naming of variable in yt_data_api.py --- src/data_collection/sentiment.py | 20 +++++++++++--------- src/data_collection/yt_data_api.py | 4 ++-- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/src/data_collection/sentiment.py b/src/data_collection/sentiment.py index 4e95f4f..1412b25 100644 --- a/src/data_collection/sentiment.py +++ b/src/data_collection/sentiment.py @@ -1,15 +1,13 @@ """ Functionality for determining the sentiment of a given comment/string. 
This -approach utilizes the Natural Language Toolkit in combination with -SentiWordNet. - -This logic was informed by the following article written by "AI & Tech by Nidhika, PhD": -https://medium.com/@nidhikayadav/sentiment-analysis-with-python-sentiwordnet-fd07ffc557 +approach utilizes the Natural Language Toolkit in combination with SentiWordNet. """ import nltk from nltk.corpus import wordnet as wn from nltk.corpus import sentiwordnet as swn +from src.progress import AstroProgress + class SentimentAnalysis: logger = None @@ -29,18 +27,22 @@ def nltk_init(self): nltk.download(pkg, quiet=True, raise_on_error=True) def add_sentiment_to_dataframe(self, df): - if df is None: + if df is None or df.empty: raise ValueError('received null dataframe') - if not df.empty: - df['PSentiment'] = '' - df['NSentiment'] = '' + # add new columns to dataframe + df['PSentiment'] = '' + df['NSentiment'] = '' + comment_count = len(df.index) + with AstroProgress('Calculating comment sentiment', comment_count) as progress: for index, row in df.iterrows(): sentiment = self.get_sentiment(row['comment']) df.loc[index, 'PSentiment'] = sentiment[0] df.loc[index, 'NSentiment'] = sentiment[1] + progress.advance(1) + def get_sentiment(self, comment: str) -> (): token_comment = nltk.word_tokenize(comment) pos_tag_comment = nltk.pos_tag(token_comment) diff --git a/src/data_collection/yt_data_api.py b/src/data_collection/yt_data_api.py index 6fea9d5..dac1966 100644 --- a/src/data_collection/yt_data_api.py +++ b/src/data_collection/yt_data_api.py @@ -115,14 +115,14 @@ def get_comments(self, video_data) -> pd.DataFrame: try: response = request.execute() - comment_dataframe, num_comments = self.parse_comment_api_response(response, comment_dataframe) + comment_dataframe, comments_added = self.parse_comment_api_response(response, comment_dataframe) if 'nextPageToken' in response: # there are more comments to fetch page_token = response['nextPageToken'] else: self.logger.debug("comment collection 
complete") unfetched_comments = False - progress.advance(num_comments) + progress.advance(comments_added) except Exception as e: self.logger.error(str(e))