AustinCullar · AustinCullar · Oct 12, 2024 · Oct 11, 2024 · Oct 11, 2024 · Oct 11, 2024
diff --git a/requirements.txt b/requirements.txt
@@ -13,7 +13,9 @@ httplib2==0.22.0
 idna==3.10
 iniconfig==2.0.0
 joblib==1.4.2
+markdown-it-py==3.0.0
 mccabe==0.7.0
+mdurl==0.1.2
 nltk==3.9.1
 numpy==2.1.1
 packaging==24.1
@@ -25,13 +27,15 @@ pyasn1==0.6.1
 pyasn1_modules==0.4.1
 pycodestyle==2.12.1
 pyflakes==3.2.0
+Pygments==2.18.0
 pyparsing==3.1.4
 pytest==8.3.3
 python-dateutil==2.9.0.post0
 python-dotenv==1.0.1
 pytz==2024.2
 regex==2024.9.11
 requests==2.32.3
+rich==13.9.2
 rsa==4.9
 six==1.16.0
 tqdm==4.66.5

diff --git a/setup.py b/setup.py
@@ -9,7 +9,8 @@
     # Top level, single-file modules
     py_modules=[
         "src/log",
-        "src/astro_db"],
+        "src/astro_db",
+        "src/progress"],
 
     # Packages required to run the app
     install_requires=[

diff --git a/src/astro.py b/src/astro.py
@@ -20,6 +20,9 @@ def extract_video_id_from_url(url: str) -> str:
     """
 
     video_id = url.split('v=')[1]
+    if not YouTubeDataAPI.valid_video_id(video_id):
+        raise ValueError('Invalid video URL provided')
+
     return video_id
 
 

diff --git a/src/astro_db.py b/src/astro_db.py
@@ -178,7 +178,6 @@ def insert_comment_dataframe(self, video_data, dataframe: pd.DataFrame):
             self.logger.debug(f'Comment table for video id {video_data.video_id} did not exist - creating it now')
             comment_table = self.create_comment_table_for_video(video_data)
 
-        self.logger.debug(f'Appending comment dataframe to database:\n{dataframe}')
         dataframe.to_sql(comment_table, self.conn, index=False, if_exists='append')
 
         self.conn.commit()

diff --git a/src/data_collection/data_structures.py b/src/data_collection/data_structures.py
@@ -5,6 +5,7 @@
 
 class VideoData:
     video_id: str
+    video_title: str
     channel_id: str
     channel_title: str
     view_count: int
@@ -14,13 +15,15 @@ class VideoData:
     def __init__(
             self,
             video_id='',
+            video_title='',
             channel_id='',
             channel_title='',
             view_count=0,
             like_count=0,
             comment_count=0):
 
         self.video_id = video_id
+        self.video_title = video_title
         self.channel_id = channel_id
         self.channel_title = channel_title
         self.view_count = view_count

diff --git a/src/data_collection/sentiment.py b/src/data_collection/sentiment.py
@@ -1,15 +1,13 @@
 """
 Functionality for determining the sentiment of a given comment/string. This
-approach utilizes the Natural Language Toolkit in combination with
-SentiWordNet.
-
-This logic was informed by the following article written by "AI & Tech by Nidhika, PhD":
-https://medium.com/@nidhikayadav/sentiment-analysis-with-python-sentiwordnet-fd07ffc557
+approach utilizes the Natural Language Toolkit in combination with SentiWordNet.
 """
 import nltk
 from nltk.corpus import wordnet as wn
 from nltk.corpus import sentiwordnet as swn
 
+from src.progress import AstroProgress
+
 
 class SentimentAnalysis:
     logger = None
@@ -29,15 +27,22 @@ def nltk_init(self):
             nltk.download(pkg, quiet=True, raise_on_error=True)
 
     def add_sentiment_to_dataframe(self, df):
-        if not df.empty:
-            df['PSentiment'] = ''
-            df['NSentiment'] = ''
+        if df is None or df.empty:
+            raise ValueError('received null dataframe')
 
+        # add new columns to dataframe
+        df['PSentiment'] = ''
+        df['NSentiment'] = ''
+
+        comment_count = len(df.index)
+        with AstroProgress('Calculating comment sentiment', comment_count) as progress:
             for index, row in df.iterrows():
                 sentiment = self.get_sentiment(row['comment'])
                 df.loc[index, 'PSentiment'] = sentiment[0]
                 df.loc[index, 'NSentiment'] = sentiment[1]
 
+                progress.advance(1)
+
     def get_sentiment(self, comment: str) -> ():
         token_comment = nltk.word_tokenize(comment)
         pos_tag_comment = nltk.pos_tag(token_comment)

diff --git a/src/data_collection/yt_data_api.py b/src/data_collection/yt_data_api.py
@@ -7,6 +7,7 @@
 import string
 
 from src.data_collection.data_structures import VideoData
+from src.progress import AstroProgress
 from googleapiclient.discovery import build
 
 
@@ -45,6 +46,8 @@ def parse_comment_api_response(self, response, comment_dataframe) -> pd.DataFram
         # if the dataframe is non-null and not empty, we're appending data to the dataframe
         append_dataframe = comment_dataframe is not None and not comment_dataframe.empty
 
+        comment_count = 0
+
         if append_dataframe:
             df_index = len(comment_dataframe.index)  # last index in dataframe
             df = comment_dataframe
@@ -62,6 +65,7 @@ def parse_comment_api_response(self, response, comment_dataframe) -> pd.DataFram
 
             df.loc[df_index] = [comment, user, date]
             df_index += 1
+            comment_count += 1
 
             if has_replies:
                 for reply in item['replies']['comments']:
@@ -73,8 +77,9 @@ def parse_comment_api_response(self, response, comment_dataframe) -> pd.DataFram
 
                     df.loc[df_index] = [comment, user, date]
                     df_index += 1
+                    comment_count += 1
 
-        return df
+        return df, comment_count
 
     def get_comments(self, video_data) -> pd.DataFrame:
         """
@@ -91,33 +96,39 @@ def get_comments(self, video_data) -> pd.DataFrame:
         comment_count = video_data.comment_count
         unfetched_comments = True
 
-        while unfetched_comments:
-            # The API limits comment requests to 100 records
-            max_comments = min(100, comment_count)
+        with AstroProgress('Downloading comments', comment_count) as progress:
+            while unfetched_comments:
+                # The API limits comment requests to 100 records
+                max_comments = min(100, comment_count)
+
+                self.logger.debug('collecting {} comments'.format(max_comments))
+
+                request = self.youtube.commentThreads().list(
+                    part='snippet,replies',
+                    videoId=video_data.video_id,
+                    pageToken=page_token,
+                    maxResults=max_comments,
+                    textFormat='plainText')
 
-            self.logger.debug('collecting {} comments'.format(max_comments))
+                comment_count -= max_comments
+                unfetched_comments = True if comment_count > 0 else False
 
-            request = self.youtube.commentThreads().list(
-                part='snippet,replies',
-                videoId=video_data.video_id,
-                pageToken=page_token,
-                maxResults=max_comments,
-                textFormat='plainText')
+                try:
+                    response = request.execute()
+                    comment_dataframe, comments_added = self.parse_comment_api_response(response, comment_dataframe)
+                    if 'nextPageToken' in response:  # there are more comments to fetch
+                        page_token = response['nextPageToken']
+                    else:
+                        self.logger.debug("comment collection complete")
+                        unfetched_comments = False
 
-            comment_count -= max_comments
-            unfetched_comments = True if comment_count > 0 else False
+                    progress.advance(comments_added)
 
-            try:
-                response = request.execute()
-                comment_dataframe = self.parse_comment_api_response(response, comment_dataframe)
-                if 'nextPageToken' in response:  # there are more comments to fetch
-                    page_token = response['nextPageToken']
-                else:
-                    unfetched_comments = False
+                except Exception as e:
+                    self.logger.error(str(e))
+                    self.logger.error(traceback.format_exc())
 
-            except Exception as e:
-                self.logger.error(str(e))
-                self.logger.error(traceback.format_exc())
+            progress.complete()
 
         return comment_dataframe
 
@@ -138,6 +149,7 @@ def get_video_metadata(self, video_id: str) -> VideoData:
             video_stats = response['items'][0]['statistics']
 
             return_data.video_id = video_id
+            return_data.video_title = video_data['title']
             return_data.channel_id = video_data['channelId']
             return_data.channel_title = video_data['channelTitle']
             return_data.like_count = int(video_stats['likeCount'])

diff --git a/src/log.py b/src/log.py
@@ -5,8 +5,12 @@
 
 
 class Logger:
+    log_level_str: str
+    log_level: int
+    logger: logging.Logger
 
     def __init__(self, log_level_str: str):
+        self.log_level_str = log_level_str
         self.log_level = self.get_log_level(log_level_str)
         self.logger = self.initialize_logging()
 
@@ -32,3 +36,6 @@ def initialize_logging(self) -> logging.Logger:
 
     def get_logger(self) -> logging.Logger:
         return self.logger
+
+    def level(self, log_level_str: str) -> bool:
+        return self.get_log_level(log_level_str) == self.log_level  # True only when the configured level equals the given one
diff --git a/src/progress.py b/src/progress.py
@@ -0,0 +1,59 @@
+"""
+Progress bar utilities.
+"""
+from rich.progress import (
+    BarColumn,
+    MofNCompleteColumn,
+    Progress,
+    TaskID,
+    TextColumn,
+    TimeElapsedColumn,
+    TimeRemainingColumn,
+)
+
+
+class AstroProgress(Progress):
+    """
+    Single-task progress bar built on rich's Progress.
+
+    The task is registered by the constructor, so callers only report
+    progress through advance() and complete().
+    """
+    task_str: str
+    total_steps: int
+    task: TaskID
+
+    def __init__(self, task_str: str, steps: int):
+        """
+        Build the column layout and register the task.
+
+        task_str is the description shown beside the bar; steps is the
+        total the bar counts up to.
+        """
+        self.task_str = task_str
+        self.total_steps = steps
+
+        super().__init__(
+            TextColumn("{task.description} [progress.percentage]{task.percentage:>3.0f}%"),
+            BarColumn(),
+            MofNCompleteColumn(),
+            TextColumn("•"),
+            TimeElapsedColumn(),
+            TextColumn("•"),
+            TimeRemainingColumn())
+
+        self.task = super().add_task('[green]' + task_str, total=self.total_steps)
+
+    def advance(self, steps: int = 1) -> None:
+        """
+        Move the bar forward by `steps` (one step when omitted).
+
+        NOTE(review): this replaces rich's Progress.advance(task_id, advance=1)
+        with a different signature; rich helpers that call the base form (e.g.
+        Progress.track) would break on this class - confirm none are used.
+        """
+        super().update(self.task, advance=steps)
+
+    def complete(self) -> None:
+        """Set the task's completed count to total_steps."""
+        super().update(self.task, completed=self.total_steps)