From ddac81ae44bc6ae0f07e0951bcd4059226ad86af Mon Sep 17 00:00:00 2001 From: Austin Cullar Date: Fri, 11 Oct 2024 13:45:09 -0600 Subject: [PATCH 1/8] raise ValueError if video URL is invalid --- src/astro.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/astro.py b/src/astro.py index 06f412d..1c0b2a5 100644 --- a/src/astro.py +++ b/src/astro.py @@ -20,6 +20,9 @@ def extract_video_id_from_url(url: str) -> str: """ video_id = url.split('v=')[1] + if not YouTubeDataAPI.valid_video_id(video_id): + raise ValueError('Invalid video URL provided') + return video_id From 1b75c063fb06fc26b32662867324aadbcdc03e13 Mon Sep 17 00:00:00 2001 From: Austin Cullar Date: Fri, 11 Oct 2024 13:45:36 -0600 Subject: [PATCH 2/8] remove extra dataframe print --- src/astro_db.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/astro_db.py b/src/astro_db.py index b1d7497..dce36fe 100644 --- a/src/astro_db.py +++ b/src/astro_db.py @@ -178,7 +178,6 @@ def insert_comment_dataframe(self, video_data, dataframe: pd.DataFrame): self.logger.debug(f'Comment table for video id {video_data.video_id} did not exist - creating it now') comment_table = self.create_comment_table_for_video(video_data) - self.logger.debug(f'Appending comment dataframe to database:\n{dataframe}') dataframe.to_sql(comment_table, self.conn, index=False, if_exists='append') self.conn.commit() From 8694dee44a9ba69b185e06ae4713a55ecfdd0bfe Mon Sep 17 00:00:00 2001 From: Austin Cullar Date: Fri, 11 Oct 2024 13:46:26 -0600 Subject: [PATCH 3/8] first pass at adding progress bar, added comment counting --- src/data_collection/yt_data_api.py | 75 +++++++++++++++++++----------- 1 file changed, 47 insertions(+), 28 deletions(-) diff --git a/src/data_collection/yt_data_api.py b/src/data_collection/yt_data_api.py index b24bd5e..e14dfe8 100644 --- a/src/data_collection/yt_data_api.py +++ b/src/data_collection/yt_data_api.py @@ -8,6 +8,7 @@ from src.data_collection.data_structures import VideoData from googleapiclient.discovery 
import build +from rich.progress import Progress class YouTubeDataAPI: @@ -45,6 +46,8 @@ def parse_comment_api_response(self, response, comment_dataframe) -> pd.DataFram # if the dataframe is non-null and not empty, we're appending data to the dataframe append_dataframe = comment_dataframe is not None and not comment_dataframe.empty + comment_count = 0 + if append_dataframe: df_index = len(comment_dataframe.index) # last index in dataframe df = comment_dataframe @@ -62,6 +65,7 @@ def parse_comment_api_response(self, response, comment_dataframe) -> pd.DataFram df.loc[df_index] = [comment, user, date] df_index += 1 + comment_count += 1 if has_replies: for reply in item['replies']['comments']: @@ -73,8 +77,9 @@ def parse_comment_api_response(self, response, comment_dataframe) -> pd.DataFram df.loc[df_index] = [comment, user, date] df_index += 1 + comment_count += 1 - return df + return df, comment_count def get_comments(self, video_data) -> pd.DataFrame: """ @@ -91,33 +96,47 @@ def get_comments(self, video_data) -> pd.DataFrame: comment_count = video_data.comment_count unfetched_comments = True - while unfetched_comments: - # The API limits comment requests to 100 records - max_comments = min(100, comment_count) - - self.logger.debug('collecting {} comments'.format(max_comments)) - - request = self.youtube.commentThreads().list( - part='snippet,replies', - videoId=video_data.video_id, - pageToken=page_token, - maxResults=max_comments, - textFormat='plainText') - - comment_count -= max_comments - unfetched_comments = True if comment_count > 0 else False - - try: - response = request.execute() - comment_dataframe = self.parse_comment_api_response(response, comment_dataframe) - if 'nextPageToken' in response: # there are more comments to fetch - page_token = response['nextPageToken'] - else: - unfetched_comments = False - - except Exception as e: - self.logger.error(str(e)) - self.logger.error(traceback.format_exc()) + with Progress() as progress: + progress_steps = 
comment_count + download_task = progress.add_task("[green]Downloading comments", total=progress_steps) + + while not progress.finished: + while unfetched_comments: + # The API limits comment requests to 100 records + max_comments = min(100, comment_count) + + self.logger.debug('collecting {} comments'.format(max_comments)) + + request = self.youtube.commentThreads().list( + part='snippet,replies', + videoId=video_data.video_id, + pageToken=page_token, + maxResults=max_comments, + textFormat='plainText') + + comment_count -= max_comments + unfetched_comments = True if comment_count > 0 else False + + try: + response = request.execute() + #import json + #print('response: {}'.format(json.dumps(response, indent=4))) + #collected_comments = response['pageInfo']['resultsPerPage'] + comment_dataframe, num_comments = self.parse_comment_api_response(response, comment_dataframe) + if 'nextPageToken' in response: # there are more comments to fetch + page_token = response['nextPageToken'] + else: + self.logger.debug("comment collection complete") + unfetched_comments = False + + progress.update(download_task, advance=num_comments) + + except Exception as e: + self.logger.error(str(e)) + self.logger.error(traceback.format_exc()) + + # unexpectedly finished collecting comments, set progress bar to complete + progress.update(download_task, completed=progress_steps) return comment_dataframe From 637fcb0868806614a7bc8595324b03bc9c3769fd Mon Sep 17 00:00:00 2001 From: Austin Cullar Date: Fri, 11 Oct 2024 16:02:30 -0600 Subject: [PATCH 4/8] - Progress bar now working - Added new field to the VideoData class to capture the video title for display purposes in a later change - Added progress.py, which implements AstroProgress that acts as a wrapper for rich.Progress - Added log method to allow objects to check log level - might use this to conditionally enable certain features based on log level --- src/data_collection/data_structures.py | 3 ++ src/data_collection/sentiment.py | 3 ++ 
src/data_collection/yt_data_api.py | 63 ++++++++++++-------------- src/log.py | 7 +++ src/progress.py | 37 +++++++++++++++ 5 files changed, 78 insertions(+), 35 deletions(-) create mode 100644 src/progress.py diff --git a/src/data_collection/data_structures.py b/src/data_collection/data_structures.py index bff9e72..d6e8317 100644 --- a/src/data_collection/data_structures.py +++ b/src/data_collection/data_structures.py @@ -5,6 +5,7 @@ class VideoData: video_id: str + video_title: str channel_id: str channel_title: str view_count: int @@ -14,6 +15,7 @@ class VideoData: def __init__( self, video_id='', + video_title='', channel_id='', channel_title='', view_count=0, @@ -21,6 +23,7 @@ def __init__( comment_count=0): self.video_id = video_id + self.video_title = video_title self.channel_id = channel_id self.channel_title = channel_title self.view_count = view_count diff --git a/src/data_collection/sentiment.py b/src/data_collection/sentiment.py index 36b0dbf..4e95f4f 100644 --- a/src/data_collection/sentiment.py +++ b/src/data_collection/sentiment.py @@ -29,6 +29,9 @@ def nltk_init(self): nltk.download(pkg, quiet=True, raise_on_error=True) def add_sentiment_to_dataframe(self, df): + if df is None: + raise ValueError('received null dataframe') + if not df.empty: df['PSentiment'] = '' df['NSentiment'] = '' diff --git a/src/data_collection/yt_data_api.py b/src/data_collection/yt_data_api.py index e14dfe8..f4af067 100644 --- a/src/data_collection/yt_data_api.py +++ b/src/data_collection/yt_data_api.py @@ -8,7 +8,7 @@ from src.data_collection.data_structures import VideoData from googleapiclient.discovery import build -from rich.progress import Progress +from progress import AstroProgress class YouTubeDataAPI: @@ -96,47 +96,39 @@ def get_comments(self, video_data) -> pd.DataFrame: comment_count = video_data.comment_count unfetched_comments = True - with Progress() as progress: - progress_steps = comment_count - download_task = progress.add_task("[green]Downloading comments", 
total=progress_steps) + with AstroProgress('Downloading comments', comment_count) as progress: + while unfetched_comments: + # The API limits comment requests to 100 records + max_comments = min(100, comment_count) - while not progress.finished: - while unfetched_comments: - # The API limits comment requests to 100 records - max_comments = min(100, comment_count) + self.logger.debug('collecting {} comments'.format(max_comments)) - self.logger.debug('collecting {} comments'.format(max_comments)) + request = self.youtube.commentThreads().list( + part='snippet,replies', + videoId=video_data.video_id, + pageToken=page_token, + maxResults=max_comments, + textFormat='plainText') - request = self.youtube.commentThreads().list( - part='snippet,replies', - videoId=video_data.video_id, - pageToken=page_token, - maxResults=max_comments, - textFormat='plainText') + comment_count -= max_comments + unfetched_comments = True if comment_count > 0 else False - comment_count -= max_comments - unfetched_comments = True if comment_count > 0 else False + try: + response = request.execute() + comment_dataframe, num_comments = self.parse_comment_api_response(response, comment_dataframe) + if 'nextPageToken' in response: # there are more comments to fetch + page_token = response['nextPageToken'] + else: + self.logger.debug("comment collection complete") + unfetched_comments = False - try: - response = request.execute() - #import json - #print('response: {}'.format(json.dumps(response, indent=4))) - #collected_comments = response['pageInfo']['resultsPerPage'] - comment_dataframe, num_comments = self.parse_comment_api_response(response, comment_dataframe) - if 'nextPageToken' in response: # there are more comments to fetch - page_token = response['nextPageToken'] - else: - self.logger.debug("comment collection complete") - unfetched_comments = False + progress.advance(num_comments) - progress.update(download_task, advance=num_comments) + except Exception as e: + self.logger.error(str(e)) + 
self.logger.error(traceback.format_exc()) -            except Exception as e: -                self.logger.error(str(e)) -                self.logger.error(traceback.format_exc()) - -            # unexpectedly finished collecting comments, set progress bar to complete -            progress.update(download_task, completed=progress_steps) +            progress.complete() return comment_dataframe @@ -157,6 +149,7 @@ def get_video_metadata(self, video_id: str) -> VideoData: video_stats = response['items'][0]['statistics'] return_data.video_id = video_id +        return_data.video_title = video_data['title'] return_data.channel_id = video_data['channelId'] return_data.channel_title = video_data['channelTitle'] return_data.like_count = int(video_stats['likeCount']) diff --git a/src/log.py b/src/log.py index c21e444..86cbdf6 100644 --- a/src/log.py +++ b/src/log.py @@ -5,8 +5,12 @@ class Logger: +    log_level_str: str +    log_level: int +    logger: logging.Logger def __init__(self, log_level_str: str): +        self.log_level_str = log_level_str self.log_level = self.get_log_level(log_level_str) self.logger = self.initialize_logging() @@ -32,3 +36,6 @@ def initialize_logging(self) -> logging.Logger: def get_logger(self) -> logging.Logger: return self.logger + +    def level(log_level_str: str) -> bool: +        return self.get_log_level(log_level_str) == self.log_level diff --git a/src/progress.py b/src/progress.py new file mode 100644 index 0000000..54441e9 --- /dev/null +++ b/src/progress.py @@ -0,0 +1,37 @@ +""" +Progress bar utilities.
+""" +from rich.progress import ( + BarColumn, + MofNCompleteColumn, + Progress, + TextColumn, + TimeElapsedColumn, + TimeRemainingColumn, +) + + +class AstroProgress(Progress): + total_steps: int + task: int + + def __init__(self, task_str: str, steps: int): + self.task_str = task_str + self.total_steps = steps + + super().__init__( + TextColumn("{task.description} [progress.percentage]{task.percentage:>3.0f}%"), + BarColumn(), + MofNCompleteColumn(), + TextColumn("•"), + TimeElapsedColumn(), + TextColumn("•"), + TimeRemainingColumn()) + + self.task = super().add_task('[green]' + task_str, total=self.total_steps) + + def advance(self, steps: int): + super().update(self.task, advance=steps) + + def complete(self): + return super().update(self.task, completed=self.total_steps) From 8c732a5375b3de9c189259a7bf2f1db037459eb9 Mon Sep 17 00:00:00 2001 From: Austin Cullar Date: Fri, 11 Oct 2024 16:09:56 -0600 Subject: [PATCH 5/8] fix error in log.py --- src/log.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/log.py b/src/log.py index 86cbdf6..8662c93 100644 --- a/src/log.py +++ b/src/log.py @@ -37,5 +37,5 @@ def initialize_logging(self) -> logging.Logger: def get_logger(self) -> logging.Logger: return self.logger - def level(log_level_str: str) -> bool: + def level(self, log_level_str: str) -> bool: return self.get_log_level(log_level_str) == self.log_level From f67f31a0aba801d0a5f675dce7b0362a3d7e7e4d Mon Sep 17 00:00:00 2001 From: Austin Cullar Date: Fri, 11 Oct 2024 16:13:18 -0600 Subject: [PATCH 6/8] update requirements.txt and setup.py to account for new module and packages --- requirements.txt | 4 ++++ setup.py | 3 ++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index e23def5..3abf55c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,7 +13,9 @@ httplib2==0.22.0 idna==3.10 iniconfig==2.0.0 joblib==1.4.2 +markdown-it-py==3.0.0 mccabe==0.7.0 +mdurl==0.1.2 nltk==3.9.1 numpy==2.1.1 
packaging==24.1 @@ -25,6 +27,7 @@ pyasn1==0.6.1 pyasn1_modules==0.4.1 pycodestyle==2.12.1 pyflakes==3.2.0 +Pygments==2.18.0 pyparsing==3.1.4 pytest==8.3.3 python-dateutil==2.9.0.post0 @@ -32,6 +35,7 @@ python-dotenv==1.0.1 pytz==2024.2 regex==2024.9.11 requests==2.32.3 +rich==13.9.2 rsa==4.9 six==1.16.0 tqdm==4.66.5 diff --git a/setup.py b/setup.py index b11a2ea..0158830 100644 --- a/setup.py +++ b/setup.py @@ -9,7 +9,8 @@ # Top level, single-file modules py_modules=[ "src/log", - "src/astro_db"], + "src/astro_db", + "src/progress"], # Packages required to run the app install_requires=[ From 8d33469ecce6d2bb678b620afd79baff53b0205a Mon Sep 17 00:00:00 2001 From: Austin Cullar Date: Fri, 11 Oct 2024 16:24:30 -0600 Subject: [PATCH 7/8] fix import path in yt_data_api.py --- src/data_collection/yt_data_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/data_collection/yt_data_api.py b/src/data_collection/yt_data_api.py index f4af067..6fea9d5 100644 --- a/src/data_collection/yt_data_api.py +++ b/src/data_collection/yt_data_api.py @@ -7,8 +7,8 @@ import string from src.data_collection.data_structures import VideoData +from src.progress import AstroProgress from googleapiclient.discovery import build -from progress import AstroProgress class YouTubeDataAPI: From a105bc7f0b27f054e377a0baff96672a57c0664f Mon Sep 17 00:00:00 2001 From: Austin Cullar Date: Sat, 12 Oct 2024 13:39:19 -0600 Subject: [PATCH 8/8] add progress bar to sentiment analysis, change naming of variable in yt_data_api.py --- src/data_collection/sentiment.py | 20 +++++++++++--------- src/data_collection/yt_data_api.py | 4 ++-- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/src/data_collection/sentiment.py b/src/data_collection/sentiment.py index 4e95f4f..1412b25 100644 --- a/src/data_collection/sentiment.py +++ b/src/data_collection/sentiment.py @@ -1,15 +1,13 @@ """ Functionality for determining the sentiment of a given comment/string. 
This -approach utilizes the Natural Language Toolkit in combination with -SentiWordNet. - -This logic was informed by the following article written by "AI & Tech by Nidhika, PhD": -https://medium.com/@nidhikayadav/sentiment-analysis-with-python-sentiwordnet-fd07ffc557 +approach utilizes the Natural Language Toolkit in combination with SentiWordNet. """ import nltk from nltk.corpus import wordnet as wn from nltk.corpus import sentiwordnet as swn +from src.progress import AstroProgress + class SentimentAnalysis: logger = None @@ -29,18 +27,22 @@ def nltk_init(self): nltk.download(pkg, quiet=True, raise_on_error=True) def add_sentiment_to_dataframe(self, df): - if df is None: + if df is None or df.empty: raise ValueError('received null dataframe') - if not df.empty: - df['PSentiment'] = '' - df['NSentiment'] = '' + # add new columns to dataframe + df['PSentiment'] = '' + df['NSentiment'] = '' + comment_count = len(df.index) + with AstroProgress('Calculating comment sentiment', comment_count) as progress: for index, row in df.iterrows(): sentiment = self.get_sentiment(row['comment']) df.loc[index, 'PSentiment'] = sentiment[0] df.loc[index, 'NSentiment'] = sentiment[1] + progress.advance(1) + def get_sentiment(self, comment: str) -> (): token_comment = nltk.word_tokenize(comment) pos_tag_comment = nltk.pos_tag(token_comment) diff --git a/src/data_collection/yt_data_api.py b/src/data_collection/yt_data_api.py index 6fea9d5..dac1966 100644 --- a/src/data_collection/yt_data_api.py +++ b/src/data_collection/yt_data_api.py @@ -115,14 +115,14 @@ def get_comments(self, video_data) -> pd.DataFrame: try: response = request.execute() - comment_dataframe, num_comments = self.parse_comment_api_response(response, comment_dataframe) + comment_dataframe, comments_added = self.parse_comment_api_response(response, comment_dataframe) if 'nextPageToken' in response: # there are more comments to fetch page_token = response['nextPageToken'] else: self.logger.debug("comment collection 
complete") unfetched_comments = False - progress.advance(num_comments) + progress.advance(comments_added) except Exception as e: self.logger.error(str(e))