Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,9 @@ httplib2==0.22.0
idna==3.10
iniconfig==2.0.0
joblib==1.4.2
markdown-it-py==3.0.0
mccabe==0.7.0
mdurl==0.1.2
nltk==3.9.1
numpy==2.1.1
packaging==24.1
Expand All @@ -25,13 +27,15 @@ pyasn1==0.6.1
pyasn1_modules==0.4.1
pycodestyle==2.12.1
pyflakes==3.2.0
Pygments==2.18.0
pyparsing==3.1.4
pytest==8.3.3
python-dateutil==2.9.0.post0
python-dotenv==1.0.1
pytz==2024.2
regex==2024.9.11
requests==2.32.3
rich==13.9.2
rsa==4.9
six==1.16.0
tqdm==4.66.5
Expand Down
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@
# Top level, single-file modules
py_modules=[
"src/log",
"src/astro_db"],
"src/astro_db",
"src/progress"],

# Packages required to run the app
install_requires=[
Expand Down
3 changes: 3 additions & 0 deletions src/astro.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,9 @@ def extract_video_id_from_url(url: str) -> str:
"""

video_id = url.split('v=')[1]
if not YouTubeDataAPI.valid_video_id(video_id):
raise ValueError('Invalid video URL provided')

return video_id


Expand Down
1 change: 0 additions & 1 deletion src/astro_db.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,6 @@ def insert_comment_dataframe(self, video_data, dataframe: pd.DataFrame):
self.logger.debug(f'Comment table for video id {video_data.video_id} did not exist - creating it now')
comment_table = self.create_comment_table_for_video(video_data)

self.logger.debug(f'Appending comment dataframe to database:\n{dataframe}')
dataframe.to_sql(comment_table, self.conn, index=False, if_exists='append')

self.conn.commit()
Expand Down
3 changes: 3 additions & 0 deletions src/data_collection/data_structures.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

class VideoData:
video_id: str
video_title: str
channel_id: str
channel_title: str
view_count: int
Expand All @@ -14,13 +15,15 @@ class VideoData:
def __init__(
self,
video_id='',
video_title='',
channel_id='',
channel_title='',
view_count=0,
like_count=0,
comment_count=0):

self.video_id = video_id
self.video_title = video_title
self.channel_id = channel_id
self.channel_title = channel_title
self.view_count = view_count
Expand Down
21 changes: 13 additions & 8 deletions src/data_collection/sentiment.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,13 @@
"""
Functionality for determining the sentiment of a given comment/string. This
approach utilizes the Natural Language Toolkit in combination with
SentiWordNet.

This logic was informed by the following article written by "AI & Tech by Nidhika, PhD":
https://medium.com/@nidhikayadav/sentiment-analysis-with-python-sentiwordnet-fd07ffc557
approach utilizes the Natural Language Toolkit in combination with SentiWordNet.
"""
import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn

from src.progress import AstroProgress


class SentimentAnalysis:
logger = None
Expand All @@ -29,15 +27,22 @@ def nltk_init(self):
nltk.download(pkg, quiet=True, raise_on_error=True)

def add_sentiment_to_dataframe(self, df):
    """
    Add ``PSentiment`` and ``NSentiment`` columns to ``df`` in place.

    Each row's ``comment`` value is passed to ``get_sentiment``; element 0
    of the result is stored in ``PSentiment`` and element 1 in
    ``NSentiment`` (presumably the positive and negative scores - confirm
    against ``get_sentiment``). A progress bar is advanced once per row.

    :param df: dataframe containing a ``comment`` column
    :raises ValueError: if ``df`` is ``None`` or has no rows
    """
    # Check for None first: reading ``df.empty`` on None raises
    # AttributeError rather than the intended ValueError.
    if df is None or df.empty:
        raise ValueError('received null dataframe')

    # add new columns to dataframe
    df['PSentiment'] = ''
    df['NSentiment'] = ''

    comment_count = len(df.index)
    with AstroProgress('Calculating comment sentiment', comment_count) as progress:
        for index, row in df.iterrows():
            sentiment = self.get_sentiment(row['comment'])
            df.loc[index, 'PSentiment'] = sentiment[0]
            df.loc[index, 'NSentiment'] = sentiment[1]

            progress.advance(1)

def get_sentiment(self, comment: str) -> ():
token_comment = nltk.word_tokenize(comment)
pos_tag_comment = nltk.pos_tag(token_comment)
Expand Down
58 changes: 35 additions & 23 deletions src/data_collection/yt_data_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import string

from src.data_collection.data_structures import VideoData
from src.progress import AstroProgress
from googleapiclient.discovery import build


Expand Down Expand Up @@ -45,6 +46,8 @@ def parse_comment_api_response(self, response, comment_dataframe) -> pd.DataFram
# if the dataframe is non-null and not empty, we're appending data to the dataframe
append_dataframe = comment_dataframe is not None and not comment_dataframe.empty

comment_count = 0

if append_dataframe:
df_index = len(comment_dataframe.index) # last index in dataframe
df = comment_dataframe
Expand All @@ -62,6 +65,7 @@ def parse_comment_api_response(self, response, comment_dataframe) -> pd.DataFram

df.loc[df_index] = [comment, user, date]
df_index += 1
comment_count += 1

if has_replies:
for reply in item['replies']['comments']:
Expand All @@ -73,8 +77,9 @@ def parse_comment_api_response(self, response, comment_dataframe) -> pd.DataFram

df.loc[df_index] = [comment, user, date]
df_index += 1
comment_count += 1

return df
return df, comment_count

def get_comments(self, video_data) -> pd.DataFrame:
"""
Expand All @@ -91,33 +96,39 @@ def get_comments(self, video_data) -> pd.DataFrame:
comment_count = video_data.comment_count
unfetched_comments = True

while unfetched_comments:
# The API limits comment requests to 100 records
max_comments = min(100, comment_count)
with AstroProgress('Downloading comments', comment_count) as progress:
while unfetched_comments:
# The API limits comment requests to 100 records
max_comments = min(100, comment_count)

self.logger.debug('collecting {} comments'.format(max_comments))

request = self.youtube.commentThreads().list(
part='snippet,replies',
videoId=video_data.video_id,
pageToken=page_token,
maxResults=max_comments,
textFormat='plainText')

self.logger.debug('collecting {} comments'.format(max_comments))
comment_count -= max_comments
unfetched_comments = True if comment_count > 0 else False

request = self.youtube.commentThreads().list(
part='snippet,replies',
videoId=video_data.video_id,
pageToken=page_token,
maxResults=max_comments,
textFormat='plainText')
try:
response = request.execute()
comment_dataframe, comments_added = self.parse_comment_api_response(response, comment_dataframe)
if 'nextPageToken' in response: # there are more comments to fetch
page_token = response['nextPageToken']
else:
self.logger.debug("comment collection complete")
unfetched_comments = False

comment_count -= max_comments
unfetched_comments = True if comment_count > 0 else False
progress.advance(comments_added)

try:
response = request.execute()
comment_dataframe = self.parse_comment_api_response(response, comment_dataframe)
if 'nextPageToken' in response: # there are more comments to fetch
page_token = response['nextPageToken']
else:
unfetched_comments = False
except Exception as e:
self.logger.error(str(e))
self.logger.error(traceback.format_exc())

except Exception as e:
self.logger.error(str(e))
self.logger.error(traceback.format_exc())
progress.complete()

return comment_dataframe

Expand All @@ -138,6 +149,7 @@ def get_video_metadata(self, video_id: str) -> VideoData:
video_stats = response['items'][0]['statistics']

return_data.video_id = video_id
return_data.title = video_data['title']
return_data.channel_id = video_data['channelId']
return_data.channel_title = video_data['channelTitle']
return_data.like_count = int(video_stats['likeCount'])
Expand Down
7 changes: 7 additions & 0 deletions src/log.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,12 @@


class Logger:
log_level_str: str
log_level: int
logger: logging.Logger

def __init__(self, log_level_str: str):
self.log_level_str = log_level_str
self.log_level = self.get_log_level(log_level_str)
self.logger = self.initialize_logging()

Expand All @@ -32,3 +36,6 @@ def initialize_logging(self) -> logging.Logger:

def get_logger(self) -> logging.Logger:
return self.logger

def level(self, log_level_str: str) -> bool:
    """
    Tell whether ``log_level_str`` names this logger's configured level.

    The name is resolved with ``get_log_level`` and the result is compared
    for equality with the stored ``log_level``.

    :param log_level_str: level name to test
    :return: ``True`` when the resolved level equals ``self.log_level``
    """
    requested_level = self.get_log_level(log_level_str)
    return requested_level == self.log_level
37 changes: 37 additions & 0 deletions src/progress.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
"""
Progress bar utilities.
"""
from rich.progress import (
BarColumn,
MofNCompleteColumn,
Progress,
TextColumn,
TimeElapsedColumn,
TimeRemainingColumn,
)


class AstroProgress(Progress):
    """
    A ``rich`` progress display that tracks exactly one task.

    The column layout is fixed: task description with percentage, a bar,
    an "M of N" counter, elapsed time and estimated time remaining. The
    single task is registered during construction with ``steps`` as its
    total, so callers only need ``advance`` and ``complete``.
    """
    task_str: str     # description shown next to the bar
    total_steps: int  # total passed to the task when it was created
    task: int         # id returned by ``add_task`` for the single task

    def __init__(self, task_str: str, steps: int):
        """
        :param task_str: description displayed for the task
        :param steps: total number of steps the task is expected to take
        """
        self.task_str = task_str
        self.total_steps = steps

        super().__init__(
            TextColumn("{task.description} [progress.percentage]{task.percentage:>3.0f}%"),
            BarColumn(),
            MofNCompleteColumn(),
            TextColumn("•"),
            TimeElapsedColumn(),
            TextColumn("•"),
            TimeRemainingColumn())

        self.task = super().add_task('[green]' + task_str, total=self.total_steps)

    def advance(self, steps: int, advance=None):
        """
        Move the tracked task forward by ``steps``.

        This method shadows ``Progress.advance(task_id, advance)``. Helpers
        inherited from ``Progress`` call that two-argument form, so when a
        second argument is supplied the call is forwarded unchanged to the
        base class instead of being applied to this instance's own task.

        :param steps: number of steps to add to the tracked task (or the
            task id when the base-class two-argument form is used)
        :param advance: only given by base-class style calls; the amount
            to advance the task identified by the first argument
        """
        if advance is not None:
            # Base-class call shape: advance(task_id, advance).
            super().advance(steps, advance)
            return

        super().update(self.task, advance=steps)

    def complete(self):
        """Mark the tracked task as fully completed (completed == total_steps)."""
        super().update(self.task, completed=self.total_steps)
Loading