22import duckdb
33import psutil
44import networkx as nx
5+ import pandas as pd
56from datetime import datetime
67import hashlib
78from pathlib import Path
9+ import gzip
810
911
1012class GexfNodeGenerator :
@@ -74,18 +76,19 @@ def generate_gexf_nodes_for_topics(self, topics):
7476 # Note: Cannot create indexes on read-only database
7577 # The query will rely on DuckDB's built-in query optimization
7678
77- # Optimized query with better structure and materialized CTEs
79+ # Optimized query: Push more work to database, use database functions
7880 query = f"""
7981 WITH matching_repos AS MATERIALIZED (
8082 SELECT DISTINCT r.nameWithOwner
8183 FROM repos r
8284 INNER JOIN repo_topics t ON r.nameWithOwner = t.repo
8385 WHERE LOWER(t.topic) IN ({ placeholders } )
86+ ORDER BY COALESCE(r.stars, 0) DESC, COALESCE(r.forks, 0) DESC
8487 ),
8588 repo_topics_agg AS MATERIALIZED (
8689 SELECT
8790 r.nameWithOwner,
88- GROUP_CONCAT (t.topic, '|') AS topics
91+ STRING_AGG (t.topic, '|') AS topics -- Use STRING_AGG instead of GROUP_CONCAT
8992 FROM repos r
9093 INNER JOIN repo_topics t ON r.nameWithOwner = t.repo
9194 INNER JOIN matching_repos mr ON r.nameWithOwner = mr.nameWithOwner
@@ -101,16 +104,21 @@ def generate_gexf_nodes_for_topics(self, topics):
101104 COALESCE(r.pullRequests, 0) as pullRequests,
102105 COALESCE(r.issues, 0) as issues,
103106 COALESCE(r.primaryLanguage, '') as primaryLanguage,
104- r.createdAt,
107+ EXTRACT(YEAR FROM CAST( r.createdAt AS TIMESTAMP)) as createdAt_year, -- Extract year in SQL
105108 COALESCE(r.license, '') as license,
106109 rt.topics,
107- r.bigquery_contributors,
108- r.bigquery_stargazers
110+ CASE
111+ WHEN r.bigquery_contributors IS NOT NULL THEN array_to_string(r.bigquery_contributors, ',')
112+ ELSE ''
113+ END as contributors, -- Convert array to string in SQL
114+ CASE
115+ WHEN r.bigquery_stargazers IS NOT NULL THEN array_to_string(r.bigquery_stargazers, ',')
116+ ELSE ''
117+ END as stargazers -- Convert array to string in SQL
109118 FROM repos r
110119 INNER JOIN matching_repos mr ON r.nameWithOwner = mr.nameWithOwner
111120 INNER JOIN repo_topics_agg rt ON r.nameWithOwner = rt.nameWithOwner
112- ORDER BY r.stars DESC, r.forks DESC
113- LIMIT 10000; -- Limit results to prevent memory issues
121+ ORDER BY r.stars DESC, r.forks DESC;
114122 """
115123
116124 # Execute query with optimized settings
@@ -130,19 +138,24 @@ def generate_gexf_nodes_for_topics(self, topics):
130138 COALESCE(r.pullRequests, 0) as pullRequests,
131139 COALESCE(r.issues, 0) as issues,
132140 COALESCE(r.primaryLanguage, '') as primaryLanguage,
133- r.createdAt,
141+ EXTRACT(YEAR FROM CAST( r.createdAt AS TIMESTAMP)) as createdAt_year ,
134142 COALESCE(r.license, '') as license,
135- GROUP_CONCAT(t.topic, '|') as topics,
136- r.bigquery_contributors,
137- r.bigquery_stargazers
143+ STRING_AGG(t.topic, '|') as topics,
144+ CASE
145+ WHEN r.bigquery_contributors IS NOT NULL THEN array_to_string(r.bigquery_contributors, ',')
146+ ELSE ''
147+ END as contributors,
148+ CASE
149+ WHEN r.bigquery_stargazers IS NOT NULL THEN array_to_string(r.bigquery_stargazers, ',')
150+ ELSE ''
151+ END as stargazers
138152 FROM repos r
139153 INNER JOIN repo_topics t ON r.nameWithOwner = t.repo
140154 WHERE LOWER(t.topic) IN ({ placeholders } )
141155 GROUP BY r.nameWithOwner, r.stars, r.forks, r.watchers, r.isArchived,
142156 r.languageCount, r.pullRequests, r.issues, r.primaryLanguage,
143157 r.createdAt, r.license, r.bigquery_contributors, r.bigquery_stargazers
144- ORDER BY r.stars DESC, r.forks DESC
145- LIMIT 10000;
158+ ORDER BY r.stars DESC, r.forks DESC;
146159 """
147160 result = self .con .execute (fallback_query , topics_lower ).fetchall ()
148161
@@ -165,16 +178,17 @@ def generate_gexf_nodes_for_topics(self, topics):
165178 "pullRequests" ,
166179 "issues" ,
167180 "primaryLanguage" ,
168- "createdAt " ,
181+ "createdAt_year " ,
169182 "license" ,
170183 "topics" ,
171- "bigquery_contributors " ,
172- "bigquery_stargazers "
184+ "contributors " ,
185+ "stargazers "
173186 ]
174- G = nx .Graph ()
175- G .graph ['has_edges' ] = False # Add this attribute to indicate no edges in this graph
176-
177- # Define default values for each column type
187+
188+ # Load the query result into a DataFrame so defaults and derived columns can be applied column-wise
189+ df = pd .DataFrame (result , columns = columns )
190+
191+ # Default value per column, used below to replace missing values
178192 default_values = {
179193 "stars" : 0 ,
180194 "forks" : 0 ,
@@ -184,16 +198,28 @@ def generate_gexf_nodes_for_topics(self, topics):
184198 "pullRequests" : 0 ,
185199 "issues" : 0 ,
186200 "primaryLanguage" : "" ,
187- "createdAt_year" : 0 , # Keep only year
201+ "createdAt_year" : 0 ,
188202 "license" : "" ,
189- "topics" : "" , # Default empty string for topics
190- "contributors" : "" , # Default empty string for contributors
191- "stargazers" : "" # Default empty string for stargazers
203+ "topics" : "" ,
204+ "contributors" : "" ,
205+ "stargazers" : ""
192206 }
193-
207+
208+ # Fill NaN values with defaults
209+ for col , default_val in default_values .items ():
210+ if col in df .columns :
211+ df [col ] = df [col ].fillna (default_val )
212+
213+ # Add GitHub URL column efficiently
214+ df ['github_url' ] = 'https://github.com/' + df ['nameWithOwner' ]
215+
216+ # Convert DataFrame to graph efficiently
217+ G = nx .Graph ()
218+ G .graph ['has_edges' ] = False
219+
194220 # Add attributes to the graph
195221 G .graph ['node_attributes' ] = {
196- 'createdAt_year' : {'type' : 'integer' }, # Keep only year
222+ 'createdAt_year' : {'type' : 'integer' },
197223 'stars' : {'type' : 'integer' },
198224 'forks' : {'type' : 'integer' },
199225 'watchers' : {'type' : 'integer' },
@@ -204,63 +230,31 @@ def generate_gexf_nodes_for_topics(self, topics):
204230 'primaryLanguage' : {'type' : 'string' },
205231 'license' : {'type' : 'string' },
206232 'github_url' : {'type' : 'string' },
207- 'topics' : {'type' : 'string' }, # Add topics as a string attribute
208- 'contributors' : {'type' : 'string' }, # Add contributors as a string attribute
209- 'stargazers' : {'type' : 'string' }, # Add stargazers as a string attribute
233+ 'topics' : {'type' : 'string' },
234+ 'contributors' : {'type' : 'string' },
235+ 'stargazers' : {'type' : 'string' },
210236 }
211-
212- for row in result :
213- node_attrs = {}
214- for col , val in zip (columns , row ):
215- if col == "nameWithOwner" :
216- repo_name = val
217- # Add GitHub URL using nameWithOwner
218- node_attrs ["github_url" ] = f"https://github.com/{ val } "
219- elif col == "createdAt" :
220- # Only extract year from the date
221- if val :
222- try :
223- # Handle both string and datetime objects
224- if isinstance (val , str ):
225- # Parse ISO format date (e.g., "2018-06-02T04:08:16Z")
226- date = datetime .strptime (val .split ('T' )[0 ], "%Y-%m-%d" )
227- else :
228- date = val # Assume it's already a datetime object
229- node_attrs ["createdAt_year" ] = date .year
230- except (ValueError , TypeError ) as e :
231- print (f"Error processing date for { repo_name } : { e } " )
232- # If date parsing fails, use default value
233- node_attrs ["createdAt_year" ] = 0
234- else :
235- node_attrs ["createdAt_year" ] = 0
236- elif col == "topics" :
237- # Store topics as a comma-separated string
238- node_attrs [col ] = val if val else default_values [col ]
239- elif col == "bigquery_contributors" :
240- # Store contributors as a comma-separated string
241- if val and isinstance (val , list ):
242- node_attrs ["contributors" ] = "," .join (val )
243- else :
244- node_attrs ["contributors" ] = ""
245- elif col == "bigquery_stargazers" :
246- # Store stargazers as a comma-separated string
247- if val and isinstance (val , list ):
248- node_attrs ["stargazers" ] = "," .join (val )
249- else :
250- node_attrs ["stargazers" ] = ""
251- elif col == "isArchived" :
252- # Ensure isArchived is always a boolean value
253- node_attrs [col ] = bool (val ) if val is not None else False
254- else :
255- # Use default value if the value is None
256- node_attrs [col ] = default_values [col ] if val is None else val
257- G .add_node (repo_name , ** node_attrs )
237+
238+ # Add one node per DataFrame row, keyed by nameWithOwner; the remaining columns become node attributes
239+ for _ , row in df .iterrows ():
240+ attrs = row .to_dict ()
241+ repo_name = attrs .pop ("nameWithOwner" )
242+ G .add_node (repo_name , ** attrs )
258243
259244 # Print some statistics about the years
260245 years = [attrs .get ("createdAt_year" , 0 ) for _ , attrs in G .nodes (data = True )]
261246 # print(f"Date statistics:")
262247 # print(f"Years range: {min(years)} to {max(years)}")
263248 # print(f"Number of nodes with year=0: {years.count(0)}")
264249
265- nx .write_gexf (G , gexf_path )
266- return gexf_path # Return the unique file path
250+ # Write GEXF file
251+ nx .write_gexf (G , gexf_path , encoding = "utf-8" , prettyprint = False )
252+
253+ # Also write a gzip-compressed copy next to the GEXF file (always created, at gexf_path + '.gz')
254+ compressed_path = gexf_path + '.gz'
255+ with open (gexf_path , 'rb' ) as f_in :
256+ with gzip .open (compressed_path , 'wb' , compresslevel = 6 ) as f_out :
257+ f_out .writelines (f_in )
258+
259+ # Return the uncompressed file path for immediate use
260+ return gexf_path
0 commit comments