Skip to content

Commit 6341806

Browse files
committed
compress file
1 parent a6b96e0 commit 6341806

File tree

2 files changed

+74
-79
lines changed

2 files changed

+74
-79
lines changed

.gitignore

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,4 +12,5 @@ dist
1212
*repo_metadata.json
1313
__pycache__
1414
*.duckdb
15-
*.gexf
15+
*.gexf
16+
*.gexf.gz

backend/app/services/gexf_node_service.py

Lines changed: 72 additions & 78 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,11 @@
22
import duckdb
33
import psutil
44
import networkx as nx
5+
import pandas as pd
56
from datetime import datetime
67
import hashlib
78
from pathlib import Path
9+
import gzip
810

911

1012
class GexfNodeGenerator:
@@ -74,18 +76,19 @@ def generate_gexf_nodes_for_topics(self, topics):
7476
# Note: Cannot create indexes on read-only database
7577
# The query will rely on DuckDB's built-in query optimization
7678

77-
# Optimized query with better structure and materialized CTEs
79+
# Optimized query: Push more work to database, use database functions
7880
query = f"""
7981
WITH matching_repos AS MATERIALIZED (
8082
SELECT DISTINCT r.nameWithOwner
8183
FROM repos r
8284
INNER JOIN repo_topics t ON r.nameWithOwner = t.repo
8385
WHERE LOWER(t.topic) IN ({placeholders})
86+
ORDER BY COALESCE(r.stars, 0) DESC, COALESCE(r.forks, 0) DESC
8487
),
8588
repo_topics_agg AS MATERIALIZED (
8689
SELECT
8790
r.nameWithOwner,
88-
GROUP_CONCAT(t.topic, '|') AS topics
91+
STRING_AGG(t.topic, '|') AS topics -- Use STRING_AGG instead of GROUP_CONCAT
8992
FROM repos r
9093
INNER JOIN repo_topics t ON r.nameWithOwner = t.repo
9194
INNER JOIN matching_repos mr ON r.nameWithOwner = mr.nameWithOwner
@@ -101,16 +104,21 @@ def generate_gexf_nodes_for_topics(self, topics):
101104
COALESCE(r.pullRequests, 0) as pullRequests,
102105
COALESCE(r.issues, 0) as issues,
103106
COALESCE(r.primaryLanguage, '') as primaryLanguage,
104-
r.createdAt,
107+
EXTRACT(YEAR FROM CAST(r.createdAt AS TIMESTAMP)) as createdAt_year, -- Extract year in SQL
105108
COALESCE(r.license, '') as license,
106109
rt.topics,
107-
r.bigquery_contributors,
108-
r.bigquery_stargazers
110+
CASE
111+
WHEN r.bigquery_contributors IS NOT NULL THEN array_to_string(r.bigquery_contributors, ',')
112+
ELSE ''
113+
END as contributors, -- Convert array to string in SQL
114+
CASE
115+
WHEN r.bigquery_stargazers IS NOT NULL THEN array_to_string(r.bigquery_stargazers, ',')
116+
ELSE ''
117+
END as stargazers -- Convert array to string in SQL
109118
FROM repos r
110119
INNER JOIN matching_repos mr ON r.nameWithOwner = mr.nameWithOwner
111120
INNER JOIN repo_topics_agg rt ON r.nameWithOwner = rt.nameWithOwner
112-
ORDER BY r.stars DESC, r.forks DESC
113-
LIMIT 10000; -- Limit results to prevent memory issues
121+
ORDER BY r.stars DESC, r.forks DESC;
114122
"""
115123

116124
# Execute query with optimized settings
@@ -130,19 +138,24 @@ def generate_gexf_nodes_for_topics(self, topics):
130138
COALESCE(r.pullRequests, 0) as pullRequests,
131139
COALESCE(r.issues, 0) as issues,
132140
COALESCE(r.primaryLanguage, '') as primaryLanguage,
133-
r.createdAt,
141+
EXTRACT(YEAR FROM CAST(r.createdAt AS TIMESTAMP)) as createdAt_year,
134142
COALESCE(r.license, '') as license,
135-
GROUP_CONCAT(t.topic, '|') as topics,
136-
r.bigquery_contributors,
137-
r.bigquery_stargazers
143+
STRING_AGG(t.topic, '|') as topics,
144+
CASE
145+
WHEN r.bigquery_contributors IS NOT NULL THEN array_to_string(r.bigquery_contributors, ',')
146+
ELSE ''
147+
END as contributors,
148+
CASE
149+
WHEN r.bigquery_stargazers IS NOT NULL THEN array_to_string(r.bigquery_stargazers, ',')
150+
ELSE ''
151+
END as stargazers
138152
FROM repos r
139153
INNER JOIN repo_topics t ON r.nameWithOwner = t.repo
140154
WHERE LOWER(t.topic) IN ({placeholders})
141155
GROUP BY r.nameWithOwner, r.stars, r.forks, r.watchers, r.isArchived,
142156
r.languageCount, r.pullRequests, r.issues, r.primaryLanguage,
143157
r.createdAt, r.license, r.bigquery_contributors, r.bigquery_stargazers
144-
ORDER BY r.stars DESC, r.forks DESC
145-
LIMIT 10000;
158+
ORDER BY r.stars DESC, r.forks DESC;
146159
"""
147160
result = self.con.execute(fallback_query, topics_lower).fetchall()
148161

@@ -165,16 +178,17 @@ def generate_gexf_nodes_for_topics(self, topics):
165178
"pullRequests",
166179
"issues",
167180
"primaryLanguage",
168-
"createdAt",
181+
"createdAt_year",
169182
"license",
170183
"topics",
171-
"bigquery_contributors",
172-
"bigquery_stargazers"
184+
"contributors",
185+
"stargazers"
173186
]
174-
G = nx.Graph()
175-
G.graph['has_edges'] = False # Add this attribute to indicate no edges in this graph
176-
177-
# Define default values for each column type
187+
188+
# Load the rows into a pandas DataFrame so defaults and derived columns are applied column-wise
189+
df = pd.DataFrame(result, columns=columns)
190+
191+
# Fill missing values with defaults efficiently
178192
default_values = {
179193
"stars": 0,
180194
"forks": 0,
@@ -184,16 +198,28 @@ def generate_gexf_nodes_for_topics(self, topics):
184198
"pullRequests": 0,
185199
"issues": 0,
186200
"primaryLanguage": "",
187-
"createdAt_year": 0, # Keep only year
201+
"createdAt_year": 0,
188202
"license": "",
189-
"topics": "", # Default empty string for topics
190-
"contributors": "", # Default empty string for contributors
191-
"stargazers": "" # Default empty string for stargazers
203+
"topics": "",
204+
"contributors": "",
205+
"stargazers": ""
192206
}
193-
207+
208+
# Fill NaN values with defaults
209+
for col, default_val in default_values.items():
210+
if col in df.columns:
211+
df[col] = df[col].fillna(default_val)
212+
213+
# Add GitHub URL column efficiently
214+
df['github_url'] = 'https://github.com/' + df['nameWithOwner']
215+
216+
# Convert DataFrame to graph efficiently
217+
G = nx.Graph()
218+
G.graph['has_edges'] = False
219+
194220
# Add attributes to the graph
195221
G.graph['node_attributes'] = {
196-
'createdAt_year': {'type': 'integer'}, # Keep only year
222+
'createdAt_year': {'type': 'integer'},
197223
'stars': {'type': 'integer'},
198224
'forks': {'type': 'integer'},
199225
'watchers': {'type': 'integer'},
@@ -204,63 +230,31 @@ def generate_gexf_nodes_for_topics(self, topics):
204230
'primaryLanguage': {'type': 'string'},
205231
'license': {'type': 'string'},
206232
'github_url': {'type': 'string'},
207-
'topics': {'type': 'string'}, # Add topics as a string attribute
208-
'contributors': {'type': 'string'}, # Add contributors as a string attribute
209-
'stargazers': {'type': 'string'}, # Add stargazers as a string attribute
233+
'topics': {'type': 'string'},
234+
'contributors': {'type': 'string'},
235+
'stargazers': {'type': 'string'},
210236
}
211-
212-
for row in result:
213-
node_attrs = {}
214-
for col, val in zip(columns, row):
215-
if col == "nameWithOwner":
216-
repo_name = val
217-
# Add GitHub URL using nameWithOwner
218-
node_attrs["github_url"] = f"https://github.com/{val}"
219-
elif col == "createdAt":
220-
# Only extract year from the date
221-
if val:
222-
try:
223-
# Handle both string and datetime objects
224-
if isinstance(val, str):
225-
# Parse ISO format date (e.g., "2018-06-02T04:08:16Z")
226-
date = datetime.strptime(val.split('T')[0], "%Y-%m-%d")
227-
else:
228-
date = val # Assume it's already a datetime object
229-
node_attrs["createdAt_year"] = date.year
230-
except (ValueError, TypeError) as e:
231-
print(f"Error processing date for {repo_name}: {e}")
232-
# If date parsing fails, use default value
233-
node_attrs["createdAt_year"] = 0
234-
else:
235-
node_attrs["createdAt_year"] = 0
236-
elif col == "topics":
237-
# Store topics as a comma-separated string
238-
node_attrs[col] = val if val else default_values[col]
239-
elif col == "bigquery_contributors":
240-
# Store contributors as a comma-separated string
241-
if val and isinstance(val, list):
242-
node_attrs["contributors"] = ",".join(val)
243-
else:
244-
node_attrs["contributors"] = ""
245-
elif col == "bigquery_stargazers":
246-
# Store stargazers as a comma-separated string
247-
if val and isinstance(val, list):
248-
node_attrs["stargazers"] = ",".join(val)
249-
else:
250-
node_attrs["stargazers"] = ""
251-
elif col == "isArchived":
252-
# Ensure isArchived is always a boolean value
253-
node_attrs[col] = bool(val) if val is not None else False
254-
else:
255-
# Use default value if the value is None
256-
node_attrs[col] = default_values[col] if val is None else val
257-
G.add_node(repo_name, **node_attrs)
237+
238+
# Add one node per DataFrame row (iterrows still loops row by row)
239+
for _, row in df.iterrows():
240+
attrs = row.to_dict()
241+
repo_name = attrs.pop("nameWithOwner")
242+
G.add_node(repo_name, **attrs)
258243

259244
# Print some statistics about the years
260245
years = [attrs.get("createdAt_year", 0) for _, attrs in G.nodes(data=True)]
261246
# print(f"Date statistics:")
262247
# print(f"Years range: {min(years)} to {max(years)}")
263248
# print(f"Number of nodes with year=0: {years.count(0)}")
264249

265-
nx.write_gexf(G, gexf_path)
266-
return gexf_path # Return the unique file path
250+
# Write GEXF file
251+
nx.write_gexf(G, gexf_path, encoding="utf-8", prettyprint=False)
252+
253+
# Also write a gzip-compressed copy next to the GEXF file for storage efficiency (always created)
254+
compressed_path = gexf_path + '.gz'
255+
with open(gexf_path, 'rb') as f_in:
256+
with gzip.open(compressed_path, 'wb', compresslevel=6) as f_out:
257+
f_out.writelines(f_in)
258+
259+
# Return the uncompressed file path for immediate use
260+
return gexf_path

0 commit comments

Comments
 (0)