Write a gzip-compressed copy of generated GEXF files; move node attribute preparation into SQL and pandas

yilinxia · yilinxia · commit 634180672fda · 2025-08-30T14:08:21.000-05:00
diff --git a/.gitignore b/.gitignore
@@ -12,4 +12,5 @@ dist
 *repo_metadata.json
 __pycache__
 *.duckdb
-*.gexf
+*.gexf
+*.gexf.gz
diff --git a/backend/app/services/gexf_node_service.py b/backend/app/services/gexf_node_service.py
@@ -2,9 +2,11 @@
 import duckdb
 import psutil
 import networkx as nx
+import pandas as pd
 from datetime import datetime
 import hashlib
 from pathlib import Path
+import gzip
 
 
 class GexfNodeGenerator:
@@ -74,18 +76,19 @@ def generate_gexf_nodes_for_topics(self, topics):
         # Note: Cannot create indexes on read-only database
         # The query will rely on DuckDB's built-in query optimization
 
-        # Optimized query with better structure and materialized CTEs
+        # Optimized query: Push more work to database, use database functions
         query = f"""
            WITH matching_repos AS MATERIALIZED (
                 SELECT DISTINCT r.nameWithOwner
                 FROM repos r
                 INNER JOIN repo_topics t ON r.nameWithOwner = t.repo
                 WHERE LOWER(t.topic) IN ({placeholders})
+                ORDER BY COALESCE(r.stars, 0) DESC, COALESCE(r.forks, 0) DESC
             ),
             repo_topics_agg AS MATERIALIZED (
                 SELECT 
                     r.nameWithOwner,
-                    GROUP_CONCAT(t.topic, '|') AS topics
+                    STRING_AGG(t.topic, '|') AS topics  -- Use STRING_AGG instead of GROUP_CONCAT
                 FROM repos r
                 INNER JOIN repo_topics t ON r.nameWithOwner = t.repo
                 INNER JOIN matching_repos mr ON r.nameWithOwner = mr.nameWithOwner
@@ -101,16 +104,21 @@ def generate_gexf_nodes_for_topics(self, topics):
                 COALESCE(r.pullRequests, 0) as pullRequests,
                 COALESCE(r.issues, 0) as issues,
                 COALESCE(r.primaryLanguage, '') as primaryLanguage,
-                r.createdAt,
+                EXTRACT(YEAR FROM CAST(r.createdAt AS TIMESTAMP)) as createdAt_year,  -- Extract year in SQL
                 COALESCE(r.license, '') as license,
                 rt.topics,
-                r.bigquery_contributors,
-                r.bigquery_stargazers
+                CASE 
+                    WHEN r.bigquery_contributors IS NOT NULL THEN array_to_string(r.bigquery_contributors, ',')
+                    ELSE ''
+                END as contributors,  -- Convert array to string in SQL
+                CASE 
+                    WHEN r.bigquery_stargazers IS NOT NULL THEN array_to_string(r.bigquery_stargazers, ',')
+                    ELSE ''
+                END as stargazers  -- Convert array to string in SQL
             FROM repos r
             INNER JOIN matching_repos mr ON r.nameWithOwner = mr.nameWithOwner
             INNER JOIN repo_topics_agg rt ON r.nameWithOwner = rt.nameWithOwner
-            ORDER BY r.stars DESC, r.forks DESC
-            LIMIT 10000;  -- Limit results to prevent memory issues
+            ORDER BY r.stars DESC, r.forks DESC;
         """
         
         # Execute query with optimized settings
@@ -130,19 +138,24 @@ def generate_gexf_nodes_for_topics(self, topics):
                     COALESCE(r.pullRequests, 0) as pullRequests,
                     COALESCE(r.issues, 0) as issues,
                     COALESCE(r.primaryLanguage, '') as primaryLanguage,
-                    r.createdAt,
+                    EXTRACT(YEAR FROM CAST(r.createdAt AS TIMESTAMP)) as createdAt_year,
                     COALESCE(r.license, '') as license,
-                    GROUP_CONCAT(t.topic, '|') as topics,
-                    r.bigquery_contributors,
-                    r.bigquery_stargazers
+                    STRING_AGG(t.topic, '|') as topics,
+                    CASE 
+                        WHEN r.bigquery_contributors IS NOT NULL THEN array_to_string(r.bigquery_contributors, ',')
+                        ELSE ''
+                    END as contributors,
+                    CASE 
+                        WHEN r.bigquery_stargazers IS NOT NULL THEN array_to_string(r.bigquery_stargazers, ',')
+                        ELSE ''
+                    END as stargazers
                 FROM repos r
                 INNER JOIN repo_topics t ON r.nameWithOwner = t.repo
                 WHERE LOWER(t.topic) IN ({placeholders})
                 GROUP BY r.nameWithOwner, r.stars, r.forks, r.watchers, r.isArchived, 
                          r.languageCount, r.pullRequests, r.issues, r.primaryLanguage, 
                          r.createdAt, r.license, r.bigquery_contributors, r.bigquery_stargazers
-                ORDER BY r.stars DESC, r.forks DESC
-                LIMIT 10000;
+                ORDER BY r.stars DESC, r.forks DESC;
             """
             result = self.con.execute(fallback_query, topics_lower).fetchall()
         
@@ -165,16 +178,17 @@ def generate_gexf_nodes_for_topics(self, topics):
             "pullRequests",
             "issues",
             "primaryLanguage",
-            "createdAt",
+            "createdAt_year",
             "license",
             "topics",
-            "bigquery_contributors",
-            "bigquery_stargazers"
+            "contributors",
+            "stargazers"
         ]
-        G = nx.Graph()
-        G.graph['has_edges'] = False  # Add this attribute to indicate no edges in this graph
-
-        # Define default values for each column type
+        
+        # Load the query rows into a DataFrame so defaults and derived columns are applied per column
+        df = pd.DataFrame(result, columns=columns)
+        
+        # Fill missing values with defaults efficiently
         default_values = {
             "stars": 0,
             "forks": 0,
@@ -184,16 +198,28 @@ def generate_gexf_nodes_for_topics(self, topics):
             "pullRequests": 0,
             "issues": 0,
             "primaryLanguage": "",
-            "createdAt_year": 0,  # Keep only year
+            "createdAt_year": 0,
             "license": "",
-            "topics": "",  # Default empty string for topics
-            "contributors": "",  # Default empty string for contributors
-            "stargazers": ""     # Default empty string for stargazers
+            "topics": "",
+            "contributors": "",
+            "stargazers": ""
         }
-
+        
+        # Fill NaN values with defaults
+        for col, default_val in default_values.items():
+            if col in df.columns:
+                df[col] = df[col].fillna(default_val)
+        
+        # Add GitHub URL column efficiently
+        df['github_url'] = 'https://github.com/' + df['nameWithOwner']
+        
+        # Create the empty graph and flag it as having no edges
+        G = nx.Graph()
+        G.graph['has_edges'] = False
+        
         # Add attributes to the graph
         G.graph['node_attributes'] = {
-            'createdAt_year': {'type': 'integer'},  # Keep only year
+            'createdAt_year': {'type': 'integer'},
             'stars': {'type': 'integer'},
             'forks': {'type': 'integer'},
             'watchers': {'type': 'integer'},
@@ -204,63 +230,31 @@ def generate_gexf_nodes_for_topics(self, topics):
             'primaryLanguage': {'type': 'string'},
             'license': {'type': 'string'},
             'github_url': {'type': 'string'},
-            'topics': {'type': 'string'},  # Add topics as a string attribute
-            'contributors': {'type': 'string'},  # Add contributors as a string attribute
-            'stargazers': {'type': 'string'},  # Add stargazers as a string attribute
+            'topics': {'type': 'string'},
+            'contributors': {'type': 'string'},
+            'stargazers': {'type': 'string'},
         }
-
-        for row in result:
-            node_attrs = {}
-            for col, val in zip(columns, row):
-                if col == "nameWithOwner":
-                    repo_name = val
-                    # Add GitHub URL using nameWithOwner
-                    node_attrs["github_url"] = f"https://github.com/{val}"
-                elif col == "createdAt":
-                    # Only extract year from the date
-                    if val:
-                        try:
-                            # Handle both string and datetime objects
-                            if isinstance(val, str):
-                                # Parse ISO format date (e.g., "2018-06-02T04:08:16Z")
-                                date = datetime.strptime(val.split('T')[0], "%Y-%m-%d")
-                            else:
-                                date = val  # Assume it's already a datetime object
-                            node_attrs["createdAt_year"] = date.year
-                        except (ValueError, TypeError) as e:
-                            print(f"Error processing date for {repo_name}: {e}")
-                            # If date parsing fails, use default value
-                            node_attrs["createdAt_year"] = 0
-                    else:
-                        node_attrs["createdAt_year"] = 0
-                elif col == "topics":
-                    # Store topics as a comma-separated string
-                    node_attrs[col] = val if val else default_values[col]
-                elif col == "bigquery_contributors":
-                    # Store contributors as a comma-separated string
-                    if val and isinstance(val, list):
-                        node_attrs["contributors"] = ",".join(val)
-                    else:
-                        node_attrs["contributors"] = ""
-                elif col == "bigquery_stargazers":
-                    # Store stargazers as a comma-separated string
-                    if val and isinstance(val, list):
-                        node_attrs["stargazers"] = ",".join(val)
-                    else:
-                        node_attrs["stargazers"] = ""
-                elif col == "isArchived":
-                    # Ensure isArchived is always a boolean value
-                    node_attrs[col] = bool(val) if val is not None else False
-                else:
-                    # Use default value if the value is None
-                    node_attrs[col] = default_values[col] if val is None else val
-            G.add_node(repo_name, **node_attrs)
+        
+        # Bulk add nodes efficiently
+        for _, row in df.iterrows():
+            attrs = row.to_dict()
+            repo_name = attrs.pop("nameWithOwner")
+            G.add_node(repo_name, **attrs)
 
         # Print some statistics about the years
         years = [attrs.get("createdAt_year", 0) for _, attrs in G.nodes(data=True)]
         # print(f"Date statistics:")
         # print(f"Years range: {min(years)} to {max(years)}")
         # print(f"Number of nodes with year=0: {years.count(0)}")
 
-        nx.write_gexf(G, gexf_path)
-        return gexf_path  # Return the unique file path
+        # Write GEXF file
+        nx.write_gexf(G, gexf_path, encoding="utf-8", prettyprint=False)
+        
+        # Also write a gzip-compressed copy (gexf_path + '.gz') next to the uncompressed file for storage efficiency
+        compressed_path = gexf_path + '.gz'
+        with open(gexf_path, 'rb') as f_in:
+            with gzip.open(compressed_path, 'wb', compresslevel=6) as f_out:
+                f_out.writelines(f_in)
+        
+        # Return the uncompressed file path for immediate use
+        return gexf_path