@@ -40,9 +40,7 @@ def github_source(organization: str = "nf-core", api_token: str = dlt.secrets.va
4040 )
4141
4242 return [
43- dlt .resource (
44- traffic_stats (organization , headers , repos , only_active_repos = True , max_repos = 30 ), name = "traffic_stats"
45- ),
43+ dlt .resource (traffic_stats (organization , headers , repos ), name = "traffic_stats" ),
4644 dlt .resource (contributor_stats (organization , headers , repos ), name = "contributor_stats" ),
4745 dlt .resource (issue_stats (organization , headers , repos ), name = "issue_stats" ),
4846 dlt .resource (org_members (organization , headers ), name = "org_members" ),
@@ -54,18 +52,18 @@ def github_source(organization: str = "nf-core", api_token: str = dlt.secrets.va
5452
5553@dlt .resource (write_disposition = "merge" , primary_key = ["pipeline_name" , "timestamp" ])
5654def traffic_stats (
57- organization : str , headers : dict , repos : list [dict ], only_active_repos : bool = True , max_repos : int | None = 30
55+ organization : str , headers : dict , repos : list [dict ], only_active_repos : bool = False , max_repos : int | None = None
5856) -> Iterator [dict ]:
59- """Collect traffic stats for repositories with optimizations to reduce API burden
57+ """Collect traffic stats for repositories
6058
6159 Args:
6260 organization: GitHub organization name
6361 headers: GitHub API headers
6462 repos: List of repository data
65- only_active_repos: Only collect traffic for recently active repos (default True )
66- max_repos: Maximum number of repos to process (default 30, None for all)
63+ only_active_repos: Only collect traffic for recently active repos (default False )
64+ max_repos: Maximum number of repos to process (default None for all)
6765 """
68- # Filter repositories to reduce API calls
66+ # Filter repositories if requested
6967 filtered_repos = repos
7068
7169 if only_active_repos :
@@ -78,6 +76,12 @@ def traffic_stats(
7876 and not repo ["archived" ]
7977 ]
8078 logger .info (f"Filtered to { len (filtered_repos )} active repositories (updated in last 6 months)" )
79+ else :
80+ # Process all repos, but skip archived ones
81+ filtered_repos = [repo for repo in repos if not repo ["archived" ]]
82+ logger .info (
83+ f"Processing { len (filtered_repos )} repositories (skipping { len (repos ) - len (filtered_repos )} archived)"
84+ )
8185
8286 # Sort by star count (descending) to prioritize the most-starred repos
8387 filtered_repos = sorted (filtered_repos , key = lambda x : x .get ("stargazers_count" , 0 ), reverse = True )
@@ -86,7 +90,7 @@ def traffic_stats(
8690 filtered_repos = filtered_repos [:max_repos ]
8791 logger .info (f"Limited to top { max_repos } repositories by stars" )
8892
89- logger .info (f"Collecting traffic stats for { len (filtered_repos )} repositories (reduced from { len ( repos ) } ) " )
93+ logger .info (f"Collecting traffic stats for { len (filtered_repos )} repositories" )
9094
9195 successful_repos = 0
9296 failed_repos = 0
@@ -440,6 +444,7 @@ def commit_stats(organization: str, headers: dict, repos: list[dict]) -> Iterato
440444
441445 commit_counts [week_timestamp ] = commit_counts .get (week_timestamp , 0 ) + 1
442446
447+ # Yield all data for this repo
443448 for timestamp , commit_count in commit_counts .items ():
444449 yield {
445450 "pipeline_name" : name ,
@@ -531,8 +536,8 @@ def main(
531536 ]
532537 ]
533538 | None = None ,
534- traffic_only_active_repos : bool = True ,
535- traffic_max_repos : int | None = 50 ,
539+ traffic_only_active_repos : bool = False ,
540+ traffic_max_repos : int | None = None ,
536541):
537542 """
538543 Run the github data ingestion pipeline
@@ -541,10 +546,8 @@ def main(
541546 destination: dlt backend. Use 'motherduck' for production. Can use 'duckdb' for local testing
542547 resources: Resources to run. If None, run all resources.
543548 traffic_only_active_repos:
544- Only collect traffic for repos updated in last 6 months.
545- Traffic stats optimization settings to reduce API burden.
546- This reduces API calls from ~100+ repos to ~30 most important active repos.
547- traffic_max_repos: Limit to top N repos by stars (None for all)
549+ Only collect traffic for repos updated in last 6 months (default False - all repos).
550+ traffic_max_repos: Limit to top N repos by stars (default None - all repos)
548551 """
549552 logger .info ("Starting GitHub data pipeline..." )
550553 pipeline = dlt .pipeline (pipeline_name = "github" , destination = destination , dataset_name = "github" )
0 commit comments