Skip to content

Commit 18c3994

Browse files
committed
remove api limiters for traffic stats
1 parent fd45783 commit 18c3994

File tree

1 file changed

+18
-15
lines changed

1 file changed

+18
-15
lines changed

pipeline/src/nf_core_stats/github_pipeline.py

Lines changed: 18 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -40,9 +40,7 @@ def github_source(organization: str = "nf-core", api_token: str = dlt.secrets.va
4040
)
4141

4242
return [
43-
dlt.resource(
44-
traffic_stats(organization, headers, repos, only_active_repos=True, max_repos=30), name="traffic_stats"
45-
),
43+
dlt.resource(traffic_stats(organization, headers, repos), name="traffic_stats"),
4644
dlt.resource(contributor_stats(organization, headers, repos), name="contributor_stats"),
4745
dlt.resource(issue_stats(organization, headers, repos), name="issue_stats"),
4846
dlt.resource(org_members(organization, headers), name="org_members"),
@@ -54,18 +52,18 @@ def github_source(organization: str = "nf-core", api_token: str = dlt.secrets.va
5452

5553
@dlt.resource(write_disposition="merge", primary_key=["pipeline_name", "timestamp"])
5654
def traffic_stats(
57-
organization: str, headers: dict, repos: list[dict], only_active_repos: bool = True, max_repos: int | None = 30
55+
organization: str, headers: dict, repos: list[dict], only_active_repos: bool = False, max_repos: int | None = None
5856
) -> Iterator[dict]:
59-
"""Collect traffic stats for repositories with optimizations to reduce API burden
57+
"""Collect traffic stats for repositories
6058
6159
Args:
6260
organization: GitHub organization name
6361
headers: GitHub API headers
6462
repos: List of repository data
65-
only_active_repos: Only collect traffic for recently active repos (default True)
66-
max_repos: Maximum number of repos to process (default 30, None for all)
63+
only_active_repos: Only collect traffic for recently active repos (default False; archived repos are skipped either way)
64+
max_repos: Maximum number of repos to process (default None, meaning no limit)
6765
"""
68-
# Filter repositories to reduce API calls
66+
# Filter repositories if requested
6967
filtered_repos = repos
7068

7169
if only_active_repos:
@@ -78,6 +76,12 @@ def traffic_stats(
7876
and not repo["archived"]
7977
]
8078
logger.info(f"Filtered to {len(filtered_repos)} active repositories (updated in last 6 months)")
79+
else:
80+
# Process all repos, but skip archived ones
81+
filtered_repos = [repo for repo in repos if not repo["archived"]]
82+
logger.info(
83+
f"Processing {len(filtered_repos)} repositories (skipping {len(repos) - len(filtered_repos)} archived)"
84+
)
8185

8286
# Sort by star count (descending) to prioritize important repos
8387
filtered_repos = sorted(filtered_repos, key=lambda x: x.get("stargazers_count", 0), reverse=True)
@@ -86,7 +90,7 @@ def traffic_stats(
8690
filtered_repos = filtered_repos[:max_repos]
8791
logger.info(f"Limited to top {max_repos} repositories by stars")
8892

89-
logger.info(f"Collecting traffic stats for {len(filtered_repos)} repositories (reduced from {len(repos)})")
93+
logger.info(f"Collecting traffic stats for {len(filtered_repos)} repositories")
9094

9195
successful_repos = 0
9296
failed_repos = 0
@@ -440,6 +444,7 @@ def commit_stats(organization: str, headers: dict, repos: list[dict]) -> Iterato
440444

441445
commit_counts[week_timestamp] = commit_counts.get(week_timestamp, 0) + 1
442446

447+
# Yield all data for this repo
443448
for timestamp, commit_count in commit_counts.items():
444449
yield {
445450
"pipeline_name": name,
@@ -531,8 +536,8 @@ def main(
531536
]
532537
]
533538
| None = None,
534-
traffic_only_active_repos: bool = True,
535-
traffic_max_repos: int | None = 50,
539+
traffic_only_active_repos: bool = False,
540+
traffic_max_repos: int | None = None,
536541
):
537542
"""
538543
Run the github data ingestion pipeline
@@ -541,10 +546,8 @@ def main(
541546
destination: dlt backend. Use 'motherduck' for production. Can use 'duckdb' for local testing
542547
resources: Resources to run. If None, run all resources.
543548
traffic_only_active_repos:
544-
Only collect traffic for repos updated in last 6 months.
545-
Traffic stats optimization settings to reduce API burden.
546-
This reduces API calls from ~100+ repos to ~30 most important active repos.
547-
traffic_max_repos: Limit to top N repos by stars (None for all)
549+
Only collect traffic for repos updated in the last 6 months (default False - all repos).
550+
traffic_max_repos: Limit to top N repos by stars (default None - all repos)
548551
"""
549552
logger.info("Starting GitHub data pipeline...")
550553
pipeline = dlt.pipeline(pipeline_name="github", destination=destination, dataset_name="github")

0 commit comments

Comments
 (0)