-
Notifications
You must be signed in to change notification settings - Fork 908
feat: contributor metric #3213
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
feat: contributor metric #3213
Changes from 7 commits
bc9a5e0
f936e01
bdaa65e
245ae6b
611a911
91731ac
91a2bf8
299bd90
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3600,4 +3600,50 @@ class RepoClone(Base): | |
count_clones = Column(BigInteger) | ||
clone_data_timestamp = Column(TIMESTAMP(precision=6)) | ||
|
||
repo = relationship("Repo") | ||
repo = relationship("Repo") | ||
|
||
class ContributorEngagement(Base): | ||
__tablename__ = "contributor_engagement" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @ABrain7710 / @Ulincsys : Can you confirm that its our practice not to modify the main script for table creation, but to have the versioning script also included so that new installs just get "all the upgrades"? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The way i've done it in the past for other projects is that both the main schema gets modified AND migrations get created. Then if someone creates a new database, they get the latest schema (this has required a small bit of code when augur detects a new DB and creates the tables to stamp it with the current alembic version). Then that database can be upgraded as time goes on, but new dbs are always starting out on the latest version |
||
__table_args__ = {"schema": "augur_data"} | ||
|
||
engagement_id = Column( | ||
BigInteger, | ||
primary_key=True, | ||
server_default=text( | ||
"nextval('augur_data.contributor_engagement_engagement_id_seq'::regclass)" | ||
), | ||
) | ||
repo_id = Column(ForeignKey("augur_data.repo.repo_id"), nullable=False) | ||
cntrb_id = Column(ForeignKey("augur_data.contributors.cntrb_id"), nullable=False) | ||
username = Column(String, nullable=False) | ||
full_name = Column(String) | ||
country = Column(String) | ||
platform = Column(String) | ||
|
||
# D0 Level - Basic Engagement | ||
d0_forked = Column(Boolean, server_default=text("false")) | ||
d0_starred_or_watched = Column(Boolean, server_default=text("false")) | ||
d0_engagement_timestamp = Column(TIMESTAMP(precision=6)) | ||
|
||
# D1 Level - Issue/Review Engagement | ||
d1_first_issue_created_at = Column(TIMESTAMP(precision=6)) | ||
d1_first_pr_opened_at = Column(TIMESTAMP(precision=6)) | ||
d1_first_pr_commented_at = Column(TIMESTAMP(precision=6)) | ||
|
||
# D2 Level - Significant Contributions | ||
d2_has_merged_pr = Column(Boolean, server_default=text("false")) | ||
d2_created_many_issues = Column(Boolean, server_default=text("false")) | ||
d2_total_comments = Column(BigInteger, server_default=text("0")) | ||
d2_has_pr_with_many_commits = Column(Boolean, server_default=text("false")) | ||
d2_commented_on_multiple_prs = Column(Boolean, server_default=text("false")) | ||
|
||
# Metadata | ||
tool_source = Column(String) | ||
tool_version = Column(String) | ||
data_source = Column(String) | ||
data_collection_date = Column( | ||
TIMESTAMP(precision=6), server_default=text("CURRENT_TIMESTAMP") | ||
) | ||
|
||
repo = relationship("Repo") | ||
contributor = relationship("Contributor") |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
""" | ||
Contributor Engagement Tasks | ||
|
||
This module contains tasks for collecting and processing contributor engagement data | ||
at different levels (D0, D1, D2) as defined in the CHAOSS metrics. | ||
|
||
D0: Basic engagement (forks, stars/watches) | ||
D1: Issue/review engagement (first issue, first PR, first comment) | ||
D2: Significant contributions (merged PRs, many issues, multiple comments) | ||
""" | ||
|
||
from augur.tasks.contributors_engagement.worker import ( | ||
collect_contributor_engagement, | ||
collect_d0_engagement, | ||
collect_d1_engagement, | ||
collect_d2_engagement | ||
) | ||
|
||
__all__ = [ | ||
'collect_contributor_engagement', | ||
'collect_d0_engagement', | ||
'collect_d1_engagement', | ||
'collect_d2_engagement' | ||
] |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,278 @@ | ||
-- name: d0_engagement_query | ||
SELECT DISTINCT ON (c.cntrb_login, cr.cntrb_category) | ||
c.cntrb_id, | ||
c.cntrb_login AS username, | ||
c.cntrb_full_name AS full_name, | ||
c.cntrb_country_code AS country, | ||
CASE | ||
WHEN cr.repo_git ILIKE '%gitlab%' THEN 'GitLab' | ||
WHEN cr.repo_git ILIKE '%github%' THEN 'GitHub' | ||
ELSE 'Unknown' | ||
END AS platform, | ||
(cr.cntrb_category = 'ForkEvent') AS forked, | ||
(cr.cntrb_category = 'WatchEvent') AS starred_or_watched, | ||
cr.created_at AS engagement_timestamp | ||
FROM | ||
augur_data.contributors c | ||
JOIN | ||
augur_data.contributor_repo cr ON cr.cntrb_id = c.cntrb_id | ||
WHERE | ||
cr.cntrb_category IN ('ForkEvent', 'WatchEvent') | ||
AND cr.repo_git = (SELECT repo_git FROM augur_data.repo WHERE repo_id = :repo_id) | ||
{time_filter} | ||
ORDER BY | ||
c.cntrb_login, cr.cntrb_category, cr.created_at; | ||
|
||
-- name: create_d0_materialized_view | ||
CREATE MATERIALIZED VIEW IF NOT EXISTS augur_data.d0_contributor_engagement AS | ||
SELECT DISTINCT ON (c.cntrb_login, cr.cntrb_category) | ||
c.cntrb_login AS username, | ||
c.cntrb_full_name AS full_name, | ||
c.cntrb_country_code AS country, | ||
CASE | ||
WHEN cr.repo_git ILIKE '%gitlab%' THEN 'GitLab' | ||
WHEN cr.repo_git ILIKE '%github%' THEN 'GitHub' | ||
ELSE 'Unknown' | ||
END AS platform, | ||
cr.cntrb_category = 'ForkEvent' AS forked, | ||
cr.cntrb_category = 'WatchEvent' AS starred_or_watched, | ||
cr.created_at AS engagement_timestamp | ||
FROM | ||
augur_data.contributors c | ||
JOIN | ||
augur_data.contributor_repo cr ON cr.cntrb_id = c.cntrb_id | ||
WHERE | ||
cr.cntrb_category IN ('ForkEvent', 'WatchEvent') | ||
ORDER BY | ||
c.cntrb_login, cr.cntrb_category, cr.created_at; | ||
|
||
-- name: refresh_d0_materialized_view | ||
REFRESH MATERIALIZED VIEW augur_data.d0_contributor_engagement; | ||
|
||
|
||
-- name: d1_engagement_query | ||
SELECT | ||
c.cntrb_id, | ||
c.cntrb_login AS username, | ||
c.cntrb_full_name AS full_name, | ||
c.cntrb_country_code AS country, | ||
'GitHub' AS platform, | ||
MIN(i.created_at) AS first_issue_created_at, | ||
MIN(pr.pr_created_at) AS first_pr_opened_at, | ||
MIN(pm.msg_timestamp) AS first_pr_commented_at | ||
FROM | ||
augur_data.contributors c | ||
|
||
LEFT JOIN augur_data.issues i | ||
ON i.reporter_id = c.cntrb_id AND i.repo_id = :repo_id | ||
|
||
LEFT JOIN augur_data.pull_requests pr | ||
ON pr.pr_augur_contributor_id = c.cntrb_id AND pr.repo_id = :repo_id | ||
|
||
LEFT JOIN augur_data.pull_request_message_ref pmr | ||
ON pmr.pull_request_id = pr.pull_request_id | ||
LEFT JOIN augur_data.message pm | ||
ON pm.msg_id = pmr.msg_id AND pm.cntrb_id = c.cntrb_id AND pm.repo_id = :repo_id | ||
|
||
WHERE | ||
(i.created_at >= NOW() - INTERVAL '{time_filter}' | ||
OR pr.pr_created_at >= NOW() - INTERVAL '{time_filter}' | ||
OR pm.msg_timestamp >= NOW() - INTERVAL '{time_filter}') | ||
|
||
GROUP BY | ||
c.cntrb_id, c.cntrb_login, c.cntrb_full_name, c.cntrb_country_code | ||
HAVING | ||
MIN(i.created_at) IS NOT NULL | ||
OR MIN(pr.pr_created_at) IS NOT NULL | ||
OR MIN(pm.msg_timestamp) IS NOT NULL; | ||
|
||
-- name: create_d1_materialized_view | ||
CREATE MATERIALIZED VIEW IF NOT EXISTS augur_data.d1_contributor_engagement AS | ||
SELECT | ||
c.cntrb_login AS username, | ||
MIN(i.created_at) AS first_issue_created_at, | ||
MIN(pr.pr_created_at) AS first_pr_opened_at, | ||
MIN(pm.msg_timestamp) AS first_pr_commented_at | ||
FROM | ||
augur_data.contributors c | ||
LEFT JOIN augur_data.issues i | ||
ON i.reporter_id = c.cntrb_id | ||
LEFT JOIN augur_data.pull_requests pr | ||
ON pr.pr_augur_contributor_id = c.cntrb_id | ||
LEFT JOIN augur_data.pull_request_message_ref pmr | ||
ON pmr.pull_request_id = pr.pull_request_id | ||
LEFT JOIN augur_data.message pm | ||
ON pm.msg_id = pmr.msg_id AND pm.cntrb_id = c.cntrb_id | ||
WHERE | ||
(i.created_at >= NOW() - INTERVAL '1 year' | ||
OR pr.pr_created_at >= NOW() - INTERVAL '1 year' | ||
OR pm.msg_timestamp >= NOW() - INTERVAL '1 year') | ||
GROUP BY | ||
c.cntrb_login; | ||
|
||
-- name: refresh_d1_materialized_view | ||
REFRESH MATERIALIZED VIEW augur_data.d1_contributor_engagement; | ||
|
||
|
||
-- name: d2_engagement_query | ||
WITH pr_merged AS ( | ||
SELECT DISTINCT pr.pr_augur_contributor_id | ||
FROM augur_data.pull_requests pr | ||
WHERE pr.pr_merged_at IS NOT NULL AND pr.repo_id = :repo_id | ||
), | ||
|
||
issue_counts AS ( | ||
SELECT reporter_id AS cntrb_id, COUNT(*) AS issue_count | ||
FROM augur_data.issues | ||
WHERE repo_id = :repo_id | ||
GROUP BY reporter_id | ||
), | ||
|
||
comment_counts AS ( | ||
SELECT m.cntrb_id, COUNT(*) AS total_comments | ||
FROM augur_data.message m | ||
LEFT JOIN augur_data.issue_message_ref imr ON imr.msg_id = m.msg_id | ||
LEFT JOIN augur_data.pull_request_message_ref pmr ON pmr.msg_id = m.msg_id | ||
LEFT JOIN augur_data.issues i ON i.issue_id = imr.issue_id | ||
LEFT JOIN augur_data.pull_requests pr ON pr.pull_request_id = pmr.pull_request_id | ||
WHERE m.repo_id = :repo_id | ||
AND (i.repo_id = :repo_id OR pr.repo_id = :repo_id) | ||
AND (imr.issue_id IS NOT NULL OR pmr.pull_request_id IS NOT NULL) | ||
GROUP BY m.cntrb_id | ||
), | ||
|
||
pr_commits_over_3 AS ( | ||
SELECT pr.pr_augur_contributor_id AS cntrb_id | ||
FROM augur_data.pull_requests pr | ||
JOIN augur_data.pull_request_commits prc ON prc.pull_request_id = pr.pull_request_id | ||
WHERE pr.repo_id = :repo_id | ||
GROUP BY pr.pr_augur_contributor_id, pr.pull_request_id | ||
HAVING COUNT(prc.pr_cmt_sha) > 3 | ||
), | ||
|
||
commented_on_multiple_prs AS ( | ||
SELECT m.cntrb_id | ||
FROM augur_data.message m | ||
JOIN augur_data.pull_request_message_ref pmr ON pmr.msg_id = m.msg_id | ||
JOIN augur_data.pull_requests pr ON pr.pull_request_id = pmr.pull_request_id | ||
WHERE m.repo_id = :repo_id AND pr.repo_id = :repo_id | ||
GROUP BY m.cntrb_id | ||
HAVING COUNT(DISTINCT pmr.pull_request_id) > 2 | ||
) | ||
|
||
SELECT | ||
c.cntrb_id, | ||
c.cntrb_login AS username, | ||
c.cntrb_full_name AS full_name, | ||
c.cntrb_country_code AS country, | ||
'GitHub' AS platform, | ||
CASE WHEN pm.pr_augur_contributor_id IS NOT NULL THEN true ELSE false END AS has_merged_pr, | ||
CASE WHEN ic.issue_count > 5 THEN true ELSE false END AS created_many_issues, | ||
COALESCE(cc.total_comments, 0) AS total_comments, | ||
CASE WHEN pco3.cntrb_id IS NOT NULL THEN true ELSE false END AS has_pr_with_many_commits, | ||
CASE WHEN cmp.cntrb_id IS NOT NULL THEN true ELSE false END AS commented_on_multiple_prs | ||
FROM augur_data.contributors c | ||
LEFT JOIN pr_merged pm ON pm.pr_augur_contributor_id = c.cntrb_id | ||
LEFT JOIN issue_counts ic ON ic.cntrb_id = c.cntrb_id | ||
LEFT JOIN comment_counts cc ON cc.cntrb_id = c.cntrb_id | ||
LEFT JOIN pr_commits_over_3 pco3 ON pco3.cntrb_id = c.cntrb_id | ||
LEFT JOIN commented_on_multiple_prs cmp ON cmp.cntrb_id = c.cntrb_id | ||
WHERE (pm.pr_augur_contributor_id IS NOT NULL | ||
OR ic.cntrb_id IS NOT NULL | ||
OR cc.cntrb_id IS NOT NULL | ||
OR pco3.cntrb_id IS NOT NULL | ||
OR cmp.cntrb_id IS NOT NULL); | ||
|
||
-- name: create_d2_materialized_view | ||
CREATE MATERIALIZED VIEW IF NOT EXISTS augur_data.d2_contributor_engagement AS | ||
WITH pr_merged AS ( | ||
SELECT DISTINCT pr.pr_augur_contributor_id | ||
FROM augur_data.pull_requests pr | ||
WHERE pr.pr_merged_at IS NOT NULL | ||
), | ||
|
||
issue_counts AS ( | ||
SELECT reporter_id AS cntrb_id, COUNT(*) AS issue_count | ||
FROM augur_data.issues | ||
GROUP BY reporter_id | ||
), | ||
|
||
comment_counts AS ( | ||
SELECT m.cntrb_id, COUNT(*) AS total_comments | ||
FROM augur_data.message m | ||
LEFT JOIN augur_data.issue_message_ref imr ON imr.msg_id = m.msg_id | ||
LEFT JOIN augur_data.pull_request_message_ref pmr ON pmr.msg_id = m.msg_id | ||
WHERE imr.issue_id IS NOT NULL OR pmr.pull_request_id IS NOT NULL | ||
GROUP BY m.cntrb_id | ||
), | ||
|
||
pr_commits_over_3 AS ( | ||
SELECT pr.pr_augur_contributor_id AS cntrb_id | ||
FROM augur_data.pull_requests pr | ||
JOIN augur_data.pull_request_commits prc ON prc.pull_request_id = pr.pull_request_id | ||
GROUP BY pr.pr_augur_contributor_id, pr.pull_request_id | ||
HAVING COUNT(prc.pr_cmt_sha) > 3 | ||
), | ||
|
||
commented_on_multiple_prs AS ( | ||
SELECT m.cntrb_id | ||
FROM augur_data.message m | ||
JOIN augur_data.pull_request_message_ref pmr ON pmr.msg_id = m.msg_id | ||
GROUP BY m.cntrb_id | ||
HAVING COUNT(DISTINCT pmr.pull_request_id) > 2 | ||
) | ||
|
||
SELECT | ||
c.cntrb_login AS username, | ||
CASE WHEN pm.pr_augur_contributor_id IS NOT NULL THEN true ELSE false END AS has_merged_pr, | ||
CASE WHEN ic.issue_count > 5 THEN true ELSE false END AS created_many_issues, | ||
COALESCE(cc.total_comments, 0) AS total_comments, | ||
CASE WHEN pco3.cntrb_id IS NOT NULL THEN true ELSE false END AS has_pr_with_many_commits, | ||
CASE WHEN cmp.cntrb_id IS NOT NULL THEN true ELSE false END AS commented_on_multiple_prs | ||
FROM augur_data.contributors c | ||
LEFT JOIN pr_merged pm ON pm.pr_augur_contributor_id = c.cntrb_id | ||
LEFT JOIN issue_counts ic ON ic.cntrb_id = c.cntrb_id | ||
LEFT JOIN comment_counts cc ON cc.cntrb_id = c.cntrb_id | ||
LEFT JOIN pr_commits_over_3 pco3 ON pco3.cntrb_id = c.cntrb_id | ||
LEFT JOIN commented_on_multiple_prs cmp ON cmp.cntrb_id = c.cntrb_id; | ||
|
||
-- name: refresh_d2_materialized_view | ||
REFRESH MATERIALIZED VIEW augur_data.d2_contributor_engagement; | ||
|
||
-- name: create_contributor_engagement_table | ||
CREATE TABLE IF NOT EXISTS augur_data.contributor_engagement ( | ||
engagement_id BIGSERIAL PRIMARY KEY, | ||
repo_id BIGINT NOT NULL REFERENCES augur_data.repo(repo_id), | ||
cntrb_id UUID NOT NULL REFERENCES augur_data.contributors(cntrb_id), | ||
username VARCHAR NOT NULL, | ||
full_name VARCHAR, | ||
country VARCHAR, | ||
platform VARCHAR, | ||
d0_forked BOOLEAN DEFAULT FALSE, | ||
d0_starred_or_watched BOOLEAN DEFAULT FALSE, | ||
d0_engagement_timestamp TIMESTAMP(6), | ||
d1_first_issue_created_at TIMESTAMP(6), | ||
d1_first_pr_opened_at TIMESTAMP(6), | ||
d1_first_pr_commented_at TIMESTAMP(6), | ||
d2_has_merged_pr BOOLEAN DEFAULT FALSE, | ||
d2_created_many_issues BOOLEAN DEFAULT FALSE, | ||
d2_total_comments BIGINT DEFAULT 0, | ||
d2_has_pr_with_many_commits BOOLEAN DEFAULT FALSE, | ||
d2_commented_on_multiple_prs BOOLEAN DEFAULT FALSE, | ||
tool_source VARCHAR, | ||
tool_version VARCHAR, | ||
data_source VARCHAR, | ||
data_collection_date TIMESTAMP(6) DEFAULT CURRENT_TIMESTAMP, | ||
UNIQUE(repo_id, cntrb_id) | ||
); | ||
|
||
-- name: create_contributor_engagement_indexes | ||
CREATE INDEX IF NOT EXISTS idx_contributor_engagement_repo_id ON augur_data.contributor_engagement(repo_id); | ||
CREATE INDEX IF NOT EXISTS idx_contributor_engagement_cntrb_id ON augur_data.contributor_engagement(cntrb_id); | ||
CREATE INDEX IF NOT EXISTS idx_contributor_engagement_username ON augur_data.contributor_engagement(username); | ||
CREATE INDEX IF NOT EXISTS idx_contributor_engagement_platform ON augur_data.contributor_engagement(platform); | ||
|
||
|
||
-- name: create_contributor_engagement_sequence | ||
CREATE SEQUENCE IF NOT EXISTS augur_data.contributor_engagement_engagement_id_seq | ||
OWNED BY augur_data.contributor_engagement.engagement_id; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@officialasishkumar : New database objects should be in a file in
{repo root}/augur/application/schema/alembic/versions
I think with the PR open for the other GSOC team the next number in sequence is 35.
That enables alembic upgrades and downgrade.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@sgoggins updated with commit 299bd90