Skip to content

Commit

Permalink
Merge pull request #661 from oss-aspen/dev
Browse files Browse the repository at this point in the history
push dev to main
  • Loading branch information
cdolfi authored Mar 12, 2024
2 parents 5f8dbe6 + 97f9526 commit 4fcefe6
Show file tree
Hide file tree
Showing 47 changed files with 1,028 additions and 384 deletions.
4 changes: 4 additions & 0 deletions .wordlist-md
Original file line number Diff line number Diff line change
Expand Up @@ -68,3 +68,7 @@ yml
ansible
gitlab
SSL
Docker's
data-center
OAuth
postgres
2 changes: 1 addition & 1 deletion 8Knot/cache_manager/cache_facade.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,7 +219,7 @@ def retrieve_from_cache(
"""
SELECT *
FROM {tablename} t
WHERE t.id IN %s;
WHERE t.repo_id IN %s;
""".format(
tablename=tablename
),
Expand Down
72 changes: 37 additions & 35 deletions 8Knot/cache_manager/db_init.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,10 +121,10 @@ def _create_application_tables() -> None:
cur.execute(
"""
CREATE UNLOGGED TABLE IF NOT EXISTS commits_query(
id int,
commits text, -- this is the commit hash, so it's base64 hash.
repo_id int,
commit_hash text, -- this is the commit hash, so it's a base64 hash.
author_email text,
date text,
author_date text,
author_timestamp text,
committer_timestamp text)
"""
Expand All @@ -134,15 +134,15 @@ def _create_application_tables() -> None:
cur.execute(
"""
CREATE UNLOGGED TABLE IF NOT EXISTS issues_query(
id int,
repo_id bigint,
repo_name text,
issue int,
issue_number int,
gh_issue int,
issue bigint,
issue_number bigint,
gh_issue bigint,
reporter_id text,
issue_closer text,
created text,
closed text
created_at text,
closed_at text
)
"""
)
Expand All @@ -151,14 +151,14 @@ def _create_application_tables() -> None:
cur.execute(
"""
CREATE UNLOGGED TABLE IF NOT EXISTS prs_query(
id int,
repo_id int,
repo_name text,
pull_request int,
pull_request_id int,
pr_src_number int,
cntrb_id text,
created text,
closed text,
merged text
created_at text,
closed_at text,
merged_at text
)
"""
)
Expand All @@ -168,8 +168,8 @@ def _create_application_tables() -> None:
"""
CREATE UNLOGGED TABLE IF NOT EXISTS affiliation_query(
cntrb_id text,
created text,
id int,
created_at text,
repo_id int,
login text,
action text,
rank int,
Expand All @@ -183,7 +183,7 @@ def _create_application_tables() -> None:
cur.execute(
"""
CREATE UNLOGGED TABLE IF NOT EXISTS contributors_query(
id int,
repo_id int,
repo_name text,
cntrb_id text,
created_at text,
Expand All @@ -199,9 +199,9 @@ def _create_application_tables() -> None:
"""
CREATE UNLOGGED TABLE IF NOT EXISTS issue_assignee_query(
issue_id text,
id int,
created text,
closed text,
repo_id int,
created_at text,
closed_at text,
assign_date text,
assignment_action text,
assignee text
Expand All @@ -214,9 +214,9 @@ def _create_application_tables() -> None:
"""
CREATE UNLOGGED TABLE IF NOT EXISTS pr_assignee_query(
pull_request_id int,
id int,
created text,
closed text,
repo_id int,
created_at text,
closed_at text,
assign_date text,
assignment_action text,
assignee text
Expand All @@ -228,9 +228,10 @@ def _create_application_tables() -> None:
cur.execute(
"""
CREATE UNLOGGED TABLE IF NOT EXISTS cntrb_per_file_query(
repo_id int,
file_path text,
id int,
cntrb_ids text
cntrb_ids text,
reviewer_ids text
)
"""
)
Expand All @@ -240,8 +241,8 @@ def _create_application_tables() -> None:
"""
CREATE UNLOGGED TABLE IF NOT EXISTS pr_file_query(
file_path text,
pull_request int,
id int
pull_request_id int,
repo_id int
)
"""
)
Expand All @@ -250,7 +251,7 @@ def _create_application_tables() -> None:
cur.execute(
"""
CREATE UNLOGGED TABLE IF NOT EXISTS repo_files_query(
id int,
repo_id int,
repo_name text,
repo_path text,
rl_analysis_date text,
Expand All @@ -264,7 +265,7 @@ def _create_application_tables() -> None:
cur.execute(
"""
CREATE UNLOGGED TABLE IF NOT EXISTS repo_languages_query(
id int,
repo_id int,
programming_language text,
code_lines int,
files int
Expand All @@ -276,11 +277,12 @@ def _create_application_tables() -> None:
cur.execute(
"""
CREATE UNLOGGED TABLE IF NOT EXISTS package_version_query(
id int,
repo_id int,
name text,
current_release_date text,
latest_release_date text,
libyear float4
libyear float4,
dep_age text
)
"""
)
Expand All @@ -289,7 +291,7 @@ def _create_application_tables() -> None:
cur.execute(
"""
CREATE UNLOGGED TABLE IF NOT EXISTS repo_releases_query(
id int,
repo_id int,
release_name text,
release_created_at text,
release_published_at text,
Expand All @@ -302,7 +304,7 @@ def _create_application_tables() -> None:
cur.execute(
"""
CREATE UNLOGGED TABLE IF NOT EXISTS ossf_score_query(
id int,
repo_id int,
name text,
score float4
)
Expand All @@ -313,7 +315,7 @@ def _create_application_tables() -> None:
cur.execute(
"""
CREATE UNLOGGED TABLE IF NOT EXISTS repo_info_query(
id int,
repo_id int,
issues_enabled text,
fork_count int,
watchers_count int,
Expand All @@ -331,7 +333,7 @@ def _create_application_tables() -> None:
"""
CREATE UNLOGGED TABLE IF NOT EXISTS pr_response_query(
pull_request_id int,
ID int,
repo_id int,
cntrb_id text,
msg_timestamp text,
msg_cntrb_id text,
Expand Down
4 changes: 2 additions & 2 deletions 8Knot/pages/affiliation/visualizations/commit_domains.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,10 +187,10 @@ def process_data(df: pd.DataFrame, num, start_date, end_date):
# creates df of domains and counts
df = pd.DataFrame(email_domains, columns=["domains"]).value_counts().to_frame().reset_index()

df = df.rename(columns={0: "occurrences"})
df = df.rename(columns={"count": "occurrences"})

# changes the name of the company if under a certain threshold
df.loc[df.occurrences <= num, "domains"] = "Other"
df.loc[df["occurrences"] <= num, "domains"] = "Other"

# groups others together for final counts
df = (
Expand Down
12 changes: 6 additions & 6 deletions 8Knot/pages/affiliation/visualizations/gh_org_affiliation.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,16 +168,16 @@ def process_data(df: pd.DataFrame, num, start_date, end_date):
requiring no further processing."""

# convert to datetime objects rather than strings
df["created"] = pd.to_datetime(df["created"], utc=True)
df["created_at"] = pd.to_datetime(df["created_at"], utc=True)

# order values chronologically by COLUMN_TO_SORT_BY date
df = df.sort_values(by="created", axis=0, ascending=True)
df = df.sort_values(by="created_at", axis=0, ascending=True)

# filter values based on date picker
if start_date is not None:
df = df[df.created >= start_date]
df = df[df.created_at >= start_date]
if end_date is not None:
df = df[df.created <= end_date]
df = df[df.created_at <= end_date]

    # initial count of same company name in github profile
result = df.cntrb_company.value_counts(dropna=False)
Expand All @@ -187,7 +187,7 @@ def process_data(df: pd.DataFrame, num, start_date, end_date):
df["company_name"] = df.index
df = df.reset_index()
df["company_name"] = df["company_name"].astype(str)
df = df.rename(columns={"index": "orginal_name", "cntrb_company": "contribution_count"})
df = df.rename(columns={"cntrb_company": "orginal_name", "count": "contribution_count"})

# applies fuzzy matching comparing all rows to each other
df["match"] = df.apply(lambda row: fuzzy_match(df, row["company_name"]), axis=1)
Expand All @@ -212,7 +212,7 @@ def process_data(df: pd.DataFrame, num, start_date, end_date):
)

# changes the name of the company if under a certain threshold
df.loc[df.contribution_count <= num, "company_name"] = "Other"
df.loc[df["contribution_count"] <= num, "company_name"] = "Other"

# groups others together for final counts
df = (
Expand Down
20 changes: 13 additions & 7 deletions 8Knot/pages/affiliation/visualizations/org_associated_activity.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,8 +82,14 @@
dbc.Checklist(
id=f"email-filter-{PAGE}-{VIZ_ID}",
options=[
{"label": "Exclude Gmail", "value": "gmail"},
{"label": "Exclude GitHub", "value": "github"},
{
"label": "Exclude Gmail",
"value": "gmail",
},
{
"label": "Exclude GitHub",
"value": "github",
},
],
value=[""],
inline=True,
Expand Down Expand Up @@ -201,16 +207,16 @@ def org_associated_activity_graph(repolist, num, start_date, end_date, email_fil

def process_data(df: pd.DataFrame, num, start_date, end_date, email_filter):
# convert to datetime objects rather than strings
df["created"] = pd.to_datetime(df["created"], utc=True)
df["created_at"] = pd.to_datetime(df["created_at"], utc=True)

# order values chronologically by COLUMN_TO_SORT_BY date
df = df.sort_values(by="created", axis=0, ascending=True)
df = df.sort_values(by="created_at", axis=0, ascending=True)

# filter values based on date picker
if start_date is not None:
df = df[df.created >= start_date]
df = df[df.created_at >= start_date]
if end_date is not None:
df = df[df.created <= end_date]
df = df[df.created_at <= end_date]

# creates list of emails for each contribution and flattens list result
emails = df.email_list.str.split(" , ").explode("email_list").tolist()
Expand All @@ -224,7 +230,7 @@ def process_data(df: pd.DataFrame, num, start_date, end_date, email_filter):
# creates df of domains and counts
df = pd.DataFrame(email_domains, columns=["domains"]).value_counts().to_frame().reset_index()

df = df.rename(columns={0: "occurrences"})
df = df.rename(columns={"count": "occurrences"})

# changes the name of the organization if under a certain threshold
df.loc[df.occurrences <= num, "domains"] = "Other"
Expand Down
32 changes: 22 additions & 10 deletions 8Knot/pages/affiliation/visualizations/org_core_contributors.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,8 +107,14 @@
dbc.Checklist(
id=f"email-filter-{PAGE}-{VIZ_ID}",
options=[
{"label": "Exclude Gmail", "value": "gmail"},
{"label": "Exclude GitHub", "value": "github"},
{
"label": "Exclude Gmail",
"value": "gmail",
},
{
"label": "Exclude GitHub",
"value": "github",
},
],
value=[""],
inline=True,
Expand Down Expand Up @@ -165,7 +171,13 @@ def toggle_popover(n, is_open):
background=True,
)
def compay_associated_activity_graph(
repolist, contributions, contributors, start_date, end_date, email_filter, bot_switch
repolist,
contributions,
contributors,
start_date,
end_date,
email_filter,
bot_switch,
):
# wait for data to asynchronously download and become available.
while not_cached := cf.get_uncached(func_name=aq.__name__, repolist=repolist):
Expand Down Expand Up @@ -201,23 +213,23 @@ def compay_associated_activity_graph(

def process_data(df: pd.DataFrame, contributions, contributors, start_date, end_date, email_filter):
# convert to datetime objects rather than strings
df["created"] = pd.to_datetime(df["created"], utc=True)
df["created_at"] = pd.to_datetime(df["created_at"], utc=True)

# order values chronologically by COLUMN_TO_SORT_BY date
df = df.sort_values(by="created", axis=0, ascending=True)
df = df.sort_values(by="created_at", axis=0, ascending=True)

# filter values based on date picker
if start_date is not None:
df = df[df.created >= start_date]
df = df[df.created_at >= start_date]
if end_date is not None:
df = df[df.created <= end_date]
df = df[df.created_at <= end_date]

    # groups contributions by contributor id and counts, created column now holds the number
# of contributions for its respective contributor
df = df.groupby(["cntrb_id", "email_list"], as_index=False)[["created"]].count()
df = df.groupby(["cntrb_id", "email_list"], as_index=False)[["created_at"]].count()

    # filters out contributors that don't meet the core contribution threshold
df = df[df.created >= contributions]
df = df[df.created_at >= contributions]

# creates list of unique emails and flattens list result
emails = df.email_list.str.split(" , ").explode("email_list").tolist()
Expand All @@ -231,7 +243,7 @@ def process_data(df: pd.DataFrame, contributions, contributors, start_date, end_
# creates df of domains and counts
df = pd.DataFrame(email_domains, columns=["domains"]).value_counts().to_frame().reset_index()

df = df.rename(columns={0: "contributors"})
df = df.rename(columns={"count": "contributors"})

# changes the name of the org if under a certain threshold
df.loc[df.contributors <= contributors, "domains"] = "Other"
Expand Down
Loading

0 comments on commit 4fcefe6

Please sign in to comment.