Skip to content

Commit

Permalink
Merge pull request #661 from oss-aspen/dev
Browse files Browse the repository at this point in the history
push dev to main
  • Loading branch information
cdolfi authored Mar 12, 2024
2 parents 5f8dbe6 + 97f9526 commit 4fcefe6
Show file tree
Hide file tree
Showing 47 changed files with 1,028 additions and 384 deletions.
4 changes: 4 additions & 0 deletions .wordlist-md
Original file line number Diff line number Diff line change
Expand Up @@ -68,3 +68,7 @@ yml
ansible
gitlab
SSL
Docker's
data-center
OAuth
postgres
2 changes: 1 addition & 1 deletion 8Knot/cache_manager/cache_facade.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,7 +219,7 @@ def retrieve_from_cache(
"""
SELECT *
FROM {tablename} t
WHERE t.id IN %s;
WHERE t.repo_id IN %s;
""".format(
tablename=tablename
),
Expand Down
72 changes: 37 additions & 35 deletions 8Knot/cache_manager/db_init.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,10 +121,10 @@ def _create_application_tables() -> None:
cur.execute(
"""
CREATE UNLOGGED TABLE IF NOT EXISTS commits_query(
id int,
commits text, -- this is the commit hash, so it's base64 hash.
repo_id int,
commit_hash text, -- this is the commit hash, so it's a base64 hash.
author_email text,
date text,
author_date text,
author_timestamp text,
committer_timestamp text)
"""
Expand All @@ -134,15 +134,15 @@ def _create_application_tables() -> None:
cur.execute(
"""
CREATE UNLOGGED TABLE IF NOT EXISTS issues_query(
id int,
repo_id bigint,
repo_name text,
issue int,
issue_number int,
gh_issue int,
issue bigint,
issue_number bigint,
gh_issue bigint,
reporter_id text,
issue_closer text,
created text,
closed text
created_at text,
closed_at text
)
"""
)
Expand All @@ -151,14 +151,14 @@ def _create_application_tables() -> None:
cur.execute(
"""
CREATE UNLOGGED TABLE IF NOT EXISTS prs_query(
id int,
repo_id int,
repo_name text,
pull_request int,
pull_request_id int,
pr_src_number int,
cntrb_id text,
created text,
closed text,
merged text
created_at text,
closed_at text,
merged_at text
)
"""
)
Expand All @@ -168,8 +168,8 @@ def _create_application_tables() -> None:
"""
CREATE UNLOGGED TABLE IF NOT EXISTS affiliation_query(
cntrb_id text,
created text,
id int,
created_at text,
repo_id int,
login text,
action text,
rank int,
Expand All @@ -183,7 +183,7 @@ def _create_application_tables() -> None:
cur.execute(
"""
CREATE UNLOGGED TABLE IF NOT EXISTS contributors_query(
id int,
repo_id int,
repo_name text,
cntrb_id text,
created_at text,
Expand All @@ -199,9 +199,9 @@ def _create_application_tables() -> None:
"""
CREATE UNLOGGED TABLE IF NOT EXISTS issue_assignee_query(
issue_id text,
id int,
created text,
closed text,
repo_id int,
created_at text,
closed_at text,
assign_date text,
assignment_action text,
assignee text
Expand All @@ -214,9 +214,9 @@ def _create_application_tables() -> None:
"""
CREATE UNLOGGED TABLE IF NOT EXISTS pr_assignee_query(
pull_request_id int,
id int,
created text,
closed text,
repo_id int,
created_at text,
closed_at text,
assign_date text,
assignment_action text,
assignee text
Expand All @@ -228,9 +228,10 @@ def _create_application_tables() -> None:
cur.execute(
"""
CREATE UNLOGGED TABLE IF NOT EXISTS cntrb_per_file_query(
repo_id int,
file_path text,
id int,
cntrb_ids text
cntrb_ids text,
reviewer_ids text
)
"""
)
Expand All @@ -240,8 +241,8 @@ def _create_application_tables() -> None:
"""
CREATE UNLOGGED TABLE IF NOT EXISTS pr_file_query(
file_path text,
pull_request int,
id int
pull_request_id int,
repo_id int
)
"""
)
Expand All @@ -250,7 +251,7 @@ def _create_application_tables() -> None:
cur.execute(
"""
CREATE UNLOGGED TABLE IF NOT EXISTS repo_files_query(
id int,
repo_id int,
repo_name text,
repo_path text,
rl_analysis_date text,
Expand All @@ -264,7 +265,7 @@ def _create_application_tables() -> None:
cur.execute(
"""
CREATE UNLOGGED TABLE IF NOT EXISTS repo_languages_query(
id int,
repo_id int,
programming_language text,
code_lines int,
files int
Expand All @@ -276,11 +277,12 @@ def _create_application_tables() -> None:
cur.execute(
"""
CREATE UNLOGGED TABLE IF NOT EXISTS package_version_query(
id int,
repo_id int,
name text,
current_release_date text,
latest_release_date text,
libyear float4
libyear float4,
dep_age text
)
"""
)
Expand All @@ -289,7 +291,7 @@ def _create_application_tables() -> None:
cur.execute(
"""
CREATE UNLOGGED TABLE IF NOT EXISTS repo_releases_query(
id int,
repo_id int,
release_name text,
release_created_at text,
release_published_at text,
Expand All @@ -302,7 +304,7 @@ def _create_application_tables() -> None:
cur.execute(
"""
CREATE UNLOGGED TABLE IF NOT EXISTS ossf_score_query(
id int,
repo_id int,
name text,
score float4
)
Expand All @@ -313,7 +315,7 @@ def _create_application_tables() -> None:
cur.execute(
"""
CREATE UNLOGGED TABLE IF NOT EXISTS repo_info_query(
id int,
repo_id int,
issues_enabled text,
fork_count int,
watchers_count int,
Expand All @@ -331,7 +333,7 @@ def _create_application_tables() -> None:
"""
CREATE UNLOGGED TABLE IF NOT EXISTS pr_response_query(
pull_request_id int,
ID int,
repo_id int,
cntrb_id text,
msg_timestamp text,
msg_cntrb_id text,
Expand Down
4 changes: 2 additions & 2 deletions 8Knot/pages/affiliation/visualizations/commit_domains.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,10 +187,10 @@ def process_data(df: pd.DataFrame, num, start_date, end_date):
# creates df of domains and counts
df = pd.DataFrame(email_domains, columns=["domains"]).value_counts().to_frame().reset_index()

df = df.rename(columns={0: "occurrences"})
df = df.rename(columns={"count": "occurrences"})

# changes the name of the company if under a certain threshold
df.loc[df.occurrences <= num, "domains"] = "Other"
df.loc[df["occurrences"] <= num, "domains"] = "Other"

# groups others together for final counts
df = (
Expand Down
12 changes: 6 additions & 6 deletions 8Knot/pages/affiliation/visualizations/gh_org_affiliation.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,16 +168,16 @@ def process_data(df: pd.DataFrame, num, start_date, end_date):
requiring no further processing."""

# convert to datetime objects rather than strings
df["created"] = pd.to_datetime(df["created"], utc=True)
df["created_at"] = pd.to_datetime(df["created_at"], utc=True)

# order values chronologically by COLUMN_TO_SORT_BY date
df = df.sort_values(by="created", axis=0, ascending=True)
df = df.sort_values(by="created_at", axis=0, ascending=True)

# filter values based on date picker
if start_date is not None:
df = df[df.created >= start_date]
df = df[df.created_at >= start_date]
if end_date is not None:
df = df[df.created <= end_date]
df = df[df.created_at <= end_date]

    # initial count of same company name in github profile
result = df.cntrb_company.value_counts(dropna=False)
Expand All @@ -187,7 +187,7 @@ def process_data(df: pd.DataFrame, num, start_date, end_date):
df["company_name"] = df.index
df = df.reset_index()
df["company_name"] = df["company_name"].astype(str)
df = df.rename(columns={"index": "orginal_name", "cntrb_company": "contribution_count"})
df = df.rename(columns={"cntrb_company": "orginal_name", "count": "contribution_count"})

# applies fuzzy matching comparing all rows to each other
df["match"] = df.apply(lambda row: fuzzy_match(df, row["company_name"]), axis=1)
Expand All @@ -212,7 +212,7 @@ def process_data(df: pd.DataFrame, num, start_date, end_date):
)

# changes the name of the company if under a certain threshold
df.loc[df.contribution_count <= num, "company_name"] = "Other"
df.loc[df["contribution_count"] <= num, "company_name"] = "Other"

# groups others together for final counts
df = (
Expand Down
20 changes: 13 additions & 7 deletions 8Knot/pages/affiliation/visualizations/org_associated_activity.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,8 +82,14 @@
dbc.Checklist(
id=f"email-filter-{PAGE}-{VIZ_ID}",
options=[
{"label": "Exclude Gmail", "value": "gmail"},
{"label": "Exclude GitHub", "value": "github"},
{
"label": "Exclude Gmail",
"value": "gmail",
},
{
"label": "Exclude GitHub",
"value": "github",
},
],
value=[""],
inline=True,
Expand Down Expand Up @@ -201,16 +207,16 @@ def org_associated_activity_graph(repolist, num, start_date, end_date, email_fil

def process_data(df: pd.DataFrame, num, start_date, end_date, email_filter):
# convert to datetime objects rather than strings
df["created"] = pd.to_datetime(df["created"], utc=True)
df["created_at"] = pd.to_datetime(df["created_at"], utc=True)

# order values chronologically by COLUMN_TO_SORT_BY date
df = df.sort_values(by="created", axis=0, ascending=True)
df = df.sort_values(by="created_at", axis=0, ascending=True)

# filter values based on date picker
if start_date is not None:
df = df[df.created >= start_date]
df = df[df.created_at >= start_date]
if end_date is not None:
df = df[df.created <= end_date]
df = df[df.created_at <= end_date]

# creates list of emails for each contribution and flattens list result
emails = df.email_list.str.split(" , ").explode("email_list").tolist()
Expand All @@ -224,7 +230,7 @@ def process_data(df: pd.DataFrame, num, start_date, end_date, email_filter):
# creates df of domains and counts
df = pd.DataFrame(email_domains, columns=["domains"]).value_counts().to_frame().reset_index()

df = df.rename(columns={0: "occurrences"})
df = df.rename(columns={"count": "occurrences"})

# changes the name of the organization if under a certain threshold
df.loc[df.occurrences <= num, "domains"] = "Other"
Expand Down
32 changes: 22 additions & 10 deletions 8Knot/pages/affiliation/visualizations/org_core_contributors.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,8 +107,14 @@
dbc.Checklist(
id=f"email-filter-{PAGE}-{VIZ_ID}",
options=[
{"label": "Exclude Gmail", "value": "gmail"},
{"label": "Exclude GitHub", "value": "github"},
{
"label": "Exclude Gmail",
"value": "gmail",
},
{
"label": "Exclude GitHub",
"value": "github",
},
],
value=[""],
inline=True,
Expand Down Expand Up @@ -165,7 +171,13 @@ def toggle_popover(n, is_open):
background=True,
)
def compay_associated_activity_graph(
repolist, contributions, contributors, start_date, end_date, email_filter, bot_switch
repolist,
contributions,
contributors,
start_date,
end_date,
email_filter,
bot_switch,
):
# wait for data to asynchronously download and become available.
while not_cached := cf.get_uncached(func_name=aq.__name__, repolist=repolist):
Expand Down Expand Up @@ -201,23 +213,23 @@ def compay_associated_activity_graph(

def process_data(df: pd.DataFrame, contributions, contributors, start_date, end_date, email_filter):
# convert to datetime objects rather than strings
df["created"] = pd.to_datetime(df["created"], utc=True)
df["created_at"] = pd.to_datetime(df["created_at"], utc=True)

# order values chronologically by COLUMN_TO_SORT_BY date
df = df.sort_values(by="created", axis=0, ascending=True)
df = df.sort_values(by="created_at", axis=0, ascending=True)

# filter values based on date picker
if start_date is not None:
df = df[df.created >= start_date]
df = df[df.created_at >= start_date]
if end_date is not None:
df = df[df.created <= end_date]
df = df[df.created_at <= end_date]

    # groups contributions by contributor id and counts, created column now holds the number
# of contributions for its respective contributor
df = df.groupby(["cntrb_id", "email_list"], as_index=False)[["created"]].count()
df = df.groupby(["cntrb_id", "email_list"], as_index=False)[["created_at"]].count()

    # filters out contributors that don't meet the core contribution threshold
df = df[df.created >= contributions]
df = df[df.created_at >= contributions]

# creates list of unique emails and flattens list result
emails = df.email_list.str.split(" , ").explode("email_list").tolist()
Expand All @@ -231,7 +243,7 @@ def process_data(df: pd.DataFrame, contributions, contributors, start_date, end_
# creates df of domains and counts
df = pd.DataFrame(email_domains, columns=["domains"]).value_counts().to_frame().reset_index()

df = df.rename(columns={0: "contributors"})
df = df.rename(columns={"count": "contributors"})

# changes the name of the org if under a certain threshold
df.loc[df.contributors <= contributors, "domains"] = "Other"
Expand Down
Loading

0 comments on commit 4fcefe6

Please sign in to comment.