Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update main #700

Merged
merged 18 commits into from
Apr 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions .wordlist-md
Original file line number Diff line number Diff line change
Expand Up @@ -72,3 +72,11 @@ Docker's
data-center
OAuth
postgres
Podman
filesystem
credsStore
credStore
filesystem
Podman
credsStore
credStore
8 changes: 5 additions & 3 deletions 8Knot/pages/chaoss/visualizations/project_velocity.py
Original file line number Diff line number Diff line change
Expand Up @@ -331,9 +331,11 @@ def process_data(
# replace all nan to 0
df_consolidated.fillna(value=0, inplace=True)

# log of commits and contribs
df_consolidated["log_num_commits"] = df_consolidated["Commit"].apply(math.log)
df_consolidated["log_num_contrib"] = df_consolidated["num_unique_contributors"].apply(math.log)
# log of commits and contribs if values are not 0
df_consolidated["log_num_commits"] = df_consolidated["Commit"].apply(lambda x: math.log(x) if x != 0 else 0)
df_consolidated["log_num_contrib"] = df_consolidated["num_unique_contributors"].apply(
lambda x: math.log(x) if x != 0 else 0
)

# column to hold the weighted values of pr and issues actions summed together
df_consolidated["prs_issues_actions_weighted"] = (
Expand Down
4 changes: 2 additions & 2 deletions 8Knot/pages/codebase/codebase.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,14 @@
[
dbc.Row(
[
dbc.Col(gc_cntrb_file_heatmap, width=12),
dbc.Col(gc_contribution_file_heatmap, width=12),
],
align="center",
style={"marginBottom": ".5%"},
),
dbc.Row(
[
dbc.Col(gc_contribution_file_heatmap, width=12),
dbc.Col(gc_cntrb_file_heatmap, width=12),
],
align="center",
style={"marginBottom": ".5%"},
Expand Down
161 changes: 133 additions & 28 deletions 8Knot/pages/codebase/visualizations/cntrb_file_heatmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@
classNames={"values": "dmc-multiselect-custom"},
searchable=True,
clearable=False,
value="Top Level Directory",
),
],
className="me-2",
Expand Down Expand Up @@ -215,7 +216,7 @@ def directory_dropdown(repo_id):

# add top level directory to the list of directories
directories.insert(0, "Top Level Directory")
logging.warning(f"DIRECTORY DROPDOWN - FINISHED")
logging.warning(f"CNTRB DIRECTORY DROPDOWN - FINISHED")

return directories, "Top Level Directory"

Expand All @@ -224,18 +225,19 @@ def directory_dropdown(repo_id):
@callback(
Output(f"{PAGE}-{VIZ_ID}", "figure"),
[
Input("repo-choices", "data"),
Input(f"repo-{PAGE}-{VIZ_ID}", "value"),
Input(f"directory-{PAGE}-{VIZ_ID}", "value"),
Input("bot-switch", "value"),
],
background=True,
)
def cntrb_file_heatmap_graph(repo_id, directory, bot_switch):
def cntrb_file_heatmap_graph(searchbar_repos, repo_id, directory, bot_switch):
start = time.perf_counter()
logging.warning(f"{VIZ_ID}- START")

# get dataframes of data from cache
df_file, df_actions, df_file_cntbs = multi_query_helper([repo_id])
df_file, df_actions, df_file_cntbs = multi_query_helper(searchbar_repos, [repo_id])

# test if there is data
if df_file.empty or df_actions.empty or df_file_cntbs.empty:
Expand All @@ -255,40 +257,40 @@ def cntrb_file_heatmap_graph(repo_id, directory, bot_switch):
return fig


def multi_query_helper(repos):
def multi_query_helper(searchbar_repos, repo):
"""
For cntrb_file_heatmap_graph-
hack to put all of the cache-retrieval
in the same place temporarily
"""

# wait for data to asynchronously download and become available.
while not_cached := cf.get_uncached(func_name=rfq.__name__, repolist=repos):
while not_cached := cf.get_uncached(func_name=rfq.__name__, repolist=repo):
logging.warning(f"CONTRIBUTOR FILE HEATMAP - WAITING ON DATA TO BECOME AVAILABLE")
time.sleep(0.5)

# wait for data to asynchronously download and become available.
while not_cached := cf.get_uncached(func_name=cnq.__name__, repolist=repos):
while not_cached := cf.get_uncached(func_name=cnq.__name__, repolist=searchbar_repos):
logging.warning(f"CONTRIBUTOR FILE HEATMAP - WAITING ON DATA TO BECOME AVAILABLE")
time.sleep(0.5)

# wait for data to asynchronously download and become available.
while not_cached := cf.get_uncached(func_name=cpfq.__name__, repolist=repos):
while not_cached := cf.get_uncached(func_name=cpfq.__name__, repolist=repo):
logging.warning(f"CONTRIBUTOR FILE HEATMAP - WAITING ON DATA TO BECOME AVAILABLE")
time.sleep(0.5)

# GET ALL DATA FROM POSTGRES CACHE
df_file = cf.retrieve_from_cache(
tablename=rfq.__name__,
repolist=repos,
repolist=repo,
)
df_actions = cf.retrieve_from_cache(
tablename=cnq.__name__,
repolist=repos,
repolist=searchbar_repos,
)
df_file_cntrbs = cf.retrieve_from_cache(
tablename=cpfq.__name__,
repolist=repos,
repolist=repo,
)

# necessary preprocessing steps that were lifted out of the querying step
Expand All @@ -305,6 +307,64 @@ def process_data(
directory,
bot_switch,
):
"""
Processing steps

1 - Cleans up file data to only include current files and relate files in the repository to the contributors who have reviewed them in past PRs.
2 - For a given level in the directory tree, aggregate the list of contributors for sub-directories and for individual files at the level.
3 - For each contributor, identify their most recent contribution.
4 - Transforms dataframe where columns are months with counts of "last seen" dates in that month and the rows are the file/subdirectory
"""

df_file = df_file_clean(df_file, df_file_cntbs, bot_switch)

df_dynamic_directory = cntrb_per_directory_value(directory, df_file)

# work around for using functions, will clean later
if df_dynamic_directory.empty:
return df_dynamic_directory

df_dynamic_directory = cntrb_to_last_activity(df_actions, df_dynamic_directory)

final = file_cntrb_activity_by_month(df_dynamic_directory, df_actions)

return final


def create_figure(df: pd.DataFrame):
    """Build the contributor-activity heatmap figure.

    Rows of *df* are directory entries, columns are time buckets, and the
    cell value (color) is the number of contributors "last seen" in that
    bucket.

    Args:
    -----
    df : Pandas Dataframe
        Pivoted dataframe of contributor counts per entry per month.

    Returns:
    --------
    plotly Figure with the rendered heatmap.
    """
    heatmap = px.imshow(
        df,
        labels=dict(x="Time", y="Directory Entries", color="Contributors"),
        color_continuous_scale=px.colors.sequential.deep,
    )

    # Fixed layout tweaks: label every row, move the colorbar to the left
    # of the plot, and place the y-axis tick labels on the right.
    heatmap.update_layout(
        height=700,
        coloraxis_colorbar_x=-0.15,
        yaxis_tickmode="linear",
        yaxis_side="right",
    )

    return heatmap


def df_file_clean(df_file: pd.DataFrame, df_file_cntbs: pd.DataFrame, bot_switch):
"""
This function cleans the df_file data and combines it with the related cntrb_ids

Args:
-----
df_file : Pandas Dataframe
Dataframe with the output of the repo_files_query

df_file_cntbs : Pandas Dataframe
Dataframe with the output of the cntrb_per_file_query

bot_switch : boolean
T/F for the status of the bot switch

Returns:
--------
df_file: df with file and cntrb_ids of contributors that reviewed a pr with that file in it
"""
# strings to hold the values for each column (always the same for every row of this query)
repo_name = df_file["repo_name"].iloc[0]
repo_path = df_file["repo_path"].iloc[0]
Expand All @@ -326,7 +386,7 @@ def process_data(
df_file_cntbs.drop(["repo_id", "reviewer_ids"], axis=1, inplace=True)

# Left join on df_files to only get the files that are currently in the repository
# and the contributors that have ever opened a pr that included edits on the file
# and the contributors that have ever reviewed a pr that included edits on the file
df_file = pd.merge(df_file, df_file_cntbs, on="file_path", how="left")
# replace nan with empty string to avoid errors in list comprehension
df_file.cntrb_ids.fillna("", inplace=True)
Expand All @@ -343,6 +403,26 @@ def process_data(
axis=1,
)

return df_file


def cntrb_per_directory_value(directory, df_file):
"""
This function gets the files in the specified directory, groups together any files in
subdirectories, and creates a list of their contributors cntrb_ids

Args:
-----
directory : string
Output from the directory drop down

df_file : Pandas Dataframe
Dataframe with file and related cntrb_id information

Returns:
--------
df_dynamic_directory: df with the file and subdirectories and their reviewers cntrb_ids
"""
# determine directory level to use in later step
level = directory.count("/")
if directory == "Top Level Directory":
Expand Down Expand Up @@ -377,6 +457,25 @@ def process_data(
lambda row: set(row.cntrb_ids),
axis=1,
)
return df_dynamic_directory


def cntrb_to_last_activity(df_actions: pd.DataFrame, df_dynamic_directory: pd.DataFrame):
"""
This function creates a df with the files and the dates of the most recent activity for each cntrb_id.

Args:
-----
df_actions : Pandas Dataframe
Dataframe with contributor activity

df_dynamic_directory : Pandas Dataframe
Dataframe with file and related cntrb_id information

Returns:
--------
df_dynamic_directory: df with the file and subdirectories and the dates of the most recent activity for the reviewers.
"""

# date reformating
df_actions["created_at"] = pd.to_datetime(df_actions["created_at"], utc=True)
Expand Down Expand Up @@ -406,6 +505,26 @@ def process_data(
# most recent activity - preprocessing step
df_dynamic_directory = df_dynamic_directory.explode("dates")

return df_dynamic_directory


def file_cntrb_activity_by_month(df_dynamic_directory: pd.DataFrame, df_actions: pd.DataFrame):
"""
This function transforms the df_dynamic_directory to be counts of "last seen" contributors by month.

Args:
-----
df_actions : Pandas Dataframe
Dataframe with contributor activity

df_dynamic_directory : Pandas Dataframe
Dataframe with file and related cntrb_id information

Returns:
--------
df_final: df with files and subdirectories as rows and the months as columns
"""

# get files that have no contributors and remove from set to prevent errors in grouper function
no_contribs = df_dynamic_directory["directory_value"][df_dynamic_directory.dates.isnull()].tolist()

Expand All @@ -415,8 +534,9 @@ def process_data(
there will be a column for every month even if there is no "last contribution" date in it. This greatly
improves the heatmap plotting"""

# dates based on action so it represents the length of the project
min_date = df_actions.created_at.min()
# dates based on action so it represents the length of the project, min based on PR
# open date to avoid committer inputted dates
min_date = df_actions[df_actions["Action"] == "PR Opened"].created_at.min()
max_date = df_actions.created_at.max()
dates = pd.date_range(start=min_date, end=max_date, freq="M", inclusive="both")
df_fill = dates.to_frame(index=False, name="dates")
Expand All @@ -436,18 +556,3 @@ def process_data(
final.loc[files] = None

return final


def create_figure(df: pd.DataFrame):
    """Render *df* as a contributor "last seen" heatmap.

    Args:
    -----
    df : Pandas Dataframe
        Pivoted dataframe: directory entries as rows, months as columns,
        contributor counts as values.

    Returns:
    --------
    plotly Figure for display in the dashboard.
    """
    axis_labels = dict(x="Time", y="Directory Entries", color="Contributors")
    fig = px.imshow(
        df,
        labels=axis_labels,
        color_continuous_scale=px.colors.sequential.deep,
    )

    # Show a tick for every row and keep the labels on the right edge.
    fig.update_yaxes(tickmode="linear", side="right")
    # Tall plot with the colorbar shifted to the left of the axis area.
    fig.update_layout(height=700, coloraxis_colorbar_x=-0.15)

    return fig
Loading
Loading