From 4cc0acf71d35b764250825fa7e583dc35e739901 Mon Sep 17 00:00:00 2001
From: cdolfi <cdolfi@redhat.com>
Date: Fri, 1 Mar 2024 18:02:48 -0500
Subject: [PATCH] heatmap functions and df_actions based on repos

---
 .../visualizations/cntrb_file_heatmap.py      | 153 +++++++++++++++---
 .../contribution_file_heatmap.py              | 146 ++++++++++++++---
 .../visualizations/reviewer_file_heatmap.py   |  37 ++---
 3 files changed, 269 insertions(+), 67 deletions(-)

diff --git a/8Knot/pages/codebase/visualizations/cntrb_file_heatmap.py b/8Knot/pages/codebase/visualizations/cntrb_file_heatmap.py
index 431b199b..87c90842 100644
--- a/8Knot/pages/codebase/visualizations/cntrb_file_heatmap.py
+++ b/8Knot/pages/codebase/visualizations/cntrb_file_heatmap.py
@@ -224,18 +224,19 @@ def directory_dropdown(repo_id):
 @callback(
     Output(f"{PAGE}-{VIZ_ID}", "figure"),
     [
+        Input("repo-choices", "data"),
         Input(f"repo-{PAGE}-{VIZ_ID}", "value"),
         Input(f"directory-{PAGE}-{VIZ_ID}", "value"),
         Input("bot-switch", "value"),
     ],
     background=True,
 )
-def cntrb_file_heatmap_graph(repo_id, directory, bot_switch):
+def cntrb_file_heatmap_graph(searchbar_repos, repo_id, directory, bot_switch):
     start = time.perf_counter()
     logging.warning(f"{VIZ_ID}- START")
 
     # get dataframes of data from cache
-    df_file, df_actions, df_file_cntbs = multi_query_helper([repo_id])
+    df_file, df_actions, df_file_cntbs = multi_query_helper(searchbar_repos, [repo_id])
 
     # test if there is data
     if df_file.empty or df_actions.empty or df_file_cntbs.empty:
@@ -255,7 +256,7 @@ def cntrb_file_heatmap_graph(repo_id, directory, bot_switch):
     return fig
 
 
-def multi_query_helper(repos):
+def multi_query_helper(searchbar_repos, repo):
     """
     For cntrb_file_heatmap_graph-
     hack to put all of the cache-retrieval
@@ -263,32 +264,32 @@ def multi_query_helper(repos):
     """
 
     # wait for data to asynchronously download and become available.
-    while not_cached := cf.get_uncached(func_name=rfq.__name__, repolist=repos):
+    while not_cached := cf.get_uncached(func_name=rfq.__name__, repolist=repo):
         logging.warning(f"CONTRIBUTOR FILE HEATMAP - WAITING ON DATA TO BECOME AVAILABLE")
         time.sleep(0.5)
 
     # wait for data to asynchronously download and become available.
-    while not_cached := cf.get_uncached(func_name=cnq.__name__, repolist=repos):
+    while not_cached := cf.get_uncached(func_name=cnq.__name__, repolist=searchbar_repos):
         logging.warning(f"CONTRIBUTOR FILE HEATMAP - WAITING ON DATA TO BECOME AVAILABLE")
         time.sleep(0.5)
 
     # wait for data to asynchronously download and become available.
-    while not_cached := cf.get_uncached(func_name=cpfq.__name__, repolist=repos):
+    while not_cached := cf.get_uncached(func_name=cpfq.__name__, repolist=repo):
         logging.warning(f"CONTRIBUTOR FILE HEATMAP - WAITING ON DATA TO BECOME AVAILABLE")
         time.sleep(0.5)
 
     # GET ALL DATA FROM POSTGRES CACHE
     df_file = cf.retrieve_from_cache(
         tablename=rfq.__name__,
-        repolist=repos,
+        repolist=repo,
     )
     df_actions = cf.retrieve_from_cache(
         tablename=cnq.__name__,
-        repolist=repos,
+        repolist=searchbar_repos,
     )
     df_file_cntrbs = cf.retrieve_from_cache(
         tablename=cpfq.__name__,
-        repolist=repos,
+        repolist=repo,
     )
 
     # necessary preprocessing steps that were lifted out of the querying step
@@ -305,6 +306,64 @@ def process_data(
     directory,
     bot_switch,
 ):
+    """
+    Processing steps
+
+        1 - Cleans up file data to only include current files and relate files in the repository to the contributors who have opened PRs that edited them.
+        2 - For a given level in the directory tree, aggregate the list of contributors for sub-directories and for individual files at the level.
+        3 - For each contributor, identify their most recent contribution.
+        4 - Transforms dataframe where columns are months with counts of "last seen" dates in that month and the rows are the file/subdirectory
+    """
+
+    df_file = df_file_clean(df_file, df_file_cntbs, bot_switch)
+
+    df_dynamic_directory = cntrb_per_directory_value(directory, df_file)
+
+    # work around for using functions, will clean later
+    if df_dynamic_directory.empty:
+        return df_dynamic_directory
+
+    df_dynamic_directory = cntrb_to_last_activity(df_actions, df_dynamic_directory)
+
+    final = file_cntrb_activity_by_month(df_dynamic_directory, df_actions)
+
+    return final
+
+
+def create_figure(df: pd.DataFrame):
+    fig = px.imshow(
+        df,
+        labels=dict(x="Time", y="Directory Entries", color="Contributors"),
+        color_continuous_scale=px.colors.sequential.deep,
+    )
+
+    fig["layout"]["yaxis"]["tickmode"] = "linear"
+    fig["layout"]["height"] = 700
+    fig["layout"]["coloraxis_colorbar_x"] = -0.15
+    fig["layout"]["yaxis"]["side"] = "right"
+
+    return fig
+
+
+def df_file_clean(df_file: pd.DataFrame, df_file_cntbs: pd.DataFrame, bot_switch):
+    """
+    This function cleans the df_file data and combines it with the related cntrb_ids
+
+    Args:
+    -----
+        df_file : Pandas Dataframe
+            Dataframe with the output of the repo_files_query
+
+        df_file_cntbs : Pandas Dataframe
+            Dataframe with the output of the cntrb_per_file_query
+
+        bot_switch : boolean
+            T/F for the status of the bot switch
+
+    Returns:
+    --------
+        df_file: df with file and cntrb_ids of contributors that opened a pr with that file in it
+    """
     # strings to hold the values for each column (always the same for every row of this query)
     repo_name = df_file["repo_name"].iloc[0]
     repo_path = df_file["repo_path"].iloc[0]
@@ -326,7 +385,7 @@ def process_data(
     df_file_cntbs.drop(["repo_id", "reviewer_ids"], axis=1, inplace=True)
 
     # Left join on df_files to only get the files that are currently in the repository
-    # and the contributors that have ever opened a pr that included edits on the file
+    # and the contributors that have ever opened a pr that included edits on the file
     df_file = pd.merge(df_file, df_file_cntbs, on="file_path", how="left")
     # replace nan with empty string to avoid errors in list comprehension
     df_file.cntrb_ids.fillna("", inplace=True)
@@ -343,6 +402,26 @@ def process_data(
             axis=1,
         )
 
+    return df_file
+
+
+def cntrb_per_directory_value(directory, df_file):
+    """
+    This function gets the files in the specified directory, groups together any files in
+    subdirectories, and creates a list of their contributors cntrb_ids
+
+    Args:
+    -----
+        directory : string
+            Output from the directory drop down
+
+        df_file : Pandas Dataframe
+            Dataframe with file and related cntrb_id information
+
+    Returns:
+    --------
+        df_dynamic_directory: df with the file and subdirectories and their contributors' cntrb_ids
+    """
     # determine directory level to use in later step
     level = directory.count("/")
     if directory == "Top Level Directory":
@@ -377,6 +456,25 @@ def process_data(
         lambda row: set(row.cntrb_ids),
         axis=1,
     )
+    return df_dynamic_directory
+
+
+def cntrb_to_last_activity(df_actions: pd.DataFrame, df_dynamic_directory: pd.DataFrame):
+    """
+    This function creates a df with the files and the dates of the most recent activity for each cntrb_id.
+
+    Args:
+    -----
+        df_actions : Pandas Dataframe
+            Dataframe with contributor activity
+
+        df_dynamic_directory : Pandas Dataframe
+            Dataframe with file and related cntrb_id information
+
+    Returns:
+    --------
+        df_dynamic_directory: df with the file and subdirectories and the dates of the most recent activity for the contributors.
+    """
 
     # date reformating
     df_actions["created_at"] = pd.to_datetime(df_actions["created_at"], utc=True)
@@ -406,6 +504,26 @@ def process_data(
     # most recent activity - preprocessing step
     df_dynamic_directory = df_dynamic_directory.explode("dates")
 
+    return df_dynamic_directory
+
+
+def file_cntrb_activity_by_month(df_dynamic_directory: pd.DataFrame, df_actions: pd.DataFrame):
+    """
+    This function transforms the df_dynamic_directory to be counts of "last seen" contributors by month.
+
+    Args:
+    -----
+        df_actions : Pandas Dataframe
+            Dataframe with contributor activity
+
+        df_dynamic_directory : Pandas Dataframe
+            Dataframe with file and related cntrb_id information
+
+    Returns:
+    --------
+        df_final: df with files and subdirectories as rows and the months as columns
+    """
+
     # get files that have no contributors and remove from set to prevent errors in grouper function
     no_contribs = df_dynamic_directory["directory_value"][df_dynamic_directory.dates.isnull()].tolist()
 
@@ -436,18 +554,3 @@ def process_data(
         final.loc[files] = None
 
     return final
-
-
-def create_figure(df: pd.DataFrame):
-    fig = px.imshow(
-        df,
-        labels=dict(x="Time", y="Directory Entries", color="Contributors"),
-        color_continuous_scale=px.colors.sequential.deep,
-    )
-
-    fig["layout"]["yaxis"]["tickmode"] = "linear"
-    fig["layout"]["height"] = 700
-    fig["layout"]["coloraxis_colorbar_x"] = -0.15
-    fig["layout"]["yaxis"]["side"] = "right"
-
-    return fig
diff --git a/8Knot/pages/codebase/visualizations/contribution_file_heatmap.py b/8Knot/pages/codebase/visualizations/contribution_file_heatmap.py
index e7fccbe2..59a86caa 100644
--- a/8Knot/pages/codebase/visualizations/contribution_file_heatmap.py
+++ b/8Knot/pages/codebase/visualizations/contribution_file_heatmap.py
@@ -314,6 +314,65 @@ def process_data(
     directory,
     graph_view,
 ):
+    """
+    Processing steps
+
+        1 - Cleans up file data to only include current files and relate files in the repository to the prs that impact them.
+        2 - For a given level in the directory tree, aggregate the list of prs for sub-directories and for individual files at the level.
+        3 - For each pr, identify its open and merge dates.
+        4 - Transforms dataframe where columns are months with counts of pr open/merge dates in that month and the rows are the file/subdirectory
+    """
+
+    df_file = df_file_clean(df_file, df_file_pr)
+
+    df_dynamic_directory = pr_per_directory_value(directory, df_file)
+
+    # work around for using functions, will clean later
+    if df_dynamic_directory.empty:
+        return df_dynamic_directory
+
+    df_dynamic_directory = pr_to_dates(df_pr, df_dynamic_directory, graph_view)
+
+    final = file_pr_activity_by_month(df_dynamic_directory, df_pr, graph_view)
+
+    return final
+
+
+def create_figure(df: pd.DataFrame, graph_view):
+    legend_title = "PRs Opened"
+    if graph_view == "merged_at":
+        legend_title = "PRs Merged"
+
+    fig = px.imshow(
+        df,
+        labels=dict(x="Time", y="Directory Entries", color=legend_title),
+        color_continuous_scale=px.colors.sequential.deep,
+    )
+
+    fig["layout"]["yaxis"]["tickmode"] = "linear"
+    fig["layout"]["height"] = 700
+    fig["layout"]["coloraxis_colorbar_x"] = -0.15
+    fig["layout"]["yaxis"]["side"] = "right"
+
+    return fig
+
+
+def df_file_clean(df_file: pd.DataFrame, df_file_pr: pd.DataFrame):
+    """
+    This function cleans the df_file data and combines it with the related pull request ids
+
+    Args:
+    -----
+        df_file : Pandas Dataframe
+            Dataframe with the output of the repo_files_query
+
+        df_file_pr : Pandas Dataframe
+            Dataframe with the output of the pr_file_query
+
+    Returns:
+    --------
+        df_file: df with file and pull_request_ids of prs with that file in it
+    """
     # strings to hold the values for each column (always the same for every row of this query)
     repo_name = df_file["repo_name"].iloc[0]
     repo_path = df_file["repo_path"].iloc[0]
@@ -323,9 +382,6 @@ def process_data(
     path_slice = repo_id + "-" + repo_path + "/" + repo_name + "/"
     df_file["file_path"] = df_file["file_path"].str.rsplit(path_slice, n=1).str[1]
 
-    # drop columns not in the most recent collection
-    df_file = df_file[df_file["rl_analysis_date"] == df_file["rl_analysis_date"].max()]
-
     # drop unneccessary columns not needed after preprocessing steps
     df_file = df_file.reset_index()
     df_file.drop(["index", "repo_name", "repo_path", "rl_analysis_date"], axis=1, inplace=True)
@@ -341,9 +397,29 @@ def process_data(
     df_file_pr = df_file_pr.groupby("file_path")["pull_request_id"].apply(list)
 
     # Left join on df_files to only get the files that are currently in the repository
-    # and the contributors that have ever opened a pr that included edits on the file
+    # and the prs that included edits on the file
     df_file = pd.merge(df_file, df_file_pr, on="file_path", how="left")
 
+    return df_file
+
+
+def pr_per_directory_value(directory, df_file):
+    """
+    This function gets the files in the specified directory, groups together any files in
+    subdirectories, and creates a list of pull_request_ids that touched those files
+
+    Args:
+    -----
+        directory : string
+            Output from the directory drop down
+
+        df_file : Pandas Dataframe
+            Dataframe with file and related pull_request_id information
+
+    Returns:
+    --------
+        df_dynamic_directory: df with the file and subdirectories and their prs pull_request_ids
+    """
     # determine directory level to use in later step
     level = directory.count("/")
     if directory == "Top Level Directory":
@@ -361,7 +437,7 @@ def process_data(
     group_column = level + 1
 
     # Groupby the level above the selected directory for all files nested in folders are together.
-    # For each, create a list of all of pull request that include that file
+    # For each, create a list of all of the pull requests that include that file
     df_dynamic_directory = (
         df_dynamic_directory.groupby(group_column)["pull_request_id"]
         .sum()
@@ -377,6 +453,27 @@ def process_data(
         lambda row: set(row.pull_request_id),
         axis=1,
     )
+    return df_dynamic_directory
+
+
+def pr_to_dates(df_pr: pd.DataFrame, df_dynamic_directory: pd.DataFrame, graph_view):
+    """
+    This function creates a df with the files and the open and merge dates of the prs that
+    touch each file or subdirectory.
+
+    Args:
+    -----
+        df_pr : Pandas Dataframe
+            Dataframe with pull request data
+
+        df_dynamic_directory : Pandas Dataframe
+            Dataframe with file and related pull_request_id information
+
+    Returns:
+    --------
+        df_dynamic_directory: df with the file and subdirectories and the open and merge dates
+        of the prs that touch each file or subdirectory.
+    """
 
     # date reformating
     df_pr["created_at"] = pd.to_datetime(df_pr["created_at"], utc=True)
@@ -407,6 +504,26 @@ def process_data(
     # reformat into each row being a directory value and a date of one of the pull request dates
     df_dynamic_directory = df_dynamic_directory.explode(graph_view)
 
+    return df_dynamic_directory
+
+
+def file_pr_activity_by_month(df_dynamic_directory: pd.DataFrame, df_pr: pd.DataFrame, graph_view):
+    """
+    This function transforms the df_dynamic_directory to be counts of open or merged prs by month.
+
+    Args:
+    -----
+        df_dynamic_directory : Pandas Dataframe
+            Dataframe with file and related pull_request_id information
+
+        df_pr : Pandas Dataframe
+            Dataframe with pull request data
+
+    Returns:
+    --------
+        df_final: df with files and subdirectories as rows and the months as columns
+    """
+
     # get files that have no pull requests and remove from set to prevent errors in grouper function
     no_contribs = df_dynamic_directory["directory_value"][df_dynamic_directory[graph_view].isnull()].tolist()
 
@@ -438,22 +555,3 @@ def process_data(
         final.loc[files] = None
 
     return final
-
-
-def create_figure(df: pd.DataFrame, graph_view):
-    legend_title = "PRs Opened"
-    if graph_view == "merged_at":
-        legend_title = "PRs Merged"
-
-    fig = px.imshow(
-        df,
-        labels=dict(x="Time", y="Directory Entries", color=legend_title),
-        color_continuous_scale=px.colors.sequential.deep,
-    )
-
-    fig["layout"]["yaxis"]["tickmode"] = "linear"
-    fig["layout"]["height"] = 700
-    fig["layout"]["coloraxis_colorbar_x"] = -0.15
-    fig["layout"]["yaxis"]["side"] = "right"
-
-    return fig
diff --git a/8Knot/pages/codebase/visualizations/reviewer_file_heatmap.py b/8Knot/pages/codebase/visualizations/reviewer_file_heatmap.py
index 2a799b6d..003c9290 100644
--- a/8Knot/pages/codebase/visualizations/reviewer_file_heatmap.py
+++ b/8Knot/pages/codebase/visualizations/reviewer_file_heatmap.py
@@ -224,18 +224,19 @@ def directory_dropdown(repo_id):
 @callback(
     Output(f"{PAGE}-{VIZ_ID}", "figure"),
     [
+        Input("repo-choices", "data"),
         Input(f"repo-{PAGE}-{VIZ_ID}", "value"),
         Input(f"directory-{PAGE}-{VIZ_ID}", "value"),
         Input("bot-switch", "value"),
     ],
     background=True,
 )
-def reviewer_file_heatmap_graph(repo_id, directory, bot_switch):
+def reviewer_file_heatmap_graph(searchbar_repos, repo_id, directory, bot_switch):
     start = time.perf_counter()
     logging.warning(f"{VIZ_ID}- START")
 
     # get dataframes of data from cache
-    df_file, df_actions, df_file_cntbs = multi_query_helper([repo_id])
+    df_file, df_actions, df_file_cntbs = multi_query_helper(searchbar_repos, [repo_id])
 
     # test if there is data
     if df_file.empty or df_actions.empty or df_file_cntbs.empty:
@@ -255,7 +256,7 @@ def reviewer_file_heatmap_graph(repo_id, directory, bot_switch):
     return fig
 
 
-def multi_query_helper(repos):
+def multi_query_helper(searchbar_repos, repo):
     """
     For reviewer_file_heatmap_graph-
     hack to put all of the cache-retrieval
@@ -263,32 +264,32 @@ def multi_query_helper(repos):
     """
 
     # wait for data to asynchronously download and become available.
-    while not_cached := cf.get_uncached(func_name=rfq.__name__, repolist=repos):
+    while not_cached := cf.get_uncached(func_name=rfq.__name__, repolist=repo):
         logging.warning(f"CONTRIBUTOR FILE HEATMAP - WAITING ON DATA TO BECOME AVAILABLE")
         time.sleep(0.5)
 
     # wait for data to asynchronously download and become available.
-    while not_cached := cf.get_uncached(func_name=cnq.__name__, repolist=repos):
+    while not_cached := cf.get_uncached(func_name=cnq.__name__, repolist=searchbar_repos):
         logging.warning(f"CONTRIBUTOR FILE HEATMAP - WAITING ON DATA TO BECOME AVAILABLE")
         time.sleep(0.5)
 
     # wait for data to asynchronously download and become available.
-    while not_cached := cf.get_uncached(func_name=cpfq.__name__, repolist=repos):
+    while not_cached := cf.get_uncached(func_name=cpfq.__name__, repolist=repo):
         logging.warning(f"CONTRIBUTOR FILE HEATMAP - WAITING ON DATA TO BECOME AVAILABLE")
         time.sleep(0.5)
 
     # GET ALL DATA FROM POSTGRES CACHE
     df_file = cf.retrieve_from_cache(
         tablename=rfq.__name__,
-        repolist=repos,
+        repolist=repo,
     )
     df_actions = cf.retrieve_from_cache(
         tablename=cnq.__name__,
-        repolist=repos,
+        repolist=searchbar_repos,
     )
     df_file_cntrbs = cf.retrieve_from_cache(
         tablename=cpfq.__name__,
-        repolist=repos,
+        repolist=repo,
     )
 
     # necessary preprocessing steps that were lifted out of the querying step
@@ -400,13 +401,13 @@ def df_file_clean(df_file: pd.DataFrame, df_file_cntbs: pd.DataFrame, bot_switch
             lambda row: [x for x in row.reviewer_ids],
             axis=1,
         )
-
     return df_file
 
 
 def cntrb_per_directory_value(directory, df_file):
     """
-    This function cleans the df_file data and combines it with the related reviewer cntrb_ids
+    This function gets the files in the specified directory, groups together any files in
+    subdirectories, and creates a list of their reviewers cntrb_ids.
 
     Args:
     -----
@@ -459,12 +460,12 @@ def cntrb_per_directory_value(directory, df_file):
 
 def cntrb_to_last_activity(df_actions: pd.DataFrame, df_dynamic_directory: pd.DataFrame):
     """
-    This function created a df with the files and the the dates of the most recent activity for each cntrb_id.
+    This function creates a df with the files and the dates of the most recent activity for each cntrb_id.
 
     Args:
     -----
-        df_actions : string
-            Output from the directory drop down
+        df_actions : Pandas Dataframe
+            Dataframe with contributor activity
 
         df_dynamic_directory : Pandas Dataframe
             Dataframe with file and related reviewer_id information
@@ -507,16 +508,16 @@ def cntrb_to_last_activity(df_actions: pd.DataFrame, df_dynamic_directory: pd.Da
 
 def file_cntrb_activity_by_month(df_dynamic_directory: pd.DataFrame, df_actions: pd.DataFrame):
     """
-    This function created a df with the files and the the dates of the most recent activity for each cntrb_id.
+    This function transforms the df_dynamic_directory to be counts of "last seen" reviewers by month.
 
     Args:
     -----
-        df_actions : string
-            Output from the directory drop down
-
         df_dynamic_directory : Pandas Dataframe
             Dataframe with file and related reviewer_id information
 
+        df_actions : Pandas Dataframe
+            Dataframe with contributor activity
+
     Returns:
     --------
         df_final: df with files and subdirectories as rows and the months as columns