From 4cc0acf71d35b764250825fa7e583dc35e739901 Mon Sep 17 00:00:00 2001 From: cdolfi Date: Fri, 1 Mar 2024 18:02:48 -0500 Subject: [PATCH] heatmap functions and df_actions based on repos --- .../visualizations/cntrb_file_heatmap.py | 153 +++++++++++++++--- .../contribution_file_heatmap.py | 146 ++++++++++++++--- .../visualizations/reviewer_file_heatmap.py | 37 ++--- 3 files changed, 269 insertions(+), 67 deletions(-) diff --git a/8Knot/pages/codebase/visualizations/cntrb_file_heatmap.py b/8Knot/pages/codebase/visualizations/cntrb_file_heatmap.py index 431b199b..87c90842 100644 --- a/8Knot/pages/codebase/visualizations/cntrb_file_heatmap.py +++ b/8Knot/pages/codebase/visualizations/cntrb_file_heatmap.py @@ -224,18 +224,19 @@ def directory_dropdown(repo_id): @callback( Output(f"{PAGE}-{VIZ_ID}", "figure"), [ + Input("repo-choices", "data"), Input(f"repo-{PAGE}-{VIZ_ID}", "value"), Input(f"directory-{PAGE}-{VIZ_ID}", "value"), Input("bot-switch", "value"), ], background=True, ) -def cntrb_file_heatmap_graph(repo_id, directory, bot_switch): +def cntrb_file_heatmap_graph(searchbar_repos, repo_id, directory, bot_switch): start = time.perf_counter() logging.warning(f"{VIZ_ID}- START") # get dataframes of data from cache - df_file, df_actions, df_file_cntbs = multi_query_helper([repo_id]) + df_file, df_actions, df_file_cntbs = multi_query_helper(searchbar_repos, [repo_id]) # test if there is data if df_file.empty or df_actions.empty or df_file_cntbs.empty: @@ -255,7 +256,7 @@ def cntrb_file_heatmap_graph(repo_id, directory, bot_switch): return fig -def multi_query_helper(repos): +def multi_query_helper(searchbar_repos, repo): """ For cntrb_file_heatmap_graph- hack to put all of the cache-retrieval @@ -263,32 +264,32 @@ def multi_query_helper(repos): """ # wait for data to asynchronously download and become available. 
- while not_cached := cf.get_uncached(func_name=rfq.__name__, repolist=repos): + while not_cached := cf.get_uncached(func_name=rfq.__name__, repolist=repo): logging.warning(f"CONTRIBUTOR FILE HEATMAP - WAITING ON DATA TO BECOME AVAILABLE") time.sleep(0.5) # wait for data to asynchronously download and become available. - while not_cached := cf.get_uncached(func_name=cnq.__name__, repolist=repos): + while not_cached := cf.get_uncached(func_name=cnq.__name__, repolist=searchbar_repos): logging.warning(f"CONTRIBUTOR FILE HEATMAP - WAITING ON DATA TO BECOME AVAILABLE") time.sleep(0.5) # wait for data to asynchronously download and become available. - while not_cached := cf.get_uncached(func_name=cpfq.__name__, repolist=repos): + while not_cached := cf.get_uncached(func_name=cpfq.__name__, repolist=repo): logging.warning(f"CONTRIBUTOR FILE HEATMAP - WAITING ON DATA TO BECOME AVAILABLE") time.sleep(0.5) # GET ALL DATA FROM POSTGRES CACHE df_file = cf.retrieve_from_cache( tablename=rfq.__name__, - repolist=repos, + repolist=repo, ) df_actions = cf.retrieve_from_cache( tablename=cnq.__name__, - repolist=repos, + repolist=searchbar_repos, ) df_file_cntrbs = cf.retrieve_from_cache( tablename=cpfq.__name__, - repolist=repos, + repolist=repo, ) # necessary preprocessing steps that were lifted out of the querying step @@ -305,6 +306,64 @@ def process_data( directory, bot_switch, ): + """ + Processing steps + + 1 - Cleans up file data to only include current files and relate files in the repository to the contributors who have reviewed them in past PRs. + 2 - For a given level in the directory tree, aggregate the list of contributors for sub-directories and for individual files at the level. + 3 - For each contributor, identify their most recent contribution. 
+ 4 - Transforms dataframe where columns are months with counts of "last seen" dates in that month and the rows are the file/subdirectory + """ + + df_file = df_file_clean(df_file, df_file_cntbs, bot_switch) + + df_dynamic_directory = cntrb_per_directory_value(directory, df_file) + + # work around for using functions, will clean later + if df_dynamic_directory.empty: + return df_dynamic_directory + + df_dynamic_directory = cntrb_to_last_activity(df_actions, df_dynamic_directory) + + final = file_cntrb_activity_by_month(df_dynamic_directory, df_actions) + + return final + + +def create_figure(df: pd.DataFrame): + fig = px.imshow( + df, + labels=dict(x="Time", y="Directory Entries", color="Contributors"), + color_continuous_scale=px.colors.sequential.deep, + ) + + fig["layout"]["yaxis"]["tickmode"] = "linear" + fig["layout"]["height"] = 700 + fig["layout"]["coloraxis_colorbar_x"] = -0.15 + fig["layout"]["yaxis"]["side"] = "right" + + return fig + + +def df_file_clean(df_file: pd.DataFrame, df_file_cntbs: pd.DataFrame, bot_switch): + """ + This function cleans the df_file data and combines it with the related cntrb_ids + + Args: + ----- + df_file : Pandas Dataframe + Dataframe with the output of the repo_files_query + + df_file_cntrbs : Pandas Dataframe + Dataframe with the output of the cntrb_per_file_query + + bot_switch : boolan + T/F for the status of the bot switch + + Returns: + -------- + df_file: df with file and cntrb_ids of contributors that reviewed a pr with that file in it + """ # strings to hold the values for each column (always the same for every row of this query) repo_name = df_file["repo_name"].iloc[0] repo_path = df_file["repo_path"].iloc[0] @@ -326,7 +385,7 @@ def process_data( df_file_cntbs.drop(["repo_id", "reviewer_ids"], axis=1, inplace=True) # Left join on df_files to only get the files that are currently in the repository - # and the contributors that have ever opened a pr that included edits on the file + # and the contributors that have 
ever opened a pr that included edits on the file df_file = pd.merge(df_file, df_file_cntbs, on="file_path", how="left") # replace nan with empty string to avoid errors in list comprehension df_file.cntrb_ids.fillna("", inplace=True) @@ -343,6 +402,26 @@ def process_data( axis=1, ) + return df_file + + +def cntrb_per_directory_value(directory, df_file): + """ + This function gets the files in the specified directory, groups together any files in + subdirectories, and creates a list of their contributors cntrb_ids + + Args: + ----- + directory : string + Output from the directory drop down + + df_file : Pandas Dataframe + Dataframe with file and related cntrb_id information + + Returns: + -------- + df_dynamic_directory: df with the file and subdirectories and their contributors cntrb_ids + """ # determine directory level to use in later step level = directory.count("/") if directory == "Top Level Directory": @@ -377,6 +456,25 @@ def process_data( lambda row: set(row.cntrb_ids), axis=1, ) + return df_dynamic_directory + + +def cntrb_to_last_activity(df_actions: pd.DataFrame, df_dynamic_directory: pd.DataFrame): + """ + This function creates a df with the files and the dates of the most recent activity for each cntrb_id. + + Args: + ----- + df_actions : Pandas Dataframe + Dataframe with contributor activity + + df_dynamic_directory : Pandas Dataframe + Dataframe with file and related cntrb_id information + + Returns: + -------- + df_dynamic_directory: df with the file and subdirectories and the dates of the most recent activity for the contributors. 
+ """ # date reformating df_actions["created_at"] = pd.to_datetime(df_actions["created_at"], utc=True) @@ -406,6 +504,26 @@ def process_data( # most recent activity - preprocessing step df_dynamic_directory = df_dynamic_directory.explode("dates") + return df_dynamic_directory + + +def file_cntrb_activity_by_month(df_dynamic_directory: pd.DataFrame, df_actions: pd.DataFrame): + """ + This function transforms the df_dynamic_directory to be counts of "last seen" contributors by month. + + Args: + ----- + df_actions : Pandas Dataframe + Dataframe with contributor activity + + df_dynamic_directory : Pandas Dataframe + Dataframe with file and related cntrb_id information + + Returns: + -------- + df_final: df with files and subdirectories as rows and the months as columns + """ + # get files that have no contributors and remove from set to prevent errors in grouper function no_contribs = df_dynamic_directory["directory_value"][df_dynamic_directory.dates.isnull()].tolist() @@ -436,18 +554,3 @@ def process_data( final.loc[files] = None return final - - -def create_figure(df: pd.DataFrame): - fig = px.imshow( - df, - labels=dict(x="Time", y="Directory Entries", color="Contributors"), - color_continuous_scale=px.colors.sequential.deep, - ) - - fig["layout"]["yaxis"]["tickmode"] = "linear" - fig["layout"]["height"] = 700 - fig["layout"]["coloraxis_colorbar_x"] = -0.15 - fig["layout"]["yaxis"]["side"] = "right" - - return fig diff --git a/8Knot/pages/codebase/visualizations/contribution_file_heatmap.py b/8Knot/pages/codebase/visualizations/contribution_file_heatmap.py index e7fccbe2..59a86caa 100644 --- a/8Knot/pages/codebase/visualizations/contribution_file_heatmap.py +++ b/8Knot/pages/codebase/visualizations/contribution_file_heatmap.py @@ -314,6 +314,65 @@ def process_data( directory, graph_view, ): + """ + Processing steps + + 1 - Cleans up file data to only include current files and relate files in the repository to the prs that impact them. 
+ 2 - For a given level in the directory tree, aggregate the list of prs for sub-directories and for individual files at the level. + 3 - For each pr, identify its open and merge dates. + 4 - Transforms dataframe where columns are months with counts of pr open/merge dates in that month and the rows are the file/subdirectory + """ + + df_file = df_file_clean(df_file, df_file_pr) + + df_dynamic_directory = pr_per_directory_value(directory, df_file) + + # work around for using functions, will clean later + if df_dynamic_directory.empty: + return df_dynamic_directory + + df_dynamic_directory = pr_to_dates(df_pr, df_dynamic_directory, graph_view) + + final = file_pr_activity_by_month(df_dynamic_directory, df_pr, graph_view) + + return final + + +def create_figure(df: pd.DataFrame, graph_view): + legend_title = "PRs Opened" + if graph_view == "merged_at": + legend_title = "PRs Merged" + + fig = px.imshow( + df, + labels=dict(x="Time", y="Directory Entries", color=legend_title), + color_continuous_scale=px.colors.sequential.deep, + ) + + fig["layout"]["yaxis"]["tickmode"] = "linear" + fig["layout"]["height"] = 700 + fig["layout"]["coloraxis_colorbar_x"] = -0.15 + fig["layout"]["yaxis"]["side"] = "right" + + return fig + + +def df_file_clean(df_file: pd.DataFrame, df_file_pr: pd.DataFrame): + """ + This function cleans the df_file data and combines it with the related pull request ids + + Args: + ----- + df_file : Pandas Dataframe + Dataframe with the output of the repo_files_query + + df_file_pr : Pandas Dataframe + Dataframe with the output of the pr_file_query + + Returns: + -------- + df_file: df with file and pull_request_ids of prs with that file in it + """ # strings to hold the values for each column (always the same for every row of this query) repo_name = df_file["repo_name"].iloc[0] repo_path = df_file["repo_path"].iloc[0] @@ -323,9 +382,6 @@ def process_data( path_slice = repo_id + "-" + repo_path + "/" + repo_name + "/" df_file["file_path"] = 
df_file["file_path"].str.rsplit(path_slice, n=1).str[1] - # drop columns not in the most recent collection - df_file = df_file[df_file["rl_analysis_date"] == df_file["rl_analysis_date"].max()] - # drop unneccessary columns not needed after preprocessing steps df_file = df_file.reset_index() df_file.drop(["index", "repo_name", "repo_path", "rl_analysis_date"], axis=1, inplace=True) @@ -341,9 +397,29 @@ def process_data( df_file_pr = df_file_pr.groupby("file_path")["pull_request_id"].apply(list) # Left join on df_files to only get the files that are currently in the repository - # and the contributors that have ever opened a pr that included edits on the file + # and the prs that included edits on the file df_file = pd.merge(df_file, df_file_pr, on="file_path", how="left") + return df_file + + +def pr_per_directory_value(directory, df_file): + """ + This function gets the files in the specified directory, groups together any files in + subdirectories, and creates a list of pull_request_ids that touched those files + + Args: + ----- + directory : string + Output from the directory drop down + + df_file : Pandas Dataframe + Dataframe with file and related pull_request_id information + + Returns: + -------- + df_dynamic_directory: df with the file and subdirectories and their prs pull_request_ids + """ # determine directory level to use in later step level = directory.count("/") if directory == "Top Level Directory": @@ -361,7 +437,7 @@ def process_data( group_column = level + 1 # Groupby the level above the selected directory for all files nested in folders are together. 
- # For each, create a list of all of pull request that include that file + # For each, create a list of all of the pull requests that include that file df_dynamic_directory = ( df_dynamic_directory.groupby(group_column)["pull_request_id"] .sum() @@ -377,6 +453,27 @@ def process_data( lambda row: set(row.pull_request_id), axis=1, ) + return df_dynamic_directory + + +def pr_to_dates(df_pr: pd.DataFrame, df_dynamic_directory: pd.DataFrame, graph_view): + """ + This function creates a df with the files and the open and merge dates of the prs that + touch each file or subdirectory. + + Args: + ----- + df_pr : Pandas Dataframe + Dataframe with pull request data + + df_dynamic_directory : Pandas Dataframe + Dataframe with file and related pull_request_id information + + Returns: + -------- + df_dynamic_directory: df with the file and subdirectories and the open and merge dates + of the prs that touch each file or subdirectory. + """ # date reformating df_pr["created_at"] = pd.to_datetime(df_pr["created_at"], utc=True) @@ -407,6 +504,26 @@ def process_data( # reformat into each row being a directory value and a date of one of the pull request dates df_dynamic_directory = df_dynamic_directory.explode(graph_view) + return df_dynamic_directory + + +def file_pr_activity_by_month(df_dynamic_directory: pd.DataFrame, df_pr: pd.DataFrame, graph_view): + """ + This function transforms the df_dynamic_directory to be counts of open or merged prs by month. 
+ + Args: + ----- + df_dynamic_directory : Pandas Dataframe + Dataframe with file and related pull_request_id information + + df_pr : Pandas Dataframe + Dataframe with pull request data + + Returns: + -------- + df_final: df with files and subdirectories as rows and the months as columns + """ + # get files that have no pull requests and remove from set to prevent errors in grouper function no_contribs = df_dynamic_directory["directory_value"][df_dynamic_directory[graph_view].isnull()].tolist() @@ -438,22 +555,3 @@ def process_data( final.loc[files] = None return final - - -def create_figure(df: pd.DataFrame, graph_view): - legend_title = "PRs Opened" - if graph_view == "merged_at": - legend_title = "PRs Merged" - - fig = px.imshow( - df, - labels=dict(x="Time", y="Directory Entries", color=legend_title), - color_continuous_scale=px.colors.sequential.deep, - ) - - fig["layout"]["yaxis"]["tickmode"] = "linear" - fig["layout"]["height"] = 700 - fig["layout"]["coloraxis_colorbar_x"] = -0.15 - fig["layout"]["yaxis"]["side"] = "right" - - return fig diff --git a/8Knot/pages/codebase/visualizations/reviewer_file_heatmap.py b/8Knot/pages/codebase/visualizations/reviewer_file_heatmap.py index 2a799b6d..003c9290 100644 --- a/8Knot/pages/codebase/visualizations/reviewer_file_heatmap.py +++ b/8Knot/pages/codebase/visualizations/reviewer_file_heatmap.py @@ -224,18 +224,19 @@ def directory_dropdown(repo_id): @callback( Output(f"{PAGE}-{VIZ_ID}", "figure"), [ + Input("repo-choices", "data"), Input(f"repo-{PAGE}-{VIZ_ID}", "value"), Input(f"directory-{PAGE}-{VIZ_ID}", "value"), Input("bot-switch", "value"), ], background=True, ) -def reviewer_file_heatmap_graph(repo_id, directory, bot_switch): +def reviewer_file_heatmap_graph(searchbar_repos, repo_id, directory, bot_switch): start = time.perf_counter() logging.warning(f"{VIZ_ID}- START") # get dataframes of data from cache - df_file, df_actions, df_file_cntbs = 
multi_query_helper(searchbar_repos, [repo_id]) # test if there is data if df_file.empty or df_actions.empty or df_file_cntbs.empty: @@ -255,7 +256,7 @@ def reviewer_file_heatmap_graph(repo_id, directory, bot_switch): return fig -def multi_query_helper(repos): +def multi_query_helper(searchbar_repos, repo): """ For reviewer_file_heatmap_graph- hack to put all of the cache-retrieval @@ -263,32 +264,32 @@ def multi_query_helper(repos): """ # wait for data to asynchronously download and become available. - while not_cached := cf.get_uncached(func_name=rfq.__name__, repolist=repos): + while not_cached := cf.get_uncached(func_name=rfq.__name__, repolist=repo): logging.warning(f"CONTRIBUTOR FILE HEATMAP - WAITING ON DATA TO BECOME AVAILABLE") time.sleep(0.5) # wait for data to asynchronously download and become available. - while not_cached := cf.get_uncached(func_name=cnq.__name__, repolist=repos): + while not_cached := cf.get_uncached(func_name=cnq.__name__, repolist=searchbar_repos): logging.warning(f"CONTRIBUTOR FILE HEATMAP - WAITING ON DATA TO BECOME AVAILABLE") time.sleep(0.5) # wait for data to asynchronously download and become available. 
- while not_cached := cf.get_uncached(func_name=cpfq.__name__, repolist=repos): + while not_cached := cf.get_uncached(func_name=cpfq.__name__, repolist=repo): logging.warning(f"CONTRIBUTOR FILE HEATMAP - WAITING ON DATA TO BECOME AVAILABLE") time.sleep(0.5) # GET ALL DATA FROM POSTGRES CACHE df_file = cf.retrieve_from_cache( tablename=rfq.__name__, - repolist=repos, + repolist=repo, ) df_actions = cf.retrieve_from_cache( tablename=cnq.__name__, - repolist=repos, + repolist=searchbar_repos, ) df_file_cntrbs = cf.retrieve_from_cache( tablename=cpfq.__name__, - repolist=repos, + repolist=repo, ) # necessary preprocessing steps that were lifted out of the querying step @@ -400,13 +401,13 @@ def df_file_clean(df_file: pd.DataFrame, df_file_cntbs: pd.DataFrame, bot_switch lambda row: [x for x in row.reviewer_ids], axis=1, ) - return df_file def cntrb_per_directory_value(directory, df_file): """ - This function cleans the df_file data and combines it with the related reviewer cntrb_ids + This function gets the files in the specified directory, groups together any files in + subdirectories, and creates a list of their reviewers cntrb_ids. Args: ----- @@ -459,12 +460,12 @@ def cntrb_per_directory_value(directory, df_file): def cntrb_to_last_activity(df_actions: pd.DataFrame, df_dynamic_directory: pd.DataFrame): """ - This function created a df with the files and the the dates of the most recent activity for each cntrb_id. + This function creates a df with the files and the the dates of the most recent activity for each cntrb_id. 
Args: ----- - df_actions : string - Output from the directory drop down + df_actions : Pandas Dataframe + Dataframe with contributor activity df_dynamic_directory : Pandas Dataframe Dataframe with file and related reviewer_id information @@ -507,16 +508,16 @@ def cntrb_to_last_activity(df_actions: pd.DataFrame, df_dynamic_directory: pd.Da def file_cntrb_activity_by_month(df_dynamic_directory: pd.DataFrame, df_actions: pd.DataFrame): """ - This function created a df with the files and the the dates of the most recent activity for each cntrb_id. + This function transforms the df_dynamic_directory to be counts of "last seen" reviewers by month. Args: ----- - df_actions : string - Output from the directory drop down - df_dynamic_directory : Pandas Dataframe Dataframe with file and related reviewer_id information + df_actions : Pandas Dataframe + Dataframe with contributor activity + Returns: -------- df_final: df with files and subdirectories as rows and the months as columns