diff --git a/8Knot/pages/chaoss/visualizations/contrib_importance_pie.py b/8Knot/pages/chaoss/visualizations/contrib_importance_pie.py index c2d36cca..46b70abb 100644 --- a/8Knot/pages/chaoss/visualizations/contrib_importance_pie.py +++ b/8Knot/pages/chaoss/visualizations/contrib_importance_pie.py @@ -35,20 +35,18 @@ dbc.PopoverHeader("Graph Info:"), dbc.PopoverBody( """ - For a given action type, visualizes the proportional share of the top k anonymous + AKA Bus factor. For a given action type, this visualizes the proportional share of the top k anonymous contributors, aggregating the remaining contributors as "Other". Suppose Contributor A opens the most PRs of all contributors, accounting for 1/5 of all PRs. If k = 1, then the chart will have one slice for Contributor A accounting for 1/5 of the area, - with the remaining 4/5 representing all other contributors. By default, contributors - who have 'potential-bot-filter' in their login are filtered out. Optionally, contributors - can be filtered out by their logins with custom keyword(s). Note: Some commits may have a + with the remaining 4/5 representing all other contributors. Note: Some commits may have a Contributor ID of 'None' if there is no GitHub account is associated with the email that the contributor committed as. """ ), ], id=f"popover-{PAGE}-{VIZ_ID}", - target=f"popover-target-{PAGE}-{VIZ_ID}", # needs to be the same as dbc.Button id + target=f"popover-target-{PAGE}-{VIZ_ID}", placement="top", is_open=False, ), @@ -134,31 +132,6 @@ ], align="center", ), - dbc.Row( - [ - dbc.Label( - "Filter Out Contributors with Keyword(s) in Login:", - html_for=f"patterns-{PAGE}-{VIZ_ID}", - width="auto", - ), - dbc.Col( - [ - dmc.MultiSelect( - id=f"patterns-{PAGE}-{VIZ_ID}", - placeholder="Bot filter values", - data=[ - {"value": "bot", "label": "bot"}, - ], - classNames={"values": "dmc-multiselect-custom"}, - creatable=True, - searchable=True, - ), - ], - className="me-2", - ), - ], - align="center", - ), dbc.Row( [ dbc.Col( @@ -227,14 +200,13 @@ def graph_title(k, action_type): Input("repo-choices", "data"), Input(f"action-type-{PAGE}-{VIZ_ID}", "value"), Input(f"top-k-contributors-{PAGE}-{VIZ_ID}", "value"), - Input(f"patterns-{PAGE}-{VIZ_ID}", "value"), Input(f"date-picker-range-{PAGE}-{VIZ_ID}", "start_date"), Input(f"date-picker-range-{PAGE}-{VIZ_ID}", "end_date"), Input("bot-switch", "value"), ], background=True, ) -def create_top_k_cntrbs_graph(repolist, action_type, top_k, patterns, start_date, end_date, bot_switch): +def create_top_k_cntrbs_graph(repolist, action_type, top_k, start_date, end_date, bot_switch): # wait for data to asynchronously download and become available. while not_cached := cf.get_uncached(func_name=ctq.__name__, repolist=repolist): logging.warning(f"{VIZ_ID}- WAITING ON DATA TO BECOME AVAILABLE") @@ -265,7 +237,7 @@ def create_top_k_cntrbs_graph(repolist, action_type, top_k, patterns, start_date df = df[~df["cntrb_id"].isin(app.bots_list)] # function for all data pre processing - df = process_data(df, action_type, top_k, patterns, start_date, end_date) + df = process_data(df, action_type, top_k, start_date, end_date) fig = create_figure(df, action_type) @@ -273,7 +245,7 @@ def create_top_k_cntrbs_graph(repolist, action_type, top_k, patterns, start_date return fig, False -def process_data(df: pd.DataFrame, action_type, top_k, patterns, start_date, end_date): +def process_data(df: pd.DataFrame, action_type, top_k, start_date, end_date): # convert to datetime objects rather than strings df["created_at"] = pd.to_datetime(df["created_at"], utc=True) @@ -289,31 +261,23 @@ def process_data(df: pd.DataFrame, action_type, top_k, patterns, start_date, end # subset the df such that it only contains rows where the Action column value is the action type df = df[df["Action"].str.contains(action_type)] - # option to filter out potential bots - if patterns: - # remove rows where login column value contains any keyword in patterns - patterns_mask = df["login"].str.contains("|".join(patterns), na=False) - df = df[~patterns_mask] + # get the number of total contributions of the specific action type + t_sum = df.shape[0] # count the number of contributions for each contributor df = (df.groupby("cntrb_id")["Action"].count()).to_frame() # sort rows according to amount of contributions from greatest to least - df.sort_values(by="cntrb_id", ascending=False, inplace=True) + df.sort_values(by="Action", ascending=False, inplace=True) + df = df.reset_index() # rename Action column to action_type df = df.rename(columns={"Action": action_type}) - # get the number of total contributions - t_sum = df[action_type].sum() - # index df to get first k rows df = df.head(top_k) - # convert cntrb_id from type UUID to String - df["cntrb_id"] = df["cntrb_id"].apply(lambda x: str(x).split("-")[0]) - # get the number of total top k contributions df_sum = df[action_type].sum() diff --git a/8Knot/pages/contributors/visualizations/contrib_importance_pie.py b/8Knot/pages/contributors/visualizations/contrib_importance_pie.py index 5952416b..40cca1a0 100644 --- a/8Knot/pages/contributors/visualizations/contrib_importance_pie.py +++ b/8Knot/pages/contributors/visualizations/contrib_importance_pie.py @@ -35,20 +35,18 @@ dbc.PopoverHeader("Graph Info:"), dbc.PopoverBody( """ - For a given action type, visualizes the proportional share of the top k anonymous + AKA Bus factor. For a given action type, this visualizes the proportional share of the top k anonymous contributors, aggregating the remaining contributors as "Other". Suppose Contributor A opens the most PRs of all contributors, accounting for 1/5 of all PRs. If k = 1, then the chart will have one slice for Contributor A accounting for 1/5 of the area, - with the remaining 4/5 representing all other contributors. By default, contributors - who have 'potential-bot-filter' in their login are filtered out. Optionally, contributors - can be filtered out by their logins with custom keyword(s). Note: Some commits may have a + with the remaining 4/5 representing all other contributors. Note: Some commits may have a Contributor ID of 'None' if there is no Github account is associated with the email that the contributor committed as. """ ), ], id=f"popover-{PAGE}-{VIZ_ID}", - target=f"popover-target-{PAGE}-{VIZ_ID}", # needs to be the same as dbc.Button id + target=f"popover-target-{PAGE}-{VIZ_ID}", placement="top", is_open=False, ), @@ -134,31 +132,6 @@ ], align="center", ), - dbc.Row( - [ - dbc.Label( - "Filter Out Contributors with Keyword(s) in Login:", - html_for=f"patterns-{PAGE}-{VIZ_ID}", - width="auto", - ), - dbc.Col( - [ - dmc.MultiSelect( - id=f"patterns-{PAGE}-{VIZ_ID}", - placeholder="Bot filter values", - data=[ - {"value": "bot", "label": "bot"}, - ], - classNames={"values": "dmc-multiselect-custom"}, - creatable=True, - searchable=True, - ), - ], - className="me-2", - ), - ], - align="center", - ), dbc.Row( [ dbc.Col( @@ -227,14 +200,13 @@ def graph_title(k, action_type): Input("repo-choices", "data"), Input(f"action-type-{PAGE}-{VIZ_ID}", "value"), Input(f"top-k-contributors-{PAGE}-{VIZ_ID}", "value"), - Input(f"patterns-{PAGE}-{VIZ_ID}", "value"), Input(f"date-picker-range-{PAGE}-{VIZ_ID}", "start_date"), Input(f"date-picker-range-{PAGE}-{VIZ_ID}", "end_date"), Input("bot-switch", "value"), ], background=True, ) -def create_top_k_cntrbs_graph(repolist, action_type, top_k, patterns, start_date, end_date, bot_switch): +def create_top_k_cntrbs_graph(repolist, action_type, top_k, start_date, end_date, bot_switch): # wait for data to asynchronously download and become available. while not_cached := cf.get_uncached(func_name=ctq.__name__, repolist=repolist): logging.warning(f"{VIZ_ID}- WAITING ON DATA TO BECOME AVAILABLE") @@ -266,7 +238,7 @@ def create_top_k_cntrbs_graph(repolist, action_type, top_k, patterns, start_date return dash.no_update, True # function for all data pre processing - df = process_data(df, action_type, top_k, patterns, start_date, end_date) + df = process_data(df, action_type, top_k, start_date, end_date) fig = create_figure(df, action_type) @@ -274,7 +246,7 @@ def create_top_k_cntrbs_graph(repolist, action_type, top_k, patterns, start_date return fig, False -def process_data(df: pd.DataFrame, action_type, top_k, patterns, start_date, end_date): +def process_data(df: pd.DataFrame, action_type, top_k, start_date, end_date): # convert to datetime objects rather than strings df["created_at"] = pd.to_datetime(df["created_at"], utc=True) @@ -290,17 +262,15 @@ def process_data(df: pd.DataFrame, action_type, top_k, patterns, start_date, end # subset the df such that it only contains rows where the Action column value is the action type df = df[df["Action"].str.contains(action_type)] - # option to filter out potential bots - if patterns: - # remove rows where login column value contains any keyword in patterns - patterns_mask = df["login"].str.contains("|".join(patterns), na=False) - df = df[~patterns_mask] + # get the number of total contributions of the specific action type + t_sum = df.shape[0] # count the number of contributions for each contributor df = (df.groupby("cntrb_id")["Action"].count()).to_frame() # sort rows according to amount of contributions from greatest to least - df.sort_values(by="cntrb_id", ascending=False, inplace=True) + df.sort_values(by="Action", ascending=False, inplace=True) + df = df.reset_index() # rename Action column to action_type @@ -312,9 +282,6 @@ def process_data(df: pd.DataFrame, action_type, top_k, patterns, start_date, end # index df to get first k rows df = df.head(top_k) - # convert cntrb_id from type UUID to String - df["cntrb_id"] = df["cntrb_id"].apply(lambda x: str(x).split("-")[0]) - # get the number of total top k contributions df_sum = df[action_type].sum() diff --git a/8Knot/pages/visualization_template/viz_template.py b/8Knot/pages/visualization_template/viz_template.py index 71c4d703..fd3eb093 100644 --- a/8Knot/pages/visualization_template/viz_template.py +++ b/8Knot/pages/visualization_template/viz_template.py @@ -13,6 +13,7 @@ import cache_manager.cache_facade as cf from pages.utils.job_utils import nodata_graph import time +import app """ NOTE: VARIABLES TO CHANGE: @@ -187,6 +188,7 @@ def toggle_popover(n, is_open): [ Input("repo-choices", "data"), Input(f"date-radio-{PAGE}-{VIZ_ID}", "value"), + # Input("bot-switch", "value"), # add additional inputs here ], background=True, @@ -211,6 +213,13 @@ def NAME_OF_VISUALIZATION_graph(repolist, interval): logging.warning(f"{VIZ_ID} - NO DATA AVAILABLE") return nodata_graph + # uncomment if bot filter applies to viz + """ + # remove bot data + if bot_switch: + df = df[~df["cntrb_id"].isin(app.bots_list)] + """ + # function for all data pre processing, COULD HAVE ADDITIONAL INPUTS AND OUTPUTS df = process_data(df, interval) diff --git a/8Knot/queries/query_template.py b/8Knot/queries/query_template.py index c05d5bfb..6b4e62af 100644 --- a/8Knot/queries/query_template.py +++ b/8Knot/queries/query_template.py @@ -39,7 +39,7 @@ def NAME_query(self, repos): """ (Worker Query) - Executes SQL query against Augur database for contributor data. + Executes SQL query against Augur database for GitHub data. Args: -----