Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update main #700

Merged
merged 18 commits into from
Apr 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions .wordlist-md
Original file line number Diff line number Diff line change
Expand Up @@ -72,3 +72,11 @@ Docker's
data-center
OAuth
postgres
Podman
filesystem
credsStore
credStore
filesystem
Podman
credsStore
credStore
8 changes: 5 additions & 3 deletions 8Knot/pages/chaoss/visualizations/project_velocity.py
Original file line number Diff line number Diff line change
Expand Up @@ -331,9 +331,11 @@ def process_data(
# replace all nan to 0
df_consolidated.fillna(value=0, inplace=True)

# log of commits and contribs
df_consolidated["log_num_commits"] = df_consolidated["Commit"].apply(math.log)
df_consolidated["log_num_contrib"] = df_consolidated["num_unique_contributors"].apply(math.log)
# log of commits and contribs if values are not 0
df_consolidated["log_num_commits"] = df_consolidated["Commit"].apply(lambda x: math.log(x) if x != 0 else 0)
df_consolidated["log_num_contrib"] = df_consolidated["num_unique_contributors"].apply(
lambda x: math.log(x) if x != 0 else 0
)

# column to hold the weighted values of pr and issues actions summed together
df_consolidated["prs_issues_actions_weighted"] = (
Expand Down
4 changes: 2 additions & 2 deletions 8Knot/pages/codebase/codebase.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,14 @@
[
dbc.Row(
[
dbc.Col(gc_cntrb_file_heatmap, width=12),
dbc.Col(gc_contribution_file_heatmap, width=12),
],
align="center",
style={"marginBottom": ".5%"},
),
dbc.Row(
[
dbc.Col(gc_contribution_file_heatmap, width=12),
dbc.Col(gc_cntrb_file_heatmap, width=12),
],
align="center",
style={"marginBottom": ".5%"},
Expand Down
161 changes: 133 additions & 28 deletions 8Knot/pages/codebase/visualizations/cntrb_file_heatmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@
classNames={"values": "dmc-multiselect-custom"},
searchable=True,
clearable=False,
value="Top Level Directory",
),
],
className="me-2",
Expand Down Expand Up @@ -215,7 +216,7 @@ def directory_dropdown(repo_id):

# add top level directory to the list of directories
directories.insert(0, "Top Level Directory")
logging.warning(f"DIRECTORY DROPDOWN - FINISHED")
logging.warning(f"CNTRB DIRECTORY DROPDOWN - FINISHED")

return directories, "Top Level Directory"

Expand All @@ -224,18 +225,19 @@ def directory_dropdown(repo_id):
@callback(
Output(f"{PAGE}-{VIZ_ID}", "figure"),
[
Input("repo-choices", "data"),
Input(f"repo-{PAGE}-{VIZ_ID}", "value"),
Input(f"directory-{PAGE}-{VIZ_ID}", "value"),
Input("bot-switch", "value"),
],
background=True,
)
def cntrb_file_heatmap_graph(repo_id, directory, bot_switch):
def cntrb_file_heatmap_graph(searchbar_repos, repo_id, directory, bot_switch):
start = time.perf_counter()
logging.warning(f"{VIZ_ID}- START")

# get dataframes of data from cache
df_file, df_actions, df_file_cntbs = multi_query_helper([repo_id])
df_file, df_actions, df_file_cntbs = multi_query_helper(searchbar_repos, [repo_id])

# test if there is data
if df_file.empty or df_actions.empty or df_file_cntbs.empty:
Expand All @@ -255,40 +257,40 @@ def cntrb_file_heatmap_graph(repo_id, directory, bot_switch):
return fig


def multi_query_helper(repos):
def multi_query_helper(searchbar_repos, repo):
"""
For cntrb_file_heatmap_graph-
hack to put all of the cache-retrieval
in the same place temporarily
"""

# wait for data to asynchronously download and become available.
while not_cached := cf.get_uncached(func_name=rfq.__name__, repolist=repos):
while not_cached := cf.get_uncached(func_name=rfq.__name__, repolist=repo):
logging.warning(f"CONTRIBUTOR FILE HEATMAP - WAITING ON DATA TO BECOME AVAILABLE")
time.sleep(0.5)

# wait for data to asynchronously download and become available.
while not_cached := cf.get_uncached(func_name=cnq.__name__, repolist=repos):
while not_cached := cf.get_uncached(func_name=cnq.__name__, repolist=searchbar_repos):
logging.warning(f"CONTRIBUTOR FILE HEATMAP - WAITING ON DATA TO BECOME AVAILABLE")
time.sleep(0.5)

# wait for data to asynchronously download and become available.
while not_cached := cf.get_uncached(func_name=cpfq.__name__, repolist=repos):
while not_cached := cf.get_uncached(func_name=cpfq.__name__, repolist=repo):
logging.warning(f"CONTRIBUTOR FILE HEATMAP - WAITING ON DATA TO BECOME AVAILABLE")
time.sleep(0.5)

# GET ALL DATA FROM POSTGRES CACHE
df_file = cf.retrieve_from_cache(
tablename=rfq.__name__,
repolist=repos,
repolist=repo,
)
df_actions = cf.retrieve_from_cache(
tablename=cnq.__name__,
repolist=repos,
repolist=searchbar_repos,
)
df_file_cntrbs = cf.retrieve_from_cache(
tablename=cpfq.__name__,
repolist=repos,
repolist=repo,
)

# necessary preprocessing steps that were lifted out of the querying step
Expand All @@ -305,6 +307,64 @@ def process_data(
directory,
bot_switch,
):
"""
Processing steps

1 - Cleans up file data to only include current files and relate files in the repository to the contributors who have reviewed them in past PRs.
2 - For a given level in the directory tree, aggregate the list of contributors for sub-directories and for individual files at the level.
3 - For each contributor, identify their most recent contribution.
4 - Transforms dataframe where columns are months with counts of "last seen" dates in that month and the rows are the file/subdirectory
"""

df_file = df_file_clean(df_file, df_file_cntbs, bot_switch)

df_dynamic_directory = cntrb_per_directory_value(directory, df_file)

# work around for using functions, will clean later
if df_dynamic_directory.empty:
return df_dynamic_directory

df_dynamic_directory = cntrb_to_last_activity(df_actions, df_dynamic_directory)

final = file_cntrb_activity_by_month(df_dynamic_directory, df_actions)

return final


def create_figure(df: pd.DataFrame):
    """Build the contributor-activity heatmap figure.

    Rows of *df* are directory entries, columns are time buckets, and the
    cell value (color) is the number of contributors "last seen" in that
    bucket.

    Args:
    -----
    df : Pandas Dataframe
        Pivoted dataframe of contributor counts per entry per month.

    Returns:
    --------
    plotly Figure with the rendered heatmap.
    """
    heatmap = px.imshow(
        df,
        labels=dict(x="Time", y="Directory Entries", color="Contributors"),
        color_continuous_scale=px.colors.sequential.deep,
    )

    # Fixed layout tweaks: label every row, move the colorbar to the left
    # of the plot, and place the y-axis tick labels on the right.
    heatmap.update_layout(
        height=700,
        coloraxis_colorbar_x=-0.15,
        yaxis_tickmode="linear",
        yaxis_side="right",
    )

    return heatmap


def df_file_clean(df_file: pd.DataFrame, df_file_cntbs: pd.DataFrame, bot_switch):
"""
This function cleans the df_file data and combines it with the related cntrb_ids

Args:
-----
df_file : Pandas Dataframe
Dataframe with the output of the repo_files_query

df_file_cntbs : Pandas Dataframe
Dataframe with the output of the cntrb_per_file_query

bot_switch : boolean
T/F for the status of the bot switch

Returns:
--------
df_file: df with file and cntrb_ids of contributors that reviewed a pr with that file in it
"""
# strings to hold the values for each column (always the same for every row of this query)
repo_name = df_file["repo_name"].iloc[0]
repo_path = df_file["repo_path"].iloc[0]
Expand All @@ -326,7 +386,7 @@ def process_data(
df_file_cntbs.drop(["repo_id", "reviewer_ids"], axis=1, inplace=True)

# Left join on df_files to only get the files that are currently in the repository
# and the contributors that have ever opened a pr that included edits on the file
# and the contributors that have ever reviewed a pr that included edits on the file
df_file = pd.merge(df_file, df_file_cntbs, on="file_path", how="left")
# replace nan with empty string to avoid errors in list comprehension
df_file.cntrb_ids.fillna("", inplace=True)
Expand All @@ -343,6 +403,26 @@ def process_data(
axis=1,
)

return df_file


def cntrb_per_directory_value(directory, df_file):
"""
This function gets the files in the specified directory, groups together any files in
subdirectories, and creates a list of their contributors cntrb_ids

Args:
-----
directory : string
Output from the directory drop down

df_file : Pandas Dataframe
Dataframe with file and related cntrb_id information

Returns:
--------
df_dynamic_directory: df with the file and subdirectories and their reviewers cntrb_ids
"""
# determine directory level to use in later step
level = directory.count("/")
if directory == "Top Level Directory":
Expand Down Expand Up @@ -377,6 +457,25 @@ def process_data(
lambda row: set(row.cntrb_ids),
axis=1,
)
return df_dynamic_directory


def cntrb_to_last_activity(df_actions: pd.DataFrame, df_dynamic_directory: pd.DataFrame):
"""
This function creates a df with the files and the dates of the most recent activity for each cntrb_id.

Args:
-----
df_actions : Pandas Dataframe
Dataframe with contributor activity

df_dynamic_directory : Pandas Dataframe
Dataframe with file and related cntrb_id information

Returns:
--------
df_dynamic_directory: df with the file and subdirectories and the dates of the most recent activity for the reviewers.
"""

# date reformating
df_actions["created_at"] = pd.to_datetime(df_actions["created_at"], utc=True)
Expand Down Expand Up @@ -406,6 +505,26 @@ def process_data(
# most recent activity - preprocessing step
df_dynamic_directory = df_dynamic_directory.explode("dates")

return df_dynamic_directory


def file_cntrb_activity_by_month(df_dynamic_directory: pd.DataFrame, df_actions: pd.DataFrame):
"""
This function transforms the df_dynamic_directory to be counts of "last seen" contributors by month.

Args:
-----
df_actions : Pandas Dataframe
Dataframe with contributor activity

df_dynamic_directory : Pandas Dataframe
Dataframe with file and related cntrb_id information

Returns:
--------
df_final: df with files and subdirectories as rows and the months as columns
"""

# get files that have no contributors and remove from set to prevent errors in grouper function
no_contribs = df_dynamic_directory["directory_value"][df_dynamic_directory.dates.isnull()].tolist()

Expand All @@ -415,8 +534,9 @@ def process_data(
there will be a column for every month even if there is no "last contribution" date in it. This greatly
improves the heatmap plotting"""

# dates based on action so it represents the length of the project
min_date = df_actions.created_at.min()
# dates based on action so it represents the length of the project, min based on PR
# open date to avoid committer inputted dates
min_date = df_actions[df_actions["Action"] == "PR Opened"].created_at.min()
max_date = df_actions.created_at.max()
dates = pd.date_range(start=min_date, end=max_date, freq="M", inclusive="both")
df_fill = dates.to_frame(index=False, name="dates")
Expand All @@ -436,18 +556,3 @@ def process_data(
final.loc[files] = None

return final


def create_figure(df: pd.DataFrame):
    """Render *df* as a contributor "last seen" heatmap.

    Args:
    -----
    df : Pandas Dataframe
        Pivoted dataframe: directory entries as rows, months as columns,
        contributor counts as values.

    Returns:
    --------
    plotly Figure for display in the dashboard.
    """
    axis_labels = dict(x="Time", y="Directory Entries", color="Contributors")
    fig = px.imshow(
        df,
        labels=axis_labels,
        color_continuous_scale=px.colors.sequential.deep,
    )

    # Show a tick for every row and keep the labels on the right edge.
    fig.update_yaxes(tickmode="linear", side="right")
    # Tall plot with the colorbar shifted to the left of the axis area.
    fig.update_layout(height=700, coloraxis_colorbar_x=-0.15)

    return fig
Loading
Loading