Skip to content

Commit

Permalink
Merge pull request oss-aspen#700 from oss-aspen/dev
Browse files Browse the repository at this point in the history
Update main
  • Loading branch information
cdolfi authored Apr 13, 2024
2 parents be51d41 + 2c9bd1b commit 4980eb6
Show file tree
Hide file tree
Showing 11 changed files with 353 additions and 185 deletions.
8 changes: 8 additions & 0 deletions .wordlist-md
Original file line number Diff line number Diff line change
Expand Up @@ -72,3 +72,11 @@ Docker's
data-center
OAuth
postgres
Podman
filesystem
credsStore
credStore
filesystem
Podman
credsStore
credStore
8 changes: 5 additions & 3 deletions 8Knot/pages/chaoss/visualizations/project_velocity.py
Original file line number Diff line number Diff line change
Expand Up @@ -331,9 +331,11 @@ def process_data(
# replace all nan to 0
df_consolidated.fillna(value=0, inplace=True)

# log of commits and contribs
df_consolidated["log_num_commits"] = df_consolidated["Commit"].apply(math.log)
df_consolidated["log_num_contrib"] = df_consolidated["num_unique_contributors"].apply(math.log)
# log of commits and contribs if values are not 0
df_consolidated["log_num_commits"] = df_consolidated["Commit"].apply(lambda x: math.log(x) if x != 0 else 0)
df_consolidated["log_num_contrib"] = df_consolidated["num_unique_contributors"].apply(
lambda x: math.log(x) if x != 0 else 0
)

# column to hold the weighted values of pr and issues actions summed together
df_consolidated["prs_issues_actions_weighted"] = (
Expand Down
4 changes: 2 additions & 2 deletions 8Knot/pages/codebase/codebase.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,14 @@
[
dbc.Row(
[
dbc.Col(gc_cntrb_file_heatmap, width=12),
dbc.Col(gc_contribution_file_heatmap, width=12),
],
align="center",
style={"marginBottom": ".5%"},
),
dbc.Row(
[
dbc.Col(gc_contribution_file_heatmap, width=12),
dbc.Col(gc_cntrb_file_heatmap, width=12),
],
align="center",
style={"marginBottom": ".5%"},
Expand Down
161 changes: 133 additions & 28 deletions 8Knot/pages/codebase/visualizations/cntrb_file_heatmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@
classNames={"values": "dmc-multiselect-custom"},
searchable=True,
clearable=False,
value="Top Level Directory",
),
],
className="me-2",
Expand Down Expand Up @@ -215,7 +216,7 @@ def directory_dropdown(repo_id):

# add top level directory to the list of directories
directories.insert(0, "Top Level Directory")
logging.warning(f"DIRECTORY DROPDOWN - FINISHED")
logging.warning(f"CNTRB DIRECTORY DROPDOWN - FINISHED")

return directories, "Top Level Directory"

Expand All @@ -224,18 +225,19 @@ def directory_dropdown(repo_id):
@callback(
Output(f"{PAGE}-{VIZ_ID}", "figure"),
[
Input("repo-choices", "data"),
Input(f"repo-{PAGE}-{VIZ_ID}", "value"),
Input(f"directory-{PAGE}-{VIZ_ID}", "value"),
Input("bot-switch", "value"),
],
background=True,
)
def cntrb_file_heatmap_graph(repo_id, directory, bot_switch):
def cntrb_file_heatmap_graph(searchbar_repos, repo_id, directory, bot_switch):
start = time.perf_counter()
logging.warning(f"{VIZ_ID}- START")

# get dataframes of data from cache
df_file, df_actions, df_file_cntbs = multi_query_helper([repo_id])
df_file, df_actions, df_file_cntbs = multi_query_helper(searchbar_repos, [repo_id])

# test if there is data
if df_file.empty or df_actions.empty or df_file_cntbs.empty:
Expand All @@ -255,40 +257,40 @@ def cntrb_file_heatmap_graph(repo_id, directory, bot_switch):
return fig


def multi_query_helper(repos):
def multi_query_helper(searchbar_repos, repo):
"""
For cntrb_file_heatmap_graph-
hack to put all of the cache-retrieval
in the same place temporarily
"""

# wait for data to asynchronously download and become available.
while not_cached := cf.get_uncached(func_name=rfq.__name__, repolist=repos):
while not_cached := cf.get_uncached(func_name=rfq.__name__, repolist=repo):
logging.warning(f"CONTRIBUTOR FILE HEATMAP - WAITING ON DATA TO BECOME AVAILABLE")
time.sleep(0.5)

# wait for data to asynchronously download and become available.
while not_cached := cf.get_uncached(func_name=cnq.__name__, repolist=repos):
while not_cached := cf.get_uncached(func_name=cnq.__name__, repolist=searchbar_repos):
logging.warning(f"CONTRIBUTOR FILE HEATMAP - WAITING ON DATA TO BECOME AVAILABLE")
time.sleep(0.5)

# wait for data to asynchronously download and become available.
while not_cached := cf.get_uncached(func_name=cpfq.__name__, repolist=repos):
while not_cached := cf.get_uncached(func_name=cpfq.__name__, repolist=repo):
logging.warning(f"CONTRIBUTOR FILE HEATMAP - WAITING ON DATA TO BECOME AVAILABLE")
time.sleep(0.5)

# GET ALL DATA FROM POSTGRES CACHE
df_file = cf.retrieve_from_cache(
tablename=rfq.__name__,
repolist=repos,
repolist=repo,
)
df_actions = cf.retrieve_from_cache(
tablename=cnq.__name__,
repolist=repos,
repolist=searchbar_repos,
)
df_file_cntrbs = cf.retrieve_from_cache(
tablename=cpfq.__name__,
repolist=repos,
repolist=repo,
)

# necessary preprocessing steps that were lifted out of the querying step
Expand All @@ -305,6 +307,64 @@ def process_data(
directory,
bot_switch,
):
"""
Processing steps
1 - Cleans up file data to only include current files and relate files in the repository to the contributors who have reviewed them in past PRs.
2 - For a given level in the directory tree, aggregate the list of contributors for sub-directories and for individual files at the level.
3 - For each contributor, identify their most recent contribution.
4 - Transforms dataframe where columns are months with counts of "last seen" dates in that month and the rows are the file/subdirectory
"""

df_file = df_file_clean(df_file, df_file_cntbs, bot_switch)

df_dynamic_directory = cntrb_per_directory_value(directory, df_file)

# work around for using functions, will clean later
if df_dynamic_directory.empty:
return df_dynamic_directory

df_dynamic_directory = cntrb_to_last_activity(df_actions, df_dynamic_directory)

final = file_cntrb_activity_by_month(df_dynamic_directory, df_actions)

return final


def create_figure(df: pd.DataFrame):
    """Render the contributor "last seen" heatmap.

    Args:
    -----
        df : Pandas Dataframe
            Rows are directory entries, columns are months, values are
            counts of contributors last seen in that month.

    Returns:
    --------
        fig: plotly imshow heatmap figure
    """
    axis_labels = dict(x="Time", y="Directory Entries", color="Contributors")
    fig = px.imshow(df, labels=axis_labels, color_continuous_scale=px.colors.sequential.deep)

    # one tick per row, fixed height, colorbar moved left of the plot,
    # row labels on the right-hand side
    fig.update_layout(
        height=700,
        coloraxis_colorbar_x=-0.15,
        yaxis=dict(tickmode="linear", side="right"),
    )

    return fig


def df_file_clean(df_file: pd.DataFrame, df_file_cntbs: pd.DataFrame, bot_switch):
"""
This function cleans the df_file data and combines it with the related cntrb_ids
Args:
-----
df_file : Pandas Dataframe
Dataframe with the output of the repo_files_query
df_file_cntrbs : Pandas Dataframe
Dataframe with the output of the cntrb_per_file_query
bot_switch : boolean
T/F for the status of the bot switch
Returns:
--------
df_file: df with file and cntrb_ids of contributors that reviewed a pr with that file in it
"""
# strings to hold the values for each column (always the same for every row of this query)
repo_name = df_file["repo_name"].iloc[0]
repo_path = df_file["repo_path"].iloc[0]
Expand All @@ -326,7 +386,7 @@ def process_data(
df_file_cntbs.drop(["repo_id", "reviewer_ids"], axis=1, inplace=True)

# Left join on df_files to only get the files that are currently in the repository
# and the contributors that have ever opened a pr that included edits on the file
# and the contributors that have ever reviewed a pr that included edits on the file
df_file = pd.merge(df_file, df_file_cntbs, on="file_path", how="left")
# replace nan with empty string to avoid errors in list comprehension
df_file.cntrb_ids.fillna("", inplace=True)
Expand All @@ -343,6 +403,26 @@ def process_data(
axis=1,
)

return df_file


def cntrb_per_directory_value(directory, df_file):
"""
This function gets the files in the specified directory, groups together any files in
subdirectories, and creates a list of their contributors cntrb_ids
Args:
-----
directory : string
Output from the directory drop down
df_file : Pandas Dataframe
Dataframe with file and related cntrb_id information
Returns:
--------
df_dynamic_directory: df with the file and subdirectories and their reviewers cntrb_ids
"""
# determine directory level to use in later step
level = directory.count("/")
if directory == "Top Level Directory":
Expand Down Expand Up @@ -377,6 +457,25 @@ def process_data(
lambda row: set(row.cntrb_ids),
axis=1,
)
return df_dynamic_directory


def cntrb_to_last_activity(df_actions: pd.DataFrame, df_dynamic_directory: pd.DataFrame):
"""
This function creates a df with the files and the dates of the most recent activity for each cntrb_id.
Args:
-----
df_actions : Pandas Dataframe
Dataframe with contributor activity
df_dynamic_directory : Pandas Dataframe
Dataframe with file and related cntrb_id information
Returns:
--------
df_dynamic_directory: df with the file and subdirectories and the dates of the most recent activity for the reviewers.
"""

# date reformatting
df_actions["created_at"] = pd.to_datetime(df_actions["created_at"], utc=True)
Expand Down Expand Up @@ -406,6 +505,26 @@ def process_data(
# most recent activity - preprocessing step
df_dynamic_directory = df_dynamic_directory.explode("dates")

return df_dynamic_directory


def file_cntrb_activity_by_month(df_dynamic_directory: pd.DataFrame, df_actions: pd.DataFrame):
"""
This function transforms the df_dynamic_directory to be counts of "last seen" contributors by month.
Args:
-----
df_actions : Pandas Dataframe
Dataframe with contributor activity
df_dynamic_directory : Pandas Dataframe
Dataframe with file and related cntrb_id information
Returns:
--------
df_final: df with files and subdirectories as rows and the months as columns
"""

# get files that have no contributors and remove from set to prevent errors in grouper function
no_contribs = df_dynamic_directory["directory_value"][df_dynamic_directory.dates.isnull()].tolist()

Expand All @@ -415,8 +534,9 @@ def process_data(
there will be a column for every month even if there is no "last contribution" date in it. This greatly
improves the heatmap plotting"""

# dates based on action so it represents the length of the project
min_date = df_actions.created_at.min()
# dates based on action so it represents the length of the project, min based on PR
# open date to avoid committer inputted dates
min_date = df_actions[df_actions["Action"] == "PR Opened"].created_at.min()
max_date = df_actions.created_at.max()
dates = pd.date_range(start=min_date, end=max_date, freq="M", inclusive="both")
df_fill = dates.to_frame(index=False, name="dates")
Expand All @@ -436,18 +556,3 @@ def process_data(
final.loc[files] = None

return final


def create_figure(df: pd.DataFrame):
    """Build the contributor heatmap figure.

    `df` is expected to have directory entries as rows and months as
    columns; cell values are contributor counts.
    """
    fig = px.imshow(
        df,
        labels=dict(x="Time", y="Directory Entries", color="Contributors"),
        color_continuous_scale=px.colors.sequential.deep,
    )

    # force a tick for every row so each directory entry is labeled
    fig["layout"]["yaxis"]["tickmode"] = "linear"
    fig["layout"]["height"] = 700
    # shift the colorbar to the left of the plot area
    fig["layout"]["coloraxis_colorbar_x"] = -0.15
    # put the row labels on the right-hand side
    fig["layout"]["yaxis"]["side"] = "right"

    return fig
Loading

0 comments on commit 4980eb6

Please sign in to comment.