[Issue #2471] Updates SprintBurndown to use GitHubIssues dataset (#2618)

- Replaces the `SprintBoard` dataset with `GitHubIssues` in the
`SprintBurndown` and `SprintBurnup` metrics (a minimal usage sketch follows below)
- Reuses intermediate steps to calculate burnup/burndown across both
`SprintBurnup` and `SprintBurndown`
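
For reference, a minimal sketch of the refactored load-and-calculate path, mirroring the updated `calculate_sprint_burndown` command in the diff below (the JSON path and sprint name are hypothetical placeholders):

```python
# Sketch only: mirrors the new code path in analytics/src/analytics/cli.py;
# the file path and sprint name are hypothetical placeholders.
from analytics.datasets.issues import GitHubIssues
from analytics.metrics.base import Unit
from analytics.metrics.burndown import SprintBurndown

# Load the flattened issue-level export (replaces SprintBoard.load_from_json_files)
sprint_data = GitHubIssues.from_json("data/delivery-data.json")

# Calculate burndown for one sprint, summing story points per day
burndown = SprintBurndown(sprint_data, sprint="Sprint 10", unit=Unit.points)
results = burndown.calculate()  # daily counts of tickets opened, closed, and remaining
stats = burndown.get_stats()    # e.g. percent of issues closed and percent pointed
```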
widal001 authored Oct 31, 2024
1 parent f8c6c8e commit 2be4951
Showing 11 changed files with 700 additions and 528 deletions.
3 changes: 1 addition & 2 deletions analytics/Makefile
@@ -190,8 +190,7 @@ sprint-burndown:
@echo "=> Running sprint burndown report"
@echo "====================================================="
$(POETRY) analytics calculate sprint_burndown \
--sprint-file $(SPRINT_FILE) \
--issue-file $(ISSUE_FILE) \
--issue-file $(DELIVERY_FILE) \
--output-dir $(OUTPUT_DIR) \
--sprint "$(SPRINT)" \
--unit $(UNIT) \
13 changes: 2 additions & 11 deletions analytics/src/analytics/cli.py
@@ -11,7 +11,6 @@

from analytics.datasets.deliverable_tasks import DeliverableTasks
from analytics.datasets.issues import GitHubIssues
from analytics.datasets.sprint_board import SprintBoard
from analytics.integrations import db, github, slack
from analytics.metrics.base import BaseMetric, Unit
from analytics.metrics.burndown import SprintBurndown
@@ -133,7 +132,6 @@ def export_github_data(

@metrics_app.command(name="sprint_burndown")
def calculate_sprint_burndown(
sprint_file: Annotated[str, SPRINT_FILE_ARG],
issue_file: Annotated[str, ISSUE_FILE_ARG],
sprint: Annotated[str, SPRINT_ARG],
unit: Annotated[Unit, UNIT_ARG] = Unit.points.value, # type: ignore[assignment]
@@ -144,10 +142,7 @@ def calculate_sprint_burndown(
) -> None:
"""Calculate the burndown for a particular sprint."""
# load the input data
sprint_data = SprintBoard.load_from_json_files(
sprint_file=sprint_file,
issue_file=issue_file,
)
sprint_data = GitHubIssues.from_json(issue_file)
# calculate burndown
burndown = SprintBurndown(sprint_data, sprint=sprint, unit=unit)
show_and_or_post_results(
@@ -160,7 +155,6 @@

@metrics_app.command(name="sprint_burnup")
def calculate_sprint_burnup(
sprint_file: Annotated[str, SPRINT_FILE_ARG],
issue_file: Annotated[str, ISSUE_FILE_ARG],
sprint: Annotated[str, SPRINT_ARG],
unit: Annotated[Unit, UNIT_ARG] = Unit.points.value, # type: ignore[assignment]
@@ -171,10 +165,7 @@ def calculate_sprint_burnup(
) -> None:
"""Calculate the burnup of a particular sprint."""
# load the input data
sprint_data = SprintBoard.load_from_json_files(
sprint_file=sprint_file,
issue_file=issue_file,
)
sprint_data = GitHubIssues.from_json(issue_file)
# calculate burnup
burnup = SprintBurnup(sprint_data, sprint=sprint, unit=unit)
show_and_or_post_results(
63 changes: 59 additions & 4 deletions analytics/src/analytics/datasets/issues.py
@@ -4,7 +4,7 @@
from enum import Enum
from typing import Self

from pandas import DataFrame
import pandas as pd
from pydantic import BaseModel, Field, ValidationError

from analytics.datasets.base import BaseDataset
@@ -69,15 +69,70 @@ class IssueMetadata(BaseModel):
class GitHubIssues(BaseDataset):
"""GitHub issues with metadata about their parents (Epics and Deliverables) and sprints."""

def __init__(self, df: DataFrame) -> None:
def __init__(self, df: pd.DataFrame) -> None:
"""Initialize the GitHub Issues dataset."""
self.opened_col = "issue_created_at"
self.opened_col = "issue_opened_at"
self.closed_col = "issue_closed_at"
self.points_col = "issue_points"
self.sprint_col = "sprint_name"
self.sprint_start_col = "sprint_start"
self.sprint_end_col = "sprint_end"
self.date_cols = [
self.sprint_start_col,
self.sprint_end_col,
self.opened_col,
self.closed_col,
]
# Convert date cols into dates
for col in self.date_cols:
# strip off the timestamp portion of the date
df[col] = pd.to_datetime(df[col]).dt.floor("d")
super().__init__(df)
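
The loop above normalizes every date column to midnight so that issues opened or closed at any time of day are counted on the same calendar date. A tiny illustration of the `.dt.floor("d")` behavior, using made-up timestamps:

```python
import pandas as pd

# Made-up timestamps illustrating the .dt.floor("d") normalization used above
raw = pd.Series(["2024-10-07 13:45:00", "2024-10-08 02:10:00"])
floored = pd.to_datetime(raw).dt.floor("d")

# Both values are truncated to midnight, so the time-of-day component is dropped
print(floored.tolist())  # [Timestamp('2024-10-07 00:00:00'), Timestamp('2024-10-08 00:00:00')]
```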

def sprint_start(self, sprint: str) -> pd.Timestamp:
"""Return the date on which a given sprint started."""
sprint_mask = self.df[self.sprint_col] == sprint
return self.df.loc[sprint_mask, self.sprint_start_col].min()

def sprint_end(self, sprint: str) -> pd.Timestamp:
"""Return the date on which a given sprint ended."""
sprint_mask = self.df[self.sprint_col] == sprint
return self.df.loc[sprint_mask, self.sprint_end_col].max()

@property
def sprints(self) -> pd.DataFrame:
"""Return the unique list of sprints with their start and end dates."""
sprint_cols = [self.sprint_col, self.sprint_start_col, self.sprint_end_col]
return self.df[sprint_cols].drop_duplicates()

@property
def current_sprint(self) -> str | None:
"""Return the name of the current sprint, if a sprint is currently active."""
return self.get_sprint_name_from_date(pd.Timestamp.today().floor("d"))

def get_sprint_name_from_date(self, date: pd.Timestamp) -> str | None:
"""Get the name of a sprint from a given date, if that date falls in a sprint."""
# fmt: off
date_filter = (
(self.sprints[self.sprint_start_col] < date) # after sprint start
& (self.sprints[self.sprint_end_col] >= date) # before sprint end
)
# fmt: on
matching_sprints = self.sprints.loc[date_filter, self.sprint_col]
# if there aren't any sprints return None
if len(matching_sprints) == 0:
return None
# if there are, return the first value as a string
return str(matching_sprints.squeeze())
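
Together, these helpers let the metrics resolve sprint boundaries directly from the issue data instead of a separate sprint-board export. A short usage sketch (the file path, sprint name, and date are hypothetical placeholders):

```python
import pandas as pd
from analytics.datasets.issues import GitHubIssues

# File path, sprint name, and date below are hypothetical placeholders
issues = GitHubIssues.from_json("data/delivery-data.json")

start = issues.sprint_start("Sprint 10")  # earliest recorded start date for that sprint
end = issues.sprint_end("Sprint 10")      # latest recorded end date for that sprint

# Map an arbitrary date back to a sprint name (None if it falls outside every sprint)
name = issues.get_sprint_name_from_date(pd.Timestamp("2024-10-15"))
current = issues.current_sprint  # same lookup, keyed on today's date
```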

def to_dict(self) -> list[dict]:
"""Convert this dataset to a python dictionary."""
# Convert date cols into dates
for col in self.date_cols:
# strip off the timestamp portion of the date
self.df[col] = self.df[col].dt.strftime("%Y-%m-%d")
return super().to_dict()

@classmethod
def load_from_json_files(
cls,
@@ -94,7 +149,7 @@ def load_from_json_files(
lookup = populate_issue_lookup_table(lookup, sprint_data_in)
# Flatten and write issue level data to output file
issues = flatten_issue_data(lookup)
return cls(DataFrame(data=issues))
return cls(pd.DataFrame(data=issues))


# ===============================================================
145 changes: 30 additions & 115 deletions analytics/src/analytics/metrics/burndown.py
@@ -7,25 +7,25 @@

from __future__ import annotations

from typing import TYPE_CHECKING, Literal
from typing import TYPE_CHECKING

import pandas as pd
import plotly.express as px
from numpy import nan

from analytics.datasets.sprint_board import SprintBoard
from analytics.datasets.issues import GitHubIssues
from analytics.metrics.base import BaseMetric, Statistic, Unit
from analytics.metrics.utils import Columns, sum_tix_by_day

if TYPE_CHECKING:
from plotly.graph_objects import Figure


class SprintBurndown(BaseMetric[SprintBoard]):
class SprintBurndown(BaseMetric[GitHubIssues]):
"""Calculates the running total of open issues per day in the sprint."""

def __init__(
self,
dataset: SprintBoard,
dataset: GitHubIssues,
sprint: str,
unit: Unit,
) -> None:
@@ -34,36 +34,34 @@ def __init__(
self.sprint = self._get_and_validate_sprint_name(sprint)
self.sprint_data = self._isolate_data_for_this_sprint()
self.date_col = "date"
self.points_col = "points"
self.opened_col = dataset.opened_col # type: ignore[attr-defined]
self.closed_col = dataset.closed_col # type: ignore[attr-defined]
self.columns = Columns(
opened_at_col=dataset.opened_col,
closed_at_col=dataset.closed_col,
unit_col=dataset.points_col if unit == Unit.points else unit.value,
date_col=self.date_col,
)
self.unit = unit
# Set the value of the unit column based on
# whether we're summing issues or story points
self.unit_col = dataset.points_col if unit == Unit.points else unit.value
super().__init__(dataset)

def calculate(self) -> pd.DataFrame:
"""
Calculate the sprint burndown.
Notes
-----
Sprint burndown is calculated with the following algorithm:
1. Isolate the records that belong to the given sprint
2. Get the range of dates over which these issues were opened and closed
3. Count the number of issues opened and closed on each day of that range
4. Calculate the delta between opened and closed issues per day
5. Cumulatively sum those deltas to get the running total of open tix
"""
"""Calculate the sprint burnup."""
# make a copy of columns and rows we need to calculate burndown for this sprint
burndown_cols = [self.opened_col, self.closed_col, self.points_col]
df_sprint = self.sprint_data[burndown_cols].copy()
# get the date range over which tix were created and closed
df_tix_range = self._get_tix_date_range(df_sprint)
# get the number of tix opened and closed each day
df_opened = self._get_daily_tix_counts_by_status(df_sprint, "opened")
df_closed = self._get_daily_tix_counts_by_status(df_sprint, "closed")
# combine the daily opened and closed counts to get total open per day
return self._get_cum_sum_of_open_tix(df_tix_range, df_opened, df_closed)
burnup_cols = [
self.dataset.opened_col,
self.dataset.closed_col,
self.dataset.points_col,
]
df_sprint = self.sprint_data[burnup_cols].copy()
# Count the number of tickets opened, closed, and remaining by day
return sum_tix_by_day(
df=df_sprint,
cols=self.columns,
unit=self.unit,
sprint_end=self.dataset.sprint_end(self.sprint),
)

def plot_results(self) -> Figure:
"""Plot the sprint burndown using a plotly line chart."""
@@ -74,7 +72,7 @@ def plot_results(self) -> Figure:
sprint_end = self.dataset.sprint_end(self.sprint)
date_mask = self.results[self.date_col].between(
sprint_start,
min(sprint_end, pd.Timestamp.today(tz="utc")),
min(sprint_end, pd.Timestamp.today()),
)
df = self.results[date_mask]
# create a line chart from the data in self.results
@@ -108,7 +106,7 @@ def get_stats(self) -> dict[str, Statistic]:
total_closed = int(df["closed"].sum())
pct_closed = round(total_closed / total_opened * 100, 2)
# get the percentage of tickets that were pointed
is_pointed = self.sprint_data[Unit.points.value] >= 1
is_pointed = self.sprint_data[self.dataset.points_col] >= 1
issues_pointed = len(self.sprint_data[is_pointed])
issues_total = len(self.sprint_data)
pct_pointed = round(issues_pointed / issues_total * 100, 2)
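
For concreteness, a worked example of the two percentages with made-up sprint numbers:

```python
# Hypothetical sprint: 12 issues opened, 9 closed, 10 of them estimated (>= 1 point)
total_opened, total_closed = 12, 9
issues_total, issues_pointed = 12, 10

pct_closed = round(total_closed / total_opened * 100, 2)     # 75.0
pct_pointed = round(issues_pointed / issues_total * 100, 2)  # 83.33
```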
@@ -153,86 +151,3 @@ def _isolate_data_for_this_sprint(self) -> pd.DataFrame:
"""Filter out issues that are not assigned to the current sprint."""
sprint_filter = self.dataset.df[self.dataset.sprint_col] == self.sprint
return self.dataset.df[sprint_filter]

def _get_daily_tix_counts_by_status(
self,
df: pd.DataFrame,
status: Literal["opened", "closed"],
) -> pd.DataFrame:
"""
Count the number of issues or points opened or closed by date.
Notes
-----
It does this by:
- Grouping on the created_date or opened_date column, depending on status
- Counting the total number of rows per group
"""
# create local copies of the key column names
agg_col = self.opened_col if status == "opened" else self.closed_col
unit_col = self.unit.value
key_cols = [agg_col, unit_col]
# create a dummy column to sum per row if the unit is tasks
if self.unit == Unit.issues:
df[unit_col] = 1
# isolate the key columns, group by open or closed date, then sum the units
df_agg = df[key_cols].groupby(agg_col, as_index=False).agg({unit_col: "sum"})
return df_agg.rename(columns={agg_col: self.date_col, unit_col: status})

def _get_tix_date_range(self, df: pd.DataFrame) -> pd.DataFrame:
"""
Get the date range over which issues were created and closed.
Notes
-----
It does this by:
- Finding the date when the sprint ends
- Finding the earliest date a issue was created
- Finding the latest date a issue was closed
- Creating a row for each day between the earliest date a ticket was opened
and either the sprint end _or_ the latest date an issue was closed,
whichever is the later date.
"""
# get earliest date an issue was opened and latest date one was closed
sprint_end = self.dataset.sprint_end(self.sprint)
opened_min = df[self.opened_col].min()
closed_max = df[self.closed_col].max()
closed_max = sprint_end if closed_max is nan else max(sprint_end, closed_max)
# creates a dataframe with one row for each day between min and max date
return pd.DataFrame(
pd.date_range(opened_min, closed_max),
columns=[self.date_col],
)

def _get_cum_sum_of_open_tix(
self,
dates: pd.DataFrame,
opened: pd.DataFrame,
closed: pd.DataFrame,
) -> pd.DataFrame:
"""
Get the cumulative sum of open issues per day.
Notes
-----
It does this by:
- Left joining the full date range to the daily open and closed counts
so that we have a row for each day of the range, with a column for tix
opened and a column for tix closed on that day
- Subtracting closed from opened to get the "delta" on each day in the range
- Cumulatively summing the deltas to get the running total of open tix
"""
# left join the full date range to open and closed counts
df = (
dates.merge(opened, on=self.date_col, how="left")
.merge(closed, on=self.date_col, how="left")
.fillna(0)
)
# calculate the difference between opened and closed each day
df["delta"] = df["opened"] - df["closed"]
# cumulatively sum the deltas to get the running total
df["total_open"] = df["delta"].cumsum()
return df
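
The removed helpers above spell out the running-total algorithm that, per the commit description, both metrics now share via the `sum_tix_by_day` util (its implementation is not shown in this diff). A compact, self-contained sketch of the delta/cumulative-sum step with made-up daily counts:

```python
import pandas as pd

# Made-up daily counts for a five-day window; the real pipeline derives these
# by grouping issues on the dates they were opened and closed.
df = pd.DataFrame(
    {
        "date": pd.date_range("2024-10-07", periods=5),
        "opened": [4, 2, 0, 1, 0],
        "closed": [0, 1, 2, 3, 1],
    }
)

# Net change per day, then a running total of tickets still open
df["delta"] = df["opened"] - df["closed"]
df["total_open"] = df["delta"].cumsum()
print(df)  # total_open column: 4, 5, 3, 1, 0
```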