[Issue #2471] Updates SprintBurndown to use GitHubIssues dataset (#2618)

- Replaces the `SprintBoard` dataset with `GitHubIssues` in the
`SprintBurndown` and `SprintBurnup` metrics (a minimal usage sketch follows below)
- Reuses intermediate steps to calculate burnup/burndown across both
`SprintBurnup` and `SprintBurndown`
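
For reference, a minimal sketch of the refactored load-and-calculate path, mirroring the updated `calculate_sprint_burndown` command in the diff below (the JSON path and sprint name are hypothetical placeholders):

```python
# Sketch only: mirrors the new code path in analytics/src/analytics/cli.py;
# the file path and sprint name are hypothetical placeholders.
from analytics.datasets.issues import GitHubIssues
from analytics.metrics.base import Unit
from analytics.metrics.burndown import SprintBurndown

# Load the flattened issue-level export (replaces SprintBoard.load_from_json_files)
sprint_data = GitHubIssues.from_json("data/delivery-data.json")

# Calculate burndown for one sprint, summing story points per day
burndown = SprintBurndown(sprint_data, sprint="Sprint 10", unit=Unit.points)
results = burndown.calculate()  # daily counts of tickets opened, closed, and remaining
stats = burndown.get_stats()    # e.g. percent of issues closed and percent pointed
```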
widal001 authored Oct 31, 2024
1 parent f8c6c8e commit 2be4951
Showing 11 changed files with 700 additions and 528 deletions.
3 changes: 1 addition & 2 deletions analytics/Makefile
@@ -190,8 +190,7 @@ sprint-burndown:
@echo "=> Running sprint burndown report"
@echo "====================================================="
$(POETRY) analytics calculate sprint_burndown \
--sprint-file $(SPRINT_FILE) \
--issue-file $(ISSUE_FILE) \
--issue-file $(DELIVERY_FILE) \
--output-dir $(OUTPUT_DIR) \
--sprint "$(SPRINT)" \
--unit $(UNIT) \
13 changes: 2 additions & 11 deletions analytics/src/analytics/cli.py
@@ -11,7 +11,6 @@

from analytics.datasets.deliverable_tasks import DeliverableTasks
from analytics.datasets.issues import GitHubIssues
from analytics.datasets.sprint_board import SprintBoard
from analytics.integrations import db, github, slack
from analytics.metrics.base import BaseMetric, Unit
from analytics.metrics.burndown import SprintBurndown
@@ -133,7 +132,6 @@ def export_github_data(

@metrics_app.command(name="sprint_burndown")
def calculate_sprint_burndown(
sprint_file: Annotated[str, SPRINT_FILE_ARG],
issue_file: Annotated[str, ISSUE_FILE_ARG],
sprint: Annotated[str, SPRINT_ARG],
unit: Annotated[Unit, UNIT_ARG] = Unit.points.value, # type: ignore[assignment]
@@ -144,10 +142,7 @@ def calculate_sprint_burndown(
) -> None:
"""Calculate the burndown for a particular sprint."""
# load the input data
sprint_data = SprintBoard.load_from_json_files(
sprint_file=sprint_file,
issue_file=issue_file,
)
sprint_data = GitHubIssues.from_json(issue_file)
# calculate burndown
burndown = SprintBurndown(sprint_data, sprint=sprint, unit=unit)
show_and_or_post_results(
@@ -160,7 +155,6 @@

@metrics_app.command(name="sprint_burnup")
def calculate_sprint_burnup(
sprint_file: Annotated[str, SPRINT_FILE_ARG],
issue_file: Annotated[str, ISSUE_FILE_ARG],
sprint: Annotated[str, SPRINT_ARG],
unit: Annotated[Unit, UNIT_ARG] = Unit.points.value, # type: ignore[assignment]
@@ -171,10 +165,7 @@ def calculate_sprint_burnup(
) -> None:
"""Calculate the burnup of a particular sprint."""
# load the input data
sprint_data = SprintBoard.load_from_json_files(
sprint_file=sprint_file,
issue_file=issue_file,
)
sprint_data = GitHubIssues.from_json(issue_file)
# calculate burnup
burnup = SprintBurnup(sprint_data, sprint=sprint, unit=unit)
show_and_or_post_results(
63 changes: 59 additions & 4 deletions analytics/src/analytics/datasets/issues.py
@@ -4,7 +4,7 @@
from enum import Enum
from typing import Self

from pandas import DataFrame
import pandas as pd
from pydantic import BaseModel, Field, ValidationError

from analytics.datasets.base import BaseDataset
@@ -69,15 +69,70 @@ class IssueMetadata(BaseModel):
class GitHubIssues(BaseDataset):
"""GitHub issues with metadata about their parents (Epics and Deliverables) and sprints."""

def __init__(self, df: DataFrame) -> None:
def __init__(self, df: pd.DataFrame) -> None:
"""Initialize the GitHub Issues dataset."""
self.opened_col = "issue_created_at"
self.opened_col = "issue_opened_at"
self.closed_col = "issue_closed_at"
self.points_col = "issue_points"
self.sprint_col = "sprint_name"
self.sprint_start_col = "sprint_start"
self.sprint_end_col = "sprint_end"
self.date_cols = [
self.sprint_start_col,
self.sprint_end_col,
self.opened_col,
self.closed_col,
]
# Convert date cols into dates
for col in self.date_cols:
# strip off the timestamp portion of the date
df[col] = pd.to_datetime(df[col]).dt.floor("d")
super().__init__(df)
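
The loop above normalizes every date column to midnight so that issues opened or closed at any time of day are counted on the same calendar date. A tiny illustration of the `.dt.floor("d")` behavior, using made-up timestamps:

```python
import pandas as pd

# Made-up timestamps illustrating the .dt.floor("d") normalization used above
raw = pd.Series(["2024-10-07 13:45:00", "2024-10-08 02:10:00"])
floored = pd.to_datetime(raw).dt.floor("d")

# Both values are truncated to midnight, so the time-of-day component is dropped
print(floored.tolist())  # [Timestamp('2024-10-07 00:00:00'), Timestamp('2024-10-08 00:00:00')]
```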

def sprint_start(self, sprint: str) -> pd.Timestamp:
"""Return the date on which a given sprint started."""
sprint_mask = self.df[self.sprint_col] == sprint
return self.df.loc[sprint_mask, self.sprint_start_col].min()

def sprint_end(self, sprint: str) -> pd.Timestamp:
"""Return the date on which a given sprint ended."""
sprint_mask = self.df[self.sprint_col] == sprint
return self.df.loc[sprint_mask, self.sprint_end_col].max()

@property
def sprints(self) -> pd.DataFrame:
"""Return the unique list of sprints with their start and end dates."""
sprint_cols = [self.sprint_col, self.sprint_start_col, self.sprint_end_col]
return self.df[sprint_cols].drop_duplicates()

@property
def current_sprint(self) -> str | None:
"""Return the name of the current sprint, if a sprint is currently active."""
return self.get_sprint_name_from_date(pd.Timestamp.today().floor("d"))

def get_sprint_name_from_date(self, date: pd.Timestamp) -> str | None:
"""Get the name of a sprint from a given date, if that date falls in a sprint."""
# fmt: off
date_filter = (
(self.sprints[self.sprint_start_col] < date) # after sprint start
& (self.sprints[self.sprint_end_col] >= date) # before sprint end
)
# fmt: on
matching_sprints = self.sprints.loc[date_filter, self.sprint_col]
# if there aren't any sprints return None
if len(matching_sprints) == 0:
return None
# if there are, return the first value as a string
return str(matching_sprints.squeeze())
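
Together, these helpers let the metrics resolve sprint boundaries directly from the issue data instead of a separate sprint-board export. A short usage sketch (the file path, sprint name, and date are hypothetical placeholders):

```python
import pandas as pd
from analytics.datasets.issues import GitHubIssues

# File path, sprint name, and date below are hypothetical placeholders
issues = GitHubIssues.from_json("data/delivery-data.json")

start = issues.sprint_start("Sprint 10")  # earliest recorded start date for that sprint
end = issues.sprint_end("Sprint 10")      # latest recorded end date for that sprint

# Map an arbitrary date back to a sprint name (None if it falls outside every sprint)
name = issues.get_sprint_name_from_date(pd.Timestamp("2024-10-15"))
current = issues.current_sprint  # same lookup, keyed on today's date
```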

def to_dict(self) -> list[dict]:
"""Convert this dataset to a python dictionary."""
# Convert date cols into dates
for col in self.date_cols:
# strip off the timestamp portion of the date
self.df[col] = self.df[col].dt.strftime("%Y-%m-%d")
return super().to_dict()

@classmethod
def load_from_json_files(
cls,
@@ -94,7 +149,7 @@ def load_from_json_files(
lookup = populate_issue_lookup_table(lookup, sprint_data_in)
# Flatten and write issue level data to output file
issues = flatten_issue_data(lookup)
return cls(DataFrame(data=issues))
return cls(pd.DataFrame(data=issues))


# ===============================================================
145 changes: 30 additions & 115 deletions analytics/src/analytics/metrics/burndown.py
@@ -7,25 +7,25 @@

from __future__ import annotations

from typing import TYPE_CHECKING, Literal
from typing import TYPE_CHECKING

import pandas as pd
import plotly.express as px
from numpy import nan

from analytics.datasets.sprint_board import SprintBoard
from analytics.datasets.issues import GitHubIssues
from analytics.metrics.base import BaseMetric, Statistic, Unit
from analytics.metrics.utils import Columns, sum_tix_by_day

if TYPE_CHECKING:
from plotly.graph_objects import Figure


class SprintBurndown(BaseMetric[SprintBoard]):
class SprintBurndown(BaseMetric[GitHubIssues]):
"""Calculates the running total of open issues per day in the sprint."""

def __init__(
self,
dataset: SprintBoard,
dataset: GitHubIssues,
sprint: str,
unit: Unit,
) -> None:
@@ -34,36 +34,34 @@ def __init__(
self.sprint = self._get_and_validate_sprint_name(sprint)
self.sprint_data = self._isolate_data_for_this_sprint()
self.date_col = "date"
self.points_col = "points"
self.opened_col = dataset.opened_col # type: ignore[attr-defined]
self.closed_col = dataset.closed_col # type: ignore[attr-defined]
self.columns = Columns(
opened_at_col=dataset.opened_col,
closed_at_col=dataset.closed_col,
unit_col=dataset.points_col if unit == Unit.points else unit.value,
date_col=self.date_col,
)
self.unit = unit
# Set the value of the unit column based on
# whether we're summing issues or story points
self.unit_col = dataset.points_col if unit == Unit.points else unit.value
super().__init__(dataset)

def calculate(self) -> pd.DataFrame:
"""
Calculate the sprint burndown.
Notes
-----
Sprint burndown is calculated with the following algorithm:
1. Isolate the records that belong to the given sprint
2. Get the range of dates over which these issues were opened and closed
3. Count the number of issues opened and closed on each day of that range
4. Calculate the delta between opened and closed issues per day
5. Cumulatively sum those deltas to get the running total of open tix
"""
"""Calculate the sprint burnup."""
# make a copy of columns and rows we need to calculate burndown for this sprint
burndown_cols = [self.opened_col, self.closed_col, self.points_col]
df_sprint = self.sprint_data[burndown_cols].copy()
# get the date range over which tix were created and closed
df_tix_range = self._get_tix_date_range(df_sprint)
# get the number of tix opened and closed each day
df_opened = self._get_daily_tix_counts_by_status(df_sprint, "opened")
df_closed = self._get_daily_tix_counts_by_status(df_sprint, "closed")
# combine the daily opened and closed counts to get total open per day
return self._get_cum_sum_of_open_tix(df_tix_range, df_opened, df_closed)
burnup_cols = [
self.dataset.opened_col,
self.dataset.closed_col,
self.dataset.points_col,
]
df_sprint = self.sprint_data[burnup_cols].copy()
# Count the number of tickets opened, closed, and remaining by day
return sum_tix_by_day(
df=df_sprint,
cols=self.columns,
unit=self.unit,
sprint_end=self.dataset.sprint_end(self.sprint),
)

def plot_results(self) -> Figure:
"""Plot the sprint burndown using a plotly line chart."""
@@ -74,7 +72,7 @@ def plot_results(self) -> Figure:
sprint_end = self.dataset.sprint_end(self.sprint)
date_mask = self.results[self.date_col].between(
sprint_start,
min(sprint_end, pd.Timestamp.today(tz="utc")),
min(sprint_end, pd.Timestamp.today()),
)
df = self.results[date_mask]
# create a line chart from the data in self.results
@@ -108,7 +106,7 @@ def get_stats(self) -> dict[str, Statistic]:
total_closed = int(df["closed"].sum())
pct_closed = round(total_closed / total_opened * 100, 2)
# get the percentage of tickets that were pointed
is_pointed = self.sprint_data[Unit.points.value] >= 1
is_pointed = self.sprint_data[self.dataset.points_col] >= 1
issues_pointed = len(self.sprint_data[is_pointed])
issues_total = len(self.sprint_data)
pct_pointed = round(issues_pointed / issues_total * 100, 2)
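
For concreteness, a worked example of the two percentages with made-up sprint numbers:

```python
# Hypothetical sprint: 12 issues opened, 9 closed, 10 of them estimated (>= 1 point)
total_opened, total_closed = 12, 9
issues_total, issues_pointed = 12, 10

pct_closed = round(total_closed / total_opened * 100, 2)     # 75.0
pct_pointed = round(issues_pointed / issues_total * 100, 2)  # 83.33
```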
@@ -153,86 +151,3 @@ def _isolate_data_for_this_sprint(self) -> pd.DataFrame:
"""Filter out issues that are not assigned to the current sprint."""
sprint_filter = self.dataset.df[self.dataset.sprint_col] == self.sprint
return self.dataset.df[sprint_filter]

def _get_daily_tix_counts_by_status(
self,
df: pd.DataFrame,
status: Literal["opened", "closed"],
) -> pd.DataFrame:
"""
Count the number of issues or points opened or closed by date.
Notes
-----
It does this by:
- Grouping on the created_date or opened_date column, depending on status
- Counting the total number of rows per group
"""
# create local copies of the key column names
agg_col = self.opened_col if status == "opened" else self.closed_col
unit_col = self.unit.value
key_cols = [agg_col, unit_col]
# create a dummy column to sum per row if the unit is tasks
if self.unit == Unit.issues:
df[unit_col] = 1
# isolate the key columns, group by open or closed date, then sum the units
df_agg = df[key_cols].groupby(agg_col, as_index=False).agg({unit_col: "sum"})
return df_agg.rename(columns={agg_col: self.date_col, unit_col: status})

def _get_tix_date_range(self, df: pd.DataFrame) -> pd.DataFrame:
"""
Get the date range over which issues were created and closed.
Notes
-----
It does this by:
- Finding the date when the sprint ends
- Finding the earliest date a issue was created
- Finding the latest date a issue was closed
- Creating a row for each day between the earliest date a ticket was opened
and either the sprint end _or_ the latest date an issue was closed,
whichever is the later date.
"""
# get earliest date an issue was opened and latest date one was closed
sprint_end = self.dataset.sprint_end(self.sprint)
opened_min = df[self.opened_col].min()
closed_max = df[self.closed_col].max()
closed_max = sprint_end if closed_max is nan else max(sprint_end, closed_max)
# creates a dataframe with one row for each day between min and max date
return pd.DataFrame(
pd.date_range(opened_min, closed_max),
columns=[self.date_col],
)

def _get_cum_sum_of_open_tix(
self,
dates: pd.DataFrame,
opened: pd.DataFrame,
closed: pd.DataFrame,
) -> pd.DataFrame:
"""
Get the cumulative sum of open issues per day.
Notes
-----
It does this by:
- Left joining the full date range to the daily open and closed counts
so that we have a row for each day of the range, with a column for tix
opened and a column for tix closed on that day
- Subtracting closed from opened to get the "delta" on each day in the range
- Cumulatively summing the deltas to get the running total of open tix
"""
# left join the full date range to open and closed counts
df = (
dates.merge(opened, on=self.date_col, how="left")
.merge(closed, on=self.date_col, how="left")
.fillna(0)
)
# calculate the difference between opened and closed each day
df["delta"] = df["opened"] - df["closed"]
# cumulatively sum the deltas to get the running total
df["total_open"] = df["delta"].cumsum()
return df
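
The removed helpers above spell out the running-total algorithm that, per the commit description, both metrics now share via the `sum_tix_by_day` util (its implementation is not shown in this diff). A compact, self-contained sketch of the delta/cumulative-sum step with made-up daily counts:

```python
import pandas as pd

# Made-up daily counts for a five-day window; the real pipeline derives these
# by grouping issues on the dates they were opened and closed.
df = pd.DataFrame(
    {
        "date": pd.date_range("2024-10-07", periods=5),
        "opened": [4, 2, 0, 1, 0],
        "closed": [0, 1, 2, 3, 1],
    }
)

# Net change per day, then a running total of tickets still open
df["delta"] = df["opened"] - df["closed"]
df["total_open"] = df["delta"].cumsum()
print(df)  # total_open column: 4, 5, 3, 1, 0
```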