Add project planning voting boxplot generation script (#3364)

* Add initial file & requirements * Close the graph to prevent overlap * Add README, improve documentation * Ignore jupyter notebooks * Fix images
WordPress · Nov 21, 2023 · 2d2e50e · 2d2e50e
1 parent 368ec8f
commit 2d2e50e
Show file tree

Hide file tree

Showing 7 changed files with 1,880 additions and 0 deletions.
diff --git a/utilities/project_planning/.gitignore b/utilities/project_planning/.gitignore
@@ -0,0 +1,3 @@
+data/
+output/
+*.ipynb
diff --git a/utilities/project_planning/Pipfile b/utilities/project_planning/Pipfile
@@ -0,0 +1,16 @@
+[[source]]
+url = "https://pypi.org/simple"
+verify_ssl = true
+name = "pypi"
+
+[packages]
+pandas = "*"
+click = "*"
+matplotlib = "*"
+jupyter = "*"
+openpyxl = "*"
+
+[dev-packages]
+
+[requires]
+python_version = "3.11"
diff --git a/utilities/project_planning/Pipfile.lock b/utilities/project_planning/Pipfile.lock
diff --git a/utilities/project_planning/README.md b/utilities/project_planning/README.md
@@ -0,0 +1,53 @@
+# Project Planning Utilities
+
+This directory contains utilities for project planning. See the descriptions
+below for each script.
+
+## Graph Project Voting
+
+The Openverse maintainers have historically prioritized projects for the next
+year by creating a list of projects and then voting on their effort and impact.
+Instructions provided to maintainers are as follows:
+
+> **Instructions**:
+>
+> Provide each project in the sheet a value for each category. The scales aren't
+> a perfect, measurable thing, so use your best judgement and instinct. A notes
+> field is also provided, please use this for notes to yourself after all the
+> values are combined when discussion occurs. All of the projects also link back
+> to the description provided for them by the project author. Consider projects
+> relatively to each other. For "Effort", consider the amount of work it would
+> take for external contributor(s) to complete the work if the work is
+> well-documented and outlined. This includes all aspects of effort: planning,
+> design, implementation. For columns denoted with "(fib)" use the Fibonacci
+> sequence 2, 3, 5, 8, 13 (where 2 is the smallest and 13 is the largest); for
+> columns denoted with a number range, use that range instead.
+>
+> The three voting categories are:
+>
+> 1. **Effort**: The amount of work the project will take to complete. 2
+>    requiring the least effort, 13 requiring the most.
+> 2. **Impact**: How impactful the project will be to the success of the project
+>    and our goals for the year. 2 being the least impactful, 13 being the most
+>    impactful.
+> 3. **Confidence**: How confident you are in the values you've provided. 1 is
+>    essentially no confidence, 2 is average confidence, and 3 is high
+>    confidence.
+
+This script is used to ingest the output of the voting and produce box plots for
+effort and impact respectively, with each box colored by the average confidence
+for that project.
+
+The input file is an Excel spreadsheet which looks like the following:
+
+![Excel spreadsheet](./_docs/example_spreadsheet_screenshot.png)
+
+The input file should have one "sheet" per voter, with each sheet's title being
+the member's name. The first sheet must be named "Template"; each voter's sheet
+is a copy of it, with all the same columns/information but with that voter's
+votes filled in.
+
+The output is two box plots, one for effort and one for impact, which look like
+the following:
+
+![Box plot for effort](./_docs/example_effort.png)
diff --git a/utilities/project_planning/_docs/example_effort.png b/utilities/project_planning/_docs/example_effort.png
diff --git a/utilities/project_planning/_docs/example_spreadsheet_screenshot.png b/utilities/project_planning/_docs/example_spreadsheet_screenshot.png
diff --git a/utilities/project_planning/graph_project_voting.py b/utilities/project_planning/graph_project_voting.py
@@ -0,0 +1,137 @@
+"""
+Script for generating graphs of project voting results.
+
+See the README for more information.
+"""
+from datetime import datetime
+from pathlib import Path
+
+import click
+import matplotlib.colors as mcolors
+import matplotlib.pyplot as plt
+import pandas as pd
+
+
+# Default spreadsheet to read, located next to this script under data/
+INPUT_FILE = Path(__file__).parent / "data" / "votes.xlsx"
+# Default directory the generated plots are written to, next to this script
+OUTPUT_PATH = Path(__file__).parent / "output"
+
+# Column headers looked up in each voter's sheet for the three voting categories
+COLUMN_EFFORT = "Effort (fib)"
+COLUMN_IMPACT = "Impact (fib)"
+COLUMN_CONFIDENCE = "Confidence (1-3)"
+
+
+def get_columns_by_members(
+    frames: dict[str, pd.DataFrame],
+    members: list[str],
+    projects: pd.Series,
+    column: str,
+):
+    """
+    Create a new DataFrame which pulls out the provided column from each of the member
+    sheets, and sets the index to the project names.
+    """
+    data = pd.DataFrame([frames[name][column] for name in members], index=members)
+    # The data is transposed here because the DataFrame constructor creates a DataFrame
+    # with the projects as the columns, and the members as the index, whereas we want
+    # the projects as the index.
+    return data.T.set_index(projects)
+
+
+def plot_votes(
+    data: pd.DataFrame, color_by: pd.Series, column: str, year: int, output_path: Path
+):
+    """
+    Create and save a box plot of the provided data, with the boxes colored by the
+    provided color_by data.
+    """
+    # Create the box plot
+    ax, bp = data.T.boxplot(
+        # Specify a large figure size to both increase the resolution of the image, and
+        # provide enough space for the x-axis labels.
+        figsize=(10, 10),
+        # Specific parameter needed in order to color the boxes
+        patch_artist=True,
+        # Return both the axes and boxplot objects, rather than just the axes
+        return_type="both",
+    )
+    # Set the x-axis labels (project names) vertically to prevent collision
+    ax.set_xticklabels(ax.get_xmajorticklabels(), rotation=90)
+    # Only show the Fibonacci values labeled on the y-axis
+    plt.yticks([2, 3, 5, 8, 13])
+    # Set the title of the graph
+    ax.set_title(f"Vote Distribution: {column} - {year}")
+    # Create a colormap that transitions from red to green (lime is used specifically
+    # because it creates a more vibrant green than green does).
+    cmap = mcolors.LinearSegmentedColormap.from_list("", ["red", "lime"])
+    # Create a color normalizer (confidence values are between 1 and 3)
+    norm = mcolors.Normalize(vmin=1, vmax=3)
+    # Apply colors to each box plot.
+    for i, color_value in enumerate(color_by.values):
+        # Compute the color using the colormap and normalizer
+        color = cmap(norm(color_value))
+        # Get current box
+        box = bp["boxes"][i]
+        # Set box face color
+        box.set_facecolor(color)
+        # Change color of the whiskers & caps (the actual list of each is twice as long
+        # as # the number of boxes, because there are two each per box)
+        for aspect_name in ["whiskers", "caps"]:
+            for aspect in bp[aspect_name][i * 2 : (i + 1) * 2]:
+                aspect.set_color(color)
+                aspect.set_linewidth(2)
+
+    # This is required in order to ensure nothing is cut off
+    plt.tight_layout()
+    output_file = output_path / f"{column.split()[0]}_{year}.png"
+    print(f"Saving file {output_file}")
+    plt.savefig(output_file)
+    # Clear the figure so the next one starts fresh
+    plt.close()
+
+
+@click.command()
+@click.option(
+    "--output",
+    help="Output directory",
+    type=click.Path(path_type=Path),
+    default=OUTPUT_PATH,
+)
+@click.option(
+    "--input-file",
+    help="Input Excel document to use",
+    type=click.Path(path_type=Path),
+    default=INPUT_FILE,
+)
+def main(output: Path, input_file: Path):
+    # Ensure the output folder exists
+    output.mkdir(parents=True, exist_ok=True)
+
+    print(f"Reading input file: {input_file}")
+    # Read the input file
+    frames = pd.read_excel(
+        input_file,
+        # Include all sheets
+        sheet_name=None,
+        # Skip the first 5 rows, which are the instructional text
+        skiprows=5,
+        # Use the first row as the header
+        header=0,
+    )
+    # Pull the project names out of the template sheet
+    projects = frames["Template"]["Name"]
+    # Use the name of the frames as the list of voting members
+    members = list(frames.keys())[1:]
+    # This is planning for the *next* year, e.g. one beyond the current one
+    planning_year = datetime.now().year + 1
+
+    effort = get_columns_by_members(frames, members, projects, COLUMN_EFFORT)
+    impact = get_columns_by_members(frames, members, projects, COLUMN_IMPACT)
+    confidence = get_columns_by_members(frames, members, projects, COLUMN_CONFIDENCE)
+    average_confidence = confidence.mean(axis=1)
+
+    plot_votes(effort, average_confidence, COLUMN_EFFORT, planning_year, output)
+    plot_votes(impact, average_confidence, COLUMN_IMPACT, planning_year, output)
+
+
+# Run the CLI only when this file is executed as a script, not when it is imported
+if __name__ == "__main__":
+    main()
-Original file line number
+Diff line change
@@ -0,0 +1,3 @@
+    data/
+    output/
+    *.ipynb