From 2ad0f40e66ada4f145236456ca98ee1394c79445 Mon Sep 17 00:00:00 2001
From: John Baldwin
Date: Tue, 12 May 2020 20:13:29 -0400
Subject: [PATCH 1/4] New code: figures.pipeline.enrollment_metrics

This module processes data to provide course enrollment metrics. A course
enrollment is a unique learner/course pair. See the source code in this
commit for documentation.
---
 figures/pipeline/enrollment_metrics.py    | 272 +++++++++++++++++++
 tests/pipeline/test_enrollment_metrics.py | 312 ++++++++++++++++++++++
 2 files changed, 584 insertions(+)
 create mode 100644 figures/pipeline/enrollment_metrics.py
 create mode 100644 tests/pipeline/test_enrollment_metrics.py

diff --git a/figures/pipeline/enrollment_metrics.py b/figures/pipeline/enrollment_metrics.py
new file mode 100644
index 00000000..1e2479cf
--- /dev/null
+++ b/figures/pipeline/enrollment_metrics.py
@@ -0,0 +1,272 @@
+"""This module collects metrics for a course enrollment (learner/course pair)
+
+# Overview
+
+This is new code for Figures 0.3.10. Its purpose is to address pipeline
+performance by reducing calls to `CourseGradeFactory().read`.
+
+Calls are reduced by checking if the learner has a `StudentModule` record
+modified after the most recent `LearnerCourseGradeMetrics` record `date_for`
+field.
+
+# Logic
+
+```
+for each learner+course
+    get newest lcgm record
+    get newest sm record
+
+    if not lcgm and not sm
+        # Learner has not started course
+        skip collection for this learner+course
+    else if lcgm and sm
+        # we have saved history so we can check to update
+        if lcgm.modified is older than sm.modified:
+            collect new course data
+            return new metrics object created
+    else if not lcgm and sm
+        # Learner started course after last collection action
+        collect new course data
+        return new metrics object created
+    else  # lcgm and not sm
+        # We had collected data previously, but the upstream went away
+        # This could happen with data cleansing, like GDPR requests
+        # Likely other causes, too
+        log condition to figures PipelineError log
+        return None
+```
+
+# NOTE: Change 'lcgm' in this file to either 'enrollment' or 'lp' when an
+  abbreviation is acceptable. This is to get this module forward ready for
+  reworking `LearnerCourseGradeMetrics`
+
+"""
+
+from datetime import datetime
+from decimal import Decimal
+
+from django.utils.timezone import utc
+
+from figures.metrics import LearnerCourseGrades
+from figures.models import LearnerCourseGradeMetrics, PipelineError
+from figures.pipeline.logger import log_error
+from figures.sites import (
+    get_site_for_course,
+    course_enrollments_for_course,
+    student_modules_for_course_enrollment,
+    UnlinkedCourseError,
+    )
+
+
+def bulk_calculate_course_progress_data(course_id, date_for=None):
+    """Calculates the average progress for a set of course enrollments
+
+    How it works
+    1. collects progress percent for each course enrollment
+       1.1. If an up-to-date enrollment metrics record already exists, use that
+    2. calculates and returns the average of these enrollments
+
+    TODO: Update to filter on active users
+
+    Questions:
+    - What makes a learner an active learner?
+    """
+    progress_percentages = []
+
+    if not date_for:
+        date_for = datetime.utcnow().replace(tzinfo=utc).date()
+
+    for ce in course_enrollments_for_course(course_id):
+        metrics = collect_metrics_for_enrollment(course_enrollment=ce,
+                                                 date_for=date_for)
+        if metrics:
+            progress_percentages.append(metrics.progress_percent)
+        else:
+            # Log this for troubleshooting
+            error_data = dict(
+                msg=('Unable to create or retrieve enrollment metrics ' +
+                     'for user {} and course {}'.format(ce.user.username,
+                                                        str(ce.course_id)))
+            )
+            log_error(
+                error_data=error_data,
+                error_type=PipelineError.COURSE_DATA,
+                user=ce.user,
+                course_id=str(course_id)
+            )

+    return dict(
+        average_progress=calculate_average_progress(progress_percentages),
+    )
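+
+
+# Illustrative use of the returned dict (the course id value below is
+# hypothetical):
+#
+#   data = bulk_calculate_course_progress_data('course-v1:Org+Course+Run')
+#   data['average_progress']  # a float from 0.0 to 1.0, e.g. 0.5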
+
+
+def calculate_average_progress(progress_percentages):
+    """Calculates average progress from a list of values
+
+    TODO: How do we want to handle malformed data?
+    """
+    if progress_percentages:
+        average_progress = float(sum(progress_percentages)) / float(len(progress_percentages))
+        average_progress = float(Decimal(average_progress).quantize(Decimal('.00')))
+    else:
+        average_progress = 0.0
+    return average_progress
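+
+# A sketch of the rounding behavior (the quantize call uses the default
+# Decimal rounding, ROUND_HALF_EVEN):
+#
+#   calculate_average_progress([0.25, 0.5, 0.5])  # -> 0.42
+#   calculate_average_progress([])                # -> 0.0
+#   calculate_average_progress(None)              # -> 0.0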
+
+
+def collect_metrics_for_enrollment(course_enrollment, date_for=None, **_kwargs):
+    """Collect metrics for an enrollment (learner+course)
+
+    This function performs course enrollment metrics collection for the given
+    unique enrollment identified by the learner and course. It is initial code
+    in refactoring the pipeline for learner progress.
+
+    Important models here are:
+    * StudentModule (SM)
+    * LearnerCourseGradeMetrics (LCGM)
+    * CourseEnrollment (CE)
+
+    # Workflow
+
+    * Get date of most recent StudentModule record, SM, for the enrollment
+    * Get date of most recent LearnerCourseGradeMetrics record, LCGM, for the
+      enrollment
+    * if the LCGM does not exist or SM.modified is more recent than
+      LCGM.modified then
+        * collect course_data via `CourseGradeFactory().read(...)`
+        * create a new LearnerCourseGradeMetrics record with the updated data
+        * retain a reference to the new record
+    * else return the existing LearnerCourseGradeMetrics record if it exists
+    * else return None if there is no existing LearnerCourseGradeMetrics record
+    """
+    if not date_for:
+        date_for = datetime.utcnow().replace(tzinfo=utc).date()
+    site = get_site_for_course(course_enrollment.course_id)
+    if not site:
+        raise UnlinkedCourseError('No site found for course "{}"'.format(
+            course_enrollment.course_id))
+
+    most_recent_sm = student_modules_for_course_enrollment(
+        course_enrollment).order_by('modified').last()
+
+    most_recent_lcgm = LearnerCourseGradeMetrics.objects.filter(
+        user=course_enrollment.user,
+        course_id=str(course_enrollment.course_id)).order_by('date_for').last()
+
+    if _enrollment_metrics_needs_update(most_recent_lcgm, most_recent_sm):
+        progress_data = _collect_progress_data(most_recent_sm)
+        metrics = _add_enrollment_metrics_record(site=site,
+                                                 course_enrollment=course_enrollment,
+                                                 progress_data=progress_data,
+                                                 date_for=date_for)
+    else:
+        metrics = most_recent_lcgm
+    return metrics
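+
+# Illustrative caller-side handling (a sketch, not prescribed usage): in
+# multisite mode a course without a site mapping raises UnlinkedCourseError,
+# so a caller iterating many courses may want to catch it rather than fail
+# the whole run:
+#
+#   try:
+#       metrics = collect_metrics_for_enrollment(course_enrollment=ce)
+#   except UnlinkedCourseError:
+#       pass  # skip or log the unlinked course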
+
+
+def _enrollment_metrics_needs_update(most_recent_lcgm, most_recent_sm):
+    """Returns True if we need to update our learner progress, False otherwise
+
+    See the # Logic section in this module's docstring
+
+    If we need to check that the records match the same user and course, we
+    can do something like:
+
+    ```
+    class RecordMismatchError(Exception):
+        pass
+
+
+    def rec_matches_user_and_course(lcgm, sm):
+        return lcgm.user == sm.student and lcgm.course_id == sm.course_id
+    ```
+
+    And in this function add the check when we have both records:
+
+    ```
+    if not rec_matches_user_and_course(most_recent_lcgm, most_recent_sm):
+        rec_msg = '{}(user_id={}, course_id="{}"'
+        msg1 = rec_msg.format('lcgm',
+                              most_recent_lcgm.user.id,
+                              most_recent_lcgm.course_id)
+        msg2 = rec_msg.format('sm',
+                              most_recent_sm.student.id,
+                              most_recent_sm.course_id)
+        raise RecordMismatchError(msg1 + ':' + msg2)
+    ```
+    """
+    # First assume we need to update the enrollment metrics record
+    needs_update = True
+    if not most_recent_lcgm and not most_recent_sm:
+        # Learner has not started coursework
+        needs_update = False
+    elif most_recent_lcgm and most_recent_sm:
+        # Learner has past course activity
+        needs_update = most_recent_lcgm.date_for < most_recent_sm.modified.date()
+    elif not most_recent_lcgm and most_recent_sm:
+        # No LCGM recs, so the learner started the course after the last
+        # collection. This could also happen:
+        # * if this is the first time collection is run for the learner+course
+        # * if an unhandled error prevented the LCGM from saving
+        # * if the learner's LCGM recs were deleted
+        needs_update = True
+    elif most_recent_lcgm and not most_recent_sm:
+        # This shouldn't happen. We log this state as an error,
+
+        # using 'COURSE_DATA' for the pipeline error type. Although we should
+        # revisit logging and error tracking in Figures to define a clear
+        # approach that has clear and intuitive contexts for the logging event
+        #
+        # We need to decide:
+        #
+        # 1. Is this always an error state? Could this condition happen
+        #    naturally?
+        # 2. Which error type should be created and what is the most
+        #    applicable context?
+        #    Candidates:
+        #    - Enrollment (learner+course) context
+        #    - Data state context - Why would we have an LCGM without an SM?
+        #
+        # So we hold off updating PipelineError error choices initially until
+        # we can think carefully about how we tag pipeline errors
+        #
+        error_data = dict(
+            msg='LearnerCourseGradeMetrics record exists without StudentModule',
+        )
+        log_error(
+            error_data=error_data,
+            error_type=PipelineError.COURSE_DATA,
+            user=most_recent_lcgm.user,
+            course_id=str(most_recent_lcgm.course_id)
+        )
+        needs_update = False
+    return needs_update
+
+
+def _add_enrollment_metrics_record(site, course_enrollment, progress_data, date_for):
+    """Convenience function to save progress metrics to Figures
+    """
+    return LearnerCourseGradeMetrics.objects.create(
+        site=site,
+        user=course_enrollment.user,
+        course_id=str(course_enrollment.course_id),
+        date_for=date_for,
+        points_possible=progress_data['points_possible'],
+        points_earned=progress_data['points_earned'],
+        sections_worked=progress_data['sections_worked'],
+        sections_possible=progress_data['count']
+    )
+
+
+def _collect_progress_data(student_module):
+    """Get new progress data for the learner/course
+
+    Uses `figures.metrics.LearnerCourseGrades` to retrieve progress data via
+    `CourseGradeFactory().read(...)` and calculate the progress percentage
+    """
+    lcg = LearnerCourseGrades(user_id=student_module.student_id,
+                              course_id=student_module.course_id)
+    course_progress_details = lcg.progress()
+    return course_progress_details
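+
+
+# For reference, a sketch of the dict shape `_collect_progress_data` returns
+# and `_add_enrollment_metrics_record` consumes (values here are made up):
+#
+#   {'points_possible': 100, 'points_earned': 25,
+#    'sections_worked': 4, 'count': 5}  # 'count' is sections possible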
diff --git a/tests/pipeline/test_enrollment_metrics.py b/tests/pipeline/test_enrollment_metrics.py
new file mode 100644
index 00000000..199ab11c
--- /dev/null
+++ b/tests/pipeline/test_enrollment_metrics.py
@@ -0,0 +1,312 @@
+
+from datetime import datetime
+from dateutil.relativedelta import relativedelta
+import mock
+import pytest
+
+from django.utils.timezone import utc
+
+from courseware.models import StudentModule
+
+from figures.models import LearnerCourseGradeMetrics
+
+from figures.pipeline.enrollment_metrics import (
+    calculate_average_progress,
+    bulk_calculate_course_progress_data,
+    collect_metrics_for_enrollment,
+    _enrollment_metrics_needs_update,
+    _add_enrollment_metrics_record,
+    _collect_progress_data,
+)
+
+from tests.factories import (
+    CourseEnrollmentFactory,
+    CourseOverviewFactory,
+    LearnerCourseGradeMetricsFactory,
+    SiteFactory,
+    StudentModuleFactory,
+)
+
+
+@pytest.mark.django_db
+def test_bulk_calculate_course_progress_data_happy_path(db, monkeypatch):
+    """Tests the 'bulk_calculate_course_progress_data' function
+
+    The function under test iterates over a set of course enrollment records,
+    so we create a couple of records to iterate over and mock the collect
+    function
+    """
+    course_overview = CourseOverviewFactory()
+    course_enrollments = [CourseEnrollmentFactory(
+        course_id=course_overview.id) for i in range(2)]
+    mapping = {ce.course_id: LearnerCourseGradeMetricsFactory(
+        course_id=str(ce.course_id),
+        user=ce.user,
+        sections_worked=1,
+        sections_possible=2) for ce in course_enrollments
+    }
+
+    def mock_metrics(course_enrollment, **_kwargs):
+        return mapping[course_enrollment.course_id]
+
+    monkeypatch.setattr('figures.pipeline.enrollment_metrics.collect_metrics_for_enrollment',
+                        mock_metrics)
+    data = bulk_calculate_course_progress_data(course_overview.id)
+    assert data['average_progress'] == 0.5
+
+
+@pytest.mark.parametrize('progress_percentages, expected_result', [
+    (None, 0.0),
+    ([], 0.0),
+    ([0, 0.25, 0.5, 0.75, 1.0], 0.5),
+])
+def test_calculate_average_progress_happy_path(progress_percentages, expected_result):
+    """Tests the 'calculate_average_progress' function
+
+    The function under test is a simple average calculation with formatting
+    """
+    average_progress = calculate_average_progress(progress_percentages)
+    assert average_progress == expected_result
+
+
+@pytest.mark.django_db
+class TestCollectMetricsForEnrollment(object):
+    """Tests the 'collect_metrics_for_enrollment' function
+
+    The function under test does the following:
+
+    Given a course enrollment,
+    1. Get the most recent StudentModule (SM) record, if it exists
+    2. Get the most recent LearnerCourseGradeMetrics (LCGM) record, if it
+       exists
+    3. Decide if we need to add a new LCGM record for the enrollment or use
+       the most recent one, if it exists
+    4. Collect new data from the platform if we need to update the
+       enrollment's metrics
+    5. Add a new LCGM record if we are collecting new metrics data
+    6. Return the new LCGM record if created, else return the existing record
+       found
+    """
+    @pytest.fixture(autouse=True)
+    def setup(self, db):
+        self.date_1 = datetime(2020, 2, 2, tzinfo=utc)
+        self.date_2 = self.date_1 + relativedelta(months=1)  # future of date_1
+        self.course_enrollment = CourseEnrollmentFactory()
+        self.student_modules = [
+            StudentModuleFactory(student=self.course_enrollment.user,
+                                 course_id=self.course_enrollment.course_id,
+                                 modified=self.date_1),
+            StudentModuleFactory(student=self.course_enrollment.user,
+                                 course_id=self.course_enrollment.course_id,
+                                 modified=self.date_2)]
+        self.progress_data = dict(points_possible=100,
+                                  points_earned=25,
+                                  sections_worked=4,
+                                  count=5)
+
+    def check_response_metrics(self, obj, data):
+        """Helper method checks the LCGM object's data
+        """
+        assert obj.points_possible == data['points_possible']
+        assert obj.points_earned == data['points_earned']
+        assert obj.sections_worked == data['sections_worked']
+        assert obj.sections_possible == data['count']
+
+    def create_lcgm(self, date_for):
+        """Helper to create an LCGM record with the given `date_for`
+        """
+        return LearnerCourseGradeMetricsFactory(
+            course_id=str(self.course_enrollment.course_id),
+            user=self.course_enrollment.user,
+            date_for=date_for,
+            points_possible=self.progress_data['points_possible'],
+            points_earned=self.progress_data['points_earned'],
+            sections_worked=self.progress_data['sections_worked'],
+            sections_possible=self.progress_data['count']
+        )
+
+    def test_needs_update_no_lcgm(self, monkeypatch):
+        """We have an SM record, but no LCGM record
+
+        The function under test should return a new LCGM record
+        """
+        assert not LearnerCourseGradeMetrics.objects.count()
+        monkeypatch.setattr('figures.pipeline.enrollment_metrics.get_site_for_course',
+                            lambda val: SiteFactory())
+        monkeypatch.setattr('figures.pipeline.enrollment_metrics._collect_progress_data',
+                            lambda val: self.progress_data)
+        metrics = collect_metrics_for_enrollment(
+            course_enrollment=self.course_enrollment,
+        )
+
+        self.check_response_metrics(metrics, self.progress_data)
+        assert LearnerCourseGradeMetrics.objects.count() == 1
+
+    def test_needs_update_has_lcgm(self, monkeypatch):
+        """We have an LCGM record, but it is not up to date with the SM
+
+        The function under test should return a new LCGM
+
+        TODO: Add a test where we pass `date_for` to the function under test
+        """
+        lcgm = self.create_lcgm(date_for=self.date_1)
+        monkeypatch.setattr('figures.pipeline.enrollment_metrics.get_site_for_course',
+                            lambda val: SiteFactory())
+        monkeypatch.setattr('figures.pipeline.enrollment_metrics._collect_progress_data',
+                            lambda val: self.progress_data)
+        metrics = collect_metrics_for_enrollment(
+            course_enrollment=self.course_enrollment,
+        )
+
+        self.check_response_metrics(metrics, self.progress_data)
+        assert metrics != lcgm
+
+        # This works because we pick dates in the past. Better is to fix this
+        # by either using freezegun or monkeypatching
+        # `_add_enrollment_metrics_record`
+        assert metrics.date_for >= self.date_2.date()
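+
+    # A sketch of the freezegun approach mentioned above (assumes the
+    # `freezegun` package is available to the tests):
+    #
+    #   from freezegun import freeze_time
+    #
+    #   with freeze_time(self.date_2 + relativedelta(days=1)):
+    #       metrics = collect_metrics_for_enrollment(
+    #           course_enrollment=self.course_enrollment)
+    #   assert metrics.date_for == (self.date_2 + relativedelta(days=1)).date()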
+
+    def test_does_not_need_update(self, monkeypatch):
+        """We have an LCGM record that is up to date with the SM
+
+        The function under test should return the LCGM
+
+        TODO: Add a test where we pass `date_for` to the function under test
+        """
+        lcgm = self.create_lcgm(date_for=self.date_2.date())
+        monkeypatch.setattr('figures.pipeline.enrollment_metrics.get_site_for_course',
+                            lambda val: SiteFactory())
+        monkeypatch.setattr('figures.pipeline.enrollment_metrics._collect_progress_data',
+                            lambda val: self.progress_data)
+        metrics = collect_metrics_for_enrollment(
+            course_enrollment=self.course_enrollment,
+        )
+
+        self.check_response_metrics(metrics, self.progress_data)
+        assert metrics == lcgm
+        assert metrics.date_for == self.date_2.date()
+
+    def test_no_update_no_lcgm_no_sm(self, monkeypatch):
+        """We have neither an LCGM nor an SM record
+
+        The function under test should return None
+        """
+        monkeypatch.setattr('figures.pipeline.enrollment_metrics.get_site_for_course',
+                            lambda val: SiteFactory())
+        monkeypatch.setattr('figures.pipeline.enrollment_metrics._collect_progress_data',
+                            lambda val: self.progress_data)
+        # Create a course enrollment for which we won't have student module records
+        metrics = collect_metrics_for_enrollment(
+            course_enrollment=CourseEnrollmentFactory(),
+        )
+        assert not metrics
+
+    def test_no_update_has_lcgm_no_sm(self, monkeypatch):
+        """We have an LCGM but not an SM record
+
+        The function under test should return the existing LCGM
+        """
+        monkeypatch.setattr('figures.pipeline.enrollment_metrics.get_site_for_course',
+                            lambda val: SiteFactory())
+        monkeypatch.setattr('figures.pipeline.enrollment_metrics._collect_progress_data',
+                            lambda val: self.progress_data)
+        # Create a course enrollment for which we won't have student module records
+        ce = CourseEnrollmentFactory()
+        lcgm = LearnerCourseGradeMetricsFactory(course_id=ce.course_id, user=ce.user)
+        metrics = collect_metrics_for_enrollment(course_enrollment=ce)
+        assert metrics == lcgm
+        assert not StudentModule.objects.filter(course_id=ce.course_id)
+
+
+@pytest.mark.django_db
+class TestEnrollmentMetricsUpdateCheck(object):
+    """Tests that the `_enrollment_metrics_needs_update` function performs as
+    expected
+
+    Quick test of the different states and expected results. Should be
+    reworked
+
+    We should be able to collapse these tests into one using parametrize.
+    However, we don't want conditionals, and some tests call for testing dates
+    while others test existence. So for now, a little less DRY with individual
+    tests for each conditional in the function
+    """
+    @pytest.fixture(autouse=True)
+    def setup(self, db):
+        self.student_module = StudentModuleFactory()
+
+    def test_existence_no_lcgm_no_sm_is_false(self):
+        """Test that if there is neither a LearnerCourseGradeMetrics record
+        nor a StudentModule record, the function returns False
+        """
+        assert not _enrollment_metrics_needs_update(None, None)
+
+    def test_existence_no_lcgm_yes_sm_is_true(self):
+        assert _enrollment_metrics_needs_update(None, self.student_module)
+
+    def test_existence_yes_lcgm_no_sm_is_false(self):
+        path = 'figures.pipeline.enrollment_metrics.log_error'
+        with mock.patch(path) as mock_log_error:
+            assert not _enrollment_metrics_needs_update(LearnerCourseGradeMetricsFactory(), None)
+            mock_log_error.assert_called()
+
+    def test_dates_lcgm_is_current_is_false(self):
+        lcgm = LearnerCourseGradeMetricsFactory(
+            date_for=self.student_module.modified.date())
+        assert not _enrollment_metrics_needs_update(lcgm, self.student_module)
+
+    def test_dates_lcgm_is_future_is_false(self):
+        """
+        Note: This should probably be an error state
+        """
+        lcgm = LearnerCourseGradeMetricsFactory(
+            date_for=self.student_module.modified.date() + relativedelta(days=1))
+        assert not _enrollment_metrics_needs_update(lcgm, self.student_module)
+
+    def test_dates_lcgm_is_past_is_true(self):
+        lcgm = LearnerCourseGradeMetricsFactory(
+            date_for=self.student_module.modified.date() - relativedelta(days=1))
+        assert _enrollment_metrics_needs_update(lcgm, self.student_module)
+
+
+@pytest.mark.django_db
+class TestAddEnrollmentMetricsRecord(object):
+    """Tests the `_add_enrollment_metrics_record` function
+    """
+    @pytest.fixture(autouse=True)
+    def setup(self, db):
+        self.site = SiteFactory()
+        self.course_enrollment = CourseEnrollmentFactory()
+        self.date_for = datetime.utcnow().replace(tzinfo=utc).date()
+
+    def test_happy_path(self):
+        """Basic creation check
+        """
+        progress_data = dict(
+            points_possible=10,
+            points_earned=5,
+            count=22,  # sections possible
+            sections_worked=11
+        )
+        obj = _add_enrollment_metrics_record(site=self.site,
+                                             course_enrollment=self.course_enrollment,
+                                             progress_data=progress_data,
+                                             date_for=self.date_for)
+        assert obj
+
+
+@pytest.mark.django_db
+def test_collect_progress_data(db, monkeypatch):
+    """Tests the `_collect_progress_data` function
+
+    The function under test instantiates a LearnerCourseGrades object and
+    calls its `progress` method
+
+    We can use the default values created from the mock `LearnerCourseGrades`
+    class
+    """
+    student_module = StudentModuleFactory()
+    progress_data = _collect_progress_data(student_module)
+
+    # Simply checking the keys
+    assert set(progress_data.keys()) == set(['count',
+                                             'sections_worked',
+                                             'points_possible',
+                                             'points_earned'])

From 57f93c5db2f0df0ce8628524f0b2f9f88579c4c8 Mon Sep 17 00:00:00 2001
From: John Baldwin
Date: Tue, 12 May 2020 20:46:36 -0400
Subject: [PATCH 2/4] Added new functions to `figures.sites` to support query
 abstractions

- Added `course_enrollments_for_course`
- Added `student_modules_for_course_enrollment`

This supports the code restructuring for the course progress pipeline
performance improvement.

The purpose of adding these functions is to move forward with using
`figures.sites` as the source of platform data. To note, there are a number
of places in the code that do not follow this yet. As we rework Figures, we
will refactor code to use `figures.sites` instead of directly querying the
platform models.
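For example (an illustrative sketch, not code in this commit), pipeline code
can now fetch the enrollments for a course and the activity records for each
enrollment without querying the platform models directly:

```
from figures.sites import (
    course_enrollments_for_course,
    student_modules_for_course_enrollment,
)

for ce in course_enrollments_for_course(course_id):
    student_modules = student_modules_for_course_enrollment(ce=ce)
```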
Note the function name change from the existing functions. We are moving away
from using the `get_` prefix for the site functions.
---
 figures/sites.py | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)

diff --git a/figures/sites.py b/figures/sites.py
index 75ff78a1..dc88a473 100644
--- a/figures/sites.py
+++ b/figures/sites.py
@@ -39,6 +39,22 @@ class CourseNotInSiteError(CrossSiteResourceError):
     pass
 
 
+class UnlinkedCourseError(Exception):
+    """Raise when we need to fail if we can't get the site for a course
+
+    This will happen in multisite mode if the course is not linked to the site.
+    In Tahoe Hawthorn, we use the Appsembler fork of `edx-organizations` to map
+    courses to sites. For community and standalone deployments, we don't expect
+    courses to map to sites, so we just return the app instance's default site.
+
+    * This is new to support the enrollment metrics rework (May 2020)
+    * We need to evaluate if we want to make `sites.get_site_for_course` raise
+      this automatically. But first we need to make sure we have test coverage
+      so that we don't introduce new bugs
+    """
+    pass
+
+
 def site_to_id(site):
     """
     Helper to cast site or site id to id
@@ -181,3 +197,19 @@ def get_student_modules_for_course_in_site(site, course_id):
 def get_student_modules_for_site(site):
     course_ids = get_course_keys_for_site(site)
     return StudentModule.objects.filter(course_id__in=course_ids)
+
+
+def course_enrollments_for_course(course_id):
+    """Return a queryset of all `CourseEnrollment` records for a course
+
+    Relies on the fact that course_ids are globally unique
+    """
+    return CourseEnrollment.objects.filter(course_id=as_course_key(course_id))
+
+
+def student_modules_for_course_enrollment(ce):
+    """Return a queryset of all `StudentModule` records for a `CourseEnrollment`
+
+    Relies on the fact that course_ids are globally unique
+    """
+    return StudentModule.objects.filter(student=ce.user, course_id=ce.course_id)

From 3d8218482e6df712817d390f1bfd09aaf903b09a Mon Sep 17 00:00:00 2001
From: John Baldwin
Date: Wed, 13 May 2020 00:36:54 -0400
Subject: [PATCH 3/4] Course daily metrics pipeline update to support
 enrollment metrics rework

- Updated the figures.pipeline.course_daily_metrics loader to use the new
  enrollment pipeline code
- Renamed `get_average_progress` to `get_average_progress_deprecated` and
  retained it in case it is needed to help debug the new enrollment metrics
- Updated tests to support these changes
---
 figures/pipeline/course_daily_metrics.py    | 17 ++++++++++----
 tests/pipeline/test_course_daily_metrics.py | 25 +++++++++++----------
 2 files changed, 26 insertions(+), 16 deletions(-)

diff --git a/figures/pipeline/course_daily_metrics.py b/figures/pipeline/course_daily_metrics.py
index 9bc4e2a0..d301b691 100644
--- a/figures/pipeline/course_daily_metrics.py
+++ b/figures/pipeline/course_daily_metrics.py
@@ -28,6 +28,7 @@
 from figures.models import CourseDailyMetrics, PipelineError
 from figures.pipeline.logger import log_error
 import figures.pipeline.loaders
+from figures.pipeline.enrollment_metrics import bulk_calculate_course_progress_data
 from figures.serializers import CourseIndexSerializer
 from figures.compat import GeneratedCertificate
 import figures.sites
@@ -81,7 +82,7 @@ def get_active_learner_ids_today(course_id, date_for):
     ).values_list('student__id', flat=True).distinct()
 
 
-def get_average_progress(course_id, date_for, course_enrollments):
+def get_average_progress_deprecated(course_id, date_for, course_enrollments):
     """Collects and aggregates raw course grades data
     """
     progress = []
@@ -225,6 +226,9 @@ def extract(self, course_id, date_for=None, **_kwargs):
             average_days_to_complete=data.get('average_days_to_complete, None'),
             num_learners_completed=data['num_learners_completed'],
         )
+        TODO: refactor this class
+        Add a lazy loading method to load course enrollments
+        - Create a method for each metric field
         """
 
         # Update args if not assigned
@@ -247,18 +251,23 @@ def extract(self, course_id, date_for=None, **_kwargs):
         # we can do a lambda for course_enrollments to get the count
         data['enrollment_count'] = course_enrollments.count()
+
         active_learner_ids_today = get_active_learner_ids_today(
             course_id, date_for,)
         if active_learner_ids_today:
             active_learners_today = active_learner_ids_today.count()
         else:
             active_learners_today = 0
+
         data['active_learners_today'] = active_learners_today
-        data['average_progress'] = get_average_progress(
-            course_id, date_for, course_enrollments,)
+
+        # Average progress
+        progress_data = bulk_calculate_course_progress_data(course_id=course_id,
+                                                            date_for=date_for)
+        data['average_progress'] = progress_data['average_progress']
+
         data['average_days_to_complete'] = get_average_days_to_complete(
             course_id, date_for,)
+
         data['num_learners_completed'] = get_num_learners_completed(
             course_id, date_for,)

diff --git a/tests/pipeline/test_course_daily_metrics.py b/tests/pipeline/test_course_daily_metrics.py
index d898c8f7..daa687d6 100644
--- a/tests/pipeline/test_course_daily_metrics.py
+++ b/tests/pipeline/test_course_daily_metrics.py
@@ -158,7 +158,7 @@ def test_get_active_learner_ids_today(self):
             course_id=self.course_overview.id, date_for=self.today)
         assert recs.count() == len(self.course_enrollments)
 
-    def test_get_average_progress(self):
+    def test_get_average_progress_deprecated(self):
         """
         [John] This test needs work. The function it is testing needs work
         too, for testability. We don't want to reproduce the function's behavior, we
@@ -167,7 +167,7 @@ def test_get_average_progress(self):
         """
         course_enrollments = CourseEnrollment.objects.filter(
             course_id=self.course_overview.id)
-        actual = pipeline_cdm.get_average_progress(
+        actual = pipeline_cdm.get_average_progress_deprecated(
             course_id=self.course_overview.id,
             date_for=self.today,
             course_enrollments=course_enrollments
         )
@@ -183,13 +183,13 @@ def test_get_average_progress(self):
         'figures.metrics.LearnerCourseGrades.course_progress',
         side_effect=PermissionDenied('mock-failure')
     )
-    def test_get_average_progress_error(self, mock_lcg):
+    def test_get_average_progress_deprecated_has_error(self, mock_lcg):
         assert PipelineError.objects.count() == 0
 
         course_enrollments = CourseEnrollment.objects.filter(
             course_id=self.course_overview.id)
-        results = pipeline_cdm.get_average_progress(
+        results = pipeline_cdm.get_average_progress_deprecated(
             course_id=self.course_overview.id,
             date_for=self.today,
             course_enrollments=course_enrollments
         )
@@ -236,8 +236,12 @@ def setup(self, db):
         self.course_enrollments = [CourseEnrollmentFactory() for i in range(1, 5)]
         self.student_module = StudentModuleFactory()
 
-    def test_extract(self):
+    def test_extract(self, monkeypatch):
         course_id = self.course_enrollments[0].course_id
+        monkeypatch.setattr(figures.pipeline.course_daily_metrics,
+                            'bulk_calculate_course_progress_data',
+                            lambda **_kwargs: dict(average_progress=0.5))
+
         results = pipeline_cdm.CourseDailyMetricsExtractor().extract(course_id)
         assert results
@@ -298,14 +302,11 @@ def test_load_existing(self):
 
     @pytest.mark.parametrize('average_progress', [-1.0, -0.01, 1.01])
     def test_load_invalid_data(self, monkeypatch, average_progress):
-        def mock_get_average_progress(course_id, date_for, course_enrollments):
-            return average_progress
-
         course_id = self.course_enrollments[0].course_id
-        monkeypatch.setattr(
-            figures.pipeline.course_daily_metrics,
-            'get_average_progress',
-            mock_get_average_progress)
+        monkeypatch.setattr(figures.pipeline.course_daily_metrics,
+                            'bulk_calculate_course_progress_data',
+                            lambda **_kwargs: dict(average_progress=average_progress))
+
         with pytest.raises(ValidationError) as execinfo:
             assert CourseDailyMetrics.objects.count() == 0
             cdm, created = pipeline_cdm.CourseDailyMetricsLoader(course_id).load()

From d40dc7807f88be6888dced2e25c54796ca6fb0e3 Mon Sep 17 00:00:00 2001
From: John Baldwin
Date: Sat, 16 May 2020 17:47:25 -0400
Subject: [PATCH 4/4] Addresses Pylint failure "Instance of 'list' has no
 'order_by' member (no-member)"

See: https://github.com/PyCQA/pylint-django/issues/165

Also updated documentation for enrollment metrics
---
 figures/pipeline/enrollment_metrics.py | 32 ++++++++++++++++----------
 1 file changed, 20 insertions(+), 12 deletions(-)

diff --git a/figures/pipeline/enrollment_metrics.py b/figures/pipeline/enrollment_metrics.py
index 1e2479cf..2cb72a3b 100644
--- a/figures/pipeline/enrollment_metrics.py
+++ b/figures/pipeline/enrollment_metrics.py
@@ -13,22 +13,24 @@
 # Logic
 
 ```
-for each learner+course
+
+for each learner+course  # See bulk_calculate_course_progress_data
     get newest lcgm record
     get newest sm record
 
+    # Check if update needed, see _enrollment_metrics_needs_update
     if not lcgm and not sm
         # Learner has not started course
         skip collection for this learner+course
     else if lcgm and sm
         # we have saved history so we can check to update
-        if lcgm.modified is older than sm.modified:
-            collect new course data
+        if lcgm.modified is older than sm.modified
+            collect new course data  # See _collect_progress_data
             return new metrics object created
     else if not lcgm and sm
         # Learner started course after last collection action
-        collect new course data
+        collect new course data  # See _collect_progress_data
         return new metrics object created
     else  # lcgm and not sm
@@ -39,10 +41,9 @@
         return None
 ```
 
-# NOTE: Change 'lcgm' in this file to either 'enrollment' or 'lp' when an
-  abbreviation is acceptable. This is to get this module forward ready for
-  reworking `LearnerCourseGradeMetrics`
-
+# NOTE: We plan to change 'lcgm' in this file to either 'enrollment' or 'LP'
+  (for Learner Progress) when an abbreviation is acceptable. This is to get
+  this module forward ready for reworking `LearnerCourseGradeMetrics`
 """
 
 from datetime import datetime
@@ -148,12 +149,19 @@ def collect_metrics_for_enrollment(course_enrollment, date_for=None, **_kwargs):
         raise UnlinkedCourseError('No site found for course "{}"'.format(
             course_enrollment.course_id))
 
-    most_recent_sm = student_modules_for_course_enrollment(
-        course_enrollment).order_by('modified').last()
+    # The following are two different ways to avoid the dreaded error
+    # "Instance of 'list' has no 'order_by' member (no-member)"
+    # See: https://github.com/PyCQA/pylint-django/issues/165
+    student_modules = student_modules_for_course_enrollment(ce=course_enrollment)
+    if student_modules:
+        most_recent_sm = student_modules.latest('modified')
+    else:
+        most_recent_sm = None
 
-    most_recent_lcgm = LearnerCourseGradeMetrics.objects.filter(
+    lcgm = LearnerCourseGradeMetrics.objects.filter(
         user=course_enrollment.user,
-        course_id=str(course_enrollment.course_id)).order_by('date_for').last()
+        course_id=str(course_enrollment.course_id))
+    most_recent_lcgm = lcgm.order_by('date_for').last()  # pylint: disable=E1101
 
     if _enrollment_metrics_needs_update(most_recent_lcgm, most_recent_sm):
         progress_data = _collect_progress_data(most_recent_sm)