Merge pull request #214 from appsembler/john/rework-course-progress

Enrollment Metrics Rework
appsembler · May 17, 2020 · 7961f4f · 7961f4f
2 parents 9eb0331 + d40dc78
commit 7961f4f
Show file tree

Hide file tree

Showing 5 changed files with 650 additions and 16 deletions.
diff --git a/figures/pipeline/course_daily_metrics.py b/figures/pipeline/course_daily_metrics.py
@@ -28,6 +28,7 @@
 from figures.models import CourseDailyMetrics, PipelineError
 from figures.pipeline.logger import log_error
 import figures.pipeline.loaders
+from figures.pipeline.enrollment_metrics import bulk_calculate_course_progress_data
 from figures.serializers import CourseIndexSerializer
 from figures.compat import GeneratedCertificate
 import figures.sites
@@ -81,7 +82,7 @@ def get_active_learner_ids_today(course_id, date_for):
         ).values_list('student__id', flat=True).distinct()
 
 
-def get_average_progress(course_id, date_for, course_enrollments):
+def get_average_progress_deprecated(course_id, date_for, course_enrollments):
     """Collects and aggregates raw course grades data
     """
     progress = []
@@ -225,6 +226,9 @@ def extract(self, course_id, date_for=None, **_kwargs):
                 average_days_to_complete=data.get('average_days_to_complete, None'),
                 num_learners_completed=data['num_learners_completed'],
             )
+        TODO: refactor this class
+        Add lazy loading method to load course enrollments
+        - Create a method for each metric field
         """
 
         # Update args if not assigned
@@ -247,18 +251,23 @@ def extract(self, course_id, date_for=None, **_kwargs):
         # we can do a lambda for course_enrollments to get the count
 
         data['enrollment_count'] = course_enrollments.count()
+
         active_learner_ids_today = get_active_learner_ids_today(
             course_id, date_for,)
         if active_learner_ids_today:
             active_learners_today = active_learner_ids_today.count()
         else:
             active_learners_today = 0
-
         data['active_learners_today'] = active_learners_today
-        data['average_progress'] = get_average_progress(
-            course_id, date_for, course_enrollments,)
+
+        # Average progress
+        progress_data = bulk_calculate_course_progress_data(course_id=course_id,
+                                                            date_for=date_for)
+        data['average_progress'] = progress_data['average_progress']
+
         data['average_days_to_complete'] = get_average_days_to_complete(
             course_id, date_for,)
+
         data['num_learners_completed'] = get_num_learners_completed(
             course_id, date_for,)
 

diff --git a/figures/pipeline/enrollment_metrics.py b/figures/pipeline/enrollment_metrics.py
@@ -0,0 +1,280 @@
+"""This module collects metrics for a course enrollment (learner/course pair)
+
+# Overview
+
+This is new code for Figures 0.3.10. Its purpose is to address pipeline
+performance by reducing calls to `CourseGradeFactory().read.
+
+Calls are reduced by checking if the learner has a `StudentModule` record
+modified after the most recent `LearnerCourseGradeMetrics` record `date_for`
+field
+
+
+# Logic
+
+```
+
+for each learner+course  # See bulk_calculate_course_progress_data
+    get newest lgcm record
+    get newest sm record
+
+    # Check if update needed, see _enrollment_metrics_needs_update
+    if not lcgm and not sm
+        # Learner has not started course
+        skip collection for this learner+course
+    else if lcgm and sm
+        # we have saved history so we can check to update
+        if lcgm.modified is older than sm.modified
+            collect new course data  # See _collect_progress_data
+            return new metrics object created
+
+    else if not lcgm and sm
+        # Learner started course after last collection action
+        collect new course data  # See _collect_progress_data
+        return new metrics object created
+
+    else  # lcgm and not sm
+        # We had collected data previously, but the upstream went away
+        # This could happen with data cleansing, like GPPR requests
+        # Likely other causes, too
+        log condition to figures PipelineError log
+        return None
+```
+
+# NOTE: We plan to change 'lcgm' in this file to either 'enrollment' or 'LP'
+       (for Learner Progress) when an abbreviation is acceptible. This is to get
+       this module forward ready for reworking `LearnerCourseGradeMetrics`
+"""
+
+from datetime import datetime
+from decimal import Decimal
+
+from django.utils.timezone import utc
+
+from figures.metrics import LearnerCourseGrades
+from figures.models import LearnerCourseGradeMetrics, PipelineError
+from figures.pipeline.logger import log_error
+from figures.sites import (
+    get_site_for_course,
+    course_enrollments_for_course,
+    student_modules_for_course_enrollment,
+    UnlinkedCourseError,
+    )
+
+
+def bulk_calculate_course_progress_data(course_id, date_for=None):
+    """Calculates the average progress for a set of course enrollments
+
+    How it works
+    1. collects progress percent for each course enrollment
+        1.1. If up to date enrollment metrics record already exists, use that
+    2. calculate and return the average of these enrollments
+
+    TODO: Update to filter on active users
+
+    Questions:
+    - What makes a learner an active learner?
+
+    """
+    progress_percentages = []
+
+    if not date_for:
+        date_for = datetime.utcnow().replace(tzinfo=utc).date()
+
+    for ce in course_enrollments_for_course(course_id):
+        metrics = collect_metrics_for_enrollment(course_enrollment=ce,
+                                                 date_for=date_for)
+        if metrics:
+            progress_percentages.append(metrics.progress_percent)
+        else:
+            # Log this for troubleshooting
+            error_data = dict(
+                msg=('Unable to create or retrieve enrollment metrics ' +
+                     'for user {} and course {}'.format(ce.user.username,
+                                                        str(ce.course_id)))
+            )
+            log_error(
+                error_data=error_data,
+                error_type=PipelineError.COURSE_DATA,
+                user=ce.user,
+                course_id=str(course_id)
+            )
+
+    return dict(
+        average_progress=calculate_average_progress(progress_percentages),
+    )
+
+
+def calculate_average_progress(progress_percentages):
+    """Calcuates average progress from a list of values
+
+    TODO: How do we want to handle malformed data?
+    """
+    if progress_percentages:
+        average_progress = float(sum(progress_percentages)) / float(len(progress_percentages))
+        average_progress = float(Decimal(average_progress).quantize(Decimal('.00')))
+    else:
+        average_progress = 0.0
+    return average_progress
+
+
+def collect_metrics_for_enrollment(course_enrollment, date_for=None, **_kwargs):
+    """Collect metrics for enrollment (learner+course)
+
+    This function performs course enrollment merics collection for the given
+    unique enrollment identified by the learner and course. It is initial code
+    in refactoring the pipeline for learner progress
+
+    Important models here are:
+    * StudentModule (SM)
+    * LearnerCourseGradeMetrics (LCGM)
+    * CourseEnrollment (CE)
+
+    # Workflow
+
+    * Get date of most recent StudentModule record, for the enrollment record
+    * Get date of most recent LearnerCourseGradeMetrics record, LCGM, for the
+      enrollment
+    * if LCGM does not exist or SM.modified is more recent than LCGM.modified then
+        * collect course_data via CourseEnrollmentFactory().read(...)
+        * create new LearnerCourseGradesMetics record with the updated data
+        * retain reference to tne new record
+    * else return the existing LearnerCourseGradesMetics record if it exists
+    * else return None if no existing LearnerCourseGradesMetics record
+    """
+    if not date_for:
+        date_for = datetime.utcnow().replace(tzinfo=utc).date()
+    site = get_site_for_course(course_enrollment.course_id)
+    if not site:
+        raise UnlinkedCourseError('No site found for course "{}"'.format(
+            course_enrollment.course_id))
+
+    # The following are two different ways to avoide the dreaded error
+    #     "Instance of 'list' has no 'order_by' member (no-member)"
+    # See: https://github.com/PyCQA/pylint-django/issues/165
+    student_modules = student_modules_for_course_enrollment(ce=course_enrollment)
+    if student_modules:
+        most_recent_sm = student_modules.latest('modified')
+    else:
+        most_recent_sm = None
+
+    lcgm = LearnerCourseGradeMetrics.objects.filter(
+        user=course_enrollment.user,
+        course_id=str(course_enrollment.course_id))
+    most_recent_lcgm = lcgm.order_by('date_for').last()  # pylint: disable=E1101
+
+    if _enrollment_metrics_needs_update(most_recent_lcgm, most_recent_sm):
+        progress_data = _collect_progress_data(most_recent_sm)
+        metrics = _add_enrollment_metrics_record(site=site,
+                                                 course_enrollment=course_enrollment,
+                                                 progress_data=progress_data,
+                                                 date_for=date_for)
+    else:
+        metrics = most_recent_lcgm
+    return metrics
+
+
+def _enrollment_metrics_needs_update(most_recent_lcgm, most_recent_sm):
+    """Returns True if we need to update our learner progress, False otherwise
+
+    See the #Logic section in this module's docstring
+
+    If we need to check that the records match the same user and course, we
+    can do something like:
+
+    ```
+    class RecordMismatchError(Exception):
+        pass
+
+
+    def rec_matches_user_and_course(lcgm, sm):
+        return lcgm.user == sm.student and lcgm.course_id == sm.course_id
+    ```
+
+    And in this function add the check when we have both records:
+
+    ```
+        if not rec_matches_user_and_course(most_recent_lcgm, most_recent_sm):
+            rec_msg = '{}(user_id={}, course_id="{}"'
+            msg1 = rec_msg.format('lcgm',
+                                  most_recent_lcgm.user.id,
+                                  most_recent_lcgm.course_id)
+            msg2 = rec_msg.format('sm',
+                                  most_recent_sm.student.id,
+                                  most_recent_sm.course_id)
+            raise RecordMismatchError(msg1 + ':' + msg2)
+    ```
+    """
+    # First assume we need to update the enrollment metrics record
+    needs_update = True
+    if not most_recent_lcgm and not most_recent_sm:
+        # Learner has not started coursework
+        needs_update = False
+    elif most_recent_lcgm and most_recent_sm:
+        # Learner has past course activity
+        needs_update = most_recent_lcgm.date_for < most_recent_sm.modified.date()
+    elif not most_recent_lcgm and most_recent_sm:
+        # No LCGM recs, so Learner started on course after last collection
+        # This could also happen
+        # If this is the irst time collection is run for the learner+course
+        # if an unhandled error prevents LCGM from saving
+        # if the learner's LCGM recs were deleted
+        needs_update = True
+    elif most_recent_lcgm and not most_recent_sm:
+        # This shouldn't happen. We log this state as an error
+
+        # using 'COURSE_DATA' for the pipeline error type. Although we should
+        # revisit logging and error tracking in Figures to define a clear
+        # approach that has clear an intuitive contexts for the logging event
+        #
+        # We neede to decide:
+        #
+        # 1. Is this always an error state? Could this condition happen naturally?
+        #
+        # 2. Which error type should be created and what is the most applicable
+        #    context
+        #    Candidates
+        #    - Enrollment (learner+course) context
+        #    - Data state context - Why would we have an LCGM
+        #
+        # So we hold off updating PipelineError error choises initially until
+        # we can think carefully on how we tag pipeline errors
+        #
+        error_data = dict(
+            msg='LearnerCourseGradeMetrics record exists without StudentModule',
+        )
+        log_error(
+            error_data=error_data,
+            error_type=PipelineError.COURSE_DATA,
+            user=most_recent_lcgm.user,
+            course_id=str(most_recent_lcgm.course_id)
+        )
+        needs_update = False
+    return needs_update
+
+
+def _add_enrollment_metrics_record(site, course_enrollment, progress_data, date_for):
+    """Convenience function to save progress metrics to Figures
+    """
+    return LearnerCourseGradeMetrics.objects.create(
+        site=site,
+        user=course_enrollment.user,
+        course_id=str(course_enrollment.course_id),
+        date_for=date_for,
+        points_possible=progress_data['points_possible'],
+        points_earned=progress_data['points_earned'],
+        sections_worked=progress_data['sections_worked'],
+        sections_possible=progress_data['count']
+        )
+
+
+def _collect_progress_data(student_module):
+    """Get new progress data for the learner/course
+
+    Uses `figures.metrics.LearnerCourseGrades` to retrieve progress data via
+    `CourseGradeFactory().read(...)` and calculate progress percentage
+    """
+    lcg = LearnerCourseGrades(user_id=student_module.student_id,
+                              course_id=student_module.course_id)
+    course_progress_details = lcg.progress()
+    return course_progress_details
diff --git a/figures/sites.py b/figures/sites.py
@@ -39,6 +39,22 @@ class CourseNotInSiteError(CrossSiteResourceError):
     pass
 
 
+class UnlinkedCourseError(Exception):
+    """Raise when we need to fail if we can't get the site for a course
+
+    This will happen in multisite mode if the course is not linked to the site.
+    In Tahoe Hawthorn, we use the Appsembler fork of `edx-organizations` to map
+    courses to sites. For community and standalone deployments, we don't expect
+    courses to map to sites, so we just return the app instance's default site.
+
+    * This is new to support enrollment metrics rework (May 2020)
+    * We need to evaluate if we want to make sites.get_site_for_course` to
+      raise this automatically. But first we need to make sure we have test
+      coverage to make sure we don't introduce new bugs
+    """
+    pass
+
+
 def site_to_id(site):
     """
     Helper to cast site or site id to id
@@ -181,3 +197,19 @@ def get_student_modules_for_course_in_site(site, course_id):
 def get_student_modules_for_site(site):
     course_ids = get_course_keys_for_site(site)
     return StudentModule.objects.filter(course_id__in=course_ids)
+
+
+def course_enrollments_for_course(course_id):
+    """Return a queryset of all `CourseEnrollment` records for a course
+
+    Relies on the fact that course_ids are globally unique
+    """
+    return CourseEnrollment.objects.filter(course_id=as_course_key(course_id))
+
+
+def student_modules_for_course_enrollment(ce):
+    """Return a queryset of all `StudentModule` records for a `CourseEnrollment`1
+
+    Relies on the fact that course_ids are globally unique
+    """
+    return StudentModule.objects.filter(student=ce.user, course_id=ce.course_id)