Skip to content

Commit

Permalink
Merge pull request #214 from appsembler/john/rework-course-progress
Browse files Browse the repository at this point in the history
Enrollment Metrics Rework
  • Loading branch information
johnbaldwin authored May 17, 2020
2 parents 9eb0331 + d40dc78 commit 7961f4f
Show file tree
Hide file tree
Showing 5 changed files with 650 additions and 16 deletions.
17 changes: 13 additions & 4 deletions figures/pipeline/course_daily_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
from figures.models import CourseDailyMetrics, PipelineError
from figures.pipeline.logger import log_error
import figures.pipeline.loaders
from figures.pipeline.enrollment_metrics import bulk_calculate_course_progress_data
from figures.serializers import CourseIndexSerializer
from figures.compat import GeneratedCertificate
import figures.sites
Expand Down Expand Up @@ -81,7 +82,7 @@ def get_active_learner_ids_today(course_id, date_for):
).values_list('student__id', flat=True).distinct()


def get_average_progress(course_id, date_for, course_enrollments):
def get_average_progress_deprecated(course_id, date_for, course_enrollments):
"""Collects and aggregates raw course grades data
"""
progress = []
Expand Down Expand Up @@ -225,6 +226,9 @@ def extract(self, course_id, date_for=None, **_kwargs):
average_days_to_complete=data.get('average_days_to_complete, None'),
num_learners_completed=data['num_learners_completed'],
)
TODO: refactor this class
Add lazy loading method to load course enrollments
- Create a method for each metric field
"""

# Update args if not assigned
Expand All @@ -247,18 +251,23 @@ def extract(self, course_id, date_for=None, **_kwargs):
# we can do a lambda for course_enrollments to get the count

data['enrollment_count'] = course_enrollments.count()

active_learner_ids_today = get_active_learner_ids_today(
course_id, date_for,)
if active_learner_ids_today:
active_learners_today = active_learner_ids_today.count()
else:
active_learners_today = 0

data['active_learners_today'] = active_learners_today
data['average_progress'] = get_average_progress(
course_id, date_for, course_enrollments,)

# Average progress
progress_data = bulk_calculate_course_progress_data(course_id=course_id,
date_for=date_for)
data['average_progress'] = progress_data['average_progress']

data['average_days_to_complete'] = get_average_days_to_complete(
course_id, date_for,)

data['num_learners_completed'] = get_num_learners_completed(
course_id, date_for,)

Expand Down
280 changes: 280 additions & 0 deletions figures/pipeline/enrollment_metrics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,280 @@
"""This module collects metrics for a course enrollment (learner/course pair)
# Overview
This is new code for Figures 0.3.10. Its purpose is to address pipeline
performance by reducing calls to `CourseGradeFactory().read.
Calls are reduced by checking if the learner has a `StudentModule` record
modified after the most recent `LearnerCourseGradeMetrics` record `date_for`
field
# Logic
```
for each learner+course # See bulk_calculate_course_progress_data
get newest lgcm record
get newest sm record
# Check if update needed, see _enrollment_metrics_needs_update
if not lcgm and not sm
# Learner has not started course
skip collection for this learner+course
else if lcgm and sm
# we have saved history so we can check to update
if lcgm.modified is older than sm.modified
collect new course data # See _collect_progress_data
return new metrics object created
else if not lcgm and sm
# Learner started course after last collection action
collect new course data # See _collect_progress_data
return new metrics object created
else # lcgm and not sm
# We had collected data previously, but the upstream went away
# This could happen with data cleansing, like GPPR requests
# Likely other causes, too
log condition to figures PipelineError log
return None
```
# NOTE: We plan to change 'lcgm' in this file to either 'enrollment' or 'LP'
(for Learner Progress) when an abbreviation is acceptible. This is to get
this module forward ready for reworking `LearnerCourseGradeMetrics`
"""

from datetime import datetime
from decimal import Decimal

from django.utils.timezone import utc

from figures.metrics import LearnerCourseGrades
from figures.models import LearnerCourseGradeMetrics, PipelineError
from figures.pipeline.logger import log_error
from figures.sites import (
get_site_for_course,
course_enrollments_for_course,
student_modules_for_course_enrollment,
UnlinkedCourseError,
)


def bulk_calculate_course_progress_data(course_id, date_for=None):
"""Calculates the average progress for a set of course enrollments
How it works
1. collects progress percent for each course enrollment
1.1. If up to date enrollment metrics record already exists, use that
2. calculate and return the average of these enrollments
TODO: Update to filter on active users
Questions:
- What makes a learner an active learner?
"""
progress_percentages = []

if not date_for:
date_for = datetime.utcnow().replace(tzinfo=utc).date()

for ce in course_enrollments_for_course(course_id):
metrics = collect_metrics_for_enrollment(course_enrollment=ce,
date_for=date_for)
if metrics:
progress_percentages.append(metrics.progress_percent)
else:
# Log this for troubleshooting
error_data = dict(
msg=('Unable to create or retrieve enrollment metrics ' +
'for user {} and course {}'.format(ce.user.username,
str(ce.course_id)))
)
log_error(
error_data=error_data,
error_type=PipelineError.COURSE_DATA,
user=ce.user,
course_id=str(course_id)
)

return dict(
average_progress=calculate_average_progress(progress_percentages),
)


def calculate_average_progress(progress_percentages):
"""Calcuates average progress from a list of values
TODO: How do we want to handle malformed data?
"""
if progress_percentages:
average_progress = float(sum(progress_percentages)) / float(len(progress_percentages))
average_progress = float(Decimal(average_progress).quantize(Decimal('.00')))
else:
average_progress = 0.0
return average_progress


def collect_metrics_for_enrollment(course_enrollment, date_for=None, **_kwargs):
"""Collect metrics for enrollment (learner+course)
This function performs course enrollment merics collection for the given
unique enrollment identified by the learner and course. It is initial code
in refactoring the pipeline for learner progress
Important models here are:
* StudentModule (SM)
* LearnerCourseGradeMetrics (LCGM)
* CourseEnrollment (CE)
# Workflow
* Get date of most recent StudentModule record, for the enrollment record
* Get date of most recent LearnerCourseGradeMetrics record, LCGM, for the
enrollment
* if LCGM does not exist or SM.modified is more recent than LCGM.modified then
* collect course_data via CourseEnrollmentFactory().read(...)
* create new LearnerCourseGradesMetics record with the updated data
* retain reference to tne new record
* else return the existing LearnerCourseGradesMetics record if it exists
* else return None if no existing LearnerCourseGradesMetics record
"""
if not date_for:
date_for = datetime.utcnow().replace(tzinfo=utc).date()
site = get_site_for_course(course_enrollment.course_id)
if not site:
raise UnlinkedCourseError('No site found for course "{}"'.format(
course_enrollment.course_id))

# The following are two different ways to avoide the dreaded error
# "Instance of 'list' has no 'order_by' member (no-member)"
# See: https://github.com/PyCQA/pylint-django/issues/165
student_modules = student_modules_for_course_enrollment(ce=course_enrollment)
if student_modules:
most_recent_sm = student_modules.latest('modified')
else:
most_recent_sm = None

lcgm = LearnerCourseGradeMetrics.objects.filter(
user=course_enrollment.user,
course_id=str(course_enrollment.course_id))
most_recent_lcgm = lcgm.order_by('date_for').last() # pylint: disable=E1101

if _enrollment_metrics_needs_update(most_recent_lcgm, most_recent_sm):
progress_data = _collect_progress_data(most_recent_sm)
metrics = _add_enrollment_metrics_record(site=site,
course_enrollment=course_enrollment,
progress_data=progress_data,
date_for=date_for)
else:
metrics = most_recent_lcgm
return metrics


def _enrollment_metrics_needs_update(most_recent_lcgm, most_recent_sm):
"""Returns True if we need to update our learner progress, False otherwise
See the #Logic section in this module's docstring
If we need to check that the records match the same user and course, we
can do something like:
```
class RecordMismatchError(Exception):
pass
def rec_matches_user_and_course(lcgm, sm):
return lcgm.user == sm.student and lcgm.course_id == sm.course_id
```
And in this function add the check when we have both records:
```
if not rec_matches_user_and_course(most_recent_lcgm, most_recent_sm):
rec_msg = '{}(user_id={}, course_id="{}"'
msg1 = rec_msg.format('lcgm',
most_recent_lcgm.user.id,
most_recent_lcgm.course_id)
msg2 = rec_msg.format('sm',
most_recent_sm.student.id,
most_recent_sm.course_id)
raise RecordMismatchError(msg1 + ':' + msg2)
```
"""
# First assume we need to update the enrollment metrics record
needs_update = True
if not most_recent_lcgm and not most_recent_sm:
# Learner has not started coursework
needs_update = False
elif most_recent_lcgm and most_recent_sm:
# Learner has past course activity
needs_update = most_recent_lcgm.date_for < most_recent_sm.modified.date()
elif not most_recent_lcgm and most_recent_sm:
# No LCGM recs, so Learner started on course after last collection
# This could also happen
# If this is the irst time collection is run for the learner+course
# if an unhandled error prevents LCGM from saving
# if the learner's LCGM recs were deleted
needs_update = True
elif most_recent_lcgm and not most_recent_sm:
# This shouldn't happen. We log this state as an error

# using 'COURSE_DATA' for the pipeline error type. Although we should
# revisit logging and error tracking in Figures to define a clear
# approach that has clear an intuitive contexts for the logging event
#
# We neede to decide:
#
# 1. Is this always an error state? Could this condition happen naturally?
#
# 2. Which error type should be created and what is the most applicable
# context
# Candidates
# - Enrollment (learner+course) context
# - Data state context - Why would we have an LCGM
#
# So we hold off updating PipelineError error choises initially until
# we can think carefully on how we tag pipeline errors
#
error_data = dict(
msg='LearnerCourseGradeMetrics record exists without StudentModule',
)
log_error(
error_data=error_data,
error_type=PipelineError.COURSE_DATA,
user=most_recent_lcgm.user,
course_id=str(most_recent_lcgm.course_id)
)
needs_update = False
return needs_update


def _add_enrollment_metrics_record(site, course_enrollment, progress_data, date_for):
"""Convenience function to save progress metrics to Figures
"""
return LearnerCourseGradeMetrics.objects.create(
site=site,
user=course_enrollment.user,
course_id=str(course_enrollment.course_id),
date_for=date_for,
points_possible=progress_data['points_possible'],
points_earned=progress_data['points_earned'],
sections_worked=progress_data['sections_worked'],
sections_possible=progress_data['count']
)


def _collect_progress_data(student_module):
"""Get new progress data for the learner/course
Uses `figures.metrics.LearnerCourseGrades` to retrieve progress data via
`CourseGradeFactory().read(...)` and calculate progress percentage
"""
lcg = LearnerCourseGrades(user_id=student_module.student_id,
course_id=student_module.course_id)
course_progress_details = lcg.progress()
return course_progress_details
32 changes: 32 additions & 0 deletions figures/sites.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,22 @@ class CourseNotInSiteError(CrossSiteResourceError):
pass


class UnlinkedCourseError(Exception):
"""Raise when we need to fail if we can't get the site for a course
This will happen in multisite mode if the course is not linked to the site.
In Tahoe Hawthorn, we use the Appsembler fork of `edx-organizations` to map
courses to sites. For community and standalone deployments, we don't expect
courses to map to sites, so we just return the app instance's default site.
* This is new to support enrollment metrics rework (May 2020)
* We need to evaluate if we want to make sites.get_site_for_course` to
raise this automatically. But first we need to make sure we have test
coverage to make sure we don't introduce new bugs
"""
pass


def site_to_id(site):
"""
Helper to cast site or site id to id
Expand Down Expand Up @@ -181,3 +197,19 @@ def get_student_modules_for_course_in_site(site, course_id):
def get_student_modules_for_site(site):
course_ids = get_course_keys_for_site(site)
return StudentModule.objects.filter(course_id__in=course_ids)


def course_enrollments_for_course(course_id):
"""Return a queryset of all `CourseEnrollment` records for a course
Relies on the fact that course_ids are globally unique
"""
return CourseEnrollment.objects.filter(course_id=as_course_key(course_id))


def student_modules_for_course_enrollment(ce):
"""Return a queryset of all `StudentModule` records for a `CourseEnrollment`1
Relies on the fact that course_ids are globally unique
"""
return StudentModule.objects.filter(student=ce.user, course_id=ce.course_id)
Loading

0 comments on commit 7961f4f

Please sign in to comment.