Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enrollment Metrics Rework #214

Merged
merged 5 commits into from
May 17, 2020
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 13 additions & 4 deletions figures/pipeline/course_daily_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
from figures.models import CourseDailyMetrics, PipelineError
from figures.pipeline.logger import log_error
import figures.pipeline.loaders
from figures.pipeline.enrollment_metrics import bulk_calculate_course_progress_data
from figures.serializers import CourseIndexSerializer
from figures.compat import GeneratedCertificate
import figures.sites
Expand Down Expand Up @@ -81,7 +82,7 @@ def get_active_learner_ids_today(course_id, date_for):
).values_list('student__id', flat=True).distinct()


def get_average_progress(course_id, date_for, course_enrollments):
def get_average_progress_deprecated(course_id, date_for, course_enrollments):
"""Collects and aggregates raw course grades data
"""
progress = []
Expand Down Expand Up @@ -225,6 +226,9 @@ def extract(self, course_id, date_for=None, **_kwargs):
average_days_to_complete=data.get('average_days_to_complete, None'),
num_learners_completed=data['num_learners_completed'],
)
TODO: refactor this class
Add lazy loading method to load course enrollments
- Create a method for each metric field
"""

# Update args if not assigned
Expand All @@ -247,18 +251,23 @@ def extract(self, course_id, date_for=None, **_kwargs):
# we can do a lambda for course_enrollments to get the count

data['enrollment_count'] = course_enrollments.count()

active_learner_ids_today = get_active_learner_ids_today(
course_id, date_for,)
if active_learner_ids_today:
active_learners_today = active_learner_ids_today.count()
else:
active_learners_today = 0

data['active_learners_today'] = active_learners_today
data['average_progress'] = get_average_progress(
course_id, date_for, course_enrollments,)

# Average progress
progress_data = bulk_calculate_course_progress_data(course_id=course_id,
date_for=date_for)
data['average_progress'] = progress_data['average_progress']

data['average_days_to_complete'] = get_average_days_to_complete(
course_id, date_for,)

data['num_learners_completed'] = get_num_learners_completed(
course_id, date_for,)

Expand Down
272 changes: 272 additions & 0 deletions figures/pipeline/enrollment_metrics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,272 @@
"""This module collects metrics for a course enrollment (learner/course pair)

# Overview

This is new code for Figures 0.3.10. Its purpose is to address pipeline
performance by reducing calls to `CourseGradeFactory().read.

Calls are reduced by checking if the learner has a `StudentModule` record
modified after the most recent `LearnerCourseGradeMetrics` record `date_for`
field


# Logic

```
for each learner+course
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Great section! Would you mind adding references in the logic to which function I should take a look at e.g.

Suggested change
for each learner+course
for each learner+course # see `main` function

Copy link
Contributor Author

@johnbaldwin johnbaldwin May 13, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Will do!
Edit: After trying this out, I decided to be sparse with referencing which functions get called and focused on readability

get newest lgcm record
get newest sm record

if not lcgm and not sm
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
if not lcgm and not sm
if not lcgm and not sm # see `func_123`

# Learner has not started course
skip collection for this learner+course
else if lcgm and sm
# we have saved history so we can check to update
if lcgm.modified is older than sm.modified:
collect new course data
return new metrics object created

else if not lcgm and sm
# Learner started course after last collection action
collect new course data
return new metrics object created

else # lcgm and not sm
# We had collected data previously, but the upstream went away
# This could happen with data cleansing, like GPPR requests
# Likely other causes, too
log condition to figures PipelineError log
return None
```

# NOTE: Change 'lcgm' in this file to either 'enrollment' or 'lp' when an
abbreviation is acceptible. This is to get this module forward ready for
reworking `LearnerCourseGradeMetrics`

"""

from datetime import datetime
from decimal import Decimal

from django.utils.timezone import utc

from figures.metrics import LearnerCourseGrades
from figures.models import LearnerCourseGradeMetrics, PipelineError
from figures.pipeline.logger import log_error
from figures.sites import (
get_site_for_course,
course_enrollments_for_course,
student_modules_for_course_enrollment,
UnlinkedCourseError,
)


def bulk_calculate_course_progress_data(course_id, date_for=None):
"""Calculates the average progress for a set of course enrollments

How it works
1. collects progress percent for each course enrollment
1.1. If up to date enrollment metrics record already exists, use that
2. calculate and return the average of these enrollments

TODO: Update to filter on active users

Questions:
- What makes a learner an active learner?

"""
progress_percentages = []

if not date_for:
date_for = datetime.utcnow().replace(tzinfo=utc).date()

for ce in course_enrollments_for_course(course_id):
metrics = collect_metrics_for_enrollment(course_enrollment=ce,
date_for=date_for)
if metrics:
progress_percentages.append(metrics.progress_percent)
else:
# Log this for troubleshooting
error_data = dict(
msg=('Unable to create or retrieve enrollment metrics ' +
'for user {} and course {}'.format(ce.user.username,
str(ce.course_id)))
)
log_error(
error_data=error_data,
error_type=PipelineError.COURSE_DATA,
user=ce.user,
course_id=str(course_id)
)

return dict(
average_progress=calculate_average_progress(progress_percentages),
)


def calculate_average_progress(progress_percentages):
"""Calcuates average progress from a list of values

TODO: How do we want to handle malformed data?
"""
if progress_percentages:
average_progress = float(sum(progress_percentages)) / float(len(progress_percentages))
average_progress = float(Decimal(average_progress).quantize(Decimal('.00')))
else:
average_progress = 0.0
return average_progress


def collect_metrics_for_enrollment(course_enrollment, date_for=None, **_kwargs):
"""Collect metrics for enrollment (learner+course)

This function performs course enrollment merics collection for the given
unique enrollment identified by the learner and course. It is initial code
in refactoring the pipeline for learner progress

Important models here are:
* StudentModule (SM)
* LearnerCourseGradeMetrics (LCGM)
* CourseEnrollment (CE)

# Workflow

* Get date of most recent StudentModule record, for the enrollment record
* Get date of most recent LearnerCourseGradeMetrics record, LCGM, for the
enrollment
* if LCGM does not exist or SM.modified is more recent than LCGM.modified then
* collect course_data via CourseEnrollmentFactory().read(...)
* create new LearnerCourseGradesMetics record with the updated data
* retain reference to tne new record
* else return the existing LearnerCourseGradesMetics record if it exists
* else return None if no existing LearnerCourseGradesMetics record
"""
if not date_for:
date_for = datetime.utcnow().replace(tzinfo=utc).date()
site = get_site_for_course(course_enrollment.course_id)
if not site:
raise UnlinkedCourseError('No site found for course "{}"'.format(
course_enrollment.course_id))

most_recent_sm = student_modules_for_course_enrollment(
course_enrollment).order_by('modified').last()

most_recent_lcgm = LearnerCourseGradeMetrics.objects.filter(
user=course_enrollment.user,
course_id=str(course_enrollment.course_id)).order_by('date_for').last()

if _enrollment_metrics_needs_update(most_recent_lcgm, most_recent_sm):
progress_data = _collect_progress_data(most_recent_sm)
metrics = _add_enrollment_metrics_record(site=site,
course_enrollment=course_enrollment,
progress_data=progress_data,
date_for=date_for)
else:
metrics = most_recent_lcgm
return metrics


def _enrollment_metrics_needs_update(most_recent_lcgm, most_recent_sm):
"""Returns True if we need to update our learner progress, False otherwise

See the #Logic section in this module's docstring

If we need to check that the records match the same user and course, we
can do something like:

```
class RecordMismatchError(Exception):
pass


def rec_matches_user_and_course(lcgm, sm):
return lcgm.user == sm.student and lcgm.course_id == sm.course_id
```

And in this function add the check when we have both records:

```
if not rec_matches_user_and_course(most_recent_lcgm, most_recent_sm):
rec_msg = '{}(user_id={}, course_id="{}"'
msg1 = rec_msg.format('lcgm',
most_recent_lcgm.user.id,
most_recent_lcgm.course_id)
msg2 = rec_msg.format('sm',
most_recent_sm.student.id,
most_recent_sm.course_id)
raise RecordMismatchError(msg1 + ':' + msg2)
```
"""
# First assume we need to update the enrollment metrics record
needs_update = True
if not most_recent_lcgm and not most_recent_sm:
# Learner has not started coursework
needs_update = False
elif most_recent_lcgm and most_recent_sm:
# Learner has past course activity
needs_update = most_recent_lcgm.date_for < most_recent_sm.modified.date()
elif not most_recent_lcgm and most_recent_sm:
# No LCGM recs, so Learner started on course after last collection
# This could also happen
# If this is the irst time collection is run for the learner+course
# if an unhandled error prevents LCGM from saving
# if the learner's LCGM recs were deleted
needs_update = True
elif most_recent_lcgm and not most_recent_sm:
# This shouldn't happen. We log this state as an error

# using 'COURSE_DATA' for the pipeline error type. Although we should
# revisit logging and error tracking in Figures to define a clear
# approach that has clear an intuitive contexts for the logging event
#
# We neede to decide:
#
# 1. Is this always an error state? Could this condition happen naturally?
#
# 2. Which error type should be created and what is the most applicable
# context
# Candidates
# - Enrollment (learner+course) context
# - Data state context - Why would we have an LCGM
#
# So we hold off updating PipelineError error choises initially until
# we can think carefully on how we tag pipeline errors
#
error_data = dict(
msg='LearnerCourseGradeMetrics record exists without StudentModule',
)
log_error(
error_data=error_data,
error_type=PipelineError.COURSE_DATA,
user=most_recent_lcgm.user,
course_id=str(most_recent_lcgm.course_id)
)
needs_update = False
return needs_update


def _add_enrollment_metrics_record(site, course_enrollment, progress_data, date_for):
"""Convenience function to save progress metrics to Figures
"""
return LearnerCourseGradeMetrics.objects.create(
site=site,
user=course_enrollment.user,
course_id=str(course_enrollment.course_id),
date_for=date_for,
points_possible=progress_data['points_possible'],
points_earned=progress_data['points_earned'],
sections_worked=progress_data['sections_worked'],
sections_possible=progress_data['count']
)


def _collect_progress_data(student_module):
"""Get new progress data for the learner/course

Uses `figures.metrics.LearnerCourseGrades` to retrieve progress data via
`CourseGradeFactory().read(...)` and calculate progress percentage
"""
lcg = LearnerCourseGrades(user_id=student_module.student_id,
course_id=student_module.course_id)
course_progress_details = lcg.progress()
return course_progress_details
32 changes: 32 additions & 0 deletions figures/sites.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,22 @@ class CourseNotInSiteError(CrossSiteResourceError):
pass


class UnlinkedCourseError(Exception):
"""Raise when we need to fail if we can't get the site for a course

This will happen in multisite mode if the course is not linked to the site.
In Tahoe Hawthorn, we use the Appsembler fork of `edx-organizations` to map
courses to sites. For community and standalone deployments, we don't expect
courses to map to sites, so we just return the app instance's default site.

* This is new to support enrollment metrics rework (May 2020)
* We need to evaluate if we want to make sites.get_site_for_course` to
raise this automatically. But first we need to make sure we have test
coverage to make sure we don't introduce new bugs
"""
pass


def site_to_id(site):
"""
Helper to cast site or site id to id
Expand Down Expand Up @@ -181,3 +197,19 @@ def get_student_modules_for_course_in_site(site, course_id):
def get_student_modules_for_site(site):
course_ids = get_course_keys_for_site(site)
return StudentModule.objects.filter(course_id__in=course_ids)


def course_enrollments_for_course(course_id):
"""Return a queryset of all `CourseEnrollment` records for a course

Relies on the fact that course_ids are globally unique
"""
return CourseEnrollment.objects.filter(course_id=as_course_key(course_id))


def student_modules_for_course_enrollment(ce):
"""Return a queryset of all `StudentModule` records for a `CourseEnrollment`1

Relies on the fact that course_ids are globally unique
"""
return StudentModule.objects.filter(student=ce.user, course_id=ce.course_id)
Loading