{#
    do_drop: drop one deprecated relation if it exists.

    Args:
        type:     "view", "table", or "mv".
                  "mv" drops both the view named `<relation>_mv` and the
                  destination table named `<relation>`.
        schema:   name of the database/schema that holds the relation.
        relation: unqualified relation name.

    The generated statement is printed to stdout and then executed with
    run_query. The arguments are concatenated into the statement verbatim,
    so they must come from trusted project code, never from user input.
#}
{% macro do_drop(type, schema, relation) %}
    {% if type == "mv" %}
        {# A materialized view is stored as a "<relation>_mv" view plus a destination table. #}
        {% do do_drop("view", schema, relation ~ "_mv") %}
        {% do do_drop("table", schema, relation) %}
    {% else %}
        {% set cmd = "drop " ~ type ~ " if exists " ~ schema ~ "." ~ relation ~ ";" %}
        {#
            Call dbt's print() function through `do`. The bare Jinja print tag
            would only render the command into this macro's (discarded) text
            output instead of writing it to stdout.
        #}
        {% do print(cmd) %}
        {% do run_query(cmd) %}
    {% endif %}
{% endmacro %}

{#
    remove_deprecated_models: drop relations for models that have been removed
    from this project. dbt does not remove them on its own.

    Run with:
        dbt run-operation remove_deprecated_models

    Database names are read from the same environment variables (with the same
    defaults) used below. `event_sink` is only reported in the banner; no
    relation is currently dropped from it.
#}
{% macro remove_deprecated_models() %}
    {% set xapi = env_var("ASPECTS_XAPI_DATABASE", "xapi") %}
    {% set reporting = env_var("DBT_PROFILE_TARGET_DATABASE", "reporting") %}
    {% set event_sink = env_var("ASPECTS_EVENT_SINK_DATABASE", "event_sink") %}

    {% do print(
        "Running remove_deprecated_models on "
        ~ xapi
        ~ ", "
        ~ reporting
        ~ ", "
        ~ event_sink
        ~ "."
    ) %}

    {# Models removed in https://github.com/openedx/aspects-dbt/pull/111/ #}
    {% do do_drop("mv", xapi, "completion_events") %}
    {% do do_drop("view", reporting, "fact_completions") %}
    {% do do_drop("view", xapi, "fact_forum_interactions") %}
    {% do do_drop("mv", xapi, "forum_events") %}
    {% do do_drop("view", reporting, "fact_grades") %}
    {% do do_drop("view", reporting, "learner_summary") %}
    {% do do_drop("view", reporting, "fact_navigation_dropoff") %}
    {% do do_drop("view", reporting, "fact_learner_problem_summary") %}
    {% do do_drop("view", reporting, "fact_problem_engagement") %}
    {% do do_drop("view", reporting, "fact_problem_engagement_per_subsection") %}
    {% do do_drop("view", reporting, "fact_problem_responses_extended") %}
    {% do do_drop("view", reporting, "dim_at_risk_learners") %}
    {% do do_drop("view", reporting, "fact_transcript_usage") %}
    {% do do_drop("view", reporting, "fact_watched_video_segments") %}
    {# NOTE(review): the other "mv" drops target the xapi database; confirm this one really lives in reporting. #}
    {% do do_drop("mv", reporting, "video_transcript_events") %}

{% endmacro %}
partition_by="(toYYYYMM(emission_time))", - ttl=env_var("ASPECTS_DATA_TTL_EXPRESSION", ""), - ) -}} - -select - event_id, - CAST(emission_time, 'DateTime') as emission_time, - actor_id, - object_id, - course_key, - org, - verb_id, - JSON_VALUE( - event, - '$.result.extensions."https://w3id.org/xapi/cmi5/result/extensions/progress"' - ) as progress_percent -from {{ ref("xapi_events_all_parsed") }} -where verb_id = 'http://adlnet.gov/expapi/verbs/progressed' diff --git a/models/completion/fact_completions.sql b/models/completion/fact_completions.sql deleted file mode 100644 index 2e733305..00000000 --- a/models/completion/fact_completions.sql +++ /dev/null @@ -1,61 +0,0 @@ -with - completions as ( - select - emission_time, - org, - course_key, - actor_id, - progress_percent, - if( - object_id like '%/course/%', - splitByString('/course/', object_id)[-1], - splitByString('/xblock/', object_id)[-1] - ) as entity_id, - cast(progress_percent as Float) / 100 as scaled_progress - from {{ ref("completion_events") }} - ) - -select - completions.emission_time as emission_time, - completions.org as org, - completions.course_key as course_key, - courses.course_name as course_name, - courses.course_run as course_run, - completions.entity_id as entity_id, - if(blocks.block_name != '', blocks.block_name, courses.course_name) as entity_name, - if( - blocks.block_name != '', blocks.display_name_with_location, null - ) as entity_name_with_location, - completions.actor_id as actor_id, - cast(completions.scaled_progress as Float) as scaled_progress, - case - when scaled_progress >= 0.9 - then '90-100%' - when scaled_progress >= 0.8 and scaled_progress < 0.9 - then '80-89%' - when scaled_progress >= 0.7 and scaled_progress < 0.8 - then '70-79%' - when scaled_progress >= 0.6 and scaled_progress < 0.7 - then '60-69%' - when scaled_progress >= 0.5 and scaled_progress < 0.6 - then '50-59%' - when scaled_progress >= 0.4 and scaled_progress < 0.5 - then '40-49%' - when scaled_progress >= 0.3 
and scaled_progress < 0.4 - then '30-39%' - when scaled_progress >= 0.2 and scaled_progress < 0.3 - then '20-29%' - when scaled_progress >= 0.1 and scaled_progress < 0.2 - then '10-19%' - else '0-9%' - end as completion_bucket, - users.username as username, - users.name as name, - users.email as email -from completions -join {{ ref("course_names") }} courses on completions.course_key = courses.course_key -left join - {{ ref("course_block_names") }} blocks on completions.entity_id = blocks.location -left outer join - {{ ref("dim_user_pii") }} users - on toUUID(completions.actor_id) = users.external_user_id diff --git a/models/completion/schema.yml b/models/completion/schema.yml deleted file mode 100644 index ade06b83..00000000 --- a/models/completion/schema.yml +++ /dev/null @@ -1,77 +0,0 @@ -version: 2 - -models: - - name: fact_completions - database: "{{ env_var('DBT_PROFILE_TARGET_DATABASE', 'reporting') }}" - description: "One record per completion events for component" - columns: - - name: emission_time - description: "Timestamp, to the second, of when this event was emitted" - data_type: DateTime64(3) - - name: org - data_type: String - description: "The organization that the course belongs to" - - name: course_key - data_type: String - description: "The course key for the course" - - name: course_name - data_type: String - description: "The name of the course" - - name: course_run - data_type: String - description: "The course run for the course" - - name: entity_id - description: "The block ID or course key for the graded entity" - data_type: String - - name: entity_name - data_type: String - description: "The name of the graded entity (course or block)" - - name: entity_name_with_location - data_type: Nullable(String) - description: "The entity's display name with section, subsection, and unit prepended to the name. 
This provides additional context when looking at block names and can help data consumers understand which block they are analyzing" - - name: actor_id - data_type: String - description: "The xAPI actor identifier" - - name: scaled_progress - description: "A ratio between 0 and 1, inclusive, of the learner's progress" - data_type: Float32 - - name: completion_bucket - description: "A displayable value of progress sorted into 10% buckets. Useful for grouping progress together to show high-level learner performance" - data_type: String - - name: username - data_type: String - description: "The username of the learner" - - name: name - data_type: String - description: "The full name of the learner" - - name: email - data_type: String - description: "The email address of the learner" - - - name: completion_events - description: "A materialized view for xAPI events related to course completions" - columns: - - name: event_id - data_type: uuid - description: "The unique identifier for the event" - - name: emission_time - data_type: datetime - description: "The time the event was emitted" - - name: actor_id - data_type: string - description: "The xAPI actor identifier" - - name: object_id - data_type: string - description: "The xAPI object identifier" - - name: course_key - data_type: string - description: "The course identifier" - - name: org - data_type: string - description: "The organization that the course belongs to" - - name: verb_id - data_type: string - description: "The xAPI verb identifier" - - name: progress_percent - data_type: string - description: "The percentage of the xAPI object completed" diff --git a/models/courses/course_block_names.sql b/models/courses/course_block_names.sql index b331fc45..e8dffd77 100644 --- a/models/courses/course_block_names.sql +++ b/models/courses/course_block_names.sql @@ -19,7 +19,6 @@ }, ) }} - select location, block_name, course_key, graded, course_order, display_name_with_location from {{ ref("most_recent_course_blocks") }} 
diff --git a/models/courses/schema.yml b/models/courses/schema.yml index cc326ac6..4b59c4df 100644 --- a/models/courses/schema.yml +++ b/models/courses/schema.yml @@ -45,7 +45,7 @@ models: description: "The type of block. This can be a section, subsection, unit, or the block type" - name: course_block_names - description: "A table of course blocks with their names" + description: "An in-memory dictionary of course blocks with their display names and additional metadata. Only stores the most recent row per block location." columns: - name: location data_type: String @@ -65,6 +65,21 @@ models: - name: course_order data_type: Int32 description: "The sort order of this block in the course across all course blocks" + - name: section + data_type: Int32 + description: "The section number that this block falls under in the course. Starts at 1." + - name: subsection + data_type: Int32 + description: "The subsection number that this block falls under in the section. Starts at 1." + - name: unit + data_type: Int32 + description: "The unit number that this block falls under in the subsection. Starts at 1." + - name: dump_id + data_type: UUID + description: "The UUID of the event sink run that published this block to ClickHouse. When a course is published all blocks inside it are sent with the same dump_id." + - name: time_last_dumped + data_type: String + description: "The datetime of the event sink run that published this block to ClickHouse. When a course is published all blocks inside it are sent with the same time_last_dumped." - name: most_recent_course_blocks description: "A materialized view of course blocks with their display names and additional metadata. Only stores the most recent row per block location." @@ -101,7 +116,7 @@ models: description: "The UUID of the event sink run that published this block to ClickHouse. When a course is published all blocks inside it are sent with the same dump_id."
- name: time_last_dumped data_type: String - description: "The Datetime of the event sink run that published this block to ClickHouse. When a course is published all blocks inside it are sent with the same time_last_dumped." + description: "The datetime of the event sink run that published this block to ClickHouse. When a course is published all blocks inside it are sent with the same time_last_dumped." - name: course_names description: "A table of courses with their names" diff --git a/models/forum/fact_forum_interactions.sql b/models/forum/fact_forum_interactions.sql deleted file mode 100644 index 3da949aa..00000000 --- a/models/forum/fact_forum_interactions.sql +++ /dev/null @@ -1,17 +0,0 @@ -select - forum.event_id as event_id, - forum.emission_time as emission_time, - forum.org as org, - forum.course_key as course_key, - courses.course_name as course_name, - courses.course_run as course_run, - forum.object_id as object_id, - forum.actor_id as actor_id, - forum.verb_id as verb_id, - users.username as username, - users.name as name, - users.email as email -from {{ ref("forum_events") }} forum -join {{ ref("course_names") }} courses on (forum.course_key = courses.course_key) -left outer join - {{ ref("dim_user_pii") }} users on toUUID(actor_id) = users.external_user_id diff --git a/models/forum/forum_events.sql b/models/forum/forum_events.sql deleted file mode 100644 index 32ee108f..00000000 --- a/models/forum/forum_events.sql +++ /dev/null @@ -1,24 +0,0 @@ -{{ - config( - materialized="materialized_view", - schema=env_var("ASPECTS_XAPI_DATABASE", "xapi"), - engine=get_engine("ReplacingMergeTree()"), - primary_key="(org, course_key, verb_id)", - order_by="(org, course_key, verb_id, emission_time, actor_id, object_id, event_id)", - partition_by="(toYYYYMM(emission_time))", - ttl=env_var("ASPECTS_DATA_TTL_EXPRESSION", ""), - ) -}} - -select - event_id, - CAST(emission_time, 'DateTime') as emission_time, - org, - course_key, - object_id, - actor_id, - verb_id -from 
{{ ref("xapi_events_all_parsed") }} -where - JSON_VALUE(event, '$.object.definition.type') - = 'http://id.tincanapi.com/activitytype/discussion' diff --git a/models/forum/schema.yml b/models/forum/schema.yml deleted file mode 100644 index 2abbaa84..00000000 --- a/models/forum/schema.yml +++ /dev/null @@ -1,67 +0,0 @@ -version: 2 - -models: - - name: fact_forum_interactions - description: "One record per forum interaction" - columns: - - name: event_id - data_type: UUID - description: "The unique identifier for the event" - - name: emission_time - data_type: DateTime64(3) - description: "Timestamp, to the second, of when this event was emitted" - - name: org - data_type: String - description: "The organization that the course belongs to" - - name: course_key - data_type: String - description: "The course key for the course" - - name: course_name - data_type: String - description: "The name of the course" - - name: course_run - data_type: String - description: "The course run for the course" - - name: object_id - data_type: String - description: "The xAPI object identifier" - - name: actor_id - data_type: String - description: "The xAPI actor identifier" - - name: verb_id - data_type: LowCardinality(String) - description: "The xAPI verb identifier" - - name: username - data_type: String - description: "The username of the learner" - - name: name - data_type: String - description: "The full name of the learner" - - name: email - data_type: String - description: "The email address of the learner" - - - name: forum_events - description: "One record per forum event" - columns: - - name: event_id - data_type: uuid - description: "The unique identifier for the event" - - name: emission_time - data_type: datetime - description: "The time the event was emitted" - - name: org - data_type: string - description: "The organization that the course belongs to" - - name: course_key - data_type: string - description: "The course identifier" - - name: object_id - data_type: string - 
description: "The xAPI object identifier" - - name: actor_id - data_type: string - description: "The xAPI actor identifier" - - name: verb_id - data_type: string - description: "The xAPI verb identifier" diff --git a/models/grading/fact_grades.sql b/models/grading/fact_grades.sql deleted file mode 100644 index 8ce028b7..00000000 --- a/models/grading/fact_grades.sql +++ /dev/null @@ -1,52 +0,0 @@ -with - grades as ( - select - emission_time, - org, - course_key, - case - when object_id like '%/course/%' - then 'course' - when object_id like '%@sequential+block@%' - then 'subsection' - when object_id like '%@problem+block@%' - then 'problem' - end as grade_type, - if( - grade_type = 'course', - splitByString('/course/', object_id)[-1], - splitByString('/xblock/', object_id)[-1] - ) as entity_id, - actor_id, - scaled_score - from {{ ref("grading_events") }} - where - verb_id in ( - 'http://id.tincanapi.com/verb/earned', - 'https://w3id.org/xapi/acrossx/verbs/evaluated' - ) - ) - -select - grades.emission_time as emission_time, - grades.org as org, - grades.course_key as course_key, - courses.course_name as course_name, - courses.course_run as course_run, - grades.entity_id as entity_id, - if(blocks.block_name != '', blocks.block_name, courses.course_name) as entity_name, - if( - blocks.block_name != '', blocks.display_name_with_location, null - ) as entity_name_with_location, - grades.grade_type as grade_type, - grades.actor_id as actor_id, - grades.scaled_score as scaled_score, - {{ get_bucket("scaled_score") }} as grade_bucket, - users.username as username, - users.name as name, - users.email as email -from grades -join {{ ref("course_names") }} courses on grades.course_key = courses.course_key -left join {{ ref("course_block_names") }} blocks on grades.entity_id = blocks.location -left outer join - {{ ref("dim_user_pii") }} users on toUUID(actor_id) = users.external_user_id diff --git a/models/grading/schema.yml b/models/grading/schema.yml index 3b871476..a74d005a 
100644 --- a/models/grading/schema.yml +++ b/models/grading/schema.yml @@ -1,58 +1,6 @@ version: 2 models: - - name: fact_grades - description: "One record per grading event for courses, subsections, and problems" - columns: - - name: emission_time - description: "Timestamp, to the second, of when this event was emitted" - data_type: DateTime64(3) - - name: org - data_type: String - description: "The organization that the course belongs to" - - name: course_key - data_type: String - description: "The course key for the course" - - name: course_name - data_type: String - description: "The name of the course" - - name: course_run - data_type: String - description: "The course run for the course" - - name: entity_id - description: "The block ID or course key for the graded entity" - data_type: String - - name: entity_name - data_type: String - description: "The name of the graded entity (course or block)" - - name: entity_name_with_location - data_type: Nullable(String) - description: "The entity's display name with section, subsection, and unit prepended to the name. This provides additional context when looking at block names and can help data consumers understand which block they are analyzing" - - name: grade_type - description: "The type of object graded" - data_tests: - - accepted_values: - values: ["course", "subsection", "problem"] - data_type: Nullable(String) - - name: actor_id - data_type: String - description: "The xAPI actor identifier" - - name: scaled_score - description: "A ratio between 0 and 1, inclusive, of the learner's grade" - data_type: Float32 - - name: grade_bucket - description: "A displayable value of grades sorted into 10% buckets. 
Useful for grouping grades together to show high-level learner performance" - data_type: String - - name: username - data_type: String - description: "The username of the learner" - - name: name - data_type: String - description: "The full name of the learner" - - name: email - data_type: String - description: "The email address of the learner" - - name: fact_learner_course_grade description: "One record per learner per course for the most recent grade" columns: diff --git a/models/learners/learner_summary.sql b/models/learners/learner_summary.sql deleted file mode 100644 index d3c34034..00000000 --- a/models/learners/learner_summary.sql +++ /dev/null @@ -1,36 +0,0 @@ -with - latest_emission_time as ( - select course_key, actor_id, MAX(emission_time) as last_visited - from {{ ref("fact_navigation") }} - group by course_key, actor_id - ), - enrollment_status as ( - select course_key, actor_id, MAX(emission_time) as max_emission_time - from {{ ref("fact_enrollment_status") }} - group by course_key, actor_id - ) -select - fss.org as org, - fss.course_key as course_key, - fss.actor_id as actor_id, - fss.course_name as course_name, - fss.course_run as course_run, - fss.approving_state as approving_state, - fss.enrollment_mode as enrollment_mode, - fss.enrollment_status as enrollment_status, - fss.course_grade as course_grade, - fss.grade_bucket as grade_bucket, - fss.username as username, - fss.name as name, - fss.email as email, - fes.max_emission_time as emission_time, - let.last_visited as last_visited -from {{ ref("fact_student_status") }} fss -left join - enrollment_status fes - on fss.course_key = fes.course_key - and fss.actor_id = fes.actor_id -left join - latest_emission_time let - on fss.course_key = let.course_key - and fss.actor_id = let.actor_id diff --git a/models/learners/schema.yml b/models/learners/schema.yml deleted file mode 100644 index 8e6b67d0..00000000 --- a/models/learners/schema.yml +++ /dev/null @@ -1,53 +0,0 @@ -version: 2 - -models: - - name: 
learner_summary - description: Summary of a learner with their grade and enrollment status - columns: - - name: org - data_type: string - description: "The organization that the course belongs to" - - name: course_key - data_type: string - description: "The course key for the course" - - name: actor_id - data_type: string - description: "The xAPI actor identifier" - - name: course_name - data_type: string - description: "The name of the course" - - name: course_run - data_type: string - description: "The course run for the course" - - name: approving_state - data_type: string - description: "The most recent approving state for the learner" - - name: enrollment_mode - data_type: string - description: "The mode of enrollment" - - name: enrollment_status - description: "Whether a learner is actively enrolled in a course" - data_tests: - - accepted_values: - values: ["registered", "unregistered"] - - name: course_grade - data_type: float64 - description: "The most recent grade for the learner" - - name: grade_bucket - data_type: string - description: "A displayable value of grades sorted into 10% buckets. Useful for grouping grades together to show high-level learner performance" - - name: username - data_type: String - description: "The username of the learner" - - name: name - data_type: String - description: "The full name of the learner" - - name: email - data_type: String - description: "The email address of the learner" - - name: emission_time - data_type: DateTime - description: "The timestamp, to the second, of the most recent enrollment action for this learner and course." - - name: last_visited - data_type: DateTime - description: "The timestamp, to the second, of the most recent time the learner visited the course." 
diff --git a/models/navigation/fact_navigation_dropoff.sql b/models/navigation/fact_navigation_dropoff.sql deleted file mode 100644 index 9d0c6949..00000000 --- a/models/navigation/fact_navigation_dropoff.sql +++ /dev/null @@ -1,86 +0,0 @@ -with - blocks as ( - select - org, - course_key, - display_name_with_location, - hierarchy_location, - course_order - from {{ ref("dim_course_blocks") }} - where block_id like '%@chapter+block@%' or block_id like '%@sequential+block@%' - ), - page_views_by_section as ( - -- section: x:0:0 - -- take just the first number from the hierarchy location - select - date(emission_time) as emission_date, - org, - course_key, - {{ section_from_display("block_name_with_location") }} as section_number, - actor_id, - count(*) as total_views - from {{ ref("fact_navigation") }} - group by emission_date, org, course_key, section_number, actor_id - ), - page_views_by_subsection as ( - -- subsection: x:y:0 - -- take the first two numbers from the hierarchy location - select - date(emission_time) as emission_date, - org, - course_key, - {{ subsection_from_display("block_name_with_location") }} - as subsection_number, - actor_id, - count(*) as total_views - from {{ ref("fact_navigation") }} - group by emission_date, org, course_key, subsection_number, actor_id - ), - page_views as ( - select - emission_date, - org, - course_key, - 'section' as rollup_name, - section_number as hierarchy_location, - actor_id, - sum(total_views) as total_views - from page_views_by_section - group by - emission_date, org, course_key, rollup_name, hierarchy_location, actor_id - union all - select - emission_date, - org, - course_key, - 'subsection' as rollup_name, - subsection_number as hierarchy_location, - actor_id, - sum(total_views) as total_views - from page_views_by_subsection - group by - emission_date, org, course_key, rollup_name, hierarchy_location, actor_id - ) - -select - page_views.emission_date as emission_date, - page_views.org as org, - 
page_views.course_key as course_key, - page_views.rollup_name as rollup_name, - blocks.display_name_with_location as block_name, - blocks.course_order as course_order, - page_views.actor_id as actor_id, - page_views.total_views as total_views, - users.username as username, - users.name as name, - users.email as email -from page_views -join - blocks - on ( - page_views.org = blocks.org - and page_views.course_key = blocks.course_key - and page_views.hierarchy_location = blocks.hierarchy_location - ) -left outer join - {{ ref("dim_user_pii") }} users on toUUID(actor_id) = users.external_user_id diff --git a/models/navigation/schema.yml b/models/navigation/schema.yml index 55e91436..801c5054 100644 --- a/models/navigation/schema.yml +++ b/models/navigation/schema.yml @@ -87,46 +87,6 @@ models: data_type: Int32 description: "The sort order of this block in the course across all course blocks" - - name: fact_navigation_dropoff - description: "A view for analyzing the number of page visits per learner per section and subsection" - columns: - - name: emission_date - data_type: date - description: "The date the event was emitted" - - name: org - data_type: string - description: "The organization that the course belongs to" - - name: course_key - data_type: string - description: "The course identifier" - - name: rollup_name - data_type: string - description: "The level at which page views are counted" - data_tests: - - accepted_values: - values: ["section", "subsection"] - - name: block_name - data_type: string - description: "The name of the section or subsection" - - name: actor_id - data_type: string - description: "The xAPI actor identifier" - - name: total_views - data_type: uint64 - description: "The total number of times a learner viewed pages in this section or subsection on a given day" - - name: username - data_type: String - description: "The username of the learner" - - name: name - data_type: String - description: "The full name of the learner" - - name: email 
- data_type: String - description: "The email address of the learner" - - name: course_order - data_type: Int32 - description: "The sort order of this block in the course across all course blocks" - - name: fact_navigation_completion description: "A view for analyzing how many pages a learner has visited in a section or subsection" columns: diff --git a/models/problems/fact_learner_problem_summary.sql b/models/problems/fact_learner_problem_summary.sql deleted file mode 100644 index 5f823a7e..00000000 --- a/models/problems/fact_learner_problem_summary.sql +++ /dev/null @@ -1,74 +0,0 @@ --- summary table for a learner's performance on and interactions with a --- particular problem -with - results_with_hints as ( - select - org, - course_key, - course_name, - course_run, - problem_id, - problem_name, - problem_name_with_location, - course_order, - actor_id, - success, - attempts, - 0 as num_hints_displayed, - 0 as num_answers_displayed - from {{ ref("int_problem_results") }} - union all - select - org, - course_key, - course_name, - course_run, - problem_id, - problem_name, - problem_name_with_location, - course_order, - actor_id, - null as success, - null as attempts, - case help_type when 'hint' then 1 else 0 end as num_hints_displayed, - case help_type when 'answer' then 1 else 0 end as num_answers_displayed - from {{ ref("int_problem_hints") }} - ) - --- n.b.: there should only be one row per org, course, problem, and actor --- in problem_results, so any(success) and any(attempts) should return the --- values from that part of the union and not the null values used as --- placeholders in the problem_hints part of the union -select - org, - course_key, - course_name, - course_run, - problem_id, - problem_name, - problem_name_with_location, - course_order, - actor_id, - coalesce(any(success), false) as success, - coalesce(any(attempts), 0) as attempts, - sum(num_hints_displayed) as num_hints_displayed, - sum(num_answers_displayed) as num_answers_displayed, - 
users.username as username, - users.name as name, - users.email as email -from results_with_hints -left outer join - {{ ref("dim_user_pii") }} users on toUUID(actor_id) = users.external_user_id -group by - org, - course_key, - course_name, - course_run, - problem_id, - problem_name, - problem_name_with_location, - course_order, - actor_id, - username, - name, - email diff --git a/models/problems/fact_problem_engagement.sql b/models/problems/fact_problem_engagement.sql deleted file mode 100644 index 733f4847..00000000 --- a/models/problems/fact_problem_engagement.sql +++ /dev/null @@ -1,49 +0,0 @@ -with - subsection_engagement as ( - select - org, - course_key, - 'subsection' as content_level, - actor_id, - subsection_block_id as block_id, - engagement_level as section_subsection_problem_engagement - from {{ ref("subsection_problem_engagement") }} - ), - section_engagement as ( - select - org, - course_key, - 'section' as content_level, - actor_id, - section_block_id as block_id, - engagement_level as section_subsection_problem_engagement - from {{ ref("section_problem_engagement") }} - ), - problem_engagement as ( - select * - from subsection_engagement - union all - select * - from section_engagement - ) -select - pe.org as org, - pe.course_key as course_key, - course_blocks.course_run as course_run, - course_blocks.display_name_with_location as section_subsection_name, - pe.content_level as content_level, - pe.actor_id as actor_id, - pe.section_subsection_problem_engagement as section_subsection_problem_engagement, - users.username as username, - users.name as name, - users.email as email -from problem_engagement pe -join - {{ ref("dim_course_blocks") }} course_blocks - on ( - pe.org = course_blocks.org - and pe.course_key = course_blocks.course_key - and pe.block_id = course_blocks.block_id - ) -left outer join - {{ ref("dim_user_pii") }} users on toUUID(pe.actor_id) = users.external_user_id diff --git a/models/problems/fact_problem_engagement_per_subsection.sql 
b/models/problems/fact_problem_engagement_per_subsection.sql deleted file mode 100644 index 3a18e86b..00000000 --- a/models/problems/fact_problem_engagement_per_subsection.sql +++ /dev/null @@ -1,41 +0,0 @@ -with - attempted_subsection_problems as ( - select distinct - date(emission_time) as attempted_on, - org, - course_key, - course_run, - {{ section_from_display("problem_name_with_location") }} as section_number, - {{ subsection_from_display("problem_name_with_location") }} - as subsection_number, - course_order as course_order, - graded, - actor_id, - problem_id, - username, - name, - email - from {{ ref("fact_problem_responses") }} - ) - -select - attempts.org as org, - attempts.course_key as course_key, - attempts.course_run as course_run, - problems.section_with_name as section_with_name, - problems.subsection_with_name as subsection_with_name, - problems.item_count as item_count, - attempts.actor_id as actor_id, - attempts.problem_id as problem_id, - attempts.username as username, - attempts.name as name, - attempts.email as email -from attempted_subsection_problems attempts -join - {{ ref("int_problems_per_subsection") }} problems - on ( - attempts.org = problems.org - and attempts.course_key = problems.course_key - and attempts.section_number = problems.section_number - and attempts.subsection_number = problems.subsection_number - ) diff --git a/models/problems/fact_problem_responses_extended.sql b/models/problems/fact_problem_responses_extended.sql deleted file mode 100644 index 6a62023d..00000000 --- a/models/problems/fact_problem_responses_extended.sql +++ /dev/null @@ -1,42 +0,0 @@ -with - problem_results as ( - select - *, - {{ section_from_display("problem_name_with_location") }} as section_number, - {{ subsection_from_display("problem_name_with_location") }} - as subsection_number - from {{ ref("int_problem_results") }} - ) -select - results.emission_time as emission_time, - results.org as org, - results.course_key as course_key, - 
results.course_name as course_name, - results.course_run as course_run, - problems.section_with_name as section_with_name, - problems.subsection_with_name as subsection_with_name, - results.problem_id as problem_id, - results.problem_name as problem_name, - results.problem_name_with_location as problem_name_with_location, - results.problem_link as problem_link, - results.actor_id as actor_id, - results.responses as responses, - results.success as success, - results.attempts as attempts, - results.course_order as course_order, - results.graded as graded, - results.interaction_type as interaction_type, - users.username as username, - users.name as name, - users.email as email -from problem_results results -join - {{ ref("int_problems_per_subsection") }} problems - on ( - results.org = problems.org - and results.course_key = problems.course_key - and results.section_number = problems.section_number - and results.subsection_number = problems.subsection_number - ) -left outer join - {{ ref("dim_user_pii") }} users on toUUID(actor_id) = users.external_user_id diff --git a/models/problems/schema.yml b/models/problems/schema.yml index a9cd342d..2c49d727 100644 --- a/models/problems/schema.yml +++ b/models/problems/schema.yml @@ -1,62 +1,6 @@ version: 2 models: - - name: fact_learner_problem_summary - description: "One record per learner per problem in a course" - columns: - - name: org - data_type: String - description: "The organization that the course belongs to" - - name: course_key - data_type: String - description: "The course key for the course" - - name: course_name - data_type: String - description: "The name of the course" - - name: course_run - data_type: String - description: "The course run for the course" - - name: problem_id - data_type: String - description: "The problem's unique identifier" - - name: problem_name - data_type: String - description: "The problem's name" - - name: problem_name_with_location - data_type: String - description: "The problem's 
display name with section, subsection, and unit prepended to the name. This provides additional context when looking at problem names and can help data consumers understand which problem they are analyzing" - - name: actor_id - data_type: String - description: "The xAPI actor identifier" - - name: success - description: "The result of the last submission" - data_tests: - - not_null - data_type: Bool - - name: attempts - description: "The number of attempts made" - data_tests: - - not_null - data_type: Int16 - - name: num_hints_displayed - description: "The number of times a learner asked for a hint" - data_type: UInt64 - - name: num_answers_displayed - description: "The number of times a learner requested the answers for the problem" - data_type: UInt64 - - name: username - data_type: String - description: "The username of the learner" - - name: name - data_type: String - description: "The full name of the learner" - - name: email - data_type: String - description: "The email address of the learner" - - name: course_order - data_type: Int32 - description: "The sort order of this block in the course across all course blocks" - - name: fact_problem_responses description: "One record for each submitted response to a problem" columns: @@ -290,160 +234,6 @@ models: data_type: String description: "The unique identifier for the section block" - - name: fact_problem_engagement_per_subsection - description: "A dataset with one record representing a problem attempted by a learner and the section and subsection that problem belongs to" - columns: - - name: attempted_on - data_type: date - description: "The date on which the problem was attempted" - - name: org - data_type: string - description: "The organization that the problem belongs to" - - name: course_key - data_type: string - description: "The course key for the course" - - name: course_run - data_type: string - description: "The course run for the course" - - name: section_with_name - data_type: string - description: 
"The name of the section this subsection belongs to, with section_number prepended" - - name: subsection_with_name - data_type: string - description: "The name of the subsection, with section_number prepended" - - name: item_count - data_type: uint64 - description: "The number of problems in this subsection" - - name: actor_id - data_type: string - description: "The xAPI actor identifier" - - name: problem_id - data_type: string - description: "The xAPI object identifier" - - name: graded - data_type: bool - description: "Whether the block is graded" - - name: username - data_type: String - description: "The username of the learner" - - name: name - data_type: String - description: "The full name of the learner" - - name: email - data_type: String - description: "The email address of the learner" - - name: course_order - data_type: Int32 - description: "The sort order of this block in the course across all course blocks" - - - - name: fact_problem_engagement - description: "" - columns: - - name: org - data_type: string - description: "The organization that the course belongs to" - - name: course_key - data_type: string - description: "The course key for the course" - - name: course_run - data_type: string - description: "The course run for the course" - - name: section_subsection_name - data_type: string - description: "The name of the section this subsection belongs to, with section_number prepended" - - name: content_level - data_type: string - description: "The name of the subsection, with section_number prepended" - - name: actor_id - data_type: string - description: "The xAPI actor identifier" - - name: section_subsection_problem_engagement - data_type: string - description: "The engagement level of the learner with the section/subsection" - data_tests: - - accepted_values: - values: - - 'No problems attempted yet' - - 'All problems attempted' - - 'At least one problem attempted' - - name: username - data_type: string - description: "The username of the 
learner" - - name: name - data_type: string - description: "The full name of the learner" - - name: email - data_type: string - description: "The email address of the learner" - - - name: fact_problem_responses_extended - description: "int_problem_results with section and subsection names" - columns: - - name: emission_time - data_type: datetime - description: "The time the event was emitted" - - name: org - data_type: string - description: "The organization that the course belongs to" - - name: course_key - data_type: string - description: "The course key for the course" - - name: course_name - data_type: string - description: "The name of the course" - - name: course_run - data_type: string - description: "The course run for the course" - - name: section_with_name - data_type: string - description: "The name of the section this subsection belongs to, with section_number prepended" - - name: subsection_with_name - data_type: string - description: "The name of the subsection, with section_number prepended" - - name: problem_id - data_type: string - description: "The problem's unique identifier" - - name: problem_name - data_type: string - description: "The problem's name" - - name: problem_name_with_location - data_type: string - description: "The problem's display name with section, subsection, and unit prepended to the name. This provides additional context when looking at problem names and can help data consumers understand which problem they are analyzing" - - name: problem_link - data_type: String - description: "An anchor tag with a link to the problem" - - name: actor_id - data_type: string - description: "The xAPI actor identifier" - - name: responses - data_type: string - description: "The responses for this submission. 
If a problem has multiple parts, values for all parts will be in this field" - - name: success - data_type: bool - description: "Boolean indicating whether the responses were correct" - - name: attempts - data_type: int16 - description: "Number indicating which attempt this was" - - name: graded - data_type: bool - description: "Whether this subsection block is graded" - - name: interaction_type - data_type: string - description: "The type of interaction" - - name: username - data_type: String - description: "The username of the learner" - - name: name - data_type: String - description: "The full name of the learner" - - name: email - data_type: String - description: "The email address of the learner" - - name: course_order - data_type: Int32 - description: "The sort order of this block in the course across all course blocks" - - name: responses description: "A record per course per problem per learner with their last attempt and first success" columns: diff --git a/models/users/dim_at_risk_learners.sql b/models/users/dim_at_risk_learners.sql deleted file mode 100644 index 3388f4ed..00000000 --- a/models/users/dim_at_risk_learners.sql +++ /dev/null @@ -1,27 +0,0 @@ -with - page_visits as ( - select org, course_key, actor_id, max(emission_time) as last_visited - from {{ ref("navigation_events") }} - group by org, course_key, actor_id - ) - -select - learners.org as org, - learners.course_key as course_key, - learners.course_name as course_name, - learners.course_run as course_run, - learners.actor_id as actor_id, - learners.username as username, - learners.name as name, - learners.email as email, - learners.enrollment_mode as enrollment_mode, - learners.course_grade as course_grade, - learners.enrolled_at as enrolled_at, - learners.grade_bucket as grade_bucket, - page_visits.last_visited as last_visited -from {{ ref("fact_student_status") }} learners -join page_visits using (org, course_key, actor_id) -where - approving_state = 'failed' - and enrollment_status = 
'registered' - and page_visits.last_visited < subtractDays(now(), 7) diff --git a/models/users/schema.yml b/models/users/schema.yml index 38b02732..8a2b3620 100644 --- a/models/users/schema.yml +++ b/models/users/schema.yml @@ -36,46 +36,3 @@ models: - name: email data_type: String description: "The email of the user" - - - name: dim_at_risk_learners - description: "A record for each learner determined to be at risk of dropping out of a given course" - columns: - - name: org - data_type: String - description: "The organization that the course belongs to" - - name: course_key - data_type: String - description: "The course key for the course" - - name: course_name - data_type: String - description: "The name of the course" - - name: course_run - data_type: String - description: "The course run for the course" - - name: actor_id - data_type: string - description: "The xAPI actor identifier" - - name: username - data_type: string - description: "The username of the user" - - name: name - data_type: string - description: "The name of the user" - - name: email - data_type: string - description: "The email of the user" - - name: enrollment_mode - data_type: string - description: "The mode of enrollment" - - name: course_grade - data_type: float64 - description: "The most recent grade for the learner" - - name: enrolled_at - data_type: DateTime - description: "The timestamp, to the second, of the most recent enrollment action for this learner and course." - - name: grade_bucket - data_type: string - description: "A displayable value of grades sorted into 10% buckets. 
Useful for grouping grades together to show high-level learner performance" - - name: last_visited - data_type: datetime - description: "The last time the learner visited a page for this course" diff --git a/models/video/fact_transcript_usage.sql b/models/video/fact_transcript_usage.sql deleted file mode 100644 index dee79230..00000000 --- a/models/video/fact_transcript_usage.sql +++ /dev/null @@ -1,24 +0,0 @@ -select - transcripts.emission_time as emission_time, - transcripts.org as org, - transcripts.course_key as course_key, - blocks.course_name as course_name, - blocks.course_run as course_run, - transcripts.video_id as video_id, - blocks.block_name as video_name, - blocks.display_name_with_location as video_name_with_location, - blocks.course_order as course_order, - transcripts.actor_id as actor_id, - users.username as username, - users.name as name, - users.email as email -from {{ ref("video_transcript_events") }} transcripts -join - {{ ref("dim_course_blocks") }} blocks - on ( - transcripts.course_key = blocks.course_key - and transcripts.video_id = blocks.block_id - ) -left outer join - {{ ref("dim_user_pii") }} users on toUUID(actor_id) = users.external_user_id -where transcripts.cc_enabled diff --git a/models/video/fact_watched_video_segments.sql b/models/video/fact_watched_video_segments.sql deleted file mode 100644 index c7a89e28..00000000 --- a/models/video/fact_watched_video_segments.sql +++ /dev/null @@ -1,98 +0,0 @@ -with - video_events as ( - select - emission_time, - org, - course_key, - splitByString('/xblock/', object_id)[-1] as video_id, - actor_id, - verb_id, - video_position, - video_duration - from {{ ref("video_playback_events") }} - ), - starts as ( - select * - from video_events - where verb_id = 'https://w3id.org/xapi/video/verbs/played' - ), - ends as ( - select * - from video_events - where - verb_id in ( - 'http://adlnet.gov/expapi/verbs/completed', - 'https://w3id.org/xapi/video/verbs/seeked', - 
'https://w3id.org/xapi/video/verbs/paused', - 'http://adlnet.gov/expapi/verbs/terminated' - ) - ), - segments as ( - select - starts.org as org, - starts.course_key as course_key, - starts.video_id as video_id, - starts.actor_id, - cast(starts.video_position as Int32) as start_position, - cast(ends.video_position as Int32) as end_position, - starts.emission_time as started_at, - ends.emission_time as ended_at, - ends.verb_id as end_type, - starts.video_duration as video_duration - from starts left - asof join - ends - on ( - starts.org = ends.org - and starts.course_key = ends.course_key - and starts.video_id = ends.video_id - and starts.actor_id = ends.actor_id - and starts.emission_time < ends.emission_time - ) - ), - enriched_segments as ( - select - segments.org as org, - segments.course_key as course_key, - blocks.course_name as course_name, - blocks.course_run as course_run, - blocks.section_with_name as section_with_name, - blocks.subsection_with_name as subsection_with_name, - blocks.block_name as video_name, - blocks.display_name_with_location as video_name_with_location, - segments.actor_id as actor_id, - segments.started_at as started_at, - segments.start_position - (segments.start_position % 5) as start_position, - segments.end_position - (segments.end_position % 5) as end_position, - segments.video_duration as video_duration - from segments - join - {{ ref("dim_course_blocks_extended") }} blocks - on ( - segments.course_key = blocks.course_key - and segments.video_id = blocks.block_id - ) - ) - -select - org, - course_key, - course_name, - course_run, - section_with_name, - subsection_with_name, - video_name, - video_name_with_location, - actor_id, - started_at, - arrayJoin(range(start_position, end_position, 5)) as segment_start, - video_duration, - CONCAT(toString(segment_start), '-', toString(segment_start + 4)) as segment_range, - start_position, - username, - name, - email -from enriched_segments -left outer join - {{ ref("dim_user_pii") }} users 
on toUUID(actor_id) = users.external_user_id -order by start_position diff --git a/models/video/schema.yml b/models/video/schema.yml index e12820a7..d6fabfb6 100644 --- a/models/video/schema.yml +++ b/models/video/schema.yml @@ -65,49 +65,6 @@ models: data_type: Int32 description: "The sort order of this block in the course across all course blocks" - - name: fact_transcript_usage - description: "One record for each time a transcript or closed caption was enabled" - columns: - - name: emission_time - data_type: DateTime - description: "Timestamp, to the second, of when this event was emitted" - - name: org - data_type: String - description: "The organization that the course belongs to" - - name: course_key - data_type: String - description: "The course key for the course" - - name: course_name - data_type: String - description: "The name of the course" - - name: course_run - data_type: String - description: "The course run for the course" - - name: video_id - data_type: String - description: "The xAPI object identifier" - - name: video_name - data_type: String - description: "The name of the video" - - name: video_name_with_location - data_type: String - description: "The name of the video with the section and subsection" - - name: actor_id - data_type: String - description: "The xAPI actor identifier" - - name: username - data_type: String - description: "The username of the learner" - - name: name - data_type: String - description: "The full name of the learner" - - name: email - data_type: String - description: "The email address of the learner" - - name: course_order - data_type: Int32 - description: "The sort order of this block in the course across all course blocks" - - name: video_playback_events description: "Events related to video playback" columns: @@ -139,30 +96,6 @@ models: data_type: int64 description: "The duration of the video in seconds" - - name: video_transcript_events - description: "Events related to video transcripts" - columns: - - name: 
event_id - data_type: uuid - description: "The unique identifier for the event" - - name: emission_time - data_type: datetime - description: "The time the event was emitted" - - name: org - data_type: string - description: "The organization that the course belongs to" - - name: course_key - data_type: string - description: "The course identifier" - - name: video_id - data_type: string - description: "The video identifier" - - name: actor_id - data_type: string - description: "The xAPI actor identifier" - - name: cc_enabled - data_type: uint8 - description: "Whether closed captions were enabled" - name: int_videos_per_subsection description: "A dimension table with the number of videos per subsection" @@ -253,61 +186,6 @@ models: description: "The email address of the learner" - - name: fact_watched_video_segments - description: "A dataset with one record per learner per video segment watched in a course" - columns: - - name: org - data_type: string - description: "The organization that the course belongs to" - - name: course_key - data_type: string - description: "The course key for the course" - - name: course_name - data_type: string - description: "The name of the course" - - name: course_run - data_type: string - description: "The course run for the course" - - name: section_with_name - data_type: string - description: "The name of the section this subsection belongs to, with section_number prepended" - - name: subsection_with_name - data_type: string - description: "The name of the subsection, with section_number prepended" - - name: video_name - data_type: string - description: "The name of the video" - - name: video_name_with_location - data_type: string - description: "The name of the video with the section and subsection" - - name: actor_id - data_type: string - description: "The xAPI actor identifier" - - name: started_at - data_type: datetime - description: "The time the video segment was started" - - name: segment_start - data_type: int64 - description: 
"The start position of the segment" - - name: segment_range - data_type: string - description: "An string representing the interval of the watched segment" - - name: video_duration - data_type: int64 - description: "The duration of the video in seconds" - - name: start_position - data_type: int64 - description: "The start position of the segment" - - name: username - data_type: string - description: "The username of the learner" - - name: name - data_type: string - description: "The full name of the learner" - - name: email - data_type: string - description: "The email address of the learner" - - name: section_video_engagement description: "A record per course per section per learner with their engagement level" columns: diff --git a/models/video/unit_tests.yaml b/models/video/unit_tests.yaml index c44a2c9f..6e1ecbb8 100644 --- a/models/video/unit_tests.yaml +++ b/models/video/unit_tests.yaml @@ -1,17 +1,4 @@ unit_tests: - - name: test_video_transcript_events - model: video_transcript_events - config: - tags: 'ci' - given: - - input: ref('xapi_events_all_parsed') - format: sql - rows: | - select * from xapi_events_all_parsed - expect: - format: csv - fixture: video_transcript_events_expected - - name: test_video_playback_events model: video_playback_events config: @@ -26,27 +13,6 @@ unit_tests: rows: | select * from video_playback_events_expected - - name: test_fact_transcript_usage - model: fact_transcript_usage - config: - tags: 'ci' - given: - - input: ref('video_transcript_events') - format: sql - rows: | - select * from video_transcript_events - - input: ref('dim_course_blocks') - format: sql - rows: | - select * from dim_course_blocks - - input: ref('dim_user_pii') - format: sql - rows: | - select * from dim_user_pii - expect: - format: csv - fixture: fact_transcript_usage_expected - - name: test_fact_video_plays model: fact_video_plays config: diff --git a/models/video/video_transcript_events.sql b/models/video/video_transcript_events.sql deleted file mode 
100644 index bcf5df6f..00000000 --- a/models/video/video_transcript_events.sql +++ /dev/null @@ -1,34 +0,0 @@ -{{ - config( - materialized="materialized_view", - schema=env_var("ASPECTS_XAPI_DATABASE", "xapi"), - engine=get_engine("ReplacingMergeTree()"), - primary_key="(org, course_key, video_id)", - order_by="(org, course_key, video_id, emission_time, actor_id, cc_enabled, event_id)", - partition_by="(toYYYYMM(emission_time))", - ttl=env_var("ASPECTS_DATA_TTL_EXPRESSION", ""), - ) -}} - -select - event_id, - CAST(emission_time, 'DateTime') as emission_time, - org, - course_key, - splitByString('/xblock/', object_id)[2] as video_id, - actor_id, - JSONExtractBool( - event, - 'result', - 'extensions', - 'https://w3id.org/xapi/video/extensions/cc-enabled' - ) as cc_enabled -from {{ ref("xapi_events_all_parsed") }} -where - verb_id = 'http://adlnet.gov/expapi/verbs/interacted' - and JSONHas( - event, - 'result', - 'extensions', - 'https://w3id.org/xapi/video/extensions/cc-enabled' - ) diff --git a/tests/learner_problem_summary_uniqueness.sql b/tests/learner_problem_summary_uniqueness.sql deleted file mode 100644 index e09870cb..00000000 --- a/tests/learner_problem_summary_uniqueness.sql +++ /dev/null @@ -1,4 +0,0 @@ -select org, course_key, problem_id, actor_id, count(*) as num_rows -from {{ ref("fact_learner_problem_summary") }} -group by org, course_key, problem_id, actor_id -having num_rows > 1 diff --git a/tests/problem_results_uniqueness.sql b/tests/problem_results_uniqueness.sql deleted file mode 100644 index 83e43c42..00000000 --- a/tests/problem_results_uniqueness.sql +++ /dev/null @@ -1,7 +0,0 @@ --- problem_results should only have one record for the following --- combination of values: --- actor_id, problem_id, course_id, org -select org, course_key, problem_id, actor_id, responses, count(*) as num_rows -from {{ ref("int_problem_results") }} -group by org, course_key, problem_id, actor_id, responses -having num_rows > 1