Commit
Move loading and merging intermediate Reports to its own file
This should make the high-level steps of parallel processing a bit more obvious and easier to grasp.
Showing 3 changed files with 179 additions and 144 deletions.
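For orientation, here is a hypothetical sketch (not part of this commit) of how the new helpers could be composed into those high-level steps. The `merge_batch` wrapper, its parameter list, and the `services.processing.merging` module path are assumptions based on the code shown below; only `services.processing.loading` is confirmed by the imports in the second file.

# Hypothetical composition of the helpers introduced in this commit;
# `merge_batch` and the merging module path are assumptions, not repository code.
from services.processing.loading import load_intermediate_reports
from services.processing.merging import merge_reports, update_uploads  # assumed path


def merge_batch(
    db_session, archive_service, commit_yaml, commitsha, master_report, upload_ids
):
    # 1. Load the per-upload intermediate reports from storage, in parallel.
    intermediate_reports = load_intermediate_reports(
        archive_service, commitsha, upload_ids
    )
    # 2. Merge them into the accumulated "master Report", remapping session ids.
    merge_result = merge_reports(commit_yaml, master_report, intermediate_reports)
    # 3. Persist the new upload states and session ordering.
    update_uploads(db_session, merge_result)
    return merge_result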
@@ -0,0 +1,51 @@
import json
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass

import sentry_sdk
from shared.reports.editable import EditableReport

from services.archive import ArchiveService
from services.processing.state import MERGE_BATCH_SIZE


@dataclass
class IntermediateReport:
    upload_id: int
    """
    The `Upload` id for which this report was loaded.
    """

    report: EditableReport
    """
    The loaded Report.
    """


@sentry_sdk.trace
def load_intermediate_reports(
    archive_service: ArchiveService,
    commitsha: str,
    upload_ids: list[int],
) -> list[IntermediateReport]:
    def load_report(upload_id: int) -> IntermediateReport:
        repo_hash = archive_service.storage_hash
        # TODO: migrate these files to a better storage location
        prefix = f"v4/repos/{repo_hash}/commits/{commitsha}/parallel/incremental"
        chunks_path = f"{prefix}/chunk{upload_id}.txt"
        json_path = f"{prefix}/files_and_sessions{upload_id}.txt"

        chunks = archive_service.read_file(chunks_path).decode(errors="replace")
        report_json = json.loads(archive_service.read_file(json_path))

        report = EditableReport.from_chunks(
            chunks=chunks,
            files=report_json["files"],
            sessions=report_json["sessions"],
            totals=report_json.get("totals"),
        )
        return IntermediateReport(upload_id, report)

    with ThreadPoolExecutor(max_workers=MERGE_BATCH_SIZE) as pool:
        loaded_reports = pool.map(load_report, upload_ids)
        return list(loaded_reports)
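One detail worth noting: `ThreadPoolExecutor.map` yields results in the order of its input iterable, so the returned list lines up index-for-index with `upload_ids` even though the reads happen concurrently. A minimal, self-contained illustration of that ordering guarantee (unrelated to the report types):

from concurrent.futures import ThreadPoolExecutor

# map() preserves input order regardless of which task finishes first.
with ThreadPoolExecutor(max_workers=4) as pool:
    squares = list(pool.map(lambda n: n * n, [3, 1, 2]))
assert squares == [9, 1, 4]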
@@ -0,0 +1,112 @@
from dataclasses import dataclass

import sentry_sdk
from shared.reports.editable import EditableReport, EditableReportFile
from shared.reports.enums import UploadState
from shared.reports.resources import Report
from shared.yaml import UserYaml
from sqlalchemy.orm import Session as DbSession

from database.models.reports import Upload
from services.processing.loading import IntermediateReport
from services.report import delete_uploads_by_sessionid
from services.report.raw_upload_processor import clear_carryforward_sessions


@dataclass
class MergeResult:
    session_mapping: dict[int, int]
    """
    This is a mapping from the input `upload_id` to the output `session_id`
    as it exists in the merged "master Report".
    """

    deleted_sessions: set[int]
    """
    The set of carried-forward `session_id`s that have been removed from the "master Report".
    """


@sentry_sdk.trace
def merge_reports(
    commit_yaml: UserYaml,
    master_report: Report,
    intermediate_reports: list[IntermediateReport],
) -> MergeResult:
    session_mapping: dict[int, int] = dict()
    deleted_sessions: set[int] = set()

    for intermediate_report in intermediate_reports:
        report = intermediate_report.report

        old_sessionid = next(iter(report.sessions))
        new_sessionid = master_report.next_session_number()
        change_sessionid(report, old_sessionid, new_sessionid)
        session_mapping[intermediate_report.upload_id] = new_sessionid

        session = report.sessions[new_sessionid]

        _session_id, session = master_report.add_session(
            session, use_id_from_session=True
        )

        if flags := session.flags:
            session_adjustment = clear_carryforward_sessions(
                master_report, report, flags, commit_yaml
            )
            deleted_sessions.update(session_adjustment.fully_deleted_sessions)

        master_report.merge(report)

    return MergeResult(session_mapping, deleted_sessions)


@sentry_sdk.trace
def update_uploads(db_session: DbSession, merge_result: MergeResult):
    for upload_id, session_id in merge_result.session_mapping.items():
        upload = db_session.query(Upload).filter(Upload.id_ == upload_id).first()
        upload.state_id = UploadState.PROCESSED.db_id
        upload.state = "processed"
        upload.order_number = session_id

    if upload:
        delete_uploads_by_sessionid(upload, list(merge_result.deleted_sessions))
    db_session.flush()


def change_sessionid(report: EditableReport, old_id: int, new_id: int):
    """
    Modifies the `EditableReport`, changing the session with `old_id` to have `new_id` instead.
    This patches up all the references to that session across all files and line records.
    In particular, it changes the id in all the `LineSession`s and `CoverageDatapoint`s,
    and does the equivalent of `calculate_present_sessions`.
    """
    session = report.sessions[new_id] = report.sessions.pop(old_id)
    session.id = new_id

    report_file: EditableReportFile
    for report_file in report._chunks:
        if report_file is None:
            continue

        all_sessions = set()

        for idx, _line in enumerate(report_file._lines):
            if not _line:
                continue

            # this turns the line into an actual `ReportLine`
            line = report_file._lines[idx] = report_file._line(_line)

            for session in line.sessions:
                if session.id == old_id:
                    session.id = new_id
                all_sessions.add(session.id)

            if line.datapoints:
                for point in line.datapoints:
                    if point.sessionid == old_id:
                        point.sessionid = new_id

        report_file._details["present_sessions"] = all_sessions
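Each intermediate report is expected to contain a single session (hence the `next(iter(report.sessions))`), and that session is renumbered to the master report's next free id before merging; without the renumbering, every upload would claim the same low session id and collide. A toy illustration of the idea, with plain dicts standing in for the real report types:

# Toy analogy only: dicts stand in for Report.sessions, strings for sessions.
master_sessions = {0: "carryforward"}
intermediate = [{0: "upload-a"}, {0: "upload-b"}]

for sessions in intermediate:
    new_id = max(master_sessions) + 1           # analogue of next_session_number()
    master_sessions[new_id] = sessions.pop(0)   # analogue of change_sessionid() + merge

assert master_sessions == {0: "carryforward", 1: "upload-a", 2: "upload-b"}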