From cc24cb8f778a75611849c65a344875df399b353f Mon Sep 17 00:00:00 2001 From: Oliver Stolpe Date: Fri, 20 Oct 2023 15:52:09 +0200 Subject: [PATCH] Split orphaned user annotations into removed/salvable/lost --- .../commands/generate_result_set.py | 20 +- variants/utils.py | 310 +++++++++++------- 2 files changed, 204 insertions(+), 126 deletions(-) diff --git a/maintenance/management/commands/generate_result_set.py b/maintenance/management/commands/generate_result_set.py index b9697e834..9ee56e0d2 100644 --- a/maintenance/management/commands/generate_result_set.py +++ b/maintenance/management/commands/generate_result_set.py @@ -121,42 +121,42 @@ def handle(self, *args, **options): # from variants.views import UUIDEncoder # json.dump(duplicates, f, indent=1, cls=UUIDEncoder) with open("orphaned_sv_flags.tsv", "w") as f: - f.write("case_uuid\tproject\tcase_name\tregion\tis_missing\tjson\n") + f.write("case_uuid\tproject\tcase_name\tregion\tlost\tjson\n") for line in tsv_lines["svs"]["flags"]: f.write( - "{case_uuid}\t{project}\t{case_name}\t{chromosome}:{start}-{end}\t{is_missing}\t{json}\n".format( + "{case_uuid}\t{project}\t{case_name}\t{chromosome}:{start}-{end}\t{lost}\t{json}\n".format( **line ) ) with open("orphaned_sv_comments.tsv", "w") as f: - f.write("case_uuid\tproject\tcase_name\tregion\tis_missing\tjson\n") + f.write("case_uuid\tproject\tcase_name\tregion\tlost\tjson\n") for line in tsv_lines["svs"]["comments"]: f.write( - "{case_uuid}\t{project}\t{case_name}\t{chromosome}:{start}-{end}\t{is_missing}\t{json}\n".format( + "{case_uuid}\t{project}\t{case_name}\t{chromosome}:{start}-{end}\t{lost}\t{json}\n".format( **line ) ) with open("orphaned_sm_flags.tsv", "w") as f: - f.write("case_uuid\tproject\tcase_name\tregion\tis_missing\tjson\n") + f.write("case_uuid\tproject\tcase_name\tregion\tlost\tjson\n") for line in tsv_lines["sms"]["flags"]: f.write( - "{case_uuid}\t{project}\t{case_name}\t{chromosome}:{start}-{end}\t{is_missing}\t{json}\n".format( +
"{case_uuid}\t{project}\t{case_name}\t{chromosome}:{start}-{end}\t{lost}\t{json}\n".format( **line ) ) with open("orphaned_sm_comments.tsv", "w") as f: - f.write("case_uuid\tproject\tcase_name\tregion\tis_missing\tjson\n") + f.write("case_uuid\tproject\tcase_name\tregion\tlost\tjson\n") for line in tsv_lines["sms"]["comments"]: f.write( - "{case_uuid}\t{project}\t{case_name}\t{chromosome}:{start}-{end}\t{is_missing}\t{json}\n".format( + "{case_uuid}\t{project}\t{case_name}\t{chromosome}:{start}-{end}\t{lost}\t{json}\n".format( **line ) ) with open("orphaned_sm_acmg_ratings.tsv", "w") as f: - f.write("case_uuid\tproject\tcase_name\tregion\tis_missing\tjson\n") + f.write("case_uuid\tproject\tcase_name\tregion\tlost\tjson\n") for line in tsv_lines["sms"]["acmg_ratings"]: f.write( - "{case_uuid}\t{project}\t{case_name}\t{chromosome}:{start}-{end}\t{is_missing}\t{json}\n".format( + "{case_uuid}\t{project}\t{case_name}\t{chromosome}:{start}-{end}\t{lost}\t{json}\n".format( **line ) ) diff --git a/variants/utils.py b/variants/utils.py index 39c84d150..730db9285 100644 --- a/variants/utils.py +++ b/variants/utils.py @@ -43,7 +43,15 @@ def create_queryresultset(case_uuid=None, project_uuid=None, all=False): "comments": 0, "flags": 0, }, # user annotations added to SvQueryResultSet's - "orphaned": { + "removed": { + "comments": 0, + "flags": 0, + }, # user annotations without corresponding query result row + "salvable": { + "comments": 0, + "flags": 0, + }, # user annotations without corresponding query result row + "lost": { "comments": 0, "flags": 0, }, # user annotations without corresponding query result row @@ -54,14 +62,24 @@ def create_queryresultset(case_uuid=None, project_uuid=None, all=False): "flags": 0, "acmg_ratings": 0, }, - "orphaned": { + "removed": { + "comments": 0, + "flags": 0, + "acmg_ratings": 0, + }, + "salvable": { + "comments": 0, + "flags": 0, + "acmg_ratings": 0, + }, + "lost": { "comments": 0, "flags": 0, "acmg_ratings": 0, }, }, } - orphans = {} 
+ salvable = {} duplicates = {} tsv_lines = { "svs": { @@ -114,21 +132,17 @@ def _perform_create(_case): return _sm_result_set, _sv_result_set def _perform_fill(sm_result_set, sv_result_set, tsv_lines): - sm_count, sm_orphans, sm_duplicates, sm_tsv_lines = fill_sm_queryresultset(sm_result_set) - sv_count, sv_orphans, sv_duplicates, sv_tsv_lines = fill_sv_queryresultset(sv_result_set) - count["svs"]["added"]["flags"] += sv_count["added"]["flags"] - count["svs"]["added"]["comments"] += sv_count["added"]["comments"] - count["svs"]["orphaned"]["flags"] += sv_count["orphaned"]["flags"] - count["svs"]["orphaned"]["comments"] += sv_count["orphaned"]["comments"] - count["sms"]["added"]["flags"] += sm_count["added"]["flags"] - count["sms"]["added"]["comments"] += sm_count["added"]["comments"] - count["sms"]["added"]["acmg_ratings"] += sm_count["added"]["acmg_ratings"] - count["sms"]["orphaned"]["flags"] += sm_count["orphaned"]["flags"] - count["sms"]["orphaned"]["comments"] += sm_count["orphaned"]["comments"] - count["sms"]["orphaned"]["acmg_ratings"] += sm_count["orphaned"]["acmg_ratings"] - orphans[str(sm_result_set.case.sodar_uuid)] = { - "sms": list(set(sm_orphans)), - "svs": list(set(sv_orphans)), + sm_count, sm_salvable, sm_duplicates, sm_tsv_lines = fill_sm_queryresultset(sm_result_set) + sv_count, sv_salvable, sv_duplicates, sv_tsv_lines = fill_sv_queryresultset(sv_result_set) + for i in ("added", "removed", "salvable", "lost"): + count["svs"][i]["flags"] += sv_count[i]["flags"] + count["svs"][i]["comments"] += sv_count[i]["comments"] + count["sms"][i]["flags"] += sm_count[i]["flags"] + count["sms"][i]["comments"] += sm_count[i]["comments"] + count["sms"][i]["acmg_ratings"] += sm_count[i]["acmg_ratings"] + salvable[str(sm_result_set.case.sodar_uuid)] = { + "sms": list(set(sm_salvable)), + "svs": list(set(sv_salvable)), } duplicates[str(sm_result_set.case.sodar_uuid)] = { "sms": sm_duplicates, @@ -156,7 +170,7 @@ def _perform_fill(sm_result_set, sv_result_set, 
tsv_lines): sm_result_set, sv_result_set = _perform_create(_case) _perform_fill(sm_result_set, sv_result_set, tsv_lines) - return count, orphans, duplicates, tsv_lines + return count, salvable, duplicates, tsv_lines def fill_sm_queryresultset(result_set): @@ -168,13 +182,23 @@ def fill_sm_queryresultset(result_set): "comments": 0, "acmg_ratings": 0, }, - "orphaned": { + "removed": { + "flags": 0, + "comments": 0, + "acmg_ratings": 0, + }, + "salvable": { + "flags": 0, + "comments": 0, + "acmg_ratings": 0, + }, + "lost": { "flags": 0, "comments": 0, "acmg_ratings": 0, }, } - orphans = [] + salvable = [] duplicates = [] tsv_lines = { "flags": [], @@ -198,60 +222,84 @@ def _perform_create(obj): "reference": obj.reference, "alternative": obj.alternative, } - is_missing = False + lost = True # Sanity check - if not SmallVariant.objects.filter(case_id=case.id, **coords).exists(): - is_missing = True + if SmallVariant.objects.filter(case_id=case.id, **coords).exists(): + lost = False result_rows = result_set.smallvariantqueryresultrow_set.filter(**coords) - if result_rows.count() == 0: - queries = case.small_variant_queries.filter(smallvariantqueryresultset__isnull=False) - for query in queries: - query_result_set = query.smallvariantqueryresultset_set.first() - result_row = query_result_set.smallvariantqueryresultrow_set.filter(**coords) - if not result_row.exists(): - continue - result_row = result_row.first() - result_row.pk = None - result_row.sodar_uuid = uuid.uuid4() - result_row.smallvariantqueryresultset = result_set - result_row.save() - count["added"][obj_type] += 1 - break - else: - # should exist as it was annotated. 
- count["orphaned"][obj_type] += 1 - orphans.append("{chromosome}:{start}-{end}".format(**coords)) - from variants.views import UUIDEncoder - - tsv_lines[obj_type].append( - { - "case_uuid": str(case.sodar_uuid), - "case_name": case.name, - "project": case.project.full_title, - "chromosome": obj.chromosome, - "start": obj.start, - "end": obj.end, - "is_missing": is_missing, - "json": json.dumps(model_to_dict(obj, exclude=("id",)), cls=UUIDEncoder), - } + if lost: + count["lost"][obj_type] += 1 + from variants.views import UUIDEncoder + + tsv_lines[obj_type].append( + { + "case_uuid": str(case.sodar_uuid), + "case_name": case.name, + "project": case.project.full_title, + "chromosome": obj.chromosome, + "start": obj.start, + "end": obj.end, + "lost": lost, + "json": json.dumps(model_to_dict(obj, exclude=("id",)), cls=UUIDEncoder), + } + ) + for result_row in result_rows: + count["removed"][obj_type] += 1 + result_row.delete() + else: + if result_rows.count() == 0: + queries = case.small_variant_queries.filter( + smallvariantqueryresultset__isnull=False ) - elif result_rows.count() > 1: - duplicates.append(result_rows) + for query in queries: + query_result_set = query.smallvariantqueryresultset_set.first() + result_row = query_result_set.smallvariantqueryresultrow_set.filter(**coords) + if not result_row.exists(): + continue + result_row = result_row.first() + result_row.pk = None + result_row.sodar_uuid = uuid.uuid4() + result_row.smallvariantqueryresultset = result_set + result_row.save() + count["added"][obj_type] += 1 + break + else: + # should exist as it was annotated. 
+ count["salvable"][obj_type] += 1 + salvable.append("{chromosome}:{start}-{end}".format(**coords)) + from variants.views import UUIDEncoder + + tsv_lines[obj_type].append( + { + "case_uuid": str(case.sodar_uuid), + "case_name": case.name, + "project": case.project.full_title, + "chromosome": obj.chromosome, + "start": obj.start, + "end": obj.end, + "lost": lost, + "json": json.dumps( + model_to_dict(obj, exclude=("id",)), cls=UUIDEncoder + ), + } + ) + elif result_rows.count() > 1: + duplicates.append(result_rows) result_set.result_row_count = result_set.smallvariantqueryresultrow_set.count() result_set.save() - small_variant_flags = SmallVariantFlags.objects.filter(case=case) - small_variant_comments = SmallVariantComment.objects.filter(case=case) + sm_flags = SmallVariantFlags.objects.filter(case=case) + sm_comments = SmallVariantComment.objects.filter(case=case) acmg_rating = AcmgCriteriaRating.objects.filter(case=case) - for obj in chain(small_variant_flags, small_variant_comments, acmg_rating): + for obj in chain(sm_flags, sm_comments, acmg_rating): _perform_create(obj) - return count, orphans, duplicates, tsv_lines + return count, salvable, duplicates, tsv_lines def fill_sv_queryresultset(result_set): @@ -263,12 +311,20 @@ def fill_sv_queryresultset(result_set): "flags": 0, "comments": 0, }, - "orphaned": { + "removed": { + "flags": 0, + "comments": 0, + }, + "salvable": { + "flags": 0, + "comments": 0, + }, + "lost": { "flags": 0, "comments": 0, }, } - orphans = [] + salvable = [] duplicates = [] tsv_lines = { "flags": [], @@ -283,7 +339,7 @@ def _perform_create(obj): obj_type = "flags" elif isinstance(obj, StructuralVariantComment): obj_type = "comments" - is_missing = True + lost = True for sv_obj in StructuralVariant.objects.filter(case_id=case.id, chromosome=obj.chromosome): if ( @@ -292,7 +348,7 @@ def _perform_create(obj): ) >= 0.8 ): - is_missing = False + lost = False break result_rows = [ @@ -304,58 +360,80 @@ def _perform_create(obj): >= 0.8 ] - 
if not result_rows: - queries = case.svquery_set.filter(svqueryresultset__isnull=False) - for query in queries: - query_result_set = query.svqueryresultset_set.first() - result_row = [ - row_obj - for row_obj in query_result_set.svqueryresultrow_set.filter( - chromosome=obj.chromosome - ) - if reciprocal_overlap( - sv_type=obj.sv_type, - qry_start=row_obj.start, - qry_end=row_obj.end, - record=obj, + if lost: + count["lost"][obj_type] += 1 + from variants.views import UUIDEncoder + + tsv_lines[obj_type].append( + { + "case_uuid": str(case.sodar_uuid), + "case_name": case.name, + "project": case.project.full_title, + "chromosome": obj.chromosome, + "start": obj.start, + "end": obj.end, + "lost": lost, + "json": json.dumps(model_to_dict(obj, exclude=("id",)), cls=UUIDEncoder), + } + ) + for result_row in result_rows: + count["removed"][obj_type] += 1 + result_row.delete() + else: + if not result_rows: + queries = case.svquery_set.filter(svqueryresultset__isnull=False) + for query in queries: + query_result_set = query.svqueryresultset_set.first() + result_row = [ + row_obj + for row_obj in query_result_set.svqueryresultrow_set.filter( + chromosome=obj.chromosome + ) + if reciprocal_overlap( + sv_type=obj.sv_type, + qry_start=row_obj.start, + qry_end=row_obj.end, + record=obj, + ) + >= 0.8 + ] + if not result_row: + continue + result_row = result_row[0] + result_row.pk = None + result_row.sodar_uuid = uuid.uuid4() + result_row.svqueryresultset = result_set + result_row.save() + count["added"][obj_type] += 1 + break + else: + # should exist as it was annotated. 
+ count["salvable"][obj_type] += 1 + salvable.append( + "{chromosome}:{start}-{end}".format( + chromosome=obj.chromosome, + start=obj.start, + end=obj.end, + ) ) - >= 0.8 - ] - if not result_row: - continue - result_row = result_row[0] - result_row.pk = None - result_row.sodar_uuid = uuid.uuid4() - result_row.svqueryresultset = result_set - result_row.save() - count["added"][obj_type] += 1 - break - else: - # should exist as it was annotated. - count["orphaned"][obj_type] += 1 - orphans.append( - "{chromosome}:{start}-{end}".format( - chromosome=obj.chromosome, - start=obj.start, - end=obj.end, + from variants.views import UUIDEncoder + + tsv_lines[obj_type].append( + { + "case_uuid": str(case.sodar_uuid), + "case_name": case.name, + "project": case.project.full_title, + "chromosome": obj.chromosome, + "start": obj.start, + "end": obj.end, + "lost": lost, + "json": json.dumps( + model_to_dict(obj, exclude=("id",)), cls=UUIDEncoder + ), + } ) - ) - from variants.views import UUIDEncoder - - tsv_lines[obj_type].append( - { - "case_uuid": str(case.sodar_uuid), - "case_name": case.name, - "project": case.project.full_title, - "chromosome": obj.chromosome, - "start": obj.start, - "end": obj.end, - "is_missing": is_missing, - "json": json.dumps(model_to_dict(obj, exclude=("id",)), cls=UUIDEncoder), - } - ) - elif len(result_rows) > 1: - duplicates.append(result_rows) + elif len(result_rows) > 1: + duplicates.append(result_rows) result_set.result_row_count = result_set.svqueryresultrow_set.count() result_set.save() @@ -366,4 +444,4 @@ def _perform_create(obj): for obj in chain(sv_flags, sv_comments): _perform_create(obj) - return count, orphans, duplicates, tsv_lines + return count, salvable, duplicates, tsv_lines