fix panel match and gnomad col

sigven · Dec 17, 2023 · b60ea96 · b60ea96
1 parent 361276b
commit b60ea96
Show file tree

Hide file tree

Showing 6 changed files with 17 additions and 11 deletions.
diff --git a/pcgr/cpsr.py b/pcgr/cpsr.py
@@ -44,9 +44,9 @@ def get_args():
     optional_other.add_argument('--version', action='version', version=str(utils.get_cpsr_version()))
     optional_other.add_argument('--no_reporting',action="store_true",help="Run functional variant annotation on VCF through VEP/vcfanno, omit classification/report generation (STEP 4), default: %(default)s")
     optional_other.add_argument('--retained_info_tags', dest ='retained_info_tags', default='None', help='Comma-separated string of VCF INFO tags from query VCF that should be kept in CPSR output TSV')
-    #optional_other.add_argument('--report_theme',choices = ['default','cerulean','journal','flatly','readable','spacelab','united','cosmo','lumen','paper','sandstone','simplex','yeti'], default = 'default', help='Visual report theme (rmarkdown),  default: %(default)s' )
-    #optional_other.add_argument('--report_nonfloating_toc', action='store_true', help='Do not float the table of contents (TOC) in output HTML report, default: %(default)s')
-    #optional_other.add_argument('--report_table_display', choices = ['full','light'], default='light', help="Set the level of detail/comprehensiveness in interactive datables of HTML report, very comprehensive (option 'full') or slim/focused ('light'), default: %(default)s")
+    optional_other.add_argument('--report_theme',choices = ['default','cerulean','journal','flatly','readable','spacelab','united','cosmo','lumen','paper','sandstone','simplex','yeti'], default = 'default', help='Visual report theme (rmarkdown),  default: %(default)s' )
+    optional_other.add_argument('--report_nonfloating_toc', action='store_true', help='Do not float the table of contents (TOC) in output HTML report, default: %(default)s')
+    optional_other.add_argument('--report_table_display', choices = ['full','light'], default='light', help="Set the level of detail/comprehensiveness in interactive datables of HTML report, very comprehensive (option 'full') or slim/focused ('light'), default: %(default)s")
     optional_other.add_argument('--ignore_noncoding', action='store_true',dest='ignore_noncoding',default=False,help='Ignore non-coding (i.e. non protein-altering) variants in report, default: %(default)s')
     optional_other.add_argument("--debug", action="store_true", help="Print full commands to log")
     optional_other.add_argument("--pcgrr_conda", default="pcgrr", help="pcgrr conda env name (default: %(default)s)")
@@ -282,7 +282,8 @@ def run_cpsr(conf_options, cpsr_paths):
               output_pass_vcf2tsv_gz, pcgr_db_dir = cpsr_paths["db_dir"], logger = logger)
         variant_set = variant.clean_annotations(variant_set, yaml_data, germline = True, logger = logger)        
         variant_set.to_csv(output_pass_tsv_gz, sep="\t", compression="gzip", index=False)
-        utils.remove(output_pass_vcf2tsv_gz)
+        if not debug:
+            utils.remove(output_pass_vcf2tsv_gz)
 
         logger.info('Finished cpsr-summarise')
 

diff --git a/pcgr/main.py b/pcgr/main.py
@@ -452,7 +452,8 @@ def run_pcgr(pcgr_paths, conf_options):
         variant_set = variant.set_allelic_support(variant_set, allelic_support_tags = yaml_data["conf"]['somatic_snv']['allelic_support'])
         variant_set = variant.clean_annotations(variant_set, yaml_data, germline = False, logger = logger)        
         variant_set.to_csv(output_pass_tsv_gz, sep="\t", compression="gzip", index=False)
-        utils.remove(output_pass_vcf2tsv_gz)
+        if not debug:
+            utils.remove(output_pass_vcf2tsv_gz)
 
         if yaml_data["conf"]['assay_properties']['type'] == 'WGS' or yaml_data["conf"]['assay_properties']['type'] == 'WES':
             # check that output file exist

diff --git a/pcgr/variant.py b/pcgr/variant.py
@@ -81,6 +81,7 @@ def append_annotations(vcf2tsv_gz_fname: str, pcgr_db_dir: str, logger):
             ## check number of variants with Entrez gene IDs
             num_recs_with_entrez_hits = vcf2tsv_df["ENTREZGENE"].notna().sum()
 
+            #print(str(num_recs_with_entrez_hits))
             ## merge variant set with ClinVar trait and variant origin annotations
             if num_recs_with_clinvar_hits > 0:
                 if os.path.exists(clinvar_tsv_fname):
@@ -133,7 +134,6 @@ def append_annotations(vcf2tsv_gz_fname: str, pcgr_db_dir: str, logger):
                         usecols=["entrezgene","name"])
                     gene_xref_df = gene_xref_df[gene_xref_df['entrezgene'].notnull()].drop_duplicates()
                     gene_xref_df["entrezgene"] = gene_xref_df["entrezgene"].astype("int64").astype("string")
-                    #print(gene_xref_df.head)
                     gene_xref_df.rename(columns = {'entrezgene':'ENTREZGENE', 'name':'GENENAME'}, inplace = True)
                     vcf2tsv_df = vcf2tsv_df.merge(gene_xref_df, left_on=["ENTREZGENE"], right_on=["ENTREZGENE"], how="left")
                     vcf2tsv_df["ENTREZGENE"] = vcf2tsv_df['ENTREZGENE'].str.replace("\\.[0-9]{1,}$", "", regex = True)
@@ -253,8 +253,12 @@ def clean_annotations(variant_set: pd.DataFrame, yaml_data: dict, germline: bool
         variant_set['EFFECT_PREDICTIONS'] = variant_set['EFFECT_PREDICTIONS'].str.replace("\\.&|\\.$", "NA&", regex = True)
         variant_set['EFFECT_PREDICTIONS'] = variant_set['EFFECT_PREDICTIONS'].str.replace("&$", "", regex = True)
         variant_set['EFFECT_PREDICTIONS'] = variant_set['EFFECT_PREDICTIONS'].str.replace("&", ", ", regex = True)
-        variant_set.loc[variant_set['CLINVAR_CONFLICTED'] == 1, "CLINVAR_CONFLICTED"] = True
-        variant_set.loc[variant_set['CLINVAR_CONFLICTED'] != 1, "CLINVAR_CONFLICTED"] = False
+        variant_set['clinvar_conflicted_bool'] = True
+        variant_set.loc[variant_set['CLINVAR_CONFLICTED'] == 1, "clinvar_conflicted_bool"] = True
+        variant_set.loc[variant_set['CLINVAR_CONFLICTED'] != 1, "clinvar_conflicted_bool"] = False
+        variant_set.drop('CLINVAR_CONFLICTED', inplace=True, axis=1)        
+        variant_set.rename(columns = {'clinvar_conflicted_bool':'CLINVAR_CONFLICTED'}, inplace = True)
+
 
     if not {'VCF_SAMPLE_ID'}.issubset(variant_set.columns):
         variant_set['VCF_SAMPLE_ID'] = str(yaml_data['sample_id'])

diff --git a/pcgrr/data-raw/data-raw.R b/pcgrr/data-raw/data-raw.R
@@ -64,11 +64,11 @@ usethis::use_data(color_palette, overwrite = T)
 #-----evidence types---------#
 evidence_types <- c("predictive","prognostic","diagnostic",
                     "oncogenic","predisposing","functional")
-usethis::use_data(evidence_types)
+usethis::use_data(evidence_types, overwrite = T)
 
 #-----evidence levels---------#
 evidence_levels <- c("any","A_B","C_D_E")
-usethis::use_data(evidence_levels)
+usethis::use_data(evidence_levels, overwrite = T)
 
 
 #-----input column names/types-----#

diff --git a/pcgrr/data/data_coltype_defs.rda b/pcgrr/data/data_coltype_defs.rda
diff --git a/scripts/cpsr_validate_input.py b/scripts/cpsr_validate_input.py
@@ -230,7 +230,7 @@ def simplify_vcf(input_vcf, validated_vcf, vcf, custom_bed, pcgr_directory, geno
 
             ## awk command to ignore secondary finding records while keeping records that belong to target (and that can potentially
             ## be part of the secondary findings list)
-            awk_command = "awk 'BEGIN{FS=\"\\t\"}{if($4 !~ /ACMG_SF/ || ($4 ~ /ACMG_SF/ && $4 ~ /" + str(ge_panel_identifier) + "/))print;}'"
+            awk_command = "awk 'BEGIN{FS=\"\\t\"}{if($4 !~ /ACMG_SF/ || ($4 ~ /ACMG_SF/ && $4 ~ /" + str(ge_panel_identifier) + ":/))print;}'"
             if gwas_findings == 0 and secondary_findings == 1:
                 check_subprocess(logger, f'bgzip -dc {target_bed_gz} | egrep -v "(\|tag\|)" >> {virtual_panels_tmp_bed}', debug)
             elif gwas_findings == 0 and secondary_findings == 0: