diff --git a/cvr/src/main/java/org/cbioportal/cmo/pipelines/cvr/CVRUtilities.java b/cvr/src/main/java/org/cbioportal/cmo/pipelines/cvr/CVRUtilities.java index e211a0007..8f0cf2d56 100644 --- a/cvr/src/main/java/org/cbioportal/cmo/pipelines/cvr/CVRUtilities.java +++ b/cvr/src/main/java/org/cbioportal/cmo/pipelines/cvr/CVRUtilities.java @@ -177,12 +177,12 @@ public List processFileComments(File dataFile) throws FileNotFoundExcept return comments; } - public boolean isDuplicateRecord(MutationRecord snp, List annotatedRecords) { - if (annotatedRecords == null || annotatedRecords.isEmpty()) { + public boolean isDuplicateRecord(MutationRecord snp, List mutationRecords) { + if (mutationRecords == null || mutationRecords.isEmpty()) { return false; } - for (AnnotatedRecord record : annotatedRecords) { + for (MutationRecord record : mutationRecords) { if (record.getCHROMOSOME().equals(snp.getCHROMOSOME()) && record.getSTART_POSITION().equals(snp.getSTART_POSITION()) && record.getEND_POSITION().equals(snp.getEND_POSITION()) && diff --git a/cvr/src/main/java/org/cbioportal/cmo/pipelines/cvr/mutation/CVRMutationDataReader.java b/cvr/src/main/java/org/cbioportal/cmo/pipelines/cvr/mutation/CVRMutationDataReader.java index bef3efeb7..3242aab00 100644 --- a/cvr/src/main/java/org/cbioportal/cmo/pipelines/cvr/mutation/CVRMutationDataReader.java +++ b/cvr/src/main/java/org/cbioportal/cmo/pipelines/cvr/mutation/CVRMutationDataReader.java @@ -80,7 +80,7 @@ public class CVRMutationDataReader implements ItemStreamReader private Annotator annotator; private List mutationRecords = new ArrayList<>(); - private Map> mutationMap = new HashMap<>(); + private Map> mutationMap = new HashMap<>(); private File mutationFile; Set header = new LinkedHashSet<>(); @@ -132,7 +132,9 @@ private void loadMutationRecordsFromJson(CVRData cvrData) { String somaticStatus = result.getMetaData().getSomaticStatus() != null ? result.getMetaData().getSomaticStatus() : "N/A"; int countSignedOutSnps = result.getAllSignedoutCvrSnps().size(); for (CVRSnp snp : result.getAllSignedoutCvrSnps()) { - recordsToAnnotate.add(cvrUtilities.buildCVRMutationRecord(snp, sampleId, somaticStatus)); + MutationRecord to_add = cvrUtilities.buildCVRMutationRecord(snp, sampleId, somaticStatus); + recordsToAnnotate.add(to_add); + addRecordToMap(to_add); } cvrSampleListUtil.updateSignedoutSampleSnpCounts(sampleId, countSignedOutSnps); if (!stopZeroVariantWarnings && countSignedOutSnps == 0) { @@ -176,6 +178,7 @@ public void handleLine(String line) { } cvrSampleListUtil.updateSignedoutSampleSnpCounts(to_add.getTUMOR_SAMPLE_BARCODE(), 1); recordsToAnnotate.add(to_add); + addRecordToMap(to_add); } reader.close(); log.info("Loaded " + String.valueOf(recordsToAnnotate.size()) + " records from MAF"); @@ -192,7 +195,6 @@ private void annotateRecordsWithPOST(List records, boolean reann for (AnnotatedRecord ar : annotatedRecords) { logAnnotationProgress(++annotatedVariantsCount, totalVariantsToAnnotateCount, postIntervalSize); mutationRecords.add(ar); - mutationMap.getOrDefault(ar.getTUMOR_SAMPLE_BARCODE(), new ArrayList()).add(ar); additionalPropertyKeys.addAll(ar.getAdditionalProperties().keySet()); header.addAll(ar.getHeaderWithAdditionalFields()); } @@ -231,4 +233,16 @@ public AnnotatedRecord read() throws Exception { } return null; } + + private void addRecordToMap(MutationRecord record) { + String sampleId = record.getTUMOR_SAMPLE_BARCODE(); + List recordList = mutationMap.get(sampleId); + if (recordList == null) { + recordList = new ArrayList(); + recordList.add(record); + mutationMap.put(sampleId, recordList); + } else { + recordList.add(record); + } + } } diff --git a/cvr/src/main/java/org/cbioportal/cmo/pipelines/cvr/mutation/CVRNonSignedoutMutationDataReader.java b/cvr/src/main/java/org/cbioportal/cmo/pipelines/cvr/mutation/CVRNonSignedoutMutationDataReader.java index 7cfb68d59..7f7722406 100644 --- a/cvr/src/main/java/org/cbioportal/cmo/pipelines/cvr/mutation/CVRNonSignedoutMutationDataReader.java +++ b/cvr/src/main/java/org/cbioportal/cmo/pipelines/cvr/mutation/CVRNonSignedoutMutationDataReader.java @@ -77,7 +77,7 @@ public class CVRNonSignedoutMutationDataReader implements ItemStreamReader mutationRecords = new ArrayList<>(); - private Map> mutationMap = new HashMap<>(); + private Map> mutationMap = new HashMap<>(); private Set additionalPropertyKeys = new LinkedHashSet<>(); Set header = new LinkedHashSet<>(); private AnnotationSummaryStatistics summaryStatistics; @@ -128,7 +128,9 @@ private void loadMutationRecordsFromJson(CVRData cvrData) { int countNonSignedoutSampleSnps = result.getAllNonSignedoutCvrSnps().size(); String somaticStatus = result.getMetaData().getSomaticStatus() != null ? result.getMetaData().getSomaticStatus() : "N/A"; for (CVRSnp snp : result.getAllNonSignedoutCvrSnps()) { - recordsToAnnotate.add(cvrUtilities.buildCVRMutationRecord(snp, sampleId, somaticStatus)); + MutationRecord to_add = cvrUtilities.buildCVRMutationRecord(snp, sampleId, somaticStatus); + recordsToAnnotate.add(to_add); + addRecordToMap(to_add); } cvrSampleListUtil.updateNonSignedoutSampleSnpCount(sampleId, countNonSignedoutSampleSnps); } @@ -169,6 +171,7 @@ public void handleLine(String line) { } cvrSampleListUtil.updateNonSignedoutSampleSnpCount(to_add.getTUMOR_SAMPLE_BARCODE(), 1); recordsToAnnotate.add(to_add); + addRecordToMap(to_add); } reader.close(); log.info("Loaded " + String.valueOf(recordsToAnnotate.size()) + " records from MAF"); @@ -185,7 +188,6 @@ private void annotateRecordsWithPOST(List records, boolean reann for (AnnotatedRecord ar : annotatedRecords) { logAnnotationProgress(++annotatedVariantsCount, totalVariantsToAnnotateCount, postIntervalSize); mutationRecords.add(ar); - mutationMap.getOrDefault(ar.getTUMOR_SAMPLE_BARCODE(), new ArrayList()).add(ar); additionalPropertyKeys.addAll(ar.getAdditionalProperties().keySet()); header.addAll(ar.getHeaderWithAdditionalFields()); } @@ -224,4 +226,17 @@ public AnnotatedRecord read() throws Exception { } return null; } + + private void addRecordToMap(MutationRecord record) { + String sampleId = record.getTUMOR_SAMPLE_BARCODE(); + List recordList = mutationMap.get(sampleId); + if (recordList == null) { + recordList = new ArrayList(); + recordList.add(record); + mutationMap.put(sampleId, recordList); + } else { + recordList.add(record); + } + } + } diff --git a/cvr/src/main/java/org/cbioportal/cmo/pipelines/cvr/mutation/GMLMutationDataReader.java b/cvr/src/main/java/org/cbioportal/cmo/pipelines/cvr/mutation/GMLMutationDataReader.java index c7a80b4ac..8dbf27315 100644 --- a/cvr/src/main/java/org/cbioportal/cmo/pipelines/cvr/mutation/GMLMutationDataReader.java +++ b/cvr/src/main/java/org/cbioportal/cmo/pipelines/cvr/mutation/GMLMutationDataReader.java @@ -76,8 +76,8 @@ public class GMLMutationDataReader implements ItemStreamReader @Autowired private Annotator annotator; - private List mutationRecords = new ArrayList(); - private Map> mutationMap = new HashMap<>(); + private List mutationRecords = new ArrayList<>(); + private Map> mutationMap = new HashMap<>(); private File mutationFile; private Set additionalPropertyKeys = new LinkedHashSet<>(); private Set header = new LinkedHashSet<>(); @@ -132,8 +132,10 @@ private void loadMutationRecordsFromJson(GMLData gmlData) { if (samples != null && !snps.isEmpty()) { for (GMLSnp snp : snps) { for (String sampleId : samples) { - recordsToAnnotate.add(cvrUtilities.buildGMLMutationRecord(snp, sampleId)); + MutationRecord to_add = cvrUtilities.buildGMLMutationRecord(snp, sampleId); + recordsToAnnotate.add(to_add); germlineSamples.add(sampleId); + addRecordToMap(to_add); } } } @@ -174,6 +176,7 @@ public void handleLine(String line) { continue; } recordsToAnnotate.add(to_add); + addRecordToMap(to_add); } reader.close(); log.info("Loaded " + String.valueOf(recordsToAnnotate.size()) + " records from MAF"); @@ -190,7 +193,6 @@ private List annotateRecordsWithPOST(List recor for (AnnotatedRecord ar : annotatedRecords) { logAnnotationProgress(++annotatedVariantsCount, totalVariantsToAnnotateCount, postIntervalSize); mutationRecords.add(ar); - mutationMap.getOrDefault(ar.getTUMOR_SAMPLE_BARCODE(), new ArrayList()).add(ar); additionalPropertyKeys.addAll(ar.getAdditionalProperties().keySet()); header.addAll(ar.getHeaderWithAdditionalFields()); } @@ -231,4 +233,15 @@ public AnnotatedRecord read() throws Exception { return null; } + private void addRecordToMap(MutationRecord record) { + String sampleId = record.getTUMOR_SAMPLE_BARCODE(); + List recordList = mutationMap.get(sampleId); + if (recordList == null) { + recordList = new ArrayList(); + recordList.add(record); + mutationMap.put(sampleId, recordList); + } else { + recordList.add(record); + } + } } diff --git a/import-scripts/remove-duplicate-maf-variants.py b/import-scripts/remove-duplicate-maf-variants.py new file mode 100644 index 000000000..a256d00ab --- /dev/null +++ b/import-scripts/remove-duplicate-maf-variants.py @@ -0,0 +1,86 @@ +#!/usr/bin/python +import sys +import os +import optparse + +# Script to remove duplicate maf records based on the 8 key columns. +# Calculates VAF for each record and picks the record with high VAF +# Formula for VAF = t_alt_count / (t_ref_count + t_alt_count) + +ERROR_FILE = sys.stderr +OUTPUT_FILE = sys.stdout + +KEY_COLUMNS_INDEX = [] +KEY_COLUMNS = ['Entrez_Gene_Id','Chromosome','Start_Position','End_Position','Variant_Classification','Tumor_Seq_Allele2','Tumor_Sample_Barcode','HGVSp_Short'] +MAF_DATA = {} + +def remove_duplicate_variants(maf_filename, comments, header, t_refc_index, t_altc_index): + outfile = [] + outfile.append(comments) + outfile.append(header) + for key in MAF_DATA: + if len(MAF_DATA[key]) > 1: + vaf_ind = 0 + vaf_value = 0 + for val in MAF_DATA[key]: + #calculate VAF for each duplicate record. + columns = val.rstrip('\n').split('\t') + try: + VAF = int(columns[t_altc_index])/(int(columns[t_altc_index])+int(columns[t_refc_index])) + if VAF > vaf_value: + vaf_value = VAF + vaf_ind = MAF_DATA[key].index(val) + outfile.append(MAF_DATA[key][vaf_ind]) + except: + print >> ERROR_FILE, 'ERROR: VAF cannot be calculated for the variant : ' + key + print >> ERROR_FILE, 'The t_ref_count is: '+ columns[t_refc_index]+ ' and t_alt_count is: '+ columns[t_altc_index] + outfile.append(val) + else: + outfile.append(MAF_DATA[key][0]) + + out_filename = maf_filename.split('.')[0]+'_merged.txt' + datafile = open(out_filename, 'w') + for line in outfile: + datafile.write(line) + datafile.close() + print >> OUTPUT_FILE, 'MAF file with duplicate variants removed is written to: ' + out_filename +'\n' + + +def main(): + # get command line arguments + parser = optparse.OptionParser() + parser.add_option('-i', '--input-maf-file', action = 'store', dest = 'maf_file') + + (options, args) = parser.parse_args() + maf_filename = options.maf_file + + comments = "" + header = "" + + with open(maf_filename,'r') as maf_file: + for line in maf_file: + if line.startswith('#'): + comments += line + elif line.startswith('Hugo_Symbol'): + header += line + header_cols = line.rstrip('\n').split('\t') + #get the positions of the 8 key maf columns + for value in KEY_COLUMNS: + KEY_COLUMNS_INDEX.append(header_cols.index(value)) + t_refc_index = header_cols.index('t_ref_count') + t_altc_index = header_cols.index('t_alt_count') + else: + reference_key = "" + data = line.rstrip('\n').split('\t') + for index in KEY_COLUMNS_INDEX: + reference_key += data[index]+'\t' + reference_key = reference_key.rstrip('\t') + if reference_key not in MAF_DATA: + MAF_DATA[reference_key] = [line] + else: + MAF_DATA[reference_key].append(line) + + remove_duplicate_variants(maf_filename, comments, header, t_refc_index, t_altc_index) + +if __name__ == '__main__': + main() diff --git a/import-scripts/update-az-mskimpact.sh b/import-scripts/update-az-mskimpact.sh index eceb72148..dbb8c6736 100755 --- a/import-scripts/update-az-mskimpact.sh +++ b/import-scripts/update-az-mskimpact.sh @@ -298,6 +298,22 @@ function standardize_mutations_data() { $PYTHON_BINARY $PORTAL_HOME/scripts/standardize_mutations_data.py -f "$NSOUT_MUTATIONS_INPUT_FILEPATH" } +function remove_duplicate_maf_variants() { + MUTATIONS_EXTD_INPUT_FILEPATH="$AZ_MSK_IMPACT_DATA_HOME/data_mutations_extended.txt" + NSOUT_MUTATIONS_INPUT_FILEPATH="$AZ_MSK_IMPACT_DATA_HOME/data_nonsignedout_mutations.txt" + + MUTATIONS_EXTD_OUTPUT_FILEPATH="$AZ_MSK_IMPACT_DATA_HOME/data_mutations_extended_merged.txt" + NSOUT_MUTATIONS_OUTPUT_FILEPATH="$AZ_MSK_IMPACT_DATA_HOME/data_nonsignedout_mutations_merged.txt" + + # Remove duplicate variants from MAF files + $PYTHON_BINARY $PORTAL_HOME/scripts/remove-duplicate-maf-variants.py -i "$MUTATIONS_EXTD_INPUT_FILEPATH" && + $PYTHON_BINARY $PORTAL_HOME/scripts/remove-duplicate-maf-variants.py -i "$NSOUT_MUTATIONS_INPUT_FILEPATH" && + + # Rewrite mutation files with updated data + mv "$MUTATIONS_EXTD_OUTPUT_FILEPATH" "$MUTATIONS_EXTD_INPUT_FILEPATH" && + mv "$NSOUT_MUTATIONS_OUTPUT_FILEPATH" "$NSOUT_MUTATIONS_INPUT_FILEPATH" +} + function standardize_structural_variant_data() { DATA_SV_INPUT_FILEPATH="$AZ_MSK_IMPACT_DATA_HOME/data_sv.txt" $PYTHON_BINARY $PORTAL_HOME/scripts/standardize_structural_variant_data.py -f "$DATA_SV_INPUT_FILEPATH" @@ -421,6 +437,11 @@ if ! standardize_mutations_data ; then report_error "ERROR: Failed to standardize mutations files for AstraZeneca MSK-IMPACT. Exiting." fi +# Remove duplicate variants from MAF files +if ! remove_duplicate_maf_variants ; then + report_error "ERROR: Failed to remove duplicate variants from MAF files for AstraZeneca MSK-IMPACT. Exiting." +fi + # Standardize structural variant data by removing records with invalid genes and standardizing the file header if ! standardize_structural_variant_data ; then report_error "ERROR: Failed to standardize structural variant data for AstraZeneca MSK-IMPACT. Exiting."