Fix bug in checking for duplicate Mutation Records #1099

Merged
@@ -177,12 +177,12 @@ public List<String> processFileComments(File dataFile) throws FileNotFoundExcept
         return comments;
     }
 
-    public boolean isDuplicateRecord(MutationRecord snp, List<AnnotatedRecord> annotatedRecords) {
-        if (annotatedRecords == null || annotatedRecords.isEmpty()) {
+    public boolean isDuplicateRecord(MutationRecord snp, List<MutationRecord> mutationRecords) {
+        if (mutationRecords == null || mutationRecords.isEmpty()) {
             return false;
         }
 
-        for (AnnotatedRecord record : annotatedRecords) {
+        for (MutationRecord record : mutationRecords) {
             if (record.getCHROMOSOME().equals(snp.getCHROMOSOME()) &&
                 record.getSTART_POSITION().equals(snp.getSTART_POSITION()) &&
                 record.getEND_POSITION().equals(snp.getEND_POSITION()) &&
@@ -80,7 +80,7 @@ public class CVRMutationDataReader implements ItemStreamReader<AnnotatedRecord>
     private Annotator annotator;
 
     private List<AnnotatedRecord> mutationRecords = new ArrayList<>();
-    private Map<String, List<AnnotatedRecord>> mutationMap = new HashMap<>();
+    private Map<String, List<MutationRecord>> mutationMap = new HashMap<>();
 
     private File mutationFile;
     Set<String> header = new LinkedHashSet<>();

@@ -132,7 +132,9 @@ private void loadMutationRecordsFromJson(CVRData cvrData) {
             String somaticStatus = result.getMetaData().getSomaticStatus() != null ? result.getMetaData().getSomaticStatus() : "N/A";
             int countSignedOutSnps = result.getAllSignedoutCvrSnps().size();
             for (CVRSnp snp : result.getAllSignedoutCvrSnps()) {
-                recordsToAnnotate.add(cvrUtilities.buildCVRMutationRecord(snp, sampleId, somaticStatus));
+                MutationRecord to_add = cvrUtilities.buildCVRMutationRecord(snp, sampleId, somaticStatus);
+                recordsToAnnotate.add(to_add);
+                addRecordToMap(to_add);
Collaborator:
Think this logic will fix the issue, but have you run an entire mskimpact fetch with this code? I'm curious about the memory usage; worried that if we have another map tracking the entire mutation record, we'll exacerbate the memory issue.
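A lighter-weight alternative in the spirit of this comment (a sketch only, not part of this PR; it assumes java.util collections and the MutationRecord getters that appear elsewhere in this diff) would be to track a small composite key per variant instead of retaining the full record:

    // Hypothetical alternative: track composite variant keys per sample
    // rather than whole MutationRecord objects, to keep memory bounded.
    private Map<String, Set<String>> seenVariantKeys = new HashMap<>();

    private boolean isNewVariant(MutationRecord record) {
        // Key fields mirror the comparisons visible in isDuplicateRecord;
        // the real method may compare more fields (the hunk above is truncated).
        String key = record.getCHROMOSOME() + ":" + record.getSTART_POSITION()
            + "-" + record.getEND_POSITION();
        Set<String> keys = seenVariantKeys.get(record.getTUMOR_SAMPLE_BARCODE());
        if (keys == null) {
            keys = new HashSet<String>();
            seenVariantKeys.put(record.getTUMOR_SAMPLE_BARCODE(), keys);
        }
        // Set.add returns false when the key is already present, i.e. a duplicate
        return keys.add(key);
    }

A set of short strings per sample grows far more slowly than a list of full records, which may help with the fetch-wide memory concern raised above.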

             }
             cvrSampleListUtil.updateSignedoutSampleSnpCounts(sampleId, countSignedOutSnps);
             if (!stopZeroVariantWarnings && countSignedOutSnps == 0) {

@@ -176,6 +178,7 @@ public void handleLine(String line) {
             }
             cvrSampleListUtil.updateSignedoutSampleSnpCounts(to_add.getTUMOR_SAMPLE_BARCODE(), 1);
             recordsToAnnotate.add(to_add);
+            addRecordToMap(to_add);
         }
         reader.close();
         log.info("Loaded " + String.valueOf(recordsToAnnotate.size()) + " records from MAF");

@@ -192,7 +195,6 @@ private void annotateRecordsWithPOST(List<MutationRecord> records, boolean reann
         for (AnnotatedRecord ar : annotatedRecords) {
             logAnnotationProgress(++annotatedVariantsCount, totalVariantsToAnnotateCount, postIntervalSize);
             mutationRecords.add(ar);
-            mutationMap.getOrDefault(ar.getTUMOR_SAMPLE_BARCODE(), new ArrayList()).add(ar);
             additionalPropertyKeys.addAll(ar.getAdditionalProperties().keySet());
             header.addAll(ar.getHeaderWithAdditionalFields());
         }

@@ -231,4 +233,16 @@ public AnnotatedRecord read() throws Exception {
         }
         return null;
     }
 
+    private void addRecordToMap(MutationRecord record) {
+        String sampleId = record.getTUMOR_SAMPLE_BARCODE();
+        List<MutationRecord> recordList = mutationMap.get(sampleId);
+        if (recordList == null) {
+            recordList = new ArrayList<MutationRecord>();
+            recordList.add(record);
+            mutationMap.put(sampleId, recordList);
+        } else {
+            recordList.add(record);
+        }
+    }
 }
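For context on the fix itself: the getOrDefault call removed above never stored the freshly created list back into mutationMap. The fallback ArrayList was appended to and then discarded, so the map stayed empty and the duplicate check never saw any records. The new addRecordToMap helper puts the list into the map on first use; on Java 8+ the same get-or-create logic collapses to a one-liner (a sketch reusing names from this diff):

    // computeIfAbsent creates and inserts the per-sample list on first
    // access, then returns it, so the appended record is never lost
    mutationMap.computeIfAbsent(record.getTUMOR_SAMPLE_BARCODE(),
        k -> new ArrayList<MutationRecord>()).add(record);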
@@ -77,7 +77,7 @@ public class CVRNonSignedoutMutationDataReader implements ItemStreamReader<Annot
 
     private File mutationFile;
     private List<AnnotatedRecord> mutationRecords = new ArrayList<>();
-    private Map<String, List<AnnotatedRecord>> mutationMap = new HashMap<>();
+    private Map<String, List<MutationRecord>> mutationMap = new HashMap<>();
     private Set<String> additionalPropertyKeys = new LinkedHashSet<>();
     Set<String> header = new LinkedHashSet<>();
     private AnnotationSummaryStatistics summaryStatistics;

@@ -128,7 +128,9 @@ private void loadMutationRecordsFromJson(CVRData cvrData) {
             int countNonSignedoutSampleSnps = result.getAllNonSignedoutCvrSnps().size();
             String somaticStatus = result.getMetaData().getSomaticStatus() != null ? result.getMetaData().getSomaticStatus() : "N/A";
             for (CVRSnp snp : result.getAllNonSignedoutCvrSnps()) {
-                recordsToAnnotate.add(cvrUtilities.buildCVRMutationRecord(snp, sampleId, somaticStatus));
+                MutationRecord to_add = cvrUtilities.buildCVRMutationRecord(snp, sampleId, somaticStatus);
+                recordsToAnnotate.add(to_add);
+                addRecordToMap(to_add);
             }
             cvrSampleListUtil.updateNonSignedoutSampleSnpCount(sampleId, countNonSignedoutSampleSnps);
         }

@@ -169,6 +171,7 @@ public void handleLine(String line) {
             }
             cvrSampleListUtil.updateNonSignedoutSampleSnpCount(to_add.getTUMOR_SAMPLE_BARCODE(), 1);
             recordsToAnnotate.add(to_add);
+            addRecordToMap(to_add);
         }
         reader.close();
         log.info("Loaded " + String.valueOf(recordsToAnnotate.size()) + " records from MAF");

@@ -185,7 +188,6 @@ private void annotateRecordsWithPOST(List<MutationRecord> records, boolean reann
         for (AnnotatedRecord ar : annotatedRecords) {
             logAnnotationProgress(++annotatedVariantsCount, totalVariantsToAnnotateCount, postIntervalSize);
             mutationRecords.add(ar);
-            mutationMap.getOrDefault(ar.getTUMOR_SAMPLE_BARCODE(), new ArrayList()).add(ar);
             additionalPropertyKeys.addAll(ar.getAdditionalProperties().keySet());
             header.addAll(ar.getHeaderWithAdditionalFields());
         }

@@ -224,4 +226,17 @@ public AnnotatedRecord read() throws Exception {
         }
         return null;
     }
 
+    private void addRecordToMap(MutationRecord record) {
+        String sampleId = record.getTUMOR_SAMPLE_BARCODE();
+        List<MutationRecord> recordList = mutationMap.get(sampleId);
+        if (recordList == null) {
+            recordList = new ArrayList<MutationRecord>();
+            recordList.add(record);
+            mutationMap.put(sampleId, recordList);
+        } else {
+            recordList.add(record);
+        }
+    }
+
 }
@@ -76,8 +76,8 @@ public class GMLMutationDataReader implements ItemStreamReader<AnnotatedRecord>
     @Autowired
     private Annotator annotator;
 
-    private List<AnnotatedRecord> mutationRecords = new ArrayList();
-    private Map<String, List<AnnotatedRecord>> mutationMap = new HashMap<>();
+    private List<AnnotatedRecord> mutationRecords = new ArrayList<>();
+    private Map<String, List<MutationRecord>> mutationMap = new HashMap<>();
     private File mutationFile;
     private Set<String> additionalPropertyKeys = new LinkedHashSet<>();
     private Set<String> header = new LinkedHashSet<>();

@@ -132,8 +132,10 @@ private void loadMutationRecordsFromJson(GMLData gmlData) {
             if (samples != null && !snps.isEmpty()) {
                 for (GMLSnp snp : snps) {
                     for (String sampleId : samples) {
-                        recordsToAnnotate.add(cvrUtilities.buildGMLMutationRecord(snp, sampleId));
+                        MutationRecord to_add = cvrUtilities.buildGMLMutationRecord(snp, sampleId);
+                        recordsToAnnotate.add(to_add);
                         germlineSamples.add(sampleId);
+                        addRecordToMap(to_add);
                     }
                 }
             }

@@ -174,6 +176,7 @@ public void handleLine(String line) {
                 continue;
             }
             recordsToAnnotate.add(to_add);
+            addRecordToMap(to_add);
         }
         reader.close();
         log.info("Loaded " + String.valueOf(recordsToAnnotate.size()) + " records from MAF");

@@ -190,7 +193,6 @@ private List<AnnotatedRecord> annotateRecordsWithPOST(List<MutationRecord> recor
         for (AnnotatedRecord ar : annotatedRecords) {
             logAnnotationProgress(++annotatedVariantsCount, totalVariantsToAnnotateCount, postIntervalSize);
             mutationRecords.add(ar);
-            mutationMap.getOrDefault(ar.getTUMOR_SAMPLE_BARCODE(), new ArrayList()).add(ar);
             additionalPropertyKeys.addAll(ar.getAdditionalProperties().keySet());
             header.addAll(ar.getHeaderWithAdditionalFields());
         }

@@ -231,4 +233,15 @@ public AnnotatedRecord read() throws Exception {
         return null;
     }
 
+    private void addRecordToMap(MutationRecord record) {
+        String sampleId = record.getTUMOR_SAMPLE_BARCODE();
+        List<MutationRecord> recordList = mutationMap.get(sampleId);
+        if (recordList == null) {
+            recordList = new ArrayList<MutationRecord>();
+            recordList.add(record);
+            mutationMap.put(sampleId, recordList);
+        } else {
+            recordList.add(record);
+        }
+    }
 }
86 changes: 86 additions & 0 deletions import-scripts/remove-duplicate-maf-variants.py
@@ -0,0 +1,86 @@
#!/usr/bin/python
import sys
import optparse

# Script to remove duplicate MAF records based on the 8 key columns.
# Calculates the VAF for each duplicate record and keeps the record with the highest VAF.
# Formula: VAF = t_alt_count / (t_ref_count + t_alt_count)
# e.g., t_alt_count = 30 and t_ref_count = 70 give VAF = 30 / (70 + 30) = 0.3

ERROR_FILE = sys.stderr
OUTPUT_FILE = sys.stdout

KEY_COLUMNS_INDEX = []
KEY_COLUMNS = ['Entrez_Gene_Id','Chromosome','Start_Position','End_Position','Variant_Classification','Tumor_Seq_Allele2','Tumor_Sample_Barcode','HGVSp_Short']
MAF_DATA = {}

def remove_duplicate_variants(maf_filename, comments, header, t_refc_index, t_altc_index):
    outfile = []
    outfile.append(comments)
    outfile.append(header)
    for key in MAF_DATA:
        if len(MAF_DATA[key]) > 1:
            vaf_ind = 0
            vaf_value = 0
            for val in MAF_DATA[key]:
                # calculate the VAF for each duplicate record; float() avoids
                # Python 2 integer division, which would truncate every VAF to 0 or 1
                columns = val.rstrip('\n').split('\t')
                try:
                    VAF = float(columns[t_altc_index])/(float(columns[t_altc_index])+float(columns[t_refc_index]))
                    if VAF > vaf_value:
                        vaf_value = VAF
                        vaf_ind = MAF_DATA[key].index(val)
                except (ValueError, ZeroDivisionError):
                    print >> ERROR_FILE, 'ERROR: VAF cannot be calculated for the variant: ' + key
                    print >> ERROR_FILE, 'The t_ref_count is: ' + columns[t_refc_index] + ' and t_alt_count is: ' + columns[t_altc_index]
                    outfile.append(val)
            # keep only the duplicate record with the highest VAF
            outfile.append(MAF_DATA[key][vaf_ind])
        else:
            outfile.append(MAF_DATA[key][0])

    out_filename = maf_filename.split('.')[0] + '_merged.txt'
    datafile = open(out_filename, 'w')
    for line in outfile:
        datafile.write(line)
    datafile.close()
    print >> OUTPUT_FILE, 'MAF file with duplicate variants removed is written to: ' + out_filename + '\n'


def main():
    # get command line arguments
    parser = optparse.OptionParser()
    parser.add_option('-i', '--input-maf-file', action = 'store', dest = 'maf_file')

    (options, args) = parser.parse_args()
    maf_filename = options.maf_file

    comments = ""
    header = ""

    with open(maf_filename, 'r') as maf_file:
        for line in maf_file:
            if line.startswith('#'):
                comments += line
            elif line.startswith('Hugo_Symbol'):
                header += line
                header_cols = line.rstrip('\n').split('\t')
                # get the positions of the 8 key MAF columns
                for value in KEY_COLUMNS:
                    KEY_COLUMNS_INDEX.append(header_cols.index(value))
                t_refc_index = header_cols.index('t_ref_count')
                t_altc_index = header_cols.index('t_alt_count')
            else:
                reference_key = ""
                data = line.rstrip('\n').split('\t')
                for index in KEY_COLUMNS_INDEX:
                    reference_key += data[index] + '\t'
                reference_key = reference_key.rstrip('\t')
                if reference_key not in MAF_DATA:
                    MAF_DATA[reference_key] = [line]
                else:
                    MAF_DATA[reference_key].append(line)

    remove_duplicate_variants(maf_filename, comments, header, t_refc_index, t_altc_index)

if __name__ == '__main__':
    main()
21 changes: 21 additions & 0 deletions import-scripts/update-az-mskimpact.sh
@@ -298,6 +298,22 @@ function standardize_mutations_data() {
     $PYTHON_BINARY $PORTAL_HOME/scripts/standardize_mutations_data.py -f "$NSOUT_MUTATIONS_INPUT_FILEPATH"
 }
 
+function remove_duplicate_maf_variants() {
+    MUTATIONS_EXTD_INPUT_FILEPATH="$AZ_MSK_IMPACT_DATA_HOME/data_mutations_extended.txt"
+    NSOUT_MUTATIONS_INPUT_FILEPATH="$AZ_MSK_IMPACT_DATA_HOME/data_nonsignedout_mutations.txt"
+
+    MUTATIONS_EXTD_OUTPUT_FILEPATH="$AZ_MSK_IMPACT_DATA_HOME/data_mutations_extended_merged.txt"
+    NSOUT_MUTATIONS_OUTPUT_FILEPATH="$AZ_MSK_IMPACT_DATA_HOME/data_nonsignedout_mutations_merged.txt"
+
+    # Remove duplicate variants from MAF files
+    $PYTHON_BINARY $PORTAL_HOME/scripts/remove-duplicate-maf-variants.py -i "$MUTATIONS_EXTD_INPUT_FILEPATH" &&
+    $PYTHON_BINARY $PORTAL_HOME/scripts/remove-duplicate-maf-variants.py -i "$NSOUT_MUTATIONS_INPUT_FILEPATH" &&
+
+    # Rewrite mutation files with updated data
+    mv "$MUTATIONS_EXTD_OUTPUT_FILEPATH" "$MUTATIONS_EXTD_INPUT_FILEPATH" &&
+    mv "$NSOUT_MUTATIONS_OUTPUT_FILEPATH" "$NSOUT_MUTATIONS_INPUT_FILEPATH"
+}
+
 function standardize_structural_variant_data() {
     DATA_SV_INPUT_FILEPATH="$AZ_MSK_IMPACT_DATA_HOME/data_sv.txt"
     $PYTHON_BINARY $PORTAL_HOME/scripts/standardize_structural_variant_data.py -f "$DATA_SV_INPUT_FILEPATH"

@@ -421,6 +437,11 @@ if ! standardize_mutations_data ; then
     report_error "ERROR: Failed to standardize mutations files for AstraZeneca MSK-IMPACT. Exiting."
 fi
 
+# Remove duplicate variants from MAF files
+if ! remove_duplicate_maf_variants ; then
+    report_error "ERROR: Failed to remove duplicate variants from MAF files for AstraZeneca MSK-IMPACT. Exiting."
+fi
+
 # Standardize structural variant data by removing records with invalid genes and standardizing the file header
 if ! standardize_structural_variant_data ; then
     report_error "ERROR: Failed to standardize structural variant data for AstraZeneca MSK-IMPACT. Exiting."