From 76051eaa910c57d95d487a0a3c902e84599a46e0 Mon Sep 17 00:00:00 2001 From: Calla Chennault <51501715+callachennault@users.noreply.github.com> Date: Wed, 6 Dec 2023 16:53:08 -0500 Subject: [PATCH 1/7] Check if mutationRecord is duplicated before annotating --- .../org/cbioportal/cmo/pipelines/cvr/CVRUtilities.java | 6 +++--- .../cmo/pipelines/cvr/mutation/CVRMutationDataReader.java | 7 +++++-- .../cvr/mutation/CVRNonSignedoutMutationDataReader.java | 6 ++++-- .../cmo/pipelines/cvr/mutation/GMLMutationDataReader.java | 6 ++++-- 4 files changed, 16 insertions(+), 9 deletions(-) diff --git a/cvr/src/main/java/org/cbioportal/cmo/pipelines/cvr/CVRUtilities.java b/cvr/src/main/java/org/cbioportal/cmo/pipelines/cvr/CVRUtilities.java index e211a0007..8f0cf2d56 100644 --- a/cvr/src/main/java/org/cbioportal/cmo/pipelines/cvr/CVRUtilities.java +++ b/cvr/src/main/java/org/cbioportal/cmo/pipelines/cvr/CVRUtilities.java @@ -177,12 +177,12 @@ public List processFileComments(File dataFile) throws FileNotFoundExcept return comments; } - public boolean isDuplicateRecord(MutationRecord snp, List annotatedRecords) { - if (annotatedRecords == null || annotatedRecords.isEmpty()) { + public boolean isDuplicateRecord(MutationRecord snp, List mutationRecords) { + if (mutationRecords == null || mutationRecords.isEmpty()) { return false; } - for (AnnotatedRecord record : annotatedRecords) { + for (MutationRecord record : mutationRecords) { if (record.getCHROMOSOME().equals(snp.getCHROMOSOME()) && record.getSTART_POSITION().equals(snp.getSTART_POSITION()) && record.getEND_POSITION().equals(snp.getEND_POSITION()) && diff --git a/cvr/src/main/java/org/cbioportal/cmo/pipelines/cvr/mutation/CVRMutationDataReader.java b/cvr/src/main/java/org/cbioportal/cmo/pipelines/cvr/mutation/CVRMutationDataReader.java index bef3efeb7..0dd0c8b44 100644 --- a/cvr/src/main/java/org/cbioportal/cmo/pipelines/cvr/mutation/CVRMutationDataReader.java +++ b/cvr/src/main/java/org/cbioportal/cmo/pipelines/cvr/mutation/CVRMutationDataReader.java @@ -80,7 +80,9 @@ public class CVRMutationDataReader implements ItemStreamReader private Annotator annotator; private List mutationRecords = new ArrayList<>(); - private Map> mutationMap = new HashMap<>(); + + //private Map> mutationMap = new HashMap<>(); + private Map> mutationMap = new HashMap<>(); private File mutationFile; Set header = new LinkedHashSet<>(); @@ -176,6 +178,7 @@ public void handleLine(String line) { } cvrSampleListUtil.updateSignedoutSampleSnpCounts(to_add.getTUMOR_SAMPLE_BARCODE(), 1); recordsToAnnotate.add(to_add); + mutationMap.getOrDefault(to_add.getTUMOR_SAMPLE_BARCODE(), new ArrayList()).add(to_add); } reader.close(); log.info("Loaded " + String.valueOf(recordsToAnnotate.size()) + " records from MAF"); @@ -192,7 +195,7 @@ private void annotateRecordsWithPOST(List records, boolean reann for (AnnotatedRecord ar : annotatedRecords) { logAnnotationProgress(++annotatedVariantsCount, totalVariantsToAnnotateCount, postIntervalSize); mutationRecords.add(ar); - mutationMap.getOrDefault(ar.getTUMOR_SAMPLE_BARCODE(), new ArrayList()).add(ar); + //mutationMap.getOrDefault(ar.getTUMOR_SAMPLE_BARCODE(), new ArrayList()).add(ar); additionalPropertyKeys.addAll(ar.getAdditionalProperties().keySet()); header.addAll(ar.getHeaderWithAdditionalFields()); } diff --git a/cvr/src/main/java/org/cbioportal/cmo/pipelines/cvr/mutation/CVRNonSignedoutMutationDataReader.java b/cvr/src/main/java/org/cbioportal/cmo/pipelines/cvr/mutation/CVRNonSignedoutMutationDataReader.java index 7cfb68d59..9464a3621 100644 --- a/cvr/src/main/java/org/cbioportal/cmo/pipelines/cvr/mutation/CVRNonSignedoutMutationDataReader.java +++ b/cvr/src/main/java/org/cbioportal/cmo/pipelines/cvr/mutation/CVRNonSignedoutMutationDataReader.java @@ -77,7 +77,8 @@ public class CVRNonSignedoutMutationDataReader implements ItemStreamReader mutationRecords = new ArrayList<>(); - private Map> mutationMap = new HashMap<>(); + //private Map> mutationMap = new HashMap<>(); + private Map> mutationMap = new HashMap<>(); private Set additionalPropertyKeys = new LinkedHashSet<>(); Set header = new LinkedHashSet<>(); private AnnotationSummaryStatistics summaryStatistics; @@ -169,6 +170,7 @@ public void handleLine(String line) { } cvrSampleListUtil.updateNonSignedoutSampleSnpCount(to_add.getTUMOR_SAMPLE_BARCODE(), 1); recordsToAnnotate.add(to_add); + mutationMap.getOrDefault(to_add.getTUMOR_SAMPLE_BARCODE(), new ArrayList()).add(to_add); } reader.close(); log.info("Loaded " + String.valueOf(recordsToAnnotate.size()) + " records from MAF"); @@ -185,7 +187,7 @@ private void annotateRecordsWithPOST(List records, boolean reann for (AnnotatedRecord ar : annotatedRecords) { logAnnotationProgress(++annotatedVariantsCount, totalVariantsToAnnotateCount, postIntervalSize); mutationRecords.add(ar); - mutationMap.getOrDefault(ar.getTUMOR_SAMPLE_BARCODE(), new ArrayList()).add(ar); + //mutationMap.getOrDefault(ar.getTUMOR_SAMPLE_BARCODE(), new ArrayList()).add(ar); additionalPropertyKeys.addAll(ar.getAdditionalProperties().keySet()); header.addAll(ar.getHeaderWithAdditionalFields()); } diff --git a/cvr/src/main/java/org/cbioportal/cmo/pipelines/cvr/mutation/GMLMutationDataReader.java b/cvr/src/main/java/org/cbioportal/cmo/pipelines/cvr/mutation/GMLMutationDataReader.java index c7a80b4ac..c1ead28c4 100644 --- a/cvr/src/main/java/org/cbioportal/cmo/pipelines/cvr/mutation/GMLMutationDataReader.java +++ b/cvr/src/main/java/org/cbioportal/cmo/pipelines/cvr/mutation/GMLMutationDataReader.java @@ -77,7 +77,8 @@ public class GMLMutationDataReader implements ItemStreamReader private Annotator annotator; private List mutationRecords = new ArrayList(); - private Map> mutationMap = new HashMap<>(); + //private Map> mutationMap = new HashMap<>(); + private Map> mutationMap = new HashMap<>(); private File mutationFile; private Set additionalPropertyKeys = new LinkedHashSet<>(); private Set header = new LinkedHashSet<>(); @@ -174,6 +175,7 @@ public void handleLine(String line) { continue; } recordsToAnnotate.add(to_add); + mutationMap.getOrDefault(to_add.getTUMOR_SAMPLE_BARCODE(), new ArrayList()).add(to_add); } reader.close(); log.info("Loaded " + String.valueOf(recordsToAnnotate.size()) + " records from MAF"); @@ -190,7 +192,7 @@ private List annotateRecordsWithPOST(List recor for (AnnotatedRecord ar : annotatedRecords) { logAnnotationProgress(++annotatedVariantsCount, totalVariantsToAnnotateCount, postIntervalSize); mutationRecords.add(ar); - mutationMap.getOrDefault(ar.getTUMOR_SAMPLE_BARCODE(), new ArrayList()).add(ar); + //mutationMap.getOrDefault(ar.getTUMOR_SAMPLE_BARCODE(), new ArrayList()).add(ar); additionalPropertyKeys.addAll(ar.getAdditionalProperties().keySet()); header.addAll(ar.getHeaderWithAdditionalFields()); } From 982f4d0fe16e641fe781c7852620d9f0830b4038 Mon Sep 17 00:00:00 2001 From: Calla Chennault <51501715+callachennault@users.noreply.github.com> Date: Mon, 18 Dec 2023 14:08:19 -0500 Subject: [PATCH 2/7] Populate mutationMap in loadMutationRecordsFromJson --- .../cmo/pipelines/cvr/mutation/CVRMutationDataReader.java | 5 ++++- .../cvr/mutation/CVRNonSignedoutMutationDataReader.java | 4 +++- .../cmo/pipelines/cvr/mutation/GMLMutationDataReader.java | 4 +++- 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/cvr/src/main/java/org/cbioportal/cmo/pipelines/cvr/mutation/CVRMutationDataReader.java b/cvr/src/main/java/org/cbioportal/cmo/pipelines/cvr/mutation/CVRMutationDataReader.java index 0dd0c8b44..55359583b 100644 --- a/cvr/src/main/java/org/cbioportal/cmo/pipelines/cvr/mutation/CVRMutationDataReader.java +++ b/cvr/src/main/java/org/cbioportal/cmo/pipelines/cvr/mutation/CVRMutationDataReader.java @@ -134,7 +134,10 @@ private void loadMutationRecordsFromJson(CVRData cvrData) { String somaticStatus = result.getMetaData().getSomaticStatus() != null ? result.getMetaData().getSomaticStatus() : "N/A"; int countSignedOutSnps = result.getAllSignedoutCvrSnps().size(); for (CVRSnp snp : result.getAllSignedoutCvrSnps()) { - recordsToAnnotate.add(cvrUtilities.buildCVRMutationRecord(snp, sampleId, somaticStatus)); + MutationRecord to_add = cvrUtilities.buildCVRMutationRecord(snp, sampleId, somaticStatus); + recordsToAnnotate.add(to_add); + mutationMap.getOrDefault(to_add.getTUMOR_SAMPLE_BARCODE(), new ArrayList()).add(to_add); + } cvrSampleListUtil.updateSignedoutSampleSnpCounts(sampleId, countSignedOutSnps); if (!stopZeroVariantWarnings && countSignedOutSnps == 0) { diff --git a/cvr/src/main/java/org/cbioportal/cmo/pipelines/cvr/mutation/CVRNonSignedoutMutationDataReader.java b/cvr/src/main/java/org/cbioportal/cmo/pipelines/cvr/mutation/CVRNonSignedoutMutationDataReader.java index 9464a3621..e3ac54dbd 100644 --- a/cvr/src/main/java/org/cbioportal/cmo/pipelines/cvr/mutation/CVRNonSignedoutMutationDataReader.java +++ b/cvr/src/main/java/org/cbioportal/cmo/pipelines/cvr/mutation/CVRNonSignedoutMutationDataReader.java @@ -129,7 +129,9 @@ private void loadMutationRecordsFromJson(CVRData cvrData) { int countNonSignedoutSampleSnps = result.getAllNonSignedoutCvrSnps().size(); String somaticStatus = result.getMetaData().getSomaticStatus() != null ? result.getMetaData().getSomaticStatus() : "N/A"; for (CVRSnp snp : result.getAllNonSignedoutCvrSnps()) { - recordsToAnnotate.add(cvrUtilities.buildCVRMutationRecord(snp, sampleId, somaticStatus)); + MutationRecord to_add = cvrUtilities.buildCVRMutationRecord(snp, sampleId, somaticStatus); + recordsToAnnotate.add(to_add); + mutationMap.getOrDefault(to_add.getTUMOR_SAMPLE_BARCODE(), new ArrayList()).add(to_add); } cvrSampleListUtil.updateNonSignedoutSampleSnpCount(sampleId, countNonSignedoutSampleSnps); } diff --git a/cvr/src/main/java/org/cbioportal/cmo/pipelines/cvr/mutation/GMLMutationDataReader.java b/cvr/src/main/java/org/cbioportal/cmo/pipelines/cvr/mutation/GMLMutationDataReader.java index c1ead28c4..296603fff 100644 --- a/cvr/src/main/java/org/cbioportal/cmo/pipelines/cvr/mutation/GMLMutationDataReader.java +++ b/cvr/src/main/java/org/cbioportal/cmo/pipelines/cvr/mutation/GMLMutationDataReader.java @@ -133,8 +133,10 @@ private void loadMutationRecordsFromJson(GMLData gmlData) { if (samples != null && !snps.isEmpty()) { for (GMLSnp snp : snps) { for (String sampleId : samples) { - recordsToAnnotate.add(cvrUtilities.buildGMLMutationRecord(snp, sampleId)); + MutationRecord to_add = cvrUtilities.buildGMLMutationRecord(snp, sampleId); + recordsToAnnotate.add(to_add); germlineSamples.add(sampleId); + mutationMap.getOrDefault(to_add.getTUMOR_SAMPLE_BARCODE(), new ArrayList()).add(to_add); } } } From 11207d8ba14e90a8719a084dd0381d8cc4daa66d Mon Sep 17 00:00:00 2001 From: Calla Chennault <51501715+callachennault@users.noreply.github.com> Date: Mon, 18 Dec 2023 17:07:01 -0500 Subject: [PATCH 3/7] add addRecordToMap --- .../cvr/mutation/CVRMutationDataReader.java | 19 ++++++++++++++++--- .../CVRNonSignedoutMutationDataReader.java | 19 +++++++++++++++++-- .../cvr/mutation/GMLMutationDataReader.java | 17 +++++++++++++++-- 3 files changed, 48 insertions(+), 7 deletions(-) diff --git a/cvr/src/main/java/org/cbioportal/cmo/pipelines/cvr/mutation/CVRMutationDataReader.java b/cvr/src/main/java/org/cbioportal/cmo/pipelines/cvr/mutation/CVRMutationDataReader.java index 55359583b..d9270d22f 100644 --- a/cvr/src/main/java/org/cbioportal/cmo/pipelines/cvr/mutation/CVRMutationDataReader.java +++ b/cvr/src/main/java/org/cbioportal/cmo/pipelines/cvr/mutation/CVRMutationDataReader.java @@ -136,8 +136,8 @@ private void loadMutationRecordsFromJson(CVRData cvrData) { for (CVRSnp snp : result.getAllSignedoutCvrSnps()) { MutationRecord to_add = cvrUtilities.buildCVRMutationRecord(snp, sampleId, somaticStatus); recordsToAnnotate.add(to_add); - mutationMap.getOrDefault(to_add.getTUMOR_SAMPLE_BARCODE(), new ArrayList()).add(to_add); - + //mutationMap.getOrDefault(to_add.getTUMOR_SAMPLE_BARCODE(), new ArrayList()).add(to_add); + addRecordToMap(to_add); } cvrSampleListUtil.updateSignedoutSampleSnpCounts(sampleId, countSignedOutSnps); if (!stopZeroVariantWarnings && countSignedOutSnps == 0) { @@ -181,7 +181,8 @@ public void handleLine(String line) { } cvrSampleListUtil.updateSignedoutSampleSnpCounts(to_add.getTUMOR_SAMPLE_BARCODE(), 1); recordsToAnnotate.add(to_add); - mutationMap.getOrDefault(to_add.getTUMOR_SAMPLE_BARCODE(), new ArrayList()).add(to_add); + //mutationMap.getOrDefault(to_add.getTUMOR_SAMPLE_BARCODE(), new ArrayList()).add(to_add); + addRecordToMap(to_add); } reader.close(); log.info("Loaded " + String.valueOf(recordsToAnnotate.size()) + " records from MAF"); @@ -237,4 +238,16 @@ public AnnotatedRecord read() throws Exception { } return null; } + + private void addRecordToMap(MutationRecord record) { + String sampleId = record.getTUMOR_SAMPLE_BARCODE(); + List recordList = mutationMap.get(sampleId); + if (recordList == null) { + recordList = new ArrayList(); + recordList.add(record); + mutationMap.put(sampleId, recordList); + } else { + recordList.add(record); + } + } } diff --git a/cvr/src/main/java/org/cbioportal/cmo/pipelines/cvr/mutation/CVRNonSignedoutMutationDataReader.java b/cvr/src/main/java/org/cbioportal/cmo/pipelines/cvr/mutation/CVRNonSignedoutMutationDataReader.java index e3ac54dbd..1f16192d8 100644 --- a/cvr/src/main/java/org/cbioportal/cmo/pipelines/cvr/mutation/CVRNonSignedoutMutationDataReader.java +++ b/cvr/src/main/java/org/cbioportal/cmo/pipelines/cvr/mutation/CVRNonSignedoutMutationDataReader.java @@ -131,7 +131,8 @@ private void loadMutationRecordsFromJson(CVRData cvrData) { for (CVRSnp snp : result.getAllNonSignedoutCvrSnps()) { MutationRecord to_add = cvrUtilities.buildCVRMutationRecord(snp, sampleId, somaticStatus); recordsToAnnotate.add(to_add); - mutationMap.getOrDefault(to_add.getTUMOR_SAMPLE_BARCODE(), new ArrayList()).add(to_add); + addRecordToMap(to_add); + //mutationMap.getOrDefault(to_add.getTUMOR_SAMPLE_BARCODE(), new ArrayList()).add(to_add); } cvrSampleListUtil.updateNonSignedoutSampleSnpCount(sampleId, countNonSignedoutSampleSnps); } @@ -172,7 +173,8 @@ public void handleLine(String line) { } cvrSampleListUtil.updateNonSignedoutSampleSnpCount(to_add.getTUMOR_SAMPLE_BARCODE(), 1); recordsToAnnotate.add(to_add); - mutationMap.getOrDefault(to_add.getTUMOR_SAMPLE_BARCODE(), new ArrayList()).add(to_add); + addRecordToMap(to_add); + //mutationMap.getOrDefault(to_add.getTUMOR_SAMPLE_BARCODE(), new ArrayList()).add(to_add); } reader.close(); log.info("Loaded " + String.valueOf(recordsToAnnotate.size()) + " records from MAF"); @@ -228,4 +230,17 @@ public AnnotatedRecord read() throws Exception { } return null; } + + private void addRecordToMap(MutationRecord record) { + String sampleId = record.getTUMOR_SAMPLE_BARCODE(); + List recordList = mutationMap.get(sampleId); + if (recordList == null) { + recordList = new ArrayList(); + recordList.add(record); + mutationMap.put(sampleId, recordList); + } else { + recordList.add(record); + } + } + } diff --git a/cvr/src/main/java/org/cbioportal/cmo/pipelines/cvr/mutation/GMLMutationDataReader.java b/cvr/src/main/java/org/cbioportal/cmo/pipelines/cvr/mutation/GMLMutationDataReader.java index 296603fff..3a5d99b45 100644 --- a/cvr/src/main/java/org/cbioportal/cmo/pipelines/cvr/mutation/GMLMutationDataReader.java +++ b/cvr/src/main/java/org/cbioportal/cmo/pipelines/cvr/mutation/GMLMutationDataReader.java @@ -136,7 +136,8 @@ private void loadMutationRecordsFromJson(GMLData gmlData) { MutationRecord to_add = cvrUtilities.buildGMLMutationRecord(snp, sampleId); recordsToAnnotate.add(to_add); germlineSamples.add(sampleId); - mutationMap.getOrDefault(to_add.getTUMOR_SAMPLE_BARCODE(), new ArrayList()).add(to_add); + addRecordToMap(to_add); + //mutationMap.getOrDefault(to_add.getTUMOR_SAMPLE_BARCODE(), new ArrayList()).add(to_add); } } } @@ -177,7 +178,8 @@ public void handleLine(String line) { continue; } recordsToAnnotate.add(to_add); - mutationMap.getOrDefault(to_add.getTUMOR_SAMPLE_BARCODE(), new ArrayList()).add(to_add); + addRecordToMap(to_add); + //mutationMap.getOrDefault(to_add.getTUMOR_SAMPLE_BARCODE(), new ArrayList()).add(to_add); } reader.close(); log.info("Loaded " + String.valueOf(recordsToAnnotate.size()) + " records from MAF"); @@ -235,4 +237,15 @@ public AnnotatedRecord read() throws Exception { return null; } + private void addRecordToMap(MutationRecord record) { + String sampleId = record.getTUMOR_SAMPLE_BARCODE(); + List recordList = mutationMap.get(sampleId); + if (recordList == null) { + recordList = new ArrayList(); + recordList.add(record); + mutationMap.put(sampleId, recordList); + } else { + recordList.add(record); + } + } } From 91c692d0833bfacdad72062ec4ffa12955f5ebc4 Mon Sep 17 00:00:00 2001 From: Calla Chennault <51501715+callachennault@users.noreply.github.com> Date: Tue, 19 Dec 2023 15:24:28 -0500 Subject: [PATCH 4/7] Remove comments, add local vars for debugging --- .../cmo/pipelines/cvr/mutation/CVRMutationDataReader.java | 7 +++---- .../cvr/mutation/CVRNonSignedoutMutationDataReader.java | 4 ---- .../cmo/pipelines/cvr/mutation/GMLMutationDataReader.java | 6 +----- 3 files changed, 4 insertions(+), 13 deletions(-) diff --git a/cvr/src/main/java/org/cbioportal/cmo/pipelines/cvr/mutation/CVRMutationDataReader.java b/cvr/src/main/java/org/cbioportal/cmo/pipelines/cvr/mutation/CVRMutationDataReader.java index d9270d22f..4d4ad1627 100644 --- a/cvr/src/main/java/org/cbioportal/cmo/pipelines/cvr/mutation/CVRMutationDataReader.java +++ b/cvr/src/main/java/org/cbioportal/cmo/pipelines/cvr/mutation/CVRMutationDataReader.java @@ -81,7 +81,6 @@ public class CVRMutationDataReader implements ItemStreamReader private List mutationRecords = new ArrayList<>(); - //private Map> mutationMap = new HashMap<>(); private Map> mutationMap = new HashMap<>(); private File mutationFile; @@ -136,7 +135,6 @@ private void loadMutationRecordsFromJson(CVRData cvrData) { for (CVRSnp snp : result.getAllSignedoutCvrSnps()) { MutationRecord to_add = cvrUtilities.buildCVRMutationRecord(snp, sampleId, somaticStatus); recordsToAnnotate.add(to_add); - //mutationMap.getOrDefault(to_add.getTUMOR_SAMPLE_BARCODE(), new ArrayList()).add(to_add); addRecordToMap(to_add); } cvrSampleListUtil.updateSignedoutSampleSnpCounts(sampleId, countSignedOutSnps); @@ -175,13 +173,15 @@ public void handleLine(String line) { MutationRecord to_add; while ((to_add = reader.read()) != null && to_add.getTUMOR_SAMPLE_BARCODE() != null) { // skip if new sample or if mutation record for sample seen already + String tumorSampleBarcode = to_add.getTUMOR_SAMPLE_BARCODE(); + String hugoSymbol = to_add.getHUGO_SYMBOL(); + if (cvrSampleListUtil.getNewDmpSamples().contains(to_add.getTUMOR_SAMPLE_BARCODE()) || cvrUtilities.isDuplicateRecord(to_add, mutationMap.get(to_add.getTUMOR_SAMPLE_BARCODE()))) { continue; } cvrSampleListUtil.updateSignedoutSampleSnpCounts(to_add.getTUMOR_SAMPLE_BARCODE(), 1); recordsToAnnotate.add(to_add); - //mutationMap.getOrDefault(to_add.getTUMOR_SAMPLE_BARCODE(), new ArrayList()).add(to_add); addRecordToMap(to_add); } reader.close(); @@ -199,7 +199,6 @@ private void annotateRecordsWithPOST(List records, boolean reann for (AnnotatedRecord ar : annotatedRecords) { logAnnotationProgress(++annotatedVariantsCount, totalVariantsToAnnotateCount, postIntervalSize); mutationRecords.add(ar); - //mutationMap.getOrDefault(ar.getTUMOR_SAMPLE_BARCODE(), new ArrayList()).add(ar); additionalPropertyKeys.addAll(ar.getAdditionalProperties().keySet()); header.addAll(ar.getHeaderWithAdditionalFields()); } diff --git a/cvr/src/main/java/org/cbioportal/cmo/pipelines/cvr/mutation/CVRNonSignedoutMutationDataReader.java b/cvr/src/main/java/org/cbioportal/cmo/pipelines/cvr/mutation/CVRNonSignedoutMutationDataReader.java index 1f16192d8..7f7722406 100644 --- a/cvr/src/main/java/org/cbioportal/cmo/pipelines/cvr/mutation/CVRNonSignedoutMutationDataReader.java +++ b/cvr/src/main/java/org/cbioportal/cmo/pipelines/cvr/mutation/CVRNonSignedoutMutationDataReader.java @@ -77,7 +77,6 @@ public class CVRNonSignedoutMutationDataReader implements ItemStreamReader mutationRecords = new ArrayList<>(); - //private Map> mutationMap = new HashMap<>(); private Map> mutationMap = new HashMap<>(); private Set additionalPropertyKeys = new LinkedHashSet<>(); Set header = new LinkedHashSet<>(); @@ -132,7 +131,6 @@ private void loadMutationRecordsFromJson(CVRData cvrData) { MutationRecord to_add = cvrUtilities.buildCVRMutationRecord(snp, sampleId, somaticStatus); recordsToAnnotate.add(to_add); addRecordToMap(to_add); - //mutationMap.getOrDefault(to_add.getTUMOR_SAMPLE_BARCODE(), new ArrayList()).add(to_add); } cvrSampleListUtil.updateNonSignedoutSampleSnpCount(sampleId, countNonSignedoutSampleSnps); } @@ -174,7 +172,6 @@ public void handleLine(String line) { cvrSampleListUtil.updateNonSignedoutSampleSnpCount(to_add.getTUMOR_SAMPLE_BARCODE(), 1); recordsToAnnotate.add(to_add); addRecordToMap(to_add); - //mutationMap.getOrDefault(to_add.getTUMOR_SAMPLE_BARCODE(), new ArrayList()).add(to_add); } reader.close(); log.info("Loaded " + String.valueOf(recordsToAnnotate.size()) + " records from MAF"); @@ -191,7 +188,6 @@ private void annotateRecordsWithPOST(List records, boolean reann for (AnnotatedRecord ar : annotatedRecords) { logAnnotationProgress(++annotatedVariantsCount, totalVariantsToAnnotateCount, postIntervalSize); mutationRecords.add(ar); - //mutationMap.getOrDefault(ar.getTUMOR_SAMPLE_BARCODE(), new ArrayList()).add(ar); additionalPropertyKeys.addAll(ar.getAdditionalProperties().keySet()); header.addAll(ar.getHeaderWithAdditionalFields()); } diff --git a/cvr/src/main/java/org/cbioportal/cmo/pipelines/cvr/mutation/GMLMutationDataReader.java b/cvr/src/main/java/org/cbioportal/cmo/pipelines/cvr/mutation/GMLMutationDataReader.java index 3a5d99b45..8dbf27315 100644 --- a/cvr/src/main/java/org/cbioportal/cmo/pipelines/cvr/mutation/GMLMutationDataReader.java +++ b/cvr/src/main/java/org/cbioportal/cmo/pipelines/cvr/mutation/GMLMutationDataReader.java @@ -76,8 +76,7 @@ public class GMLMutationDataReader implements ItemStreamReader @Autowired private Annotator annotator; - private List mutationRecords = new ArrayList(); - //private Map> mutationMap = new HashMap<>(); + private List mutationRecords = new ArrayList<>(); private Map> mutationMap = new HashMap<>(); private File mutationFile; private Set additionalPropertyKeys = new LinkedHashSet<>(); @@ -137,7 +136,6 @@ private void loadMutationRecordsFromJson(GMLData gmlData) { recordsToAnnotate.add(to_add); germlineSamples.add(sampleId); addRecordToMap(to_add); - //mutationMap.getOrDefault(to_add.getTUMOR_SAMPLE_BARCODE(), new ArrayList()).add(to_add); } } } @@ -179,7 +177,6 @@ public void handleLine(String line) { } recordsToAnnotate.add(to_add); addRecordToMap(to_add); - //mutationMap.getOrDefault(to_add.getTUMOR_SAMPLE_BARCODE(), new ArrayList()).add(to_add); } reader.close(); log.info("Loaded " + String.valueOf(recordsToAnnotate.size()) + " records from MAF"); @@ -196,7 +193,6 @@ private List annotateRecordsWithPOST(List recor for (AnnotatedRecord ar : annotatedRecords) { logAnnotationProgress(++annotatedVariantsCount, totalVariantsToAnnotateCount, postIntervalSize); mutationRecords.add(ar); - //mutationMap.getOrDefault(ar.getTUMOR_SAMPLE_BARCODE(), new ArrayList()).add(ar); additionalPropertyKeys.addAll(ar.getAdditionalProperties().keySet()); header.addAll(ar.getHeaderWithAdditionalFields()); } From b77b7b98d64c6640bac7e53f765903706c4611ba Mon Sep 17 00:00:00 2001 From: Calla Chennault <51501715+callachennault@users.noreply.github.com> Date: Thu, 4 Jan 2024 13:03:55 -0500 Subject: [PATCH 5/7] Remove duplicate MAF variants for AZ --- .../cvr/mutation/CVRMutationDataReader.java | 3 - .../remove-duplicate-maf-variants.py | 86 +++++++++++++++++++ import-scripts/update-az-mskimpact.sh | 14 +++ 3 files changed, 100 insertions(+), 3 deletions(-) create mode 100644 import-scripts/remove-duplicate-maf-variants.py diff --git a/cvr/src/main/java/org/cbioportal/cmo/pipelines/cvr/mutation/CVRMutationDataReader.java b/cvr/src/main/java/org/cbioportal/cmo/pipelines/cvr/mutation/CVRMutationDataReader.java index 4d4ad1627..c62a67a65 100644 --- a/cvr/src/main/java/org/cbioportal/cmo/pipelines/cvr/mutation/CVRMutationDataReader.java +++ b/cvr/src/main/java/org/cbioportal/cmo/pipelines/cvr/mutation/CVRMutationDataReader.java @@ -173,9 +173,6 @@ public void handleLine(String line) { MutationRecord to_add; while ((to_add = reader.read()) != null && to_add.getTUMOR_SAMPLE_BARCODE() != null) { // skip if new sample or if mutation record for sample seen already - String tumorSampleBarcode = to_add.getTUMOR_SAMPLE_BARCODE(); - String hugoSymbol = to_add.getHUGO_SYMBOL(); - if (cvrSampleListUtil.getNewDmpSamples().contains(to_add.getTUMOR_SAMPLE_BARCODE()) || cvrUtilities.isDuplicateRecord(to_add, mutationMap.get(to_add.getTUMOR_SAMPLE_BARCODE()))) { continue; diff --git a/import-scripts/remove-duplicate-maf-variants.py b/import-scripts/remove-duplicate-maf-variants.py new file mode 100644 index 000000000..a256d00ab --- /dev/null +++ b/import-scripts/remove-duplicate-maf-variants.py @@ -0,0 +1,86 @@ +#!/usr/bin/python +import sys +import os +import optparse + +# Script to remove duplicate maf records based on the 8 key columns. +# Calculates VAF for each record and picks the record with high VAF +# Formula for VAF = t_alt_count / (t_ref_count + t_alt_count) + +ERROR_FILE = sys.stderr +OUTPUT_FILE = sys.stdout + +KEY_COLUMNS_INDEX = [] +KEY_COLUMNS = ['Entrez_Gene_Id','Chromosome','Start_Position','End_Position','Variant_Classification','Tumor_Seq_Allele2','Tumor_Sample_Barcode','HGVSp_Short'] +MAF_DATA = {} + +def remove_duplicate_variants(maf_filename, comments, header, t_refc_index, t_altc_index): + outfile = [] + outfile.append(comments) + outfile.append(header) + for key in MAF_DATA: + if len(MAF_DATA[key]) > 1: + vaf_ind = 0 + vaf_value = 0 + for val in MAF_DATA[key]: + #calculate VAF for each duplicate record. + columns = val.rstrip('\n').split('\t') + try: + VAF = int(columns[t_altc_index])/(int(columns[t_altc_index])+int(columns[t_refc_index])) + if VAF > vaf_value: + vaf_value = VAF + vaf_ind = MAF_DATA[key].index(val) + outfile.append(MAF_DATA[key][vaf_ind]) + except: + print >> ERROR_FILE, 'ERROR: VAF cannot be calculated for the variant : ' + key + print >> ERROR_FILE, 'The t_ref_count is: '+ columns[t_refc_index]+ ' and t_alt_count is: '+ columns[t_altc_index] + outfile.append(val) + else: + outfile.append(MAF_DATA[key][0]) + + out_filename = maf_filename.split('.')[0]+'_merged.txt' + datafile = open(out_filename, 'w') + for line in outfile: + datafile.write(line) + datafile.close() + print >> OUTPUT_FILE, 'MAF file with duplicate variants removed is written to: ' + out_filename +'\n' + + +def main(): + # get command line arguments + parser = optparse.OptionParser() + parser.add_option('-i', '--input-maf-file', action = 'store', dest = 'maf_file') + + (options, args) = parser.parse_args() + maf_filename = options.maf_file + + comments = "" + header = "" + + with open(maf_filename,'r') as maf_file: + for line in maf_file: + if line.startswith('#'): + comments += line + elif line.startswith('Hugo_Symbol'): + header += line + header_cols = line.rstrip('\n').split('\t') + #get the positions of the 8 key maf columns + for value in KEY_COLUMNS: + KEY_COLUMNS_INDEX.append(header_cols.index(value)) + t_refc_index = header_cols.index('t_ref_count') + t_altc_index = header_cols.index('t_alt_count') + else: + reference_key = "" + data = line.rstrip('\n').split('\t') + for index in KEY_COLUMNS_INDEX: + reference_key += data[index]+'\t' + reference_key = reference_key.rstrip('\t') + if reference_key not in MAF_DATA: + MAF_DATA[reference_key] = [line] + else: + MAF_DATA[reference_key].append(line) + + remove_duplicate_variants(maf_filename, comments, header, t_refc_index, t_altc_index) + +if __name__ == '__main__': + main() diff --git a/import-scripts/update-az-mskimpact.sh b/import-scripts/update-az-mskimpact.sh index eceb72148..ae512f0b2 100755 --- a/import-scripts/update-az-mskimpact.sh +++ b/import-scripts/update-az-mskimpact.sh @@ -298,6 +298,15 @@ function standardize_mutations_data() { $PYTHON_BINARY $PORTAL_HOME/scripts/standardize_mutations_data.py -f "$NSOUT_MUTATIONS_INPUT_FILEPATH" } +function remove_duplicate_maf_variants() { + MUTATIONS_EXTD_INPUT_FILEPATH="$AZ_MSK_IMPACT_DATA_HOME/data_mutations_extended.txt" + NSOUT_MUTATIONS_INPUT_FILEPATH="$AZ_MSK_IMPACT_DATA_HOME/data_nonsignedout_mutations.txt" + + # Remove duplicate variants from MAF files + $PYTHON_BINARY $PORTAL_HOME/scripts/remove-duplicate-maf-variants.py -i "$MUTATIONS_EXTD_INPUT_FILEPATH" && + $PYTHON_BINARY $PORTAL_HOME/scripts/remove-duplicate-maf-variants.py -i "$NSOUT_MUTATIONS_INPUT_FILEPATH" +} + function standardize_structural_variant_data() { DATA_SV_INPUT_FILEPATH="$AZ_MSK_IMPACT_DATA_HOME/data_sv.txt" $PYTHON_BINARY $PORTAL_HOME/scripts/standardize_structural_variant_data.py -f "$DATA_SV_INPUT_FILEPATH" @@ -421,6 +430,11 @@ if ! standardize_mutations_data ; then report_error "ERROR: Failed to standardize mutations files for AstraZeneca MSK-IMPACT. Exiting." fi +# Remove duplicate variants from MAF files +if ! remove_duplicate_maf_variants ; then + report_error "ERROR: Failed to remove duplicate variants from MAF files for AstraZeneca MSK-IMPACT. Exiting." +fi + # Standardize structural variant data by removing records with invalid genes and standardizing the file header if ! standardize_structural_variant_data ; then report_error "ERROR: Failed to standardize structural variant data for AstraZeneca MSK-IMPACT. Exiting." From fd908a2937e3e44c9dda20073f1c4bea8c976ae8 Mon Sep 17 00:00:00 2001 From: Calla Chennault <51501715+callachennault@users.noreply.github.com> Date: Thu, 4 Jan 2024 17:22:07 -0500 Subject: [PATCH 6/7] Fix remove-duplicate-maf-variants call --- import-scripts/update-az-mskimpact.sh | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/import-scripts/update-az-mskimpact.sh b/import-scripts/update-az-mskimpact.sh index ae512f0b2..dbb8c6736 100755 --- a/import-scripts/update-az-mskimpact.sh +++ b/import-scripts/update-az-mskimpact.sh @@ -302,9 +302,16 @@ function remove_duplicate_maf_variants() { MUTATIONS_EXTD_INPUT_FILEPATH="$AZ_MSK_IMPACT_DATA_HOME/data_mutations_extended.txt" NSOUT_MUTATIONS_INPUT_FILEPATH="$AZ_MSK_IMPACT_DATA_HOME/data_nonsignedout_mutations.txt" + MUTATIONS_EXTD_OUTPUT_FILEPATH="$AZ_MSK_IMPACT_DATA_HOME/data_mutations_extended_merged.txt" + NSOUT_MUTATIONS_OUTPUT_FILEPATH="$AZ_MSK_IMPACT_DATA_HOME/data_nonsignedout_mutations_merged.txt" + # Remove duplicate variants from MAF files $PYTHON_BINARY $PORTAL_HOME/scripts/remove-duplicate-maf-variants.py -i "$MUTATIONS_EXTD_INPUT_FILEPATH" && - $PYTHON_BINARY $PORTAL_HOME/scripts/remove-duplicate-maf-variants.py -i "$NSOUT_MUTATIONS_INPUT_FILEPATH" + $PYTHON_BINARY $PORTAL_HOME/scripts/remove-duplicate-maf-variants.py -i "$NSOUT_MUTATIONS_INPUT_FILEPATH" && + + # Rewrite mutation files with updated data + mv "$MUTATIONS_EXTD_OUTPUT_FILEPATH" "$MUTATIONS_EXTD_INPUT_FILEPATH" && + mv "$NSOUT_MUTATIONS_OUTPUT_FILEPATH" "$NSOUT_MUTATIONS_INPUT_FILEPATH" } function standardize_structural_variant_data() { From 8eb02eb701fa14328863f81e950d03481b430931 Mon Sep 17 00:00:00 2001 From: Calla Chennault <51501715+callachennault@users.noreply.github.com> Date: Fri, 5 Jan 2024 12:42:03 -0500 Subject: [PATCH 7/7] revert whitespace change --- .../cmo/pipelines/cvr/mutation/CVRMutationDataReader.java | 1 - 1 file changed, 1 deletion(-) diff --git a/cvr/src/main/java/org/cbioportal/cmo/pipelines/cvr/mutation/CVRMutationDataReader.java b/cvr/src/main/java/org/cbioportal/cmo/pipelines/cvr/mutation/CVRMutationDataReader.java index c62a67a65..3242aab00 100644 --- a/cvr/src/main/java/org/cbioportal/cmo/pipelines/cvr/mutation/CVRMutationDataReader.java +++ b/cvr/src/main/java/org/cbioportal/cmo/pipelines/cvr/mutation/CVRMutationDataReader.java @@ -80,7 +80,6 @@ public class CVRMutationDataReader implements ItemStreamReader private Annotator annotator; private List mutationRecords = new ArrayList<>(); - private Map> mutationMap = new HashMap<>(); private File mutationFile;