Fix bug in checking for duplicate Mutation Records #1099

Merged
@@ -177,12 +177,12 @@ public List<String> processFileComments(File dataFile) throws FileNotFoundExcept
         return comments;
     }
 
-    public boolean isDuplicateRecord(MutationRecord snp, List<AnnotatedRecord> annotatedRecords) {
-        if (annotatedRecords == null || annotatedRecords.isEmpty()) {
+    public boolean isDuplicateRecord(MutationRecord snp, List<MutationRecord> mutationRecords) {
+        if (mutationRecords == null || mutationRecords.isEmpty()) {
             return false;
         }
 
-        for (AnnotatedRecord record : annotatedRecords) {
+        for (MutationRecord record : mutationRecords) {
             if (record.getCHROMOSOME().equals(snp.getCHROMOSOME()) &&
                 record.getSTART_POSITION().equals(snp.getSTART_POSITION()) &&
                 record.getEND_POSITION().equals(snp.getEND_POSITION()) &&
@@ -80,7 +80,7 @@ public class CVRMutationDataReader implements ItemStreamReader<AnnotatedRecord>
     private Annotator annotator;
 
     private List<AnnotatedRecord> mutationRecords = new ArrayList<>();
-    private Map<String, List<AnnotatedRecord>> mutationMap = new HashMap<>();
+    private Map<String, List<MutationRecord>> mutationMap = new HashMap<>();
 
     private File mutationFile;
     Set<String> header = new LinkedHashSet<>();

@@ -132,7 +132,9 @@ private void loadMutationRecordsFromJson(CVRData cvrData) {
             String somaticStatus = result.getMetaData().getSomaticStatus() != null ? result.getMetaData().getSomaticStatus() : "N/A";
             int countSignedOutSnps = result.getAllSignedoutCvrSnps().size();
             for (CVRSnp snp : result.getAllSignedoutCvrSnps()) {
-                recordsToAnnotate.add(cvrUtilities.buildCVRMutationRecord(snp, sampleId, somaticStatus));
+                MutationRecord to_add = cvrUtilities.buildCVRMutationRecord(snp, sampleId, somaticStatus);
+                recordsToAnnotate.add(to_add);
+                addRecordToMap(to_add);
Collaborator:
Think this logic will fix the issue, but have you run an entire mskimpact fetch with this code? I'm curious about the memory usage; worried that if we have another map tracking the entire mutation record, we'll exacerbate the memory issue.
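A lighter-weight alternative in the spirit of this comment (a sketch only, not part of this PR; it assumes java.util collections and the MutationRecord getters that appear elsewhere in this diff) would be to track a small composite key per variant instead of retaining the full record:

    // Hypothetical alternative: track composite variant keys per sample
    // rather than whole MutationRecord objects, to keep memory bounded.
    private Map<String, Set<String>> seenVariantKeys = new HashMap<>();

    private boolean isNewVariant(MutationRecord record) {
        // Key fields mirror the comparisons visible in isDuplicateRecord;
        // the real method may compare more fields (the hunk above is truncated).
        String key = record.getCHROMOSOME() + ":" + record.getSTART_POSITION()
            + "-" + record.getEND_POSITION();
        Set<String> keys = seenVariantKeys.get(record.getTUMOR_SAMPLE_BARCODE());
        if (keys == null) {
            keys = new HashSet<String>();
            seenVariantKeys.put(record.getTUMOR_SAMPLE_BARCODE(), keys);
        }
        // Set.add returns false when the key is already present, i.e. a duplicate
        return keys.add(key);
    }

A set of short strings per sample grows far more slowly than a list of full records, which may help with the fetch-wide memory concern raised above.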

             }
             cvrSampleListUtil.updateSignedoutSampleSnpCounts(sampleId, countSignedOutSnps);
             if (!stopZeroVariantWarnings && countSignedOutSnps == 0) {

@@ -176,6 +178,7 @@ public void handleLine(String line) {
             }
             cvrSampleListUtil.updateSignedoutSampleSnpCounts(to_add.getTUMOR_SAMPLE_BARCODE(), 1);
             recordsToAnnotate.add(to_add);
+            addRecordToMap(to_add);
         }
         reader.close();
         log.info("Loaded " + String.valueOf(recordsToAnnotate.size()) + " records from MAF");

@@ -192,7 +195,6 @@ private void annotateRecordsWithPOST(List<MutationRecord> records, boolean reann
         for (AnnotatedRecord ar : annotatedRecords) {
             logAnnotationProgress(++annotatedVariantsCount, totalVariantsToAnnotateCount, postIntervalSize);
             mutationRecords.add(ar);
-            mutationMap.getOrDefault(ar.getTUMOR_SAMPLE_BARCODE(), new ArrayList()).add(ar);
             additionalPropertyKeys.addAll(ar.getAdditionalProperties().keySet());
             header.addAll(ar.getHeaderWithAdditionalFields());
         }

@@ -231,4 +233,16 @@ public AnnotatedRecord read() throws Exception {
         }
         return null;
     }
 
+    private void addRecordToMap(MutationRecord record) {
+        String sampleId = record.getTUMOR_SAMPLE_BARCODE();
+        List<MutationRecord> recordList = mutationMap.get(sampleId);
+        if (recordList == null) {
+            recordList = new ArrayList<MutationRecord>();
+            recordList.add(record);
+            mutationMap.put(sampleId, recordList);
+        } else {
+            recordList.add(record);
+        }
+    }
 }
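For context on the fix itself: the getOrDefault call removed above never stored the freshly created list back into mutationMap. The fallback ArrayList was appended to and then discarded, so the map stayed empty and the duplicate check never saw any records. The new addRecordToMap helper puts the list into the map on first use; on Java 8+ the same get-or-create logic collapses to a one-liner (a sketch reusing names from this diff):

    // computeIfAbsent creates and inserts the per-sample list on first
    // access, then returns it, so the appended record is never lost
    mutationMap.computeIfAbsent(record.getTUMOR_SAMPLE_BARCODE(),
        k -> new ArrayList<MutationRecord>()).add(record);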
@@ -77,7 +77,7 @@ public class CVRNonSignedoutMutationDataReader implements ItemStreamReader<Annot
 
     private File mutationFile;
     private List<AnnotatedRecord> mutationRecords = new ArrayList<>();
-    private Map<String, List<AnnotatedRecord>> mutationMap = new HashMap<>();
+    private Map<String, List<MutationRecord>> mutationMap = new HashMap<>();
     private Set<String> additionalPropertyKeys = new LinkedHashSet<>();
     Set<String> header = new LinkedHashSet<>();
     private AnnotationSummaryStatistics summaryStatistics;

@@ -128,7 +128,9 @@ private void loadMutationRecordsFromJson(CVRData cvrData) {
             int countNonSignedoutSampleSnps = result.getAllNonSignedoutCvrSnps().size();
             String somaticStatus = result.getMetaData().getSomaticStatus() != null ? result.getMetaData().getSomaticStatus() : "N/A";
             for (CVRSnp snp : result.getAllNonSignedoutCvrSnps()) {
-                recordsToAnnotate.add(cvrUtilities.buildCVRMutationRecord(snp, sampleId, somaticStatus));
+                MutationRecord to_add = cvrUtilities.buildCVRMutationRecord(snp, sampleId, somaticStatus);
+                recordsToAnnotate.add(to_add);
+                addRecordToMap(to_add);
             }
             cvrSampleListUtil.updateNonSignedoutSampleSnpCount(sampleId, countNonSignedoutSampleSnps);
         }

@@ -169,6 +171,7 @@ public void handleLine(String line) {
             }
             cvrSampleListUtil.updateNonSignedoutSampleSnpCount(to_add.getTUMOR_SAMPLE_BARCODE(), 1);
             recordsToAnnotate.add(to_add);
+            addRecordToMap(to_add);
         }
         reader.close();
         log.info("Loaded " + String.valueOf(recordsToAnnotate.size()) + " records from MAF");

@@ -185,7 +188,6 @@ private void annotateRecordsWithPOST(List<MutationRecord> records, boolean reann
         for (AnnotatedRecord ar : annotatedRecords) {
             logAnnotationProgress(++annotatedVariantsCount, totalVariantsToAnnotateCount, postIntervalSize);
             mutationRecords.add(ar);
-            mutationMap.getOrDefault(ar.getTUMOR_SAMPLE_BARCODE(), new ArrayList()).add(ar);
             additionalPropertyKeys.addAll(ar.getAdditionalProperties().keySet());
             header.addAll(ar.getHeaderWithAdditionalFields());
         }

@@ -224,4 +226,17 @@ public AnnotatedRecord read() throws Exception {
         }
         return null;
     }
 
+    private void addRecordToMap(MutationRecord record) {
+        String sampleId = record.getTUMOR_SAMPLE_BARCODE();
+        List<MutationRecord> recordList = mutationMap.get(sampleId);
+        if (recordList == null) {
+            recordList = new ArrayList<MutationRecord>();
+            recordList.add(record);
+            mutationMap.put(sampleId, recordList);
+        } else {
+            recordList.add(record);
+        }
+    }
+
 }
@@ -76,8 +76,8 @@ public class GMLMutationDataReader implements ItemStreamReader<AnnotatedRecord>
     @Autowired
     private Annotator annotator;
 
-    private List<AnnotatedRecord> mutationRecords = new ArrayList();
-    private Map<String, List<AnnotatedRecord>> mutationMap = new HashMap<>();
+    private List<AnnotatedRecord> mutationRecords = new ArrayList<>();
+    private Map<String, List<MutationRecord>> mutationMap = new HashMap<>();
     private File mutationFile;
     private Set<String> additionalPropertyKeys = new LinkedHashSet<>();
     private Set<String> header = new LinkedHashSet<>();

@@ -132,8 +132,10 @@ private void loadMutationRecordsFromJson(GMLData gmlData) {
             if (samples != null && !snps.isEmpty()) {
                 for (GMLSnp snp : snps) {
                     for (String sampleId : samples) {
-                        recordsToAnnotate.add(cvrUtilities.buildGMLMutationRecord(snp, sampleId));
+                        MutationRecord to_add = cvrUtilities.buildGMLMutationRecord(snp, sampleId);
+                        recordsToAnnotate.add(to_add);
                         germlineSamples.add(sampleId);
+                        addRecordToMap(to_add);
                     }
                 }
             }

@@ -174,6 +176,7 @@ public void handleLine(String line) {
                 continue;
             }
             recordsToAnnotate.add(to_add);
+            addRecordToMap(to_add);
         }
         reader.close();
         log.info("Loaded " + String.valueOf(recordsToAnnotate.size()) + " records from MAF");

@@ -190,7 +193,6 @@ private List<AnnotatedRecord> annotateRecordsWithPOST(List<MutationRecord> recor
         for (AnnotatedRecord ar : annotatedRecords) {
             logAnnotationProgress(++annotatedVariantsCount, totalVariantsToAnnotateCount, postIntervalSize);
             mutationRecords.add(ar);
-            mutationMap.getOrDefault(ar.getTUMOR_SAMPLE_BARCODE(), new ArrayList()).add(ar);
             additionalPropertyKeys.addAll(ar.getAdditionalProperties().keySet());
             header.addAll(ar.getHeaderWithAdditionalFields());
         }

@@ -231,4 +233,15 @@ public AnnotatedRecord read() throws Exception {
         return null;
     }
 
+    private void addRecordToMap(MutationRecord record) {
+        String sampleId = record.getTUMOR_SAMPLE_BARCODE();
+        List<MutationRecord> recordList = mutationMap.get(sampleId);
+        if (recordList == null) {
+            recordList = new ArrayList<MutationRecord>();
+            recordList.add(record);
+            mutationMap.put(sampleId, recordList);
+        } else {
+            recordList.add(record);
+        }
+    }
 }
86 changes: 86 additions & 0 deletions import-scripts/remove-duplicate-maf-variants.py
@@ -0,0 +1,86 @@
#!/usr/bin/python
import sys
import optparse

# Script to remove duplicate MAF records based on the 8 key columns.
# Calculates the VAF for each duplicate record and keeps the record with the highest VAF.
# Formula: VAF = t_alt_count / (t_ref_count + t_alt_count)
# e.g., t_alt_count = 30 and t_ref_count = 70 give VAF = 30 / (70 + 30) = 0.3

ERROR_FILE = sys.stderr
OUTPUT_FILE = sys.stdout

KEY_COLUMNS_INDEX = []
KEY_COLUMNS = ['Entrez_Gene_Id','Chromosome','Start_Position','End_Position','Variant_Classification','Tumor_Seq_Allele2','Tumor_Sample_Barcode','HGVSp_Short']
MAF_DATA = {}

def remove_duplicate_variants(maf_filename, comments, header, t_refc_index, t_altc_index):
    outfile = []
    outfile.append(comments)
    outfile.append(header)
    for key in MAF_DATA:
        if len(MAF_DATA[key]) > 1:
            vaf_ind = 0
            vaf_value = 0
            for val in MAF_DATA[key]:
                # calculate the VAF for each duplicate record; float() avoids
                # Python 2 integer division, which would truncate every VAF to 0 or 1
                columns = val.rstrip('\n').split('\t')
                try:
                    VAF = float(columns[t_altc_index])/(float(columns[t_altc_index])+float(columns[t_refc_index]))
                    if VAF > vaf_value:
                        vaf_value = VAF
                        vaf_ind = MAF_DATA[key].index(val)
                except (ValueError, ZeroDivisionError):
                    print >> ERROR_FILE, 'ERROR: VAF cannot be calculated for the variant: ' + key
                    print >> ERROR_FILE, 'The t_ref_count is: ' + columns[t_refc_index] + ' and t_alt_count is: ' + columns[t_altc_index]
                    outfile.append(val)
            # keep only the duplicate record with the highest VAF
            outfile.append(MAF_DATA[key][vaf_ind])
        else:
            outfile.append(MAF_DATA[key][0])

    out_filename = maf_filename.split('.')[0] + '_merged.txt'
    datafile = open(out_filename, 'w')
    for line in outfile:
        datafile.write(line)
    datafile.close()
    print >> OUTPUT_FILE, 'MAF file with duplicate variants removed is written to: ' + out_filename + '\n'


def main():
    # get command line arguments
    parser = optparse.OptionParser()
    parser.add_option('-i', '--input-maf-file', action = 'store', dest = 'maf_file')

    (options, args) = parser.parse_args()
    maf_filename = options.maf_file

    comments = ""
    header = ""

    with open(maf_filename, 'r') as maf_file:
        for line in maf_file:
            if line.startswith('#'):
                comments += line
            elif line.startswith('Hugo_Symbol'):
                header += line
                header_cols = line.rstrip('\n').split('\t')
                # get the positions of the 8 key MAF columns
                for value in KEY_COLUMNS:
                    KEY_COLUMNS_INDEX.append(header_cols.index(value))
                t_refc_index = header_cols.index('t_ref_count')
                t_altc_index = header_cols.index('t_alt_count')
            else:
                reference_key = ""
                data = line.rstrip('\n').split('\t')
                for index in KEY_COLUMNS_INDEX:
                    reference_key += data[index] + '\t'
                reference_key = reference_key.rstrip('\t')
                if reference_key not in MAF_DATA:
                    MAF_DATA[reference_key] = [line]
                else:
                    MAF_DATA[reference_key].append(line)

    remove_duplicate_variants(maf_filename, comments, header, t_refc_index, t_altc_index)

if __name__ == '__main__':
    main()
21 changes: 21 additions & 0 deletions import-scripts/update-az-mskimpact.sh
@@ -298,6 +298,22 @@ function standardize_mutations_data() {
     $PYTHON_BINARY $PORTAL_HOME/scripts/standardize_mutations_data.py -f "$NSOUT_MUTATIONS_INPUT_FILEPATH"
 }
 
+function remove_duplicate_maf_variants() {
+    MUTATIONS_EXTD_INPUT_FILEPATH="$AZ_MSK_IMPACT_DATA_HOME/data_mutations_extended.txt"
+    NSOUT_MUTATIONS_INPUT_FILEPATH="$AZ_MSK_IMPACT_DATA_HOME/data_nonsignedout_mutations.txt"
+
+    MUTATIONS_EXTD_OUTPUT_FILEPATH="$AZ_MSK_IMPACT_DATA_HOME/data_mutations_extended_merged.txt"
+    NSOUT_MUTATIONS_OUTPUT_FILEPATH="$AZ_MSK_IMPACT_DATA_HOME/data_nonsignedout_mutations_merged.txt"
+
+    # Remove duplicate variants from MAF files
+    $PYTHON_BINARY $PORTAL_HOME/scripts/remove-duplicate-maf-variants.py -i "$MUTATIONS_EXTD_INPUT_FILEPATH" &&
+    $PYTHON_BINARY $PORTAL_HOME/scripts/remove-duplicate-maf-variants.py -i "$NSOUT_MUTATIONS_INPUT_FILEPATH" &&
+
+    # Rewrite mutation files with updated data
+    mv "$MUTATIONS_EXTD_OUTPUT_FILEPATH" "$MUTATIONS_EXTD_INPUT_FILEPATH" &&
+    mv "$NSOUT_MUTATIONS_OUTPUT_FILEPATH" "$NSOUT_MUTATIONS_INPUT_FILEPATH"
+}
+
 function standardize_structural_variant_data() {
     DATA_SV_INPUT_FILEPATH="$AZ_MSK_IMPACT_DATA_HOME/data_sv.txt"
     $PYTHON_BINARY $PORTAL_HOME/scripts/standardize_structural_variant_data.py -f "$DATA_SV_INPUT_FILEPATH"

@@ -421,6 +437,11 @@ if ! standardize_mutations_data ; then
     report_error "ERROR: Failed to standardize mutations files for AstraZeneca MSK-IMPACT. Exiting."
 fi
 
+# Remove duplicate variants from MAF files
+if ! remove_duplicate_maf_variants ; then
+    report_error "ERROR: Failed to remove duplicate variants from MAF files for AstraZeneca MSK-IMPACT. Exiting."
+fi
+
 # Standardize structural variant data by removing records with invalid genes and standardizing the file header
 if ! standardize_structural_variant_data ; then
     report_error "ERROR: Failed to standardize structural variant data for AstraZeneca MSK-IMPACT. Exiting."