Skip to content

Commit

Permalink
Adding command line exclusion lists, so that users can prune fields f…
Browse files Browse the repository at this point in the history
…rom output. (#5226)

- Added support for the `--exclude-field` parameter, which can be specified multiple times. Each field named this way is removed from the rendered output. The field name must be given exactly as it is written in the output MAF or VCF (e.g. ClinVar_ALLELEID). Closes #4359.
- Does basic sanitization of field values for MAF output, i.e. replaces tab and newline (`\n`) characters with `_%09_` and `_%0A_`. Fixes #4693
  • Loading branch information
LeeTL1220 authored Sep 29, 2018
1 parent 02117e2 commit dc59aab
Show file tree
Hide file tree
Showing 19 changed files with 594 additions and 141 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,6 @@ public interface Funcotation {
*/
void setFieldSerializationOverrideValue( final String fieldName, final String overrideValue );

/**
* Converts this {@link Funcotation} to a string suitable for insertion into a VCF file.
* The overload that takes a manual annotation string prepends that string to the value returned here.
* @return a {@link String} representing this {@link Funcotation} suitable for insertion into a VCF file.
*/
String serializeToVcfString();

/**
* @return The name of the data source behind the {@link DataSourceFuncotationFactory} used to create this {@link Funcotation}.
*/
Expand Down Expand Up @@ -76,17 +70,6 @@ default void setFieldSerializationOverrideValues(final Map<String,String> fieldS
}
}

/**
 * Serializes this {@link Funcotation} for a VCF file, prefixed with manually-provided annotations.
 *
 * The result is {@code manualAnnotationString} (or the empty string when it is {@code null}) immediately
 * followed by the value of {@link #serializeToVcfString()}.
 *
 * TODO: This interface should have nothing specific to a VCF.  Sanitizing strings should be the job of the
 * VCFOutputRenderer.  https://github.com/broadinstitute/gatk/issues/4797
 *
 * @param manualAnnotationString manually-provided annotations to write ahead of this {@link Funcotation}'s own
 *                               annotations.  May be {@code null}, in which case nothing is prepended.
 * @return a {@link String} representing this {@link Funcotation} suitable for insertion into a VCF file.
 */
default String serializeToVcfString(final String manualAnnotationString) {
    final String manualPrefix = (manualAnnotationString != null) ? manualAnnotationString : "";
    return manualPrefix + serializeToVcfString();
}

/**
* @return Return whether the field exists in this {@link Funcotation}.
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -101,4 +101,11 @@ public class FuncotatorArgumentCollection implements Serializable {
doc = "(Advanced / DO NOT USE*) If you select this flag, Funcotator will force a conversion of variant contig names from b37 to hg19. *This option is useful in integration tests (written by devs) only."
)
public boolean forceB37ToHg19ContigNameConversion = false;

// Names of output fields that must not be rendered.  Defaults to an empty set.
@Argument(
fullName = FuncotatorArgumentDefinitions.EXCLUSION_FIELDS_LONG_NAME,
optional = true,
doc = "Fields that should not be rendered in the final output. Only exact name matches will be excluded."
)
public Set<String> excludedFields = new HashSet<>();
}
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,11 @@ public class FuncotatorArgumentDefinitions {
*/
public static final String ANNOTATION_OVERRIDES_LONG_NAME = "annotation-override";

/**
* Long name of the command-line argument that names a rendered field to exclude from the final output.
* Used as the {@code fullName} of the excluded-fields argument in {@link FuncotatorArgumentCollection}.
*/
public static final String EXCLUSION_FIELDS_LONG_NAME = "exclude-field";

public static final String HG19_REFERENCE_VERSION_STRING = "hg19";
public static final String HG38_REFERENCE_VERSION_STRING = "hg38";

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,7 @@ public OutputRenderer createOutputRenderer(final LinkedHashMap<String, String> a
unaccountedForDefaultAnnotations,
unaccountedForOverrideAnnotations,
defaultToolVcfHeaderLines.stream().map(Object::toString).collect(Collectors.toCollection(LinkedHashSet::new)),
funcotatorArgs.referenceVersion);
funcotatorArgs.referenceVersion, funcotatorArgs.excludedFields);
break;

case VCF:
Expand All @@ -202,7 +202,7 @@ public OutputRenderer createOutputRenderer(final LinkedHashMap<String, String> a
headerForVariants,
unaccountedForDefaultAnnotations,
unaccountedForOverrideAnnotations,
defaultToolVcfHeaderLines
defaultToolVcfHeaderLines, funcotatorArgs.excludedFields
);
break;
default:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2077,14 +2077,25 @@ public static String[] extractFuncotatorKeysFromHeaderDescription(final String f
}

/**
* Make sure that an individual funcotation (i.e. single value of a funcotation) is sanitized for VCF consumption.
* Make sure that an individual funcotation field (i.e. single value of a funcotation) is sanitized for VCF consumption.
* Particularly, make sure that it does not allow special characters that would interfere with VCF parsing.
* @param individualFuncotation value from a funcotation Never {@code null}
* @return input string with special characters replaced by _%HEX%_ where HEX is the 2 digit ascii hex code.
* @param individualFuncotationField value from a funcotation. Never {@code null}
* @return input string with special characters replaced by _%HEX_ where HEX is the 2 digit ascii hex code (e.g. a tab becomes _%09_). Never {@code null}
*/
public static String sanitizeFuncotationForVcf(final String individualFuncotation) {
Utils.nonNull(individualFuncotation);
return StringUtils.replaceEach(individualFuncotation, new String[]{",", ";", "=", "\t", "|", " "}, new String[]{"_%2C_", "_%3B_", "_%3D_", "_%09_", "_%7C_", "_%20_"});
public static String sanitizeFuncotationFieldForVcf(final String individualFuncotationField) {
Utils.nonNull(individualFuncotationField);
return StringUtils.replaceEach(individualFuncotationField, new String[]{",", ";", "=", "\t", "|", " ", "\n"}, new String[]{"_%2C_", "_%3B_", "_%3D_", "_%09_", "_%7C_", "_%20_", "_%0A_"});
}

/**
 * Sanitizes a single funcotation field value so that it can be written into a MAF file.
 *
 * Tab and newline characters would break MAF parsing, so each is replaced by an escape of the form
 * {@code _%HEX_}, where HEX is the 2 digit ascii hex code: a tab becomes {@code _%09_} and a newline
 * becomes {@code _%0A_}.  All other characters are left untouched.
 *
 * @param individualFuncotationField value of one field from a funcotation.  Never {@code null}
 * @return the input string with tabs and newlines escaped.  Never {@code null}
 */
public static String sanitizeFuncotationFieldForMaf(final String individualFuncotationField) {
    Utils.nonNull(individualFuncotationField);

    // The two arrays are parallel: unsafeCharacters[i] is replaced by hexEscapes[i].
    final String[] unsafeCharacters = {"\t", "\n"};
    final String[] hexEscapes = {"_%09_", "_%0A_"};
    return StringUtils.replaceEach(individualFuncotationField, unsafeCharacters, hexEscapes);
}

/**
Expand Down Expand Up @@ -2175,5 +2186,24 @@ public static List<Funcotation> createFuncotations(final VariantContext vc, fina

return result;
}

/**
 * Renders the requested fields of a funcotation as one delimited, VCF-safe string.
 *
 * Values are emitted in the order given by {@link Funcotation#getFieldNames()} (not the order of
 * {@code includedFields}).  Each value is passed through
 * {@link FuncotatorUtils#sanitizeFuncotationFieldForVcf(String)} and the results are joined with
 * {@link VcfOutputRenderer#FIELD_DELIMITER}.
 *
 * NOTE(review): {@code sanitizeFuncotationFieldForVcf} rejects {@code null}, so a funcotation whose
 * {@code getField} returns {@code null} for an included field will fail here -- confirm that cannot happen.
 *
 * @param funcotation Funcotation to render for a VCF.  Never {@code null}
 * @param includedFields List of fields to include.  Any that match fields in the funcotation will be rendered.
 *                       Never {@code null}
 * @return string with the VCF representation of a {@link VcfOutputRenderer#FUNCOTATOR_VCF_FIELD_NAME} for the given
 *   Funcotation.  Never {@code null}, but empty string is possible.
 */
public static String renderSanitizedFuncotationForVcf(final Funcotation funcotation, final List<String> includedFields) {
    Utils.nonNull(funcotation);
    Utils.nonNull(includedFields);
    if (includedFields.isEmpty()) {
        return "";
    }

    // Hash the requested names once so every membership test is constant-time instead of a linear
    // scan of the list for each field of the funcotation.
    final java.util.Set<String> includedFieldSet = new java.util.HashSet<>(includedFields);

    return funcotation.getFieldNames().stream()
            .filter(includedFieldSet::contains)
            .map(field -> FuncotatorUtils.sanitizeFuncotationFieldForVcf(funcotation.getField(field)))
            .collect(Collectors.joining(VcfOutputRenderer.FIELD_DELIMITER));
}
}

Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
package org.broadinstitute.hellbender.tools.funcotator;

import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFHeaderLineType;
import htsjdk.variant.vcf.VCFInfoHeaderLine;
import org.broadinstitute.hellbender.tools.funcotator.dataSources.TableFuncotation;
import org.broadinstitute.hellbender.tools.funcotator.metadata.VcfFuncotationMetadata;
import org.broadinstitute.hellbender.utils.Utils;

import java.util.LinkedHashMap;
import java.util.List;
Expand All @@ -23,11 +29,6 @@ public abstract class OutputRenderer implements AutoCloseable {
*/
protected LinkedHashMap<String, String> manualAnnotations;

/**
* {@link String} representation of {@link OutputRenderer#manualAnnotations} serialized to the output format of this {@link OutputRenderer}.
*/
protected String manualAnnotationSerializedString;

/**
* {@link List} of the {@link DataSourceFuncotationFactory} objects that are being used in this run of {@link Funcotator}.
*/
Expand Down Expand Up @@ -56,4 +57,26 @@ public String getDataSourceInfoString() {
* @param txToFuncotationMap {@link FuncotationMap} to add to the given {@code variant} on output.
*/
public abstract void write(final VariantContext variant, final FuncotationMap txToFuncotationMap);

/**
 * Utility for output renderers: wraps a map of field:value pairs (String:String) in a funcotation.
 *
 * One {@link VCFInfoHeaderLine} of type String and count 1 is generated per map entry, with a generic description
 * built from the entry itself, and these header lines become the metadata of the returned funcotation.
 *
 * @param data field names mapped to field values.  Never {@code null}
 * @param altAllele alternate allele to associate with the funcotation.  Never {@code null}
 * @param datasourceName Name to use as the datasource.  Never {@code null}
 * @return A funcotation with all fields considered strings and a generic description.  Funcotation metadata is populated.
 *  Never {@code null}
 */
public static Funcotation createFuncotationFromLinkedHashMap(final LinkedHashMap<String, String> data, final Allele altAllele, final String datasourceName) {
    Utils.nonNull(data);
    Utils.nonNull(altAllele);
    Utils.nonNull(datasourceName);

    final List<VCFInfoHeaderLine> headerLinesForFields = data.entrySet().stream()
            .map(entry -> {
                final String fieldName = entry.getKey();
                final String description = "Specified from map: " + fieldName + ":" + entry.getValue();
                return new VCFInfoHeaderLine(fieldName, 1, VCFHeaderLineType.String, description);
            })
            .collect(Collectors.toList());

    return TableFuncotation.create(data, altAllele, datasourceName, VcfFuncotationMetadata.create(headerLinesForFields));
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,10 @@
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.funcotator.Funcotation;
import org.broadinstitute.hellbender.tools.funcotator.FuncotatorUtils;
import org.broadinstitute.hellbender.tools.funcotator.dataSources.xsv.LocatableXsvFuncotationFactory;
import org.broadinstitute.hellbender.tools.funcotator.dataSources.xsv.SimpleKeyXsvFuncotationFactory;
import org.broadinstitute.hellbender.tools.funcotator.metadata.FuncotationMetadata;
import org.broadinstitute.hellbender.tools.funcotator.metadata.FuncotationMetadataUtils;
import org.broadinstitute.hellbender.tools.funcotator.vcfOutput.VcfOutputRenderer;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.codecs.xsvLocatableTable.XsvTableFeature;

Expand Down Expand Up @@ -101,13 +99,6 @@ public void setFieldSerializationOverrideValue(final String fieldName, final Str
fieldMap.put(fieldName, overrideValue);
}

/**
 * Serializes every field value, in field-map order, into one VCF-safe string.
 * A {@code null} value is written as the empty string; all other values are sanitized for VCF first.
 * Values are joined with {@link VcfOutputRenderer#FIELD_DELIMITER}.
 */
@Override
public String serializeToVcfString() {
    return fieldMap.values().stream()
            .map(value -> (value != null) ? FuncotatorUtils.sanitizeFuncotationForVcf(value) : "")
            .collect(Collectors.joining(VcfOutputRenderer.FIELD_DELIMITER));
}

@Override
public LinkedHashSet<String> getFieldNames() {
return new LinkedHashSet<>(fieldMap.keySet());
Expand Down Expand Up @@ -216,6 +207,21 @@ public static TableFuncotation create(final Map<String, Object> data, final Alle
return create(fieldNames, fieldValues, altAllele, dataSourceName, metadata);
}

/**
 * Creates a {@link TableFuncotation} from a map of field names to field values.
 *
 * The map is split into a list of names (in the map's key iteration order) and a parallel list of the
 * corresponding values, which are then handed to
 * {@link TableFuncotation#create(List, List, Allele, String, FuncotationMetadata)}.
 *
 * @param data Map for field name to field value.  Never {@code null}
 * @param altAllele See {@link TableFuncotation#create(List, List, Allele, String, FuncotationMetadata)}
 * @param dataSourceName See {@link TableFuncotation#create(List, List, Allele, String, FuncotationMetadata)}
 * @param metadata See {@link TableFuncotation#create(List, List, Allele, String, FuncotationMetadata)}
 * @return See {@link TableFuncotation#create(List, List, Allele, String, FuncotationMetadata)}
 */
public static TableFuncotation create(final LinkedHashMap<String, String> data, final Allele altAllele, final String dataSourceName, final FuncotationMetadata metadata ) {
    final List<String> fieldNames = new ArrayList<>(data.keySet());

    // Look each value up by its name so the two lists stay index-aligned.
    final List<String> fieldValues = new ArrayList<>(fieldNames.size());
    for (final String fieldName : fieldNames) {
        fieldValues.add(data.get(fieldName));
    }

    return create(fieldNames, fieldValues, altAllele, dataSourceName, metadata);
}

/**
* See {@link TableFuncotation#create(List, List, Allele, String, FuncotationMetadata)}
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,6 @@ public Allele getAltAllele() {
return Allele.create(tumorSeqAllele2.getBytes(), false);
}

@Override
public String serializeToVcfString() {
// Alias for the FIELD_DELIMITER so we can have nicer looking code:
final String DELIMITER = VcfOutputRenderer.FIELD_DELIMITER;
Expand Down Expand Up @@ -212,7 +211,7 @@ public String serializeToVcfString() {
(otherTranscriptsSerializedOverride != null ? otherTranscriptsSerializedOverride : (otherTranscripts != null ? otherTranscripts.stream().map(Object::toString).collect(Collectors.joining(VcfOutputRenderer.OTHER_TRANSCRIPT_DELIMITER)) : ""))
);

return funcotations.stream().map(FuncotatorUtils::sanitizeFuncotationForVcf).collect(Collectors.joining(DELIMITER));
return funcotations.stream().map(FuncotatorUtils::sanitizeFuncotationFieldForVcf).collect(Collectors.joining(DELIMITER));
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ public GencodeFuncotationBuilder(final GencodeFuncotation gf) {
}

/**
 * Finishes construction of the funcotation held by this builder.
 *
 * Sets the funcotation's metadata to the result of
 * {@code FuncotationMetadataUtils.createWithUnknownAttributes} applied to its field names, then returns it.
 *
 * @return the {@link GencodeFuncotation} held by this builder, with its metadata set.
 */
public GencodeFuncotation build() {
    // TODO: In the future, we will need a mechanism for populating the metadata, especially if we want to support separate INFO fields in a VCF for each funcotation field (https://github.com/broadinstitute/gatk/issues/4857)
    final ArrayList<String> fieldNames = new ArrayList<>(gencodeFuncotation.getFieldNames());
    gencodeFuncotation.setMetadata(FuncotationMetadataUtils.createWithUnknownAttributes(fieldNames));
    return gencodeFuncotation;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import org.apache.commons.lang.StringUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.funcotator.*;
import org.broadinstitute.hellbender.tools.funcotator.dataSources.gencode.GencodeFuncotation;
Expand All @@ -23,6 +24,7 @@
import java.nio.file.Path;
import java.text.SimpleDateFormat;
import java.util.*;
import java.util.function.Function;
import java.util.stream.Collectors;

/**
Expand Down Expand Up @@ -144,6 +146,9 @@ public class MafOutputRenderer extends OutputRenderer {
/** The version of the reference used to create annotations that will be output by this {@link MafOutputRenderer}.*/
private final String referenceVersion;

/** Fields that should be removed in the final MAF file. */
private final Set<String> excludedOutputFields;

//==================================================================================================================
// Constructors:

Expand All @@ -163,7 +168,7 @@ public MafOutputRenderer(final Path outputFilePath,
final LinkedHashMap<String, String> unaccountedForDefaultAnnotations,
final LinkedHashMap<String, String> unaccountedForOverrideAnnotations,
final Set<String> toolHeaderLines,
final String referenceVersion) {
final String referenceVersion, final Set<String> excludedOutputFields) {

// Set our internal variables from the input:
this.outputFilePath = outputFilePath;
Expand Down Expand Up @@ -253,16 +258,15 @@ public MafOutputRenderer(final Path outputFilePath,
defaultMap.put(MafOutputRendererConstants.FieldName_Score, MafOutputRendererConstants.UNUSED_STRING);
defaultMap.put(MafOutputRendererConstants.FieldName_BAM_File, MafOutputRendererConstants.UNUSED_STRING);

// Cache the manual annotation string so we can pass it easily into any Funcotations:
manualAnnotationSerializedString = (manualAnnotations.size() != 0 ? MafOutputRendererConstants.FIELD_DELIMITER + String.join( MafOutputRendererConstants.FIELD_DELIMITER, manualAnnotations.values() ) + MafOutputRendererConstants.FIELD_DELIMITER : "");

// Open the output object:
try {
printWriter = new PrintWriter(Files.newOutputStream(outputFilePath));
}
catch (final IOException ex) {
throw new UserException("Error opening output file path: " + outputFilePath.toUri().toString(), ex);
}

this.excludedOutputFields = excludedOutputFields;
}

//==================================================================================================================
Expand Down Expand Up @@ -327,12 +331,13 @@ public void write(final VariantContext variant, final FuncotationMap txToFuncota
writeString(entry.getValue());
writeString(MafOutputRendererConstants.FIELD_DELIMITER);
}
writeLine(manualAnnotationSerializedString);
writeLine("");
}
}
}

private LinkedHashMap<String, String> createMafCompliantOutputMap(final Allele altAllele, final List<Funcotation> funcotations) {
@VisibleForTesting
LinkedHashMap<String, String> createMafCompliantOutputMap(final Allele altAllele, final List<Funcotation> funcotations) {
// Create our output maps:
final LinkedHashMap<String, Object> outputMap = new LinkedHashMap<>(defaultMap);
final LinkedHashMap<String, Object> extraFieldOutputMap = new LinkedHashMap<>();
Expand Down Expand Up @@ -366,7 +371,17 @@ private LinkedHashMap<String, String> createMafCompliantOutputMap(final Allele a
outputMap.putAll(extraFieldOutputMap);

// Now translate fields to the field names that MAF likes:
return replaceFuncotationValuesWithMafCompliantValues(outputMap);
final LinkedHashMap<String, String> mafCompliantMap = replaceFuncotationValuesWithMafCompliantValues(outputMap);

// Remove any fields that are excluded and sanitize any field values.
return mafCompliantMap.keySet().stream()
.filter(k -> !excludedOutputFields.contains(k))
.collect(Collectors.toMap(
Function.identity(), k -> FuncotatorUtils.sanitizeFuncotationFieldForMaf(mafCompliantMap.get(k)),
(u, v) -> {
throw new GATKException.ShouldNeverReachHereException("Found duplicate keys for MAF output");
},
LinkedHashMap::new));
}

//==================================================================================================================
Expand Down
Loading

0 comments on commit dc59aab

Please sign in to comment.