From 6f98aa4e5b8152cc9031848080ca0c0c36f72606 Mon Sep 17 00:00:00 2001 From: James Date: Fri, 24 Aug 2018 13:04:28 -0400 Subject: [PATCH 01/15] Added RevertSamSpark, a replacement for the RevertSam tool that allows for removal of alignment information and reversion to the original base qualities for reads. --- .../cmdline/StandardArgumentDefinitions.java | 3 + .../tools/spark/RevertSamSpark.java | 706 ++++++++++++++++++ .../tools/spark/pipelines/SortSamSpark.java | 4 +- .../MarkDuplicatesSparkUtils.java | 51 +- .../utils/codecs/table/TableCodec.java | 44 +- .../hellbender/utils/read/GATKRead.java | 25 +- .../read/SAMRecordToGATKReadAdapter.java | 8 + .../runtime/StreamingProcessController.java | 2 +- .../hellbender/utils/spark/SparkUtils.java | 55 +- ...andLineProgramExecutorIntegrationTest.java | 2 +- .../spark/RevertSamSparkIntegrationTest.java | 434 +++++++++++ .../tools/spark/RevertSamSparkUnitTest.java | 84 +++ .../SortSamSparkIntegrationTest.java | 7 +- .../MarkDuplicatesSparkUtilsUnitTest.java | 3 +- ...tMarkDuplicatesCommandLineProgramTest.java | 2 +- .../codecs/table/TableCodecUnitTest.java | 8 +- .../haplotype/HaplotypeBAMWriterUnitTest.java | 8 +- .../tools/spark/RevertSamSpark/test.dict | 9 + .../spark/revertsamspark/missing-rg-info.sam | 243 ++++++ .../revert_sam_bad_header_output_map.txt | 3 + .../spark/revertsamspark/revert_sam_basic.sam | 22 + .../revert_sam_sample_library_override.sam | 21 + .../revertsamspark/revert_sam_single_end.sam | 5 + .../revert_sam_valid_output_map.txt | 3 + .../tools/spark/revertsamspark/test.fasta | 40 + .../testutils/SamAssertionUtils.java | 6 +- 26 files changed, 1714 insertions(+), 84 deletions(-) create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/spark/RevertSamSpark.java create mode 100644 src/test/java/org/broadinstitute/hellbender/tools/spark/RevertSamSparkIntegrationTest.java create mode 100644 src/test/java/org/broadinstitute/hellbender/tools/spark/RevertSamSparkUnitTest.java create mode 100644 
src/test/resources/org/broadinstitute/hellbender/tools/spark/RevertSamSpark/test.dict create mode 100644 src/test/resources/org/broadinstitute/hellbender/tools/spark/revertsamspark/missing-rg-info.sam create mode 100644 src/test/resources/org/broadinstitute/hellbender/tools/spark/revertsamspark/revert_sam_bad_header_output_map.txt create mode 100755 src/test/resources/org/broadinstitute/hellbender/tools/spark/revertsamspark/revert_sam_basic.sam create mode 100755 src/test/resources/org/broadinstitute/hellbender/tools/spark/revertsamspark/revert_sam_sample_library_override.sam create mode 100755 src/test/resources/org/broadinstitute/hellbender/tools/spark/revertsamspark/revert_sam_single_end.sam create mode 100644 src/test/resources/org/broadinstitute/hellbender/tools/spark/revertsamspark/revert_sam_valid_output_map.txt create mode 100644 src/test/resources/org/broadinstitute/hellbender/tools/spark/revertsamspark/test.fasta diff --git a/src/main/java/org/broadinstitute/hellbender/cmdline/StandardArgumentDefinitions.java b/src/main/java/org/broadinstitute/hellbender/cmdline/StandardArgumentDefinitions.java index d8bd4ba172e..e3d7c04c5a3 100644 --- a/src/main/java/org/broadinstitute/hellbender/cmdline/StandardArgumentDefinitions.java +++ b/src/main/java/org/broadinstitute/hellbender/cmdline/StandardArgumentDefinitions.java @@ -40,6 +40,8 @@ private StandardArgumentDefinitions(){} public static final String PEDIGREE_FILE_LONG_NAME = "pedigree"; public static final String SITES_ONLY_LONG_NAME = "sites-only-vcf-output"; public static final String INVALIDATE_PREVIOUS_FILTERS_LONG_NAME = "invalidate-previous-filters"; + public static final String SORT_ORDER_LONG_NAME = "sort-order"; + public static final String INPUT_SHORT_NAME = "I"; public static final String OUTPUT_SHORT_NAME = "O"; @@ -67,6 +69,7 @@ private StandardArgumentDefinitions(){} public static final String ANNOTATIONS_TO_EXCLUDE_SHORT_NAME = "AX"; public static final String SAMPLE_NAME_SHORT_NAME = "sn"; 
public static final String PEDIGREE_FILE_SHORT_NAME = "ped"; + public static final String SORT_ORDER_SHORT_NAME = "SO"; public static final String SPARK_PROPERTY_NAME = "conf"; diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/RevertSamSpark.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/RevertSamSpark.java new file mode 100644 index 00000000000..0811f38948f --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/RevertSamSpark.java @@ -0,0 +1,706 @@ +package org.broadinstitute.hellbender.tools.spark; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.collect.LinkedListMultimap; +import com.google.common.collect.ListMultimap; +import com.google.common.collect.Lists; +import htsjdk.samtools.*; +import htsjdk.samtools.util.*; +import htsjdk.tribble.AbstractFeatureReader; +import htsjdk.tribble.FeatureReader; +import org.apache.spark.api.java.JavaPairRDD; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.broadcast.Broadcast; +import org.broadinstitute.barclay.argparser.Argument; +import org.broadinstitute.barclay.argparser.BetaFeature; +import org.broadinstitute.barclay.argparser.CommandLineParser; +import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; +import org.broadinstitute.barclay.help.DocumentedFeature; +import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; +import org.broadinstitute.hellbender.engine.filters.ReadFilter; +import org.broadinstitute.hellbender.engine.filters.ReadFilterLibrary; +import org.broadinstitute.hellbender.engine.spark.GATKSparkTool; +import org.broadinstitute.hellbender.exceptions.GATKException; +import org.broadinstitute.hellbender.utils.Utils; +import org.broadinstitute.hellbender.utils.codecs.table.TableCodec; +import org.broadinstitute.hellbender.utils.codecs.table.TableFeature; +import org.broadinstitute.hellbender.utils.read.GATKRead; 
+import org.broadinstitute.hellbender.utils.read.ReadUtils; +import org.broadinstitute.hellbender.utils.spark.SparkUtils; +import picard.cmdline.programgroups.ReadDataManipulationProgramGroup; +import scala.Tuple2; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.*; +import java.util.stream.Collectors; + +/** + * Reverts a SAM file by optionally restoring original quality scores and by removing + * all alignment information. + *

+ *

+ * This tool removes or restores certain properties of the SAM records, including alignment information. + * It can be used to produce an unmapped BAM (uBAM) from a previously aligned BAM. It is also capable of + * restoring the original quality scores of a BAM file that has already undergone base quality score recalibration + * (BQSR) if the original qualities were retained during the calibration (in the OQ tag). + *

+ *

Usage Examples

+ *

Output to a single file

+ *
+ * gatk RevertSamSpark \\
+ *      -I input.bam \\
+ *      -O reverted.bam
+ * 
+ *

+ *

Output by read group into multiple files with an output map

+ *
+ * gatk RevertSamSpark \\
+ *      -I input.bam \\
+ *      --output-by-readgroup \\
+ *      --output-map reverted_bam_paths.tsv
+ * 
+ *

+ *

Output by read group with no output map

+ *
+ * gatk RevertSamSpark \\
+ *      -I input.bam \\
+ *      --output-by-readgroup \\
+ *      -O /write/reverted/read/group/bams/in/this/dir
+ * 
+ * This will output one BAM file per read group (the format can be overridden with the --output-by-readgroup-file-format option). + *
+ * Note: If the program fails due to a SAM validation error, consider setting the VALIDATION_STRINGENCY option to + * LENIENT or SILENT if the failures are expected to be obviated by the reversion process + * (e.g. invalid alignment information will be obviated when the dontRemoveAlignmentInformation option is used). + */ + +@DocumentedFeature +@CommandLineProgramProperties( + summary =RevertSamSpark.USAGE_DETAILS, + oneLineSummary =RevertSamSpark.USAGE_SUMMARY, + programGroup = ReadDataManipulationProgramGroup.class) +@BetaFeature +public class RevertSamSpark extends GATKSparkTool { + static final String USAGE_SUMMARY = "Reverts SAM or BAM files to a previous state. "; + static final String USAGE_DETAILS = "This tool removes or restores certain properties of the SAM records, including alignment " + + "information, which can be used to produce an unmapped BAM (uBAM) from a previously aligned BAM. It is also capable of " + + "restoring the original quality scores of a BAM file that has already undergone base quality score recalibration (BQSR) if the" + + "original qualities were retained.\n" + + "

Examples

\n" + + "

Example with single output:

\n" + + "java -jar picard.jar RevertSam \\\n" + + " -I input.bam \\\n" + + " -O reverted.bam\n" + + "\n" + + "

Example outputting by read group with output map:

\n" + + "java -jar picard.jar RevertSam \\\n" + + " -I input.bam \\\n" + + " --output-by-readgroup \\\n" + + " --output-map reverted_bam_paths.tsv\n" + + "\n" + + "Will output a BAM/SAM file per read group.\n" + + "

Example outputting by read group without output map:

\n" + + "java -jar picard.jar RevertSam \\\n" + + " I=input.bam \\\n" + + " --output-by-readgroup \\\n" + + " -O /write/reverted/read/group/bams/in/this/dir\n" + + "\n" + + "Will output a BAM file per read group." + + " Output format can be overridden with the outputByReadgroupFileFormat option.\n" + + "Note: If the program fails due to a SAM validation error, consider setting the VALIDATION_STRINGENCY option to " + + "LENIENT or SILENT if the failures are expected to be obviated by the reversion process " + + "(e.g. invalid alignment information will be obviated when the dontRemoveAlignmentInformation option is used).\n" + + ""; + @Override + public boolean requiresReads() { return true; } + + @Argument(mutex = {OUTPUT_MAP_ARG}, shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, doc = "The output SAM/BAM file to create, or an output directory if '--output-by-readgroup' is set.") + public File output; + + public static final String OUTPUT_MAP_ARG = "output-map"; + @Argument(mutex = {StandardArgumentDefinitions.OUTPUT_LONG_NAME}, fullName = OUTPUT_MAP_ARG, doc = "Tab separated file with two columns, READ_GROUP_ID and OUTPUT, providing file mapping only used if '--output-by-readgroup' is set.") + public File outputMap; + + public static final String OUTPUT_BY_READGROUP_ARG = "output-by-readgroup"; + @Argument(fullName = OUTPUT_BY_READGROUP_ARG, doc = "When true, outputs each read group in a separate file.") + public boolean outputByReadGroup = false; + + public static enum FileType implements CommandLineParser.ClpEnum { + sam("Generate SAM files."), + bam("Generate BAM files."), + cram("Generate CRAM files."), + dynamic("Generate files based on the extention of input."); + + final String description; + + FileType(String descrition) { + this.description = descrition; + } + + @Override + public String getHelpDoc() { + return description; + } + } + + @Argument(doc = "WARNING: This option is potentially 
destructive. If enabled will discard reads in order to produce " + + "a consistent output BAM. Reads discarded include (but are not limited to) paired reads with missing " + + "mates, duplicated records, records with mismatches in length of bases and qualities. This option should " + + "only be enabled if the output sort order is queryname and will always cause sorting to occur.") + public boolean sanitize = false; + + @Argument(doc = "If 'sanitize' only one record when we find more than one record with the same name for R1/R2/unpaired reads respectively. " + + "For paired end reads, keeps only the first R1 and R2 found respectively, and discards all unpaired reads. " + + "Duplicates do not refer to the duplicate flag in the FLAG field, but instead reads with the same name.", + fullName = "keep-first-duplicate") + public boolean keepFirstDuplicate = false; + + public static final String OUTPUT_BY_READGROUP_FILE_FORMAT_ARG = "output-by-readgroup-file-format"; + @Argument(fullName = OUTPUT_BY_READGROUP_FILE_FORMAT_ARG, doc = "When using outputByReadGroup, the output file format can be set to a certain format.") + public FileType outputByReadgroupFileFormat = FileType.dynamic; + + @Argument(shortName = StandardArgumentDefinitions.SORT_ORDER_SHORT_NAME, fullName = StandardArgumentDefinitions.SORT_ORDER_LONG_NAME, doc = "The sort order to create the reverted output file with, defaults to whatever is specified in the current file", optional = true) + public SAMFileHeader.SortOrder sortOrder = SAMFileHeader.SortOrder.queryname; + + public static final String DONT_RESTORE_ORIGINAL_QUALITIES_ARG = "dont-restore-original-qualities"; + @Argument( fullName = DONT_RESTORE_ORIGINAL_QUALITIES_ARG, doc = "Set to prevent the tool from setting the OQ field to the QUAL where avalible.", optional = true) + public boolean dontRestoreOriginalQualities = false; + + public static final String DONT_REMOVE_DUPLICATE_INFORMATION_ARG = "remove-duplicate-information"; + @Argument(fullName = 
DONT_REMOVE_DUPLICATE_INFORMATION_ARG, doc = "By default we remove duplicate read flags from all reads. Note that if this is true, " + + " the output may have the unusual but sometimes desirable trait of having unmapped reads that are marked as duplicates.") + public boolean dontRemoveDuplicateInformation = false; //TODO flip this value + + public static final String DONT_REMOVE_ALIGNMENT_INFORMATION_ARG = "remove-alignment-information"; + @Argument(fullName = DONT_REMOVE_ALIGNMENT_INFORMATION_ARG, doc = "Remove all alignment information from the file.") + public boolean dontRemoveAlignmentInformation = false; + + public static final String ATTRIBUTE_TO_CLEAR_ARG = "attributes-to-clear"; + @Argument(fullName = ATTRIBUTE_TO_CLEAR_ARG,doc = "When removing alignment information, the set of optional tags to remove.", optional = true) + public List attributesToClear = new ArrayList(); + + public static final String REMOVE_DEFAULT_ATTRIBUTE_TO_CLEAR_ARG = "remove-default-attributes-to-clear"; + @Argument(fullName = REMOVE_DEFAULT_ATTRIBUTE_TO_CLEAR_ARG,doc = "When removing alignment information, the set of optional tags to remove.", optional = true) + public boolean removeDefaults = false; + + public static List DEFAULT_ATTRIBUTES_TO_CLEAR = new ArrayList() {{ + add(SAMTag.NM.name()); + add(SAMTag.UQ.name()); + add(SAMTag.PG.name()); + add(SAMTag.MD.name()); + add(SAMTag.MQ.name()); + add(SAMTag.SA.name()); // Supplementary alignment metadata + add(SAMTag.MC.name()); // Mate Cigar + add(SAMTag.AS.name()); + }}; + + public static final String SAMPLE_ALIAS_ARG = "sample-alias"; + @Argument(fullName = SAMPLE_ALIAS_ARG, doc = "The sample alias to use in the reverted output file. 
This will override the existing " + + "sample alias in the file and is used only if all the read groups in the input file have the " + + "same sample alias.", shortName = StandardArgumentDefinitions.SAMPLE_ALIAS_SHORT_NAME, optional = true) + public String sampleAlias; + + public static final String LIBRARY_NAME_ARG = "library-name"; + @Argument(fullName = LIBRARY_NAME_ARG, doc = "The library name to use in the reverted output file. This will override the existing " + + "sample alias in the file and is used only if all the read groups in the input file have the " + + "same library name.", optional = true) + public String libraryName; + + @Override + public List getDefaultReadFilters() { + return Collections.singletonList(ReadFilterLibrary.ALLOW_ALL_READS); + } + + /** + * Enforce that output ordering is queryname when sanitization is turned on since it requires a queryname sort. + */ + @Override + protected String[] customCommandLineValidation() { + final List errors = new ArrayList<>(); + RevertSamSpark.ValidationUtil.validateOutputParams(outputByReadGroup, output, outputMap, errors); + + if (!sanitize && keepFirstDuplicate) errors.add("'keepFirstDuplicate' cannot be used without 'sanitize'"); + + if (!errors.isEmpty()) { + return errors.toArray(new String[errors.size()]); + } + return null; + } + + @Override + protected void runTool(JavaSparkContext ctx) { + Broadcast headerb = ctx.broadcast(getHeaderForReads()); + JavaRDD reads = getReads(); + + //////////////////////////////////////////////////////////////////////////// + // Grab the input header and remap values where appropriate + //////////////////////////////////////////////////////////////////////////// + SAMFileHeader inHeader = getHeaderForReads(); + ValidationUtil.validateHeaderOverrides(inHeader, sampleAlias, libraryName); + if (sampleAlias != null) { + inHeader.getReadGroups().forEach(rg -> rg.setSample(sampleAlias)); + } + if (libraryName != null) { + inHeader.getReadGroups().forEach(rg -> 
rg.setLibrary(libraryName)); + } + + //////////////////////////////////////////////////////////////////////////// + // Map the readgroups in the header to appropriate + //////////////////////////////////////////////////////////////////////////// + Map writerMap = getOutputMap(outputMap, + output, + getDefaultExtension(readArguments.getReadFiles().get(0).toString(), outputByReadgroupFileFormat), + inHeader.getReadGroups(), + outputByReadGroup); + + //////////////////////////////////////////////////////////////////////////// + // Construct appropriate headers for the output files + //////////////////////////////////////////////////////////////////////////// + final Map headerMap = getReadgroupHeaderMap(inHeader, writerMap); + + // Revert the reads based on the given attributes + List attributesToRevert = removeDefaults? DEFAULT_ATTRIBUTES_TO_CLEAR: new ArrayList<>(); + attributesToRevert.addAll(attributesToClear); + JavaRDD readsReverted = revertReads(reads, attributesToRevert); + + //////////////////////////////////////////////////////////////////////////// + // Sanitize the reads, sorting them into appropriate order if necessary + //////////////////////////////////////////////////////////////////////////// + if (sanitize) { + Map readGroupFormatMap = createReadGroupFormatMap(readsReverted, headerb, !dontRestoreOriginalQualities); + + readsReverted = sanitize(readGroupFormatMap, readsReverted, inHeader, keepFirstDuplicate); + } + + // Write the one or many read output files + for (Map.Entry rmap: writerMap.entrySet()) { + //TODO what to do if the readgroup isn't present + final String key = rmap.getKey(); + JavaRDD filteredreads = rmap.getKey()==null? 
readsReverted : + readsReverted.filter(r -> r.getReadGroup().equals(key)); + writeReads(ctx, rmap.getValue().getPath(), filteredreads, headerMap.get(rmap.getKey())); //TODO proper header map + } + } + + /** + * Runs the QualityEncodingDetector over a sampling of each readGroup present in the file to detect what the encoding format + * for base quality those reads are. + * + * @param reads Reads RDD over which to iterate and detect readgroups + * @param inHeader Header describing the readgroups present in the bam + * @param restoreOriginalQualities Whether to use the OQ tag for determining the map + * @return + */ + private Map createReadGroupFormatMap( final JavaRDD reads, + final Broadcast inHeader, + final boolean restoreOriginalQualities) { + final Map output = new HashMap<>(); + + inHeader.getValue().getReadGroups().stream().parallel().forEach(rg -> { + // For each readgroup filter down to just the reads in that group + final String key = rg.getId(); + JavaRDD filtered = reads.filter(r -> r.getReadGroup().equals(key)); + + if (!filtered.isEmpty()) { + + // take the number of reads required by QualityEncodingDetector to determine quality score map + CloseableIterator iterator = new CloseableIterator() { + Iterator delegateIterator = filtered.take((int) QualityEncodingDetector.DEFAULT_MAX_RECORDS_TO_ITERATE).iterator(); + + @Override + public void close() { + delegateIterator = null; + } + + @Override + public boolean hasNext() { + return delegateIterator != null && delegateIterator.hasNext(); + } + + @Override + public SAMRecord next() { + if (!hasNext()) { + throw new NoSuchElementException("hasNext should be called before next"); + } + return delegateIterator.next().convertToSAMRecord(inHeader.getValue()); + } + }; + + // Save what the quality format for each readgroup was. 
+ output.put(rg.getId(), QualityEncodingDetector.detect( + QualityEncodingDetector.DEFAULT_MAX_RECORDS_TO_ITERATE, + iterator, + restoreOriginalQualities)); + } + }); + return output; + } + + /** + * If this is run, we want to be careful to remove duplicated reads from the bam. + * + * In order to do this we group each read by its readname and randomly select one read labeled as first in pair + * and one read labled as second in pair to treat as the representative reads, throwing away the rest. + */ + private JavaRDD sanitize(final Map readGroupToFormat, final JavaRDD reads, final SAMFileHeader header, final boolean keepFirstDuplicate) { + JavaRDD sortedReads = querynameSortReadsIfNecessary(reads.filter(r -> r.getBases().length == r.getBaseQualities().length), getRecommendedNumReducers(), header); + JavaPairRDD> readsByGroup = spanReadsByKey(sortedReads); + + return readsByGroup.flatMap(group -> { + final List out = Lists.newArrayList(); + + List primaryReads = Utils.stream(group._2()).collect(Collectors.toList()); + + // Get the number of R1s, R2s, and unpaired reads respectively. + int firsts = 0, seconds = 0, unpaired = 0; + GATKRead firstRecord = null, secondRecord = null, unpairedRecord = null; + for (final GATKRead rec : primaryReads) { + if (!rec.isPaired()) { + if (unpairedRecord == null) { + unpairedRecord = rec; + } + ++unpaired; + } else { + if (rec.isFirstOfPair()) { + if (firstRecord == null) { + firstRecord = rec; + } + ++firsts; + } + if (rec.isSecondOfPair()) { + if (secondRecord == null) { + secondRecord = rec; + } + ++seconds; + } + } + } + + // If we have paired reads, then check that there is exactly one first of pair and one second of pair. + // Otherwise, check that we have only one unpaired read. 
+ if (firsts > 0 || seconds > 0) { // if we have any paired reads + if (firsts != 1 || seconds != 1) { // if we do not have exactly one R1 and one R2 + if (keepFirstDuplicate && firsts >= 1 && seconds >= 1) { // if we have at least one R1 and one R2, we can discard all but an arbitrary one + primaryReads = Arrays.asList(firstRecord, secondRecord); + } + // Otherwise don't admit anything from this group + else { + return out.iterator(); + } + } + } else if (unpaired > 1) { // only unpaired reads, and we have too many + if (keepFirstDuplicate) { + primaryReads = Collections.singletonList(unpairedRecord); + } + // Otherwise remove these reads entirely + else { + return out.iterator(); + } + } + + // If we've made it this far spit the records into the output! + for (final GATKRead rec : primaryReads) { + // The only valid quality score encoding scheme is standard; if it's not standard, change it. + final FastqQualityFormat recordFormat = readGroupToFormat.get(rec.getReadGroup()); + if (recordFormat != null && !recordFormat.equals(FastqQualityFormat.Standard)) { + final byte[] quals = rec.getBaseQualities(); + for (int i = 0; i < quals.length; i++) { + quals[i] -= SolexaQualityConverter.ILLUMINA_TO_PHRED_SUBTRAHEND; + } + rec.setBaseQualities(quals); + } + out.add(rec); + } + return out.iterator(); + }); + + } + + /** + * Method which takes an RDD of reads that is guaranteed to have every readname group placed together on the same + * partition and maps those so a JavaPairRDD with the readname as the key. 
+ */ + private static JavaPairRDD> spanReadsByKey(final JavaRDD reads) { + JavaPairRDD nameReadPairs = reads.mapToPair(read -> new Tuple2<>(read.getName(), read)); + return SparkUtils.spanByKey(nameReadPairs).flatMapToPair(namedRead -> { + // for each name, separate reads by key (group name) + List>> out = Lists.newArrayList(); + ListMultimap multi = LinkedListMultimap.create(); + for (GATKRead read : namedRead._2()) { + multi.put(read.getName(), read); + } + for (String key : multi.keySet()) { + // list from Multimap is not serializable by Kryo, so put in a new array list + out.add(new Tuple2<>(key, Lists.newArrayList(multi.get(key)))); + } + return out.iterator(); + }); + } + + private static JavaRDD querynameSortReadsIfNecessary(JavaRDD reads, int numReducers, SAMFileHeader headerForTool) { + JavaRDD sortedReadsForMarking; + if (ReadUtils.isReadNameGroupedBam(headerForTool)) { + sortedReadsForMarking = reads; + } else { + headerForTool.setSortOrder(SAMFileHeader.SortOrder.queryname); + sortedReadsForMarking = SparkUtils.sortReadsAccordingToHeader(reads, headerForTool, numReducers); + } + return sortedReadsForMarking; + } + + + private Map getReadgroupHeaderMap(SAMFileHeader inHeader, Map writerMap) { + final Map headerMap; + if (outputByReadGroup) { + if (inHeader.getReadGroups().isEmpty()) { + throw new GATKException("The header is missing its read group map"); + } + + ValidationUtil.assertAllReadGroupsMapped(writerMap, inHeader.getReadGroups()); + headerMap = new HashMap<>(); + for (final SAMReadGroupRecord readGroup : inHeader.getReadGroups()) { + final SAMFileHeader header = createOutHeader(inHeader, sortOrder, !dontRemoveAlignmentInformation); + header.addReadGroup(readGroup); + headerMap.put(readGroup.getId(), header); + } + } else { + final SAMFileHeader singleOutHeader = createOutHeader(inHeader, sortOrder, !dontRemoveAlignmentInformation); + inHeader.getReadGroups().forEach(singleOutHeader::addReadGroup); + headerMap = 
Collections.singletonMap(null,singleOutHeader); + } + return headerMap; + } + + private SAMFileHeader createOutHeader( + final SAMFileHeader inHeader, + final SAMFileHeader.SortOrder sortOrder, + final boolean removeAlignmentInformation) { + final SAMFileHeader outHeader = new SAMFileHeader(); + outHeader.setSortOrder(sortOrder); + if (!removeAlignmentInformation) { + outHeader.setSequenceDictionary(inHeader.getSequenceDictionary()); + outHeader.setProgramRecords(inHeader.getProgramRecords()); + } + return outHeader; + } + + @VisibleForTesting + static String getDefaultExtension(final String input, final FileType setting) { + if (setting == FileType.dynamic) { + if (input.endsWith(".sam")) { + return ".sam"; + } + if (input.endsWith(".cram")) { + throw new GATKException("Input file is a cram. This is currently unsupported for this tool");//TODO unsupported feature + } + return ".bam"; + } else { + return "." + setting.toString(); + } + } + + /** + * Takes an individual SAMRecord and applies the set of changes/reversions to it that + * have been requested by program level options. 
+ */ + public JavaRDD revertReads(JavaRDD reads, List attributesToClear) { + Broadcast> attrBroadcast = JavaSparkContext.fromSparkContext(reads.context()).broadcast(attributesToClear); + + if (!dontRestoreOriginalQualities) { + reads = reads.map(r -> { + final byte[] oq = r.getOriginalBaseQualities(); + if (oq != null) { + r.setBaseQualities(oq); + r.setAttribute("OQ", (String)null); + } + return r; + }); + } + + if (!dontRemoveDuplicateInformation) { + reads = reads.map(r -> {r.setIsDuplicate(false); return r;}); + } + + if (!dontRemoveAlignmentInformation) { + reads = reads.map(rec -> { + if (rec.isReverseStrand()) { + rec.reverseComplement(); + rec.setIsReverseStrand(false); + } + + // Remove all alignment based information about the read itself + rec.setIsUnplaced(); + rec.setCigar(SAMRecord.NO_ALIGNMENT_CIGAR); + + rec.setInferredInsertSize(0); + rec.setIsSecondaryAlignment(false); + rec.setIsProperlyPaired(false); + + // Then remove any mate flags and info related to alignment + rec.setMateIsUnplaced(); + + // And then remove any tags that are calculated from the alignment + attrBroadcast.getValue().forEach(tag -> rec.setAttribute(tag, (String) null)); + return rec; + }); + } + + return reads; + } + + @VisibleForTesting + static Map getOutputMap( + final File outputMapFile, + final File outputDir, + final String defaultExtension, + final List readGroups, + final boolean outputByReadgroup) { + + if (outputByReadgroup) { + final Map outputMap; + if (outputMapFile != null) { + try { + outputMap = createOutputMapFromFile(outputMapFile); + } catch (IOException e) { + throw new GATKException("Encountered an error reading output map file", e); + } + } else { + outputMap = createOutputMapFromHeader(readGroups, outputDir, defaultExtension); + } + return outputMap; + } else { + return Collections.singletonMap(null, outputDir); + } + } + + // Names the files based on the locations laid out in the readgroup map + private static Map createOutputMapFromFile(final File 
outputMapFile) throws IOException { + final Map outputMap = new HashMap<>(); + final FeatureReader parser = AbstractFeatureReader.getFeatureReader(outputMapFile.getAbsolutePath(), new TableCodec(null), false); + for (final TableFeature row : parser.iterator()) { + final String id = row.get("READ_GROUP_ID"); + final String output = row.get("OUTPUT"); + final File outputPath = new File(output); + outputMap.put(id, outputPath); + } + CloserUtil.close(parser); + return outputMap; + } + + // Names the files based on the readgroups individually presented in the header + private static Map createOutputMapFromHeader(final List readGroups, final File outputDir, final String extension) { + final Map outputMap = new HashMap<>(); + for (final SAMReadGroupRecord readGroup : readGroups) { + final String id = readGroup.getId(); + final String fileName = id + extension; + final Path outputPath = Paths.get(outputDir.toString(), fileName); + outputMap.put(id, outputPath.toFile()); + } + return outputMap; + } + +// ________________________________________________________________________________________________________________________ +// sum garbage +// ________________________________________________________________________________________________________________________ + /** + * Methods used for validating parameters to RevertSam. 
+ */ + static class ValidationUtil { + + static void validateOutputParams(final boolean outputByReadGroup, final File output, final File outputMap, final List errors) { + if (outputByReadGroup) { + validateOutputParamsByReadGroup(output, outputMap, errors); + } else { + validateOutputParamsNotByReadGroup(output, outputMap, errors); + } + } + + static void validateOutputParamsByReadGroup(final File output, final File outputMap, final List errors) { + if (output != null) { + if (!Files.isDirectory(output.toPath())) { + errors.add("When '--output-by-readgroup' is set and output is provided, it must be a directory: " + output); + } + return; + } + // output is null if we reached here + if (outputMap == null) { + errors.add("Must provide either output or outputMap when '--output-by-readgroup' is set."); + return; + } + if (!Files.isReadable(outputMap.toPath())) { + errors.add("Cannot read outputMap " + outputMap); + return; + } + final FeatureReader parser = AbstractFeatureReader.getFeatureReader(outputMap.getAbsolutePath(), new TableCodec(null),false); + if (!RevertSamSpark.ValidationUtil.isOutputMapHeaderValid((List)parser.getHeader())) { + errors.add("Invalid header: " + outputMap + ". Must be a tab-separated file with READ_GROUP_ID as first column and output as second column."); + } + } + + static void validateOutputParamsNotByReadGroup(final File output, final File outputMap, final List errors) { + if (outputMap != null) { + errors.add("Cannot provide outputMap when '--output-by-read' isn't set. Provide output instead."); + } + if (output == null) { + errors.add("output is required when '--output-by-read'"); + return; + } + if (Files.isDirectory(output.toPath())) { + errors.add("output " + output + " should not be a directory when '--output-by-read'"); + } + } + + /** + * If we are going to override sampleAlias or libraryName, make sure all the read + * groups have the same values. 
+ */ + static void validateHeaderOverrides( + final SAMFileHeader inHeader, + final String sampleAlias, + final String libraryName) { + + final List rgs = inHeader.getReadGroups(); + if (sampleAlias != null || libraryName != null) { + boolean allSampleAliasesIdentical = true; + boolean allLibraryNamesIdentical = true; + for (int i = 1; i < rgs.size(); i++) { + if (!rgs.get(0).getSample().equals(rgs.get(i).getSample())) { + allSampleAliasesIdentical = false; + } + if (!rgs.get(0).getLibrary().equals(rgs.get(i).getLibrary())) { + allLibraryNamesIdentical = false; + } + } + if (sampleAlias != null && !allSampleAliasesIdentical) { + throw new GATKException("Read groups have multiple values for sample. " + + "A value for sampleAlias cannot be supplied."); + } + if (libraryName != null && !allLibraryNamesIdentical) { + throw new GATKException("Read groups have multiple values for library name. " + + "A value for library name cannot be supplied."); + } + } + } + + static void assertAllReadGroupsMapped(final Map outputMap, final List readGroups) { + for (final SAMReadGroupRecord readGroup : readGroups) { + final String id = readGroup.getId(); + final File output = outputMap.get(id); + if (output == null) { + throw new GATKException("Read group id " + id + " not found in outputMap " + outputMap); + } + } + } + + static boolean isOutputMapHeaderValid(final List columnLabels) { + return columnLabels.size() >= 2 && + "READ_GROUP_ID".equals(columnLabels.get(0)) && + "OUTPUT".equals(columnLabels.get(1)); + } + } +} diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSpark.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSpark.java index 4e25d95c889..aa358766f4d 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSpark.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSpark.java @@ -26,15 +26,13 @@ public final class SortSamSpark extends 
GATKSparkTool { private static final long serialVersionUID = 1L; - public static final String SORT_ORDER_LONG_NAME = "sort-order"; - @Override public boolean requiresReads() { return true; } @Argument(doc="the output file path", shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, optional = false) private String outputFile; - @Argument(doc="sort order of the output file", fullName = SORT_ORDER_LONG_NAME, optional = true) + @Argument(doc="sort order of the output file", shortName = StandardArgumentDefinitions.SORT_ORDER_SHORT_NAME, fullName = StandardArgumentDefinitions.SORT_ORDER_LONG_NAME, optional = true) private SparkSortOrder sortOrder = SparkSortOrder.coordinate; /** diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/transforms/markduplicates/MarkDuplicatesSparkUtils.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/transforms/markduplicates/MarkDuplicatesSparkUtils.java index 6ed2d7601dd..2553841c5d4 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/spark/transforms/markduplicates/MarkDuplicatesSparkUtils.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/transforms/markduplicates/MarkDuplicatesSparkUtils.java @@ -21,6 +21,7 @@ import org.broadinstitute.hellbender.utils.read.SAMRecordToGATKReadAdapter; import org.broadinstitute.hellbender.utils.read.markduplicates.*; import org.broadinstitute.hellbender.utils.read.markduplicates.sparkrecords.*; +import org.broadinstitute.hellbender.utils.spark.SparkUtils; import picard.sam.markduplicates.util.OpticalDuplicateFinder; import picard.sam.markduplicates.util.ReadEnds; import scala.Tuple2; @@ -259,7 +260,7 @@ private static JavaPairRDD>> getReadsGroupe */ private static JavaPairRDD>> spanReadsByKey(final JavaRDD> reads) { JavaPairRDD> nameReadPairs = reads.mapToPair(read -> new Tuple2<>(read.getValue().getName(), read)); - return spanByKey(nameReadPairs).flatMapToPair(namedRead -> { + return 
SparkUtils.spanByKey(nameReadPairs).flatMapToPair(namedRead -> { // for each name, separate reads by key (group name) List>>> out = Lists.newArrayList(); ListMultimap> multi = LinkedListMultimap.create(); @@ -274,54 +275,6 @@ private static JavaPairRDD>> spanReadsByKey }); } - /** - * Like groupByKey, but assumes that values are already sorted by key, so no shuffle is needed, - * which is much faster. - * @param rdd the input RDD - * @param type of keys - * @param type of values - * @return an RDD where each the values for each key are grouped into an iterable collection - */ - private static JavaPairRDD> spanByKey(JavaPairRDD rdd) { - return rdd.mapPartitionsToPair(MarkDuplicatesSparkUtils::spanningIterator); - } - - /** - * An iterator that groups values having the same key into iterable collections. - * @param iterator an iterator over key-value pairs - * @param type of keys - * @param type of values - * @return an iterator over pairs of keys and grouped values - */ - static Iterator>> spanningIterator(Iterator> iterator) { - final PeekingIterator> iter = Iterators.peekingIterator(iterator); - return new AbstractIterator>>() { - @Override - protected Tuple2> computeNext() { - K key = null; - List group = Lists.newArrayList(); - while (iter.hasNext()) { - if (key == null) { - Tuple2 next = iter.next(); - key = next._1(); - V value = next._2(); - group.add(value); - continue; - } - K nextKey = iter.peek()._1(); // don't advance... - if (nextKey.equals(key)) { - group.add(iter.next()._2()); // .. 
unless the keys match - } else { - return new Tuple2<>(key, group); - } - } - if (key != null) { - return new Tuple2<>(key, group); - } - return endOfData(); - } - }; - } /** * Primary landing point for MarkDuplicateSparkRecords: diff --git a/src/main/java/org/broadinstitute/hellbender/utils/codecs/table/TableCodec.java b/src/main/java/org/broadinstitute/hellbender/utils/codecs/table/TableCodec.java index 851ad51a1c2..e896b1a4b15 100644 --- a/src/main/java/org/broadinstitute/hellbender/utils/codecs/table/TableCodec.java +++ b/src/main/java/org/broadinstitute/hellbender/utils/codecs/table/TableCodec.java @@ -17,6 +17,8 @@ *
    *
  • Header: must begin with line HEADER or track (for IGV), followed by any number of column names, * separated by whitespace.
  • + *
  • Header: Custom header delimiters can be provided, with a null header line being interpreted as having a non-delimited + * header which consists of one line.
  • *
  • Comment lines starting with # are ignored
  • *
  • Each non-header and non-comment line is split into parts by whitespace, * and these parts are assigned as a map to their corresponding column name in the header. @@ -28,30 +30,56 @@ * *

    * - *

    File format example

    + *

    File format example 1

    *
      *     HEADER a b c
      *     1:1  1   2   3
      *     1:2  4   5   6
      *     1:3  7   8   9
      * 
    + * + *

    File format example 2

    + *
    + *     a b c
    + *     1:1  1   2   3
    + *     1:2  4   5   6
    + *     1:3  7   8   9
    + * 
    */ public final class TableCodec extends AsciiFeatureCodec { - protected static final String HEADER_DELIMITER = "HEADER"; + protected static final String DEFAULT_HEADER_DELIMITER = "HEADER"; protected static final String IGV_HEADER_DELIMITER = "track"; protected static final String COMMENT_DELIMITER = "#"; + private final String headerDelimiter; + private final String commentDelimiter; + protected String delimiter_regex = "\\s+"; protected List header = new ArrayList<>(); - public TableCodec() { + private boolean havePassedHeader = false; + + public TableCodec(final String headerLineDelimiter, final String commentLineDelimiter) { super(TableFeature.class); + headerDelimiter = headerLineDelimiter; + commentDelimiter = commentLineDelimiter; + } + + public TableCodec(final String headerLineDelimiter) { + this(headerLineDelimiter, COMMENT_DELIMITER); + } + + public TableCodec() { + this(DEFAULT_HEADER_DELIMITER); } @Override public TableFeature decode(final String line) { - if (line.startsWith(HEADER_DELIMITER) || line.startsWith(COMMENT_DELIMITER) || line.startsWith(IGV_HEADER_DELIMITER)) { + if ((headerDelimiter != null && ! line.startsWith(headerDelimiter)) || + (headerDelimiter == null && !havePassedHeader) || + line.startsWith(commentDelimiter) || line.startsWith(IGV_HEADER_DELIMITER)) { + havePassedHeader = true; return null; } final String[] split = line.split(delimiter_regex); @@ -66,11 +94,11 @@ public List readActualHeader(final LineIterator reader) { boolean isFirst = true; while (reader.hasNext()) { final String line = reader.peek(); // Peek to avoid reading non-header data - if ( isFirst && ! line.startsWith(HEADER_DELIMITER) && ! line.startsWith(COMMENT_DELIMITER)) { + if ( isFirst && ! line.startsWith(commentDelimiter) && headerDelimiter != null && ! 
line.startsWith(headerDelimiter) ) { throw new UserException.MalformedFile("TableCodec file does not have a header"); } - isFirst &= line.startsWith(COMMENT_DELIMITER); - if (line.startsWith(HEADER_DELIMITER)) { + isFirst &= line.startsWith(commentDelimiter); + if (headerDelimiter == null || line.startsWith(headerDelimiter)) { reader.next(); // "Commit" the peek if (!header.isEmpty()) { throw new UserException.MalformedFile("Input table file seems to have two header lines. The second is = " + line); @@ -78,7 +106,7 @@ public List readActualHeader(final LineIterator reader) { final String[] spl = line.split(delimiter_regex); Collections.addAll(header, spl); return header; - } else if (line.startsWith(COMMENT_DELIMITER)) { + } else if (line.startsWith(commentDelimiter)) { reader.next(); // "Commit" the peek } else { break; diff --git a/src/main/java/org/broadinstitute/hellbender/utils/read/GATKRead.java b/src/main/java/org/broadinstitute/hellbender/utils/read/GATKRead.java index c5c7ccaae06..66139a4ca89 100644 --- a/src/main/java/org/broadinstitute/hellbender/utils/read/GATKRead.java +++ b/src/main/java/org/broadinstitute/hellbender/utils/read/GATKRead.java @@ -1,9 +1,6 @@ package org.broadinstitute.hellbender.utils.read; -import htsjdk.samtools.Cigar; -import htsjdk.samtools.CigarElement; -import htsjdk.samtools.SAMFileHeader; -import htsjdk.samtools.SAMRecord; +import htsjdk.samtools.*; import htsjdk.samtools.util.Locatable; import htsjdk.samtools.util.StringUtil; import org.broadinstitute.hellbender.exceptions.GATKException; @@ -307,6 +304,20 @@ default String getBasesString() { */ byte[] getBaseQualities(); + /** + * If the original base quality scores have been stored in the "OQ" tag, returns the numeric + * scores as a byte[]; otherwise returns null. + */ + default byte[] getOriginalBaseQualities() { + final String oqString = getAttributeAsString("OQ"); + if (oqString != null && !oqString.isEmpty()) { + return SAMUtils.fastqToPhred(oqString); + } + else { + return null; + } + } + /** 
* @return Base qualities as binary phred scores (not ASCII), or an empty byte[] if base qualities are not present. * @@ -484,6 +495,12 @@ default int numCigarElements(){ */ void setIsUnplaced(); + /** + * insert size (difference btw 5' end of read & 5' end of mate), if possible, else 0. + * Negative if mate maps to lower position than read. + */ + void setInferredInsertSize(int insertSize); + /** * @return True if this read's mate is unmapped (this includes mates that have a position but are explicitly marked as unmapped, * as well as mates that lack a fully-defined position but are not explicitly marked as unmapped). Otherwise false. diff --git a/src/main/java/org/broadinstitute/hellbender/utils/read/SAMRecordToGATKReadAdapter.java b/src/main/java/org/broadinstitute/hellbender/utils/read/SAMRecordToGATKReadAdapter.java index 9bab4278199..dd517240cd6 100644 --- a/src/main/java/org/broadinstitute/hellbender/utils/read/SAMRecordToGATKReadAdapter.java +++ b/src/main/java/org/broadinstitute/hellbender/utils/read/SAMRecordToGATKReadAdapter.java @@ -447,6 +447,13 @@ public void setIsUnplaced() { samRecord.setMappingQuality(SAMRecord.NO_MAPPING_QUALITY); } + @Override + public void setInferredInsertSize(int insertSize) { + clearCachedValues(); + + samRecord.setInferredInsertSize(insertSize); + } + @Override public boolean mateIsUnmapped() { Utils.validate(isPaired(), "Cannot get mate information for an unpaired read"); @@ -482,6 +489,7 @@ public void setMateIsUnplaced() { setIsPaired(true); samRecord.setMateUnmappedFlag(true); + samRecord.setMateNegativeStrandFlag(false); samRecord.setMateReferenceIndex(SAMRecord.NO_ALIGNMENT_REFERENCE_INDEX); samRecord.setMateAlignmentStart(SAMRecord.NO_ALIGNMENT_START); } diff --git a/src/main/java/org/broadinstitute/hellbender/utils/runtime/StreamingProcessController.java b/src/main/java/org/broadinstitute/hellbender/utils/runtime/StreamingProcessController.java index 59e89775135..25ad6ce5ecc 100644 --- 
a/src/main/java/org/broadinstitute/hellbender/utils/runtime/StreamingProcessController.java +++ b/src/main/java/org/broadinstitute/hellbender/utils/runtime/StreamingProcessController.java @@ -363,7 +363,7 @@ protected void tryCleanShutdown() { } } if (process != null) { - // terminate the app by closing the process' INPUT stream + // terminate the app by closing the process' input stream IOUtils.closeQuietly(process.getOutputStream()); } } diff --git a/src/main/java/org/broadinstitute/hellbender/utils/spark/SparkUtils.java b/src/main/java/org/broadinstitute/hellbender/utils/spark/SparkUtils.java index 37c1d2448da..e0fc3ef3f49 100644 --- a/src/main/java/org/broadinstitute/hellbender/utils/spark/SparkUtils.java +++ b/src/main/java/org/broadinstitute/hellbender/utils/spark/SparkUtils.java @@ -1,7 +1,6 @@ package org.broadinstitute.hellbender.utils.spark; -import com.google.common.collect.Iterators; -import com.google.common.collect.PeekingIterator; +import com.google.common.collect.*; import htsjdk.samtools.SAMFileHeader; import htsjdk.samtools.SAMSequenceRecord; import htsjdk.samtools.SAMTextHeaderCodec; @@ -21,8 +20,10 @@ import org.broadinstitute.hellbender.engine.spark.datasources.ReadsSparkSink; import org.broadinstitute.hellbender.exceptions.GATKException; import org.broadinstitute.hellbender.exceptions.UserException; +import org.broadinstitute.hellbender.tools.spark.transforms.markduplicates.MarkDuplicatesSparkUtils; import org.broadinstitute.hellbender.utils.Utils; import org.broadinstitute.hellbender.utils.read.*; +import org.broadinstitute.hellbender.utils.read.markduplicates.ReadsKey; import scala.Tuple2; import java.io.*; @@ -231,4 +232,54 @@ public static JavaRDD putReadsWithTheSameNameInTheSamePartition(final return current; }); } + + /** + * Like groupByKey, but assumes that values are already sorted by key, so no shuffle is needed, + * which is much faster. 
+ * @param rdd the input RDD + * @param type of keys + * @param type of values + * @return an RDD where the values for each key are grouped into an iterable collection + */ + public static JavaPairRDD> spanByKey(JavaPairRDD rdd) { + return rdd.mapPartitionsToPair(SparkUtils::spanningIterator); + } + + /** + * An iterator that groups values having the same key into iterable collections. + * @param iterator an iterator over key-value pairs + * @param type of keys + * @param type of values + * @return an iterator over pairs of keys and grouped values + */ + public static Iterator>> spanningIterator(Iterator> iterator) { + final PeekingIterator> iter = Iterators.peekingIterator(iterator); + return new AbstractIterator>>() { + @Override + protected Tuple2> computeNext() { + K key = null; + List group = Lists.newArrayList(); + while (iter.hasNext()) { + if (key == null) { + Tuple2 next = iter.next(); + key = next._1(); + V value = next._2(); + group.add(value); + continue; + } + K nextKey = iter.peek()._1(); // don't advance... + if (nextKey.equals(key)) { + group.add(iter.next()._2()); // .. 
unless the keys match + } else { + return new Tuple2<>(key, group); + } + } + if (key != null) { + return new Tuple2<>(key, group); + } + return endOfData(); + } + }; + } + } diff --git a/src/test/java/org/broadinstitute/hellbender/cmdline/PicardCommandLineProgramExecutorIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/cmdline/PicardCommandLineProgramExecutorIntegrationTest.java index 548b9385e7d..822d8cf400d 100644 --- a/src/test/java/org/broadinstitute/hellbender/cmdline/PicardCommandLineProgramExecutorIntegrationTest.java +++ b/src/test/java/org/broadinstitute/hellbender/cmdline/PicardCommandLineProgramExecutorIntegrationTest.java @@ -52,7 +52,7 @@ public void testPicardNormalizeFastaWithBadArgs() throws IOException { final File outfile = createTempFile("normalized", ".fasta"); // Use GATK-style lower case argument names, which are rejected by Picard - // because it uses upper cased argument names (--INPUT/--OUTPUT) + // because it uses upper cased argument names (--input/--output) final String[] args = { "--input", input.getAbsolutePath(), "--output", outfile.getAbsolutePath(), diff --git a/src/test/java/org/broadinstitute/hellbender/tools/spark/RevertSamSparkIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/spark/RevertSamSparkIntegrationTest.java new file mode 100644 index 00000000000..fb433ffdd0d --- /dev/null +++ b/src/test/java/org/broadinstitute/hellbender/tools/spark/RevertSamSparkIntegrationTest.java @@ -0,0 +1,434 @@ +package org.broadinstitute.hellbender.tools.spark; + +import htsjdk.samtools.*; +import htsjdk.samtools.util.CloserUtil; +import org.broadinstitute.hellbender.CommandLineProgramTest; +import org.broadinstitute.hellbender.exceptions.GATKException; +import org.broadinstitute.hellbender.exceptions.UserException; +import org.broadinstitute.hellbender.testutils.ArgumentsBuilder; +import org.testng.Assert; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; + +import 
java.io.File; +import java.io.PrintWriter; +import java.nio.file.Files; +import java.util.*; + +@Test(groups = "Spark") +public class RevertSamSparkIntegrationTest extends CommandLineProgramTest { + + @Override + public String getToolTestDataDir() { + return "src/test/resources/org/broadinstitute/hellbender/tools/spark/RevertSamSpark"; + } + + public static List defaultAttributesToClearPlusXT = new ArrayList() {{ + add(SAMTag.NM.name()); + add(SAMTag.UQ.name()); + add(SAMTag.PG.name()); + add(SAMTag.MD.name()); + add(SAMTag.MQ.name()); + add(SAMTag.SA.name()); // Supplementary alignment metadata + add(SAMTag.MC.name()); // Mate Cigar + add(SAMTag.AS.name()); + add("XT"); + }}; + + private final File basicSamToRevert = getTestFile("revert_sam_basic.sam"); + private final File sampleLibraryOverrideSam = getTestFile("revert_sam_sample_library_override.sam"); + private final File validOutputMap = getTestFile("revert_sam_valid_output_map.txt"); + private final File nonExistentOutputMap = getTestFile("revert_sam_does_not_exist.txt"); + private final File badHeaderOutputMap = getTestFile("revert_sam_bad_header_output_map.txt"); + private final File referenceFasta = getTestFile("test.fasta"); + private final File singleEndSamToRevert = getTestFile("revert_sam_single_end.sam"); + private final File missingRGInfo = getTestFile("missing-rg-info.sam"); + + private static final String revertedQualities = + "11111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111"; + + private static final String unmappedRead = "both_reads_present_only_first_aligns/2"; + + + + @DataProvider(name="positiveTestData") + public Object[][] getPostitiveTestData() { + return new Object[][] { + {null, false, false, true, true, null, null, Collections.EMPTY_LIST}, + {SAMFileHeader.SortOrder.queryname, false, false, true, false, "Hey,Dad!", null, defaultAttributesToClearPlusXT}, + {null, true, false, false, false, "Hey,Dad!", "NewLibraryName", 
defaultAttributesToClearPlusXT}, + {null, true, true, false, false, null, null, Collections.EMPTY_LIST} + }; + } + + @Test(dataProvider="positiveTestData") + public void basicPositiveTests(final SAMFileHeader.SortOrder so, final boolean removeDuplicates, final boolean removeAlignmentInfo, + final boolean restoreOriginalQualities, final boolean outputByReadGroup, final String sample, final String library, + final List attributesToClear) throws Exception { + + final File output = outputByReadGroup?Files.createTempDirectory("picardRevertSamSparkTest").toFile():File.createTempFile("reverted", ".sam"); + File output0 = new File(output.getPath()+"/0.sam"); + File output1 = new File(output.getPath()+"/1.sam"); + File output2 = new File(output.getPath()+"/2.sam"); + File output3 = new File(output.getPath()+"/3.sam"); + output.deleteOnExit(); + final RevertSamSpark reverter = new RevertSamSpark(); + final ArgumentsBuilder args = new ArgumentsBuilder(); + args.addInput(basicSamToRevert); + args.addOutput(output); + + if (outputByReadGroup) { + args.addPositionalArgument("--"+RevertSamSpark.OUTPUT_BY_READGROUP_ARG); + } + if (so != null) { + args.addArgument("sort-order",so.name()); //TODO decide on sort order outputing + } + if (!removeAlignmentInfo) { + args.addPositionalArgument("--"+RevertSamSpark.DONT_REMOVE_ALIGNMENT_INFORMATION_ARG); + } + if (sample != null) { + args.addArgument("sample-alias",sample); + } + if (library != null) { + args.addArgument("library-name",library); + } + for (final String attr : attributesToClear) { + args.addArgument("attributes-to-clear",attr); + } + + runCommandLine(args); + + if (outputByReadGroup) { + verifyPositiveResults(output0, reverter, removeDuplicates, removeAlignmentInfo, restoreOriginalQualities, outputByReadGroup, "0", 2, sample, library); + verifyPositiveResults(output1, reverter, removeDuplicates, removeAlignmentInfo, restoreOriginalQualities, outputByReadGroup, "1", 4, sample, library); + verifyPositiveResults(output2, 
reverter, removeDuplicates, removeAlignmentInfo, restoreOriginalQualities, outputByReadGroup, "2", 2, sample, library); + verifyPositiveResults(output3, reverter, removeDuplicates, removeAlignmentInfo, restoreOriginalQualities, outputByReadGroup, "3", 0, sample, library); + } else { + verifyPositiveResults(output, reverter, removeDuplicates, removeAlignmentInfo, restoreOriginalQualities, outputByReadGroup, null, 8, sample, library); + } + } + + @Test + public void testOutputByReadGroupWithOutputMap() throws Exception { + final File outputDir = createTempDir("testOutputByReadGroupWithOutputMap"); + outputDir.deleteOnExit(); + // Create the output map + final File outputMapFile = Files.createTempFile("picardRevertSamSparkTestOutputMap", ".txt").toFile(); + final PrintWriter mapWriter = new PrintWriter(outputMapFile); + final String outputPath0 = outputDir + "/my_rg0.sam"; + final String outputPath1 = outputDir + "/rg1.sam"; + final String outputPath2 = outputDir + "/my_rg2.bam"; + final String outputPath3 = outputDir + "/my_rg3.sam";//TODO not used? 
+ mapWriter.println("READ_GROUP_ID\tOUTPUT"); + mapWriter.println("0\t" + outputPath0); + mapWriter.println("2\t" + outputPath2); + mapWriter.println("1\t" + outputPath1); + mapWriter.println("3\t" + outputPath3); + System.out.println("outputFile: " + outputPath0); + System.out.println("outputFile: " + outputPath1); + System.out.println("outputFile: " + outputPath2); + System.out.println("outputFile: " + outputPath3); + mapWriter.close(); + outputMapFile.deleteOnExit(); + + final RevertSamSpark reverter = new RevertSamSpark(); + + final String args[] = new String[] { + "-I",basicSamToRevert.getPath(), + "--output-by-readgroup", + "--output-map",outputMapFile.getPath(), + "-R",referenceFasta.getPath(), + "--sort-order",SAMFileHeader.SortOrder.queryname.name(), + "--"+RevertSamSpark.SAMPLE_ALIAS_ARG,"test_sample_1", + "--"+RevertSamSpark.LIBRARY_NAME_ARG,"test_library_1", + "--"+RevertSamSpark.ATTRIBUTE_TO_CLEAR_ARG,SAMTag.NM.name() + }; + + runCommandLine(args); + + final File output0 = new File(outputPath0); + final File output1 = new File(outputPath1); + final File output2 = new File(outputPath2); + verifyPositiveResults(output0, reverter, true, true, true, true, "0", 2, "test_sample_1", "test_library_1"); + verifyPositiveResults(output1, reverter, true, true, true, true, "1", 4, "test_sample_1", "test_library_1"); + verifyPositiveResults(output2, reverter, true, true, true, true, "2", 2, "test_sample_1", "test_library_1"); + } + + @Test (expectedExceptions = UserException.class) + public void testSingleEndSanitize() throws Exception { + final File output = File.createTempFile("single_end_reverted", ".sam"); + output.deleteOnExit(); + final String args[] = { "-I " + singleEndSamToRevert, "-O " + output.getAbsolutePath(), "--sanitize"}; + runCommandLine(args); + } + + private void verifyPositiveResults( + final File outputFile, + final RevertSamSpark reverter, + final boolean removeDuplicates, + final boolean removeAlignmentInfo, + final boolean 
restoreOriginalQualities, + final boolean outputByReadGroup, + final String readGroupId, + final int numReadsExpected, + final String sample, + final String library) { + + outputFile.deleteOnExit(); + final SamReader reader = SamReaderFactory.makeDefault().referenceSequence(referenceFasta).open(outputFile); + final SAMFileHeader header = reader.getFileHeader(); + Assert.assertEquals(header.getSortOrder(), SAMFileHeader.SortOrder.queryname); + Assert.assertEquals(header.getProgramRecords().size(), removeAlignmentInfo ? 0 : 1); + final List readGroups = header.getReadGroups(); + if (outputByReadGroup) { + Assert.assertEquals(readGroups.size(), 1); + Assert.assertEquals(readGroups.get(0).getId(), readGroupId); + } + for (final SAMReadGroupRecord rg : header.getReadGroups()) { + if (sample != null) { + Assert.assertEquals(rg.getSample(), sample); + } else { + Assert.assertEquals(rg.getSample(), "Hi,Mom!"); + } + if (library != null) { + Assert.assertEquals(rg.getLibrary(), library); + } else { + Assert.assertEquals(rg.getLibrary(), "my-library"); + } + } + int numReads = 0; + for (final SAMRecord rec : reader) { + numReads++; + if (removeDuplicates) { + Assert.assertFalse(rec.getDuplicateReadFlag(), + "Duplicates should have been removed: " + rec.getReadName()); + } + + if (removeAlignmentInfo) { + Assert.assertTrue(rec.getReadUnmappedFlag(), + "Alignment info should have been removed: " + rec.getReadName()); + } + + if (restoreOriginalQualities && !unmappedRead.equals( + rec.getReadName() + "/" + (rec.getFirstOfPairFlag() ? 
"1" : "2"))) { + + Assert.assertEquals(rec.getBaseQualityString(), revertedQualities); + } else { + Assert.assertNotSame(rec.getBaseQualityString(), revertedQualities); + } + + for (final SAMRecord.SAMTagAndValue attr : rec.getAttributes()) { + if (removeAlignmentInfo || (!attr.tag.equals("PG") && !attr.tag.equals("NM") + && !attr.tag.equals(SAMTag.MQ.toString()))) { + Assert.assertFalse(reverter.attributesToClear.contains(attr.tag), + attr.tag + " should have been cleared."); + } + } + } + Assert.assertEquals(numReads, numReadsExpected); + CloserUtil.close(reader); + } + + @Test + public void testSanitizeAndDeduplicateRecords() throws Exception { + final File input = File.createTempFile("test-input-santize-and-deduplicate-records", ".sam"); + final File output = File.createTempFile("test-output-santize-and-deduplicate-records", ".sam"); + + // Create a SAM file that has duplicate records + final SamReader reader = SamReaderFactory.makeDefault().open(basicSamToRevert); + final SAMFileWriter writer = new SAMFileWriterFactory().makeSAMOrBAMWriter(reader.getFileHeader(), false, input); + int numDuplicated = 0; + for (final SAMRecord rec : reader) { + writer.addAlignment(rec); + if (!rec.getReadPairedFlag() || rec.getFirstOfPairFlag()) { + writer.addAlignment(rec); + numDuplicated++; + } + } + reader.close(); + writer.close(); + + // Make sure some records are duplicated + Assert.assertTrue(numDuplicated > 0); + + final String [] args = new String[]{ + "--input", input.getAbsolutePath(), + "--sanitize", + "--keep-first-duplicate", + "--"+RevertSamSpark.DONT_RESTORE_ORIGINAL_QUALITIES_ARG, + "-O", output.getAbsolutePath() + }; + runCommandLine(args); + verifyPositiveResults(output, new RevertSamSpark(), false, true, false, false, null, 8, null, null); + } + + @Test(dataProvider="overrideTestData", expectedExceptions = {GATKException.class}) + public void testSampleLibraryOverride(final String sample, final String library) throws Exception { + final File output = 
createTempFile("bad", ".sam"); + ArgumentsBuilder args = new ArgumentsBuilder(); + args.addInput(sampleLibraryOverrideSam); + args.addOutput(output); + if (sample != null) { + args.addArgument(RevertSamSpark.SAMPLE_ALIAS_ARG,sample); + } + if (library != null) { + args.addArgument(RevertSamSpark.LIBRARY_NAME_ARG,library); + } + runCommandLine(args); + } + + @DataProvider(name="overrideTestData") + public Object[][] getNegativeTestData() { + return new Object[][] { + {"NewSample", null}, + {null, "NewLibrary"}, + {"NewSample", "NewLibrary"} + }; + } + + @Test + public void testValidateOutputParamsByReadGroupMapValid() { + final List errors = new ArrayList(); + RevertSamSpark.ValidationUtil.validateOutputParamsByReadGroup(null, validOutputMap, errors); + Assert.assertEquals(errors.size(), 0); + } + + @Test + public void testValidateOutputParamsByReadGroupMissingMap() { + final List errors = new ArrayList(); + RevertSamSpark.ValidationUtil.validateOutputParamsByReadGroup(null, nonExistentOutputMap, errors); + Assert.assertEquals(errors.size(), 1); + Assert.assertEquals(errors.get(0).contains("Cannot read"), true); + } + + @Test + public void testValidateOutputParamsByReadGroupBadHeaderMap() { + final List errors = new ArrayList(); + RevertSamSpark.ValidationUtil.validateOutputParamsByReadGroup(null, badHeaderOutputMap, errors); + Assert.assertEquals(errors.size(), 1); + Assert.assertEquals(errors.get(0).contains("Invalid header"), true); + } + + @Test + public void testValidateOutputParamsByReadGroupNoMapOrDir() { + final List errors = new ArrayList(); + RevertSamSpark.ValidationUtil.validateOutputParamsByReadGroup(null, null, errors); + Assert.assertEquals(errors.size(), 1); + Assert.assertEquals(errors.get(0).contains("Must provide either"), true); + } + + @Test + public void testValidateOutputParamsByReadGroupDirValid() { + final List errors = new ArrayList(); + 
RevertSamSpark.ValidationUtil.validateOutputParamsByReadGroup(createTempDir("testValidateOutputParamsNotByReadGroupValid"), null, errors); + Assert.assertEquals(errors.size(), 0); + } + + @Test + public void testValidateOutputParamsNotByReadGroupValid() { + final List errors = new ArrayList(); + RevertSamSpark.ValidationUtil.validateOutputParamsNotByReadGroup(createTempFile("testValidateOutputParamsNotByReadGroupValid",""), null, errors); + Assert.assertEquals(errors.size(), 0); + } + + @Test + public void testValidateOutputParamsNotByReadGroupNoOutput() { + final List errors = new ArrayList(); + RevertSamSpark.ValidationUtil.validateOutputParamsNotByReadGroup(null, null, errors); + Assert.assertEquals(errors.size(), 1); + Assert.assertEquals(errors.get(0).contains("output is required"), true); + } + + @Test + public void testValidateOutputParamsNotByReadGroupMap() { + final List errors = new ArrayList(); + RevertSamSpark.ValidationUtil.validateOutputParamsNotByReadGroup(null, validOutputMap, errors); + Assert.assertEquals(errors.size(), 2); + Assert.assertEquals(errors.get(0).contains("Cannot provide outputMap"), true); + Assert.assertEquals(errors.get(1).contains("output is required"), true); + } + + @Test + public void testValidateOutputParamsNotByReadGroupDir() { + final List errors = new ArrayList(); + RevertSamSpark.ValidationUtil.validateOutputParamsNotByReadGroup(createTempDir("testValidateOutputParamsNotByReadGroupDir"), null, errors); + Assert.assertEquals(errors.size(), 1); + Assert.assertEquals(errors.get(0).contains("should not be a directory"), true); + } + + @Test + public void testAssertAllReadGroupsMappedSuccess() { + final SAMReadGroupRecord rg1 = new SAMReadGroupRecord("rg1"); + final SAMReadGroupRecord rg2 = new SAMReadGroupRecord("rg2"); + + final Map outputMap = new HashMap(); + outputMap.put("rg1", new File("/foo/bar/rg1.bam")); + outputMap.put("rg2", new File("/foo/bar/rg2.bam")); + 
RevertSamSpark.ValidationUtil.assertAllReadGroupsMapped(outputMap, Arrays.asList(rg1, rg2)); + RevertSamSpark.ValidationUtil.assertAllReadGroupsMapped(outputMap, Arrays.asList(rg1)); + RevertSamSpark.ValidationUtil.assertAllReadGroupsMapped(outputMap, Arrays.asList(rg2)); + } + + @Test(expectedExceptions = {GATKException.class}) + public void testAssertAllReadGroupsMappedFailure() { + final SAMReadGroupRecord rg1 = new SAMReadGroupRecord("rg1"); + final SAMReadGroupRecord rg2 = new SAMReadGroupRecord("rg2"); + final SAMReadGroupRecord rg3 = new SAMReadGroupRecord("rg3"); + + final Map outputMap = new HashMap(); + outputMap.put("rg1", new File("/foo/bar/rg1.bam")); + outputMap.put("rg2", new File("/foo/bar/rg2.bam")); + RevertSamSpark.ValidationUtil.assertAllReadGroupsMapped(outputMap, Arrays.asList(rg1, rg2, rg3)); + } + + @Test + public void testIsOutputMapHeaderValid() { + boolean isValid = RevertSamSpark.ValidationUtil.isOutputMapHeaderValid(Arrays.asList("READ_GROUP_ID", "OUTPUT")); + Assert.assertEquals(isValid, true); + + isValid = RevertSamSpark.ValidationUtil.isOutputMapHeaderValid(Arrays.asList("OUTPUT")); + Assert.assertEquals(isValid, false); + + isValid = RevertSamSpark.ValidationUtil.isOutputMapHeaderValid(Collections.EMPTY_LIST); + Assert.assertEquals(isValid, false); + } + + @Test + public void testFilePathsWithoutMapFile() { + final SAMReadGroupRecord rg1 = new SAMReadGroupRecord("rg1"); + final SAMReadGroupRecord rg2 = new SAMReadGroupRecord("rg2"); + + final Map outputMap = RevertSamSpark.getOutputMap(null, new File("/foo/bar"), ".bam", Arrays.asList(rg1, rg2), true); + Assert.assertEquals(outputMap.get("rg1"), new File("/foo/bar/rg1.bam")); + Assert.assertEquals(outputMap.get("rg2"), new File("/foo/bar/rg2.bam")); + } + + @Test + public void testFilePathsWithMapFile() { + final Map outputMap = RevertSamSpark.getOutputMap(validOutputMap, null, ".bam", Collections.emptyList(), true); + Assert.assertEquals(outputMap.get("rg1"), new 
File("/path/to/my_rg_1.ubam")); + Assert.assertEquals(outputMap.get("rg2"), new File("/path/to/my_rg_2.ubam")); + } + + @Test + public void testGetDefaultExtension() { + Assert.assertEquals(RevertSamSpark.getDefaultExtension("this.is.a.sam", RevertSamSpark.FileType.dynamic), ".sam"); + //Assert.assertEquals(RevertSamSpark.getDefaultExtension("this.is.a.cram", RevertSamSpark.FileType.dynamic), ".cram"); + Assert.assertEquals(RevertSamSpark.getDefaultExtension("this.is.a.bam", RevertSamSpark.FileType.dynamic), ".bam"); + Assert.assertEquals(RevertSamSpark.getDefaultExtension("foo", RevertSamSpark.FileType.dynamic), ".bam"); + } + + @Test + public void testNoRgInfoSanitize() throws Exception { + final File output = File.createTempFile("no-rg-reverted", ".sam"); + output.deleteOnExit(); + final String [] args = new String[]{ + "-I",missingRGInfo.getAbsolutePath(), + "--sanitize", + "-O", output.getAbsolutePath() + }; + runCommandLine(args); + verifyPositiveResults(output, new RevertSamSpark(), true, true, false, false, null, 240, null, null); + } + +} \ No newline at end of file diff --git a/src/test/java/org/broadinstitute/hellbender/tools/spark/RevertSamSparkUnitTest.java b/src/test/java/org/broadinstitute/hellbender/tools/spark/RevertSamSparkUnitTest.java new file mode 100644 index 00000000000..8341b22751d --- /dev/null +++ b/src/test/java/org/broadinstitute/hellbender/tools/spark/RevertSamSparkUnitTest.java @@ -0,0 +1,84 @@ +package org.broadinstitute.hellbender.tools.spark; + +import htsjdk.samtools.SAMFileHeader; +import org.broadinstitute.hellbender.CommandLineProgramTest; +import org.broadinstitute.hellbender.GATKBaseTest; +import org.broadinstitute.hellbender.testutils.ArgumentsBuilder; +import org.testng.annotations.DataProvider; +import org.testng.annotations.Test; +import picard.sam.RevertSam; + +import java.io.File; +import java.nio.file.Files; +import java.nio.file.Paths; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; + 
+public class RevertSamSparkUnitTest extends CommandLineProgramTest { + + private final File basicSamToRevert = getTestFile("revert_sam_basic.sam"); + + @DataProvider(name="positiveTestData") + public Object[][] getPostitiveTestData() { + return new Object[][] { + {null, true, true, true, true, null, null, Collections.EMPTY_LIST}, + {SAMFileHeader.SortOrder.queryname, true, true, true, false, "Hey,Dad!", null, Arrays.asList("XT")}, + {null, false, true, false, false, "Hey,Dad!", "NewLibraryName", Arrays.asList("XT")}, + {null, false, false, false, false, null, null, Collections.EMPTY_LIST} + }; + } + + @Test(dataProvider="positiveTestData") + public void basicPositiveTests(final SAMFileHeader.SortOrder so, final boolean removeDuplicates, final boolean removeAlignmentInfo, + final boolean restoreOriginalQualities, final boolean outputByReadGroup, final String sample, final String library, + final List attributesToClear) throws Exception { + + final File output = outputByReadGroup?Files.createTempDirectory("picardRevertSamTest").toFile():File.createTempFile("reverted", ".sam"); + File output0 = createTempFile("0", ".sam"); + File output1 = createTempFile("1", ".sam"); + File output2 = createTempFile("2", ".sam"); +// if (outputByReadGroup) { +// output = Files.createTempDirectory("picardRevertSamTest").toFile(); +// output0 = Paths.get(output.toString(), "0.sam").toFile(); +// output1 = Paths.get(output.toString(), "1.sam").toFile(); +// output2 = Paths.get(output.toString(), "2.sam").toFile(); +// } else { +// output = File.createTempFile("reverted", ".sam"); +// } + output.deleteOnExit(); + final RevertSam reverter = new RevertSam(); + final ArgumentsBuilder args = new ArgumentsBuilder(); + args.addInput(basicSamToRevert); + args.addOutput(output); + + if (outputByReadGroup) { + args.add("output-by-readgroup"); + } + if (so != null) { + args.addArgument("sort-order",so.name()); //TODO decide on sort order outputing + } +// args[index++] = 
"dontRemoveDuplicateInformation=" + removeDuplicates; //TODO this is unsuported + args.add("remove-alignment-inormation"); + args.add("restore-original-qualities"); + if (sample != null) { + args.addArgument("sample-alias",sample); + } + if (library != null) { + args.addArgument("library-name",library); + } + for (final String attr : attributesToClear) { + args.addArgument("attributes-to-clear",attr); + } + + runCommandLine(args); + +// if (outputByReadGroup) { +// verifyPositiveResults(output0, reverter, removeDuplicates, removeAlignmentInfo, restoreOriginalQualities, outputByReadGroup, "0", 2, sample, library); +// verifyPositiveResults(output1, reverter, removeDuplicates, removeAlignmentInfo, restoreOriginalQualities, outputByReadGroup, "1", 4, sample, library); +// verifyPositiveResults(output2, reverter, removeDuplicates, removeAlignmentInfo, restoreOriginalQualities, outputByReadGroup, "2", 2, sample, library); +// } else { +// verifyPositiveResults(output, reverter, removeDuplicates, removeAlignmentInfo, restoreOriginalQualities, outputByReadGroup, null, 8, sample, library); +// } + } +} diff --git a/src/test/java/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSparkIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSparkIntegrationTest.java index f2fb73b6ce4..136f2076755 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSparkIntegrationTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/spark/pipelines/SortSamSparkIntegrationTest.java @@ -7,6 +7,7 @@ import org.apache.spark.api.java.JavaRDD; import org.broadinstitute.barclay.argparser.CommandLineException; import org.broadinstitute.hellbender.CommandLineProgramTest; +import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; import org.broadinstitute.hellbender.engine.ReadsDataSource; import org.broadinstitute.hellbender.engine.spark.GATKSparkTool; import 
org.broadinstitute.hellbender.engine.spark.SparkContextFactory; @@ -74,7 +75,7 @@ public void testSortBAMs( args.addReference(referenceFile); factory.referenceSequence(referenceFile); } - args.addArgument(SortSamSpark.SORT_ORDER_LONG_NAME, sortOrder.name()); + args.addArgument(StandardArgumentDefinitions.SORT_ORDER_LONG_NAME, sortOrder.name()); this.runCommandLine(args); @@ -103,7 +104,7 @@ public void testSortBAMsSharded( if (null != referenceFile) { args.addReference(referenceFile); } - args.addArgument(SortSamSpark.SORT_ORDER_LONG_NAME, sortOrder.name()); + args.addArgument(StandardArgumentDefinitions.SORT_ORDER_LONG_NAME, sortOrder.name()); args.addBooleanArgument(GATKSparkTool.SHARDED_OUTPUT_LONG_NAME,true); args.addArgument(GATKSparkTool.NUM_REDUCERS_LONG_NAME, "2"); @@ -134,7 +135,7 @@ public void testBadSortOrders(SAMFileHeader.SortOrder badOrder){ ArgumentsBuilder args = new ArgumentsBuilder(); args.addInput(unsortedBam); args.addOutput(createTempFile("sort_bam_spark", BAM)); - args.addArgument(SortSamSpark.SORT_ORDER_LONG_NAME, badOrder.toString()); + args.addArgument(StandardArgumentDefinitions.SORT_ORDER_LONG_NAME, badOrder.toString()); this.runCommandLine(args); } diff --git a/src/test/java/org/broadinstitute/hellbender/tools/spark/transforms/markduplicates/MarkDuplicatesSparkUtilsUnitTest.java b/src/test/java/org/broadinstitute/hellbender/tools/spark/transforms/markduplicates/MarkDuplicatesSparkUtilsUnitTest.java index 63cc3282e8c..7fa97a7b7bc 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/spark/transforms/markduplicates/MarkDuplicatesSparkUtilsUnitTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/spark/transforms/markduplicates/MarkDuplicatesSparkUtilsUnitTest.java @@ -18,6 +18,7 @@ import org.broadinstitute.hellbender.utils.read.GATKRead; import org.broadinstitute.hellbender.utils.read.SAMRecordToGATKReadAdapter; import org.broadinstitute.hellbender.utils.read.markduplicates.MarkDuplicatesScoringStrategy; +import 
org.broadinstitute.hellbender.utils.spark.SparkUtils; import org.testng.Assert; import org.testng.annotations.Test; import picard.sam.markduplicates.MarkDuplicates; @@ -51,7 +52,7 @@ private String getReadGroupId(final SAMFileHeader header, final int index) { } private static void check(Iterator> it, List>> expected) { - Iterator>> spanning = MarkDuplicatesSparkUtils.spanningIterator(it); + Iterator>> spanning = SparkUtils.spanningIterator(it); ArrayList>> actual = Lists.newArrayList(spanning); Assert.assertEquals(actual, expected); } diff --git a/src/test/java/org/broadinstitute/hellbender/tools/walkers/markduplicates/AbstractMarkDuplicatesCommandLineProgramTest.java b/src/test/java/org/broadinstitute/hellbender/tools/walkers/markduplicates/AbstractMarkDuplicatesCommandLineProgramTest.java index 7b99364050a..f5395c98278 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/walkers/markduplicates/AbstractMarkDuplicatesCommandLineProgramTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/walkers/markduplicates/AbstractMarkDuplicatesCommandLineProgramTest.java @@ -755,7 +755,7 @@ public void testDuplicateDetectionDataProviderWithMetrics(final File sam, final final List lines = FileUtils.readLines(metricsFile, StandardCharsets.UTF_8); Assert.assertTrue(lines.get(0).startsWith("##"), lines.get(0)); Assert.assertTrue(lines.get(1).startsWith("#"), lines.get(1)); - Assert.assertTrue(lines.get(1).toLowerCase().contains("--input"), lines.get(1)); //Note: lowercase because picard uses INPUT and GATK uses input for full name + Assert.assertTrue(lines.get(1).toLowerCase().contains("--input"), lines.get(1)); //Note: lowercase because picard uses input and GATK uses input for full name Assert.assertTrue(lines.get(2).startsWith("##"), lines.get(2)); Assert.assertTrue(lines.get(3).startsWith("# Started on:"), lines.get(3)); Assert.assertTrue(lines.get(4).trim().isEmpty()); diff --git 
a/src/test/java/org/broadinstitute/hellbender/utils/codecs/table/TableCodecUnitTest.java b/src/test/java/org/broadinstitute/hellbender/utils/codecs/table/TableCodecUnitTest.java index a46f8c6f558..c3da0d5444a 100644 --- a/src/test/java/org/broadinstitute/hellbender/utils/codecs/table/TableCodecUnitTest.java +++ b/src/test/java/org/broadinstitute/hellbender/utils/codecs/table/TableCodecUnitTest.java @@ -175,8 +175,8 @@ public void close() { @DataProvider(name = "stringNull") public Object[][] stringNull() { List params = new ArrayList<>(); - params.add(new String[]{TableCodec.HEADER_DELIMITER + " foo"}); - params.add(new String[]{TableCodec.HEADER_DELIMITER + " bar"}); + params.add(new String[]{TableCodec.DEFAULT_HEADER_DELIMITER + " foo"}); + params.add(new String[]{TableCodec.DEFAULT_HEADER_DELIMITER + " bar"}); params.add(new String[]{TableCodec.IGV_HEADER_DELIMITER + " baz"}); return params.toArray(new Object[][]{}); } @@ -191,8 +191,8 @@ public void testDecodeNull(String stringNull){ @DataProvider(name = "stringNotNull") public Object[][] stringNotNull() { List params = new ArrayList<>(); - params.add(new String[]{"foo " + TableCodec.HEADER_DELIMITER}); - params.add(new String[]{"foo " + TableCodec.HEADER_DELIMITER}); + params.add(new String[]{"foo " + TableCodec.DEFAULT_HEADER_DELIMITER}); + params.add(new String[]{"foo " + TableCodec.DEFAULT_HEADER_DELIMITER}); params.add(new String[]{"foo " + TableCodec.IGV_HEADER_DELIMITER}); return params.toArray(new Object[][]{}); } diff --git a/src/test/java/org/broadinstitute/hellbender/utils/haplotype/HaplotypeBAMWriterUnitTest.java b/src/test/java/org/broadinstitute/hellbender/utils/haplotype/HaplotypeBAMWriterUnitTest.java index 23334a24d33..3d638891a89 100644 --- a/src/test/java/org/broadinstitute/hellbender/utils/haplotype/HaplotypeBAMWriterUnitTest.java +++ b/src/test/java/org/broadinstitute/hellbender/utils/haplotype/HaplotypeBAMWriterUnitTest.java @@ -58,7 +58,7 @@ public Object[][] makeReadsLikelikhoodData() { 
final ReadLikelihoods readLikelihoods ) throws IOException { - // create OUTPUT SAM file + // create output SAM file final Path outPath = testWriteToFile(".sam", haplotypes, genomeLoc, readLikelihoods, false, false); final File expectedFile = new File(expectedFilePath, "testSAM.sam"); IntegrationTestSpec.assertEqualTextFiles(outPath.toFile(), expectedFile); @@ -73,7 +73,7 @@ public Object[][] makeReadsLikelikhoodData() { final ReadLikelihoods readLikelihoods ) throws IOException { - // create OUTPUT BAM file + // create output BAM file final Path outPath = testWriteToFile(".bam", haplotypes, genomeLoc, readLikelihoods, false, false); final File expectedFile = new File(expectedFilePath, "testBAM.bam"); SamAssertionUtils.assertEqualBamFiles(outPath.toFile(), expectedFile, false, ValidationStringency.DEFAULT_STRINGENCY); @@ -88,7 +88,7 @@ public Object[][] makeReadsLikelikhoodData() { final ReadLikelihoods readLikelihoods ) throws IOException { - // create OUTPUT BAM file + // create output BAM file final Path outPath = testWriteToFile(".bam", haplotypes, genomeLoc, readLikelihoods, true, false); final File expectedFile = new File(expectedFilePath, "testBAM.bam"); SamAssertionUtils.assertEqualBamFiles(outPath.toFile(), expectedFile, false, ValidationStringency.DEFAULT_STRINGENCY); @@ -103,7 +103,7 @@ public Object[][] makeReadsLikelikhoodData() { final ReadLikelihoods readLikelihoods ) throws IOException { - // create OUTPUT BAM file + // create output BAM file final Path outPath = testWriteToFile(".bam", haplotypes, genomeLoc, readLikelihoods, false, true); final File expectedFile = new File(expectedFilePath, "testBAM.bam"); SamAssertionUtils.assertEqualBamFiles(outPath.toFile(), expectedFile, false, ValidationStringency.DEFAULT_STRINGENCY); diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/spark/RevertSamSpark/test.dict b/src/test/resources/org/broadinstitute/hellbender/tools/spark/RevertSamSpark/test.dict new file mode 100644 index 
00000000000..ca4b5c15099 --- /dev/null +++ b/src/test/resources/org/broadinstitute/hellbender/tools/spark/RevertSamSpark/test.dict @@ -0,0 +1,9 @@ +@HD VN:1.5 +@SQ SN:chr1 LN:101 M5:bd01f7e11515bb6beda8f7257902aa67 UR:file:/Users/emeryj/hellbender/gatk/src/test/resources/org/broadinstitute/hellbender/tools/spark/RevertSamSpark/test.fasta +@SQ SN:chr2 LN:101 M5:31c33e2155b3de5e2554b693c475b310 UR:file:/Users/emeryj/hellbender/gatk/src/test/resources/org/broadinstitute/hellbender/tools/spark/RevertSamSpark/test.fasta +@SQ SN:chr3 LN:101 M5:631593c6dd2048ae88dcce2bd505d295 UR:file:/Users/emeryj/hellbender/gatk/src/test/resources/org/broadinstitute/hellbender/tools/spark/RevertSamSpark/test.fasta +@SQ SN:chr4 LN:101 M5:c60cb92f1ee5b78053c92bdbfa19abf1 UR:file:/Users/emeryj/hellbender/gatk/src/test/resources/org/broadinstitute/hellbender/tools/spark/RevertSamSpark/test.fasta +@SQ SN:chr5 LN:101 M5:07ebc213c7611db0eacbb1590c3e9bda UR:file:/Users/emeryj/hellbender/gatk/src/test/resources/org/broadinstitute/hellbender/tools/spark/RevertSamSpark/test.fasta +@SQ SN:chr6 LN:101 M5:7be2f5e7ee39e60a6c3b5b6a41178c6d UR:file:/Users/emeryj/hellbender/gatk/src/test/resources/org/broadinstitute/hellbender/tools/spark/RevertSamSpark/test.fasta +@SQ SN:chr7 LN:404 M5:3e5c961c0496e099b2a223692430cb8c UR:file:/Users/emeryj/hellbender/gatk/src/test/resources/org/broadinstitute/hellbender/tools/spark/RevertSamSpark/test.fasta +@SQ SN:chr8 LN:202 M5:d339678efce576d5546e88b49a487b63 UR:file:/Users/emeryj/hellbender/gatk/src/test/resources/org/broadinstitute/hellbender/tools/spark/RevertSamSpark/test.fasta diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/spark/revertsamspark/missing-rg-info.sam b/src/test/resources/org/broadinstitute/hellbender/tools/spark/revertsamspark/missing-rg-info.sam new file mode 100644 index 00000000000..6f3ef07d858 --- /dev/null +++ b/src/test/resources/org/broadinstitute/hellbender/tools/spark/revertsamspark/missing-rg-info.sam @@ -0,0 +1,243 @@ 
+@HD VN:1.5 SO:queryname +@SQ SN:20 LN:63025520 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta AS:GRCh37 M5:0dec9660ec1efaaf33281c0d5ea2560f SP:Homo Sapiens +@PG ID:GATK PrintReads VN:3.6-0-g89b7209 CL:readGroup=null platform=null number=-1 sample_file=[] sample_name=[] simplify=false no_pg_tag=false +H0164ALXX140820:2:1101:17727:54981 83 20 10000954 48 151M = 10000786 -319 TAATATTTGTAACTTACAATTACTTCAACTGAATAATAAAAGAATTGGACTAGATTTCTCCAACATCTCTCTCTTTTGGCTTTATGTTAGATAATGCTAAATTTTCATCATATCCAAACATGCTATATAATTTTATGAACTGTTACAGAGT A-BFGBGGDBE>@>D00F@0FB.6@D@=/..@8@@.8BFFDCGEGDGGGGEG0FGGGGBGGGGGGGGEBGGGGGGGGGGGGGGGGGGGGGEGGGGGGEGEGGGE@DGCGFGGGGGEGCF=@FGGGG RG:Z:A diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/spark/revertsamspark/revert_sam_valid_output_map.txt b/src/test/resources/org/broadinstitute/hellbender/tools/spark/revertsamspark/revert_sam_valid_output_map.txt new file mode 100644 index 00000000000..ed5a13a1647 --- /dev/null +++ b/src/test/resources/org/broadinstitute/hellbender/tools/spark/revertsamspark/revert_sam_valid_output_map.txt @@ -0,0 +1,3 @@ +READ_GROUP_ID OUTPUT +rg1 /path/to/my_rg_1.ubam +rg2 /path/to/my_rg_2.ubam diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/spark/revertsamspark/test.fasta b/src/test/resources/org/broadinstitute/hellbender/tools/spark/revertsamspark/test.fasta new file mode 100644 index 00000000000..6bcf2a8ce01 --- /dev/null +++ b/src/test/resources/org/broadinstitute/hellbender/tools/spark/revertsamspark/test.fasta @@ -0,0 +1,40 @@ +>chr1 +TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC +TTTAAGGGCAAAAAAAAAACAATACAATAATAGAGTACGTTAACACTCCA +A +>chr2 +CATCTCTACAAGCGCGTCCTACCAGACGCGCTTCCGATCTGAGAGCATAC +TTTTCATTGGATTCCAGCACAACTCCATTTTTGATCCACTTGACACCTTT +T +>chr3 +CGTATGCGCTTTTTATGTCGCCCACAGTGCCTAGTATAGCCCCTGCTAAT +AAAAAGAGATGAATACGTTTACTTAAAAAACTGAAACTAGGAATGTGCAA +A +>chr4 +CGTGATACCAACTCATGTTCACAGCCAAAGCCTGAAGCTGTCTATTATAT 
+TTCTCAACCATAAACTTTTGCCTCAGGCATCCGCAGAATGGTTTGCAGCC +C +>chr5 +NTCTCATTTAAAAATGGTTATAAAAACATTTATGCTGAAAAGGTGAAGTT +CATTAATGAACAGGCTGACTGTCTCACTATCGCGTTCGCAAGACGTTATC +T +>chr6 +NAATTGTTCTTAGTTTCTCGGTTTATGTGCTCTTCCAGGTGGGTAACACA +ATAATGGCCTTCCAGATCGTAAGAGCGACGTGTGTTGCACCAGTGTCGAT +C +>chr7 +CAACAGAAGGGGGGATCTGTGTTTGTGTTTCGGATTTCCTGCTGAAAAGG +TTTTCGGGTCCCCCCCCCATCCCGATTTCCTTCCGCAGCTTACCTCCCGA +AACGCGGCATCCCGATTTCTTTCCGCAGCTAACCTCCCGACAGATCGGCA +GCGCGTCGTGTAGGTCACTATGGTACATCTTGTCGTGCGGCCAGAGCATA +CAACAGAAGGGGGGATCTGTGTTTGTGTTTCGGATTTCCTGCTGAAAAGG +TTTTCGGGTCCCCCCCCCATCCCGATTTCCTTCCGCAGCTTACCTCCCGA +AACGCGGCATCCCGATTTCTTTCCGCAGCTAACCTCCCGACAGATCGGCA +GCGCGTCGTGTAGGTCACTATGGTACATCTTGTCGTGCGGCCAGAGCATA +CAAC +>chr8 +CACATCGTGAATCTTACAATCTGCGGTTTCAGATGTGGAGCGATGTGTGA +GAGATTGAGCAACTGATCTGAAAAGCAGACACAGCTATTCCTAAGATGAC +CCCAGGTTCAAATGTGCAGCCCCTTTTGAGAGATTTTTTTTTTGGGCTGG +AAAAAAGACACAGCTATTCCTAAGATGACAAGATCAGAAAAAAAGTCAAG +CA diff --git a/src/testUtils/java/org/broadinstitute/hellbender/testutils/SamAssertionUtils.java b/src/testUtils/java/org/broadinstitute/hellbender/testutils/SamAssertionUtils.java index e638aa4408d..1e4e9dd05ad 100644 --- a/src/testUtils/java/org/broadinstitute/hellbender/testutils/SamAssertionUtils.java +++ b/src/testUtils/java/org/broadinstitute/hellbender/testutils/SamAssertionUtils.java @@ -440,11 +440,11 @@ private static void sortSam(final File input, final File output, final File refe // We can't use ArgumentsBuilder since it assumes GATK argument names, but we're running a Picard // tool, which uses upper case argument names. 
final List args = new ArrayList<>(6); - args.add("--INPUT"); + args.add("--input"); args.add(input.getAbsolutePath()); - args.add("--OUTPUT"); + args.add("--output"); args.add(output.getAbsolutePath()); - args.add("--SORT_ORDER"); + args.add("--sortOrder"); args.add(SAMFileHeader.SortOrder.coordinate.name()); args.add("--VALIDATION_STRINGENCY"); args.add(stringency.name()); From 1fd38988bb83066b9e2519663d937b136817499b Mon Sep 17 00:00:00 2001 From: James Date: Tue, 27 Nov 2018 16:44:01 -0500 Subject: [PATCH 02/15] responding to the first round of comments --- .../tools/spark/RevertSamSpark.java | 393 +++++++++--------- .../markduplicates/MarkDuplicatesSpark.java | 16 +- .../hellbender/utils/spark/SparkUtils.java | 13 + .../spark/RevertSamSparkIntegrationTest.java | 102 +++-- 4 files changed, 261 insertions(+), 263 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/RevertSamSpark.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/RevertSamSpark.java index 0811f38948f..c38e8603977 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/spark/RevertSamSpark.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/RevertSamSpark.java @@ -5,6 +5,7 @@ import com.google.common.collect.ListMultimap; import com.google.common.collect.Lists; import htsjdk.samtools.*; +import htsjdk.samtools.cram.build.CramIO; import htsjdk.samtools.util.*; import htsjdk.tribble.AbstractFeatureReader; import htsjdk.tribble.FeatureReader; @@ -22,16 +23,16 @@ import org.broadinstitute.hellbender.engine.filters.ReadFilterLibrary; import org.broadinstitute.hellbender.engine.spark.GATKSparkTool; import org.broadinstitute.hellbender.exceptions.GATKException; +import org.broadinstitute.hellbender.exceptions.UserException; import org.broadinstitute.hellbender.utils.Utils; import org.broadinstitute.hellbender.utils.codecs.table.TableCodec; import org.broadinstitute.hellbender.utils.codecs.table.TableFeature; +import 
org.broadinstitute.hellbender.utils.io.IOUtils; import org.broadinstitute.hellbender.utils.read.GATKRead; -import org.broadinstitute.hellbender.utils.read.ReadUtils; import org.broadinstitute.hellbender.utils.spark.SparkUtils; import picard.cmdline.programgroups.ReadDataManipulationProgramGroup; import scala.Tuple2; -import java.io.File; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; @@ -52,14 +53,14 @@ *

    Usage Examples

    *

    Output to a single file

    *
    - * java -jar picard.jar RevertSam \\
    + * gatk RevertSamSpark  \\
      *      -I input.bam \\
      *      -O reverted.bam
      * 
    *

    *

    Output by read group into multiple files with sample map

    *
    - * java -jar picard.jar RevertSam \\
    + * gatk RevertSamSpark \\
      *      -I input.bam \\
      *      --output-by-readgroup \\
      *      --output-map reverted_bam_paths.tsv
    @@ -67,7 +68,7 @@
      * 

    *

    Output by read group with no output map

    *
    - * java -jar picard.jar RevertSam \\
    + * gatk RevertSamSpark \\
      *      -I input.bam \\
      *      --output-by-readgroup \\
      *      -O /write/reverted/read/group/bams/in/this/dir
    @@ -86,27 +87,27 @@
             programGroup = ReadDataManipulationProgramGroup.class)
     @BetaFeature
     public class RevertSamSpark extends GATKSparkTool {
    -    static final String USAGE_SUMMARY = "Reverts SAM or BAM files to a previous state.  ";
    +    static final String USAGE_SUMMARY = "Reverts SAM or BAM files to a previous state.";
         static final String USAGE_DETAILS = "This tool removes or restores certain properties of the SAM records, including alignment " +
                 "information, which can be used to produce an unmapped BAM (uBAM) from a previously aligned BAM. It is also capable of " +
                 "restoring the original quality scores of a BAM file that has already undergone base quality score recalibration (BQSR) if the" +
                 "original qualities were retained.\n" +
                 "

    Examples

    \n" + "

    Example with single output:

    \n" + - "java -jar picard.jar RevertSam \\\n" + + "gatk RevertSamSpark \\\n" + " -I input.bam \\\n" + " -O reverted.bam\n" + "\n" + "

    Example outputting by read group with output map:

    \n" + - "java -jar picard.jar RevertSam \\\n" + + "gatk RevertSamSpark \\\n" + " -I input.bam \\\n" + " --output-by-readgroup \\\n" + " --output-map reverted_bam_paths.tsv\n" + "\n" + "Will output a BAM/SAM file per read group.\n" + "

    Example outputting by read group without output map:

    \n" + - "java -jar picard.jar RevertSam \\\n" + - " I=input.bam \\\n" + + "gatk RevertSamSpark \\\n" + + " -I input.bam \\\n" + " --output-by-readgroup \\\n" + " -O /write/reverted/read/group/bams/in/this/dir\n" + "\n" + @@ -116,89 +117,68 @@ public class RevertSamSpark extends GATKSparkTool { "LENIENT or SILENT if the failures are expected to be obviated by the reversion process " + "(e.g. invalid alignment information will be obviated when the dontRemoveAlignmentInformation option is used).\n" + ""; + public static final String OUTPUT_MAP_READ_GROUP_FIELD_NAME = "READ_GROUP_ID"; + public static final String OUTPUT_MAP_OUTPUT_FILE_FIELD_NAME = "OUTPUT"; + @Override public boolean requiresReads() { return true; } - @Argument(mutex = {OUTPUT_MAP_ARG}, shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, doc = "The output SAM/BAM file to create, or an output directory if '--output-by-readgroup' is set.") - public File output; + @Argument(mutex = {OUTPUT_MAP_LONG_NAME}, shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, + fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, + doc = "The output SAM/BAM file to create, or an output directory if '--output-by-readgroup' is set.") + public String output; - public static final String OUTPUT_MAP_ARG = "output-map"; - @Argument(mutex = {StandardArgumentDefinitions.OUTPUT_LONG_NAME}, fullName = OUTPUT_MAP_ARG, doc = "Tab separated file with two columns, READ_GROUP_ID and OUTPUT, providing file mapping only used if '--output-by-readgroup' is set.") - public File outputMap; + public static final String OUTPUT_MAP_LONG_NAME = "output-map"; + @Argument(mutex = {StandardArgumentDefinitions.OUTPUT_LONG_NAME}, + fullName = OUTPUT_MAP_LONG_NAME, + doc = "Tab separated file with two columns, OUTPUT_MAP_READ_GROUP_FIELD_NAME and OUTPUT_MAP_OUTPUT_FILE_FIELD_NAME, providing file mapping only used if '--output-by-readgroup' is set.") + public String outputMap; 
- public static final String OUTPUT_BY_READGROUP_ARG = "output-by-readgroup"; - @Argument(fullName = OUTPUT_BY_READGROUP_ARG, doc = "When true, outputs each read group in a separate file.") + public static final String OUTPUT_BY_READGROUP_LONG_NAME = "output-by-readgroup"; + @Argument(fullName = OUTPUT_BY_READGROUP_LONG_NAME, doc = "When true, outputs each read group in a separate file.") public boolean outputByReadGroup = false; - public static enum FileType implements CommandLineParser.ClpEnum { - sam("Generate SAM files."), - bam("Generate BAM files."), - cram("Generate CRAM files."), - dynamic("Generate files based on the extention of input."); - - final String description; - - FileType(String descrition) { - this.description = descrition; - } - - @Override - public String getHelpDoc() { - return description; - } - } - @Argument(doc = "WARNING: This option is potentially destructive. If enabled will discard reads in order to produce " + "a consistent output BAM. Reads discarded include (but are not limited to) paired reads with missing " + "mates, duplicated records, records with mismatches in length of bases and qualities. This option should " + "only be enabled if the output sort order is queryname and will always cause sorting to occur.") public boolean sanitize = false; + public static final String KEEP_FIRST_DUPLICATE_LONG_NAME = "keep-first-duplicate"; @Argument(doc = "If 'sanitize' only one record when we find more than one record with the same name for R1/R2/unpaired reads respectively. " + "For paired end reads, keeps only the first R1 and R2 found respectively, and discards all unpaired reads. 
" + "Duplicates do not refer to the duplicate flag in the FLAG field, but instead reads with the same name.", - fullName = "keep-first-duplicate") + fullName = KEEP_FIRST_DUPLICATE_LONG_NAME) public boolean keepFirstDuplicate = false; - public static final String OUTPUT_BY_READGROUP_FILE_FORMAT_ARG = "output-by-readgroup-file-format"; - @Argument(fullName = OUTPUT_BY_READGROUP_FILE_FORMAT_ARG, doc = "When using outputByReadGroup, the output file format can be set to a certain format.") + public static final String OUTPUT_BY_READGROUP_FILE_FORMAT_LONG_NAME = "output-by-readgroup-file-format"; + @Argument(fullName = OUTPUT_BY_READGROUP_FILE_FORMAT_LONG_NAME, doc = "When using --output-by-readgroup, the output file format can be set to a certain format.") public FileType outputByReadgroupFileFormat = FileType.dynamic; @Argument(shortName = StandardArgumentDefinitions.SORT_ORDER_SHORT_NAME, fullName = StandardArgumentDefinitions.SORT_ORDER_LONG_NAME, doc = "The sort order to create the reverted output file with, defaults to whatever is specified in the current file", optional = true) public SAMFileHeader.SortOrder sortOrder = SAMFileHeader.SortOrder.queryname; - public static final String DONT_RESTORE_ORIGINAL_QUALITIES_ARG = "dont-restore-original-qualities"; - @Argument( fullName = DONT_RESTORE_ORIGINAL_QUALITIES_ARG, doc = "Set to prevent the tool from setting the OQ field to the QUAL where avalible.", optional = true) + public static final String DONT_RESTORE_ORIGINAL_QUALITIES_LONG_NAME = "dont-restore-original-qualities"; + @Argument( fullName = DONT_RESTORE_ORIGINAL_QUALITIES_LONG_NAME, doc = "Set to prevent the tool from setting the OQ field to the QUAL where available.", optional = true) public boolean dontRestoreOriginalQualities = false; - public static final String DONT_REMOVE_DUPLICATE_INFORMATION_ARG = "remove-duplicate-information"; - @Argument(fullName = DONT_REMOVE_DUPLICATE_INFORMATION_ARG, doc = "By default we remove duplicate read flags from all 
reads. Note that if this is true, " + + public static final String DONT_REMOVE_DUPLICATE_INFORMATION_LONG_NAME = "remove-duplicate-information"; + @Argument(fullName = DONT_REMOVE_DUPLICATE_INFORMATION_LONG_NAME, doc = "By default we remove duplicate read flags from all reads. Note that if this is true, " + " the output may have the unusual but sometimes desirable trait of having unmapped reads that are marked as duplicates.") - public boolean dontRemoveDuplicateInformation = false; //TODO flip this value + public boolean dontRemoveDuplicateInformation = false; - public static final String DONT_REMOVE_ALIGNMENT_INFORMATION_ARG = "remove-alignment-information"; - @Argument(fullName = DONT_REMOVE_ALIGNMENT_INFORMATION_ARG, doc = "Remove all alignment information from the file.") + public static final String DONT_REMOVE_ALIGNMENT_INFORMATION_LONG_NAME = "remove-alignment-information"; + @Argument(fullName = DONT_REMOVE_ALIGNMENT_INFORMATION_LONG_NAME, doc = "Remove all alignment information from the file.") public boolean dontRemoveAlignmentInformation = false; - public static final String ATTRIBUTE_TO_CLEAR_ARG = "attributes-to-clear"; - @Argument(fullName = ATTRIBUTE_TO_CLEAR_ARG,doc = "When removing alignment information, the set of optional tags to remove.", optional = true) - public List attributesToClear = new ArrayList(); + public static final String ATTRIBUTE_TO_CLEAR_LONG_NAME = "attributes-to-clear"; + @Argument(fullName = ATTRIBUTE_TO_CLEAR_LONG_NAME, doc = "When removing alignment information, the set of optional tags to remove.", optional = true) + public Set attributesToClear = new HashSet(); - public static final String REMOVE_DEFAULT_ATTRIBUTE_TO_CLEAR_ARG = "remove-default-attributes-to-clear"; - @Argument(fullName = REMOVE_DEFAULT_ATTRIBUTE_TO_CLEAR_ARG,doc = "When removing alignment information, the set of optional tags to remove.", optional = true) + public static final String REMOVE_DEFAULT_ATTRIBUTE_TO_CLEAR_LONG_NAME = 
"remove-default-attributes-to-clear"; + @Argument(fullName = REMOVE_DEFAULT_ATTRIBUTE_TO_CLEAR_LONG_NAME, doc = "When removing alignment information, the set of optional tags to remove.", optional = true) public boolean removeDefaults = false; - public static List DEFAULT_ATTRIBUTES_TO_CLEAR = new ArrayList() {{ - add(SAMTag.NM.name()); - add(SAMTag.UQ.name()); - add(SAMTag.PG.name()); - add(SAMTag.MD.name()); - add(SAMTag.MQ.name()); - add(SAMTag.SA.name()); // Supplementary alignment metadata - add(SAMTag.MC.name()); // Mate Cigar - add(SAMTag.AS.name()); - }}; - public static final String SAMPLE_ALIAS_ARG = "sample-alias"; @Argument(fullName = SAMPLE_ALIAS_ARG, doc = "The sample alias to use in the reverted output file. This will override the existing " + "sample alias in the file and is used only if all the read groups in the input file have the " + @@ -216,13 +196,43 @@ public List getDefaultReadFilters() { return Collections.singletonList(ReadFilterLibrary.ALLOW_ALL_READS); } + public static List DEFAULT_ATTRIBUTES_TO_CLEAR = new ArrayList() {{ + add(SAMTag.NM.name()); + add(SAMTag.UQ.name()); + add(SAMTag.PG.name()); + add(SAMTag.MD.name()); + add(SAMTag.MQ.name()); + add(SAMTag.SA.name()); // Supplementary alignment metadata + add(SAMTag.MC.name()); // Mate Cigar + add(SAMTag.AS.name()); + }}; + + public enum FileType implements CommandLineParser.ClpEnum { + sam("Generate SAM files."), + bam("Generate BAM files."), + cram("Generate CRAM files."), + dynamic("Generate files based on the extention of input."); + + final String description; + + FileType(String description) { + this.description = description; + } + + @Override + public String getHelpDoc() { + return description; + } + } + /** * Enforce that output ordering is queryname when sanitization is turned on since it requires a queryname sort. + * Also checks to ensure that the user has chosen a valid subset of arguments pertaining to output and sanitization. 
*/ @Override protected String[] customCommandLineValidation() { final List errors = new ArrayList<>(); - RevertSamSpark.ValidationUtil.validateOutputParams(outputByReadGroup, output, outputMap, errors); + validateOutputParams(outputByReadGroup, output, outputMap); if (!sanitize && keepFirstDuplicate) errors.add("'keepFirstDuplicate' cannot be used without 'sanitize'"); @@ -234,37 +244,37 @@ protected String[] customCommandLineValidation() { @Override protected void runTool(JavaSparkContext ctx) { - Broadcast headerb = ctx.broadcast(getHeaderForReads()); + Broadcast headerBroadcast = ctx.broadcast(getHeaderForReads()); JavaRDD reads = getReads(); //////////////////////////////////////////////////////////////////////////// // Grab the input header and remap values where appropriate //////////////////////////////////////////////////////////////////////////// - SAMFileHeader inHeader = getHeaderForReads(); - ValidationUtil.validateHeaderOverrides(inHeader, sampleAlias, libraryName); + SAMFileHeader localHeader = headerBroadcast.getValue(); + validateHeaderOverrides(localHeader, sampleAlias, libraryName); if (sampleAlias != null) { - inHeader.getReadGroups().forEach(rg -> rg.setSample(sampleAlias)); + localHeader.getReadGroups().forEach(rg -> rg.setSample(sampleAlias)); } if (libraryName != null) { - inHeader.getReadGroups().forEach(rg -> rg.setLibrary(libraryName)); + localHeader.getReadGroups().forEach(rg -> rg.setLibrary(libraryName)); } //////////////////////////////////////////////////////////////////////////// // Map the readgroups in the header to appropriate //////////////////////////////////////////////////////////////////////////// - Map writerMap = getOutputMap(outputMap, + Map writerMap = getOutputMap(outputMap, output, getDefaultExtension(readArguments.getReadFiles().get(0).toString(), outputByReadgroupFileFormat), - inHeader.getReadGroups(), + localHeader.getReadGroups(), outputByReadGroup); 
//////////////////////////////////////////////////////////////////////////// // Construct appropriate headers for the output files //////////////////////////////////////////////////////////////////////////// - final Map headerMap = getReadgroupHeaderMap(inHeader, writerMap); + final Map headerMap = getReadGroupHeaderMap(localHeader, writerMap); // Revert the reads based on the given attributes - List attributesToRevert = removeDefaults? DEFAULT_ATTRIBUTES_TO_CLEAR: new ArrayList<>(); + List attributesToRevert = removeDefaults ? DEFAULT_ATTRIBUTES_TO_CLEAR : new ArrayList<>(); attributesToRevert.addAll(attributesToClear); JavaRDD readsReverted = revertReads(reads, attributesToRevert); @@ -272,18 +282,18 @@ protected void runTool(JavaSparkContext ctx) { // Sanitize the reads, sorting them into appropriate order if necessary //////////////////////////////////////////////////////////////////////////// if (sanitize) { - Map readGroupFormatMap = createReadGroupFormatMap(readsReverted, headerb, !dontRestoreOriginalQualities); + Map readGroupFormatMap = createReadGroupFormatMap(readsReverted, headerBroadcast, !dontRestoreOriginalQualities); - readsReverted = sanitize(readGroupFormatMap, readsReverted, inHeader, keepFirstDuplicate); + readsReverted = sanitize(readGroupFormatMap, readsReverted, localHeader, keepFirstDuplicate); } // Write the one or many read output files - for (Map.Entry rmap: writerMap.entrySet()) { + for (Map.Entry rmap: writerMap.entrySet()) { //TODO what to do if the readgroup isn't present final String key = rmap.getKey(); JavaRDD filteredreads = rmap.getKey()==null? 
readsReverted : readsReverted.filter(r -> r.getReadGroup().equals(key)); - writeReads(ctx, rmap.getValue().getPath(), filteredreads, headerMap.get(rmap.getKey())); //TODO proper header map + writeReads(ctx, rmap.getValue().toString(), filteredreads, headerMap.get(rmap.getKey())); //TODO proper header map } } @@ -294,7 +304,7 @@ protected void runTool(JavaSparkContext ctx) { * @param reads Reads RDD over which to iterate and detect readgroups * @param inHeader Header describing the readgroups present in the bam * @param restoreOriginalQualities Whether to use the OQ tag for determining the map - * @return + * @return the best guess at the quality encoding format present for each readgroup based on the first {@link QualityEncodingDetector#DEFAULT_MAX_RECORDS_TO_ITERATE} reads in each readgroup. */ private Map createReadGroupFormatMap( final JavaRDD reads, final Broadcast inHeader, @@ -348,7 +358,7 @@ public SAMRecord next() { * and one read labled as second in pair to treat as the representative reads, throwing away the rest. 
*/ private JavaRDD sanitize(final Map readGroupToFormat, final JavaRDD reads, final SAMFileHeader header, final boolean keepFirstDuplicate) { - JavaRDD sortedReads = querynameSortReadsIfNecessary(reads.filter(r -> r.getBases().length == r.getBaseQualities().length), getRecommendedNumReducers(), header); + JavaRDD sortedReads = SparkUtils.querynameSortReadsIfNecessary(reads.filter(r -> r.getBases().length == r.getBaseQualities().length), getRecommendedNumReducers(), header); JavaPairRDD> readsByGroup = spanReadsByKey(sortedReads); return readsByGroup.flatMap(group -> { @@ -442,26 +452,15 @@ private static JavaPairRDD> spanReadsByKey(final Java }); } - private static JavaRDD querynameSortReadsIfNecessary(JavaRDD reads, int numReducers, SAMFileHeader headerForTool) { - JavaRDD sortedReadsForMarking; - if (ReadUtils.isReadNameGroupedBam(headerForTool)) { - sortedReadsForMarking = reads; - } else { - headerForTool.setSortOrder(SAMFileHeader.SortOrder.queryname); - sortedReadsForMarking = SparkUtils.sortReadsAccordingToHeader(reads, headerForTool, numReducers); - } - return sortedReadsForMarking; - } - - private Map getReadgroupHeaderMap(SAMFileHeader inHeader, Map writerMap) { + private Map getReadGroupHeaderMap(SAMFileHeader inHeader, Map writerMap) { final Map headerMap; if (outputByReadGroup) { if (inHeader.getReadGroups().isEmpty()) { - throw new GATKException("The header is missing its read group map"); + throw new UserException("The header is missing its read group map"); } - ValidationUtil.assertAllReadGroupsMapped(writerMap, inHeader.getReadGroups()); + assertAllReadGroupsMapped(writerMap, inHeader.getReadGroups()); headerMap = new HashMap<>(); for (final SAMReadGroupRecord readGroup : inHeader.getReadGroups()) { final SAMFileHeader header = createOutHeader(inHeader, sortOrder, !dontRemoveAlignmentInformation); @@ -471,12 +470,12 @@ private Map getReadgroupHeaderMap(SAMFileHeader inHeader, } else { final SAMFileHeader singleOutHeader = createOutHeader(inHeader, 
sortOrder, !dontRemoveAlignmentInformation); inHeader.getReadGroups().forEach(singleOutHeader::addReadGroup); - headerMap = Collections.singletonMap(null,singleOutHeader); + headerMap = Collections.singletonMap(null, singleOutHeader); } return headerMap; } - private SAMFileHeader createOutHeader( + private static SAMFileHeader createOutHeader( final SAMFileHeader inHeader, final SAMFileHeader.SortOrder sortOrder, final boolean removeAlignmentInformation) { @@ -492,13 +491,13 @@ private SAMFileHeader createOutHeader( @VisibleForTesting static String getDefaultExtension(final String input, final FileType setting) { if (setting == FileType.dynamic) { - if (input.endsWith(".sam")) { - return ".sam"; + if (input.endsWith(IOUtil.SAM_FILE_EXTENSION)) { + return IOUtil.SAM_FILE_EXTENSION; } - if (input.endsWith(".cram")) { - throw new GATKException("Input file is a cram. This is currently unsupported for this tool");//TODO unsupported feature + if (input.endsWith(CramIO.CRAM_FILE_EXTENSION)) { + throw new UserException.UnimplementedFeature("Input file is a cram. This is currently unsupported for this tool"); } - return ".bam"; + return BamFileIoUtils.BAM_FILE_EXTENSION; } else { return "." 
+ setting.toString(); } @@ -554,153 +553,159 @@ public JavaRDD revertReads(JavaRDD reads, List attri } @VisibleForTesting - static Map getOutputMap( - final File outputMapFile, - final File outputDir, + static Map getOutputMap( + final String outputMapFile, + final String outputDir, final String defaultExtension, final List readGroups, final boolean outputByReadgroup) { if (outputByReadgroup) { - final Map outputMap; + final Map outputMap; if (outputMapFile != null) { try { outputMap = createOutputMapFromFile(outputMapFile); } catch (IOException e) { - throw new GATKException("Encountered an error reading output map file", e); + throw new UserException("Encountered an error reading output map file", e); } } else { outputMap = createOutputMapFromHeader(readGroups, outputDir, defaultExtension); } return outputMap; } else { - return Collections.singletonMap(null, outputDir); + return Collections.singletonMap(null, IOUtils.getPath(outputDir)); } } // Names the files based on the locations laid out in the readgroup map - private static Map createOutputMapFromFile(final File outputMapFile) throws IOException { - final Map outputMap = new HashMap<>(); - final FeatureReader parser = AbstractFeatureReader.getFeatureReader(outputMapFile.getAbsolutePath(), new TableCodec(null), false); - for (final TableFeature row : parser.iterator()) { - final String id = row.get("READ_GROUP_ID"); - final String output = row.get("OUTPUT"); - final File outputPath = new File(output); - outputMap.put(id, outputPath); + private static Map createOutputMapFromFile(final String outputMapFile) throws IOException { + final Map outputMap = new HashMap<>(); + try (final FeatureReader parser = AbstractFeatureReader.getFeatureReader(outputMapFile, new TableCodec(null), false);) { + for (final TableFeature row : parser.iterator()) { + final String id = row.get(OUTPUT_MAP_READ_GROUP_FIELD_NAME); + final String output = row.get(OUTPUT_MAP_OUTPUT_FILE_FIELD_NAME); + final Path outputPath = 
IOUtils.getPath(output); + outputMap.put(id, outputPath); + } } - CloserUtil.close(parser); return outputMap; } // Names the files based on the readgroups individually presented in the header - private static Map createOutputMapFromHeader(final List readGroups, final File outputDir, final String extension) { - final Map outputMap = new HashMap<>(); + private static Map createOutputMapFromHeader(final List readGroups, final String outputDir, final String extension) { + final Map outputMap = new HashMap<>(); for (final SAMReadGroupRecord readGroup : readGroups) { final String id = readGroup.getId(); final String fileName = id + extension; - final Path outputPath = Paths.get(outputDir.toString(), fileName); - outputMap.put(id, outputPath.toFile()); + final Path outputPath = Paths.get(outputDir, fileName); + outputMap.put(id, outputPath); } return outputMap; } -// ________________________________________________________________________________________________________________________ -// sum garbage -// ________________________________________________________________________________________________________________________ /** * Methods used for validating parameters to RevertSam. 
*/ - static class ValidationUtil { - - static void validateOutputParams(final boolean outputByReadGroup, final File output, final File outputMap, final List errors) { + static List validateOutputParams(final boolean outputByReadGroup, final String output, final String outputMap) { + final List errors = new ArrayList<>(); + try { if (outputByReadGroup) { - validateOutputParamsByReadGroup(output, outputMap, errors); + errors.addAll(validateOutputParamsByReadGroup(output, outputMap)); } else { - validateOutputParamsNotByReadGroup(output, outputMap, errors); + errors.addAll(validateOutputParamsNotByReadGroup(output, outputMap)); } + } catch (IOException e) { + throw new UserException.BadInput("Error while validating input file", e); } + return errors; + } - static void validateOutputParamsByReadGroup(final File output, final File outputMap, final List errors) { - if (output != null) { - if (!Files.isDirectory(output.toPath())) { - errors.add("When '--output-by-readgroup' is set and output is provided, it must be a directory: " + output); - } - return; - } - // output is null if we reached here - if (outputMap == null) { - errors.add("Must provide either output or outputMap when '--output-by-readgroup' is set."); - return; - } - if (!Files.isReadable(outputMap.toPath())) { - errors.add("Cannot read outputMap " + outputMap); - return; - } - final FeatureReader parser = AbstractFeatureReader.getFeatureReader(outputMap.getAbsolutePath(), new TableCodec(null),false); - if (!RevertSamSpark.ValidationUtil.isOutputMapHeaderValid((List)parser.getHeader())) { - errors.add("Invalid header: " + outputMap + ". 
Must be a tab-separated file with READ_GROUP_ID as first column and output as second column."); + @SuppressWarnings("unchecked") + static List validateOutputParamsByReadGroup(final String output, final String outputMap) throws IOException { + final List errors = new ArrayList<>(); + if (output != null) { + if (!Files.isDirectory(IOUtil.getPath(output))) { + errors.add("When '--output-by-readgroup' is set and output is provided, it must be a directory: " + output); } + return errors; + } + // output is null if we reached here + if (outputMap == null) { + errors.add("Must provide either output or outputMap when '--output-by-readgroup' is set."); + return errors; } + if (!Files.isReadable(IOUtil.getPath(outputMap))) { + errors.add("Cannot read outputMap " + outputMap); + return errors; + } + final FeatureReader parser = AbstractFeatureReader.getFeatureReader(outputMap, new TableCodec(null),false); + if (!isOutputMapHeaderValid((List)parser.getHeader())) { + errors.add("Invalid header: " + outputMap + ". Must be a tab-separated file with OUTPUT_MAP_READ_GROUP_FIELD_NAME as first column and output as second column."); + } + return errors; + } - static void validateOutputParamsNotByReadGroup(final File output, final File outputMap, final List errors) { - if (outputMap != null) { - errors.add("Cannot provide outputMap when '--output-by-read' isn't set. Provide output instead."); - } - if (output == null) { - errors.add("output is required when '--output-by-read'"); - return; - } - if (Files.isDirectory(output.toPath())) { - errors.add("output " + output + " should not be a directory when '--output-by-read'"); - } + static List validateOutputParamsNotByReadGroup(final String output, final String outputMap) throws IOException { + final List errors = new ArrayList<>(); + if (outputMap != null) { + errors.add("Cannot provide outputMap when '--output-by-read' isn't set. 
Provide output instead."); + } + if (output == null) { + errors.add("output is required when '--output-by-read'"); + return errors; } + if (Files.isDirectory(IOUtil.getPath(output))) { + errors.add("output " + output + " should not be a directory when '--output-by-read'"); + } + return errors; + } - /** - * If we are going to override sampleAlias or libraryName, make sure all the read - * groups have the same values. - */ - static void validateHeaderOverrides( - final SAMFileHeader inHeader, - final String sampleAlias, - final String libraryName) { - - final List rgs = inHeader.getReadGroups(); - if (sampleAlias != null || libraryName != null) { - boolean allSampleAliasesIdentical = true; - boolean allLibraryNamesIdentical = true; - for (int i = 1; i < rgs.size(); i++) { - if (!rgs.get(0).getSample().equals(rgs.get(i).getSample())) { - allSampleAliasesIdentical = false; - } - if (!rgs.get(0).getLibrary().equals(rgs.get(i).getLibrary())) { - allLibraryNamesIdentical = false; - } - } - if (sampleAlias != null && !allSampleAliasesIdentical) { - throw new GATKException("Read groups have multiple values for sample. " + - "A value for sampleAlias cannot be supplied."); + /** + * If we are going to override sampleAlias or libraryName, make sure all the read + * groups have the same values. + */ + static void validateHeaderOverrides( + final SAMFileHeader inHeader, + final String sampleAlias, + final String libraryName) { + + final List rgs = inHeader.getReadGroups(); + if (sampleAlias != null || libraryName != null) { + boolean allSampleAliasesIdentical = true; + boolean allLibraryNamesIdentical = true; + for (int i = 1; i < rgs.size(); i++) { + if (!rgs.get(0).getSample().equals(rgs.get(i).getSample())) { + allSampleAliasesIdentical = false; } - if (libraryName != null && !allLibraryNamesIdentical) { - throw new GATKException("Read groups have multiple values for library name. 
" + - "A value for library name cannot be supplied."); + if (!rgs.get(0).getLibrary().equals(rgs.get(i).getLibrary())) { + allLibraryNamesIdentical = false; } } + if (sampleAlias != null && !allSampleAliasesIdentical) { + throw new UserException("Read groups have multiple values for sample. " + + "A value for sampleAlias cannot be supplied."); + } + if (libraryName != null && !allLibraryNamesIdentical) { + throw new UserException("Read groups have multiple values for library name. " + + "A value for library name cannot be supplied."); + } } + } - static void assertAllReadGroupsMapped(final Map outputMap, final List readGroups) { - for (final SAMReadGroupRecord readGroup : readGroups) { - final String id = readGroup.getId(); - final File output = outputMap.get(id); - if (output == null) { - throw new GATKException("Read group id " + id + " not found in outputMap " + outputMap); - } + static void assertAllReadGroupsMapped(final Map outputMap, final List readGroups) { + for (final SAMReadGroupRecord readGroup : readGroups) { + final String id = readGroup.getId(); + final Path output = outputMap.get(id); + if (output == null) { + throw new GATKException("Read group id " + id + " not found in outputMap " + outputMap); } } + } - static boolean isOutputMapHeaderValid(final List columnLabels) { - return columnLabels.size() >= 2 && - "READ_GROUP_ID".equals(columnLabels.get(0)) && - "OUTPUT".equals(columnLabels.get(1)); - } + static boolean isOutputMapHeaderValid(final List columnLabels) { + return columnLabels.size() >= 2 && + OUTPUT_MAP_READ_GROUP_FIELD_NAME.equals(columnLabels.get(0)) && + OUTPUT_MAP_OUTPUT_FILE_FIELD_NAME.equals(columnLabels.get(1)); } + } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/transforms/markduplicates/MarkDuplicatesSpark.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/transforms/markduplicates/MarkDuplicatesSpark.java index 98051ad0e96..d25fb877bd1 100644 --- 
a/src/main/java/org/broadinstitute/hellbender/tools/spark/transforms/markduplicates/MarkDuplicatesSpark.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/transforms/markduplicates/MarkDuplicatesSpark.java @@ -97,7 +97,7 @@ public static JavaRDD mark(final JavaRDD reads, final SAMFil SAMFileHeader headerForTool = header.clone(); // If the input isn't queryname sorted, sort it before duplicate marking - final JavaRDD sortedReadsForMarking = querynameSortReadsIfNecessary(reads, numReducers, headerForTool); + final JavaRDD sortedReadsForMarking = SparkUtils.querynameSortReadsIfNecessary(reads, numReducers, headerForTool); // If we need to remove optical duplicates or tag them, then make sure we are keeping track final boolean markOpticalDups = (taggingPolicy != MarkDuplicates.DuplicateTaggingPolicy.DontTag); @@ -171,20 +171,6 @@ public static JavaRDD mark(final JavaRDD reads, final SAMFil } - /** - * Sort reads into queryname order if they are not already sorted - */ - private static JavaRDD querynameSortReadsIfNecessary(JavaRDD reads, int numReducers, SAMFileHeader headerForTool) { - JavaRDD sortedReadsForMarking; - if (ReadUtils.isReadNameGroupedBam(headerForTool)) { - sortedReadsForMarking = reads; - } else { - headerForTool.setSortOrder(SAMFileHeader.SortOrder.queryname); - sortedReadsForMarking = SparkUtils.sortReadsAccordingToHeader(reads, headerForTool, numReducers); - } - return sortedReadsForMarking; - } - /** * A custom partitioner designed to cut down on spark shuffle costs. 
* This is designed such that getPartition(key) is called on a key which corresponds to the already known target partition diff --git a/src/main/java/org/broadinstitute/hellbender/utils/spark/SparkUtils.java b/src/main/java/org/broadinstitute/hellbender/utils/spark/SparkUtils.java index e0fc3ef3f49..a914caea5ad 100644 --- a/src/main/java/org/broadinstitute/hellbender/utils/spark/SparkUtils.java +++ b/src/main/java/org/broadinstitute/hellbender/utils/spark/SparkUtils.java @@ -282,4 +282,17 @@ protected Tuple2> computeNext() { }; } + /** + * Sort reads into queryname order if they are not already sorted + */ + public static JavaRDD querynameSortReadsIfNecessary(JavaRDD reads, int numReducers, SAMFileHeader headerForTool) { + JavaRDD sortedReadsForMarking; + if (ReadUtils.isReadNameGroupedBam(headerForTool)) { + sortedReadsForMarking = reads; + } else { + headerForTool.setSortOrder(SAMFileHeader.SortOrder.queryname); + sortedReadsForMarking = sortReadsAccordingToHeader(reads, headerForTool, numReducers); + } + return sortedReadsForMarking; + } } diff --git a/src/test/java/org/broadinstitute/hellbender/tools/spark/RevertSamSparkIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/spark/RevertSamSparkIntegrationTest.java index fb433ffdd0d..9b987869a69 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/spark/RevertSamSparkIntegrationTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/spark/RevertSamSparkIntegrationTest.java @@ -4,15 +4,17 @@ import htsjdk.samtools.util.CloserUtil; import org.broadinstitute.hellbender.CommandLineProgramTest; import org.broadinstitute.hellbender.exceptions.GATKException; -import org.broadinstitute.hellbender.exceptions.UserException; import org.broadinstitute.hellbender.testutils.ArgumentsBuilder; +import org.broadinstitute.hellbender.utils.io.IOUtils; import org.testng.Assert; import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import java.io.File; +import 
java.io.IOException; import java.io.PrintWriter; import java.nio.file.Files; +import java.nio.file.Path; import java.util.*; @Test(groups = "Spark") @@ -78,13 +80,13 @@ public void basicPositiveTests(final SAMFileHeader.SortOrder so, final boolean r args.addOutput(output); if (outputByReadGroup) { - args.addPositionalArgument("--"+RevertSamSpark.OUTPUT_BY_READGROUP_ARG); + args.addPositionalArgument("--"+RevertSamSpark.OUTPUT_BY_READGROUP_LONG_NAME); } if (so != null) { args.addArgument("sort-order",so.name()); //TODO decide on sort order outputing } if (!removeAlignmentInfo) { - args.addPositionalArgument("--"+RevertSamSpark.DONT_REMOVE_ALIGNMENT_INFORMATION_ARG); + args.addPositionalArgument("--"+RevertSamSpark.DONT_REMOVE_ALIGNMENT_INFORMATION_LONG_NAME); } if (sample != null) { args.addArgument("sample-alias",sample); @@ -119,7 +121,7 @@ public void testOutputByReadGroupWithOutputMap() throws Exception { final String outputPath1 = outputDir + "/rg1.sam"; final String outputPath2 = outputDir + "/my_rg2.bam"; final String outputPath3 = outputDir + "/my_rg3.sam";//TODO not used? 
- mapWriter.println("READ_GROUP_ID\tOUTPUT"); + mapWriter.println("OUTPUT_MAP_READ_GROUP_FIELD_NAME\tOUTPUT_MAP_OUTPUT_FILE_FIELD_NAME"); mapWriter.println("0\t" + outputPath0); mapWriter.println("2\t" + outputPath2); mapWriter.println("1\t" + outputPath1); @@ -141,7 +143,7 @@ public void testOutputByReadGroupWithOutputMap() throws Exception { "--sort-order",SAMFileHeader.SortOrder.queryname.name(), "--"+RevertSamSpark.SAMPLE_ALIAS_ARG,"test_sample_1", "--"+RevertSamSpark.LIBRARY_NAME_ARG,"test_library_1", - "--"+RevertSamSpark.ATTRIBUTE_TO_CLEAR_ARG,SAMTag.NM.name() + "--"+RevertSamSpark.ATTRIBUTE_TO_CLEAR_LONG_NAME,SAMTag.NM.name() }; runCommandLine(args); @@ -154,11 +156,12 @@ public void testOutputByReadGroupWithOutputMap() throws Exception { verifyPositiveResults(output2, reverter, true, true, true, true, "2", 2, "test_sample_1", "test_library_1"); } - @Test (expectedExceptions = UserException.class) + @Test + // TODO the purpose of this test is unclear to me public void testSingleEndSanitize() throws Exception { final File output = File.createTempFile("single_end_reverted", ".sam"); output.deleteOnExit(); - final String args[] = { "-I " + singleEndSamToRevert, "-O " + output.getAbsolutePath(), "--sanitize"}; + final String args[] = { "-I", singleEndSamToRevert.getAbsolutePath(), "-O", output.getAbsolutePath(), "--sanitize"}; runCommandLine(args); } @@ -255,7 +258,7 @@ public void testSanitizeAndDeduplicateRecords() throws Exception { "--input", input.getAbsolutePath(), "--sanitize", "--keep-first-duplicate", - "--"+RevertSamSpark.DONT_RESTORE_ORIGINAL_QUALITIES_ARG, + "--"+RevertSamSpark.DONT_RESTORE_ORIGINAL_QUALITIES_LONG_NAME, "-O", output.getAbsolutePath() }; runCommandLine(args); @@ -287,71 +290,62 @@ public Object[][] getNegativeTestData() { } @Test - public void testValidateOutputParamsByReadGroupMapValid() { - final List errors = new ArrayList(); - RevertSamSpark.ValidationUtil.validateOutputParamsByReadGroup(null, validOutputMap, errors); + public 
void testValidateOutputParamsByReadGroupMapValid() throws IOException { + final List errors = RevertSamSpark.validateOutputParamsByReadGroup(null, validOutputMap.getAbsolutePath()); Assert.assertEquals(errors.size(), 0); } @Test - public void testValidateOutputParamsByReadGroupMissingMap() { - final List errors = new ArrayList(); - RevertSamSpark.ValidationUtil.validateOutputParamsByReadGroup(null, nonExistentOutputMap, errors); + public void testValidateOutputParamsByReadGroupMissingMap() throws IOException { + final List errors = RevertSamSpark.validateOutputParamsByReadGroup(null, nonExistentOutputMap.getAbsolutePath()); Assert.assertEquals(errors.size(), 1); Assert.assertEquals(errors.get(0).contains("Cannot read"), true); } @Test - public void testValidateOutputParamsByReadGroupBadHeaderMap() { - final List errors = new ArrayList(); - RevertSamSpark.ValidationUtil.validateOutputParamsByReadGroup(null, badHeaderOutputMap, errors); + public void testValidateOutputParamsByReadGroupBadHeaderMap() throws IOException { + final List errors = RevertSamSpark.validateOutputParamsByReadGroup(null, badHeaderOutputMap.getAbsolutePath()); Assert.assertEquals(errors.size(), 1); Assert.assertEquals(errors.get(0).contains("Invalid header"), true); } @Test - public void testValidateOutputParamsByReadGroupNoMapOrDir() { - final List errors = new ArrayList(); - RevertSamSpark.ValidationUtil.validateOutputParamsByReadGroup(null, null, errors); + public void testValidateOutputParamsByReadGroupNoMapOrDir() throws IOException { + final List errors = RevertSamSpark.validateOutputParamsByReadGroup(null, null); Assert.assertEquals(errors.size(), 1); Assert.assertEquals(errors.get(0).contains("Must provide either"), true); } @Test - public void testValidateOutputParamsByReadGroupDirValid() { - final List errors = new ArrayList(); - RevertSamSpark.ValidationUtil.validateOutputParamsByReadGroup(createTempDir("testValidateOutputParamsNotByReadGroupValid"), null, errors); + public void 
testValidateOutputParamsByReadGroupDirValid() throws IOException { + final List errors = RevertSamSpark.validateOutputParamsByReadGroup(createTempDir("testValidateOutputParamsNotByReadGroupValid").getAbsolutePath(), null); Assert.assertEquals(errors.size(), 0); } @Test - public void testValidateOutputParamsNotByReadGroupValid() { - final List errors = new ArrayList(); - RevertSamSpark.ValidationUtil.validateOutputParamsNotByReadGroup(createTempFile("testValidateOutputParamsNotByReadGroupValid",""), null, errors); + public void testValidateOutputParamsNotByReadGroupValid() throws IOException { + final List errors = RevertSamSpark.validateOutputParamsNotByReadGroup(createTempFile("testValidateOutputParamsNotByReadGroupValid","").getAbsolutePath(), null); Assert.assertEquals(errors.size(), 0); } @Test - public void testValidateOutputParamsNotByReadGroupNoOutput() { - final List errors = new ArrayList(); - RevertSamSpark.ValidationUtil.validateOutputParamsNotByReadGroup(null, null, errors); + public void testValidateOutputParamsNotByReadGroupNoOutput() throws IOException { + final List errors = RevertSamSpark.validateOutputParamsNotByReadGroup(null, null); Assert.assertEquals(errors.size(), 1); Assert.assertEquals(errors.get(0).contains("output is required"), true); } @Test - public void testValidateOutputParamsNotByReadGroupMap() { - final List errors = new ArrayList(); - RevertSamSpark.ValidationUtil.validateOutputParamsNotByReadGroup(null, validOutputMap, errors); + public void testValidateOutputParamsNotByReadGroupMap() throws IOException { + final List errors = RevertSamSpark.validateOutputParamsNotByReadGroup(null, validOutputMap.getAbsolutePath()); Assert.assertEquals(errors.size(), 2); Assert.assertEquals(errors.get(0).contains("Cannot provide outputMap"), true); Assert.assertEquals(errors.get(1).contains("output is required"), true); } @Test - public void testValidateOutputParamsNotByReadGroupDir() { - final List errors = new ArrayList(); - 
RevertSamSpark.ValidationUtil.validateOutputParamsNotByReadGroup(createTempDir("testValidateOutputParamsNotByReadGroupDir"), null, errors); + public void testValidateOutputParamsNotByReadGroupDir() throws IOException { + final List errors = RevertSamSpark.validateOutputParamsNotByReadGroup(createTempDir("testValidateOutputParamsNotByReadGroupDir").getAbsolutePath(), null); Assert.assertEquals(errors.size(), 1); Assert.assertEquals(errors.get(0).contains("should not be a directory"), true); } @@ -361,12 +355,12 @@ public void testAssertAllReadGroupsMappedSuccess() { final SAMReadGroupRecord rg1 = new SAMReadGroupRecord("rg1"); final SAMReadGroupRecord rg2 = new SAMReadGroupRecord("rg2"); - final Map outputMap = new HashMap(); - outputMap.put("rg1", new File("/foo/bar/rg1.bam")); - outputMap.put("rg2", new File("/foo/bar/rg2.bam")); - RevertSamSpark.ValidationUtil.assertAllReadGroupsMapped(outputMap, Arrays.asList(rg1, rg2)); - RevertSamSpark.ValidationUtil.assertAllReadGroupsMapped(outputMap, Arrays.asList(rg1)); - RevertSamSpark.ValidationUtil.assertAllReadGroupsMapped(outputMap, Arrays.asList(rg2)); + final Map outputMap = new HashMap<>(); + outputMap.put("rg1", IOUtils.getPath(new File("/foo/bar/rg1.bam").getAbsolutePath())); + outputMap.put("rg2", IOUtils.getPath(new File("/foo/bar/rg2.bam").getAbsolutePath())); + RevertSamSpark.assertAllReadGroupsMapped(outputMap, Arrays.asList(rg1, rg2)); + RevertSamSpark.assertAllReadGroupsMapped(outputMap, Arrays.asList(rg1)); + RevertSamSpark.assertAllReadGroupsMapped(outputMap, Arrays.asList(rg2)); } @Test(expectedExceptions = {GATKException.class}) @@ -375,21 +369,21 @@ public void testAssertAllReadGroupsMappedFailure() { final SAMReadGroupRecord rg2 = new SAMReadGroupRecord("rg2"); final SAMReadGroupRecord rg3 = new SAMReadGroupRecord("rg3"); - final Map outputMap = new HashMap(); - outputMap.put("rg1", new File("/foo/bar/rg1.bam")); - outputMap.put("rg2", new File("/foo/bar/rg2.bam")); - 
RevertSamSpark.ValidationUtil.assertAllReadGroupsMapped(outputMap, Arrays.asList(rg1, rg2, rg3)); + final Map outputMap = new HashMap<>(); + outputMap.put("rg1", IOUtils.getPath(new File("/foo/bar/rg1.bam").getAbsolutePath())); + outputMap.put("rg2", IOUtils.getPath(new File("/foo/bar/rg2.bam").getAbsolutePath())); + RevertSamSpark.assertAllReadGroupsMapped(outputMap, Arrays.asList(rg1, rg2, rg3)); } @Test public void testIsOutputMapHeaderValid() { - boolean isValid = RevertSamSpark.ValidationUtil.isOutputMapHeaderValid(Arrays.asList("READ_GROUP_ID", "OUTPUT")); + boolean isValid = RevertSamSpark.isOutputMapHeaderValid(Arrays.asList("OUTPUT_MAP_READ_GROUP_FIELD_NAME", "OUTPUT_MAP_OUTPUT_FILE_FIELD_NAME")); Assert.assertEquals(isValid, true); - isValid = RevertSamSpark.ValidationUtil.isOutputMapHeaderValid(Arrays.asList("OUTPUT")); + isValid = RevertSamSpark.isOutputMapHeaderValid(Arrays.asList("OUTPUT_MAP_OUTPUT_FILE_FIELD_NAME")); Assert.assertEquals(isValid, false); - isValid = RevertSamSpark.ValidationUtil.isOutputMapHeaderValid(Collections.EMPTY_LIST); + isValid = RevertSamSpark.isOutputMapHeaderValid(Collections.EMPTY_LIST); Assert.assertEquals(isValid, false); } @@ -398,16 +392,16 @@ public void testFilePathsWithoutMapFile() { final SAMReadGroupRecord rg1 = new SAMReadGroupRecord("rg1"); final SAMReadGroupRecord rg2 = new SAMReadGroupRecord("rg2"); - final Map outputMap = RevertSamSpark.getOutputMap(null, new File("/foo/bar"), ".bam", Arrays.asList(rg1, rg2), true); - Assert.assertEquals(outputMap.get("rg1"), new File("/foo/bar/rg1.bam")); - Assert.assertEquals(outputMap.get("rg2"), new File("/foo/bar/rg2.bam")); + final Map outputMap = RevertSamSpark.getOutputMap(null, new File("/foo/bar").getAbsolutePath(), ".bam", Arrays.asList(rg1, rg2), true); + Assert.assertEquals(outputMap.get("rg1"), IOUtils.getPath(new File("/foo/bar/rg1.bam").getAbsolutePath())); + Assert.assertEquals(outputMap.get("rg2"), IOUtils.getPath(new 
File("/foo/bar/rg2.bam").getAbsolutePath())); } @Test public void testFilePathsWithMapFile() { - final Map outputMap = RevertSamSpark.getOutputMap(validOutputMap, null, ".bam", Collections.emptyList(), true); - Assert.assertEquals(outputMap.get("rg1"), new File("/path/to/my_rg_1.ubam")); - Assert.assertEquals(outputMap.get("rg2"), new File("/path/to/my_rg_2.ubam")); + final Map outputMap = RevertSamSpark.getOutputMap(validOutputMap.getAbsolutePath(), null, ".bam", Collections.emptyList(), true); + Assert.assertEquals(outputMap.get("rg1"), IOUtils.getPath(new File("/path/to/my_rg_1.ubam").getAbsolutePath())); + Assert.assertEquals(outputMap.get("rg2"), IOUtils.getPath(new File("/path/to/my_rg_2.ubam").getAbsolutePath())); } @Test From e7327390f6b292b791a8021d9eeac990a968d64b Mon Sep 17 00:00:00 2001 From: James Date: Thu, 6 Dec 2018 16:16:58 -0500 Subject: [PATCH 03/15] responding to another round of comments with nothing more than rowdy backtalk and refusal to respond --- .../tools/spark/RevertSamSpark.java | 44 ++-- .../utils/codecs/table/TableCodec.java | 28 ++- .../runtime/StreamingProcessController.java | 2 +- .../hellbender/utils/spark/SparkUtils.java | 14 +- ...andLineProgramExecutorIntegrationTest.java | 2 +- .../spark/RevertSamSparkIntegrationTest.java | 211 +++++------------- .../tools/spark/RevertSamSparkUnitTest.java | 144 ++++++------ .../MarkDuplicatesSparkUtilsUnitTest.java | 2 +- 8 files changed, 191 insertions(+), 256 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/RevertSamSpark.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/RevertSamSpark.java index c38e8603977..db79dd066a4 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/spark/RevertSamSpark.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/RevertSamSpark.java @@ -77,16 +77,18 @@ *
    * Note: If the program fails due to a SAM validation error, consider setting the VALIDATION_STRINGENCY option to * LENIENT or SILENT if the failures are expected to be obviated by the reversion process - * (e.g. invalid alignment information will be obviated when the dontRemoveAlignmentInformation option is used). + * (e.g. invalid alignment information will be obviated when the keepAlignmentInformation option is used). */ @DocumentedFeature @CommandLineProgramProperties( - summary =RevertSamSpark.USAGE_DETAILS, - oneLineSummary =RevertSamSpark.USAGE_SUMMARY, + summary = RevertSamSpark.USAGE_DETAILS, + oneLineSummary = RevertSamSpark.USAGE_SUMMARY, programGroup = ReadDataManipulationProgramGroup.class) @BetaFeature public class RevertSamSpark extends GATKSparkTool { + private static final long serialVersionUID = 1L; + static final String USAGE_SUMMARY = "Reverts SAM or BAM files to a previous state."; static final String USAGE_DETAILS = "This tool removes or restores certain properties of the SAM records, including alignment " + "information, which can be used to produce an unmapped BAM (uBAM) from a previously aligned BAM. It is also capable of " + @@ -115,7 +117,7 @@ public class RevertSamSpark extends GATKSparkTool { " Output format can be overridden with the outputByReadgroupFileFormat option.\n" + "Note: If the program fails due to a SAM validation error, consider setting the VALIDATION_STRINGENCY option to " + "LENIENT or SILENT if the failures are expected to be obviated by the reversion process " + - "(e.g. invalid alignment information will be obviated when the dontRemoveAlignmentInformation option is used).\n" + + "(e.g. 
invalid alignment information will be obviated when the keepAlignmentInformation option is used).\n" + ""; public static final String OUTPUT_MAP_READ_GROUP_FIELD_NAME = "READ_GROUP_ID"; public static final String OUTPUT_MAP_OUTPUT_FILE_FIELD_NAME = "OUTPUT"; @@ -167,9 +169,9 @@ public class RevertSamSpark extends GATKSparkTool { " the output may have the unusual but sometimes desirable trait of having unmapped reads that are marked as duplicates.") public boolean dontRemoveDuplicateInformation = false; - public static final String DONT_REMOVE_ALIGNMENT_INFORMATION_LONG_NAME = "remove-alignment-information"; - @Argument(fullName = DONT_REMOVE_ALIGNMENT_INFORMATION_LONG_NAME, doc = "Remove all alignment information from the file.") - public boolean dontRemoveAlignmentInformation = false; + public static final String KEEP_ALIGNMENT_INFORMATION = "keep-alignment-information"; + @Argument(fullName = KEEP_ALIGNMENT_INFORMATION, doc = "Remove all alignment information from the file.") + public boolean keepAlignmentInformation = false; public static final String ATTRIBUTE_TO_CLEAR_LONG_NAME = "attributes-to-clear"; @Argument(fullName = ATTRIBUTE_TO_CLEAR_LONG_NAME, doc = "When removing alignment information, the set of optional tags to remove.", optional = true) @@ -196,7 +198,7 @@ public List getDefaultReadFilters() { return Collections.singletonList(ReadFilterLibrary.ALLOW_ALL_READS); } - public static List DEFAULT_ATTRIBUTES_TO_CLEAR = new ArrayList() {{ + public static List DEFAULT_ATTRIBUTES_TO_CLEAR = Collections.unmodifiableList(new ArrayList(){{ add(SAMTag.NM.name()); add(SAMTag.UQ.name()); add(SAMTag.PG.name()); @@ -205,7 +207,7 @@ public List getDefaultReadFilters() { add(SAMTag.SA.name()); // Supplementary alignment metadata add(SAMTag.MC.name()); // Mate Cigar add(SAMTag.AS.name()); - }}; + }}); public enum FileType implements CommandLineParser.ClpEnum { sam("Generate SAM files."), @@ -234,7 +236,9 @@ protected String[] customCommandLineValidation() { final 
List errors = new ArrayList<>(); validateOutputParams(outputByReadGroup, output, outputMap); - if (!sanitize && keepFirstDuplicate) errors.add("'keepFirstDuplicate' cannot be used without 'sanitize'"); + if (!sanitize && keepFirstDuplicate) { + errors.add("'keepFirstDuplicate' cannot be used without 'sanitize'"); + } if (!errors.isEmpty()) { return errors.toArray(new String[errors.size()]); @@ -311,11 +315,13 @@ private Map createReadGroupFormatMap( final JavaRDD< final boolean restoreOriginalQualities) { final Map output = new HashMap<>(); - inHeader.getValue().getReadGroups().stream().parallel().forEach(rg -> { + inHeader.getValue().getReadGroups().stream().forEach(rg -> { // For each readgroup filter down to just the reads in that group final String key = rg.getId(); JavaRDD filtered = reads.filter(r -> r.getReadGroup().equals(key)); + // NOTE: this method has the potential to be expensive as it may end up pulling on the first partition many times, and potentially + // end up iterating over the entire genome in the case where there are readgroups missing from the bam if (!filtered.isEmpty()) { // take the number of reads required by QualityEncodingDetector to determine quality score map @@ -352,13 +358,15 @@ public SAMRecord next() { } /** - * If this is run, we want to be careful to remove duplicated reads from the bam. + * If this is run, we want to be careful to remove copied reads from the bam. * * In order to do this we group each read by its readname and randomly select one read labeled as first in pair - * and one read labled as second in pair to treat as the representative reads, throwing away the rest. + * and one read labeled as second in pair to treat as the representative reads, throwing away the rest. 
*/ private JavaRDD sanitize(final Map readGroupToFormat, final JavaRDD reads, final SAMFileHeader header, final boolean keepFirstDuplicate) { - JavaRDD sortedReads = SparkUtils.querynameSortReadsIfNecessary(reads.filter(r -> r.getBases().length == r.getBaseQualities().length), getRecommendedNumReducers(), header); + JavaRDD sortedReads = SparkUtils.querynameSortReadsIfNecessary( + reads.filter(r -> r.getLength() == r.getBaseQualityCount()), + getRecommendedNumReducers(), header); JavaPairRDD> readsByGroup = spanReadsByKey(sortedReads); return readsByGroup.flatMap(group -> { @@ -463,12 +471,12 @@ private Map getReadGroupHeaderMap(SAMFileHeader inHeader, assertAllReadGroupsMapped(writerMap, inHeader.getReadGroups()); headerMap = new HashMap<>(); for (final SAMReadGroupRecord readGroup : inHeader.getReadGroups()) { - final SAMFileHeader header = createOutHeader(inHeader, sortOrder, !dontRemoveAlignmentInformation); + final SAMFileHeader header = createOutHeader(inHeader, sortOrder, !keepAlignmentInformation); header.addReadGroup(readGroup); headerMap.put(readGroup.getId(), header); } } else { - final SAMFileHeader singleOutHeader = createOutHeader(inHeader, sortOrder, !dontRemoveAlignmentInformation); + final SAMFileHeader singleOutHeader = createOutHeader(inHeader, sortOrder, !keepAlignmentInformation); inHeader.getReadGroups().forEach(singleOutHeader::addReadGroup); headerMap = Collections.singletonMap(null, singleOutHeader); } @@ -525,7 +533,7 @@ public JavaRDD revertReads(JavaRDD reads, List attri reads = reads.map(r -> {r.setIsDuplicate(false); return r;}); } - if (!dontRemoveAlignmentInformation) { + if (!keepAlignmentInformation) { reads = reads.map(rec -> { if (rec.isReverseStrand()) { rec.reverseComplement(); @@ -640,7 +648,7 @@ static List validateOutputParamsByReadGroup(final String output, final } final FeatureReader parser = AbstractFeatureReader.getFeatureReader(outputMap, new TableCodec(null),false); if 
(!isOutputMapHeaderValid((List)parser.getHeader())) { - errors.add("Invalid header: " + outputMap + ". Must be a tab-separated file with OUTPUT_MAP_READ_GROUP_FIELD_NAME as first column and output as second column."); + errors.add("Invalid header: " + outputMap + ". Must be a tab-separated file with +"+OUTPUT_MAP_READ_GROUP_FIELD_NAME+"+ as first column and output as second column."); } return errors; } diff --git a/src/main/java/org/broadinstitute/hellbender/utils/codecs/table/TableCodec.java b/src/main/java/org/broadinstitute/hellbender/utils/codecs/table/TableCodec.java index e896b1a4b15..142cc1a09d9 100644 --- a/src/main/java/org/broadinstitute/hellbender/utils/codecs/table/TableCodec.java +++ b/src/main/java/org/broadinstitute/hellbender/utils/codecs/table/TableCodec.java @@ -2,6 +2,7 @@ import htsjdk.tribble.AsciiFeatureCodec; import htsjdk.tribble.readers.LineIterator; +import org.broadinstitute.hellbender.exceptions.GATKException; import org.broadinstitute.hellbender.exceptions.UserException; import org.broadinstitute.hellbender.utils.SimpleInterval; @@ -52,7 +53,6 @@ public final class TableCodec extends AsciiFeatureCodec { protected static final String COMMENT_DELIMITER = "#"; private final String headerDelimiter; - private final String commentDelimiter; protected String delimiter_regex = "\\s+"; @@ -60,16 +60,22 @@ public final class TableCodec extends AsciiFeatureCodec { private boolean havePassedHeader = false; - public TableCodec(final String headerLineDelimiter, final String commentLineDelimiter) { + /** + * Create a TableCodec with a configured header line delimiter + * + * @param headerLineDelimiter the delimeter for comment header lines, or null if the header is a single commented line- + */ + public TableCodec(final String headerLineDelimiter) { super(TableFeature.class); + if ( "".equals(headerLineDelimiter) ) { + throw new GATKException("HeaderLineDelimiter must either be a valid delimiter or null"); + } headerDelimiter = headerLineDelimiter; - 
commentDelimiter = commentLineDelimiter; - } - - public TableCodec(final String headerLineDelimiter) { - this(headerLineDelimiter, COMMENT_DELIMITER); } + /** + * Create a TableCodec for IGV track data. + */ public TableCodec() { this(DEFAULT_HEADER_DELIMITER); } @@ -78,7 +84,7 @@ public TableCodec() { public TableFeature decode(final String line) { if ((headerDelimiter != null && ! line.startsWith(headerDelimiter)) || (headerDelimiter == null && !havePassedHeader) || - line.startsWith(commentDelimiter) || line.startsWith(IGV_HEADER_DELIMITER)) { + line.startsWith(COMMENT_DELIMITER) || line.startsWith(IGV_HEADER_DELIMITER)) { havePassedHeader = true; return null; } @@ -94,10 +100,10 @@ public List readActualHeader(final LineIterator reader) { boolean isFirst = true; while (reader.hasNext()) { final String line = reader.peek(); // Peek to avoid reading non-header data - if ( isFirst && ! line.startsWith(commentDelimiter) && headerDelimiter != null && ! line.startsWith(headerDelimiter) ) { + if ( isFirst && ! line.startsWith(COMMENT_DELIMITER) && headerDelimiter != null && ! 
line.startsWith(headerDelimiter) ) { throw new UserException.MalformedFile("TableCodec file does not have a header"); } - isFirst &= line.startsWith(commentDelimiter); + isFirst &= line.startsWith(COMMENT_DELIMITER); if (headerDelimiter == null || line.startsWith(headerDelimiter)) { reader.next(); // "Commit" the peek if (!header.isEmpty()) { @@ -106,7 +112,7 @@ public List readActualHeader(final LineIterator reader) { final String[] spl = line.split(delimiter_regex); Collections.addAll(header, spl); return header; - } else if (line.startsWith(commentDelimiter)) { + } else if (line.startsWith(COMMENT_DELIMITER)) { reader.next(); // "Commit" the peek } else { break; diff --git a/src/main/java/org/broadinstitute/hellbender/utils/runtime/StreamingProcessController.java b/src/main/java/org/broadinstitute/hellbender/utils/runtime/StreamingProcessController.java index 25ad6ce5ecc..59e89775135 100644 --- a/src/main/java/org/broadinstitute/hellbender/utils/runtime/StreamingProcessController.java +++ b/src/main/java/org/broadinstitute/hellbender/utils/runtime/StreamingProcessController.java @@ -363,7 +363,7 @@ protected void tryCleanShutdown() { } } if (process != null) { - // terminate the app by closing the process' input stream + // terminate the app by closing the process' INPUT stream IOUtils.closeQuietly(process.getOutputStream()); } } diff --git a/src/main/java/org/broadinstitute/hellbender/utils/spark/SparkUtils.java b/src/main/java/org/broadinstitute/hellbender/utils/spark/SparkUtils.java index a914caea5ad..2f527732f32 100644 --- a/src/main/java/org/broadinstitute/hellbender/utils/spark/SparkUtils.java +++ b/src/main/java/org/broadinstitute/hellbender/utils/spark/SparkUtils.java @@ -20,10 +20,8 @@ import org.broadinstitute.hellbender.engine.spark.datasources.ReadsSparkSink; import org.broadinstitute.hellbender.exceptions.GATKException; import org.broadinstitute.hellbender.exceptions.UserException; -import 
org.broadinstitute.hellbender.tools.spark.transforms.markduplicates.MarkDuplicatesSparkUtils; import org.broadinstitute.hellbender.utils.Utils; import org.broadinstitute.hellbender.utils.read.*; -import org.broadinstitute.hellbender.utils.read.markduplicates.ReadsKey; import scala.Tuple2; import java.io.*; @@ -242,7 +240,7 @@ public static JavaRDD putReadsWithTheSameNameInTheSamePartition(final * @return an RDD where each the values for each key are grouped into an iterable collection */ public static JavaPairRDD> spanByKey(JavaPairRDD rdd) { - return rdd.mapPartitionsToPair(SparkUtils::spanningIterator); + return rdd.mapPartitionsToPair(SparkUtils::getSpanningIterator); } /** @@ -252,7 +250,7 @@ public static JavaPairRDD> spanByKey(JavaPairRDD rdd * @param type of values * @return an iterator over pairs of keys and grouped values */ - public static Iterator>> spanningIterator(Iterator> iterator) { + public static Iterator>> getSpanningIterator(Iterator> iterator) { final PeekingIterator> iter = Iterators.peekingIterator(iterator); return new AbstractIterator>>() { @Override @@ -285,13 +283,13 @@ protected Tuple2> computeNext() { /** * Sort reads into queryname order if they are not already sorted */ - public static JavaRDD querynameSortReadsIfNecessary(JavaRDD reads, int numReducers, SAMFileHeader headerForTool) { + public static JavaRDD querynameSortReadsIfNecessary(JavaRDD reads, int numReducers, SAMFileHeader header) { JavaRDD sortedReadsForMarking; - if (ReadUtils.isReadNameGroupedBam(headerForTool)) { + if (ReadUtils.isReadNameGroupedBam(header)) { sortedReadsForMarking = reads; } else { - headerForTool.setSortOrder(SAMFileHeader.SortOrder.queryname); - sortedReadsForMarking = sortReadsAccordingToHeader(reads, headerForTool, numReducers); + header.setSortOrder(SAMFileHeader.SortOrder.queryname); + sortedReadsForMarking = sortReadsAccordingToHeader(reads, header, numReducers); } return sortedReadsForMarking; } diff --git 
a/src/test/java/org/broadinstitute/hellbender/cmdline/PicardCommandLineProgramExecutorIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/cmdline/PicardCommandLineProgramExecutorIntegrationTest.java index 822d8cf400d..548b9385e7d 100644 --- a/src/test/java/org/broadinstitute/hellbender/cmdline/PicardCommandLineProgramExecutorIntegrationTest.java +++ b/src/test/java/org/broadinstitute/hellbender/cmdline/PicardCommandLineProgramExecutorIntegrationTest.java @@ -52,7 +52,7 @@ public void testPicardNormalizeFastaWithBadArgs() throws IOException { final File outfile = createTempFile("normalized", ".fasta"); // Use GATK-style lower case argument names, which are rejected by Picard - // because it uses upper cased argument names (--input/--output) + // because it uses upper cased argument names (--INPUT/--OUTPUT) final String[] args = { "--input", input.getAbsolutePath(), "--output", outfile.getAbsolutePath(), diff --git a/src/test/java/org/broadinstitute/hellbender/tools/spark/RevertSamSparkIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/spark/RevertSamSparkIntegrationTest.java index 9b987869a69..dbd6e93a81d 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/spark/RevertSamSparkIntegrationTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/spark/RevertSamSparkIntegrationTest.java @@ -4,7 +4,9 @@ import htsjdk.samtools.util.CloserUtil; import org.broadinstitute.hellbender.CommandLineProgramTest; import org.broadinstitute.hellbender.exceptions.GATKException; +import org.broadinstitute.hellbender.exceptions.UserException; import org.broadinstitute.hellbender.testutils.ArgumentsBuilder; +import org.broadinstitute.hellbender.testutils.BaseTest; import org.broadinstitute.hellbender.utils.io.IOUtils; import org.testng.Assert; import org.testng.annotations.DataProvider; @@ -20,20 +22,9 @@ @Test(groups = "Spark") public class RevertSamSparkIntegrationTest extends CommandLineProgramTest { - @Override - public String 
getToolTestDataDir() { - return "src/test/resources/org/broadinstitute/hellbender/tools/spark/RevertSamSpark"; - } public static List defaultAttributesToClearPlusXT = new ArrayList() {{ - add(SAMTag.NM.name()); - add(SAMTag.UQ.name()); - add(SAMTag.PG.name()); - add(SAMTag.MD.name()); - add(SAMTag.MQ.name()); - add(SAMTag.SA.name()); // Supplementary alignment metadata - add(SAMTag.MC.name()); // Mate Cigar - add(SAMTag.AS.name()); + addAll(RevertSamSpark.DEFAULT_ATTRIBUTES_TO_CLEAR); add("XT"); }}; @@ -54,7 +45,7 @@ public String getToolTestDataDir() { @DataProvider(name="positiveTestData") - public Object[][] getPostitiveTestData() { + public Object[][] positiveTestData() { return new Object[][] { {null, false, false, true, true, null, null, Collections.EMPTY_LIST}, {SAMFileHeader.SortOrder.queryname, false, false, true, false, "Hey,Dad!", null, defaultAttributesToClearPlusXT}, @@ -63,17 +54,16 @@ public Object[][] getPostitiveTestData() { }; } - @Test(dataProvider="positiveTestData") + @Test(dataProvider= "positiveTestData") public void basicPositiveTests(final SAMFileHeader.SortOrder so, final boolean removeDuplicates, final boolean removeAlignmentInfo, final boolean restoreOriginalQualities, final boolean outputByReadGroup, final String sample, final String library, final List attributesToClear) throws Exception { - final File output = outputByReadGroup?Files.createTempDirectory("picardRevertSamSparkTest").toFile():File.createTempFile("reverted", ".sam"); + final File output = outputByReadGroup ? 
Files.createTempDirectory("picardRevertSamSparkTest").toFile() : BaseTest.createTempFile("reverted", ".sam"); File output0 = new File(output.getPath()+"/0.sam"); File output1 = new File(output.getPath()+"/1.sam"); File output2 = new File(output.getPath()+"/2.sam"); File output3 = new File(output.getPath()+"/3.sam"); - output.deleteOnExit(); final RevertSamSpark reverter = new RevertSamSpark(); final ArgumentsBuilder args = new ArgumentsBuilder(); args.addInput(basicSamToRevert); @@ -86,7 +76,7 @@ public void basicPositiveTests(final SAMFileHeader.SortOrder so, final boolean r args.addArgument("sort-order",so.name()); //TODO decide on sort order outputing } if (!removeAlignmentInfo) { - args.addPositionalArgument("--"+RevertSamSpark.DONT_REMOVE_ALIGNMENT_INFORMATION_LONG_NAME); + args.addPositionalArgument("--"+RevertSamSpark.KEEP_ALIGNMENT_INFORMATION); } if (sample != null) { args.addArgument("sample-alias",sample); @@ -113,15 +103,14 @@ public void basicPositiveTests(final SAMFileHeader.SortOrder so, final boolean r @Test public void testOutputByReadGroupWithOutputMap() throws Exception { final File outputDir = createTempDir("testOutputByReadGroupWithOutputMap"); - outputDir.deleteOnExit(); // Create the output map - final File outputMapFile = Files.createTempFile("picardRevertSamSparkTestOutputMap", ".txt").toFile(); + final File outputMapFile = BaseTest.createTempFile("picardRevertSamSparkTestOutputMap", ".txt"); final PrintWriter mapWriter = new PrintWriter(outputMapFile); final String outputPath0 = outputDir + "/my_rg0.sam"; final String outputPath1 = outputDir + "/rg1.sam"; final String outputPath2 = outputDir + "/my_rg2.bam"; final String outputPath3 = outputDir + "/my_rg3.sam";//TODO not used? 
- mapWriter.println("OUTPUT_MAP_READ_GROUP_FIELD_NAME\tOUTPUT_MAP_OUTPUT_FILE_FIELD_NAME"); + mapWriter.println("READ_GROUP_ID\tOUTPUT"); mapWriter.println("0\t" + outputPath0); mapWriter.println("2\t" + outputPath2); mapWriter.println("1\t" + outputPath1); @@ -131,7 +120,6 @@ public void testOutputByReadGroupWithOutputMap() throws Exception { System.out.println("outputFile: " + outputPath2); System.out.println("outputFile: " + outputPath3); mapWriter.close(); - outputMapFile.deleteOnExit(); final RevertSamSpark reverter = new RevertSamSpark(); @@ -157,10 +145,8 @@ public void testOutputByReadGroupWithOutputMap() throws Exception { } @Test - // TODO the purpose of this test is unclear to me public void testSingleEndSanitize() throws Exception { - final File output = File.createTempFile("single_end_reverted", ".sam"); - output.deleteOnExit(); + final File output = createTempFile("single_end_reverted", ".sam"); final String args[] = { "-I", singleEndSamToRevert.getAbsolutePath(), "-O", output.getAbsolutePath(), "--sanitize"}; runCommandLine(args); } @@ -175,67 +161,66 @@ private void verifyPositiveResults( final String readGroupId, final int numReadsExpected, final String sample, - final String library) { - - outputFile.deleteOnExit(); - final SamReader reader = SamReaderFactory.makeDefault().referenceSequence(referenceFasta).open(outputFile); - final SAMFileHeader header = reader.getFileHeader(); - Assert.assertEquals(header.getSortOrder(), SAMFileHeader.SortOrder.queryname); - Assert.assertEquals(header.getProgramRecords().size(), removeAlignmentInfo ? 
0 : 1); - final List readGroups = header.getReadGroups(); - if (outputByReadGroup) { - Assert.assertEquals(readGroups.size(), 1); - Assert.assertEquals(readGroups.get(0).getId(), readGroupId); - } - for (final SAMReadGroupRecord rg : header.getReadGroups()) { - if (sample != null) { - Assert.assertEquals(rg.getSample(), sample); - } else { - Assert.assertEquals(rg.getSample(), "Hi,Mom!"); + final String library) throws IOException { + + try (SamReader reader = SamReaderFactory.makeDefault().referenceSequence(referenceFasta).open(outputFile)) { + final SAMFileHeader header = reader.getFileHeader(); + Assert.assertEquals(header.getSortOrder(), SAMFileHeader.SortOrder.queryname); + Assert.assertEquals(header.getProgramRecords().size(), removeAlignmentInfo ? 0 : 1); + final List readGroups = header.getReadGroups(); + if (outputByReadGroup) { + Assert.assertEquals(readGroups.size(), 1); + Assert.assertEquals(readGroups.get(0).getId(), readGroupId); } - if (library != null) { - Assert.assertEquals(rg.getLibrary(), library); - } else { - Assert.assertEquals(rg.getLibrary(), "my-library"); - } - } - int numReads = 0; - for (final SAMRecord rec : reader) { - numReads++; - if (removeDuplicates) { - Assert.assertFalse(rec.getDuplicateReadFlag(), - "Duplicates should have been removed: " + rec.getReadName()); + for (final SAMReadGroupRecord rg : header.getReadGroups()) { + if (sample != null) { + Assert.assertEquals(rg.getSample(), sample); + } else { + Assert.assertEquals(rg.getSample(), "Hi,Mom!"); + } + if (library != null) { + Assert.assertEquals(rg.getLibrary(), library); + } else { + Assert.assertEquals(rg.getLibrary(), "my-library"); + } } + int numReads = 0; + for (final SAMRecord rec : reader) { + numReads++; + if (removeDuplicates) { + Assert.assertFalse(rec.getDuplicateReadFlag(), + "Duplicates should have been removed: " + rec.getReadName()); + } - if (removeAlignmentInfo) { - Assert.assertTrue(rec.getReadUnmappedFlag(), - "Alignment info should have been removed: 
" + rec.getReadName()); - } + if (removeAlignmentInfo) { + Assert.assertTrue(rec.getReadUnmappedFlag(), + "Alignment info should have been removed: " + rec.getReadName()); + } - if (restoreOriginalQualities && !unmappedRead.equals( - rec.getReadName() + "/" + (rec.getFirstOfPairFlag() ? "1" : "2"))) { + if (restoreOriginalQualities && !unmappedRead.equals( + rec.getReadName() + "/" + (rec.getFirstOfPairFlag() ? "1" : "2"))) { - Assert.assertEquals(rec.getBaseQualityString(), revertedQualities); - } else { - Assert.assertNotSame(rec.getBaseQualityString(), revertedQualities); - } + Assert.assertEquals(rec.getBaseQualityString(), revertedQualities); + } else { + Assert.assertNotSame(rec.getBaseQualityString(), revertedQualities); + } - for (final SAMRecord.SAMTagAndValue attr : rec.getAttributes()) { - if (removeAlignmentInfo || (!attr.tag.equals("PG") && !attr.tag.equals("NM") - && !attr.tag.equals(SAMTag.MQ.toString()))) { - Assert.assertFalse(reverter.attributesToClear.contains(attr.tag), - attr.tag + " should have been cleared."); + for (final SAMRecord.SAMTagAndValue attr : rec.getAttributes()) { + if (removeAlignmentInfo || (!attr.tag.equals("PG") && !attr.tag.equals("NM") + && !attr.tag.equals(SAMTag.MQ.toString()))) { + Assert.assertFalse(reverter.attributesToClear.contains(attr.tag), + attr.tag + " should have been cleared."); + } } } + Assert.assertEquals(numReads, numReadsExpected); } - Assert.assertEquals(numReads, numReadsExpected); - CloserUtil.close(reader); } @Test public void testSanitizeAndDeduplicateRecords() throws Exception { - final File input = File.createTempFile("test-input-santize-and-deduplicate-records", ".sam"); - final File output = File.createTempFile("test-output-santize-and-deduplicate-records", ".sam"); + final File input = BaseTest.createTempFile("test-input-santize-and-deduplicate-records", ".sam"); + final File output = BaseTest.createTempFile("test-output-santize-and-deduplicate-records", ".sam"); // Create a SAM file that has 
duplicate records final SamReader reader = SamReaderFactory.makeDefault().open(basicSamToRevert); @@ -265,7 +250,7 @@ public void testSanitizeAndDeduplicateRecords() throws Exception { verifyPositiveResults(output, new RevertSamSpark(), false, true, false, false, null, 8, null, null); } - @Test(dataProvider="overrideTestData", expectedExceptions = {GATKException.class}) + @Test(dataProvider="overrideTestData", expectedExceptions = {UserException.class}) public void testSampleLibraryOverride(final String sample, final String library) throws Exception { final File output = createTempFile("bad", ".sam"); ArgumentsBuilder args = new ArgumentsBuilder(); @@ -289,67 +274,6 @@ public Object[][] getNegativeTestData() { }; } - @Test - public void testValidateOutputParamsByReadGroupMapValid() throws IOException { - final List errors = RevertSamSpark.validateOutputParamsByReadGroup(null, validOutputMap.getAbsolutePath()); - Assert.assertEquals(errors.size(), 0); - } - - @Test - public void testValidateOutputParamsByReadGroupMissingMap() throws IOException { - final List errors = RevertSamSpark.validateOutputParamsByReadGroup(null, nonExistentOutputMap.getAbsolutePath()); - Assert.assertEquals(errors.size(), 1); - Assert.assertEquals(errors.get(0).contains("Cannot read"), true); - } - - @Test - public void testValidateOutputParamsByReadGroupBadHeaderMap() throws IOException { - final List errors = RevertSamSpark.validateOutputParamsByReadGroup(null, badHeaderOutputMap.getAbsolutePath()); - Assert.assertEquals(errors.size(), 1); - Assert.assertEquals(errors.get(0).contains("Invalid header"), true); - } - - @Test - public void testValidateOutputParamsByReadGroupNoMapOrDir() throws IOException { - final List errors = RevertSamSpark.validateOutputParamsByReadGroup(null, null); - Assert.assertEquals(errors.size(), 1); - Assert.assertEquals(errors.get(0).contains("Must provide either"), true); - } - - @Test - public void testValidateOutputParamsByReadGroupDirValid() throws 
IOException { - final List errors = RevertSamSpark.validateOutputParamsByReadGroup(createTempDir("testValidateOutputParamsNotByReadGroupValid").getAbsolutePath(), null); - Assert.assertEquals(errors.size(), 0); - } - - @Test - public void testValidateOutputParamsNotByReadGroupValid() throws IOException { - final List errors = RevertSamSpark.validateOutputParamsNotByReadGroup(createTempFile("testValidateOutputParamsNotByReadGroupValid","").getAbsolutePath(), null); - Assert.assertEquals(errors.size(), 0); - } - - @Test - public void testValidateOutputParamsNotByReadGroupNoOutput() throws IOException { - final List errors = RevertSamSpark.validateOutputParamsNotByReadGroup(null, null); - Assert.assertEquals(errors.size(), 1); - Assert.assertEquals(errors.get(0).contains("output is required"), true); - } - - @Test - public void testValidateOutputParamsNotByReadGroupMap() throws IOException { - final List errors = RevertSamSpark.validateOutputParamsNotByReadGroup(null, validOutputMap.getAbsolutePath()); - Assert.assertEquals(errors.size(), 2); - Assert.assertEquals(errors.get(0).contains("Cannot provide outputMap"), true); - Assert.assertEquals(errors.get(1).contains("output is required"), true); - } - - @Test - public void testValidateOutputParamsNotByReadGroupDir() throws IOException { - final List errors = RevertSamSpark.validateOutputParamsNotByReadGroup(createTempDir("testValidateOutputParamsNotByReadGroupDir").getAbsolutePath(), null); - Assert.assertEquals(errors.size(), 1); - Assert.assertEquals(errors.get(0).contains("should not be a directory"), true); - } - @Test public void testAssertAllReadGroupsMappedSuccess() { final SAMReadGroupRecord rg1 = new SAMReadGroupRecord("rg1"); @@ -377,26 +301,16 @@ public void testAssertAllReadGroupsMappedFailure() { @Test public void testIsOutputMapHeaderValid() { - boolean isValid = RevertSamSpark.isOutputMapHeaderValid(Arrays.asList("OUTPUT_MAP_READ_GROUP_FIELD_NAME", "OUTPUT_MAP_OUTPUT_FILE_FIELD_NAME")); + boolean 
isValid = RevertSamSpark.isOutputMapHeaderValid(Arrays.asList("READ_GROUP_ID","OUTPUT")); Assert.assertEquals(isValid, true); - isValid = RevertSamSpark.isOutputMapHeaderValid(Arrays.asList("OUTPUT_MAP_OUTPUT_FILE_FIELD_NAME")); + isValid = RevertSamSpark.isOutputMapHeaderValid(Arrays.asList("OUTPUT")); Assert.assertEquals(isValid, false); isValid = RevertSamSpark.isOutputMapHeaderValid(Collections.EMPTY_LIST); Assert.assertEquals(isValid, false); } - @Test - public void testFilePathsWithoutMapFile() { - final SAMReadGroupRecord rg1 = new SAMReadGroupRecord("rg1"); - final SAMReadGroupRecord rg2 = new SAMReadGroupRecord("rg2"); - - final Map outputMap = RevertSamSpark.getOutputMap(null, new File("/foo/bar").getAbsolutePath(), ".bam", Arrays.asList(rg1, rg2), true); - Assert.assertEquals(outputMap.get("rg1"), IOUtils.getPath(new File("/foo/bar/rg1.bam").getAbsolutePath())); - Assert.assertEquals(outputMap.get("rg2"), IOUtils.getPath(new File("/foo/bar/rg2.bam").getAbsolutePath())); - } - @Test public void testFilePathsWithMapFile() { final Map outputMap = RevertSamSpark.getOutputMap(validOutputMap.getAbsolutePath(), null, ".bam", Collections.emptyList(), true); @@ -405,7 +319,7 @@ public void testFilePathsWithMapFile() { } @Test - public void testGetDefaultExtension() { + public static void testGetDefaultExtension() { Assert.assertEquals(RevertSamSpark.getDefaultExtension("this.is.a.sam", RevertSamSpark.FileType.dynamic), ".sam"); //Assert.assertEquals(RevertSamSpark.getDefaultExtension("this.is.a.cram", RevertSamSpark.FileType.dynamic), ".cram"); Assert.assertEquals(RevertSamSpark.getDefaultExtension("this.is.a.bam", RevertSamSpark.FileType.dynamic), ".bam"); @@ -414,8 +328,7 @@ public void testGetDefaultExtension() { @Test public void testNoRgInfoSanitize() throws Exception { - final File output = File.createTempFile("no-rg-reverted", ".sam"); - output.deleteOnExit(); + final File output = BaseTest.createTempFile("no-rg-reverted", ".sam"); final String [] args = 
new String[]{ "-I",missingRGInfo.getAbsolutePath(), "--sanitize", diff --git a/src/test/java/org/broadinstitute/hellbender/tools/spark/RevertSamSparkUnitTest.java b/src/test/java/org/broadinstitute/hellbender/tools/spark/RevertSamSparkUnitTest.java index 8341b22751d..3924133d5e9 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/spark/RevertSamSparkUnitTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/spark/RevertSamSparkUnitTest.java @@ -1,84 +1,94 @@ package org.broadinstitute.hellbender.tools.spark; -import htsjdk.samtools.SAMFileHeader; +import htsjdk.samtools.SAMReadGroupRecord; import org.broadinstitute.hellbender.CommandLineProgramTest; -import org.broadinstitute.hellbender.GATKBaseTest; -import org.broadinstitute.hellbender.testutils.ArgumentsBuilder; -import org.testng.annotations.DataProvider; +import org.broadinstitute.hellbender.utils.io.IOUtils; +import org.testng.Assert; import org.testng.annotations.Test; -import picard.sam.RevertSam; import java.io.File; -import java.nio.file.Files; -import java.nio.file.Paths; +import java.io.IOException; +import java.nio.file.Path; import java.util.Arrays; -import java.util.Collections; import java.util.List; +import java.util.Map; public class RevertSamSparkUnitTest extends CommandLineProgramTest { private final File basicSamToRevert = getTestFile("revert_sam_basic.sam"); - @DataProvider(name="positiveTestData") - public Object[][] getPostitiveTestData() { - return new Object[][] { - {null, true, true, true, true, null, null, Collections.EMPTY_LIST}, - {SAMFileHeader.SortOrder.queryname, true, true, true, false, "Hey,Dad!", null, Arrays.asList("XT")}, - {null, false, true, false, false, "Hey,Dad!", "NewLibraryName", Arrays.asList("XT")}, - {null, false, false, false, false, null, null, Collections.EMPTY_LIST} - }; + private final File validOutputMap = getTestFile("revert_sam_valid_output_map.txt"); + private final File nonExistentOutputMap = 
getTestFile("revert_sam_does_not_exist.txt"); + private final File badHeaderOutputMap = getTestFile("revert_sam_bad_header_output_map.txt"); + + @Test + public static void testFilePathsWithoutMapFile() { + final SAMReadGroupRecord rg1 = new SAMReadGroupRecord("rg1"); + final SAMReadGroupRecord rg2 = new SAMReadGroupRecord("rg2"); + + final Map outputMap = RevertSamSpark.getOutputMap(null, new File("/foo/bar").getAbsolutePath(), ".bam", Arrays.asList(rg1, rg2), true); + Assert.assertEquals(outputMap.get("rg1"), IOUtils.getPath(new File("/foo/bar/rg1.bam").getAbsolutePath())); + Assert.assertEquals(outputMap.get("rg2"), IOUtils.getPath(new File("/foo/bar/rg2.bam").getAbsolutePath())); + } + + @Test + public void testValidateOutputParamsByReadGroupMapValid() throws IOException { + final List errors = RevertSamSpark.validateOutputParamsByReadGroup(null, validOutputMap.getAbsolutePath()); + Assert.assertEquals(errors.size(), 0); + } + + @Test + public void testValidateOutputParamsByReadGroupMissingMap() throws IOException { + final List errors = RevertSamSpark.validateOutputParamsByReadGroup(null, nonExistentOutputMap.getAbsolutePath()); + Assert.assertEquals(errors.size(), 1); + Assert.assertEquals(errors.get(0).contains("Cannot read"), true); + } + + @Test + public void testValidateOutputParamsByReadGroupBadHeaderMap() throws IOException { + final List errors = RevertSamSpark.validateOutputParamsByReadGroup(null, badHeaderOutputMap.getAbsolutePath()); + Assert.assertEquals(errors.size(), 1); + Assert.assertEquals(errors.get(0).contains("Invalid header"), true); + } + + @Test + public void testValidateOutputParamsByReadGroupNoMapOrDir() throws IOException { + final List errors = RevertSamSpark.validateOutputParamsByReadGroup(null, null); + Assert.assertEquals(errors.size(), 1); + Assert.assertEquals(errors.get(0).contains("Must provide either"), true); + } + + @Test + public void testValidateOutputParamsByReadGroupDirValid() throws IOException { + final List errors = 
RevertSamSpark.validateOutputParamsByReadGroup(createTempDir("testValidateOutputParamsNotByReadGroupValid").getAbsolutePath(), null); + Assert.assertEquals(errors.size(), 0); + } + + @Test + public void testValidateOutputParamsNotByReadGroupValid() throws IOException { + final List errors = RevertSamSpark.validateOutputParamsNotByReadGroup(createTempFile("testValidateOutputParamsNotByReadGroupValid","").getAbsolutePath(), null); + Assert.assertEquals(errors.size(), 0); + } + + @Test + public void testValidateOutputParamsNotByReadGroupNoOutput() throws IOException { + final List errors = RevertSamSpark.validateOutputParamsNotByReadGroup(null, null); + Assert.assertEquals(errors.size(), 1); + Assert.assertEquals(errors.get(0).contains("output is required"), true); + } + + @Test + public void testValidateOutputParamsNotByReadGroupMap() throws IOException { + final List errors = RevertSamSpark.validateOutputParamsNotByReadGroup(null, validOutputMap.getAbsolutePath()); + Assert.assertEquals(errors.size(), 2); + Assert.assertEquals(errors.get(0).contains("Cannot provide outputMap"), true); + Assert.assertEquals(errors.get(1).contains("output is required"), true); } - @Test(dataProvider="positiveTestData") - public void basicPositiveTests(final SAMFileHeader.SortOrder so, final boolean removeDuplicates, final boolean removeAlignmentInfo, - final boolean restoreOriginalQualities, final boolean outputByReadGroup, final String sample, final String library, - final List attributesToClear) throws Exception { - - final File output = outputByReadGroup?Files.createTempDirectory("picardRevertSamTest").toFile():File.createTempFile("reverted", ".sam"); - File output0 = createTempFile("0", ".sam"); - File output1 = createTempFile("1", ".sam"); - File output2 = createTempFile("2", ".sam"); -// if (outputByReadGroup) { -// output = Files.createTempDirectory("picardRevertSamTest").toFile(); -// output0 = Paths.get(output.toString(), "0.sam").toFile(); -// output1 = 
Paths.get(output.toString(), "1.sam").toFile(); -// output2 = Paths.get(output.toString(), "2.sam").toFile(); -// } else { -// output = File.createTempFile("reverted", ".sam"); -// } - output.deleteOnExit(); - final RevertSam reverter = new RevertSam(); - final ArgumentsBuilder args = new ArgumentsBuilder(); - args.addInput(basicSamToRevert); - args.addOutput(output); - - if (outputByReadGroup) { - args.add("output-by-readgroup"); - } - if (so != null) { - args.addArgument("sort-order",so.name()); //TODO decide on sort order outputing - } -// args[index++] = "dontRemoveDuplicateInformation=" + removeDuplicates; //TODO this is unsuported - args.add("remove-alignment-inormation"); - args.add("restore-original-qualities"); - if (sample != null) { - args.addArgument("sample-alias",sample); - } - if (library != null) { - args.addArgument("library-name",library); - } - for (final String attr : attributesToClear) { - args.addArgument("attributes-to-clear",attr); - } - - runCommandLine(args); - -// if (outputByReadGroup) { -// verifyPositiveResults(output0, reverter, removeDuplicates, removeAlignmentInfo, restoreOriginalQualities, outputByReadGroup, "0", 2, sample, library); -// verifyPositiveResults(output1, reverter, removeDuplicates, removeAlignmentInfo, restoreOriginalQualities, outputByReadGroup, "1", 4, sample, library); -// verifyPositiveResults(output2, reverter, removeDuplicates, removeAlignmentInfo, restoreOriginalQualities, outputByReadGroup, "2", 2, sample, library); -// } else { -// verifyPositiveResults(output, reverter, removeDuplicates, removeAlignmentInfo, restoreOriginalQualities, outputByReadGroup, null, 8, sample, library); -// } + @Test + public static void testValidateOutputParamsNotByReadGroupDir() throws IOException { + final List errors = RevertSamSpark.validateOutputParamsNotByReadGroup(createTempDir("testValidateOutputParamsNotByReadGroupDir").getAbsolutePath(), null); + Assert.assertEquals(errors.size(), 1); + 
Assert.assertEquals(errors.get(0).contains("should not be a directory"), true); } } diff --git a/src/test/java/org/broadinstitute/hellbender/tools/spark/transforms/markduplicates/MarkDuplicatesSparkUtilsUnitTest.java b/src/test/java/org/broadinstitute/hellbender/tools/spark/transforms/markduplicates/MarkDuplicatesSparkUtilsUnitTest.java index 7fa97a7b7bc..44978d4464e 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/spark/transforms/markduplicates/MarkDuplicatesSparkUtilsUnitTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/spark/transforms/markduplicates/MarkDuplicatesSparkUtilsUnitTest.java @@ -52,7 +52,7 @@ private String getReadGroupId(final SAMFileHeader header, final int index) { } private static void check(Iterator> it, List>> expected) { - Iterator>> spanning = SparkUtils.spanningIterator(it); + Iterator>> spanning = SparkUtils.getSpanningIterator(it); ArrayList>> actual = Lists.newArrayList(spanning); Assert.assertEquals(actual, expected); } From 2b696b7b17515c34e12bd79fd9964d1043d6c6a1 Mon Sep 17 00:00:00 2001 From: James Date: Mon, 10 Dec 2018 10:35:05 -0500 Subject: [PATCH 04/15] fixing a warning --- .../broadinstitute/hellbender/tools/spark/RevertSamSpark.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/RevertSamSpark.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/RevertSamSpark.java index db79dd066a4..e44a1940603 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/spark/RevertSamSpark.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/RevertSamSpark.java @@ -198,7 +198,8 @@ public List getDefaultReadFilters() { return Collections.singletonList(ReadFilterLibrary.ALLOW_ALL_READS); } - public static List DEFAULT_ATTRIBUTES_TO_CLEAR = Collections.unmodifiableList(new ArrayList(){{ + public static List DEFAULT_ATTRIBUTES_TO_CLEAR = Collections.unmodifiableList(new ArrayList(){ + private static final long 
serialVersionUID = 1L;{ add(SAMTag.NM.name()); add(SAMTag.UQ.name()); add(SAMTag.PG.name()); From 2e486004a948ff7a34c518d0aa5ac4bb9ef96f0e Mon Sep 17 00:00:00 2001 From: James Date: Wed, 2 Jan 2019 14:53:32 -0500 Subject: [PATCH 05/15] responding to final round of comments --- .../hellbender/tools/spark/RevertSamSpark.java | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/RevertSamSpark.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/RevertSamSpark.java index e44a1940603..6147789391b 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/spark/RevertSamSpark.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/RevertSamSpark.java @@ -117,7 +117,7 @@ public class RevertSamSpark extends GATKSparkTool { " Output format can be overridden with the outputByReadgroupFileFormat option.\n" + "Note: If the program fails due to a SAM validation error, consider setting the VALIDATION_STRINGENCY option to " + "LENIENT or SILENT if the failures are expected to be obviated by the reversion process " + - "(e.g. invalid alignment information will be obviated when the keepAlignmentInformation option is used).\n" + + "(e.g. invalid alignment information will be obviated when the keep-alignment-information option is used).\n" + ""; public static final String OUTPUT_MAP_READ_GROUP_FIELD_NAME = "READ_GROUP_ID"; public static final String OUTPUT_MAP_OUTPUT_FILE_FIELD_NAME = "OUTPUT"; @@ -167,10 +167,10 @@ public class RevertSamSpark extends GATKSparkTool { public static final String DONT_REMOVE_DUPLICATE_INFORMATION_LONG_NAME = "remove-duplicate-information"; @Argument(fullName = DONT_REMOVE_DUPLICATE_INFORMATION_LONG_NAME, doc = "By default we remove duplicate read flags from all reads. 
Note that if this is true, " + " the output may have the unusual but sometimes desirable trait of having unmapped reads that are marked as duplicates.") - public boolean dontRemoveDuplicateInformation = false; + public boolean keepDuplicateInformation = false; public static final String KEEP_ALIGNMENT_INFORMATION = "keep-alignment-information"; - @Argument(fullName = KEEP_ALIGNMENT_INFORMATION, doc = "Remove all alignment information from the file.") + @Argument(fullName = KEEP_ALIGNMENT_INFORMATION, doc = "Don't remove any of the alignment information from the file.") public boolean keepAlignmentInformation = false; public static final String ATTRIBUTE_TO_CLEAR_LONG_NAME = "attributes-to-clear"; @@ -198,7 +198,7 @@ public List getDefaultReadFilters() { return Collections.singletonList(ReadFilterLibrary.ALLOW_ALL_READS); } - public static List DEFAULT_ATTRIBUTES_TO_CLEAR = Collections.unmodifiableList(new ArrayList(){ + final public static List DEFAULT_ATTRIBUTES_TO_CLEAR = Collections.unmodifiableList(new ArrayList(){ private static final long serialVersionUID = 1L;{ add(SAMTag.NM.name()); add(SAMTag.UQ.name()); @@ -530,7 +530,7 @@ public JavaRDD revertReads(JavaRDD reads, List attri }); } - if (!dontRemoveDuplicateInformation) { + if (!keepDuplicateInformation) { reads = reads.map(r -> {r.setIsDuplicate(false); return r;}); } From a8d83389faf6ffb3c6663e9a755eea44048aa4f8 Mon Sep 17 00:00:00 2001 From: James Date: Wed, 2 Jan 2019 15:06:23 -0500 Subject: [PATCH 06/15] fixing the compiler warnings --- .../tools/spark/RevertSamSparkIntegrationTest.java | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/test/java/org/broadinstitute/hellbender/tools/spark/RevertSamSparkIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/spark/RevertSamSparkIntegrationTest.java index dbd6e93a81d..b382e202ad3 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/spark/RevertSamSparkIntegrationTest.java +++ 
b/src/test/java/org/broadinstitute/hellbender/tools/spark/RevertSamSparkIntegrationTest.java @@ -23,7 +23,9 @@ public class RevertSamSparkIntegrationTest extends CommandLineProgramTest { - public static List defaultAttributesToClearPlusXT = new ArrayList() {{ + private static List defaultAttributesToClearPlusXT = new ArrayList() { + private static final long serialVersionUID = 1L; + { addAll(RevertSamSpark.DEFAULT_ATTRIBUTES_TO_CLEAR); add("XT"); }}; @@ -307,7 +309,7 @@ public void testIsOutputMapHeaderValid() { isValid = RevertSamSpark.isOutputMapHeaderValid(Arrays.asList("OUTPUT")); Assert.assertEquals(isValid, false); - isValid = RevertSamSpark.isOutputMapHeaderValid(Collections.EMPTY_LIST); + isValid = RevertSamSpark.isOutputMapHeaderValid(Collections.emptyList()); Assert.assertEquals(isValid, false); } From bd0c6f5b7110f458e8fbe07aaad471ae85f3d833 Mon Sep 17 00:00:00 2001 From: James Date: Thu, 3 Jan 2019 10:12:28 -0500 Subject: [PATCH 07/15] cleaning up a mistaken change --- .../tools/spark/RevertSamSparkIntegrationTest.java | 6 +++++- .../hellbender/testutils/SamAssertionUtils.java | 6 +++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/src/test/java/org/broadinstitute/hellbender/tools/spark/RevertSamSparkIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/spark/RevertSamSparkIntegrationTest.java index b382e202ad3..ba218672027 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/spark/RevertSamSparkIntegrationTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/spark/RevertSamSparkIntegrationTest.java @@ -23,6 +23,7 @@ public class RevertSamSparkIntegrationTest extends CommandLineProgramTest { + private static List defaultAttributesToClearPlusXT = new ArrayList() { private static final long serialVersionUID = 1L; { @@ -44,7 +45,10 @@ public class RevertSamSparkIntegrationTest extends CommandLineProgramTest { private static final String unmappedRead = "both_reads_present_only_first_aligns/2"; - + 
@Override + public String getToolTestDataDir() { + return "src/test/resources/org/broadinstitute/hellbender/tools/spark/RevertSamSpark"; + } @DataProvider(name="positiveTestData") public Object[][] positiveTestData() { diff --git a/src/testUtils/java/org/broadinstitute/hellbender/testutils/SamAssertionUtils.java b/src/testUtils/java/org/broadinstitute/hellbender/testutils/SamAssertionUtils.java index 1e4e9dd05ad..4c6fa323b5b 100644 --- a/src/testUtils/java/org/broadinstitute/hellbender/testutils/SamAssertionUtils.java +++ b/src/testUtils/java/org/broadinstitute/hellbender/testutils/SamAssertionUtils.java @@ -440,11 +440,11 @@ private static void sortSam(final File input, final File output, final File refe // We can't use ArgumentsBuilder since it assumes GATK argument names, but we're running a Picard // tool, which uses upper case argument names. final List args = new ArrayList<>(6); - args.add("--input"); + args.add("-I"); args.add(input.getAbsolutePath()); - args.add("--output"); + args.add("-O"); args.add(output.getAbsolutePath()); - args.add("--sortOrder"); + args.add("-SO"); args.add(SAMFileHeader.SortOrder.coordinate.name()); args.add("--VALIDATION_STRINGENCY"); args.add(stringency.name()); From 2b78ecd23270315aeb652ac53ec07028238ca17c Mon Sep 17 00:00:00 2001 From: James Date: Thu, 3 Jan 2019 11:42:02 -0500 Subject: [PATCH 08/15] silly !, save it for snake --- .../hellbender/utils/codecs/table/TableCodec.java | 2 +- .../tools/spark/RevertSamSparkIntegrationTest.java | 4 +--- .../hellbender/tools/spark/RevertSamSparkUnitTest.java | 5 +++++ 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/utils/codecs/table/TableCodec.java b/src/main/java/org/broadinstitute/hellbender/utils/codecs/table/TableCodec.java index 142cc1a09d9..d859faf74a3 100644 --- a/src/main/java/org/broadinstitute/hellbender/utils/codecs/table/TableCodec.java +++ 
b/src/main/java/org/broadinstitute/hellbender/utils/codecs/table/TableCodec.java @@ -82,7 +82,7 @@ public TableCodec() { @Override public TableFeature decode(final String line) { - if ((headerDelimiter != null && ! line.startsWith(headerDelimiter)) || + if ((headerDelimiter != null && line.startsWith(headerDelimiter)) || (headerDelimiter == null && !havePassedHeader) || line.startsWith(COMMENT_DELIMITER) || line.startsWith(IGV_HEADER_DELIMITER)) { havePassedHeader = true; diff --git a/src/test/java/org/broadinstitute/hellbender/tools/spark/RevertSamSparkIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/spark/RevertSamSparkIntegrationTest.java index ba218672027..168115679ed 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/spark/RevertSamSparkIntegrationTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/spark/RevertSamSparkIntegrationTest.java @@ -22,8 +22,6 @@ @Test(groups = "Spark") public class RevertSamSparkIntegrationTest extends CommandLineProgramTest { - - private static List defaultAttributesToClearPlusXT = new ArrayList() { private static final long serialVersionUID = 1L; { @@ -47,7 +45,7 @@ public class RevertSamSparkIntegrationTest extends CommandLineProgramTest { @Override public String getToolTestDataDir() { - return "src/test/resources/org/broadinstitute/hellbender/tools/spark/RevertSamSpark"; + return "src/test/resources/org/broadinstitute/hellbender/tools/spark/revertsamspark"; } @DataProvider(name="positiveTestData") diff --git a/src/test/java/org/broadinstitute/hellbender/tools/spark/RevertSamSparkUnitTest.java b/src/test/java/org/broadinstitute/hellbender/tools/spark/RevertSamSparkUnitTest.java index 3924133d5e9..44b2611d98a 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/spark/RevertSamSparkUnitTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/spark/RevertSamSparkUnitTest.java @@ -21,6 +21,11 @@ public class RevertSamSparkUnitTest extends CommandLineProgramTest { 
private final File nonExistentOutputMap = getTestFile("revert_sam_does_not_exist.txt"); private final File badHeaderOutputMap = getTestFile("revert_sam_bad_header_output_map.txt"); + @Override + public String getToolTestDataDir() { + return "src/test/resources/org/broadinstitute/hellbender/tools/spark/revertsamspark"; + } + @Test public static void testFilePathsWithoutMapFile() { final SAMReadGroupRecord rg1 = new SAMReadGroupRecord("rg1"); From f55d6b3758dd546db3da01a377376ed634a4cce3 Mon Sep 17 00:00:00 2001 From: emeryj Date: Thu, 3 Jan 2019 11:55:31 -0500 Subject: [PATCH 09/15] solving the worlds problems --- .../spark/revertsamspark/missing-rg-info.sam | 243 ------------------ .../revert_sam_bad_header_output_map.txt | 3 - .../spark/revertsamspark/revert_sam_basic.sam | 22 -- .../revert_sam_sample_library_override.sam | 21 -- .../revertsamspark/revert_sam_single_end.sam | 5 - .../revert_sam_valid_output_map.txt | 3 - .../tools/spark/revertsamspark/test.fasta | 40 --- 7 files changed, 337 deletions(-) delete mode 100644 src/test/resources/org/broadinstitute/hellbender/tools/spark/revertsamspark/missing-rg-info.sam delete mode 100644 src/test/resources/org/broadinstitute/hellbender/tools/spark/revertsamspark/revert_sam_bad_header_output_map.txt delete mode 100755 src/test/resources/org/broadinstitute/hellbender/tools/spark/revertsamspark/revert_sam_basic.sam delete mode 100755 src/test/resources/org/broadinstitute/hellbender/tools/spark/revertsamspark/revert_sam_sample_library_override.sam delete mode 100755 src/test/resources/org/broadinstitute/hellbender/tools/spark/revertsamspark/revert_sam_single_end.sam delete mode 100644 src/test/resources/org/broadinstitute/hellbender/tools/spark/revertsamspark/revert_sam_valid_output_map.txt delete mode 100644 src/test/resources/org/broadinstitute/hellbender/tools/spark/revertsamspark/test.fasta diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/spark/revertsamspark/missing-rg-info.sam 
b/src/test/resources/org/broadinstitute/hellbender/tools/spark/revertsamspark/missing-rg-info.sam deleted file mode 100644 index 6f3ef07d858..00000000000 --- a/src/test/resources/org/broadinstitute/hellbender/tools/spark/revertsamspark/missing-rg-info.sam +++ /dev/null @@ -1,243 +0,0 @@ -@HD VN:1.5 SO:queryname -@SQ SN:20 LN:63025520 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta AS:GRCh37 M5:0dec9660ec1efaaf33281c0d5ea2560f SP:Homo Sapiens -@PG ID:GATK PrintReads VN:3.6-0-g89b7209 CL:readGroup=null platform=null number=-1 sample_file=[] sample_name=[] simplify=false no_pg_tag=false -H0164ALXX140820:2:1101:17727:54981 83 20 10000954 48 151M = 10000786 -319 TAATATTTGTAACTTACAATTACTTCAACTGAATAATAAAAGAATTGGACTAGATTTCTCCAACATCTCTCTCTTTTGGCTTTATGTTAGATAATGCTAAATTTTCATCATATCCAAACATGCTATATAATTTTATGAACTGTTACAGAGT A-BFGBGGDBE>@>D00F@0FB.6@D@=/..@8@@.8BFFDCGEGDGGGGEG0FGGGGBGGGGGGGGEBGGGGGGGGGGGGGGGGGGGGGEGGGGGGEGEGGGE@DGCGFGGGGGEGCF=@FGGGG RG:Z:A diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/spark/revertsamspark/revert_sam_valid_output_map.txt b/src/test/resources/org/broadinstitute/hellbender/tools/spark/revertsamspark/revert_sam_valid_output_map.txt deleted file mode 100644 index ed5a13a1647..00000000000 --- a/src/test/resources/org/broadinstitute/hellbender/tools/spark/revertsamspark/revert_sam_valid_output_map.txt +++ /dev/null @@ -1,3 +0,0 @@ -READ_GROUP_ID OUTPUT -rg1 /path/to/my_rg_1.ubam -rg2 /path/to/my_rg_2.ubam diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/spark/revertsamspark/test.fasta b/src/test/resources/org/broadinstitute/hellbender/tools/spark/revertsamspark/test.fasta deleted file mode 100644 index 6bcf2a8ce01..00000000000 --- a/src/test/resources/org/broadinstitute/hellbender/tools/spark/revertsamspark/test.fasta +++ /dev/null @@ -1,40 +0,0 @@ ->chr1 -TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC -TTTAAGGGCAAAAAAAAAACAATACAATAATAGAGTACGTTAACACTCCA -A ->chr2 
-CATCTCTACAAGCGCGTCCTACCAGACGCGCTTCCGATCTGAGAGCATAC -TTTTCATTGGATTCCAGCACAACTCCATTTTTGATCCACTTGACACCTTT -T ->chr3 -CGTATGCGCTTTTTATGTCGCCCACAGTGCCTAGTATAGCCCCTGCTAAT -AAAAAGAGATGAATACGTTTACTTAAAAAACTGAAACTAGGAATGTGCAA -A ->chr4 -CGTGATACCAACTCATGTTCACAGCCAAAGCCTGAAGCTGTCTATTATAT -TTCTCAACCATAAACTTTTGCCTCAGGCATCCGCAGAATGGTTTGCAGCC -C ->chr5 -NTCTCATTTAAAAATGGTTATAAAAACATTTATGCTGAAAAGGTGAAGTT -CATTAATGAACAGGCTGACTGTCTCACTATCGCGTTCGCAAGACGTTATC -T ->chr6 -NAATTGTTCTTAGTTTCTCGGTTTATGTGCTCTTCCAGGTGGGTAACACA -ATAATGGCCTTCCAGATCGTAAGAGCGACGTGTGTTGCACCAGTGTCGAT -C ->chr7 -CAACAGAAGGGGGGATCTGTGTTTGTGTTTCGGATTTCCTGCTGAAAAGG -TTTTCGGGTCCCCCCCCCATCCCGATTTCCTTCCGCAGCTTACCTCCCGA -AACGCGGCATCCCGATTTCTTTCCGCAGCTAACCTCCCGACAGATCGGCA -GCGCGTCGTGTAGGTCACTATGGTACATCTTGTCGTGCGGCCAGAGCATA -CAACAGAAGGGGGGATCTGTGTTTGTGTTTCGGATTTCCTGCTGAAAAGG -TTTTCGGGTCCCCCCCCCATCCCGATTTCCTTCCGCAGCTTACCTCCCGA -AACGCGGCATCCCGATTTCTTTCCGCAGCTAACCTCCCGACAGATCGGCA -GCGCGTCGTGTAGGTCACTATGGTACATCTTGTCGTGCGGCCAGAGCATA -CAAC ->chr8 -CACATCGTGAATCTTACAATCTGCGGTTTCAGATGTGGAGCGATGTGTGA -GAGATTGAGCAACTGATCTGAAAAGCAGACACAGCTATTCCTAAGATGAC -CCCAGGTTCAAATGTGCAGCCCCTTTTGAGAGATTTTTTTTTTGGGCTGG -AAAAAAGACACAGCTATTCCTAAGATGACAAGATCAGAAAAAAAGTCAAG -CA From de998df1e3014dac4b387e47d8128fab834aaa72 Mon Sep 17 00:00:00 2001 From: James Date: Thu, 3 Jan 2019 11:56:14 -0500 Subject: [PATCH 10/15] getting rid of that grossness --- .../tools/spark/RevertSamSparkIntegrationTest.java | 5 ----- .../hellbender/tools/spark/RevertSamSparkUnitTest.java | 5 ----- 2 files changed, 10 deletions(-) diff --git a/src/test/java/org/broadinstitute/hellbender/tools/spark/RevertSamSparkIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/spark/RevertSamSparkIntegrationTest.java index 168115679ed..10928173f0a 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/spark/RevertSamSparkIntegrationTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/spark/RevertSamSparkIntegrationTest.java @@ -43,11 +43,6 @@ 
public class RevertSamSparkIntegrationTest extends CommandLineProgramTest { private static final String unmappedRead = "both_reads_present_only_first_aligns/2"; - @Override - public String getToolTestDataDir() { - return "src/test/resources/org/broadinstitute/hellbender/tools/spark/revertsamspark"; - } - @DataProvider(name="positiveTestData") public Object[][] positiveTestData() { return new Object[][] { diff --git a/src/test/java/org/broadinstitute/hellbender/tools/spark/RevertSamSparkUnitTest.java b/src/test/java/org/broadinstitute/hellbender/tools/spark/RevertSamSparkUnitTest.java index 44b2611d98a..3924133d5e9 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/spark/RevertSamSparkUnitTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/spark/RevertSamSparkUnitTest.java @@ -21,11 +21,6 @@ public class RevertSamSparkUnitTest extends CommandLineProgramTest { private final File nonExistentOutputMap = getTestFile("revert_sam_does_not_exist.txt"); private final File badHeaderOutputMap = getTestFile("revert_sam_bad_header_output_map.txt"); - @Override - public String getToolTestDataDir() { - return "src/test/resources/org/broadinstitute/hellbender/tools/spark/revertsamspark"; - } - @Test public static void testFilePathsWithoutMapFile() { final SAMReadGroupRecord rg1 = new SAMReadGroupRecord("rg1"); From 0c00b231e758668f9eb533be7de2a8dfb5b8c1e7 Mon Sep 17 00:00:00 2001 From: emeryj Date: Thu, 3 Jan 2019 11:57:51 -0500 Subject: [PATCH 11/15] readding the files i deleted by mistake --- .../revertsamspark/missing-rg-info.sam | 243 ++++++++++++++++++ .../revert_sam_bad_header_output_map.txt | 3 + .../revertsamspark/revert_sam_basic.sam | 22 ++ .../revert_sam_sample_library_override.sam | 21 ++ .../revertsamspark/revert_sam_single_end.sam | 5 + .../revert_sam_valid_output_map.txt | 3 + .../RevertSamSpark/revertsamspark/test.fasta | 40 +++ 7 files changed, 337 insertions(+) create mode 100644 
src/test/resources/org/broadinstitute/hellbender/tools/spark/RevertSamSpark/revertsamspark/missing-rg-info.sam create mode 100644 src/test/resources/org/broadinstitute/hellbender/tools/spark/RevertSamSpark/revertsamspark/revert_sam_bad_header_output_map.txt create mode 100755 src/test/resources/org/broadinstitute/hellbender/tools/spark/RevertSamSpark/revertsamspark/revert_sam_basic.sam create mode 100755 src/test/resources/org/broadinstitute/hellbender/tools/spark/RevertSamSpark/revertsamspark/revert_sam_sample_library_override.sam create mode 100755 src/test/resources/org/broadinstitute/hellbender/tools/spark/RevertSamSpark/revertsamspark/revert_sam_single_end.sam create mode 100644 src/test/resources/org/broadinstitute/hellbender/tools/spark/RevertSamSpark/revertsamspark/revert_sam_valid_output_map.txt create mode 100644 src/test/resources/org/broadinstitute/hellbender/tools/spark/RevertSamSpark/revertsamspark/test.fasta diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/spark/RevertSamSpark/revertsamspark/missing-rg-info.sam b/src/test/resources/org/broadinstitute/hellbender/tools/spark/RevertSamSpark/revertsamspark/missing-rg-info.sam new file mode 100644 index 00000000000..6f3ef07d858 --- /dev/null +++ b/src/test/resources/org/broadinstitute/hellbender/tools/spark/RevertSamSpark/revertsamspark/missing-rg-info.sam @@ -0,0 +1,243 @@ +@HD VN:1.5 SO:queryname +@SQ SN:20 LN:63025520 UR:http://www.broadinstitute.org/ftp/pub/seq/references/Homo_sapiens_assembly19.fasta AS:GRCh37 M5:0dec9660ec1efaaf33281c0d5ea2560f SP:Homo Sapiens +@PG ID:GATK PrintReads VN:3.6-0-g89b7209 CL:readGroup=null platform=null number=-1 sample_file=[] sample_name=[] simplify=false no_pg_tag=false +H0164ALXX140820:2:1101:17727:54981 83 20 10000954 48 151M = 10000786 -319 TAATATTTGTAACTTACAATTACTTCAACTGAATAATAAAAGAATTGGACTAGATTTCTCCAACATCTCTCTCTTTTGGCTTTATGTTAGATAATGCTAAATTTTCATCATATCCAAACATGCTATATAATTTTATGAACTGTTACAGAGT 
A-BFGBGGDBE>@>D00F@0FB.6@D@=/..@8@@.8BFFDCGEGDGGGGEG0FGGGGBGGGGGGGGEBGGGGGGGGGGGGGGGGGGGGGEGGGGGGEGEGGGE@DGCGFGGGGGEGCF=@FGGGG RG:Z:A diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/spark/RevertSamSpark/revertsamspark/revert_sam_valid_output_map.txt b/src/test/resources/org/broadinstitute/hellbender/tools/spark/RevertSamSpark/revertsamspark/revert_sam_valid_output_map.txt new file mode 100644 index 00000000000..ed5a13a1647 --- /dev/null +++ b/src/test/resources/org/broadinstitute/hellbender/tools/spark/RevertSamSpark/revertsamspark/revert_sam_valid_output_map.txt @@ -0,0 +1,3 @@ +READ_GROUP_ID OUTPUT +rg1 /path/to/my_rg_1.ubam +rg2 /path/to/my_rg_2.ubam diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/spark/RevertSamSpark/revertsamspark/test.fasta b/src/test/resources/org/broadinstitute/hellbender/tools/spark/RevertSamSpark/revertsamspark/test.fasta new file mode 100644 index 00000000000..6bcf2a8ce01 --- /dev/null +++ b/src/test/resources/org/broadinstitute/hellbender/tools/spark/RevertSamSpark/revertsamspark/test.fasta @@ -0,0 +1,40 @@ +>chr1 +TTCATGCTGAAGCCCTCTTACGATCGTACAGATGCAAATATTAACAAACC +TTTAAGGGCAAAAAAAAAACAATACAATAATAGAGTACGTTAACACTCCA +A +>chr2 +CATCTCTACAAGCGCGTCCTACCAGACGCGCTTCCGATCTGAGAGCATAC +TTTTCATTGGATTCCAGCACAACTCCATTTTTGATCCACTTGACACCTTT +T +>chr3 +CGTATGCGCTTTTTATGTCGCCCACAGTGCCTAGTATAGCCCCTGCTAAT +AAAAAGAGATGAATACGTTTACTTAAAAAACTGAAACTAGGAATGTGCAA +A +>chr4 +CGTGATACCAACTCATGTTCACAGCCAAAGCCTGAAGCTGTCTATTATAT +TTCTCAACCATAAACTTTTGCCTCAGGCATCCGCAGAATGGTTTGCAGCC +C +>chr5 +NTCTCATTTAAAAATGGTTATAAAAACATTTATGCTGAAAAGGTGAAGTT +CATTAATGAACAGGCTGACTGTCTCACTATCGCGTTCGCAAGACGTTATC +T +>chr6 +NAATTGTTCTTAGTTTCTCGGTTTATGTGCTCTTCCAGGTGGGTAACACA +ATAATGGCCTTCCAGATCGTAAGAGCGACGTGTGTTGCACCAGTGTCGAT +C +>chr7 +CAACAGAAGGGGGGATCTGTGTTTGTGTTTCGGATTTCCTGCTGAAAAGG +TTTTCGGGTCCCCCCCCCATCCCGATTTCCTTCCGCAGCTTACCTCCCGA +AACGCGGCATCCCGATTTCTTTCCGCAGCTAACCTCCCGACAGATCGGCA +GCGCGTCGTGTAGGTCACTATGGTACATCTTGTCGTGCGGCCAGAGCATA 
+CAACAGAAGGGGGGATCTGTGTTTGTGTTTCGGATTTCCTGCTGAAAAGG +TTTTCGGGTCCCCCCCCCATCCCGATTTCCTTCCGCAGCTTACCTCCCGA +AACGCGGCATCCCGATTTCTTTCCGCAGCTAACCTCCCGACAGATCGGCA +GCGCGTCGTGTAGGTCACTATGGTACATCTTGTCGTGCGGCCAGAGCATA +CAAC +>chr8 +CACATCGTGAATCTTACAATCTGCGGTTTCAGATGTGGAGCGATGTGTGA +GAGATTGAGCAACTGATCTGAAAAGCAGACACAGCTATTCCTAAGATGAC +CCCAGGTTCAAATGTGCAGCCCCTTTTGAGAGATTTTTTTTTTGGGCTGG +AAAAAAGACACAGCTATTCCTAAGATGACAAGATCAGAAAAAAAGTCAAG +CA From 7238f2fd1c5567757b7d0206c7ea00f76d3769da Mon Sep 17 00:00:00 2001 From: emeryj Date: Thu, 3 Jan 2019 14:20:49 -0500 Subject: [PATCH 12/15] moving files because directories are hard --- .../spark/RevertSamSpark/{revertsamspark => }/missing-rg-info.sam | 0 .../{revertsamspark => }/revert_sam_bad_header_output_map.txt | 0 .../RevertSamSpark/{revertsamspark => }/revert_sam_basic.sam | 0 .../{revertsamspark => }/revert_sam_sample_library_override.sam | 0 .../RevertSamSpark/{revertsamspark => }/revert_sam_single_end.sam | 0 .../{revertsamspark => }/revert_sam_valid_output_map.txt | 0 .../tools/spark/RevertSamSpark/{revertsamspark => }/test.fasta | 0 7 files changed, 0 insertions(+), 0 deletions(-) rename src/test/resources/org/broadinstitute/hellbender/tools/spark/RevertSamSpark/{revertsamspark => }/missing-rg-info.sam (100%) rename src/test/resources/org/broadinstitute/hellbender/tools/spark/RevertSamSpark/{revertsamspark => }/revert_sam_bad_header_output_map.txt (100%) rename src/test/resources/org/broadinstitute/hellbender/tools/spark/RevertSamSpark/{revertsamspark => }/revert_sam_basic.sam (100%) rename src/test/resources/org/broadinstitute/hellbender/tools/spark/RevertSamSpark/{revertsamspark => }/revert_sam_sample_library_override.sam (100%) rename src/test/resources/org/broadinstitute/hellbender/tools/spark/RevertSamSpark/{revertsamspark => }/revert_sam_single_end.sam (100%) rename src/test/resources/org/broadinstitute/hellbender/tools/spark/RevertSamSpark/{revertsamspark => }/revert_sam_valid_output_map.txt (100%) rename 
src/test/resources/org/broadinstitute/hellbender/tools/spark/RevertSamSpark/{revertsamspark => }/test.fasta (100%) diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/spark/RevertSamSpark/revertsamspark/missing-rg-info.sam b/src/test/resources/org/broadinstitute/hellbender/tools/spark/RevertSamSpark/missing-rg-info.sam similarity index 100% rename from src/test/resources/org/broadinstitute/hellbender/tools/spark/RevertSamSpark/revertsamspark/missing-rg-info.sam rename to src/test/resources/org/broadinstitute/hellbender/tools/spark/RevertSamSpark/missing-rg-info.sam diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/spark/RevertSamSpark/revertsamspark/revert_sam_bad_header_output_map.txt b/src/test/resources/org/broadinstitute/hellbender/tools/spark/RevertSamSpark/revert_sam_bad_header_output_map.txt similarity index 100% rename from src/test/resources/org/broadinstitute/hellbender/tools/spark/RevertSamSpark/revertsamspark/revert_sam_bad_header_output_map.txt rename to src/test/resources/org/broadinstitute/hellbender/tools/spark/RevertSamSpark/revert_sam_bad_header_output_map.txt diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/spark/RevertSamSpark/revertsamspark/revert_sam_basic.sam b/src/test/resources/org/broadinstitute/hellbender/tools/spark/RevertSamSpark/revert_sam_basic.sam similarity index 100% rename from src/test/resources/org/broadinstitute/hellbender/tools/spark/RevertSamSpark/revertsamspark/revert_sam_basic.sam rename to src/test/resources/org/broadinstitute/hellbender/tools/spark/RevertSamSpark/revert_sam_basic.sam diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/spark/RevertSamSpark/revertsamspark/revert_sam_sample_library_override.sam b/src/test/resources/org/broadinstitute/hellbender/tools/spark/RevertSamSpark/revert_sam_sample_library_override.sam similarity index 100% rename from 
src/test/resources/org/broadinstitute/hellbender/tools/spark/RevertSamSpark/revertsamspark/revert_sam_sample_library_override.sam rename to src/test/resources/org/broadinstitute/hellbender/tools/spark/RevertSamSpark/revert_sam_sample_library_override.sam diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/spark/RevertSamSpark/revertsamspark/revert_sam_single_end.sam b/src/test/resources/org/broadinstitute/hellbender/tools/spark/RevertSamSpark/revert_sam_single_end.sam similarity index 100% rename from src/test/resources/org/broadinstitute/hellbender/tools/spark/RevertSamSpark/revertsamspark/revert_sam_single_end.sam rename to src/test/resources/org/broadinstitute/hellbender/tools/spark/RevertSamSpark/revert_sam_single_end.sam diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/spark/RevertSamSpark/revertsamspark/revert_sam_valid_output_map.txt b/src/test/resources/org/broadinstitute/hellbender/tools/spark/RevertSamSpark/revert_sam_valid_output_map.txt similarity index 100% rename from src/test/resources/org/broadinstitute/hellbender/tools/spark/RevertSamSpark/revertsamspark/revert_sam_valid_output_map.txt rename to src/test/resources/org/broadinstitute/hellbender/tools/spark/RevertSamSpark/revert_sam_valid_output_map.txt diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/spark/RevertSamSpark/revertsamspark/test.fasta b/src/test/resources/org/broadinstitute/hellbender/tools/spark/RevertSamSpark/test.fasta similarity index 100% rename from src/test/resources/org/broadinstitute/hellbender/tools/spark/RevertSamSpark/revertsamspark/test.fasta rename to src/test/resources/org/broadinstitute/hellbender/tools/spark/RevertSamSpark/test.fasta From 01b2bc876f6298eb6f263cd64fb2023702ec4480 Mon Sep 17 00:00:00 2001 From: James Date: Thu, 3 Jan 2019 15:18:31 -0500 Subject: [PATCH 13/15] responded to yet another round of comments, when, pray tell, will this chirade end? 
--- .../tools/spark/RevertSamSpark.java | 5 +-- .../utils/codecs/table/TableCodec.java | 5 ++- .../hellbender/utils/read/GATKRead.java | 20 ----------- .../spark/RevertSamSparkIntegrationTest.java | 33 ----------------- .../tools/spark/RevertSamSparkUnitTest.java | 35 +++++++++++++++++++ ...tMarkDuplicatesCommandLineProgramTest.java | 2 +- 6 files changed, 41 insertions(+), 59 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/RevertSamSpark.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/RevertSamSpark.java index 6147789391b..376e689d93c 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/spark/RevertSamSpark.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/RevertSamSpark.java @@ -29,6 +29,7 @@ import org.broadinstitute.hellbender.utils.codecs.table.TableFeature; import org.broadinstitute.hellbender.utils.io.IOUtils; import org.broadinstitute.hellbender.utils.read.GATKRead; +import org.broadinstitute.hellbender.utils.read.ReadUtils; import org.broadinstitute.hellbender.utils.spark.SparkUtils; import picard.cmdline.programgroups.ReadDataManipulationProgramGroup; import scala.Tuple2; @@ -521,7 +522,7 @@ public JavaRDD revertReads(JavaRDD reads, List attri if (!dontRestoreOriginalQualities) { reads = reads.map(r -> { - final byte[] oq = r.getOriginalBaseQualities(); + final byte[] oq = ReadUtils.getOriginalBaseQualities(r); if (oq != null) { r.setBaseQualities(oq); r.setAttribute("OQ", (String)null); @@ -545,7 +546,7 @@ public JavaRDD revertReads(JavaRDD reads, List attri rec.setIsUnplaced(); rec.setCigar(SAMRecord.NO_ALIGNMENT_CIGAR); - rec.setInferredInsertSize(0); + rec.setFragmentLength(0); rec.setIsSecondaryAlignment(false); rec.setIsProperlyPaired(false); diff --git a/src/main/java/org/broadinstitute/hellbender/utils/codecs/table/TableCodec.java b/src/main/java/org/broadinstitute/hellbender/utils/codecs/table/TableCodec.java index d859faf74a3..e5ebc5cdabe 100644 --- 
a/src/main/java/org/broadinstitute/hellbender/utils/codecs/table/TableCodec.java +++ b/src/main/java/org/broadinstitute/hellbender/utils/codecs/table/TableCodec.java @@ -5,6 +5,7 @@ import org.broadinstitute.hellbender.exceptions.GATKException; import org.broadinstitute.hellbender.exceptions.UserException; import org.broadinstitute.hellbender.utils.SimpleInterval; +import org.broadinstitute.hellbender.utils.Utils; import java.util.ArrayList; import java.util.Arrays; @@ -67,9 +68,7 @@ public final class TableCodec extends AsciiFeatureCodec { */ public TableCodec(final String headerLineDelimiter) { super(TableFeature.class); - if ( "".equals(headerLineDelimiter) ) { - throw new GATKException("HeaderLineDelimiter must either be a valid delimiter or null"); - } + Utils.nonEmpty(headerLineDelimiter); headerDelimiter = headerLineDelimiter; } diff --git a/src/main/java/org/broadinstitute/hellbender/utils/read/GATKRead.java b/src/main/java/org/broadinstitute/hellbender/utils/read/GATKRead.java index 66139a4ca89..c7c72db9104 100644 --- a/src/main/java/org/broadinstitute/hellbender/utils/read/GATKRead.java +++ b/src/main/java/org/broadinstitute/hellbender/utils/read/GATKRead.java @@ -304,20 +304,6 @@ default String getBasesString() { */ byte[] getBaseQualities(); - /** - * If the original base quality scores have been store in the "OQ" tag will return the numeric - * score as a byte[] - */ - default byte[] getOriginalBaseQualities() { - final String oqString = getAttributeAsString("OQ"); - if (oqString != null && !oqString.isEmpty()) { - return SAMUtils.fastqToPhred(oqString); - } - else { - return null; - } - } - /** * @return Base qualities as binary phred scores (not ASCII), or an empty byte[] if base qualities are not present. * @@ -495,12 +481,6 @@ default int numCigarElements(){ */ void setIsUnplaced(); - /** - * insert size (difference btw 5' end of read & 5' end of mate), if possible, else 0. - * Negative if mate maps to lower position than read. 
- */ - void setInferredInsertSize(int insertSize); - /** * @return True if this read's mate is unmapped (this includes mates that have a position but are explicitly marked as unmapped, * as well as mates that lack a fully-defined position but are not explicitly marked as unmapped). Otherwise false. diff --git a/src/test/java/org/broadinstitute/hellbender/tools/spark/RevertSamSparkIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/spark/RevertSamSparkIntegrationTest.java index 10928173f0a..e0e64ec7282 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/spark/RevertSamSparkIntegrationTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/spark/RevertSamSparkIntegrationTest.java @@ -273,31 +273,6 @@ public Object[][] getNegativeTestData() { }; } - @Test - public void testAssertAllReadGroupsMappedSuccess() { - final SAMReadGroupRecord rg1 = new SAMReadGroupRecord("rg1"); - final SAMReadGroupRecord rg2 = new SAMReadGroupRecord("rg2"); - - final Map outputMap = new HashMap<>(); - outputMap.put("rg1", IOUtils.getPath(new File("/foo/bar/rg1.bam").getAbsolutePath())); - outputMap.put("rg2", IOUtils.getPath(new File("/foo/bar/rg2.bam").getAbsolutePath())); - RevertSamSpark.assertAllReadGroupsMapped(outputMap, Arrays.asList(rg1, rg2)); - RevertSamSpark.assertAllReadGroupsMapped(outputMap, Arrays.asList(rg1)); - RevertSamSpark.assertAllReadGroupsMapped(outputMap, Arrays.asList(rg2)); - } - - @Test(expectedExceptions = {GATKException.class}) - public void testAssertAllReadGroupsMappedFailure() { - final SAMReadGroupRecord rg1 = new SAMReadGroupRecord("rg1"); - final SAMReadGroupRecord rg2 = new SAMReadGroupRecord("rg2"); - final SAMReadGroupRecord rg3 = new SAMReadGroupRecord("rg3"); - - final Map outputMap = new HashMap<>(); - outputMap.put("rg1", IOUtils.getPath(new File("/foo/bar/rg1.bam").getAbsolutePath())); - outputMap.put("rg2", IOUtils.getPath(new File("/foo/bar/rg2.bam").getAbsolutePath())); - 
RevertSamSpark.assertAllReadGroupsMapped(outputMap, Arrays.asList(rg1, rg2, rg3)); - } - @Test public void testIsOutputMapHeaderValid() { boolean isValid = RevertSamSpark.isOutputMapHeaderValid(Arrays.asList("READ_GROUP_ID","OUTPUT")); @@ -317,14 +292,6 @@ public void testFilePathsWithMapFile() { Assert.assertEquals(outputMap.get("rg2"), IOUtils.getPath(new File("/path/to/my_rg_2.ubam").getAbsolutePath())); } - @Test - public static void testGetDefaultExtension() { - Assert.assertEquals(RevertSamSpark.getDefaultExtension("this.is.a.sam", RevertSamSpark.FileType.dynamic), ".sam"); - //Assert.assertEquals(RevertSamSpark.getDefaultExtension("this.is.a.cram", RevertSamSpark.FileType.dynamic), ".cram"); - Assert.assertEquals(RevertSamSpark.getDefaultExtension("this.is.a.bam", RevertSamSpark.FileType.dynamic), ".bam"); - Assert.assertEquals(RevertSamSpark.getDefaultExtension("foo", RevertSamSpark.FileType.dynamic), ".bam"); - } - @Test public void testNoRgInfoSanitize() throws Exception { final File output = BaseTest.createTempFile("no-rg-reverted", ".sam"); diff --git a/src/test/java/org/broadinstitute/hellbender/tools/spark/RevertSamSparkUnitTest.java b/src/test/java/org/broadinstitute/hellbender/tools/spark/RevertSamSparkUnitTest.java index 3924133d5e9..82052a0df05 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/spark/RevertSamSparkUnitTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/spark/RevertSamSparkUnitTest.java @@ -2,6 +2,7 @@ import htsjdk.samtools.SAMReadGroupRecord; import org.broadinstitute.hellbender.CommandLineProgramTest; +import org.broadinstitute.hellbender.exceptions.GATKException; import org.broadinstitute.hellbender.utils.io.IOUtils; import org.testng.Assert; import org.testng.annotations.Test; @@ -10,6 +11,7 @@ import java.io.IOException; import java.nio.file.Path; import java.util.Arrays; +import java.util.HashMap; import java.util.List; import java.util.Map; @@ -85,10 +87,43 @@ public void 
testValidateOutputParamsNotByReadGroupMap() throws IOException { Assert.assertEquals(errors.get(1).contains("output is required"), true); } + @Test + public static void testGetDefaultExtension() { + Assert.assertEquals(RevertSamSpark.getDefaultExtension("this.is.a.sam", RevertSamSpark.FileType.dynamic), ".sam"); + //Assert.assertEquals(RevertSamSpark.getDefaultExtension("this.is.a.cram", RevertSamSpark.FileType.dynamic), ".cram"); //TODO https://github.com/broadinstitute/gatk/issues/5559 + Assert.assertEquals(RevertSamSpark.getDefaultExtension("this.is.a.bam", RevertSamSpark.FileType.dynamic), ".bam"); + Assert.assertEquals(RevertSamSpark.getDefaultExtension("foo", RevertSamSpark.FileType.dynamic), ".bam"); + } + @Test public static void testValidateOutputParamsNotByReadGroupDir() throws IOException { final List errors = RevertSamSpark.validateOutputParamsNotByReadGroup(createTempDir("testValidateOutputParamsNotByReadGroupDir").getAbsolutePath(), null); Assert.assertEquals(errors.size(), 1); Assert.assertEquals(errors.get(0).contains("should not be a directory"), true); } + + @Test + public void testAssertAllReadGroupsMappedSuccess() { + final SAMReadGroupRecord rg1 = new SAMReadGroupRecord("rg1"); + final SAMReadGroupRecord rg2 = new SAMReadGroupRecord("rg2"); + + final Map outputMap = new HashMap<>(); + outputMap.put("rg1", IOUtils.getPath(new File("/foo/bar/rg1.bam").getAbsolutePath())); + outputMap.put("rg2", IOUtils.getPath(new File("/foo/bar/rg2.bam").getAbsolutePath())); + RevertSamSpark.assertAllReadGroupsMapped(outputMap, Arrays.asList(rg1, rg2)); + RevertSamSpark.assertAllReadGroupsMapped(outputMap, Arrays.asList(rg1)); + RevertSamSpark.assertAllReadGroupsMapped(outputMap, Arrays.asList(rg2)); + } + + @Test(expectedExceptions = {GATKException.class}) + public void testAssertAllReadGroupsMappedFailure() { + final SAMReadGroupRecord rg1 = new SAMReadGroupRecord("rg1"); + final SAMReadGroupRecord rg2 = new SAMReadGroupRecord("rg2"); + final 
SAMReadGroupRecord rg3 = new SAMReadGroupRecord("rg3"); + + final Map outputMap = new HashMap<>(); + outputMap.put("rg1", IOUtils.getPath(new File("/foo/bar/rg1.bam").getAbsolutePath())); + outputMap.put("rg2", IOUtils.getPath(new File("/foo/bar/rg2.bam").getAbsolutePath())); + RevertSamSpark.assertAllReadGroupsMapped(outputMap, Arrays.asList(rg1, rg2, rg3)); + } } diff --git a/src/test/java/org/broadinstitute/hellbender/tools/walkers/markduplicates/AbstractMarkDuplicatesCommandLineProgramTest.java b/src/test/java/org/broadinstitute/hellbender/tools/walkers/markduplicates/AbstractMarkDuplicatesCommandLineProgramTest.java index f5395c98278..7b99364050a 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/walkers/markduplicates/AbstractMarkDuplicatesCommandLineProgramTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/walkers/markduplicates/AbstractMarkDuplicatesCommandLineProgramTest.java @@ -755,7 +755,7 @@ public void testDuplicateDetectionDataProviderWithMetrics(final File sam, final final List lines = FileUtils.readLines(metricsFile, StandardCharsets.UTF_8); Assert.assertTrue(lines.get(0).startsWith("##"), lines.get(0)); Assert.assertTrue(lines.get(1).startsWith("#"), lines.get(1)); - Assert.assertTrue(lines.get(1).toLowerCase().contains("--input"), lines.get(1)); //Note: lowercase because picard uses input and GATK uses input for full name + Assert.assertTrue(lines.get(1).toLowerCase().contains("--input"), lines.get(1)); //Note: lowercase because picard uses INPUT and GATK uses input for full name Assert.assertTrue(lines.get(2).startsWith("##"), lines.get(2)); Assert.assertTrue(lines.get(3).startsWith("# Started on:"), lines.get(3)); Assert.assertTrue(lines.get(4).trim().isEmpty()); From f5b70621b6f935b69d21ccf98cf7afec4ad5bf91 Mon Sep 17 00:00:00 2001 From: James Date: Thu, 3 Jan 2019 16:01:42 -0500 Subject: [PATCH 14/15] fixed a spurious override --- .../hellbender/utils/read/SAMRecordToGATKReadAdapter.java | 8 -------- 1 file changed, 
8 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/utils/read/SAMRecordToGATKReadAdapter.java b/src/main/java/org/broadinstitute/hellbender/utils/read/SAMRecordToGATKReadAdapter.java index dd517240cd6..595027df2c0 100644 --- a/src/main/java/org/broadinstitute/hellbender/utils/read/SAMRecordToGATKReadAdapter.java +++ b/src/main/java/org/broadinstitute/hellbender/utils/read/SAMRecordToGATKReadAdapter.java @@ -447,13 +447,6 @@ public void setIsUnplaced() { samRecord.setMappingQuality(SAMRecord.NO_MAPPING_QUALITY); } - @Override - public void setInferredInsertSize(int insertSize) { - clearCachedValues(); - - samRecord.setInferredInsertSize(insertSize); - } - @Override public boolean mateIsUnmapped() { Utils.validate(isPaired(), "Cannot get mate information for an unpaired read"); @@ -494,7 +487,6 @@ public void setMateIsUnplaced() { samRecord.setMateAlignmentStart(SAMRecord.NO_ALIGNMENT_START); } - @Override public boolean isReverseStrand() { return samRecord.getReadNegativeStrandFlag(); From 2730a471a8d62ef2f81d18e631ec0f6c89867cef Mon Sep 17 00:00:00 2001 From: James Date: Fri, 4 Jan 2019 10:34:33 -0500 Subject: [PATCH 15/15] No, that really did want to check for emptieness --- .../hellbender/utils/codecs/table/TableCodec.java | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/main/java/org/broadinstitute/hellbender/utils/codecs/table/TableCodec.java b/src/main/java/org/broadinstitute/hellbender/utils/codecs/table/TableCodec.java index e5ebc5cdabe..53b01d59251 100644 --- a/src/main/java/org/broadinstitute/hellbender/utils/codecs/table/TableCodec.java +++ b/src/main/java/org/broadinstitute/hellbender/utils/codecs/table/TableCodec.java @@ -68,7 +68,10 @@ public final class TableCodec extends AsciiFeatureCodec { */ public TableCodec(final String headerLineDelimiter) { super(TableFeature.class); - Utils.nonEmpty(headerLineDelimiter); + if ( "".equals(headerLineDelimiter) ) { + // Note, it is valid for headerLineDelimiter to be 
null, just not empty as the regex breaks in that case. + throw new GATKException("HeaderLineDelimiter must either be a valid delimiter or null"); + } headerDelimiter = headerLineDelimiter; }