Skip to content

Commit

Permalink
Added an option to opt out of the protections to sorting multiple bam…
Browse files Browse the repository at this point in the history
… inputs and the ability to treat unsorted inputs as queryGroup sorted in MarkDuplicatesSpark (#5974)
  • Loading branch information
jamesemery authored May 31, 2019
1 parent a47aa22 commit 44ca2e9
Show file tree
Hide file tree
Showing 3 changed files with 92 additions and 4 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.broadinstitute.barclay.argparser.Advanced;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.ArgumentCollection;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
Expand Down Expand Up @@ -127,6 +128,8 @@
programGroup = ReadDataManipulationProgramGroup.class)
public final class MarkDuplicatesSpark extends GATKSparkTool {
private static final long serialVersionUID = 1L;
public static final String ALLOW_MULTIPLE_SORT_ORDERS_IN_INPUT_ARG = "allow-multiple-sort-orders-in-input";
public static final String TREAT_UNSORTED_AS_ORDERED = "treat-unsorted-as-querygroup-ordered";

@Override
public boolean requiresReads() { return true; }
Expand All @@ -140,6 +143,16 @@ public final class MarkDuplicatesSpark extends GATKSparkTool {
fullName = StandardArgumentDefinitions.METRICS_FILE_LONG_NAME)
protected String metricsFile;

@Advanced
@Argument(doc = "Allow non-queryname sorted inputs when specifying multiple input bams.", optional=true,
fullName = ALLOW_MULTIPLE_SORT_ORDERS_IN_INPUT_ARG)
protected boolean allowMultipleSortOrders = false;

@Advanced
@Argument(doc = "Treat unsorted files as query-group orderd files. WARNING: This option disables a basic safety check and may result in unexpected behavior if the file is truly unordered", optional=true,
fullName = TREAT_UNSORTED_AS_ORDERED)
protected boolean treatUnsortedAsOrdered = false;

@ArgumentCollection
protected MarkDuplicatesSparkArgumentCollection markDuplicatesSparkArgumentCollection = new MarkDuplicatesSparkArgumentCollection();

Expand Down Expand Up @@ -298,10 +311,35 @@ protected void runTool(final JavaSparkContext ctx) {
// Check if we are using multiple inputs that the headers are all in the correct querygrouped ordering, if so set the aggregate header to reflect this
Map<String, SAMFileHeader> headerMap = getReadSourceHeaderMap();
if (headerMap.size() > 1) {
headerMap.entrySet().stream().forEach(h -> {if(!ReadUtils.isReadNameGroupedBam(h.getValue())) {
throw new UserException("Multiple inputs to MarkDuplicatesSpark detected. MarkDuplicatesSpark requires all inputs to be queryname sorted or querygroup-sorted for multi-input processing but input "+h.getKey()+" was sorted in "+h.getValue().getSortOrder()+" order");
}});
mergedHeader.setGroupOrder(SAMFileHeader.GroupOrder.query);
final Optional<Map.Entry<String, SAMFileHeader>> badlySorted = headerMap.entrySet()
.stream()
.filter(h -> !treatAsReadGroupOrdered(h.getValue(), treatUnsortedAsOrdered))
.findFirst();

if(badlySorted.isPresent()) {
if (allowMultipleSortOrders) {
//don't set an ordering, the files will all be sorted downstream
logger.info("Input files are not all grouped by read name so they will be sorted together.");
} else {
final Map.Entry<String, SAMFileHeader> badPair = badlySorted.get();
throw new UserException(
"Multiple inputs to MarkDuplicatesSpark detected. MarkDuplicatesSpark requires all inputs to be queryname sorted " +
"or querygroup-sorted for multi-input processing but input " + badPair.getKey() + " was sorted in " + badPair
.getValue().getSortOrder() + " order");
}
} else {
// The default sort order for merged input files is unsorted, so this will be fed to the tool to be sorted
if (!allowMultipleSortOrders) {
mergedHeader.setGroupOrder(SAMFileHeader.GroupOrder.query);
}
}

// If there is only one file and we are in treatUnsortedAsOrdered mode than set its group order accordingly.
} else {
if (treatUnsortedAsOrdered && (mergedHeader.getSortOrder().equals(SAMFileHeader.SortOrder.unknown) || mergedHeader.getSortOrder().equals(SAMFileHeader.SortOrder.unsorted))) {
logger.warn("Input bam was marked as " + mergedHeader.getSortOrder().toString() + " but " + TREAT_UNSORTED_AS_ORDERED + " is specified so it's being treated as read name grouped");
mergedHeader.setGroupOrder(SAMFileHeader.GroupOrder.query);
}
}

JavaRDD<GATKRead> reads = getReads();
Expand Down Expand Up @@ -332,4 +370,16 @@ protected void runTool(final JavaSparkContext ctx) {
writeReads(ctx, output, readsForWriting, mergedHeader, true);
}

// helper method to determin if an input header is to be treated as a query group sorted file.
private boolean treatAsReadGroupOrdered(SAMFileHeader header, boolean treatUnsortedAsReadGrouped) {
final SAMFileHeader.SortOrder sortOrder = header.getSortOrder();
if( ReadUtils.isReadNameGroupedBam(header) ){
return true;
} else if ( treatUnsortedAsReadGrouped && (sortOrder.equals(SAMFileHeader.SortOrder.unknown) || sortOrder.equals(SAMFileHeader.SortOrder.unsorted))) {
logger.warn("Input bam was marked as " + sortOrder.toString() + " but " + TREAT_UNSORTED_AS_ORDERED + " is specified so it's being treated as read name grouped");
return true;
}
return false;
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -482,4 +482,26 @@ public void testAssertCorrectSortOrderMultipleBams() {
args.addInput(new File(TEST_DATA_DIR,"example.chr1.1-1K.unmarkedDups.noDups.bam"));
runCommandLine(args);
}

@Test
public void testAssertCorrectSortOrderMultipleBamsOverriding() {
final File output = createTempFile("supplementaryReadUnmappedMate", "bam");
final ArgumentsBuilder args = new ArgumentsBuilder();
args.addOutput(output);
args.addInput(new File(TEST_DATA_DIR,"optical_dupes.bam"));
args.addInput(new File(TEST_DATA_DIR,"example.chr1.1-1K.unmarkedDups.noDups.bam"));
args.addArgument(MarkDuplicatesSpark.ALLOW_MULTIPLE_SORT_ORDERS_IN_INPUT_ARG);
runCommandLine(args);
}

@Test
public void testAssertAssumeUnsortedFilesAreQueryGroupedFiles() {
final File output = createTempFile("supplementaryReadUnmappedMate", "bam");
final ArgumentsBuilder args = new ArgumentsBuilder();
args.addOutput(output);
args.addInput(new File(TEST_DATA_DIR,"optical_dupes.queryname.bam"));
args.addInput(new File(TEST_DATA_DIR,"optical_dupes.unsorted.querygrouped.sam"));
args.addArgument(MarkDuplicatesSpark.TREAT_UNSORTED_AS_ORDERED);
runCommandLine(args);
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
@HD VN:1.5 SO:unsorted
@SQ SN:chr1 LN:101
@SQ SN:chr2 LN:101
@SQ SN:chr3 LN:101
@SQ SN:chr4 LN:101
@SQ SN:chr5 LN:101
@SQ SN:chr6 LN:101
@SQ SN:chr7 LN:404
@SQ SN:chr8 LN:202
@RG ID:1AAXX.1 SM:Hi,Mom! LB:mylib PL:ILLUMINA
@PG ID:MarkDuplicates PN:MarkDuplicates VN:1 CL:MarkDuplicates merge1.sam PP:bwa
@PG ID:bwa PN:bwa VN:1 CL:bwa aln
C4N4WACXX140821:8:1112:2344:1984 83 chr7 1 255 101M = 302 201 CAACAGAAGCNGGNATCTGTGTTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAAN )'.*.+2,))&&'&*/)-&*-)&.-)&)&),/-&&..)./.,.).*&&,&.&&-)&&&0*&&&&&&&&/32/,01460&&/6/*0*/2/283//36868/& PG:Z:MarkDuplicates RG:Z:1AAXX.1
C4N4WACXX140821:8:1112:2344:1984 163 chr7 302 255 101M = 1 -201 NCGCGGCATCNCGATTTCTTTCCGCAGCTAACCTCCCGACAGATCGGCAGCGCGTCGTGTAGGTTATTATGGTACATCTTGTCGTGCGGCNAGAGCATACA &/15445666651/566666553+2/14/&/555512+3/)-'/-&-'*+))*''13+3)'//++''/'))/3+&*5++)&'2+&+/*&-&&*)&-./1'1 PG:Z:MarkDuplicates RG:Z:1AAXX.1
C4N4WACXX140821:8:1112:2344:1985 83 chr7 1 255 101M = 302 201 CAACAGAAGCNGGNATCTGTGTTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAAN )'.*.+2,))&&'&*/)-&*-)&.-)&)&),/-&&..)./.,.).*&&,&.&&-)&&&0*&&&&&&&&/32/,01460&&/6/*0*/2/283//36868/& PG:Z:MarkDuplicates RG:Z:1AAXX.1
C4N4WACXX140821:8:1112:2344:1985 163 chr7 302 255 101M = 1 -201 NCGCGGCATCNCGATTTCTTTCCGCAGCTAACCTCCCGACAGATCGGCAGCGCGTCGTGTAGGTTATTATGGTACATCTTGTCGTGCGGCNAGAGCATACA &/15445666651/566666553+2/14/&/555512+3/)-'/-&-'*+))*''13+3)'//++''/'))/3+&*5++)&'2+&+/*&-&&*)&-./1'1 PG:Z:MarkDuplicates RG:Z:1AAXX.1

0 comments on commit 44ca2e9

Please sign in to comment.