-
Notifications
You must be signed in to change notification settings - Fork 594
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Added an option to opt out of the protections to sorting multiple bam inputs #5974
Changes from 2 commits
174cb31
61e6e48
da6fff1
ff5cfa1
79b6337
0de40f9
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -6,6 +6,7 @@ | |||||
import org.apache.spark.api.java.JavaPairRDD; | ||||||
import org.apache.spark.api.java.JavaRDD; | ||||||
import org.apache.spark.api.java.JavaSparkContext; | ||||||
import org.broadinstitute.barclay.argparser.Advanced; | ||||||
import org.broadinstitute.barclay.argparser.Argument; | ||||||
import org.broadinstitute.barclay.argparser.ArgumentCollection; | ||||||
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; | ||||||
|
@@ -127,6 +128,8 @@ | |||||
programGroup = ReadDataManipulationProgramGroup.class) | ||||||
public final class MarkDuplicatesSpark extends GATKSparkTool { | ||||||
private static final long serialVersionUID = 1L; | ||||||
public static final String ALLOW_MULTIPLE_SORT_ORDERS_IN_INPUT_ARG = "allow-multiple-sort-orders-in-input"; | ||||||
public static final String TREAT_UNSORTED_AS_OREDED = "treat-unsorted-as-querygroup-ordered-for-multiple-inputs"; | ||||||
|
||||||
@Override | ||||||
public boolean requiresReads() { return true; } | ||||||
|
@@ -140,6 +143,16 @@ public final class MarkDuplicatesSpark extends GATKSparkTool { | |||||
fullName = StandardArgumentDefinitions.METRICS_FILE_LONG_NAME) | ||||||
protected String metricsFile; | ||||||
|
||||||
@Advanced | ||||||
@Argument(doc = "Override to allow non-queryname sorted inputs for multiple input bams.", optional=true, | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
fullName = ALLOW_MULTIPLE_SORT_ORDERS_IN_INPUT_ARG) | ||||||
protected boolean allowMultipleSortOrders = false; | ||||||
|
||||||
@Advanced | ||||||
@Argument(doc = "Treat unsorted files as query-group orderd files. NOTE: this may result in mark duplicates crashing if the file is unordered", optional=true, | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
I might make this warning a bit stronger. |
||||||
fullName = TREAT_UNSORTED_AS_OREDED) | ||||||
protected boolean treatUnsortedAsOrdered = false; | ||||||
|
||||||
@ArgumentCollection | ||||||
protected MarkDuplicatesSparkArgumentCollection markDuplicatesSparkArgumentCollection = new MarkDuplicatesSparkArgumentCollection(); | ||||||
|
||||||
|
@@ -298,10 +311,14 @@ protected void runTool(final JavaSparkContext ctx) { | |||||
// Check if we are using multiple inputs that the headers are all in the correct querygrouped ordering, if so set the aggregate header to reflect this | ||||||
Map<String, SAMFileHeader> headerMap = getReadSourceHeaderMap(); | ||||||
if (headerMap.size() > 1) { | ||||||
headerMap.entrySet().stream().forEach(h -> {if(!ReadUtils.isReadNameGroupedBam(h.getValue())) { | ||||||
throw new UserException("Multiple inputs to MarkDuplicatesSpark detected. MarkDuplicatesSpark requires all inputs to be queryname sorted or querygroup-sorted for multi-input processing but input "+h.getKey()+" was sorted in "+h.getValue().getSortOrder()+" order"); | ||||||
}}); | ||||||
mergedHeader.setGroupOrder(SAMFileHeader.GroupOrder.query); | ||||||
if (!allowMultipleSortOrders) { | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So if this is set then multiple bams will always be sorted even if they are all coordinate grouped already? It probably shouldn't do that. |
||||||
headerMap.entrySet().stream().forEach(h -> { | ||||||
if (!ReadUtils.isReadNameGroupedBam(h.getValue()) && (!treatUnsortedAsOrdered && h.getValue().getSortOrder().equals(SAMFileHeader.SortOrder.unsorted))) { | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. treatUnsortedAsOrdered should work for either I would probably pull out a function here that does this check to make it a little clearer. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Treat unsortedAsOrdered should also apply when using a single input bam. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. done |
||||||
throw new UserException("Multiple inputs to MarkDuplicatesSpark detected. MarkDuplicatesSpark requires all inputs to be queryname sorted or querygroup-sorted for multi-input processing but input " + h.getKey() + " was sorted in " + h.getValue().getSortOrder() + " order. Try running with '"+ALLOW_MULTIPLE_SORT_ORDERS_IN_INPUT_ARG+"' to run by sorting all the input."); | ||||||
} | ||||||
}); | ||||||
mergedHeader.setGroupOrder(SAMFileHeader.GroupOrder.query); | ||||||
} | ||||||
} | ||||||
|
||||||
JavaRDD<GATKRead> reads = getReads(); | ||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -482,4 +482,26 @@ public void testAssertCorrectSortOrderMultipleBams() { | |
args.addInput(new File(TEST_DATA_DIR,"example.chr1.1-1K.unmarkedDups.noDups.bam")); | ||
runCommandLine(args); | ||
} | ||
|
||
@Test | ||
public void testAssertCorrectSortOrderMultipleBamsOverriding() { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. add a test case for a single unsorted bam, and one that rejects when there's a coordinate sorted bam along side an unsorted bam |
||
final File output = createTempFile("supplementaryReadUnmappedMate", "bam"); | ||
final ArgumentsBuilder args = new ArgumentsBuilder(); | ||
args.addOutput(output); | ||
args.addInput(new File(TEST_DATA_DIR,"optical_dupes.bam")); | ||
args.addInput(new File(TEST_DATA_DIR,"example.chr1.1-1K.unmarkedDups.noDups.bam")); | ||
args.addArgument(MarkDuplicatesSpark.ALLOW_MULTIPLE_SORT_ORDERS_IN_INPUT_ARG); | ||
runCommandLine(args); | ||
} | ||
|
||
@Test | ||
public void testAssertAssumeUnsortedFilesAreQueryGroupedFiles() { | ||
final File output = createTempFile("supplementaryReadUnmappedMate", "bam"); | ||
final ArgumentsBuilder args = new ArgumentsBuilder(); | ||
args.addOutput(output); | ||
args.addInput(new File(TEST_DATA_DIR,"optical_dupes.queryname.bam")); | ||
args.addInput(new File(TEST_DATA_DIR,"optical_dupes.unsorted.querygrouped.sam")); | ||
args.addArgument(MarkDuplicatesSpark.TREAT_UNSORTED_AS_OREDED); | ||
runCommandLine(args); | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
@HD VN:1.5 SO:unsorted | ||
@SQ SN:chr1 LN:101 | ||
@SQ SN:chr2 LN:101 | ||
@SQ SN:chr3 LN:101 | ||
@SQ SN:chr4 LN:101 | ||
@SQ SN:chr5 LN:101 | ||
@SQ SN:chr6 LN:101 | ||
@SQ SN:chr7 LN:404 | ||
@SQ SN:chr8 LN:202 | ||
@RG ID:1AAXX.1 SM:Hi,Mom! LB:mylib PL:ILLUMINA | ||
@PG ID:MarkDuplicates PN:MarkDuplicates VN:1 CL:MarkDuplicates merge1.sam PP:bwa | ||
@PG ID:bwa PN:bwa VN:1 CL:bwa aln | ||
C4N4WACXX140821:8:1112:2344:1984 83 chr7 1 255 101M = 302 201 CAACAGAAGCNGGNATCTGTGTTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAAN )'.*.+2,))&&'&*/)-&*-)&.-)&)&),/-&&..)./.,.).*&&,&.&&-)&&&0*&&&&&&&&/32/,01460&&/6/*0*/2/283//36868/& PG:Z:MarkDuplicates RG:Z:1AAXX.1 | ||
C4N4WACXX140821:8:1112:2344:1984 163 chr7 302 255 101M = 1 -201 NCGCGGCATCNCGATTTCTTTCCGCAGCTAACCTCCCGACAGATCGGCAGCGCGTCGTGTAGGTTATTATGGTACATCTTGTCGTGCGGCNAGAGCATACA &/15445666651/566666553+2/14/&/555512+3/)-'/-&-'*+))*''13+3)'//++''/'))/3+&*5++)&'2+&+/*&-&&*)&-./1'1 PG:Z:MarkDuplicates RG:Z:1AAXX.1 | ||
C4N4WACXX140821:8:1112:2344:1985 83 chr7 1 255 101M = 302 201 CAACAGAAGCNGGNATCTGTGTTTGTGTTTCGGATTTCCTGCTGAANNGNTTNTCGNNTCNNNNNNNNATCCCGATTTCNTTCCGCAGCTNACCTCCCAAN )'.*.+2,))&&'&*/)-&*-)&.-)&)&),/-&&..)./.,.).*&&,&.&&-)&&&0*&&&&&&&&/32/,01460&&/6/*0*/2/283//36868/& PG:Z:MarkDuplicates RG:Z:1AAXX.1 | ||
C4N4WACXX140821:8:1112:2344:1985 163 chr7 302 255 101M = 1 -201 NCGCGGCATCNCGATTTCTTTCCGCAGCTAACCTCCCGACAGATCGGCAGCGCGTCGTGTAGGTTATTATGGTACATCTTGTCGTGCGGCNAGAGCATACA &/15445666651/566666553+2/14/&/555512+3/)-'/-&-'*+))*''13+3)'//++''/'))/3+&*5++)&'2+&+/*&-&&*)&-./1'1 PG:Z:MarkDuplicates RG:Z:1AAXX.1 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
typo OREDED -> ORDERED