diff --git a/src/main/java/org/broadinstitute/hellbender/engine/filters/ReadGroupBlackListReadFilter.java b/src/main/java/org/broadinstitute/hellbender/engine/filters/ReadGroupBlackListReadFilter.java index b4f204ff688..18fa48f5f90 100644 --- a/src/main/java/org/broadinstitute/hellbender/engine/filters/ReadGroupBlackListReadFilter.java +++ b/src/main/java/org/broadinstitute/hellbender/engine/filters/ReadGroupBlackListReadFilter.java @@ -39,14 +39,15 @@ public final class ReadGroupBlackListReadFilter extends ReadFilter implements Se // Command line parser requires a no-arg constructor public ReadGroupBlackListReadFilter() {}; - /** - * Creates a filter using the lists of files with blacklisted read groups. - * Any entry can be a path to a file (ending with "list" or "txt" which - * will load blacklist from that file. This scheme works recursively - * (ie the file may contain names of further files etc). - */ + /** + * Creates a filter using the lists of files with blacklisted read groups. + * Any entry can be a path to a file (ending with "list" or "txt") which + * will load the blacklist from that file. This scheme works recursively + * (i.e. the file may contain names of further files, etc.).
+ */ public ReadGroupBlackListReadFilter(final List blackLists, final SAMFileHeader header) { super.setHeader(header); + this.blackList.addAll(blackLists); final Map> filters = new TreeMap<>(); for (String blackList : blackLists) { try { diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/CNNScoreVariants.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/CNNScoreVariants.java index ec0cc27ab59..76fc4a456a9 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/CNNScoreVariants.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/vqsr/CNNScoreVariants.java @@ -10,9 +10,9 @@ import org.broadinstitute.barclay.help.DocumentedFeature; import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; import org.broadinstitute.hellbender.engine.*; -import org.broadinstitute.hellbender.engine.filters.VariantFilter; -import org.broadinstitute.hellbender.engine.filters.VariantFilterLibrary; +import org.broadinstitute.hellbender.engine.filters.*; import org.broadinstitute.hellbender.exceptions.GATKException; +import org.broadinstitute.hellbender.utils.haplotype.HaplotypeBAMWriter; import org.broadinstitute.hellbender.utils.io.Resource; import org.broadinstitute.hellbender.utils.io.IOUtils; import org.broadinstitute.hellbender.utils.python.StreamingPythonScriptExecutor; @@ -170,7 +170,7 @@ public class CNNScoreVariants extends VariantWalker { private int curBatchSize = 0; private int windowEnd = windowSize / 2; - private int windowStart = (windowSize / 2) - 1; + private int windowStart = windowSize / 2; private boolean waitforBatchCompletion = false; private File scoreFile; @@ -190,7 +190,6 @@ protected String[] customCommandLineValidation() { return new String[]{"No default architecture for tensor type:" + tensorType.name()}; } } - return null; } @@ -208,6 +207,17 @@ protected VariantFilter makeVariantFilter(){ } } + @Override + public List getDefaultReadFilters() { + List readFilters = 
new ArrayList<>(); + readFilters.addAll(super.getDefaultReadFilters()); + List filterList = new ArrayList<>(); + filterList.add("ID:" + HaplotypeBAMWriter.DEFAULT_HAPLOTYPE_READ_GROUP_ID); + filterList.add("ID:" + HaplotypeBAMWriter.DEFAULT_GATK3_HAPLOTYPE_READ_GROUP_ID); + readFilters.add(new ReadGroupBlackListReadFilter(filterList, null)); + return readFilters; + } + @Override public void onTraversalStart() { scoreKey = getScoreKeyAndCheckModelAndReadsHarmony(); @@ -319,7 +329,7 @@ private void transferReadsToPythonViaFifo(final VariantContext variant, final Re } Iterator readIt = readsContext.iterator(); if (!readIt.hasNext()) { - logger.warn("No reads at contig:" + variant.getContig() + "site:" + String.valueOf(variant.getStart())); + logger.warn("No reads at contig:" + variant.getContig() + " site:" + String.valueOf(variant.getStart())); } while (readIt.hasNext()) { sb.append(GATKReadToString(readIt.next())); diff --git a/src/main/java/org/broadinstitute/hellbender/utils/haplotype/HaplotypeBAMWriter.java b/src/main/java/org/broadinstitute/hellbender/utils/haplotype/HaplotypeBAMWriter.java index 18e43d47cca..82ee03ceaac 100644 --- a/src/main/java/org/broadinstitute/hellbender/utils/haplotype/HaplotypeBAMWriter.java +++ b/src/main/java/org/broadinstitute/hellbender/utils/haplotype/HaplotypeBAMWriter.java @@ -25,7 +25,9 @@ public abstract class HaplotypeBAMWriter implements AutoCloseable { */ private long uniqueNameCounter = 1; - private static final String DEFAULT_HAPLOTYPE_READ_GROUP_ID = "ArtificialHaplotypeRG"; + public static final String DEFAULT_HAPLOTYPE_READ_GROUP_ID = "ArtificialHaplotypeRG"; + // For read filters that need backwards compatibility with the GATK3 artificial haplotype + public static final String DEFAULT_GATK3_HAPLOTYPE_READ_GROUP_ID = "ArtificialHaplotype"; private static final int bestHaplotypeMQ = 60; private static final int otherMQ = 0; diff --git a/src/main/python/org/broadinstitute/hellbender/vqsr_cnn/vqsr_cnn/inference.py 
b/src/main/python/org/broadinstitute/hellbender/vqsr_cnn/vqsr_cnn/inference.py index 420174c21c9..d2d70bccc32 100644 --- a/src/main/python/org/broadinstitute/hellbender/vqsr_cnn/vqsr_cnn/inference.py +++ b/src/main/python/org/broadinstitute/hellbender/vqsr_cnn/vqsr_cnn/inference.py @@ -43,14 +43,14 @@ def score_and_write_batch(args, model, file_out, fifo, batch_size, python_batch_ variant_data = [] read_batch = [] - for i in range(batch_size): + for _ in range(batch_size): fifo_line = fifo.readline() fifo_data = fifo_line.split(defines.SEPARATOR_CHAR) variant_data.append(fifo_data[0] + '\t' + fifo_data[1] + '\t' + fifo_data[2] + '\t' + fifo_data[3]) reference_batch.append(reference_string_to_tensor(fifo_data[4])) annotation_batch.append(annotation_string_to_tensor(args, fifo_data[5])) - variant_types.append(fifo_data[6]) + variant_types.append(fifo_data[6].strip()) fidx = 7 # 7 Because above we parsed: contig pos ref alt reference_string annotation variant_type if args.tensor_name in defines.TENSOR_MAPS_2D and len(fifo_data) > fidx: @@ -69,7 +69,7 @@ def score_and_write_batch(args, model, file_out, fifo, batch_size, python_batch_ _, ref_start, _ = get_variant_window(args, var) insert_dict = get_inserts(args, read_tuples, var) tensor = read_tuples_to_read_tensor(args, read_tuples, ref_start, insert_dict) - reference_sequence_into_tensor(args, fifo_data[4], tensor) + reference_sequence_into_tensor(args, fifo_data[4], tensor, insert_dict) if os.path.exists(tensor_dir): _write_tensor_to_hd5(args, tensor, annotation_batch[-1], fifo_data[0], fifo_data[1], fifo_data[6]) read_batch.append(tensor) @@ -201,8 +201,8 @@ def cigar_string_to_tuples(cigar): def get_variant_window(args, variant): index_offset = (args.window_size//2) - reference_start = variant.pos-(index_offset+1) - reference_end = variant.pos+index_offset + reference_start = variant.pos-index_offset + reference_end = variant.pos + index_offset + (args.window_size%2) return index_offset, reference_start, 
reference_end @@ -245,6 +245,7 @@ def read_tuples_to_read_tensor(args, read_tuples, ref_start, insert_dict): if i == args.window_size: break + if b == defines.SKIP_CHAR: continue elif flag_start == -1: @@ -290,8 +291,6 @@ def read_tuples_to_read_tensor(args, read_tuples, ref_start, insert_dict): else: tensor[channel_map['mapping_quality'], j, flag_start:flag_end] = mq - - return tensor @@ -335,9 +334,15 @@ def sequence_and_qualities_from_read(args, read, ref_start, insert_dict): return rseq, rqual -def reference_sequence_into_tensor(args, reference_seq, tensor): +def reference_sequence_into_tensor(args, reference_seq, tensor, insert_dict): ref_offset = len(set(args.input_symbols.values())) + for i in sorted(insert_dict.keys(), key=int, reverse=True): + if i < 0: + reference_seq = defines.INDEL_CHAR*insert_dict[i] + reference_seq + else: + reference_seq = reference_seq[:i] + defines.INDEL_CHAR*insert_dict[i] + reference_seq[i:] + for i,b in enumerate(reference_seq): if i == args.window_size: break diff --git a/src/main/resources/org/broadinstitute/hellbender/tools/walkers/vqsr/training.py b/src/main/resources/org/broadinstitute/hellbender/tools/walkers/vqsr/training.py index 0bf801d7b70..4fe90ac70fd 100644 --- a/src/main/resources/org/broadinstitute/hellbender/tools/walkers/vqsr/training.py +++ b/src/main/resources/org/broadinstitute/hellbender/tools/walkers/vqsr/training.py @@ -607,11 +607,11 @@ def reference_sequence_into_tensor(args, reference_seq, tensor): else: tensor[ref_offset+args.input_symbols[b], :, i] = 1.0 elif b in vqsr_cnn.AMBIGUITY_CODES: - ambiguous_vector = np.transpose(np.tile(vqsr_cnn.AMBIGUITY_CODES[b], (args.read_limit, 1))) + ambiguous_vector = np.tile(vqsr_cnn.AMBIGUITY_CODES[b], (args.read_limit, 1)) if K.image_data_format() == 'channels_last': tensor[:, i, ref_offset:ref_offset+4] = ambiguous_vector else: - tensor[ref_offset:ref_offset+4, :, i] = ambiguous_vector + tensor[ref_offset:ref_offset+4, :, i] = np.transpose(ambiguous_vector) def 
flag_to_array(flag): diff --git a/src/test/resources/large/VQSR/expected/cnn_1d_chr20_subset_expected.vcf b/src/test/resources/large/VQSR/expected/cnn_1d_chr20_subset_expected.vcf index 6089f28b3ef..42d1f857e2c 100644 --- a/src/test/resources/large/VQSR/expected/cnn_1d_chr20_subset_expected.vcf +++ b/src/test/resources/large/VQSR/expected/cnn_1d_chr20_subset_expected.vcf @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6cc53d3d586ab308119932bb6e4dea09ebcb3a4e4b16bd5115a591a0e5aa93d8 -size 149999 +oid sha256:5f14de41c9d12091e3900372b80950a582955be1b43f021f88415edc7ab2325c +size 150006 diff --git a/src/test/resources/large/VQSR/expected/cnn_2d_chr20_subset_expected.vcf b/src/test/resources/large/VQSR/expected/cnn_2d_chr20_subset_expected.vcf index 6e484cd3e6c..438dc0b4f67 100644 --- a/src/test/resources/large/VQSR/expected/cnn_2d_chr20_subset_expected.vcf +++ b/src/test/resources/large/VQSR/expected/cnn_2d_chr20_subset_expected.vcf @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8c5366f7d7589e14f76086ff4bde8c341860833c891bf2698e85480eb3d980a1 -size 150070 +oid sha256:8f4a4ccf2ebbaed80ee40313f0f50be4287b947fc1e331159de871f7f0856c7c +size 150076