From 0ed9e2270d34b8e7511707b7428179851864eae2 Mon Sep 17 00:00:00 2001 From: Aaron Hatcher Date: Tue, 27 Aug 2024 13:51:03 -0400 Subject: [PATCH 01/17] Pushing naive implementation to test for correctness --- .dockstore.yml | 1 + scripts/variantstore/wdl/GvsAssignIds.wdl | 15 ++++ .../wdl/GvsJointVariantCalling.wdl | 2 + .../tools/gvs/common/PloidyUtils.java | 55 ++++++++++++ .../gvs/extract/ExtractCohortEngine.java | 21 +---- .../gvs/ingest/CreateVariantIngestFiles.java | 26 ++++-- .../tools/gvs/ingest/RefCreator.java | 33 +++++-- .../tools/gvs/ingest/SamplePloidyCreator.java | 87 +++++++++++++++++++ 8 files changed, 208 insertions(+), 32 deletions(-) create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/gvs/common/PloidyUtils.java create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/SamplePloidyCreator.java diff --git a/.dockstore.yml b/.dockstore.yml index ed23039d3d4..2ad77ca4753 100644 --- a/.dockstore.yml +++ b/.dockstore.yml @@ -251,6 +251,7 @@ workflows: branches: - master - ah_var_store + - ah_var_store tags: - /.*/ - name: GvsBeta diff --git a/scripts/variantstore/wdl/GvsAssignIds.wdl b/scripts/variantstore/wdl/GvsAssignIds.wdl index 795fc7d9699..66926bca94e 100644 --- a/scripts/variantstore/wdl/GvsAssignIds.wdl +++ b/scripts/variantstore/wdl/GvsAssignIds.wdl @@ -28,6 +28,7 @@ workflow GvsAssignIds { String vcf_header_lines_schema_json = '[{"name":"vcf_header_lines_hash","type":"STRING","mode":"REQUIRED"}, {"name":"vcf_header_lines","type":"STRING","mode":"REQUIRED"},{"name":"is_expected_unique","type":"BOOLEAN","mode":"REQUIRED"}]' String sample_vcf_header_schema_json = '[{"name": "sample_id","type": "INTEGER","mode": "REQUIRED"}, {"name":"vcf_header_lines_hash","type":"STRING","mode":"REQUIRED"}]' String sample_load_status_schema_json = '[{"name": "sample_id","type": "INTEGER","mode": "REQUIRED"},{"name":"status","type":"STRING","mode":"REQUIRED"}, {"name":"event_timestamp","type":"TIMESTAMP","mode":"REQUIRED"}]' + String sample_chromosome_ploidy_schema_json = '[{"name": "sample_Id","type": "INTEGER","mode": "REQUIRED"},{"name": "chromosome","type": "INTEGER","mode": "REQUIRED"},{"name": "ploidy","type": "INTEGER","mode": "REQUIRED"}]' if (!defined(git_hash) || !defined(cloud_sdk_docker)) { call Utils.GetToolVersions { @@ -71,6 +72,20 @@ workflow GvsAssignIds { cloud_sdk_docker = effective_cloud_sdk_docker, } + call GvsCreateTables.CreateTables as CreateSamplePloidyMapTable { + input: + project_id = project_id, + dataset_name = dataset_name, + go = ValidateSamples.done, + datatype = "sample_chromosome_ploidy", + schema_json = sample_chromosome_ploidy_schema_json, + max_table_id = 1, + superpartitioned = "false", + partitioned = "false", + clustering_field = "sample_id", + cloud_sdk_docker = effective_cloud_sdk_docker, + } + if (load_vcf_headers) { call GvsCreateTables.CreateTables as CreateScratchVCFHeaderLinesTable { input: diff --git a/scripts/variantstore/wdl/GvsJointVariantCalling.wdl b/scripts/variantstore/wdl/GvsJointVariantCalling.wdl index fb6ee176b57..13f1c0b7f67 100644 --- a/scripts/variantstore/wdl/GvsJointVariantCalling.wdl +++ b/scripts/variantstore/wdl/GvsJointVariantCalling.wdl @@ -89,6 +89,7 @@ workflow GvsJointVariantCalling { String destination_project = project_id String destination_dataset = dataset_name String fq_temp_table_dataset = "~{destination_project}.~{destination_dataset}" + String ploidy_table_name = "sample_chromosome_ploidy" if (false) { Int extract_maxretries_override = "" @@ -231,6 +232,7 @@ workflow GvsJointVariantCalling { is_wgs = is_wgs, maximum_alternate_alleles = maximum_alternate_alleles, target_interval_list = target_interval_list, + ploidy_table_name = ploidy_table_name, } output { diff --git a/src/main/java/org/broadinstitute/hellbender/tools/gvs/common/PloidyUtils.java b/src/main/java/org/broadinstitute/hellbender/tools/gvs/common/PloidyUtils.java new file mode 100644 index 00000000000..ad511ee4c5b --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/gvs/common/PloidyUtils.java @@ -0,0 +1,55 @@ +package org.broadinstitute.hellbender.tools.gvs.common; + +import htsjdk.samtools.util.CollectionUtil; +import htsjdk.samtools.util.Interval; +import htsjdk.samtools.util.Lazy; +import htsjdk.variant.variantcontext.VariantContext; + +import java.util.Collections; +import java.util.HashSet; +import java.util.Set; + +public class PloidyUtils { + /** + * This copies the PAR definition used in the tool FindMendelianViolations with the addition of the Y regions + * https://github.com/broadinstitute/picard/blob/master/src/main/java/picard/vcf/MendelianViolations/FindMendelianViolations.java#L203 + * Wikipedia data: https://en.wikipedia.org/wiki/Pseudoautosomal_region + */ + private static final Set PSEUDO_AUTOSOMAL_REGIONS_DEFINITION = CollectionUtil .makeSet("X:60001-2699520", "X:154931044-155260560", "chrX:10001-2781479", "chrX:155701383-156030895", "Y:10001-2649520", "Y:59034050-59363566", "chrY:10001-2781479", "chrY:56887903-57217415"); + + public static Lazy> ParIntervals = new Lazy<>(() -> Collections.unmodifiableSet(parseIntervalLists(PSEUDO_AUTOSOMAL_REGIONS_DEFINITION))); + + public static boolean doesVariantOverlapPAR(VariantContext variant) { + return doesIntervalOverlapPAR(variant.getContig(), variant.getStart(), variant.getEnd()); + } + + public static boolean isLocationInPAR(long location) { + // Note: SchemaUtils.decodeContig makes implicit hg38 assumptions. If we support different references, we'll + // need to factor that into how we do our mapping from pure location numbers to contigs and positions + return doesIntervalOverlapPAR(SchemaUtils.decodeContig(location), SchemaUtils.decodePosition(location), SchemaUtils.decodePosition(location) + 1); + } + + public static boolean doesIntervalOverlapPAR(String chrom, int positionStart, int positionEnd) { + // Note: SchemaUtils.decodeContig makes implicit hg38 assumptions. If we support different references, we'll + // need to factor that into how we do our mapping from pure location numbers to contigs and positions + Interval locationAsInterval = new Interval(chrom, positionStart, positionEnd); + for (Interval par : ParIntervals.get()) { + if (par.intersects(locationAsInterval)) { + return true; + } + } + return false; + } + + private static Set parseIntervalLists(final Set intervalLists){ + // Parse the PARs + final Set intervals = new HashSet<>(intervalLists.size()); + for (final String par : intervalLists) { + final String[] splits1 = par.split(":"); + final String[] splits2 = splits1[1].split("-"); + intervals.add(new Interval(splits1[0], Integer.parseInt(splits2[0]), Integer.parseInt(splits2[1]))); + } + return intervals; + } + +} diff --git a/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ExtractCohortEngine.java b/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ExtractCohortEngine.java index 935a52a159f..48d0b4b29e4 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ExtractCohortEngine.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ExtractCohortEngine.java @@ -35,15 +35,6 @@ public class ExtractCohortEngine { private final Logger logger; - // These represent the PARs given for hg38. They are defined differently for other references, so we'll have to store this in - // a better, ref-dependent different manner if/when we get around to supporting other references - // See: https://en.wikipedia.org/wiki/Pseudoautosomal_region - private final List PARRegions = List.of( - new LongRange(23000000010001L, 23000002781479L), // PAR1, X - new LongRange(24000000010001L, 24000002781479L), // PAR1, Y - new LongRange(23000155701383L, 23000156030895L), // PAR2, X - new LongRange(24000056887903L, 24000057217415L)); // PAR2, Y - private final boolean printDebugInformation; private final int localSortMaxRecordsInRam; private final List traversalIntervals; @@ -604,7 +595,7 @@ VariantContext finalizeCurrentVariant(final List unmergedVariant // If we have looked up the ploidy in our table and it says 1, use a haploid reference // Otherwise, if we have a ploidy that is neither 1 nor 2, throw a user exception because we haven't coded for this case int ploidy = (samplePloidyMap.containsKey(key) ? samplePloidyMap.get(key) : 2); - if (ploidy == 2 || isInPAR(location)) { + if (ploidy == 2 || PloidyUtils.isLocationInPAR(location)) { genotypeBuilder.alleles(gtAlleles); } else if (ploidy == 1) { genotypeBuilder.alleles(gtHaploidAlleles); @@ -677,16 +668,6 @@ private String makePloidyLookupKey(final String chromosome, final String sampleI return chromosome+"-"+sampleId; } - private boolean isInPAR(final long location) { - for (LongRange region : PARRegions) { - if (region.containsLong(location)) { - return true; - } - } - return false; - } - - private VariantContext filterSiteByAlleleSpecificVQScore(VariantContext mergedVC, Map> scoreMap, Map> vqScoreMap, diff --git a/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/CreateVariantIngestFiles.java b/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/CreateVariantIngestFiles.java index ffb2ca39dab..3c2a5e1fa0a 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/CreateVariantIngestFiles.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/CreateVariantIngestFiles.java @@ -26,11 +26,7 @@ import java.io.File; import java.io.IOException; -import java.util.HashMap; -import java.util.HashSet; -import java.util.List; -import java.util.Map; -import java.util.Set; +import java.util.*; /** * Ingest variant walker @@ -47,6 +43,7 @@ public final class CreateVariantIngestFiles extends VariantWalker { private RefCreator refCreator; private VetCreator vetCreator; private VcfHeaderLineScratchCreator vcfHeaderLineScratchCreator; + private SamplePloidyCreator samplePloidyCreator; private LoadStatus loadStatus; private final Map allLineHeaders = new HashMap<>(); @@ -292,6 +289,10 @@ public void onTraversalStart() { if (enableReferenceRanges && !refRangesRowsExist) { refCreator = new RefCreator(sampleIdentifierForOutputFileName, sampleId, tableNumber, seqDictionary, gqStatesToIgnore, outputDir, outputType, enableReferenceRanges, projectID, datasetName, storeCompressedReferences); + + // The ploidy table is really only needed for inferring reference ploidy, as variants store their genotypea + // directly. If we're not ingesting references, we can't compute and ingest ploidy either + samplePloidyCreator = new SamplePloidyCreator(sampleId, projectID, datasetName); } if (enableVet && !vetRowsExist) { @@ -384,6 +385,18 @@ public Object onTraversalSuccess() { } // Wait until all data has been submitted and in pending state to commit refCreator.commitData(); + + // this is likely an unnecessary check as it currently stands, but it can't hurt to have it in case we + // later separate their creation, throwing the ploidy stuff explicity behind a flag + if (samplePloidyCreator != null) { + try { + samplePloidyCreator.apply(refCreator.getReferencePloidyData()); + } catch (IOException ioe) { + throw new GATKException("Error writing ploidy data", ioe); + } + + samplePloidyCreator.commitData(); + } } if (vetCreator != null) { @@ -407,6 +420,9 @@ public void closeTool() { if (vetCreator != null) { vetCreator.closeTool(); } + if (samplePloidyCreator != null) { + samplePloidyCreator.closeTool(); + } if (vcfHeaderLineScratchCreator != null) { vcfHeaderLineScratchCreator.closeTool(); } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/RefCreator.java b/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/RefCreator.java index 0876d860f5b..e57b6d54c41 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/RefCreator.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/RefCreator.java @@ -5,21 +5,17 @@ import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.broadinstitute.hellbender.exceptions.UserException; -import org.broadinstitute.hellbender.tools.gvs.common.CommonCode; -import org.broadinstitute.hellbender.tools.gvs.common.GQStateEnum; -import org.broadinstitute.hellbender.tools.gvs.common.IngestConstants; -import org.broadinstitute.hellbender.tools.gvs.common.SchemaUtils; +import org.broadinstitute.hellbender.tools.gvs.common.*; import org.broadinstitute.hellbender.utils.GenomeLoc; import org.broadinstitute.hellbender.utils.GenomeLocParser; import org.broadinstitute.hellbender.utils.GenomeLocSortedSet; import org.broadinstitute.hellbender.utils.SimpleInterval; import org.broadinstitute.hellbender.utils.gvs.bigquery.BigQueryUtils; +import picard.vcf.MendelianViolations.FindMendelianViolations; import java.io.File; import java.io.IOException; -import java.util.HashSet; -import java.util.List; -import java.util.Set; +import java.util.*; public final class RefCreator { @@ -38,6 +34,8 @@ public final class RefCreator { private static final String PREFIX_SEPARATOR = "_"; private final static String REF_RANGES_FILETYPE_PREFIX = "ref_ranges_"; + private Map ploidiesPerChromosome = null; + public static boolean doRowsExistFor(CommonCode.OutputType outputType, String projectId, String datasetName, String tableNumber, Long sampleId) { if (outputType != CommonCode.OutputType.BQ) return false; return BigQueryUtils.doRowsExistFor(projectId, datasetName, REF_RANGES_FILETYPE_PREFIX + tableNumber, SchemaUtils.SAMPLE_ID_FIELD_NAME, sampleId); @@ -50,6 +48,8 @@ public RefCreator(String sampleIdentifierForOutputFileName, Long sampleId, Strin this.storeCompressedReferences = storeCompressedReferences; this.gqStatesToIgnore = gqStatesToIgnore; + this.ploidiesPerChromosome = new HashMap<>(); + coverageLocSortedSet = new GenomeLocSortedSet(new GenomeLocParser(seqDictionary)); try { @@ -76,6 +76,21 @@ public RefCreator(String sampleIdentifierForOutputFileName, Long sampleId, Strin } public void apply(VariantContext variant, List intervalsToWrite) throws IOException { + // Record ploidy if this is not in a PAR + if (!PloidyUtils.doesVariantOverlapPAR(variant)) { + // create the bitset for this ploidy if it isn't there + if (!ploidiesPerChromosome.containsKey(variant.getContig())) { + ploidiesPerChromosome.put(variant.getContig(), new BitSet()); + } + + // set the bit for this ploidy so we record having seen it + BitSet ploidies = ploidiesPerChromosome.get(variant.getContig()); + int ploidy = variant.getMaxPloidy(1); + if (!ploidies.get(ploidy)) { + ploidies.set(ploidy); + } + } + final String variantChr = variant.getContig(); for (GenomeLoc genomeLoc : intervalsToWrite) { @@ -262,6 +277,10 @@ public static Set getGQStateEnumGreaterThan(GQStateEnum s){ return ret; } + public Map getReferencePloidyData() { + return ploidiesPerChromosome; + } + public void commitData() { if (outputType == CommonCode.OutputType.BQ) { if (writeReferenceRanges && refRangesWriter != null) { diff --git a/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/SamplePloidyCreator.java b/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/SamplePloidyCreator.java new file mode 100644 index 00000000000..d5551a6dd6f --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/SamplePloidyCreator.java @@ -0,0 +1,87 @@ +package org.broadinstitute.hellbender.tools.gvs.ingest; + +import com.google.protobuf.Descriptors; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.broadinstitute.hellbender.exceptions.UserException; +import org.broadinstitute.hellbender.tools.gvs.common.SchemaUtils; +import org.broadinstitute.hellbender.utils.Utils; +import org.broadinstitute.hellbender.utils.gvs.bigquery.PendingBQWriter; +import org.json.JSONObject; + +import java.io.IOException; +import java.util.BitSet; +import java.util.Map; +import java.util.concurrent.ExecutionException; + +public class SamplePloidyCreator { + private static final Logger logger = LogManager.getLogger(SamplePloidyCreator.class); + + private PendingBQWriter samplePloidyBQJsonWriter = null; + private static final String SAMPLE_PLOIDY_TABLE_NAME = "sample_chromosome_ploidy"; + + private final Long sampleId; + private final String projectId; + private final String datasetName; + + + public SamplePloidyCreator(Long sampleId, String projectId, String datasetName) { + try { + this.sampleId = sampleId; + this.projectId = projectId; + this.datasetName = datasetName; + + if (projectId == null || datasetName == null) { + throw new UserException("Must specify project-id and dataset-name."); + } + samplePloidyBQJsonWriter = new PendingBQWriter(projectId, datasetName, SAMPLE_PLOIDY_TABLE_NAME); + } catch (Exception e) { + throw new UserException("Could not create VCF Header Scratch Table Writer", e); + } + } + + public void apply(Map ploidyData) throws IOException { + for (final Map.Entry ploidyLine : ploidyData.entrySet()) { + try { + // make sure we don't have mixed ploidy in a single chromosome + if (ploidyLine.getValue().length() > 1) { + throw new UserException("Detected mixed ploidy in sample "+this.sampleId+" on chromosome "+ploidyLine.getKey()); + } + + samplePloidyBQJsonWriter.addJsonRow(createJson(this.sampleId, SchemaUtils.encodeLocation(ploidyLine.getKey(),0), ploidyLine.getValue().nextSetBit(0))); + } catch (Descriptors.DescriptorValidationException | ExecutionException | InterruptedException ex) { + throw new IOException("BQ exception", ex); + } + } + } + + public JSONObject createJson(Long sampleId, Long chromosome, Integer ploidy) { + JSONObject record = new JSONObject(); + record.put("sample_id", sampleId); + record.put("chromosome", chromosome); + record.put("ploidy", ploidy); + return record; + } + + + public void commitData() { + if (samplePloidyBQJsonWriter != null) { + samplePloidyBQJsonWriter.flushBuffer(); + samplePloidyBQJsonWriter.commitWriteStreams(); + } + } + + public void closeTool() { + if (samplePloidyBQJsonWriter != null) { + try { + samplePloidyBQJsonWriter.close(); + } catch (final Exception e) { + throw new IllegalArgumentException("Couldn't close Sample Ploidy Table writer", e); + } + } + if (samplePloidyBQJsonWriter != null) { + samplePloidyBQJsonWriter.close(); + } + } + +} From 9fb61c33237e6ba41e650c79ecfeb281ab89970c Mon Sep 17 00:00:00 2001 From: Aaron Hatcher Date: Tue, 27 Aug 2024 14:00:20 -0400 Subject: [PATCH 02/17] stupid dockstore error --- .dockstore.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.dockstore.yml b/.dockstore.yml index 2ad77ca4753..66b7572ca70 100644 --- a/.dockstore.yml +++ b/.dockstore.yml @@ -251,7 +251,7 @@ workflows: branches: - master - ah_var_store - - ah_var_store + - VS-1416-modify-ingest-to-correctly-handle-ploidy-differences-in-dragen-3-7-8-samples tags: - /.*/ - name: GvsBeta From 34ce0468e6ac7263cca8e7254ebd3fed2b5993f4 Mon Sep 17 00:00:00 2001 From: Aaron Hatcher Date: Wed, 4 Sep 2024 15:47:11 -0400 Subject: [PATCH 03/17] Updates to track down weird bugs --- scripts/variantstore/wdl/GvsAssignIds.wdl | 2 +- .../hellbender/tools/gvs/common/SchemaUtils.java | 2 +- .../broadinstitute/hellbender/tools/gvs/ingest/RefCreator.java | 3 +++ .../hellbender/tools/gvs/ingest/SamplePloidyCreator.java | 2 +- 4 files changed, 6 insertions(+), 3 deletions(-) diff --git a/scripts/variantstore/wdl/GvsAssignIds.wdl b/scripts/variantstore/wdl/GvsAssignIds.wdl index 66926bca94e..a48144f1e3a 100644 --- a/scripts/variantstore/wdl/GvsAssignIds.wdl +++ b/scripts/variantstore/wdl/GvsAssignIds.wdl @@ -28,7 +28,7 @@ workflow GvsAssignIds { String vcf_header_lines_schema_json = '[{"name":"vcf_header_lines_hash","type":"STRING","mode":"REQUIRED"}, {"name":"vcf_header_lines","type":"STRING","mode":"REQUIRED"},{"name":"is_expected_unique","type":"BOOLEAN","mode":"REQUIRED"}]' String sample_vcf_header_schema_json = '[{"name": "sample_id","type": "INTEGER","mode": "REQUIRED"}, {"name":"vcf_header_lines_hash","type":"STRING","mode":"REQUIRED"}]' String sample_load_status_schema_json = '[{"name": "sample_id","type": "INTEGER","mode": "REQUIRED"},{"name":"status","type":"STRING","mode":"REQUIRED"}, {"name":"event_timestamp","type":"TIMESTAMP","mode":"REQUIRED"}]' - String sample_chromosome_ploidy_schema_json = '[{"name": "sample_Id","type": "INTEGER","mode": "REQUIRED"},{"name": "chromosome","type": "INTEGER","mode": "REQUIRED"},{"name": "ploidy","type": "INTEGER","mode": "REQUIRED"}]' + String sample_chromosome_ploidy_schema_json = '[{"name": "sample_id","type": "INTEGER","mode": "REQUIRED"},{"name": "chromosome","type": "INTEGER","mode": "REQUIRED"},{"name": "ploidy","type": "INTEGER","mode": "REQUIRED"}]' if (!defined(git_hash) || !defined(cloud_sdk_docker)) { call Utils.GetToolVersions { diff --git a/src/main/java/org/broadinstitute/hellbender/tools/gvs/common/SchemaUtils.java b/src/main/java/org/broadinstitute/hellbender/tools/gvs/common/SchemaUtils.java index fd06c77f978..4f346fb7e85 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/gvs/common/SchemaUtils.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/gvs/common/SchemaUtils.java @@ -79,7 +79,7 @@ public class SchemaUtils { CALL_PGT, CALL_PID, CALL_PS); - public static final List SAMPLE_PLOIDY_FIELDS = Arrays.asList(CHROMOSOME, SAMPLE_ID, GENOTYPE, PLOIDY); + public static final List SAMPLE_PLOIDY_FIELDS = Arrays.asList(CHROMOSOME, SAMPLE_ID, PLOIDY); public static final List EXTRACT_REF_FIELDS = Arrays.asList(LOCATION_FIELD_NAME, SAMPLE_ID_FIELD_NAME, LENGTH_FIELD_NAME, STATE_FIELD_NAME); diff --git a/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/RefCreator.java b/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/RefCreator.java index e57b6d54c41..d6faea5958a 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/RefCreator.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/RefCreator.java @@ -88,9 +88,12 @@ public void apply(VariantContext variant, List intervalsToWrite) thro int ploidy = variant.getMaxPloidy(1); if (!ploidies.get(ploidy)) { ploidies.set(ploidy); + logger.info("Recording reference ploidy "+ploidy+" from sample "+sampleId+" at "+variant.getContig()+" "+variant.getStart() +" with gt "+variant.getGenotype(0)); } } + + final String variantChr = variant.getContig(); for (GenomeLoc genomeLoc : intervalsToWrite) { diff --git a/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/SamplePloidyCreator.java b/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/SamplePloidyCreator.java index d5551a6dd6f..8392ef1e02b 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/SamplePloidyCreator.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/SamplePloidyCreator.java @@ -44,7 +44,7 @@ public void apply(Map ploidyData) throws IOException { for (final Map.Entry ploidyLine : ploidyData.entrySet()) { try { // make sure we don't have mixed ploidy in a single chromosome - if (ploidyLine.getValue().length() > 1) { + if (ploidyLine.getValue().cardinality() > 1) { throw new UserException("Detected mixed ploidy in sample "+this.sampleId+" on chromosome "+ploidyLine.getKey()); } From 44f8c89bee3acafe726e92258d0f61285bbe7b61 Mon Sep 17 00:00:00 2001 From: Aaron Hatcher Date: Wed, 4 Sep 2024 16:23:52 -0400 Subject: [PATCH 04/17] Updating fix to only scan reference blocks in RefCreator --- .../broadinstitute/hellbender/tools/gvs/ingest/RefCreator.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/RefCreator.java b/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/RefCreator.java index d6faea5958a..af8631cbd6a 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/RefCreator.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/RefCreator.java @@ -77,7 +77,7 @@ public RefCreator(String sampleIdentifierForOutputFileName, Long sampleId, Strin public void apply(VariantContext variant, List intervalsToWrite) throws IOException { // Record ploidy if this is not in a PAR - if (!PloidyUtils.doesVariantOverlapPAR(variant)) { + if (!PloidyUtils.doesVariantOverlapPAR(variant) && variant.isReferenceBlock()) { // create the bitset for this ploidy if it isn't there if (!ploidiesPerChromosome.containsKey(variant.getContig())) { ploidiesPerChromosome.put(variant.getContig(), new BitSet()); From 7a41da67418cfa35d26725a41e9f2ca5735447ea Mon Sep 17 00:00:00 2001 From: Aaron Hatcher Date: Tue, 8 Oct 2024 10:06:13 -0400 Subject: [PATCH 05/17] Finally verified the successful ploidy run based on this code --- .../gvs/extract/ExtractCohortEngine.java | 23 ++++++- .../gvs/ingest/CreateVariantIngestFiles.java | 2 +- .../tools/gvs/ingest/RefCreator.java | 57 +++++++++++------ .../tools/gvs/ingest/SamplePloidyCreator.java | 64 +++++++++++++++++-- 4 files changed, 117 insertions(+), 29 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ExtractCohortEngine.java b/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ExtractCohortEngine.java index 48d0b4b29e4..33804a281ce 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ExtractCohortEngine.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ExtractCohortEngine.java @@ -298,6 +298,7 @@ public void traverse() { if (samplePloidyTableRef != null) { try (StorageAPIAvroReader reader = new StorageAPIAvroReader(samplePloidyTableRef, ploidyTableRestriction, projectID)) { + logger.info("Found ploidy lookup table. Reading it into memory"); for (final GenericRecord queryRow : reader) { // the key will be a basic joining of chromosome (represented as a location) with sample name String chromosome = queryRow.get(SchemaUtils.CHROMOSOME).toString(); @@ -306,6 +307,7 @@ public void traverse() { samplePloidyMap.put(makePloidyLookupKey(chromosome, sampleId), ploidy); } processBytesScanned(reader); + logger.info("Finished reading ploidy table into memory. "+samplePloidyMap.size()+" entries read."); } } @@ -618,7 +620,26 @@ VariantContext finalizeCurrentVariant(final List unmergedVariant for (int sampleId = samplesNotEncountered.nextSetBit(0); sampleId >= 0; sampleId = samplesNotEncountered.nextSetBit(sampleId + 1)) { genotypeBuilder.reset(false); genotypeBuilder.name(sampleIdToName.get((long) sampleId)); - genotypeBuilder.alleles(gtAlleles); + + long chromAsPosition = location - SchemaUtils.decodePosition(location); + String key = makePloidyLookupKey(Long.toString(chromAsPosition), Integer.toString(sampleId)); + + // Logic for determining the correct ploidy for reference data + // If we have no info in the table, the ploidy is explicitly 2, OR we are in a PAR, use diploid reference. + // If we have looked up the ploidy in our table and it says 1, use a haploid reference + // Otherwise, if we have a ploidy that is neither 1 nor 2, throw a user exception because we haven't coded for this case + if (!samplePloidyMap.containsKey(key)) { + logger.info("Sample ploidy information not found for key "+key+"--defaulting to 2"); + } + int ploidy = (samplePloidyMap.containsKey(key) ? samplePloidyMap.get(key) : 2); + if (ploidy == 2 || PloidyUtils.isLocationInPAR(location)) { + genotypeBuilder.alleles(gtAlleles); + } else if (ploidy == 1) { + genotypeBuilder.alleles(gtHaploidAlleles); + } else { + throw new UserException("GVS cannot currently handle extracting with a ploidy of "+ploidy+" as seen at "+SchemaUtils.decodeContig(location)+": "+SchemaUtils.decodePosition(location)+"."); + } +// genotypeBuilder.alleles(gtAlleles); genotypeBuilder.GQ(inferredReferenceState.getReferenceGQ()); genotypes.add(genotypeBuilder.make()); } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/CreateVariantIngestFiles.java b/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/CreateVariantIngestFiles.java index 3c2a5e1fa0a..96d7d1dfca7 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/CreateVariantIngestFiles.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/CreateVariantIngestFiles.java @@ -390,7 +390,7 @@ public Object onTraversalSuccess() { // later separate their creation, throwing the ploidy stuff explicity behind a flag if (samplePloidyCreator != null) { try { - samplePloidyCreator.apply(refCreator.getReferencePloidyData()); + samplePloidyCreator.apply(refCreator.getReferencePloidyData(), refCreator.getTotalRefEntries()); } catch (IOException ioe) { throw new GATKException("Error writing ploidy data", ioe); } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/RefCreator.java b/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/RefCreator.java index af8631cbd6a..bd09e5d9961 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/RefCreator.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/RefCreator.java @@ -11,7 +11,6 @@ import org.broadinstitute.hellbender.utils.GenomeLocSortedSet; import org.broadinstitute.hellbender.utils.SimpleInterval; import org.broadinstitute.hellbender.utils.gvs.bigquery.BigQueryUtils; -import picard.vcf.MendelianViolations.FindMendelianViolations; import java.io.File; import java.io.IOException; @@ -35,6 +34,10 @@ public final class RefCreator { private final static String REF_RANGES_FILETYPE_PREFIX = "ref_ranges_"; private Map ploidiesPerChromosome = null; + private Map> ploidiesCountPerChromosome = null; + + // for easily calculating percentages later + private long totalRefEntries = 0L; public static boolean doRowsExistFor(CommonCode.OutputType outputType, String projectId, String datasetName, String tableNumber, Long sampleId) { if (outputType != CommonCode.OutputType.BQ) return false; @@ -49,6 +52,7 @@ public RefCreator(String sampleIdentifierForOutputFileName, Long sampleId, Strin this.gqStatesToIgnore = gqStatesToIgnore; this.ploidiesPerChromosome = new HashMap<>(); + this.ploidiesCountPerChromosome = new HashMap<>(); coverageLocSortedSet = new GenomeLocSortedSet(new GenomeLocParser(seqDictionary)); @@ -76,24 +80,6 @@ public RefCreator(String sampleIdentifierForOutputFileName, Long sampleId, Strin } public void apply(VariantContext variant, List intervalsToWrite) throws IOException { - // Record ploidy if this is not in a PAR - if (!PloidyUtils.doesVariantOverlapPAR(variant) && variant.isReferenceBlock()) { - // create the bitset for this ploidy if it isn't there - if (!ploidiesPerChromosome.containsKey(variant.getContig())) { - ploidiesPerChromosome.put(variant.getContig(), new BitSet()); - } - - // set the bit for this ploidy so we record having seen it - BitSet ploidies = ploidiesPerChromosome.get(variant.getContig()); - int ploidy = variant.getMaxPloidy(1); - if (!ploidies.get(ploidy)) { - ploidies.set(ploidy); - logger.info("Recording reference ploidy "+ploidy+" from sample "+sampleId+" at "+variant.getContig()+" "+variant.getStart() +" with gt "+variant.getGenotype(0)); - } - } - - - final String variantChr = variant.getContig(); for (GenomeLoc genomeLoc : intervalsToWrite) { @@ -118,6 +104,31 @@ public void apply(VariantContext variant, List intervalsToWrite) thro // if we are writing ref ranges, and this is a reference block, write it! if (writeReferenceRanges) { if (variant.isReferenceBlock()) { + + // Record reference ploidy if this is not in a PAR + if (!PloidyUtils.doesVariantOverlapPAR(variant)) { + // create the bitset for this ploidy if it isn't there + if (!ploidiesCountPerChromosome.containsKey(variant.getContig())) { + ploidiesCountPerChromosome.put(variant.getContig(), new HashMap<>()); + } + // set the bit for this ploidy so we record having seen it + Map ploidyCounts = ploidiesCountPerChromosome.get(variant.getContig()); + + int ploidy = variant.getMaxPloidy(1); + + Long currentCount = 0L; + if (ploidyCounts.containsKey(ploidy)) { + currentCount = ploidyCounts.get(ploidy); + } + + // increment counts for this one and put it back + ++currentCount; + ploidyCounts.put(ploidy, currentCount); + + ++totalRefEntries; + } + + // break up reference blocks to be no longer than MAX_REFERENCE_BLOCK_SIZE int localStart = start; while ( localStart <= end ) { @@ -280,8 +291,12 @@ public static Set getGQStateEnumGreaterThan(GQStateEnum s){ return ret; } - public Map getReferencePloidyData() { - return ploidiesPerChromosome; + public Map> getReferencePloidyData() { + return ploidiesCountPerChromosome; + } + + public long getTotalRefEntries() { + return totalRefEntries; } public void commitData() { diff --git a/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/SamplePloidyCreator.java b/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/SamplePloidyCreator.java index 8392ef1e02b..9d23c0f3c2c 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/SamplePloidyCreator.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/SamplePloidyCreator.java @@ -12,6 +12,7 @@ import java.io.IOException; import java.util.BitSet; import java.util.Map; +import java.util.Set; import java.util.concurrent.ExecutionException; public class SamplePloidyCreator { @@ -40,15 +41,66 @@ public SamplePloidyCreator(Long sampleId, String projectId, String datasetName) } } - public void apply(Map ploidyData) throws IOException { - for (final Map.Entry ploidyLine : ploidyData.entrySet()) { + + public void apply(Map> ploidyData, long totalRefEntries) throws IOException { +// for (final Map.Entry ploidyLine : ploidyData.entrySet()) { +// try { +// // make sure we don't have mixed ploidy in a single chromosome +// if (ploidyLine.getValue().cardinality() > 1) { +// throw new UserException("Detected mixed ploidy in sample "+this.sampleId+" on chromosome "+ploidyLine.getKey()); +// } +// +// samplePloidyBQJsonWriter.addJsonRow(createJson(this.sampleId, SchemaUtils.encodeLocation(ploidyLine.getKey(),0), ploidyLine.getValue().nextSetBit(0))); +// } catch (Descriptors.DescriptorValidationException | ExecutionException | InterruptedException ex) { +// throw new IOException("BQ exception", ex); +// } +// } + for (final Map.Entry> ploidyLine : ploidyData.entrySet()) { try { - // make sure we don't have mixed ploidy in a single chromosome - if (ploidyLine.getValue().cardinality() > 1) { - throw new UserException("Detected mixed ploidy in sample "+this.sampleId+" on chromosome "+ploidyLine.getKey()); + Map ploidiesWithCounts = ploidyLine.getValue(); + // This is the happy path we'll normally follow--no mixed ploidy detected + if (ploidiesWithCounts.size() == 1) { + // we know there's only one item here, so we can just sent that off + samplePloidyBQJsonWriter.addJsonRow(createJson(this.sampleId, SchemaUtils.encodeLocation(ploidyLine.getKey(),0), ploidiesWithCounts.keySet().iterator().next())); + continue; + } + + int bestPloidy = -1; + double highestPercentage = -1; + long highestCount = 0L; + + int secondBestPloidy = -1; + double secondHighestPercentage = -1; + long secondHighestCount = 0L; + // We can detect mixed ploidy for many reasons. Some of which come down to bugs in the GATK code. + for (Map.Entry ploidyEntryWithCounts : ploidiesWithCounts.entrySet()) { + double percentage = ploidyEntryWithCounts.getValue() / totalRefEntries; + // if necessary, update our best ploidy + if (percentage > highestPercentage) { + secondBestPloidy = bestPloidy; + secondHighestPercentage = highestPercentage; + secondHighestCount = highestCount; + + bestPloidy = ploidyEntryWithCounts.getKey(); + highestPercentage = percentage; + highestCount = ploidyEntryWithCounts.getValue(); + } else if (percentage > secondHighestPercentage) { + secondBestPloidy = ploidyEntryWithCounts.getKey(); + secondHighestPercentage = percentage; + secondHighestCount = ploidyEntryWithCounts.getValue(); + } + } + + // first, see if the second best ploidy is for greater than 5% of the sample (this is likely way too generous). + // If so, there may be a deeper error going on and we should just quit + // decide which ploidy to keep + if (secondHighestPercentage > 0.05) { + throw new UserException("Detected mixed ploidy in sample "+this.sampleId+" on chromosome "+ploidyLine.getKey()+", with second ploidy of "+secondBestPloidy+" detected in "+(secondHighestPercentage * 100)+"% ("+secondHighestCount+" total) of samples"); } + // it's a small enough number to just note and move on with + logger.warn("WARNING: Detected mixed ploidy in sample "+this.sampleId+" on chromosome "+ploidyLine.getKey()+", but second ploidy of "+secondBestPloidy+" detected in only "+(secondHighestPercentage * 100)+"% ("+secondHighestCount+" total)of samples. Going with dominant ploidy of "+bestPloidy); - samplePloidyBQJsonWriter.addJsonRow(createJson(this.sampleId, SchemaUtils.encodeLocation(ploidyLine.getKey(),0), ploidyLine.getValue().nextSetBit(0))); + samplePloidyBQJsonWriter.addJsonRow(createJson(this.sampleId, SchemaUtils.encodeLocation(ploidyLine.getKey(),0), bestPloidy)); } catch (Descriptors.DescriptorValidationException | ExecutionException | InterruptedException ex) { throw new IOException("BQ exception", ex); } From 3d724f751b64b3daf9708e40164a2473976e17ad Mon Sep 17 00:00:00 2001 From: Aaron Hatcher Date: Tue, 8 Oct 2024 13:17:29 -0400 Subject: [PATCH 06/17] updating gatk version --- scripts/variantstore/wdl/GvsUtils.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/variantstore/wdl/GvsUtils.wdl b/scripts/variantstore/wdl/GvsUtils.wdl index bb405972a75..f4af1969b3d 100644 --- a/scripts/variantstore/wdl/GvsUtils.wdl +++ b/scripts/variantstore/wdl/GvsUtils.wdl @@ -74,7 +74,7 @@ task GetToolVersions { String cloud_sdk_slim_docker = "gcr.io/google.com/cloudsdktool/cloud-sdk:435.0.0-slim" String variants_docker = "us-central1-docker.pkg.dev/broad-dsde-methods/gvs/variants:2024-09-30-alpine-27b8708f5808" String variants_nirvana_docker = "us.gcr.io/broad-dsde-methods/variantstore:nirvana_2022_10_19" - String gatk_docker = "us-central1-docker.pkg.dev/broad-dsde-methods/gvs/gatk:2024_08_19-gatkbase-cd5b6b7821b2" + String gatk_docker = "us-central1-docker.pkg.dev/broad-dsde-methods/gvs/gatk:2024-10-08-gatkbase-2e8764db86c3" String real_time_genomics_docker = "docker.io/realtimegenomics/rtg-tools:latest" String gotc_imputation_docker = "us.gcr.io/broad-gotc-prod/imputation-bcf-vcf:1.0.5-1.10.2-0.1.16-1649948623" String plink_docker = "us-central1-docker.pkg.dev/broad-dsde-methods/gvs/plink2:2024-04-23-slim-a0a65f52cc0e" From d6970190cdb65c446944fc1caa21c5aa86313394 Mon Sep 17 00:00:00 2001 From: Aaron Hatcher Date: Wed, 9 Oct 2024 09:39:09 -0400 Subject: [PATCH 07/17] cleaning things up a bit --- .../tools/gvs/ingest/SamplePloidyCreator.java | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/SamplePloidyCreator.java b/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/SamplePloidyCreator.java index 9d23c0f3c2c..a26ed540109 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/SamplePloidyCreator.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/SamplePloidyCreator.java @@ -43,18 +43,6 @@ public SamplePloidyCreator(Long sampleId, String projectId, String datasetName) public void apply(Map> ploidyData, long totalRefEntries) throws IOException { -// for (final Map.Entry ploidyLine : ploidyData.entrySet()) { -// try { -// // make sure we don't have mixed ploidy in a single chromosome -// if (ploidyLine.getValue().cardinality() > 1) { -// throw new UserException("Detected mixed ploidy in sample "+this.sampleId+" on chromosome "+ploidyLine.getKey()); -// } -// -// samplePloidyBQJsonWriter.addJsonRow(createJson(this.sampleId, SchemaUtils.encodeLocation(ploidyLine.getKey(),0), ploidyLine.getValue().nextSetBit(0))); -// } catch (Descriptors.DescriptorValidationException | ExecutionException | InterruptedException ex) { -// throw new IOException("BQ exception", ex); -// } -// } for (final Map.Entry> ploidyLine : ploidyData.entrySet()) { try { Map ploidiesWithCounts = ploidyLine.getValue(); @@ -91,13 +79,14 @@ public void apply(Map> ploidyData, long totalRefEntri } } - // first, see if the second best ploidy is for greater than 5% of the sample (this is likely way too generous). + + // Decide which ploidy to keep + // First, see if the second best ploidy is for greater than 5% of the sample (this is likely way too generous). // If so, there may be a deeper error going on and we should just quit - // decide which ploidy to keep if (secondHighestPercentage > 0.05) { throw new UserException("Detected mixed ploidy in sample "+this.sampleId+" on chromosome "+ploidyLine.getKey()+", with second ploidy of "+secondBestPloidy+" detected in "+(secondHighestPercentage * 100)+"% ("+secondHighestCount+" total) of samples"); } - // it's a small enough number to just note and move on with + // It's a small enough number to just note and move on with logger.warn("WARNING: Detected mixed ploidy in sample "+this.sampleId+" on chromosome "+ploidyLine.getKey()+", but second ploidy of "+secondBestPloidy+" detected in only "+(secondHighestPercentage * 100)+"% ("+secondHighestCount+" total)of samples. Going with dominant ploidy of "+bestPloidy); samplePloidyBQJsonWriter.addJsonRow(createJson(this.sampleId, SchemaUtils.encodeLocation(ploidyLine.getKey(),0), bestPloidy)); From bd0a99c6b38f92aea2ab16489831948e27973909 Mon Sep 17 00:00:00 2001 From: Aaron Hatcher Date: Wed, 9 Oct 2024 09:42:16 -0400 Subject: [PATCH 08/17] cleaning things up a bit v2 --- .../hellbender/tools/gvs/extract/ExtractCohortEngine.java | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ExtractCohortEngine.java b/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ExtractCohortEngine.java index 33804a281ce..9bf607bc89a 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ExtractCohortEngine.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ExtractCohortEngine.java @@ -628,9 +628,6 @@ VariantContext finalizeCurrentVariant(final List unmergedVariant // If we have no info in the table, the ploidy is explicitly 2, OR we are in a PAR, use diploid reference. // If we have looked up the ploidy in our table and it says 1, use a haploid reference // Otherwise, if we have a ploidy that is neither 1 nor 2, throw a user exception because we haven't coded for this case - if (!samplePloidyMap.containsKey(key)) { - logger.info("Sample ploidy information not found for key "+key+"--defaulting to 2"); - } int ploidy = (samplePloidyMap.containsKey(key) ? samplePloidyMap.get(key) : 2); if (ploidy == 2 || PloidyUtils.isLocationInPAR(location)) { genotypeBuilder.alleles(gtAlleles); From c685c887d6820c479cb662915e0f1f7e12e32461 Mon Sep 17 00:00:00 2001 From: Aaron Hatcher Date: Thu, 24 Oct 2024 09:53:06 -0400 Subject: [PATCH 09/17] PR feedback --- .../hellbender/tools/gvs/common/PloidyUtils.java | 12 ++++++++---- .../tools/gvs/extract/ExtractCohortEngine.java | 1 - .../tools/gvs/ingest/CreateVariantIngestFiles.java | 2 +- .../tools/gvs/ingest/SamplePloidyCreator.java | 4 ++-- 4 files changed, 11 insertions(+), 8 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/gvs/common/PloidyUtils.java b/src/main/java/org/broadinstitute/hellbender/tools/gvs/common/PloidyUtils.java index ad511ee4c5b..1ab9a6640ce 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/gvs/common/PloidyUtils.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/gvs/common/PloidyUtils.java @@ -9,13 +9,19 @@ import java.util.HashSet; import java.util.Set; +/** + * Holds static methods for determining whether a particular location falls within a PAR, for the purposes of accurately determining the correct ploidy + *

+ * Some methods, due to their implementation, make implicit hg38 assumptions. If we choose to support other references, we'll need to update this class. + */ public class PloidyUtils { /** * This copies the PAR definition used in the tool FindMendelianViolations with the addition of the Y regions - * https://github.com/broadinstitute/picard/blob/master/src/main/java/picard/vcf/MendelianViolations/FindMendelianViolations.java#L203 + * @see picard.vcf.MendelianViolations.FindMendelianViolations + * https://github.com/broadinstitute/picard/blob/63138fc8b7ae33eb0490a8ef707b61befa2f51d4/src/main/java/picard/vcf/MendelianViolations/FindMendelianViolations.java#L203 * Wikipedia data: https://en.wikipedia.org/wiki/Pseudoautosomal_region */ - private static final Set PSEUDO_AUTOSOMAL_REGIONS_DEFINITION = CollectionUtil .makeSet("X:60001-2699520", "X:154931044-155260560", "chrX:10001-2781479", "chrX:155701383-156030895", "Y:10001-2649520", "Y:59034050-59363566", "chrY:10001-2781479", "chrY:56887903-57217415"); + private static final Set PSEUDO_AUTOSOMAL_REGIONS_DEFINITION = CollectionUtil.makeSet("X:60001-2699520", "X:154931044-155260560", "chrX:10001-2781479", "chrX:155701383-156030895", "Y:10001-2649520", "Y:59034050-59363566", "chrY:10001-2781479", "chrY:56887903-57217415"); public static Lazy> ParIntervals = new Lazy<>(() -> Collections.unmodifiableSet(parseIntervalLists(PSEUDO_AUTOSOMAL_REGIONS_DEFINITION))); @@ -30,8 +36,6 @@ public static boolean isLocationInPAR(long location) { } public static boolean doesIntervalOverlapPAR(String chrom, int positionStart, int positionEnd) { - // Note: SchemaUtils.decodeContig makes implicit hg38 assumptions. If we support different references, we'll - // need to factor that into how we do our mapping from pure location numbers to contigs and positions Interval locationAsInterval = new Interval(chrom, positionStart, positionEnd); for (Interval par : ParIntervals.get()) { if (par.intersects(locationAsInterval)) { diff --git a/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ExtractCohortEngine.java b/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ExtractCohortEngine.java index 9bf607bc89a..00db073165a 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ExtractCohortEngine.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ExtractCohortEngine.java @@ -636,7 +636,6 @@ VariantContext finalizeCurrentVariant(final List unmergedVariant } else { throw new UserException("GVS cannot currently handle extracting with a ploidy of "+ploidy+" as seen at "+SchemaUtils.decodeContig(location)+": "+SchemaUtils.decodePosition(location)+"."); } -// genotypeBuilder.alleles(gtAlleles); genotypeBuilder.GQ(inferredReferenceState.getReferenceGQ()); genotypes.add(genotypeBuilder.make()); } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/CreateVariantIngestFiles.java b/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/CreateVariantIngestFiles.java index 96d7d1dfca7..e742db141a8 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/CreateVariantIngestFiles.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/CreateVariantIngestFiles.java @@ -290,7 +290,7 @@ public void onTraversalStart() { if (enableReferenceRanges && !refRangesRowsExist) { refCreator = new RefCreator(sampleIdentifierForOutputFileName, sampleId, tableNumber, seqDictionary, gqStatesToIgnore, outputDir, outputType, enableReferenceRanges, projectID, datasetName, storeCompressedReferences); - // The ploidy table is really only needed for inferring reference ploidy, as variants store their genotypea + // The ploidy table is really only needed for inferring reference ploidy, as variants store their genotypes // directly. If we're not ingesting references, we can't compute and ingest ploidy either samplePloidyCreator = new SamplePloidyCreator(sampleId, projectID, datasetName); } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/SamplePloidyCreator.java b/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/SamplePloidyCreator.java index a26ed540109..344a8cfc98a 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/SamplePloidyCreator.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/SamplePloidyCreator.java @@ -37,7 +37,7 @@ public SamplePloidyCreator(Long sampleId, String projectId, String datasetName) } samplePloidyBQJsonWriter = new PendingBQWriter(projectId, datasetName, SAMPLE_PLOIDY_TABLE_NAME); } catch (Exception e) { - throw new UserException("Could not create VCF Header Scratch Table Writer", e); + throw new UserException("Could not create Sample Ploidy Table Writer", e); } } @@ -48,7 +48,7 @@ public void apply(Map> ploidyData, long totalRefEntri Map ploidiesWithCounts = ploidyLine.getValue(); // This is the happy path we'll normally follow--no mixed ploidy detected if (ploidiesWithCounts.size() == 1) { - // we know there's only one item here, so we can just sent that off + // we know there's only one item here, so we can just send that off samplePloidyBQJsonWriter.addJsonRow(createJson(this.sampleId, SchemaUtils.encodeLocation(ploidyLine.getKey(),0), ploidiesWithCounts.keySet().iterator().next())); continue; } From 9366d91e3da0f430b6d4026b50781b2ea1ef66bf Mon Sep 17 00:00:00 2001 From: Aaron Hatcher Date: Thu, 24 Oct 2024 11:48:38 -0400 Subject: [PATCH 10/17] Updating the docker image --- scripts/variantstore/wdl/GvsUtils.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/variantstore/wdl/GvsUtils.wdl b/scripts/variantstore/wdl/GvsUtils.wdl index d24da5b4cfb..dba72ef294b 100644 --- a/scripts/variantstore/wdl/GvsUtils.wdl +++ b/scripts/variantstore/wdl/GvsUtils.wdl @@ -72,7 +72,7 @@ task GetToolVersions { # GVS generally uses the smallest `alpine` version of the Google Cloud SDK as it suffices for most tasks, but # there are a handlful of tasks that require the larger GNU libc-based `slim`. String cloud_sdk_slim_docker = "gcr.io/google.com/cloudsdktool/cloud-sdk:435.0.0-slim" - String variants_docker = "us-central1-docker.pkg.dev/broad-dsde-methods/gvs/variants:2024-10-17-alpine-6dd528be8d8d" + String variants_docker = "us-central1-docker.pkg.dev/broad-dsde-methods/gvs/gatk:2024-10-24-gatkbase-b29b46ab0443" String variants_nirvana_docker = "us.gcr.io/broad-dsde-methods/variantstore:nirvana_2022_10_19" String gatk_docker = "us-central1-docker.pkg.dev/broad-dsde-methods/gvs/gatk:2024-10-08-gatkbase-2e8764db86c3" String real_time_genomics_docker = "docker.io/realtimegenomics/rtg-tools:latest" From fef793831a47c4d2c6c999594e9fb05505b3ccb1 Mon Sep 17 00:00:00 2001 From: Aaron Hatcher Date: Thu, 24 Oct 2024 15:53:05 -0400 Subject: [PATCH 11/17] dockstore --- .dockstore.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.dockstore.yml b/.dockstore.yml index c7d9e4664e3..6465644df4e 100644 --- a/.dockstore.yml +++ b/.dockstore.yml @@ -298,6 +298,7 @@ workflows: branches: - master - ah_var_store + - VS-1416-modify-ingest-to-correctly-handle-ploidy-differences-in-dragen-3-7-8-samples tags: - /.*/ - name: GvsQuickstartHailIntegration From f8450dfdbc0a6a0e9780e19c7a89d5559ac648b1 Mon Sep 17 00:00:00 2001 From: Aaron Hatcher Date: Thu, 24 Oct 2024 15:56:06 -0400 Subject: [PATCH 12/17] dockstore --- .dockstore.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.dockstore.yml b/.dockstore.yml index 6465644df4e..81ef630ca62 100644 --- a/.dockstore.yml +++ b/.dockstore.yml @@ -298,7 +298,6 @@ workflows: branches: - master - ah_var_store - - VS-1416-modify-ingest-to-correctly-handle-ploidy-differences-in-dragen-3-7-8-samples tags: - /.*/ - name: GvsQuickstartHailIntegration @@ -318,6 +317,7 @@ workflows: - master - ah_var_store - vs_1456_status_writes_bug + - VS-1416-modify-ingest-to-correctly-handle-ploidy-differences-in-dragen-3-7-8-samples tags: - /.*/ - name: GvsIngestTieout From 187c1945c69ed9a84b80c37295e5072f480528b9 Mon Sep 17 00:00:00 2001 From: Aaron Hatcher Date: Fri, 25 Oct 2024 10:01:05 -0400 Subject: [PATCH 13/17] updating the CORRECT image this time --- scripts/variantstore/wdl/GvsUtils.wdl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/variantstore/wdl/GvsUtils.wdl b/scripts/variantstore/wdl/GvsUtils.wdl index dba72ef294b..b299d3bd7bb 100644 --- a/scripts/variantstore/wdl/GvsUtils.wdl +++ b/scripts/variantstore/wdl/GvsUtils.wdl @@ -72,9 +72,9 @@ task GetToolVersions { # GVS generally uses the smallest `alpine` version of the Google Cloud SDK as it suffices for most tasks, but # there are a handlful of tasks that require the larger GNU libc-based `slim`. String cloud_sdk_slim_docker = "gcr.io/google.com/cloudsdktool/cloud-sdk:435.0.0-slim" - String variants_docker = "us-central1-docker.pkg.dev/broad-dsde-methods/gvs/gatk:2024-10-24-gatkbase-b29b46ab0443" + String variants_docker = "us-central1-docker.pkg.dev/broad-dsde-methods/gvs/variants:2024-10-17-alpine-6dd528be8d8d" String variants_nirvana_docker = "us.gcr.io/broad-dsde-methods/variantstore:nirvana_2022_10_19" - String gatk_docker = "us-central1-docker.pkg.dev/broad-dsde-methods/gvs/gatk:2024-10-08-gatkbase-2e8764db86c3" + String gatk_docker = "us-central1-docker.pkg.dev/broad-dsde-methods/gvs/gatk:2024-10-24-gatkbase-b29b46ab0443" String real_time_genomics_docker = "docker.io/realtimegenomics/rtg-tools:latest" String gotc_imputation_docker = "us.gcr.io/broad-gotc-prod/imputation-bcf-vcf:1.0.5-1.10.2-0.1.16-1649948623" String plink_docker = "us-central1-docker.pkg.dev/broad-dsde-methods/gvs/plink2:2024-04-23-slim-a0a65f52cc0e" From 4b1256e6e34dfb11536c4ab270d23a31a8d1351e Mon Sep 17 00:00:00 2001 From: Aaron Hatcher Date: Tue, 29 Oct 2024 14:35:47 -0400 Subject: [PATCH 14/17] Updating quickstart expected info to account for haploid bge sample discovered as part of ticket --- scripts/variantstore/wdl/test/GvsQuickstartIntegration.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/variantstore/wdl/test/GvsQuickstartIntegration.wdl b/scripts/variantstore/wdl/test/GvsQuickstartIntegration.wdl index f40e33223e2..2419c493271 100644 --- a/scripts/variantstore/wdl/test/GvsQuickstartIntegration.wdl +++ b/scripts/variantstore/wdl/test/GvsQuickstartIntegration.wdl @@ -36,7 +36,7 @@ workflow GvsQuickstartIntegration { File full_wgs_interval_list = "gs://gcp-public-data--broad-references/hg38/v0/wgs_calling_regions.hg38.noCentromeres.noTelomeres.interval_list" File full_exome_interval_list = "gs://gcp-public-data--broad-references/hg38/v0/bge_exome_calling_regions.v1.1.interval_list" String expected_subdir = if (!chr20_X_Y_only) then "all_chrs/" else "" - File expected_output_prefix = "gs://gvs-internal-quickstart/integration/2024-08-13/" + expected_subdir + File expected_output_prefix = "gs://gvs-internal-quickstart/integration/2024-10-29/" + expected_subdir # WDL 1.0 trick to set a variable ('none') to be undefined. if (false) { From 43da8dedd5e40512ecac52545c7870e5809c1eff Mon Sep 17 00:00:00 2001 From: Aaron Hatcher Date: Thu, 31 Oct 2024 16:56:57 -0400 Subject: [PATCH 15/17] updating dockstore for final ingestion test around header-only ingest --- .dockstore.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.dockstore.yml b/.dockstore.yml index 81ef630ca62..b9642b97db8 100644 --- a/.dockstore.yml +++ b/.dockstore.yml @@ -192,6 +192,7 @@ workflows: branches: - master - ah_var_store + - VS-1416-modify-ingest-to-correctly-handle-ploidy-differences-in-dragen-3-7-8-samples tags: - /.*/ - name: GvsPrepareRangesCallset From a76ea779fa7b8ef1aabaa4c14f3a2c9710b4319c Mon Sep 17 00:00:00 2001 From: Aaron Hatcher Date: Tue, 5 Nov 2024 11:08:20 -0500 Subject: [PATCH 16/17] Ensuring that we put writing to a BQ table behind explicit code, so existing tests that go through the tsv pathway don't break --- .../gvs/ingest/CreateVariantIngestFiles.java | 2 +- .../tools/gvs/ingest/SamplePloidyCreator.java | 26 ++++++++++++++----- 2 files changed, 21 insertions(+), 7 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/CreateVariantIngestFiles.java b/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/CreateVariantIngestFiles.java index e742db141a8..45b0640c6dd 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/CreateVariantIngestFiles.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/CreateVariantIngestFiles.java @@ -292,7 +292,7 @@ public void onTraversalStart() { // The ploidy table is really only needed for inferring reference ploidy, as variants store their genotypes // directly. If we're not ingesting references, we can't compute and ingest ploidy either - samplePloidyCreator = new SamplePloidyCreator(sampleId, projectID, datasetName); + samplePloidyCreator = new SamplePloidyCreator(sampleId, projectID, datasetName, outputType); } if (enableVet && !vetRowsExist) { diff --git a/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/SamplePloidyCreator.java b/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/SamplePloidyCreator.java index 344a8cfc98a..2ebf1d19e3a 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/SamplePloidyCreator.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/SamplePloidyCreator.java @@ -4,6 +4,7 @@ import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.broadinstitute.hellbender.exceptions.UserException; +import org.broadinstitute.hellbender.tools.gvs.common.CommonCode; import org.broadinstitute.hellbender.tools.gvs.common.SchemaUtils; import org.broadinstitute.hellbender.utils.Utils; import org.broadinstitute.hellbender.utils.gvs.bigquery.PendingBQWriter; @@ -25,17 +26,24 @@ public class SamplePloidyCreator { private final String projectId; private final String datasetName; + private final CommonCode.OutputType outputType; - public SamplePloidyCreator(Long sampleId, String projectId, String datasetName) { + + public SamplePloidyCreator(Long sampleId, String projectId, String datasetName, final CommonCode.OutputType outputType) { try { this.sampleId = sampleId; this.projectId = projectId; this.datasetName = datasetName; + this.outputType = outputType; - if (projectId == null || datasetName == null) { - throw new UserException("Must specify project-id and dataset-name."); + // For now, ignore all other output types because they aren't supported YET + if (outputType == CommonCode.OutputType.BQ) { + if (projectId == null || datasetName == null) { + throw new UserException("Must specify project-id and dataset-name."); + } + samplePloidyBQJsonWriter = new PendingBQWriter(projectId, datasetName, SAMPLE_PLOIDY_TABLE_NAME); } - samplePloidyBQJsonWriter = new PendingBQWriter(projectId, datasetName, SAMPLE_PLOIDY_TABLE_NAME); + } catch (Exception e) { throw new UserException("Could not create Sample Ploidy Table Writer", e); } @@ -49,7 +57,9 @@ public void apply(Map> ploidyData, long totalRefEntri // This is the happy path we'll normally follow--no mixed ploidy detected if (ploidiesWithCounts.size() == 1) { // we know there's only one item here, so we can just send that off - samplePloidyBQJsonWriter.addJsonRow(createJson(this.sampleId, SchemaUtils.encodeLocation(ploidyLine.getKey(),0), ploidiesWithCounts.keySet().iterator().next())); + if (outputType == CommonCode.OutputType.BQ) { + samplePloidyBQJsonWriter.addJsonRow(createJson(this.sampleId, SchemaUtils.encodeLocation(ploidyLine.getKey(), 0), ploidiesWithCounts.keySet().iterator().next())); + } continue; } @@ -89,7 +99,11 @@ public void apply(Map> ploidyData, long totalRefEntri // It's a small enough number to just note and move on with logger.warn("WARNING: Detected mixed ploidy in sample "+this.sampleId+" on chromosome "+ploidyLine.getKey()+", but second ploidy of "+secondBestPloidy+" detected in only "+(secondHighestPercentage * 100)+"% ("+secondHighestCount+" total)of samples. Going with dominant ploidy of "+bestPloidy); - samplePloidyBQJsonWriter.addJsonRow(createJson(this.sampleId, SchemaUtils.encodeLocation(ploidyLine.getKey(),0), bestPloidy)); + if (outputType == CommonCode.OutputType.BQ) { + samplePloidyBQJsonWriter.addJsonRow(createJson(this.sampleId, SchemaUtils.encodeLocation(ploidyLine.getKey(),0), bestPloidy)); + } + + } catch (Descriptors.DescriptorValidationException | ExecutionException | InterruptedException ex) { throw new IOException("BQ exception", ex); } From ce1d34f59eec3c87a1903619158b4694db01e5db Mon Sep 17 00:00:00 2001 From: Aaron Hatcher Date: Thu, 7 Nov 2024 10:15:55 -0500 Subject: [PATCH 17/17] pr feedback: begrudgingly moving magic number to a constant. Much grumbling. --- .../hellbender/tools/gvs/ingest/SamplePloidyCreator.java | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/SamplePloidyCreator.java b/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/SamplePloidyCreator.java index 2ebf1d19e3a..54c4bcd7429 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/SamplePloidyCreator.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/SamplePloidyCreator.java @@ -21,6 +21,9 @@ public class SamplePloidyCreator { private PendingBQWriter samplePloidyBQJsonWriter = null; private static final String SAMPLE_PLOIDY_TABLE_NAME = "sample_chromosome_ploidy"; + // When we detect mixed ploidy, we allow for a small fraction due to DRAGEN bugs. This constant determines the cutoff + // after which we'll throw an error instead of just normalizing the ploidy + private static final double MIXED_PLOIDY_ERROR_CUTOFF = 0.05; private final Long sampleId; private final String projectId; @@ -93,7 +96,7 @@ public void apply(Map> ploidyData, long totalRefEntri // Decide which ploidy to keep // First, see if the second best ploidy is for greater than 5% of the sample (this is likely way too generous). // If so, there may be a deeper error going on and we should just quit - if (secondHighestPercentage > 0.05) { + if (secondHighestPercentage > MIXED_PLOIDY_ERROR_CUTOFF) { throw new UserException("Detected mixed ploidy in sample "+this.sampleId+" on chromosome "+ploidyLine.getKey()+", with second ploidy of "+secondBestPloidy+" detected in "+(secondHighestPercentage * 100)+"% ("+secondHighestCount+" total) of samples"); } // It's a small enough number to just note and move on with