Vs 1416 modify ingest to correctly handle ploidy differences in drage…

…n 3 7 8 samples (#8994) * Pushing naive implementation to test for correctness * stupid dockstore error * Updates to track down weird bugs * Updating fix to only scan reference blocks in RefCreator * Finally verified the successful ploidy run based on this code * updating gatk version * cleaning things up a bit * cleaning things up a bit v2 * PR feedback * Updating the docker image * dockstore * dockstore * updating the CORRECT image this time * Updating quickstart expected info to account for haploid bge sample discovered as part of ticket * updating dockstore for final ingestion test around header-only ingest * Ensuring that we put writing to a BQ table behind explicit code, so existing tests that go through the tsv pathway don't break * pr feedback: begrudgingly moving magic number to a constant. Much grumbling.
broadinstitute · Nov 7, 2024 · 441e7f4 · 441e7f4
1 parent a178fdd
commit 441e7f4
Show file tree

Hide file tree

Showing 11 changed files with 311 additions and 36 deletions.
diff --git a/.dockstore.yml b/.dockstore.yml
@@ -192,6 +192,7 @@ workflows:
          branches:
              - master
              - ah_var_store
+             - VS-1416-modify-ingest-to-correctly-handle-ploidy-differences-in-dragen-3-7-8-samples
          tags:
              - /.*/
    - name: GvsPrepareRangesCallset
@@ -259,6 +260,7 @@ workflows:
          branches:
              - master
              - ah_var_store
+             - VS-1416-modify-ingest-to-correctly-handle-ploidy-differences-in-dragen-3-7-8-samples
          tags:
              - /.*/
    - name: GvsBeta
@@ -316,6 +318,7 @@ workflows:
              - master
              - ah_var_store
              - vs_1456_status_writes_bug
+             - VS-1416-modify-ingest-to-correctly-handle-ploidy-differences-in-dragen-3-7-8-samples
          tags:
              - /.*/
    - name: GvsIngestTieout

diff --git a/scripts/variantstore/wdl/GvsAssignIds.wdl b/scripts/variantstore/wdl/GvsAssignIds.wdl
@@ -28,6 +28,7 @@ workflow GvsAssignIds {
   String vcf_header_lines_schema_json = '[{"name":"vcf_header_lines_hash","type":"STRING","mode":"REQUIRED"}, {"name":"vcf_header_lines","type":"STRING","mode":"REQUIRED"},{"name":"is_expected_unique","type":"BOOLEAN","mode":"REQUIRED"}]'
   String sample_vcf_header_schema_json = '[{"name": "sample_id","type": "INTEGER","mode": "REQUIRED"}, {"name":"vcf_header_lines_hash","type":"STRING","mode":"REQUIRED"}]'
   String sample_load_status_schema_json = '[{"name": "sample_id","type": "INTEGER","mode": "REQUIRED"},{"name":"status","type":"STRING","mode":"REQUIRED"}, {"name":"event_timestamp","type":"TIMESTAMP","mode":"REQUIRED"}]'
+  String sample_chromosome_ploidy_schema_json = '[{"name": "sample_id","type": "INTEGER","mode": "REQUIRED"},{"name": "chromosome","type": "INTEGER","mode": "REQUIRED"},{"name": "ploidy","type": "INTEGER","mode": "REQUIRED"}]'
 
   if (!defined(git_hash) || !defined(cloud_sdk_docker)) {
     call Utils.GetToolVersions {
@@ -71,6 +72,20 @@ workflow GvsAssignIds {
       cloud_sdk_docker = effective_cloud_sdk_docker,
   }
 
+  call GvsCreateTables.CreateTables as CreateSamplePloidyMapTable {
+    input:
+      project_id = project_id,
+      dataset_name = dataset_name,
+      go = ValidateSamples.done,
+      datatype = "sample_chromosome_ploidy",
+      schema_json = sample_chromosome_ploidy_schema_json,
+      max_table_id = 1,
+      superpartitioned = "false",
+      partitioned = "false",
+      clustering_field = "sample_id",
+      cloud_sdk_docker = effective_cloud_sdk_docker,
+  }
+
   if (load_vcf_headers) {
     call GvsCreateTables.CreateTables as CreateScratchVCFHeaderLinesTable {
       input:

diff --git a/scripts/variantstore/wdl/GvsJointVariantCalling.wdl b/scripts/variantstore/wdl/GvsJointVariantCalling.wdl
@@ -90,6 +90,7 @@ workflow GvsJointVariantCalling {
     String destination_project = project_id
     String destination_dataset = dataset_name
     String fq_temp_table_dataset = "~{destination_project}.~{destination_dataset}"
+    String ploidy_table_name = "sample_chromosome_ploidy"
 
     if (false) {
       Int extract_maxretries_override = ""
@@ -233,6 +234,7 @@ workflow GvsJointVariantCalling {
             is_wgs = is_wgs,
             maximum_alternate_alleles = maximum_alternate_alleles,
             target_interval_list = target_interval_list,
+            ploidy_table_name = ploidy_table_name,
     }
 
     output {

diff --git a/scripts/variantstore/wdl/GvsUtils.wdl b/scripts/variantstore/wdl/GvsUtils.wdl
@@ -74,7 +74,7 @@ task GetToolVersions {
     String cloud_sdk_slim_docker = "gcr.io/google.com/cloudsdktool/cloud-sdk:435.0.0-slim"
     String variants_docker = "us-central1-docker.pkg.dev/broad-dsde-methods/gvs/variants:2024-10-22-alpine-e7443149b8db"
     String variants_nirvana_docker = "us.gcr.io/broad-dsde-methods/variantstore:nirvana_2022_10_19"
-    String gatk_docker = "us-central1-docker.pkg.dev/broad-dsde-methods/gvs/gatk:2024_10_10-gatkbase-1cd1f9652cb9"
+    String gatk_docker = "us-central1-docker.pkg.dev/broad-dsde-methods/gvs/gatk:2024-10-24-gatkbase-b29b46ab0443"
     String real_time_genomics_docker = "docker.io/realtimegenomics/rtg-tools:latest"
     String gotc_imputation_docker = "us.gcr.io/broad-gotc-prod/imputation-bcf-vcf:1.0.5-1.10.2-0.1.16-1649948623"
     String plink_docker = "us-central1-docker.pkg.dev/broad-dsde-methods/gvs/plink2:2024-04-23-slim-a0a65f52cc0e"

diff --git a/scripts/variantstore/wdl/test/GvsQuickstartIntegration.wdl b/scripts/variantstore/wdl/test/GvsQuickstartIntegration.wdl
@@ -34,7 +34,7 @@ workflow GvsQuickstartIntegration {
     File full_wgs_interval_list = "gs://gcp-public-data--broad-references/hg38/v0/wgs_calling_regions.hg38.noCentromeres.noTelomeres.interval_list"
     File full_exome_interval_list = "gs://gcp-public-data--broad-references/hg38/v0/bge_exome_calling_regions.v1.1.interval_list"
     String expected_subdir = if (!chr20_X_Y_only) then "all_chrs/"  else ""
-    File expected_output_prefix = "gs://gvs-internal-quickstart/integration/2024-08-13/" + expected_subdir
+    File expected_output_prefix = "gs://gvs-internal-quickstart/integration/2024-10-29/" + expected_subdir
 
     # WDL 1.0 trick to set a variable ('none') to be undefined.
     if (false) {

diff --git a/src/main/java/org/broadinstitute/hellbender/tools/gvs/common/PloidyUtils.java b/src/main/java/org/broadinstitute/hellbender/tools/gvs/common/PloidyUtils.java
@@ -0,0 +1,59 @@
+package org.broadinstitute.hellbender.tools.gvs.common;
+
+import htsjdk.samtools.util.CollectionUtil;
+import htsjdk.samtools.util.Interval;
+import htsjdk.samtools.util.Lazy;
+import htsjdk.variant.variantcontext.VariantContext;
+
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+
+/**
+ * Holds static methods for determining whether a particular location falls within a PAR, for the purposes of accurately determining the correct ploidy
+ * <p>
+ * Some methods, due to their implementation, make implicit hg38 assumptions.  If we choose to support other references, we'll need to update this class.
+ */
+public class PloidyUtils {
+    /**
+     * This copies the PAR definition used in the tool FindMendelianViolations with the addition of the Y regions
+     * @see picard.vcf.MendelianViolations.FindMendelianViolations
+     * https://github.com/broadinstitute/picard/blob/63138fc8b7ae33eb0490a8ef707b61befa2f51d4/src/main/java/picard/vcf/MendelianViolations/FindMendelianViolations.java#L203
+     * Wikipedia data: https://en.wikipedia.org/wiki/Pseudoautosomal_region
+     */
+    private static final Set<String> PSEUDO_AUTOSOMAL_REGIONS_DEFINITION = CollectionUtil.makeSet("X:60001-2699520", "X:154931044-155260560", "chrX:10001-2781479", "chrX:155701383-156030895", "Y:10001-2649520", "Y:59034050-59363566", "chrY:10001-2781479", "chrY:56887903-57217415");
+
+    public static Lazy<Set<Interval>> ParIntervals = new Lazy<>(() -> Collections.unmodifiableSet(parseIntervalLists(PSEUDO_AUTOSOMAL_REGIONS_DEFINITION)));
+
+    public static boolean doesVariantOverlapPAR(VariantContext variant) {
+        return doesIntervalOverlapPAR(variant.getContig(), variant.getStart(), variant.getEnd());
+    }
+
+    public static boolean isLocationInPAR(long location) {
+        // Note: SchemaUtils.decodeContig makes implicit hg38 assumptions. If we support different references, we'll
+        // need to factor that into how we do our mapping from pure location numbers to contigs and positions
+        return doesIntervalOverlapPAR(SchemaUtils.decodeContig(location), SchemaUtils.decodePosition(location), SchemaUtils.decodePosition(location) + 1);
+    }
+
+    public static boolean doesIntervalOverlapPAR(String chrom, int positionStart, int positionEnd) {
+        Interval locationAsInterval = new Interval(chrom, positionStart, positionEnd);
+        for (Interval par : ParIntervals.get()) {
+            if (par.intersects(locationAsInterval)) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    private static Set<Interval> parseIntervalLists(final Set<String> intervalLists){
+        // Parse the PARs
+        final Set<Interval> intervals = new HashSet<>(intervalLists.size());
+        for (final String par : intervalLists) {
+            final String[] splits1 = par.split(":");
+            final String[] splits2 = splits1[1].split("-");
+            intervals.add(new Interval(splits1[0], Integer.parseInt(splits2[0]), Integer.parseInt(splits2[1])));
+        }
+        return intervals;
+    }
+
+}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/gvs/common/SchemaUtils.java b/src/main/java/org/broadinstitute/hellbender/tools/gvs/common/SchemaUtils.java
@@ -79,7 +79,7 @@ public class SchemaUtils {
             CALL_PGT, CALL_PID, CALL_PS);
 
 
-    public static final List<String> SAMPLE_PLOIDY_FIELDS = Arrays.asList(CHROMOSOME, SAMPLE_ID, GENOTYPE, PLOIDY);
+    public static final List<String> SAMPLE_PLOIDY_FIELDS = Arrays.asList(CHROMOSOME, SAMPLE_ID, PLOIDY);
 
     public static final List<String> EXTRACT_REF_FIELDS = Arrays.asList(LOCATION_FIELD_NAME, SAMPLE_ID_FIELD_NAME, LENGTH_FIELD_NAME, STATE_FIELD_NAME);
 

diff --git a/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ExtractCohortEngine.java b/src/main/java/org/broadinstitute/hellbender/tools/gvs/extract/ExtractCohortEngine.java
@@ -35,15 +35,6 @@
 public class ExtractCohortEngine {
     private final Logger logger;
 
-    // These represent the PARs given for hg38.  They are defined differently for other references, so we'll have to store this in
-    // a better, ref-dependent different manner if/when we get around to supporting other references
-    // See: https://en.wikipedia.org/wiki/Pseudoautosomal_region
-    private final List<LongRange> PARRegions = List.of(
-            new LongRange(23000000010001L, 23000002781479L),    // PAR1, X
-            new LongRange(24000000010001L, 24000002781479L),    // PAR1, Y
-            new LongRange(23000155701383L, 23000156030895L),    // PAR2, X
-            new LongRange(24000056887903L, 24000057217415L));   // PAR2, Y
-
     private final boolean printDebugInformation;
     private final int localSortMaxRecordsInRam;
     private final List<SimpleInterval> traversalIntervals;
@@ -307,6 +298,7 @@ public void traverse() {
 
         if (samplePloidyTableRef != null) {
             try (StorageAPIAvroReader reader = new StorageAPIAvroReader(samplePloidyTableRef, ploidyTableRestriction, projectID)) {
+                logger.info("Found ploidy lookup table.  Reading it into memory");
                 for (final GenericRecord queryRow : reader) {
                     // the key will be a basic joining of chromosome (represented as a location) with sample name
                     String chromosome = queryRow.get(SchemaUtils.CHROMOSOME).toString();
@@ -315,6 +307,7 @@ public void traverse() {
                     samplePloidyMap.put(makePloidyLookupKey(chromosome, sampleId), ploidy);
                 }
                 processBytesScanned(reader);
+                logger.info("Finished reading ploidy table into memory. "+samplePloidyMap.size()+" entries read.");
             }
         }
 
@@ -604,7 +597,7 @@ VariantContext finalizeCurrentVariant(final List<VariantContext> unmergedVariant
             // If we have looked up the ploidy in our table and it says 1, use a haploid reference
             // Otherwise, if we have a ploidy that is neither 1 nor 2, throw a user exception because we haven't coded for this case
             int ploidy = (samplePloidyMap.containsKey(key) ? samplePloidyMap.get(key) : 2);
-            if (ploidy == 2 || isInPAR(location)) {
+            if (ploidy == 2 || PloidyUtils.isLocationInPAR(location)) {
                 genotypeBuilder.alleles(gtAlleles);
             } else if (ploidy == 1) {
                 genotypeBuilder.alleles(gtHaploidAlleles);
@@ -627,7 +620,22 @@ VariantContext finalizeCurrentVariant(final List<VariantContext> unmergedVariant
         for (int sampleId = samplesNotEncountered.nextSetBit(0); sampleId >= 0; sampleId = samplesNotEncountered.nextSetBit(sampleId + 1)) {
             genotypeBuilder.reset(false);
             genotypeBuilder.name(sampleIdToName.get((long) sampleId));
-            genotypeBuilder.alleles(gtAlleles);
+
+            long chromAsPosition = location - SchemaUtils.decodePosition(location);
+            String key = makePloidyLookupKey(Long.toString(chromAsPosition), Integer.toString(sampleId));
+
+            // Logic for determining the correct ploidy for reference data
+            // If we have no info in the table, the ploidy is explicitly 2, OR we are in a PAR, use diploid reference.
+            // If we have looked up the ploidy in our table and it says 1, use a haploid reference
+            // Otherwise, if we have a ploidy that is neither 1 nor 2, throw a user exception because we haven't coded for this case
+            int ploidy = (samplePloidyMap.containsKey(key) ? samplePloidyMap.get(key) : 2);
+            if (ploidy == 2 || PloidyUtils.isLocationInPAR(location)) {
+                genotypeBuilder.alleles(gtAlleles);
+            } else if (ploidy == 1) {
+                genotypeBuilder.alleles(gtHaploidAlleles);
+            } else {
+                throw new UserException("GVS cannot currently handle extracting with a ploidy of "+ploidy+" as seen at "+SchemaUtils.decodeContig(location)+": "+SchemaUtils.decodePosition(location)+".");
+            }
             genotypeBuilder.GQ(inferredReferenceState.getReferenceGQ());
             genotypes.add(genotypeBuilder.make());
         }
@@ -677,16 +685,6 @@ private String makePloidyLookupKey(final String chromosome, final String sampleI
         return chromosome+"-"+sampleId;
     }
 
-    private boolean isInPAR(final long location) {
-        for (LongRange region : PARRegions) {
-            if (region.containsLong(location)) {
-                return true;
-            }
-        }
-        return false;
-    }
-
-
     private VariantContext filterSiteByAlleleSpecificVQScore(VariantContext mergedVC,
                                                              Map<Allele, Map<Allele, Double>> scoreMap,
                                                              Map<Allele, Map<Allele, Double>> vqScoreMap,

diff --git a/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/CreateVariantIngestFiles.java b/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/CreateVariantIngestFiles.java
@@ -26,11 +26,7 @@
 
 import java.io.File;
 import java.io.IOException;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
+import java.util.*;
 
 /**
  * Ingest variant walker
@@ -47,6 +43,7 @@ public final class CreateVariantIngestFiles extends VariantWalker {
     private RefCreator refCreator;
     private VetCreator vetCreator;
     private VcfHeaderLineScratchCreator vcfHeaderLineScratchCreator;
+    private SamplePloidyCreator samplePloidyCreator;
     private LoadStatus loadStatus;
 
     private final Map<String, Boolean> allLineHeaders = new HashMap<>();
@@ -292,6 +289,10 @@ public void onTraversalStart() {
 
         if (enableReferenceRanges && !refRangesRowsExist) {
             refCreator = new RefCreator(sampleIdentifierForOutputFileName, sampleId, tableNumber, seqDictionary, gqStatesToIgnore, outputDir, outputType, enableReferenceRanges, projectID, datasetName, storeCompressedReferences);
+
+            // The ploidy table is really only needed for inferring reference ploidy, as variants store their genotypes
+            // directly.  If we're not ingesting references, we can't compute and ingest ploidy either
+            samplePloidyCreator = new SamplePloidyCreator(sampleId, projectID, datasetName, outputType);
         }
 
         if (enableVet && !vetRowsExist) {
@@ -384,6 +385,18 @@ public Object onTraversalSuccess() {
             }
             // Wait until all data has been submitted and in pending state to commit
             refCreator.commitData();
+
+            // this is likely an unnecessary check as it currently stands, but it can't hurt to have it in case we
+            // later separate their creation, throwing the ploidy stuff explicity behind a flag
+            if (samplePloidyCreator != null) {
+                try {
+                    samplePloidyCreator.apply(refCreator.getReferencePloidyData(), refCreator.getTotalRefEntries());
+                } catch (IOException ioe) {
+                    throw new GATKException("Error writing ploidy data", ioe);
+                }
+
+                samplePloidyCreator.commitData();
+            }
         }
 
         if (vetCreator != null) {
@@ -407,6 +420,9 @@ public void closeTool() {
         if (vetCreator != null) {
             vetCreator.closeTool();
         }
+        if (samplePloidyCreator != null) {
+            samplePloidyCreator.closeTool();
+        }
         if (vcfHeaderLineScratchCreator != null) {
             vcfHeaderLineScratchCreator.closeTool();
         }

diff --git a/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/RefCreator.java b/src/main/java/org/broadinstitute/hellbender/tools/gvs/ingest/RefCreator.java
@@ -5,10 +5,7 @@
 import org.apache.logging.log4j.LogManager;
 import org.apache.logging.log4j.Logger;
 import org.broadinstitute.hellbender.exceptions.UserException;
-import org.broadinstitute.hellbender.tools.gvs.common.CommonCode;
-import org.broadinstitute.hellbender.tools.gvs.common.GQStateEnum;
-import org.broadinstitute.hellbender.tools.gvs.common.IngestConstants;
-import org.broadinstitute.hellbender.tools.gvs.common.SchemaUtils;
+import org.broadinstitute.hellbender.tools.gvs.common.*;
 import org.broadinstitute.hellbender.utils.GenomeLoc;
 import org.broadinstitute.hellbender.utils.GenomeLocParser;
 import org.broadinstitute.hellbender.utils.GenomeLocSortedSet;
@@ -17,9 +14,7 @@
 
 import java.io.File;
 import java.io.IOException;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Set;
+import java.util.*;
 
 
 public final class RefCreator {
@@ -38,6 +33,12 @@ public final class RefCreator {
     private static final String PREFIX_SEPARATOR = "_";
     private final static String REF_RANGES_FILETYPE_PREFIX = "ref_ranges_";
 
+    private Map<String, BitSet> ploidiesPerChromosome = null;
+    private Map<String, Map<Integer, Long>> ploidiesCountPerChromosome = null;
+
+    // for easily calculating percentages later
+    private long totalRefEntries = 0L;
+
     public static boolean doRowsExistFor(CommonCode.OutputType outputType, String projectId, String datasetName, String tableNumber, Long sampleId) {
         if (outputType != CommonCode.OutputType.BQ) return false;
         return BigQueryUtils.doRowsExistFor(projectId, datasetName, REF_RANGES_FILETYPE_PREFIX + tableNumber, SchemaUtils.SAMPLE_ID_FIELD_NAME, sampleId);
@@ -50,6 +51,9 @@ public RefCreator(String sampleIdentifierForOutputFileName, Long sampleId, Strin
         this.storeCompressedReferences = storeCompressedReferences;
         this.gqStatesToIgnore = gqStatesToIgnore;
 
+        this.ploidiesPerChromosome = new HashMap<>();
+        this.ploidiesCountPerChromosome = new HashMap<>();
+
         coverageLocSortedSet = new GenomeLocSortedSet(new GenomeLocParser(seqDictionary));
 
         try {
@@ -100,6 +104,31 @@ public void apply(VariantContext variant, List<GenomeLoc> intervalsToWrite) thro
                 // if we are writing ref ranges, and this is a reference block, write it!
                 if (writeReferenceRanges) {
                     if (variant.isReferenceBlock()) {
+
+                        // Record reference ploidy if this is not in a PAR
+                        if (!PloidyUtils.doesVariantOverlapPAR(variant)) {
+                            // create the bitset for this ploidy if it isn't there
+                            if (!ploidiesCountPerChromosome.containsKey(variant.getContig())) {
+                                ploidiesCountPerChromosome.put(variant.getContig(), new HashMap<>());
+                            }
+                            // set the bit for this ploidy so we record having seen it
+                            Map<Integer, Long> ploidyCounts = ploidiesCountPerChromosome.get(variant.getContig());
+
+                            int ploidy = variant.getMaxPloidy(1);
+
+                            Long currentCount = 0L;
+                            if (ploidyCounts.containsKey(ploidy)) {
+                                currentCount = ploidyCounts.get(ploidy);
+                            }
+
+                            // increment counts for this one and put it back
+                            ++currentCount;
+                            ploidyCounts.put(ploidy, currentCount);
+
+                            ++totalRefEntries;
+                        }
+
+
                         // break up reference blocks to be no longer than MAX_REFERENCE_BLOCK_SIZE
                         int localStart = start;
                         while ( localStart <= end ) {
@@ -262,6 +291,14 @@ public static Set<GQStateEnum> getGQStateEnumGreaterThan(GQStateEnum s){
         return ret;
     }
 
+    public Map<String, Map<Integer, Long>> getReferencePloidyData() {
+        return ploidiesCountPerChromosome;
+    }
+
+    public long getTotalRefEntries() {
+        return totalRefEntries;
+    }
+
     public void commitData() {
         if (outputType == CommonCode.OutputType.BQ) {
             if (writeReferenceRanges && refRangesWriter != null) {