VS-1416 Modify ingest to correctly handle ploidy differences in DRAGEN 3.7.8 samples (#8994)

* Pushing naive implementation to test for correctness

* stupid dockstore error

* Updates to track down weird bugs

* Updating fix to only scan reference blocks in RefCreator

* Finally verified the successful ploidy run based on this code

* updating gatk version

* cleaning things up a bit

* cleaning things up a bit v2

* PR feedback

* Updating the docker image

* dockstore

* dockstore

* updating the CORRECT image this time

* Updating quickstart expected info to account for haploid bge sample discovered as part of ticket

* updating dockstore for final ingestion test around header-only ingest

* Ensuring that we put writing to a BQ table behind explicit code, so existing tests that go through the tsv pathway don't break

* pr feedback: begrudgingly moving magic number to a constant. Much grumbling.
koncheto-broad authored Nov 7, 2024
1 parent a178fdd commit 441e7f4
Showing 11 changed files with 311 additions and 36 deletions.
3 changes: 3 additions & 0 deletions .dockstore.yml
@@ -192,6 +192,7 @@ workflows:
branches:
- master
- ah_var_store
- VS-1416-modify-ingest-to-correctly-handle-ploidy-differences-in-dragen-3-7-8-samples
tags:
- /.*/
- name: GvsPrepareRangesCallset
@@ -259,6 +260,7 @@ workflows:
branches:
- master
- ah_var_store
- VS-1416-modify-ingest-to-correctly-handle-ploidy-differences-in-dragen-3-7-8-samples
tags:
- /.*/
- name: GvsBeta
@@ -316,6 +318,7 @@ workflows:
- master
- ah_var_store
- vs_1456_status_writes_bug
- VS-1416-modify-ingest-to-correctly-handle-ploidy-differences-in-dragen-3-7-8-samples
tags:
- /.*/
- name: GvsIngestTieout
15 changes: 15 additions & 0 deletions scripts/variantstore/wdl/GvsAssignIds.wdl
@@ -28,6 +28,7 @@ workflow GvsAssignIds {
String vcf_header_lines_schema_json = '[{"name":"vcf_header_lines_hash","type":"STRING","mode":"REQUIRED"}, {"name":"vcf_header_lines","type":"STRING","mode":"REQUIRED"},{"name":"is_expected_unique","type":"BOOLEAN","mode":"REQUIRED"}]'
String sample_vcf_header_schema_json = '[{"name": "sample_id","type": "INTEGER","mode": "REQUIRED"}, {"name":"vcf_header_lines_hash","type":"STRING","mode":"REQUIRED"}]'
String sample_load_status_schema_json = '[{"name": "sample_id","type": "INTEGER","mode": "REQUIRED"},{"name":"status","type":"STRING","mode":"REQUIRED"}, {"name":"event_timestamp","type":"TIMESTAMP","mode":"REQUIRED"}]'
String sample_chromosome_ploidy_schema_json = '[{"name": "sample_id","type": "INTEGER","mode": "REQUIRED"},{"name": "chromosome","type": "INTEGER","mode": "REQUIRED"},{"name": "ploidy","type": "INTEGER","mode": "REQUIRED"}]'

if (!defined(git_hash) || !defined(cloud_sdk_docker)) {
call Utils.GetToolVersions {
@@ -71,6 +72,20 @@ workflow GvsAssignIds {
cloud_sdk_docker = effective_cloud_sdk_docker,
}

call GvsCreateTables.CreateTables as CreateSamplePloidyMapTable {
input:
project_id = project_id,
dataset_name = dataset_name,
go = ValidateSamples.done,
datatype = "sample_chromosome_ploidy",
schema_json = sample_chromosome_ploidy_schema_json,
max_table_id = 1,
superpartitioned = "false",
partitioned = "false",
clustering_field = "sample_id",
cloud_sdk_docker = effective_cloud_sdk_docker,
}

if (load_vcf_headers) {
call GvsCreateTables.CreateTables as CreateScratchVCFHeaderLinesTable {
input:
2 changes: 2 additions & 0 deletions scripts/variantstore/wdl/GvsJointVariantCalling.wdl
@@ -90,6 +90,7 @@ workflow GvsJointVariantCalling {
String destination_project = project_id
String destination_dataset = dataset_name
String fq_temp_table_dataset = "~{destination_project}.~{destination_dataset}"
String ploidy_table_name = "sample_chromosome_ploidy"

if (false) {
Int extract_maxretries_override = ""
@@ -233,6 +234,7 @@ workflow GvsJointVariantCalling {
is_wgs = is_wgs,
maximum_alternate_alleles = maximum_alternate_alleles,
target_interval_list = target_interval_list,
ploidy_table_name = ploidy_table_name,
}

output {
2 changes: 1 addition & 1 deletion scripts/variantstore/wdl/GvsUtils.wdl
@@ -74,7 +74,7 @@ task GetToolVersions {
String cloud_sdk_slim_docker = "gcr.io/google.com/cloudsdktool/cloud-sdk:435.0.0-slim"
String variants_docker = "us-central1-docker.pkg.dev/broad-dsde-methods/gvs/variants:2024-10-22-alpine-e7443149b8db"
String variants_nirvana_docker = "us.gcr.io/broad-dsde-methods/variantstore:nirvana_2022_10_19"
String gatk_docker = "us-central1-docker.pkg.dev/broad-dsde-methods/gvs/gatk:2024_10_10-gatkbase-1cd1f9652cb9"
String gatk_docker = "us-central1-docker.pkg.dev/broad-dsde-methods/gvs/gatk:2024-10-24-gatkbase-b29b46ab0443"
String real_time_genomics_docker = "docker.io/realtimegenomics/rtg-tools:latest"
String gotc_imputation_docker = "us.gcr.io/broad-gotc-prod/imputation-bcf-vcf:1.0.5-1.10.2-0.1.16-1649948623"
String plink_docker = "us-central1-docker.pkg.dev/broad-dsde-methods/gvs/plink2:2024-04-23-slim-a0a65f52cc0e"
2 changes: 1 addition & 1 deletion scripts/variantstore/wdl/test/GvsQuickstartIntegration.wdl
@@ -34,7 +34,7 @@ workflow GvsQuickstartIntegration {
File full_wgs_interval_list = "gs://gcp-public-data--broad-references/hg38/v0/wgs_calling_regions.hg38.noCentromeres.noTelomeres.interval_list"
File full_exome_interval_list = "gs://gcp-public-data--broad-references/hg38/v0/bge_exome_calling_regions.v1.1.interval_list"
String expected_subdir = if (!chr20_X_Y_only) then "all_chrs/" else ""
File expected_output_prefix = "gs://gvs-internal-quickstart/integration/2024-08-13/" + expected_subdir
File expected_output_prefix = "gs://gvs-internal-quickstart/integration/2024-10-29/" + expected_subdir

# WDL 1.0 trick to set a variable ('none') to be undefined.
if (false) {
@@ -0,0 +1,59 @@
package org.broadinstitute.hellbender.tools.gvs.common;

import htsjdk.samtools.util.CollectionUtil;
import htsjdk.samtools.util.Interval;
import htsjdk.samtools.util.Lazy;
import htsjdk.variant.variantcontext.VariantContext;

import java.util.Collections;
import java.util.HashSet;
import java.util.Set;

/**
* Holds static methods for determining whether a particular location falls within a PAR, for the purposes of accurately determining the correct ploidy
* <p>
* Some methods, due to their implementation, make implicit hg38 assumptions. If we choose to support other references, we'll need to update this class.
*/
public class PloidyUtils {
/**
* This copies the PAR definition used in the tool FindMendelianViolations with the addition of the Y regions
* @see picard.vcf.MendelianViolations.FindMendelianViolations
* https://github.com/broadinstitute/picard/blob/63138fc8b7ae33eb0490a8ef707b61befa2f51d4/src/main/java/picard/vcf/MendelianViolations/FindMendelianViolations.java#L203
* Wikipedia data: https://en.wikipedia.org/wiki/Pseudoautosomal_region
*/
private static final Set<String> PSEUDO_AUTOSOMAL_REGIONS_DEFINITION = CollectionUtil.makeSet("X:60001-2699520", "X:154931044-155260560", "chrX:10001-2781479", "chrX:155701383-156030895", "Y:10001-2649520", "Y:59034050-59363566", "chrY:10001-2781479", "chrY:56887903-57217415");

public static Lazy<Set<Interval>> ParIntervals = new Lazy<>(() -> Collections.unmodifiableSet(parseIntervalLists(PSEUDO_AUTOSOMAL_REGIONS_DEFINITION)));

public static boolean doesVariantOverlapPAR(VariantContext variant) {
return doesIntervalOverlapPAR(variant.getContig(), variant.getStart(), variant.getEnd());
}

public static boolean isLocationInPAR(long location) {
// Note: SchemaUtils.decodeContig makes implicit hg38 assumptions. If we support different references, we'll
// need to factor that into how we do our mapping from pure location numbers to contigs and positions
return doesIntervalOverlapPAR(SchemaUtils.decodeContig(location), SchemaUtils.decodePosition(location), SchemaUtils.decodePosition(location) + 1);
}

public static boolean doesIntervalOverlapPAR(String chrom, int positionStart, int positionEnd) {
Interval locationAsInterval = new Interval(chrom, positionStart, positionEnd);
for (Interval par : ParIntervals.get()) {
if (par.intersects(locationAsInterval)) {
return true;
}
}
return false;
}

private static Set<Interval> parseIntervalLists(final Set<String> intervalLists){
// Parse the PARs
final Set<Interval> intervals = new HashSet<>(intervalLists.size());
for (final String par : intervalLists) {
final String[] splits1 = par.split(":");
final String[] splits2 = splits1[1].split("-");
intervals.add(new Interval(splits1[0], Integer.parseInt(splits2[0]), Integer.parseInt(splits2[1])));
}
return intervals;
}

}
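
Because doesIntervalOverlapPAR is public and static, the class above can be exercised directly. Below is a minimal sketch; the demo class and coordinates are illustrative and not part of the commit, and it assumes the hg38 PAR boundaries listed in PSEUDO_AUTOSOMAL_REGIONS_DEFINITION.

import org.broadinstitute.hellbender.tools.gvs.common.PloidyUtils;

// Hypothetical demo class, not part of this commit.
public class PloidyUtilsDemo {
    public static void main(String[] args) {
        // chrX:100000 sits inside hg38 PAR1 (chrX:10001-2781479), so a haploid override should not apply there.
        boolean inPar = PloidyUtils.doesIntervalOverlapPAR("chrX", 100_000, 100_001);

        // chrX:5000000 is outside both PARs, so a sample's ploidy-table entry would be honored at this site.
        boolean outsidePar = PloidyUtils.doesIntervalOverlapPAR("chrX", 5_000_000, 5_000_001);

        System.out.println("inside PAR1:  " + inPar);       // expected: true
        System.out.println("outside PARs: " + outsidePar);  // expected: false
    }
}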
@@ -79,7 +79,7 @@ public class SchemaUtils {
CALL_PGT, CALL_PID, CALL_PS);


public static final List<String> SAMPLE_PLOIDY_FIELDS = Arrays.asList(CHROMOSOME, SAMPLE_ID, GENOTYPE, PLOIDY);
public static final List<String> SAMPLE_PLOIDY_FIELDS = Arrays.asList(CHROMOSOME, SAMPLE_ID, PLOIDY);

public static final List<String> EXTRACT_REF_FIELDS = Arrays.asList(LOCATION_FIELD_NAME, SAMPLE_ID_FIELD_NAME, LENGTH_FIELD_NAME, STATE_FIELD_NAME);

@@ -35,15 +35,6 @@
public class ExtractCohortEngine {
private final Logger logger;

// These represent the PARs given for hg38. They are defined differently for other references, so we'll have to store this in
// a better, ref-dependent different manner if/when we get around to supporting other references
// See: https://en.wikipedia.org/wiki/Pseudoautosomal_region
private final List<LongRange> PARRegions = List.of(
new LongRange(23000000010001L, 23000002781479L), // PAR1, X
new LongRange(24000000010001L, 24000002781479L), // PAR1, Y
new LongRange(23000155701383L, 23000156030895L), // PAR2, X
new LongRange(24000056887903L, 24000057217415L)); // PAR2, Y

private final boolean printDebugInformation;
private final int localSortMaxRecordsInRam;
private final List<SimpleInterval> traversalIntervals;
@@ -307,6 +298,7 @@ public void traverse() {

if (samplePloidyTableRef != null) {
try (StorageAPIAvroReader reader = new StorageAPIAvroReader(samplePloidyTableRef, ploidyTableRestriction, projectID)) {
logger.info("Found ploidy lookup table. Reading it into memory");
for (final GenericRecord queryRow : reader) {
// the key will be a basic joining of chromosome (represented as a location) with sample name
String chromosome = queryRow.get(SchemaUtils.CHROMOSOME).toString();
@@ -315,6 +307,7 @@ public void traverse() {
samplePloidyMap.put(makePloidyLookupKey(chromosome, sampleId), ploidy);
}
processBytesScanned(reader);
logger.info("Finished reading ploidy table into memory. "+samplePloidyMap.size()+" entries read.");
}
}

@@ -604,7 +597,7 @@ VariantContext finalizeCurrentVariant(final List<VariantContext> unmergedVariant
// If we have looked up the ploidy in our table and it says 1, use a haploid reference
// Otherwise, if we have a ploidy that is neither 1 nor 2, throw a user exception because we haven't coded for this case
int ploidy = (samplePloidyMap.containsKey(key) ? samplePloidyMap.get(key) : 2);
if (ploidy == 2 || isInPAR(location)) {
if (ploidy == 2 || PloidyUtils.isLocationInPAR(location)) {
genotypeBuilder.alleles(gtAlleles);
} else if (ploidy == 1) {
genotypeBuilder.alleles(gtHaploidAlleles);
@@ -627,7 +620,22 @@ VariantContext finalizeCurrentVariant(final List<VariantContext> unmergedVariant
for (int sampleId = samplesNotEncountered.nextSetBit(0); sampleId >= 0; sampleId = samplesNotEncountered.nextSetBit(sampleId + 1)) {
genotypeBuilder.reset(false);
genotypeBuilder.name(sampleIdToName.get((long) sampleId));
genotypeBuilder.alleles(gtAlleles);

long chromAsPosition = location - SchemaUtils.decodePosition(location);
String key = makePloidyLookupKey(Long.toString(chromAsPosition), Integer.toString(sampleId));

// Logic for determining the correct ploidy for reference data
// If we have no info in the table, the ploidy is explicitly 2, OR we are in a PAR, use diploid reference.
// If we have looked up the ploidy in our table and it says 1, use a haploid reference
// Otherwise, if we have a ploidy that is neither 1 nor 2, throw a user exception because we haven't coded for this case
int ploidy = (samplePloidyMap.containsKey(key) ? samplePloidyMap.get(key) : 2);
if (ploidy == 2 || PloidyUtils.isLocationInPAR(location)) {
genotypeBuilder.alleles(gtAlleles);
} else if (ploidy == 1) {
genotypeBuilder.alleles(gtHaploidAlleles);
} else {
throw new UserException("GVS cannot currently handle extracting with a ploidy of "+ploidy+" as seen at "+SchemaUtils.decodeContig(location)+": "+SchemaUtils.decodePosition(location)+".");
}
genotypeBuilder.GQ(inferredReferenceState.getReferenceGQ());
genotypes.add(genotypeBuilder.make());
}
@@ -677,16 +685,6 @@ private String makePloidyLookupKey(final String chromosome, final String sampleI
return chromosome+"-"+sampleId;
}

private boolean isInPAR(final long location) {
for (LongRange region : PARRegions) {
if (region.containsLong(location)) {
return true;
}
}
return false;
}


private VariantContext filterSiteByAlleleSpecificVQScore(VariantContext mergedVC,
Map<Allele, Map<Allele, Double>> scoreMap,
Map<Allele, Map<Allele, Double>> vqScoreMap,
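
Both new call sites in finalizeCurrentVariant (explicitly genotyped samples and samples back-filled as homozygous reference) apply the same three-way decision. Below is a condensed, illustrative sketch of that decision; the class name, method name, and exception type are stand-ins (the real code throws UserException and selects between gtAlleles and gtHaploidAlleles).

import java.util.Map;

// Illustrative stand-in for the ploidy decision added to ExtractCohortEngine.finalizeCurrentVariant().
final class PloidyDecisionSketch {
    /** Returns the ploidy to emit for one sample at one site: 2 (diploid), 1 (haploid), or throws otherwise. */
    static int resolvePloidy(Map<String, Integer> samplePloidyMap, String key, boolean locationIsInPAR) {
        // No table entry means we keep the historical default of diploid.
        int ploidy = samplePloidyMap.containsKey(key) ? samplePloidyMap.get(key) : 2;
        if (ploidy == 2 || locationIsInPAR) {
            return 2;   // emit the diploid genotype (gtAlleles in the engine)
        } else if (ploidy == 1) {
            return 1;   // emit the haploid genotype (gtHaploidAlleles in the engine)
        }
        throw new IllegalStateException("Cannot currently handle extracting with a ploidy of " + ploidy);
    }
}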
@@ -26,11 +26,7 @@

import java.io.File;
import java.io.IOException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.*;

/**
* Ingest variant walker
@@ -47,6 +43,7 @@ public final class CreateVariantIngestFiles extends VariantWalker {
private RefCreator refCreator;
private VetCreator vetCreator;
private VcfHeaderLineScratchCreator vcfHeaderLineScratchCreator;
private SamplePloidyCreator samplePloidyCreator;
private LoadStatus loadStatus;

private final Map<String, Boolean> allLineHeaders = new HashMap<>();
@@ -292,6 +289,10 @@ public void onTraversalStart() {

if (enableReferenceRanges && !refRangesRowsExist) {
refCreator = new RefCreator(sampleIdentifierForOutputFileName, sampleId, tableNumber, seqDictionary, gqStatesToIgnore, outputDir, outputType, enableReferenceRanges, projectID, datasetName, storeCompressedReferences);

// The ploidy table is really only needed for inferring reference ploidy, as variants store their genotypes
// directly. If we're not ingesting references, we can't compute and ingest ploidy either
samplePloidyCreator = new SamplePloidyCreator(sampleId, projectID, datasetName, outputType);
}

if (enableVet && !vetRowsExist) {
@@ -384,6 +385,18 @@ public Object onTraversalSuccess() {
}
// Wait until all data has been submitted and in pending state to commit
refCreator.commitData();

// this is likely an unnecessary check as it currently stands, but it can't hurt to have it in case we
// later separate their creation, throwing the ploidy stuff explicitly behind a flag
if (samplePloidyCreator != null) {
try {
samplePloidyCreator.apply(refCreator.getReferencePloidyData(), refCreator.getTotalRefEntries());
} catch (IOException ioe) {
throw new GATKException("Error writing ploidy data", ioe);
}

samplePloidyCreator.commitData();
}
}

if (vetCreator != null) {
Expand All @@ -407,6 +420,9 @@ public void closeTool() {
if (vetCreator != null) {
vetCreator.closeTool();
}
if (samplePloidyCreator != null) {
samplePloidyCreator.closeTool();
}
if (vcfHeaderLineScratchCreator != null) {
vcfHeaderLineScratchCreator.closeTool();
}
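
The SamplePloidyCreator class itself is not rendered in this view of the diff, but its surface can be inferred from the call sites above: it is constructed alongside RefCreator, fed the per-chromosome ploidy counts plus the total reference-entry count, then committed and closed like the other creators. Below is a hedged sketch of that inferred surface; the method names and types come only from the calls shown, and the actual class in the commit may differ.

import java.io.IOException;
import java.util.Map;

// Inferred from the call sites in CreateVariantIngestFiles; NOT the actual SamplePloidyCreator from the commit.
interface SamplePloidyCreatorSurface {
    // ploidyCountsPerChromosome maps chromosome -> (ploidy -> number of reference entries seen at that ploidy);
    // totalRefEntries lets the implementation turn those counts into percentages before writing rows.
    void apply(Map<String, Map<Integer, Long>> ploidyCountsPerChromosome, long totalRefEntries) throws IOException;

    void commitData();  // flush pending rows (e.g. to the sample_chromosome_ploidy table for BQ output)
    void closeTool();   // release writers, mirroring RefCreator/VetCreator
}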
@@ -5,10 +5,7 @@
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.gvs.common.CommonCode;
import org.broadinstitute.hellbender.tools.gvs.common.GQStateEnum;
import org.broadinstitute.hellbender.tools.gvs.common.IngestConstants;
import org.broadinstitute.hellbender.tools.gvs.common.SchemaUtils;
import org.broadinstitute.hellbender.tools.gvs.common.*;
import org.broadinstitute.hellbender.utils.GenomeLoc;
import org.broadinstitute.hellbender.utils.GenomeLocParser;
import org.broadinstitute.hellbender.utils.GenomeLocSortedSet;
@@ -17,9 +14,7 @@

import java.io.File;
import java.io.IOException;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.*;


public final class RefCreator {
@@ -38,6 +33,12 @@ public final class RefCreator {
private static final String PREFIX_SEPARATOR = "_";
private final static String REF_RANGES_FILETYPE_PREFIX = "ref_ranges_";

private Map<String, BitSet> ploidiesPerChromosome = null;
private Map<String, Map<Integer, Long>> ploidiesCountPerChromosome = null;

// for easily calculating percentages later
private long totalRefEntries = 0L;

public static boolean doRowsExistFor(CommonCode.OutputType outputType, String projectId, String datasetName, String tableNumber, Long sampleId) {
if (outputType != CommonCode.OutputType.BQ) return false;
return BigQueryUtils.doRowsExistFor(projectId, datasetName, REF_RANGES_FILETYPE_PREFIX + tableNumber, SchemaUtils.SAMPLE_ID_FIELD_NAME, sampleId);
@@ -50,6 +51,9 @@ public RefCreator(String sampleIdentifierForOutputFileName, Long sampleId, Strin
this.storeCompressedReferences = storeCompressedReferences;
this.gqStatesToIgnore = gqStatesToIgnore;

this.ploidiesPerChromosome = new HashMap<>();
this.ploidiesCountPerChromosome = new HashMap<>();

coverageLocSortedSet = new GenomeLocSortedSet(new GenomeLocParser(seqDictionary));

try {
@@ -100,6 +104,31 @@ public void apply(VariantContext variant, List<GenomeLoc> intervalsToWrite) thro
// if we are writing ref ranges, and this is a reference block, write it!
if (writeReferenceRanges) {
if (variant.isReferenceBlock()) {

// Record reference ploidy if this is not in a PAR
if (!PloidyUtils.doesVariantOverlapPAR(variant)) {
// create the per-ploidy counts map for this chromosome if it isn't there yet
if (!ploidiesCountPerChromosome.containsKey(variant.getContig())) {
ploidiesCountPerChromosome.put(variant.getContig(), new HashMap<>());
}
// look up this chromosome's counts map so we can record having seen this ploidy
Map<Integer, Long> ploidyCounts = ploidiesCountPerChromosome.get(variant.getContig());

int ploidy = variant.getMaxPloidy(1);

Long currentCount = 0L;
if (ploidyCounts.containsKey(ploidy)) {
currentCount = ploidyCounts.get(ploidy);
}

// increment counts for this one and put it back
++currentCount;
ploidyCounts.put(ploidy, currentCount);

++totalRefEntries;
}


// break up reference blocks to be no longer than MAX_REFERENCE_BLOCK_SIZE
int localStart = start;
while ( localStart <= end ) {
@@ -262,6 +291,14 @@ public static Set<GQStateEnum> getGQStateEnumGreaterThan(GQStateEnum s){
return ret;
}

public Map<String, Map<Integer, Long>> getReferencePloidyData() {
return ploidiesCountPerChromosome;
}

public long getTotalRefEntries() {
return totalRefEntries;
}

public void commitData() {
if (outputType == CommonCode.OutputType.BQ) {
if (writeReferenceRanges && refRangesWriter != null) {
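
RefCreator only accumulates raw counts, and the "for easily calculating percentages later" note implies a downstream consumer (SamplePloidyCreator, not shown in this rendering) reduces each chromosome's count map to a single ploidy. Below is a hedged sketch of one such reduction over getReferencePloidyData(); the class and method are illustrative and not taken from the commit.

import java.util.HashMap;
import java.util.Map;

// Illustrative reduction over RefCreator.getReferencePloidyData(); not code from this commit.
final class PloidyReductionSketch {
    /** Picks, per chromosome, the ploidy that accounts for the largest number of reference entries. */
    static Map<String, Integer> dominantPloidyPerChromosome(Map<String, Map<Integer, Long>> countsPerChromosome) {
        Map<String, Integer> result = new HashMap<>();
        countsPerChromosome.forEach((chromosome, ploidyCounts) -> {
            int bestPloidy = 2;   // fall back to diploid if a chromosome somehow has no counts
            long bestCount = -1L;
            for (Map.Entry<Integer, Long> entry : ploidyCounts.entrySet()) {
                if (entry.getValue() > bestCount) {
                    bestCount = entry.getValue();
                    bestPloidy = entry.getKey();
                }
            }
            result.put(chromosome, bestPloidy);
        });
        return result;
    }
}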