Skip to content

Commit

Permalink
Remove pet code from CreateVariantIngestFiles and friends [VS-375] (#…
Browse files Browse the repository at this point in the history
  • Loading branch information
mcovarr authored Apr 15, 2022
1 parent 43a1480 commit 7a15427
Show file tree
Hide file tree
Showing 15 changed files with 2,597 additions and 5,652 deletions.
1 change: 0 additions & 1 deletion .dockstore.yml
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,6 @@ workflows:
branches:
- master
- ah_var_store
- kc_wdl_gatk_override
- name: GvsPrepareRangesCallset
subclass: WDL
primaryDescriptorPath: /scripts/variantstore/wdl/GvsPrepareRangesCallset.wdl
Expand Down
4 changes: 1 addition & 3 deletions scripts/variantstore/wdl/GvsImportGenomes.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@ task CheckForDuplicateData {
# Check the INFORMATION_SCHEMA.PARTITIONS table to see whether any of the input sample names/ids already have data loaded into their partitions.
# This returns the list of sample names that already have data loaded.
echo "WITH items as (SELECT s.sample_id, s.sample_name, s.is_loaded, s.withdrawn FROM \`${TEMP_TABLE}\` t left outer join \`${SAMPLE_INFO_TABLE}\` s on (s.sample_name = t.sample_name)) " >> query.sql
echo "SELECT i.sample_name FROM \`${INFO_SCHEMA_TABLE}\` p JOIN items i ON (p.partition_id = CAST(i.sample_id AS STRING)) WHERE p.total_logical_bytes > 0 AND (table_name like 'ref_ranges_%' OR table_name like 'vet_%' OR table_name like 'pet_%')" >> query.sql
echo "SELECT i.sample_name FROM \`${INFO_SCHEMA_TABLE}\` p JOIN items i ON (p.partition_id = CAST(i.sample_id AS STRING)) WHERE p.total_logical_bytes > 0 AND (table_name like 'ref_ranges_%' OR table_name like 'vet_%')" >> query.sql
echo "UNION DISTINCT " >> query.sql
echo "SELECT i.sample_name FROM items i WHERE i.is_loaded = True AND i.withdrawn IS NULL " >> query.sql
echo "UNION DISTINCT " >> query.sql
Expand Down Expand Up @@ -208,7 +208,6 @@ task LoadData {
}

Boolean load_ref_ranges = true
Boolean load_pet = false
Boolean load_vet = true
String has_service_account_file = if (defined(service_account_json_path)) then 'true' else 'false'

Expand Down Expand Up @@ -269,7 +268,6 @@ task LoadData {
--dataset-name ~{dataset_name} \
--output-type BQ \
--enable-reference-ranges ~{load_ref_ranges} \
--enable-pet ~{load_pet} \
--enable-vet ~{load_vet} \
-SN ${sample_name} \
-SNM ~{sample_map} \
Expand Down
20 changes: 0 additions & 20 deletions scripts/variantstore/wdl/schemas/pet_schema.json

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -89,17 +89,11 @@ public class SchemaUtils {
public static final List<String> YNG_FIELDS = Arrays.asList(FILTER_SET_NAME, LOCATION_FIELD_NAME, REF_ALLELE_FIELD_NAME, ALT_ALLELE_FIELD_NAME, VQSLOD, YNG_STATUS);
public static final List<String> TRANCHE_FIELDS = Arrays.asList(TARGET_TRUTH_SENSITIVITY, MIN_VQSLOD, TRANCHE_FILTER_NAME, TRANCHE_MODEL);


public static final List<String> PET_FIELDS = Arrays.asList(LOCATION_FIELD_NAME, SAMPLE_ID_FIELD_NAME, STATE_FIELD_NAME); // TODO do we still need?
public static final List<String> VET_FIELDS = Arrays.asList(SAMPLE_ID_FIELD_NAME, LOCATION_FIELD_NAME, REF_ALLELE_FIELD_NAME, ALT_ALLELE_FIELD_NAME, AS_RAW_MQ,
AS_RAW_MQRankSum, QUALapprox, AS_QUALapprox, AS_RAW_ReadPosRankSum, AS_SB_TABLE, AS_VarDP, CALL_GT, CALL_AD, CALL_GQ, CALL_PGT, CALL_PID, CALL_PL);
public static final List<String> ALT_ALLELE_FIELDS = Arrays.asList(LOCATION_FIELD_NAME, SAMPLE_ID_FIELD_NAME, REF_ALLELE_FIELD_NAME, "allele", ALT_ALLELE_FIELD_NAME, "allele_pos", CALL_GT, AS_RAW_MQ, RAW_MQ, AS_RAW_MQRankSum, "raw_mqranksum_x_10", AS_QUALapprox, "qual", AS_RAW_ReadPosRankSum, "raw_readposranksum_x_10", AS_SB_TABLE, "SB_REF_PLUS","SB_REF_MINUS","SB_ALT_PLUS","SB_ALT_MINUS", CALL_AD, "ref_ad", "ad");
public static final List<String> FEATURE_EXTRACT_FIELDS = Arrays.asList(LOCATION_FIELD_NAME, REF_ALLELE_FIELD_NAME, "allele", RAW_QUAL, "ref_ad", AS_MQRankSum, "AS_MQRankSum_ft", AS_ReadPosRankSum, "AS_ReadPosRankSum_ft", RAW_MQ, SUM_AD, RAW_AD, "RAW_AD_GT_1", "SB_REF_PLUS","SB_REF_MINUS","SB_ALT_PLUS","SB_ALT_MINUS","num_het_samples","num_homvar_samples","distinct_alleles","hq_genotype_samples", "sum_qualapprox", "num_snp_alleles");

public static final String LOAD_STATUS_FIELD_NAME = "status";
public static final String LOAD_STATUS_EVENT_TIMESTAMP_NAME = "event_timestamp";
public static final List<String> LOAD_STATUS_TABLE_REF_FIELDS = Arrays.asList(SAMPLE_ID_FIELD_NAME, LOAD_STATUS_FIELD_NAME, LOAD_STATUS_EVENT_TIMESTAMP_NAME);


public static final long chromAdjustment = 1000000000000L;

Expand Down
Original file line number Diff line number Diff line change
@@ -1,9 +1,6 @@
package org.broadinstitute.hellbender.tools.gvs.ingest;

import com.google.cloud.bigquery.BigQuery;
import com.google.cloud.bigquery.StandardTableDefinition;
import com.google.cloud.bigquery.Table;
import com.google.cloud.bigquery.TableId;

import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.util.RuntimeIOException;
import htsjdk.variant.variantcontext.Allele;
Expand Down Expand Up @@ -35,8 +32,8 @@
* Ingest variant walker
*/
@CommandLineProgramProperties(
summary = "Exome and Genome Ingest tool for the Joint Genotyping in Big Query project",
oneLineSummary = "Ingest tool for BQJG",
summary = "Exome and Genome Ingest tool for the Genomic Variant Store",
oneLineSummary = "Ingest tool for GVS",
programGroup = ShortVariantDiscoveryProgramGroup.class,
omitFromCommandLine = true
)
Expand All @@ -49,19 +46,11 @@ public final class CreateVariantIngestFiles extends VariantWalker {

private GenomeLocSortedSet intervalArgumentGenomeLocSortedSet;

private String sampleName;
private String sampleId;
private List<SimpleInterval> userIntervals;

// Inside the parent directory, a directory for each chromosome will be created, with a pet directory and vet directory in each one.
// Each pet and vet directory will hold all of the pet and vet tsvs for each sample
// A sample_info directory will be created, with a sample_info tsv for each sample

// @Argument(fullName = "output-path",
// shortName = "VPO",
// doc = "Path to the directory where the variants TSVs and positions expanded TSVs should be written")
// public GATKPathSpecifier parentOutputDirectory = null;
// public Path parentDirectory = null;
// Inside the parent directory, a directory for each chromosome will be created, with a vet directory in each one.
// Each vet directory will hold all the vet TSVs for each sample.
// A sample_info directory will be created, with a sample_info tsv for each sample.

@Argument(fullName = "ref-block-gq-to-ignore",
shortName = "IG",
Expand All @@ -79,20 +68,14 @@ public final class CreateVariantIngestFiles extends VariantWalker {
shortName = "rr",
doc = "write reference ranges data",
optional = true)
public boolean enableReferenceRanges = false;
public boolean enableReferenceRanges = true;

@Argument(fullName = "enable-vet",
shortName = "ev",
doc = "write vet data",
optional = true)
public boolean enableVet = true;

@Argument(fullName = "enable-pet",
shortName = "ep",
doc = "write pet data",
optional = true)
public boolean enablePet = true;

@Argument(fullName = "sample-name-mapping",
shortName = "SNM",
doc = "Sample name to sample id mapping. This must be provided if gvs-sample-id is not",
Expand Down Expand Up @@ -126,35 +109,35 @@ public final class CreateVariantIngestFiles extends VariantWalker {
fullName = "ref-version",
doc = "Remove this option!!!! only for ease of testing. Valid options are 37 or 38",
optional = true)
private String refVersion = "37";
public String refVersion = "37";

@Argument(
fullName = "output-directory",
doc = "directory for output tsv files",
optional = true)
private File outputDir = new File(".");
public File outputDir = new File(".");

@Argument(
fullName = "project-id",
doc = "ID of the Google Cloud project where the dataset for pet and vet tables exist",
doc = "ID of the Google Cloud project where the dataset for vet tables exists",
optional = true
)
protected String projectID = null;
public String projectID = null;

@Argument(
fullName = "dataset-name",
doc = "Name of the dataset to update pet and vet tables",
doc = "Name of the dataset to update vet tables",
optional = true
)
protected String datasetName = null;
public String datasetName = null;


@Argument(
fullName = "force-loading-from-non-allele-specific",
doc = "Even if there are allele-specific (AS) annotations, use backwards compatibility mode",
optional = true
)
protected boolean forceLoadingFromNonAlleleSpecific = false;
public boolean forceLoadingFromNonAlleleSpecific = false;

// getGenotypes() returns list of lists for all samples at variant
// assuming one sample per gvcf, getGenotype(0) retrieves GT for sample at index 0
Expand All @@ -173,8 +156,6 @@ private String getInputFileName() {
return pathParts[pathParts.length - 1];
}



@Override
public void onTraversalStart() {
//set up output directory
Expand All @@ -189,7 +170,7 @@ public void onTraversalStart() {
// TODO if you change here, also change in CreateArrayIngestFiles
// Get sample name
final VCFHeader inputVCFHeader = getHeaderForVariants();
sampleName = sampleNameParam == null ? IngestUtils.getSampleName(inputVCFHeader) : sampleNameParam;
String sampleName = sampleNameParam == null ? IngestUtils.getSampleName(inputVCFHeader) : sampleNameParam;
if (sampleIdParam == null && sampleMap == null) {
throw new IllegalArgumentException("One of sample-id or sample-name-mapping must be specified");
}
Expand All @@ -209,13 +190,13 @@ public void onTraversalStart() {

// To set up the missing positions
SAMSequenceDictionary seqDictionary = getBestAvailableSequenceDictionary();
userIntervals = intervalArgumentCollection.getIntervals(seqDictionary);

final GenomeLocSortedSet genomeLocSortedSet = new GenomeLocSortedSet(new GenomeLocParser(seqDictionary));
intervalArgumentGenomeLocSortedSet = GenomeLocSortedSet.createSetFromList(genomeLocSortedSet.getGenomeLocParser(), IntervalUtils.genomeLocsFromLocatables(genomeLocSortedSet.getGenomeLocParser(), intervalArgumentCollection.getIntervals(seqDictionary)));
final GenomeLocParser genomeLocParser = new GenomeLocParser(seqDictionary);
intervalArgumentGenomeLocSortedSet = GenomeLocSortedSet.createSetFromList(genomeLocParser, IntervalUtils.genomeLocsFromLocatables(genomeLocParser, intervalArgumentCollection.getIntervals(seqDictionary)));

if (enablePet || enableReferenceRanges) {
refCreator = new RefCreator(sampleIdentifierForOutputFileName, sampleId, tableNumber, seqDictionary, gqStateToIgnore, dropAboveGqThreshold, outputDir, outputType, enablePet, enableReferenceRanges, projectID, datasetName);
if (enableReferenceRanges) {
//noinspection ConstantConditions
refCreator = new RefCreator(sampleIdentifierForOutputFileName, sampleId, tableNumber, seqDictionary, gqStateToIgnore, dropAboveGqThreshold, outputDir, outputType, enableReferenceRanges, projectID, datasetName);
}

if (enableVet) {
Expand All @@ -238,9 +219,7 @@ public void onTraversalStart() {
} else if (state == LoadStatus.LoadState.PARTIAL) {
throw new GATKException("The loading for sample id " + sampleId + " into the _" + tableNumber + " table(s) was interrupted before it was able to complete successfully.");
}

}

}

@Override
Expand All @@ -254,7 +233,7 @@ public void apply(final VariantContext variant, final ReadsContext readsContext,
throw new IllegalStateException("There are no intervals being covered by this variant, something went wrong with interval parsing");
}

// take the first interval(assuming this is returned in order) and make sure if its a variant, that it starts at/after the interval start
// Take the first interval (assuming the intervals are returned in order) and, if this is a variant, make sure it starts at or after the interval start.
// We are going to ignore any deletions that start before an interval.
if (!variant.isReferenceBlock() && intervalsToWrite.get(0).getStart() > variant.getStart()){
return;
Expand All @@ -266,25 +245,23 @@ public void apply(final VariantContext variant, final ReadsContext readsContext,
}

try {
// write to VET if NOT reference block and NOT a no call
// write to VET if NOT reference block and NOT a no call
if (!variant.isReferenceBlock() && !isNoCall(variant)) {
if (enableVet) vetCreator.apply(variant, readsContext, referenceContext, featureContext);
if (enableVet) vetCreator.apply(variant);
}
} catch (IOException ioe) {
throw new GATKException("Error writing VET", ioe);
}

try {
if (refCreator != null) {
refCreator.apply(variant, intervalsToWrite);
if (enableReferenceRanges) refCreator.apply(variant, intervalsToWrite);
}
} catch (IOException ioe) {
throw new GATKException("Error writing PET", ioe);
throw new GATKException("Error writing reference ranges", ioe);
}

}


@Override
public Object onTraversalSuccess() {
if (outputType == CommonCode.OutputType.BQ) {
Expand Down Expand Up @@ -319,8 +296,7 @@ public void closeTool() {
refCreator.closeTool();
}
if (vetCreator != null) {
vetCreator.closeTool();;
vetCreator.closeTool();
}
}

}

This file was deleted.

This file was deleted.

Loading

0 comments on commit 7a15427

Please sign in to comment.