Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update GvsExtractCallset.example.inputs.json #7469

Merged
merged 4 commits into from
Sep 15, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
"GvsExtractCallset.reference_index": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.fai",
"GvsExtractCallset.reference_dict": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dict",
"GvsExtractCallset.wgs_intervals": "gs://gcp-public-data--broad-references/hg38/v0/wgs_calling_regions.hg38.noCentromeres.noTelomeres.interval_list",
"GvsExtractCallset.scatter_count": 50,
"GvsExtractCallset.scatter_count": 50

"GvsExtractCallset.gatk_override": "gs://broad-dsp-spec-ops/scratch/bigquery-jointcalling/jars/mmt_SA_support_and_wdl_renaming_20210415/gatk-package-4.2.0.0-345-g8a7821a-SNAPSHOT-local.jar"
}
2 changes: 1 addition & 1 deletion scripts/variantstore/wdl/GvsExtractCallset.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ workflow GvsExtractCallset {

String output_file_base_name
String? output_gcs_dir
File? gatk_override
File? gatk_override = "gs://broad-dsp-spec-ops/scratch/bigquery-jointcalling/jars/ah_var_store_20210914/gatk-package-4.2.0.0-406-ga9206a2-SNAPSHOT-local.jar"
Int local_disk_for_extract = 150
}

Expand Down
2 changes: 1 addition & 1 deletion scripts/variantstore/wdl/GvsImportGenomes.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ workflow GvsImportGenomes {
Int batch_size = 1

Int? preemptible_tries
File? gatk_override
File? gatk_override = "gs://broad-dsp-spec-ops/scratch/bigquery-jointcalling/jars/ah_var_store_20210914/gatk-package-4.2.0.0-406-ga9206a2-SNAPSHOT-local.jar"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is there any benefit to making this a constant somewhere so we dont have to update it in multiple places? ( I guess it's only 2, so not really an issue though)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah and I don't know of a good way to have a constant/shared value across two WDLs

String? docker
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,7 @@ protected static VCFHeader generateVcfHeader(Set<String> sampleNames,
VCFConstants.GENOTYPE_QUALITY_KEY
);
headerLines.add(GATKVCFHeaderLines.getFormatLine(GATKVCFConstants.REFERENCE_GENOTYPE_QUALITY));

headerLines.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.EXCESS_HET_KEY));
headerLines.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.AS_VQS_LOD_KEY));
headerLines.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.AS_YNG_STATUS_KEY));

Expand Down Expand Up @@ -222,6 +222,12 @@ protected void onStartup() {
extraHeaderLines.add(new VCFFormatHeaderLine("FT", 1, VCFHeaderLineType.String, "Genotype Filter Field"));
}

if (emitPLs) {
VCFStandardHeaderLines.addStandardFormatLines(extraHeaderLines, true,
VCFConstants.GENOTYPE_PL_KEY
);
}

SampleList sampleList = new SampleList(sampleTableName, sampleFileName, projectID, printDebugInformation, "extract-cohort");
Map<Long, String> sampleIdToName = sampleList.getMap();

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ public class ExtractCohortEngine {
private final ProgressMeter progressMeter;
private final String projectID;
private final CommonCode.ModeEnum mode;
private final boolean emitPLs;

/** List of sample names seen in the variant data from BigQuery. */
private Set<String> sampleNames;
Expand Down Expand Up @@ -110,6 +111,7 @@ public ExtractCohortEngine(final String projectID,
this.sampleIdToName = sampleIdToName;
this.sampleNames = new HashSet<>(sampleIdToName.values());
this.mode = mode;
this.emitPLs = emitPLs;

this.cohortTableRef = cohortTableName == null || "".equals(cohortTableName) ? null :
new TableReference(cohortTableName, emitPLs ? SchemaUtils.COHORT_FIELDS : SchemaUtils.COHORT_FIELDS_NO_PL);
Expand Down Expand Up @@ -667,7 +669,7 @@ private VariantContext createVariantContextFromSampleRecord(final ExtractCohortR
}

final String callPL = sampleRecord.getCallPL();
if ( callPL != null ) {
if ( this.emitPLs && callPL != null ) {
genotypeBuilder.PL(Arrays.stream(callPL.split(SchemaUtils.MULTIVALUE_FIELD_DELIMITER)).mapToInt(Integer::parseInt).toArray());
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ public void testFinalVCFfromAvro() throws Exception {
.add("local-sort-max-records-in-ram", 10000000)
.add("cohort-avro-file-name", cohortAvroFileName)
.add("sample-file", sampleFile)
.add("emit-pls", false);
.add("emit-pls", true);

runCommandLine(args);
IntegrationTestSpec.assertEqualTextFiles(outputVCF, expectedVCF);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,40 +4,19 @@
##FILTER=<ID=LowQual,Description="Low quality">
##FILTER=<ID=NAY,Description="Considered a NAY in the Yay, Nay, Grey table">
##FILTER=<ID=NO_HQ_GENOTYPES,Description="Site has no high quality variant genotypes">
##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=PGT,Number=1,Type=String,Description="Physical phasing haplotype information, describing how the alternate alleles are phased in relation to one another; will always be heterozygous and is not intended to describe called alleles">
##FORMAT=<ID=PID,Number=1,Type=String,Description="Physical phasing ID information, where each unique ID within a given sample (but not across samples) connects records within a phasing group">
##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
##FORMAT=<ID=RGQ,Number=1,Type=Integer,Description="Unconditional reference genotype confidence, encoded as a phred quality -10*log10 p(genotype call is wrong)">
##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed">
##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed">
##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes">
##INFO=<ID=AS_FS,Number=A,Type=Float,Description="allele specific phred-scaled p-value using Fisher's exact test to detect strand bias of each alt allele">
##INFO=<ID=AS_MQ,Number=A,Type=Float,Description="Allele-specific RMS Mapping Quality">
##INFO=<ID=AS_MQRankSum,Number=A,Type=Float,Description="Allele-specific Mapping Quality Rank Sum">
##INFO=<ID=AS_QD,Number=A,Type=Float,Description="Allele-specific Variant Confidence/Quality by Depth">
##INFO=<ID=AS_QUALapprox,Number=1,Type=String,Description="Allele-specific QUAL approximations">
##INFO=<ID=AS_RAW_MQ,Number=1,Type=String,Description="Allele-specfic raw data for RMS Mapping Quality">
##INFO=<ID=AS_RAW_MQRankSum,Number=1,Type=String,Description="Allele-specfic raw data for Mapping Quality Rank Sum">
##INFO=<ID=AS_RAW_ReadPosRankSum,Number=1,Type=String,Description="allele specific raw data for rank sum test of read position bias">
##INFO=<ID=AS_ReadPosRankSum,Number=A,Type=Float,Description="allele specific Z-score from Wilcoxon rank sum test of each Alt vs. Ref read position bias">
##INFO=<ID=AS_SB_TABLE,Number=1,Type=String,Description="Allele-specific forward/reverse read counts for strand bias tests. Includes the reference and alleles separated by |.">
##INFO=<ID=AS_SOR,Number=A,Type=Float,Description="Allele specific strand Odds Ratio of 2x|Alts| contingency table to detect allele specific strand bias">
##INFO=<ID=AS_VQSLOD,Number=A,Type=String,Description="For each alt allele, the log odds of being a true variant versus being false under the trained gaussian mixture model">
##INFO=<ID=AS_VarDP,Number=1,Type=String,Description="Allele-specific (informative) depth over variant genotypes -- including ref, RAW format">
##INFO=<ID=AS_YNG,Number=A,Type=String,Description="For each alt allele, the yay/nay/grey status (yay are known good alleles, nay are known false positives, grey are unknown)">
##INFO=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth; some reads may have been filtered">
##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of the interval">
##INFO=<ID=ExcessHet,Number=1,Type=Float,Description="Phred-scaled p-value for exact test of excess heterozygosity">
##INFO=<ID=MQ,Number=1,Type=Float,Description="RMS Mapping Quality">
##INFO=<ID=QD,Number=1,Type=Float,Description="Variant Confidence/Quality by Depth">
##INFO=<ID=QUALapprox,Number=1,Type=Integer,Description="Sum of PL[0] values; used to approximate the QUAL score">
##INFO=<ID=SB,Number=1,Type=Float,Description="Strand Bias">
##INFO=<ID=SB_TABLE,Number=4,Type=Integer,Description="Forward/reverse read counts for strand bias tests">
##INFO=<ID=VarDP,Number=1,Type=Integer,Description="(informative) depth over variant genotypes">
##contig=<ID=chr1,length=248956422,assembly=38>
##contig=<ID=chr2,length=242193529,assembly=38>
##contig=<ID=chr3,length=198295559,assembly=38>
Expand Down
Binary file not shown.