Skip to content

Commit

Permalink
Update GvsExtractCallset.example.inputs.json (#7469)
Browse files Browse the repository at this point in the history
* updated default override jar
* fixed bugs in removed headers, processing Avro PLs and unit tests
  • Loading branch information
kcibul authored Sep 15, 2021
1 parent a9206a2 commit 7b3eb29
Show file tree
Hide file tree
Showing 8 changed files with 14 additions and 28 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
"GvsExtractCallset.reference_index": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.fai",
"GvsExtractCallset.reference_dict": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dict",
"GvsExtractCallset.wgs_intervals": "gs://gcp-public-data--broad-references/hg38/v0/wgs_calling_regions.hg38.noCentromeres.noTelomeres.interval_list",
"GvsExtractCallset.scatter_count": 50,
"GvsExtractCallset.scatter_count": 50

"GvsExtractCallset.gatk_override": "gs://broad-dsp-spec-ops/scratch/bigquery-jointcalling/jars/mmt_SA_support_and_wdl_renaming_20210415/gatk-package-4.2.0.0-345-g8a7821a-SNAPSHOT-local.jar"
}
2 changes: 1 addition & 1 deletion scripts/variantstore/wdl/GvsExtractCallset.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ workflow GvsExtractCallset {

String output_file_base_name
String? output_gcs_dir
File? gatk_override
File? gatk_override = "gs://broad-dsp-spec-ops/scratch/bigquery-jointcalling/jars/ah_var_store_20210914/gatk-package-4.2.0.0-406-ga9206a2-SNAPSHOT-local.jar"
Int local_disk_for_extract = 150
}

Expand Down
2 changes: 1 addition & 1 deletion scripts/variantstore/wdl/GvsImportGenomes.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ workflow GvsImportGenomes {
Int batch_size = 1

Int? preemptible_tries
File? gatk_override
File? gatk_override = "gs://broad-dsp-spec-ops/scratch/bigquery-jointcalling/jars/ah_var_store_20210914/gatk-package-4.2.0.0-406-ga9206a2-SNAPSHOT-local.jar"
String? docker
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,7 @@ protected static VCFHeader generateVcfHeader(Set<String> sampleNames,
VCFConstants.GENOTYPE_QUALITY_KEY
);
headerLines.add(GATKVCFHeaderLines.getFormatLine(GATKVCFConstants.REFERENCE_GENOTYPE_QUALITY));

headerLines.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.EXCESS_HET_KEY));
headerLines.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.AS_VQS_LOD_KEY));
headerLines.add(GATKVCFHeaderLines.getInfoLine(GATKVCFConstants.AS_YNG_STATUS_KEY));

Expand Down Expand Up @@ -222,6 +222,12 @@ protected void onStartup() {
extraHeaderLines.add(new VCFFormatHeaderLine("FT", 1, VCFHeaderLineType.String, "Genotype Filter Field"));
}

if (emitPLs) {
VCFStandardHeaderLines.addStandardFormatLines(extraHeaderLines, true,
VCFConstants.GENOTYPE_PL_KEY
);
}

SampleList sampleList = new SampleList(sampleTableName, sampleFileName, projectID, printDebugInformation, "extract-cohort");
Map<Long, String> sampleIdToName = sampleList.getMap();

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ public class ExtractCohortEngine {
private final ProgressMeter progressMeter;
private final String projectID;
private final CommonCode.ModeEnum mode;
private final boolean emitPLs;

/** List of sample names seen in the variant data from BigQuery. */
private Set<String> sampleNames;
Expand Down Expand Up @@ -110,6 +111,7 @@ public ExtractCohortEngine(final String projectID,
this.sampleIdToName = sampleIdToName;
this.sampleNames = new HashSet<>(sampleIdToName.values());
this.mode = mode;
this.emitPLs = emitPLs;

this.cohortTableRef = cohortTableName == null || "".equals(cohortTableName) ? null :
new TableReference(cohortTableName, emitPLs ? SchemaUtils.COHORT_FIELDS : SchemaUtils.COHORT_FIELDS_NO_PL);
Expand Down Expand Up @@ -667,7 +669,7 @@ private VariantContext createVariantContextFromSampleRecord(final ExtractCohortR
}

final String callPL = sampleRecord.getCallPL();
if ( callPL != null ) {
if ( this.emitPLs && callPL != null ) {
genotypeBuilder.PL(Arrays.stream(callPL.split(SchemaUtils.MULTIVALUE_FIELD_DELIMITER)).mapToInt(Integer::parseInt).toArray());
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ public void testFinalVCFfromAvro() throws Exception {
.add("local-sort-max-records-in-ram", 10000000)
.add("cohort-avro-file-name", cohortAvroFileName)
.add("sample-file", sampleFile)
.add("emit-pls", false);
.add("emit-pls", true);

runCommandLine(args);
IntegrationTestSpec.assertEqualTextFiles(outputVCF, expectedVCF);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,40 +4,19 @@
##FILTER=<ID=LowQual,Description="Low quality">
##FILTER=<ID=NAY,Description="Considered a NAY in the Yay, Nay, Grey table">
##FILTER=<ID=NO_HQ_GENOTYPES,Description="Site has no high quality variant genotypes">
##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=PGT,Number=1,Type=String,Description="Physical phasing haplotype information, describing how the alternate alleles are phased in relation to one another; will always be heterozygous and is not intended to describe called alleles">
##FORMAT=<ID=PID,Number=1,Type=String,Description="Physical phasing ID information, where each unique ID within a given sample (but not across samples) connects records within a phasing group">
##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
##FORMAT=<ID=RGQ,Number=1,Type=Integer,Description="Unconditional reference genotype confidence, encoded as a phred quality -10*log10 p(genotype call is wrong)">
##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed">
##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed">
##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes">
##INFO=<ID=AS_FS,Number=A,Type=Float,Description="allele specific phred-scaled p-value using Fisher's exact test to detect strand bias of each alt allele">
##INFO=<ID=AS_MQ,Number=A,Type=Float,Description="Allele-specific RMS Mapping Quality">
##INFO=<ID=AS_MQRankSum,Number=A,Type=Float,Description="Allele-specific Mapping Quality Rank Sum">
##INFO=<ID=AS_QD,Number=A,Type=Float,Description="Allele-specific Variant Confidence/Quality by Depth">
##INFO=<ID=AS_QUALapprox,Number=1,Type=String,Description="Allele-specific QUAL approximations">
##INFO=<ID=AS_RAW_MQ,Number=1,Type=String,Description="Allele-specfic raw data for RMS Mapping Quality">
##INFO=<ID=AS_RAW_MQRankSum,Number=1,Type=String,Description="Allele-specfic raw data for Mapping Quality Rank Sum">
##INFO=<ID=AS_RAW_ReadPosRankSum,Number=1,Type=String,Description="allele specific raw data for rank sum test of read position bias">
##INFO=<ID=AS_ReadPosRankSum,Number=A,Type=Float,Description="allele specific Z-score from Wilcoxon rank sum test of each Alt vs. Ref read position bias">
##INFO=<ID=AS_SB_TABLE,Number=1,Type=String,Description="Allele-specific forward/reverse read counts for strand bias tests. Includes the reference and alleles separated by |.">
##INFO=<ID=AS_SOR,Number=A,Type=Float,Description="Allele specific strand Odds Ratio of 2x|Alts| contingency table to detect allele specific strand bias">
##INFO=<ID=AS_VQSLOD,Number=A,Type=String,Description="For each alt allele, the log odds of being a true variant versus being false under the trained gaussian mixture model">
##INFO=<ID=AS_VarDP,Number=1,Type=String,Description="Allele-specific (informative) depth over variant genotypes -- including ref, RAW format">
##INFO=<ID=AS_YNG,Number=A,Type=String,Description="For each alt allele, the yay/nay/grey status (yay are known good alleles, nay are known false positives, grey are unknown)">
##INFO=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth; some reads may have been filtered">
##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of the interval">
##INFO=<ID=ExcessHet,Number=1,Type=Float,Description="Phred-scaled p-value for exact test of excess heterozygosity">
##INFO=<ID=MQ,Number=1,Type=Float,Description="RMS Mapping Quality">
##INFO=<ID=QD,Number=1,Type=Float,Description="Variant Confidence/Quality by Depth">
##INFO=<ID=QUALapprox,Number=1,Type=Integer,Description="Sum of PL[0] values; used to approximate the QUAL score">
##INFO=<ID=SB,Number=1,Type=Float,Description="Strand Bias">
##INFO=<ID=SB_TABLE,Number=4,Type=Integer,Description="Forward/reverse read counts for strand bias tests">
##INFO=<ID=VarDP,Number=1,Type=Integer,Description="(informative) depth over variant genotypes">
##contig=<ID=chr1,length=248956422,assembly=38>
##contig=<ID=chr2,length=242193529,assembly=38>
##contig=<ID=chr3,length=198295559,assembly=38>
Expand Down
Binary file not shown.

0 comments on commit 7b3eb29

Please sign in to comment.