VAT Performance / Reliability Improvements #7828

Merged · 7 commits · May 5, 2022
11 changes: 11 additions & 0 deletions .dockstore.yml
@@ -143,6 +143,17 @@ workflows:
      branches:
        - master
        - ah_var_store
+  - name: GvsCreateVATAnnotations
+    subclass: WDL
+    primaryDescriptorPath: /scripts/variantstore/wdl/GvsCreateVATAnnotations.wdl
+    testParameterFiles:
+      - /scripts/variantstore/wdl/GvsCreateVATAnnotations.example.inputs.json
+    filters:
+      branches:
+        - master
+        - ah_var_store
+        - vat_speed
+        - vat_speed_kctest
  - name: GvsCreateVATFromAnnotations
    subclass: WDL
    primaryDescriptorPath: /scripts/variantstore/wdl/GvsCreateVATFromAnnotations.wdl
scripts/variantstore/wdl/GvsCreateVATAnnotations.example.inputs.json (new file)
@@ -0,0 +1,12 @@
{
"GvsCreateVATAnnotations.input_vcf_name": "String",
"GvsCreateVATAnnotations.output_path": "String",
"GvsCreateVATAnnotations.custom_annotations_template": "File",
"GvsCreateVATAnnotations.nirvana_data_directory": "File",
"GvsCreateVATAnnotations.input_vcf": "File",
"GvsCreateVATAnnotations.ancestry_mapping_list": "File",
"GvsCreateVATAnnotations.input_vcf_index": "File",
"GvsCreateVATAnnotations.service_account_json_path": "String? (optional)",
"GvsCreateVATAnnotations.ref": "File"
}
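For orientation, a filled-in copy of this template might look like the following (every value below is a hypothetical placeholder, not an input taken from the PR):

{
  "GvsCreateVATAnnotations.input_vcf_name": "cohort_shard_0001",
  "GvsCreateVATAnnotations.output_path": "gs://example-bucket/vat/",
  "GvsCreateVATAnnotations.custom_annotations_template": "gs://example-bucket/custom_annotations_template.tsv",
  "GvsCreateVATAnnotations.nirvana_data_directory": "gs://example-bucket/Nirvana/references.tar.gz",
  "GvsCreateVATAnnotations.input_vcf": "gs://example-bucket/vcfs/cohort_shard_0001.vcf.gz",
  "GvsCreateVATAnnotations.ancestry_mapping_list": "gs://example-bucket/ancestry_mapping.tsv",
  "GvsCreateVATAnnotations.input_vcf_index": "gs://example-bucket/vcfs/cohort_shard_0001.vcf.gz.tbi",
  "GvsCreateVATAnnotations.ref": "gs://example-bucket/Homo_sapiens_assembly38.fasta"
}

The optional service_account_json_path key is simply omitted here; Cromwell leaves an omitted String? input unset.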

71 changes: 46 additions & 25 deletions scripts/variantstore/wdl/GvsCreateVATAnnotations.wdl
@@ -79,7 +79,6 @@ task ExtractAnAcAfFromVCF {
String custom_annotations_file_name = "ac_an_af.tsv"
String local_input_vcf = basename(input_vcf)
String local_input_vcf_index = basename(input_vcf_index)
-String normalized_vcf = "normalized.vcf"
String normalized_vcf_compressed = "normalized.vcf.gz"
String normalized_vcf_indexed = "normalized.vcf.gz.tbi"

@@ -88,13 +87,18 @@
command <<<
set -e

+# custom function to prepend the current datetime to an echo statement
+echo_date () { echo "`date "+%Y/%m/%d %H:%M:%S"` $1"; }
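# usage example: echo_date "VAT: starting" would print a line like "2022/05/05 14:30:00 VAT: starting"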

if [ ~{has_service_account_file} = 'true' ]; then
gsutil cp ~{service_account_json_path} local.service_account.json
export GOOGLE_APPLICATION_CREDENTIALS=local.service_account.json
gcloud auth activate-service-account --key-file=local.service_account.json

fi

echo_date "VAT: Custom localization of inputs"

cp ~{custom_annotations_template} ~{custom_annotations_file_name}

gsutil cp ~{input_vcf} ~{local_input_vcf}
@@ -111,64 +115,81 @@
# "sas"
#]

echo_date "VAT: Convert input to BCF format"
bcftools convert --threads 4 -O b -o original.bcf ~{local_input_vcf}
rm ~{local_input_vcf}
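## note: BCF is the binary counterpart of VCF, so converting once up front spares every later bcftools pass from re-parsing VCF text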

echo_date "VAT: Calculating number of +50 alt alleles on N sites"

## track the dropped variants with >50 alt alleles or N's in the reference (since Nirvana can't handle N as a base, drop them for now)
-bcftools view -i 'N_ALT>50 || REF~"N"' ~{local_input_vcf} | bcftools query -f '%CHROM\t%POS\t%ID\t%REF\t%ALT\n' > track_dropped.tsv
+bcftools view --threads 4 -i 'N_ALT>50 || REF~"N"' -O u original.bcf | bcftools query -f '%CHROM\t%POS\t%ID\t%REF\t%ALT\n' > track_dropped.tsv

wc -l track_dropped.tsv

-## filter out sites with too many alt alleles
-bcftools view -e 'N_ALT>50 || REF~"N"' --no-update ~{local_input_vcf} -Ou | \
+echo_date "VAT: filter out sites with too many alt alleles"
+bcftools view --threads 4 -e 'N_ALT>50 || REF~"N"' --no-update original.bcf -O u | \
## filter out the non-passing sites
-bcftools view -f 'PASS,.' --no-update -Oz -o filtered.vcf.gz
-## normalize, left align and split multi allelic sites to new lines, remove duplicate lines
-bcftools norm -m- --check-ref w -f Homo_sapiens_assembly38.fasta filtered.vcf.gz -Oz -o normalized.vcf.gz
-rm ~{local_input_vcf}
-## filter out spanning deletions and variants with an AC of 0
-bcftools view -e 'ALT[0]="*" || AC=0' --no-update normalized.vcf.gz -Ou | \
+bcftools view --threads 4 -f 'PASS,.' --no-update -O b -o filtered.bcf

echo_date "VAT: normalize, left align and split multi allelic sites to new lines, remove duplicate lines"
bcftools norm --threads 4 -m- --check-ref w -f Homo_sapiens_assembly38.fasta filtered.bcf -O b -o normalized.bcf
rm filtered.bcf

echo_date "VAT: filter out spanning deletions and variants with an AC of 0, respect the FT flag"
bcftools view --threads 4 -e 'ALT[0]="*" || AC=0' --no-update normalized.bcf -O u | \
## ensure that we respect the FT tag
bcftools filter -i "FORMAT/FT='PASS,.'" --set-GTs . -Oz -o ~{normalized_vcf}
bcftools filter --threads 4 -i "FORMAT/FT='PASS,.'" --set-GTs . -O b -o normalized.filtered.bcf

## clean up unneeded file
-rm normalized.vcf.gz
+rm normalized.bcf

echo_date "VAT: detecting and removing duplicate rows"

## During normalization, sometimes duplicate variants appear but with different calculations. This seems to be a bug in bcftools. For now we are dropping all duplicate variants
-## to locate the duplicates, we first make a file of just the first 5 columns
-bcftools query ~{normalized_vcf} -f '%CHROM\t%POS\t%ID\t%REF\t%ALT\n' > check_duplicates.tsv
-## check it for duplicates and put them in a new file
-sort check_duplicates.tsv | uniq -d | cut -f1,2,3,4,5 > duplicates.tsv
-rm check_duplicates.tsv ## clean up
-## remove those rows (that match up to the first 5 cols)
-zgrep -v -wFf duplicates.tsv ~{normalized_vcf} > deduplicated.vcf.gz
-rm ~{normalized_vcf} ## clean up
+bcftools query normalized.filtered.bcf -f '%CHROM\t%POS\t%ID\t%REF\t%ALT\n' | sort | uniq -d | cut -f1,2,3,4,5 > duplicates.tsv
Collaborator:
I think this can be simplified to:
grep -v '^#' normalized.filtered.bcf | cut -f 1-5 | sort | uniq -d > duplicates.tsv

Contributor:
And actually we don't need the ID column either, so we could do it with cut -f 1,2,4,5.


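A minimal sketch combining the two suggestions (illustrative only; since BCF is a binary format, it would first need to be decoded with bcftools view before grep and cut can read it as text):

bcftools view normalized.filtered.bcf | grep -v '^#' | cut -f 1,2,4,5 | sort | uniq -d > duplicates.tsv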
+# If there ARE dups to remove
+if [ -s duplicates.tsv ]; then
+## remove those rows (that match up to the first 5 cols)
+bcftools view --threads 4 normalized.filtered.bcf | grep -v -wFf duplicates.tsv | bcftools view --threads 4 -O b -o deduplicated.bcf
+else
+# There are no duplicates to remove
+cp normalized.filtered.bcf deduplicated.bcf
+fi
+rm normalized.filtered.bcf

## add duplicates to the file that's tracking dropped variants
cat duplicates.tsv >> track_dropped.tsv
rm duplicates.tsv ## clean up unneeded file

-## calculate annotations for all subpopulations
+echo_date "VAT: calculate annotations for all subpopulations"
## AC_het,AC_hom and AC_Hemi are used to calculate the participant count
-bcftools plugin fill-tags -- deduplicated.vcf.gz -S ~{subpopulation_sample_list} -t AC,AF,AN,AC_het,AC_hom,AC_Hemi | bcftools query -f \
+bcftools plugin fill-tags --threads 4 -- deduplicated.bcf -S ~{subpopulation_sample_list} -t AC,AF,AN,AC_het,AC_hom,AC_Hemi | bcftools query -f \
'%CHROM\t%POS\t%REF\t%ALT\t%AC\t%AN\t%AF\t%AC_Hom\t%AC_Het\t%AC_Hemi\t%AC_afr\t%AN_afr\t%AF_afr\t%AC_Hom_afr\t%AC_Het_afr\t%AC_Hemi_afr\t%AC_amr\t%AN_amr\t%AF_amr\t%AC_Hom_amr\t%AC_Het_amr\t%AC_Hemi_amr\t%AC_eas\t%AN_eas\t%AF_eas\t%AC_Hom_eas\t%AC_Het_eas\t%AC_Hemi_eas\t%AC_eur\t%AN_eur\t%AF_eur\t%AC_Hom_eur\t%AC_Het_eur\t%AC_Hemi_eur\t%AC_mid\t%AN_mid\t%AF_mid\t%AC_Hom_mid\t%AC_Het_mid\t%AC_Hemi_mid\t%AC_oth\t%AN_oth\t%AF_oth\t%AC_Hom_oth\t%AC_Het_oth\t%AC_Hemi_oth\t%AC_sas\t%AN_sas\t%AF_sas\t%AC_Hom_sas\t%AC_Het_sas\t%AC_Hemi_sas\n' \
>> ~{custom_annotations_file_name}
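## note: given the sample-to-subpopulation mapping passed via -S, the fill-tags plugin emits per-subpopulation tags (AC_afr, AN_afr, AF_afr, ...), which the query format string above flattens into the TSV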

## for validation of the pipeline
wc -l ~{custom_annotations_file_name} | awk '{print $1 -7}' > count.txt
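## (the -7 presumably accounts for the header lines contributed by the custom annotations template)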

echo_date "VAT: produce sites-only VCf"

## compress the vcf and index it, make it sites-only for the next step
-bcftools view --no-update --drop-genotypes deduplicated.vcf.gz -Oz -o ~{normalized_vcf_compressed}
+bcftools view --threads 4 --no-update --drop-genotypes deduplicated.bcf -O z -o ~{normalized_vcf_compressed}
## if we can spare the IO and want to pass a smaller file we can also drop the info field w bcftools annotate -x INFO
bcftools index --tbi ~{normalized_vcf_compressed}

echo_date "VAT: finished"
>>>
# ------------------------------------------------
# Runtime settings:
runtime {
docker: "us.gcr.io/broad-dsde-methods/variantstore:ah_var_store_20211101"
maxRetries: 3
memory: "32 GB"
memory: "16 GB"
preemptible: 3
cpu: "4"
disks: "local-disk 250 HDD"
disks: "local-disk 500 HDD"
}
# ------------------------------------------------
# Outputs: