Skip to content

Commit

Permalink
Moving the WDL for importing array manifest to BQ (#6860)
Browse files Browse the repository at this point in the history
* Copying wdl from variantstore repo

* Adding tests and changes to WDL

* addressing comments

* adding readme
  • Loading branch information
meganshand authored and Marianie-Simeon committed Feb 16, 2021
1 parent 5a34197 commit 3a930f2
Show file tree
Hide file tree
Showing 9 changed files with 327 additions and 2 deletions.
5 changes: 3 additions & 2 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ env:
- SCALA_VERSION=2.11 RUN_CNV_SOMATIC_WDL=true TESTS_REQUIRE_GCLOUD=true
- SCALA_VERSION=2.11 RUN_M2_WDL=true TESTS_REQUIRE_GCLOUD=true
- SCALA_VERSION=2.11 RUN_CNN_WDL=true TESTS_REQUIRE_GCLOUD=true
- SCALA_VERSION=2.11 RUN_VARIANTSTORE_WDL=true TESTS_REQUIRE_GCLOUD=true
- SCALA_VERSION=2.11 TEST_TYPE=wdlGen
global:
#gradle needs this
Expand Down Expand Up @@ -91,7 +92,7 @@ before_install:
echo "Done testing github authentication";
fi;
# Download Cromwell jar -- if you change the version, please change the CROMWELL_JAR env variable above, too.
- if [[ $TEST_TYPE == wdlGen || $RUN_CNV_GERMLINE_COHORT_WDL == true || $RUN_CNV_GERMLINE_CASE_WDL == true || $RUN_CNV_SOMATIC_WDL == true || $RUN_M2_WDL == true || $RUN_CNN_WDL == true ]]; then
- if [[ $TEST_TYPE == wdlGen || $RUN_CNV_GERMLINE_COHORT_WDL == true || $RUN_CNV_GERMLINE_CASE_WDL == true || $RUN_CNV_SOMATIC_WDL == true || $RUN_M2_WDL == true || $RUN_CNN_WDL == true || $RUN_VARIANTSTORE_WDL == true ]]; then
wget -O $CROMWELL_JAR https://github.com/broadinstitute/cromwell/releases/download/51/cromwell-51.jar;
wget -O $WOMTOOL_JAR https://github.com/broadinstitute/cromwell/releases/download/51/womtool-51.jar;
fi;
Expand All @@ -112,7 +113,7 @@ install:
else
./gradlew assemble;
./gradlew installDist;
if [[ $RUN_CNV_GERMLINE_COHORT_WDL == true || $RUN_CNV_GERMLINE_CASE_WDL == true || $RUN_CNV_SOMATIC_WDL == true || $RUN_M2_WDL == true || $RUN_CNN_WDL == true ]]; then
if [[ $RUN_CNV_GERMLINE_COHORT_WDL == true || $RUN_CNV_GERMLINE_CASE_WDL == true || $RUN_CNV_SOMATIC_WDL == true || $RUN_M2_WDL == true || $RUN_CNN_WDL == true || $RUN_VARIANTSTORE_WDL == true ]]; then
echo "building a shadow jar for the wdl";
./gradlew shadowJar;
elif [[ $TEST_TYPE == cloud ]]; then
Expand Down
2 changes: 2 additions & 0 deletions scripts/variantstore_cromwell_tests/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
The `run_variantstore_wdl.sh` script tests the WDLs in the `gatk/scripts/variantstore_wdl` directory. Currently the test
only checks that the WDL runs without failing, and does not check the results.
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
{
"ImportArrayManifest.extended_manifest_csv":"/home/travis/build/broadinstitute/gatk/src/test/resources/org/broadinstitute/hellbender/tools/variantdb/arrays/tiny_manifest.csv",
"ImportArrayManifest.manifest_schema_json":"/home/travis/build/broadinstitute/gatk/scripts/variantstore_wdl/schemas/manifest_schema.json",
"ImportArrayManifest.project_id":"broad-dsde-dev",
"ImportArrayManifest.dataset_name":"temp_tables",
"ImportArrayManifest.table_name": "__TABLE_NAME__",
"ImportArrayManifest.LoadManifest.for_testing_only": "gcloud auth activate-service-account --key-file $GOOGLE_APPLICATION_CREDENTIALS"
}
48 changes: 48 additions & 0 deletions scripts/variantstore_cromwell_tests/local-with-gcs.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# Cromwell configuration template used by run_variantstore_wdl.sh.
# The script writes a copy of this file (set_up.conf) in which the __SERVICE_ACCOUNT__
# placeholder below is replaced by the value of $GOOGLE_APPLICATION_CREDENTIALS.
include required(classpath("application"))

backend {
# Override the default backend.
default = "LocalExample2"

providers {
LocalExample2 {
# The actor that runs the backend. In this case, it's the Shared File System (SFS) ConfigBackend.
actor-factory = "cromwell.backend.impl.sfs.config.ConfigBackendLifecycleActorFactory"

# The backend custom configuration.
config {

run-in-background = true

# The list of possible runtime custom attributes.
runtime-attributes = """
String? docker
String? docker_user
"""

# Submit string when there is no "docker" runtime attribute.
submit = "/usr/bin/env bash ${script}"

# Submit string when there is a "docker" runtime attribute.
# Besides the task working directory, the service account key file
# (__SERVICE_ACCOUNT__, filled in by the test script) is mounted at ${docker_cwd}/sa.json,
# and GOOGLE_APPLICATION_CREDENTIALS inside the container is pointed at that mount.
submit-docker = """
docker run \
--rm -i \
${"--user " + docker_user} \
--entrypoint ${job_shell} \
-e GOOGLE_APPLICATION_CREDENTIALS=${docker_cwd}/sa.json \
-v ${cwd}:${docker_cwd} \
-v __SERVICE_ACCOUNT__:${docker_cwd}/sa.json \
${docker} ${docker_script}
"""

# The defaults for runtime attributes if not provided.
default-runtime-attributes {
failOnStderr: false
continueOnReturnCode: 0
}
}
}
}
}


39 changes: 39 additions & 0 deletions scripts/variantstore_cromwell_tests/run_variantstore_wdl.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
#!/usr/bin/env bash
#
# Runs the VariantStore ImportArrayManifest WDL through Cromwell on Travis.
#
# Steps:
#   1. Build the GATK docker image for the commit under test.
#   2. Fill in the placeholders of the test inputs JSON (__GATK_DOCKER__,
#      __TABLE_NAME__) and of the Cromwell config (__SERVICE_ACCOUNT__).
#   3. Run ImportArrayManifest.wdl with Cromwell using the generated files.
#
# Environment:
#   TRAVIS_PULL_REQUEST            - anything other than "false" is treated as a
#                                    pull-request build and passed to
#                                    build_docker.sh via -t; unset means "false"
#   TRAVIS_COMMIT                  - hash to build for non pull-request builds
#   GOOGLE_APPLICATION_CREDENTIALS - path substituted for __SERVICE_ACCOUNT__
#   CROMWELL_JAR                   - Cromwell jar passed to `java -jar`

set -e

# Fail fast with a clear message instead of producing a broken config or a
# confusing java error much later (after the docker build).
: "${GOOGLE_APPLICATION_CREDENTIALS:?GOOGLE_APPLICATION_CREDENTIALS must be set}"
: "${CROMWELL_JAR:?CROMWELL_JAR must be set}"

# cd into the directory of the script in order to use relative paths
script_path=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd -P)
cd "$script_path"

readonly WORKING_DIR=/home/travis/build/broadinstitute
# Random value substituted for __TABLE_NAME__ in the inputs JSON, with every
# '-' of the kernel-generated UUID replaced by '_'.
UUID=$(sed 's/-/_/g' /proc/sys/kernel/random/uuid)

echo "Building docker image for VariantStore WDL tests (skipping unit tests)..."

#assume Dockerfile is in root
echo "Building docker without running unit tests... ========="
cd "$WORKING_DIR/gatk"
# IMPORTANT: This code is duplicated in the cnv WDL test.
if [[ "${TRAVIS_PULL_REQUEST:-false}" != false ]]; then
  HASH_TO_USE=FETCH_HEAD
  sudo bash build_docker.sh -e "${HASH_TO_USE}" -s -u -d "$PWD/temp_staging/" -t "${TRAVIS_PULL_REQUEST}"
else
  HASH_TO_USE=${TRAVIS_COMMIT:?TRAVIS_COMMIT must be set for non pull-request builds}
  sudo bash build_docker.sh -e "${HASH_TO_USE}" -s -u -d "$PWD/temp_staging/"
fi
echo "Docker build done =========="
echo "Putting the newly built docker image into the json parameters"
cd "$WORKING_DIR/gatk/scripts/"
sed -r "s/__GATK_DOCKER__/broadinstitute\/gatk\:$HASH_TO_USE/g" \
  variantstore_cromwell_tests/import_array_manifest_test.json \
  > "$WORKING_DIR/import_array_manifest_test_tmp.json"
sed -r "s/__TABLE_NAME__/$UUID/g" \
  "$WORKING_DIR/import_array_manifest_test_tmp.json" \
  > "$WORKING_DIR/import_array_manifest_test_mod.json"
echo "JSON FILE (modified) ======="
cat "$WORKING_DIR/import_array_manifest_test_mod.json"

# '|' is the sed delimiter here because the replacement is a file path.
sed -r "s|__SERVICE_ACCOUNT__|$GOOGLE_APPLICATION_CREDENTIALS|g" \
  variantstore_cromwell_tests/local-with-gcs.conf \
  > "$WORKING_DIR/set_up.conf"
echo "Wrote $WORKING_DIR/set_up.conf (local-with-gcs.conf with the service account filled in)"

echo "Running ImportArrayManifest WDL through cromwell"
# With a single operand, ln creates the link in the current directory (scripts/).
ln -fs "$WORKING_DIR/gatk/scripts/variantstore_wdl/ImportArrayManifest.wdl"
sudo java -Dconfig.file="$WORKING_DIR/set_up.conf" -jar "$CROMWELL_JAR" run \
  "$WORKING_DIR/gatk/scripts/variantstore_wdl/ImportArrayManifest.wdl" \
  -i "$WORKING_DIR/import_array_manifest_test_mod.json" \
  -m "$WORKING_DIR/test_import_manifest_wdl.metadata"
150 changes: 150 additions & 0 deletions scripts/variantstore_wdl/ImportArrayManifest.wdl
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
version 1.0

# Converts an extended array manifest CSV into a CSV for ingest (CreateManifestCsv)
# and loads that CSV into a BigQuery table (LoadManifest).
workflow ImportArrayManifest {

input {
# Extended manifest CSV to convert.
File extended_manifest_csv
# Schema file handed to `bq mk` / `bq load` in LoadManifest.
File manifest_schema_json
# Project and dataset passed to the bq commands in LoadManifest.
String project_id
String dataset_name
# Optional table name; LoadManifest falls back to "probe_info" when it is not supplied.
String? table_name

# Optional preemptible attempts; both tasks fall back to 5 when it is not supplied.
Int? preemptible_tries
# Optional docker image; see docker_final below for the default.
String? docker
}

# Image used by both tasks: the caller's `docker` if given, otherwise the pinned GATK image.
String docker_final = select_first([docker, "us.gcr.io/broad-gatk/gatk:4.1.7.0"])

call CreateManifestCsv {
input:
extended_manifest_csv = extended_manifest_csv,
preemptible_tries = preemptible_tries,
docker = docker_final
}

# LoadManifest.for_testing_only is not wired here; it can only be set through the inputs JSON.
call LoadManifest {
input:
project_id = project_id,
dataset_name = dataset_name,
table_name = table_name,
manifest_schema_json = manifest_schema_json,
manifest_csv = CreateManifestCsv.manifest_csv,
preemptible_tries = preemptible_tries,
docker = docker_final
}
# manifest_csv is the file that was loaded; the other three are the intermediate
# files written by CreateManifestCsv (sorted, substituted and processed).
output {
File manifest_csv = CreateManifestCsv.manifest_csv
File manifest_ingest_csv = CreateManifestCsv.manifest_ingest_csv
File manifest_sub_csv = CreateManifestCsv.manifest_sub_csv
File manifest_proc_csv = CreateManifestCsv.manifest_proc_csv
}
}

# Loads manifest_csv into the BigQuery table <dataset_name>.<table_name>
# (table name defaults to "probe_info"), first creating the dataset and/or the
# table when the `bq ls` / `bq show` probes exit non-zero.
task LoadManifest {
input {
String project_id
String dataset_name
String? table_name
File manifest_csv
File manifest_schema_json
# runtime
Int? preemptible_tries
String docker
# String to add command for testing only. Can be ignored otherwise.
# It is inserted verbatim as the first command of the script; the Travis test
# inputs use it to run `gcloud auth activate-service-account`.
String? for_testing_only
}

# Fully qualified "<dataset>.<table>" identifier used by the bq commands below.
String ingest_table = dataset_name + "." + select_first([table_name, "probe_info"])

parameter_meta {
manifest_schema_json: {
localization_optional: false
}
}

# Command outline:
#   - `set +e` so that a non-zero exit from the existence probes (and from
#     for_testing_only) does not abort the script; each probe's `$?` is checked instead.
#   - `set -e` is restored before `bq load`, so a failed load fails the task.
#   - `--null_marker "null"` matches the literal "null" that CreateManifestCsv
#     writes in the flag column for PASS rows.
# NOTE(review): the inline comment in the command mentions a "site info table" and a
# TSV header, but the file loaded here is the manifest CSV - looks stale, confirm.
command <<<
set +e
~{for_testing_only}
bq ls --project_id ~{project_id} ~{dataset_name} > /dev/null
if [ $? -ne 0 ]; then
echo "making dataset ~{project_id}.~{dataset_name}"
bq mk --project_id=~{project_id} ~{dataset_name}
fi
bq show --project_id ~{project_id} ~{ingest_table} > /dev/null
if [ $? -ne 0 ]; then
echo "making table ~{ingest_table}"
# create a site info table and load - schema and TSV header need to be the same order
bq --location=US mk --project_id=~{project_id} ~{ingest_table} ~{manifest_schema_json}
fi
set -e

bq load --location=US --project_id=~{project_id} --null_marker "null" --source_format=CSV ~{ingest_table} ~{manifest_csv} ~{manifest_schema_json}
>>>
runtime {
docker: docker
memory: "4 GB"
disks: "local-disk " + 20 + " HDD"
preemptible: select_first([preemptible_tries, 5])
cpu: 2
}

}

# Turns an extended array manifest CSV into the CSV that LoadManifest ingests.
#
# Rows are sorted by fields 23, 24 and 2 of the manifest, filtered, and numbered
# in that order. Each output row is:
#   <running id>,$2,$9,$23,$24,$25,$26,$27,<flag>
# where <flag> is field 29, with "PASS" written as the literal string "null".
# All intermediate files are exposed as outputs alongside the final CSV.
task CreateManifestCsv {
  input {
    File extended_manifest_csv

    # runtime
    Int? preemptible_tries
    String docker
  }

  # 2.5x the input size plus 20 GB of headroom for the intermediate files.
  Int disk_size = ceil(size(extended_manifest_csv, "GB") * 2.5) + 20

  meta {
    description: "Creates a csv file for import into BigQuery"
  }
  parameter_meta {
    extended_manifest_csv: {
      localization_optional: false
    }
  }
  command <<<
    set -e

    TMP_SORTED="manifest_ingest_sorted.csv"
    TMP_SUB="manifest_ingest_sub.csv"
    TMP_PROC="manifest_ingest_processed.csv"
    TMP="manifest_ingest.csv"

    # put integers in front of the chromosomes that are not numbered so that they end up ordered by X, Y and MT
    sed 's/,X,/,23X,/g; s/,Y,/,24Y,/g; s/,MT,/,25MT,/g' "~{extended_manifest_csv}" > "$TMP_SUB"

    # sort the probes by chrom, position and then name so there is a specific ordering when we assign integers
    sort -t , -k23n,23 -k24n,24 -k2,2 "$TMP_SUB" > "$TMP_SORTED"

    # checking for != "build37Flag" skips the header row (we don't want that numbered)
    # only process rows with 29 fields - this skips some header info fields
    # also skip entries that are flagged, not matched or have an indel conflict
    awk -F ',' 'NF==29 && ($29!="ILLUMINA_FLAGGED" && $29!="INDEL_NOT_MATCHED" && $29!="INDEL_CONFLICT" && $29!="build37Flag") { flag=$29; if ($29=="PASS") flag="null"; print ++id","$2","$9","$23","$24","$25","$26","$27","flag }' "$TMP_SORTED" > "$TMP_PROC"

    # remove the integer prefixes for chromosomes X, Y and MT
    sed 's/,23X,/,X,/g; s/,24Y,/,Y,/g; s/,25MT,/,MT,/g' "$TMP_PROC" > "$TMP"

    echo "created file for ingest $TMP"
  >>>
  runtime {
    docker: docker
    memory: "4 GB"
    disks: "local-disk " + disk_size + " HDD"
    preemptible: select_first([preemptible_tries, 5])
    cpu: 2
  }
  output {
    # Final file for ingest, followed by the intermediate files in the order they are produced.
    File manifest_csv = "manifest_ingest.csv"
    File manifest_ingest_csv = "manifest_ingest_sorted.csv"
    File manifest_sub_csv = "manifest_ingest_sub.csv"
    File manifest_proc_csv = "manifest_ingest_processed.csv"
  }
}
3 changes: 3 additions & 0 deletions scripts/variantstore_wdl/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
This directory has the WDLs for generating and interacting with a variant store in BigQuery. It currently contains
`ImportArrayManifest.wdl` which uploads the `probe_info` table from a Manifest file. Other WDLs for ingesting the array
data, calculating metrics, and extracting array data are coming soon.
56 changes: 56 additions & 0 deletions scripts/variantstore_wdl/schemas/manifest_schema.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
[
{
"description": "[DESCRIPTION]",
"name": "ProbeId",
"type": "Integer",
"mode": "Required"
},
{
"description": "[DESCRIPTION]",
"name": "Name",
"type": "String",
"mode": "Required"
},
{
"description": "[DESCRIPTION]",
"name": "GenomeBuild",
"type": "Integer",
"mode": "Required"
},
{
"description": "[DESCRIPTION]",
"name": "Chr",
"type": "String",
"mode": "Nullable"
},
{
"description": "[DESCRIPTION]",
"name": "Position",
"type": "Integer",
"mode": "Nullable"
},
{
"description": "[DESCRIPTION]",
"name": "Ref",
"type": "String",
"mode": "Nullable"
},
{
"description": "[DESCRIPTION]",
"name": "AlleleA",
"type": "String",
"mode": "Nullable"
},
{
"description": "[DESCRIPTION]",
"name": "AlleleB",
"type": "String",
"mode": "Nullable"
},
{
"description": "[DESCRIPTION]",
"name": "build37Flag",
"type": "String",
"mode": "Nullable"
}
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
Illumina, Inc.
[Heading],
Descriptor File Name,name.bpm
Assay Format,Infinium LCG
Date Manufactured,09/28/2020
CreateExtendedIlluminaManifest.version,1.5
Target Build,37
Target Reference File,/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta
Cluster File,/files/ClusterFile.egt
dbSNP File,/seq/references/Homo_sapiens_assembly19/v1/variant_calling/vqsr_resources/WGS/v2/dbsnp_138.b37.vcf.gz
Supported Build,36
Supported Reference File,/seq/references/Homo_sapiens_assembly19/v1/arrays/human_b36_both.fasta
Supported Chain File,/seq/references/Homo_sapiens_assembly19/v1/arrays/b36ToHg19.broad.over.chain
Loci Count ,2
[Assay],
IlmnID,Name,IlmnStrand,SNP,AddressA_ID,AlleleA_ProbeSeq,AddressB_ID,AlleleB_ProbeSeq,GenomeBuild,Chr,MapInfo,Ploidy,Species,Source,SourceVersion,SourceStrand,SourceSeq,TopGenomicSeq,BeadSetID,Exp_Clusters,RefStrand,Intensity_Only,build37Chr,build37Pos,build37RefAllele,build37AlleleA,build37AlleleB,build37Rsid,build37Flag
1:5700115-A-T-0_T_R_0000000000,1:5700115-A-T,TOP,[T/A],000000000,TCAATCTATATTACCCACCACAGAGCAGAAGGCATACCACATGATTTCTG,,,37,1,5700115,diploid,Homo sapiens,PAGE,0,BOT,tcaatctatattacccaccacagagcagaaggcataccacatgatttctgatttctgct[T/A]tcaatctatattacccaccacagagcagaaggcataccacatgatttctgatttctgct,tcaatctatattacccaccacagagcagaaggcataccacatgatttctgatttctgct[A/T]tcaatctatattacccaccacagagcagaaggcataccacatgatttctgatttctgct,0000,3,-,0,1,5700115,A,A,T,,PASS
1:5700116-C-G-0_T_R_0000000000,1:5700116-C-G,TOP,[C/G],000000000,TCAATCTATATTACCCACCACAGAGCAGAAGGCATACCACATGATTTCTG,,,37,1,5700116,diploid,Homo sapiens,PAGE,0,BOT,tcaatctatattacccaccacagagcagaaggcataccacatgatttctgatttctgct[G/C]tcaatctatattacccaccacagagcagaaggcataccacatgatttctgatttctgct,tcaatctatattacccaccacagagcagaaggcataccacatgatttctgatttctgct[C/G]tcaatctatattacccaccacagagcagaaggcataccacatgatttctgatttctgct,0000,3,-,0,1,5700116,C,C,G,,PASS

0 comments on commit 3a930f2

Please sign in to comment.