-
Notifications
You must be signed in to change notification settings - Fork 590
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Moving the WDL for importing array manifest to BQ (#6860)
* Copying wdl from variantstore repo * Adding tests and changes to WDL * addressing comments * adding readme
- Loading branch information
1 parent
5a34197
commit 3a930f2
Showing
9 changed files
with
327 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
The `run_variantstore_wdl.sh` script tests the WDLs in the `gatk/scripts/variantstore_wdl` directory. Currently the test | ||
only checks that the WDL runs without failing; it does not check the results. |
8 changes: 8 additions & 0 deletions
8
scripts/variantstore_cromwell_tests/import_array_manifest_test.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
{ | ||
"ImportArrayManifest.extended_manifest_csv":"/home/travis/build/broadinstitute/gatk/src/test/resources/org/broadinstitute/hellbender/tools/variantdb/arrays/tiny_manifest.csv", | ||
"ImportArrayManifest.manifest_schema_json":"/home/travis/build/broadinstitute/gatk/scripts/variantstore_wdl/schemas/manifest_schema.json", | ||
"ImportArrayManifest.project_id":"broad-dsde-dev", | ||
"ImportArrayManifest.dataset_name":"temp_tables", | ||
"ImportArrayManifest.table_name": "__TABLE_NAME__", | ||
"ImportArrayManifest.LoadManifest.for_testing_only": "gcloud auth activate-service-account --key-file $GOOGLE_APPLICATION_CREDENTIALS" | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
# Cromwell configuration for the variantstore WDL tests: a shared-file-system backend that runs each task | ||
# locally (optionally inside docker) and mounts a service-account key file into the container. | ||
include required(classpath("application")) | ||
|
||
backend { | ||
# Override the default backend. | ||
default = "LocalExample2" | ||
|
||
providers { | ||
LocalExample2 { | ||
# The actor that runs the backend. In this case, it's the Shared File System (SFS) ConfigBackend. | ||
actor-factory = "cromwell.backend.impl.sfs.config.ConfigBackendLifecycleActorFactory" | ||
|
||
# The backend custom configuration. | ||
config { | ||
|
||
run-in-background = true | ||
|
||
# The list of possible runtime custom attributes. | ||
runtime-attributes = """ | ||
String? docker | ||
String? docker_user | ||
""" | ||
|
||
# Submit string when there is no "docker" runtime attribute. | ||
submit = "/usr/bin/env bash ${script}" | ||
|
||
# Submit string when there is a "docker" runtime attribute. | ||
# __SERVICE_ACCOUNT__ is a placeholder: run_variantstore_wdl.sh replaces it (via sed) with the value of | ||
# $GOOGLE_APPLICATION_CREDENTIALS and writes the result to set_up.conf. The key file is mounted into the | ||
# container as ${docker_cwd}/sa.json, and GOOGLE_APPLICATION_CREDENTIALS inside the container is set to | ||
# that mounted path. | ||
submit-docker = """ | ||
docker run \ | ||
--rm -i \ | ||
${"--user " + docker_user} \ | ||
--entrypoint ${job_shell} \ | ||
-e GOOGLE_APPLICATION_CREDENTIALS=${docker_cwd}/sa.json \ | ||
-v ${cwd}:${docker_cwd} \ | ||
-v __SERVICE_ACCOUNT__:${docker_cwd}/sa.json \ | ||
${docker} ${docker_script} | ||
""" | ||
|
||
# The defaults for runtime attributes if not provided. | ||
default-runtime-attributes { | ||
failOnStderr: false | ||
continueOnReturnCode: 0 | ||
} | ||
} | ||
} | ||
} | ||
} | ||
|
||
|
39 changes: 39 additions & 0 deletions
39
scripts/variantstore_cromwell_tests/run_variantstore_wdl.sh
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
#!/usr/bin/env bash
#
# Runs the VariantStore WDL test on Travis:
#   1. builds the GATK docker image for the commit under test (unit tests skipped),
#   2. fills in the placeholders of the test inputs JSON and of the Cromwell config,
#   3. runs ImportArrayManifest.wdl through Cromwell.
#
# Required environment:
#   GOOGLE_APPLICATION_CREDENTIALS  path to the service-account key file
#   CROMWELL_JAR                    path to the Cromwell jar
#   TRAVIS_PULL_REQUEST             pull-request number, or "false"/unset for non-PR builds
#   TRAVIS_COMMIT                   commit hash (only needed for non-PR builds)

set -euo pipefail

# Fail before the (slow) docker build if something the later steps need is missing.
: "${GOOGLE_APPLICATION_CREDENTIALS:?must point to the service-account key file}"
: "${CROMWELL_JAR:?must point to the Cromwell jar}"

# cd into the directory of the script in order to use relative paths
script_path=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd -P)
cd "$script_path"

WORKING_DIR=/home/travis/build/broadinstitute

# Random UUID with '-' replaced by '_'; substituted for __TABLE_NAME__ below so
# that every run loads into its own table.
raw_uuid=$(</proc/sys/kernel/random/uuid)
UUID=${raw_uuid//-/_}

echo "Building docker image for VariantStore WDL tests (skipping unit tests)..."

# assume Dockerfile is in root
echo "Building docker without running unit tests... ========="
cd "$WORKING_DIR/gatk"
# IMPORTANT: This code is duplicated in the cnv WDL test.
# An unset or empty TRAVIS_PULL_REQUEST is treated like "false" (non-PR build).
if [[ "${TRAVIS_PULL_REQUEST:-false}" != false ]]; then
  HASH_TO_USE=FETCH_HEAD
  sudo bash build_docker.sh -e "${HASH_TO_USE}" -s -u -d "$PWD/temp_staging/" -t "${TRAVIS_PULL_REQUEST}"
else
  HASH_TO_USE=${TRAVIS_COMMIT:?must be set for non pull-request builds}
  sudo bash build_docker.sh -e "${HASH_TO_USE}" -s -u -d "$PWD/temp_staging/"
fi
echo "Docker build done =========="

echo "Putting the newly built docker image into the json parameters"
cd "$WORKING_DIR/gatk/scripts/"
inputs_template=variantstore_cromwell_tests/import_array_manifest_test.json
inputs_tmp="$WORKING_DIR/import_array_manifest_test_tmp.json"
inputs_mod="$WORKING_DIR/import_array_manifest_test_mod.json"

# NOTE(review): import_array_manifest_test.json currently contains no
# __GATK_DOCKER__ placeholder, so this substitution is a no-op and the WDL falls
# back to its default docker image - confirm that this is intended.
sed -r "s|__GATK_DOCKER__|broadinstitute/gatk:${HASH_TO_USE}|g" "$inputs_template" > "$inputs_tmp"
sed -r "s/__TABLE_NAME__/${UUID}/g" "$inputs_tmp" > "$inputs_mod"
echo "JSON FILE (modified) ======="
cat "$inputs_mod"

# '|' is the sed delimiter here because the credentials path contains '/'.
sed -r "s|__SERVICE_ACCOUNT__|${GOOGLE_APPLICATION_CREDENTIALS}|g" \
  variantstore_cromwell_tests/local-with-gcs.conf > "$WORKING_DIR/set_up.conf"
echo "Updated local-with-gcs.conf with service account"

echo "Running ImportArrayManifest WDL through cromwell"
# With a single operand, ln creates the link in the current directory (scripts/).
ln -fs "$WORKING_DIR/gatk/scripts/variantstore_wdl/ImportArrayManifest.wdl"
sudo java -Dconfig.file="$WORKING_DIR/set_up.conf" -jar "$CROMWELL_JAR" run \
  "$WORKING_DIR/gatk/scripts/variantstore_wdl/ImportArrayManifest.wdl" \
  -i "$inputs_mod" \
  -m "$WORKING_DIR/test_import_manifest_wdl.metadata"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,150 @@ | ||
version 1.0 | ||
|
||
# Converts an extended array manifest into the CSV files produced by
# CreateManifestCsv and loads the resulting manifest CSV into a BigQuery table
# via LoadManifest.
workflow ImportArrayManifest {

  input {
    File extended_manifest_csv
    File manifest_schema_json
    String project_id
    String dataset_name
    String? table_name

    Int? preemptible_tries
    String? docker
  }

  # Use the caller-supplied image when there is one, otherwise the pinned GATK image.
  String gatk_docker = select_first([docker, "us.gcr.io/broad-gatk/gatk:4.1.7.0"])

  # Step 1: derive the ingest CSV (plus its intermediate files) from the manifest.
  call CreateManifestCsv {
    input:
      docker = gatk_docker,
      preemptible_tries = preemptible_tries,
      extended_manifest_csv = extended_manifest_csv
  }

  # Step 2: load the ingest CSV into <dataset_name>.<table_name>.
  call LoadManifest {
    input:
      docker = gatk_docker,
      preemptible_tries = preemptible_tries,
      manifest_csv = CreateManifestCsv.manifest_csv,
      manifest_schema_json = manifest_schema_json,
      project_id = project_id,
      dataset_name = dataset_name,
      table_name = table_name
  }

  # All four files written by CreateManifestCsv are exposed to the caller.
  output {
    File manifest_csv = CreateManifestCsv.manifest_csv
    File manifest_ingest_csv = CreateManifestCsv.manifest_ingest_csv
    File manifest_sub_csv = CreateManifestCsv.manifest_sub_csv
    File manifest_proc_csv = CreateManifestCsv.manifest_proc_csv
  }
}
|
||
# Loads the manifest CSV into the BigQuery table <dataset_name>.<table_name>
# (table name defaults to "probe_info"), creating the dataset and the table
# first when they cannot be listed/shown.
task LoadManifest {
  input {
    String project_id
    String dataset_name
    String? table_name
    File manifest_csv
    File manifest_schema_json
    # runtime
    Int? preemptible_tries
    String docker
    # String to add command for testing only. Can be ignored otherwise.
    String? for_testing_only
  }

  String ingest_table = dataset_name + "." + select_first([table_name, "probe_info"])

  parameter_meta {
    manifest_schema_json: {
      localization_optional: false
    }
  }

  command <<<
    # The optional testing hook runs with errexit off, so its exit status is ignored.
    set +e
    ~{for_testing_only}
    set -e

    # From here on every failing command aborts the task. The existence probes are
    # used as "if" conditions, so a failing probe only selects the create branch.
    if ! bq ls --project_id ~{project_id} ~{dataset_name} > /dev/null; then
      echo "making dataset ~{project_id}.~{dataset_name}"
      bq mk --project_id=~{project_id} ~{dataset_name}
    fi
    if ! bq show --project_id ~{project_id} ~{ingest_table} > /dev/null; then
      echo "making table ~{ingest_table}"
      # create the manifest table - the schema fields and the CSV columns need to be in the same order
      bq --location=US mk --project_id=~{project_id} ~{ingest_table} "~{manifest_schema_json}"
    fi

    # "null" in the CSV is loaded as NULL.
    bq load --location=US --project_id=~{project_id} --null_marker "null" --source_format=CSV ~{ingest_table} "~{manifest_csv}" "~{manifest_schema_json}"
  >>>
  runtime {
    docker: docker
    memory: "4 GB"
    disks: "local-disk " + 20 + " HDD"
    # 5 preemptible attempts unless the caller overrides it
    preemptible: select_first([preemptible_tries, 5])
    cpu: 2
  }

}
|
||
# Turns an extended array manifest into a numbered, filtered CSV for loading
# into BigQuery. The intermediate files of each step are kept as outputs.
task CreateManifestCsv {
  input {
    File extended_manifest_csv

    # runtime
    Int? preemptible_tries
    String docker
  }

  # 2.5x the manifest size (rounded up) plus 20 GB
  Int disk_size = ceil(size(extended_manifest_csv, "GB") * 2.5) + 20

  meta {
    description: "Creates a csv file for import into BigQuery"
  }
  parameter_meta {
    extended_manifest_csv: {
      localization_optional: false
    }
  }
  command <<<
    set -e

    TMP_SORTED="manifest_ingest_sorted.csv"
    TMP_SUB="manifest_ingest_sub.csv"
    TMP_PROC="manifest_ingest_processed.csv"
    TMP="manifest_ingest.csv"

    # put integers in front of the chromosomes that are not numbered so that they end up ordered by X, Y and MT
    sed 's/,X,/,23X,/g; s/,Y,/,24Y,/g; s/,MT,/,25MT,/g' "~{extended_manifest_csv}" > "$TMP_SUB"

    # sort the probes by chrom, position and then name so there is a specific ordering when we assign integers
    # (fields 23 and 24 numerically, then field 2)
    sort -t , -k23n,23 -k24n,24 -k2,2 "$TMP_SUB" > "$TMP_SORTED"

    # checking for != "build37Flag" skips the header row (we don't want that numbered)
    # only process rows with 29 fields - this skips some header info fields
    # also skip entries that are flagged, not matched or have index conflict
    # output columns: running id, then fields 2, 9 and 23-27, then the flag (field 29, with "PASS" written as "null")
    # NOTE(review): per the manifest header row these are Name, GenomeBuild, build37Chr, build37Pos,
    # build37RefAllele, build37AlleleA, build37AlleleB and build37Flag - confirm for other manifest versions
    awk -F ',' 'NF==29 && ($29!="ILLUMINA_FLAGGED" && $29!="INDEL_NOT_MATCHED" && $29!="INDEL_CONFLICT" && $29!="build37Flag") { flag=$29; if ($29=="PASS") flag="null"; print ++id","$2","$9","$23","$24","$25","$26","$27","flag }' "$TMP_SORTED" > "$TMP_PROC"

    # remove the integer prefixes for chromosomes X, Y and MT
    sed 's/,23X,/,X,/g; s/,24Y,/,Y,/g; s/,25MT,/,MT,/g' "$TMP_PROC" > "$TMP"

    echo "created file for ingest $TMP"
  >>>
  runtime {
    docker: docker
    memory: "4 GB"
    disks: "local-disk " + disk_size + " HDD"
    # 5 preemptible attempts unless the caller overrides it
    preemptible: select_first([preemptible_tries, 5])
    cpu: 2
  }
  output {
    # final file for ingest (integer chromosome prefixes removed)
    File manifest_csv = "manifest_ingest.csv"
    # sorted manifest, before filtering and numbering
    File manifest_ingest_csv = "manifest_ingest_sorted.csv"
    # manifest with X/Y/MT prefixed for sorting
    File manifest_sub_csv = "manifest_ingest_sub.csv"
    # filtered and numbered rows, chromosome prefixes still present
    File manifest_proc_csv = "manifest_ingest_processed.csv"

  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
This directory has the WDLs for generating and interacting with a variant store in BigQuery. It currently contains | ||
`ImportArrayManifest.wdl` which uploads the `probe_info` table from a Manifest file. Other WDLs for ingesting the array | ||
data, calculating metrics, and extracting array data are coming soon. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
[ | ||
{ | ||
"description": "[DESCRIPTION]", | ||
"name": "ProbeId", | ||
"type": "Integer", | ||
"mode": "Required" | ||
}, | ||
{ | ||
"description": "[DESCRIPTION]", | ||
"name": "Name", | ||
"type": "String", | ||
"mode": "Required" | ||
}, | ||
{ | ||
"description": "[DESCRIPTION]", | ||
"name": "GenomeBuild", | ||
"type": "Integer", | ||
"mode": "Required" | ||
}, | ||
{ | ||
"description": "[DESCRIPTION]", | ||
"name": "Chr", | ||
"type": "String", | ||
"mode": "Nullable" | ||
}, | ||
{ | ||
"description": "[DESCRIPTION]", | ||
"name": "Position", | ||
"type": "Integer", | ||
"mode": "Nullable" | ||
}, | ||
{ | ||
"description": "[DESCRIPTION]", | ||
"name": "Ref", | ||
"type": "String", | ||
"mode": "Nullable" | ||
}, | ||
{ | ||
"description": "[DESCRIPTION]", | ||
"name": "AlleleA", | ||
"type": "String", | ||
"mode": "Nullable" | ||
}, | ||
{ | ||
"description": "[DESCRIPTION]", | ||
"name": "AlleleB", | ||
"type": "String", | ||
"mode": "Nullable" | ||
}, | ||
{ | ||
"description": "[DESCRIPTION]", | ||
"name": "build37Flag", | ||
"type": "String", | ||
"mode": "Nullable" | ||
} | ||
] |
18 changes: 18 additions & 0 deletions
18
src/test/resources/org/broadinstitute/hellbender/tools/variantdb/arrays/tiny_manifest.csv
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
Illumina, Inc. | ||
[Heading], | ||
Descriptor File Name,name.bpm | ||
Assay Format,Infinium LCG | ||
Date Manufactured,09/28/2020 | ||
CreateExtendedIlluminaManifest.version,1.5 | ||
Target Build,37 | ||
Target Reference File,/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta | ||
Cluster File,/files/ClusterFile.egt | ||
dbSNP File,/seq/references/Homo_sapiens_assembly19/v1/variant_calling/vqsr_resources/WGS/v2/dbsnp_138.b37.vcf.gz | ||
Supported Build,36 | ||
Supported Reference File,/seq/references/Homo_sapiens_assembly19/v1/arrays/human_b36_both.fasta | ||
Supported Chain File,/seq/references/Homo_sapiens_assembly19/v1/arrays/b36ToHg19.broad.over.chain | ||
Loci Count ,2 | ||
[Assay], | ||
IlmnID,Name,IlmnStrand,SNP,AddressA_ID,AlleleA_ProbeSeq,AddressB_ID,AlleleB_ProbeSeq,GenomeBuild,Chr,MapInfo,Ploidy,Species,Source,SourceVersion,SourceStrand,SourceSeq,TopGenomicSeq,BeadSetID,Exp_Clusters,RefStrand,Intensity_Only,build37Chr,build37Pos,build37RefAllele,build37AlleleA,build37AlleleB,build37Rsid,build37Flag | ||
1:5700115-A-T-0_T_R_0000000000,1:5700115-A-T,TOP,[T/A],000000000,TCAATCTATATTACCCACCACAGAGCAGAAGGCATACCACATGATTTCTG,,,37,1,5700115,diploid,Homo sapiens,PAGE,0,BOT,tcaatctatattacccaccacagagcagaaggcataccacatgatttctgatttctgct[T/A]tcaatctatattacccaccacagagcagaaggcataccacatgatttctgatttctgct,tcaatctatattacccaccacagagcagaaggcataccacatgatttctgatttctgct[A/T]tcaatctatattacccaccacagagcagaaggcataccacatgatttctgatttctgct,0000,3,-,0,1,5700115,A,A,T,,PASS | ||
1:5700116-C-G-0_T_R_0000000000,1:5700116-C-G,TOP,[C/G],000000000,TCAATCTATATTACCCACCACAGAGCAGAAGGCATACCACATGATTTCTG,,,37,1,5700116,diploid,Homo sapiens,PAGE,0,BOT,tcaatctatattacccaccacagagcagaaggcataccacatgatttctgatttctgct[G/C]tcaatctatattacccaccacagagcagaaggcataccacatgatttctgatttctgct,tcaatctatattacccaccacagagcagaaggcataccacatgatttctgatttctgct[C/G]tcaatctatattacccaccacagagcagaaggcataccacatgatttctgatttctgct,0000,3,-,0,1,5700116,C,C,G,,PASS |