-
Notifications
You must be signed in to change notification settings - Fork 590
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Moving the WDL for importing array manifest to BQ #6860
Changes from 3 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
{ | ||
"ImportArrayManifest.extended_manifest_csv":"/home/travis/build/broadinstitute/gatk/src/test/resources/org/broadinstitute/hellbender/tools/variantdb/arrays/tiny_manifest.csv", | ||
"ImportArrayManifest.manifest_schema_json":"/home/travis/build/broadinstitute/gatk/scripts/variantstore_wdl/schemas/manifest_schema.json", | ||
"ImportArrayManifest.project_id":"broad-dsde-dev", | ||
"ImportArrayManifest.dataset_name":"temp_tables", | ||
"ImportArrayManifest.table_name": "__TABLE_NAME__", | ||
"ImportArrayManifest.LoadManifest.for_testing_only": "gcloud auth activate-service-account --key-file $GOOGLE_APPLICATION_CREDENTIALS" | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
include required(classpath("application"))

backend {
  # Replace Cromwell's default backend with our docker-aware local backend.
  default = "LocalExample2"

  providers {
    LocalExample2 {
      # The actor that runs the backend. In this case, it's the Shared File System (SFS) ConfigBackend.
      actor-factory = "cromwell.backend.impl.sfs.config.ConfigBackendLifecycleActorFactory"

      # The backend custom configuration.
      config {

        run-in-background = true

        # The list of possible runtime custom attributes.
        runtime-attributes = """
        String? docker
        String? docker_user
        """

        # Submit string when there is no "docker" runtime attribute.
        submit = "/usr/bin/env bash ${script}"

        # Submit string when there is a "docker" runtime attribute.
        # Mounts the service-account key file (substituted for
        # __SERVICE_ACCOUNT__ by the test driver script) into the container
        # and points GOOGLE_APPLICATION_CREDENTIALS at it so gcloud/bq
        # inside the container can authenticate.
        submit-docker = """
        docker run \
        --rm -i \
        ${"--user " + docker_user} \
        --entrypoint ${job_shell} \
        -e GOOGLE_APPLICATION_CREDENTIALS=${docker_cwd}/sa.json \
        -v ${cwd}:${docker_cwd} \
        -v __SERVICE_ACCOUNT__:${docker_cwd}/sa.json \
        ${docker} ${docker_script}
        """

        # Defaults applied when a task does not set these runtime attributes.
        default-runtime-attributes {
          failOnStderr: false
          continueOnReturnCode: 0
        }
      }
    }
  }
}
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
#!/usr/bin/env bash
# Travis CI driver for the VariantStore ImportArrayManifest WDL test:
# builds a GATK docker image for the commit/PR under test, templates the
# docker tag, table name and service-account key into the test inputs and
# Cromwell config, then runs the WDL through Cromwell.

set -e

# cd into the directory of the script in order to use relative paths.
script_path=$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P )
cd "$script_path"

WORKING_DIR=/home/travis/build/broadinstitute
# Unique per-run table name; BigQuery table names may not contain '-'.
UUID=$(sed 's/-/_/g' /proc/sys/kernel/random/uuid)

echo "Building docker image for VariantStore WDL tests (skipping unit tests)..."

# Assume the Dockerfile is in the repository root.
echo "Building docker without running unit tests... ========="
cd "$WORKING_DIR/gatk"
# IMPORTANT: This code is duplicated in the cnv WDL test.
if [ "${TRAVIS_PULL_REQUEST}" != "false" ]; then
  HASH_TO_USE=FETCH_HEAD
  sudo bash build_docker.sh -e "${HASH_TO_USE}" -s -u -d "$PWD/temp_staging/" -t "${TRAVIS_PULL_REQUEST}"
else
  HASH_TO_USE=${TRAVIS_COMMIT}
  sudo bash build_docker.sh -e "${HASH_TO_USE}" -s -u -d "$PWD/temp_staging/"
fi
echo "Docker build done =========="

echo "Putting the newly built docker image into the json parameters"
cd "$WORKING_DIR/gatk/scripts/"
sed -r "s/__GATK_DOCKER__/broadinstitute\/gatk\:$HASH_TO_USE/g" variantstore_cromwell_tests/import_array_manifest_test.json > "$WORKING_DIR/import_array_manifest_test_tmp.json"
sed -r "s/__TABLE_NAME__/$UUID/g" "$WORKING_DIR/import_array_manifest_test_tmp.json" > "$WORKING_DIR/import_array_manifest_test_mod.json"
echo "JSON FILE (modified) ======="
cat "$WORKING_DIR/import_array_manifest_test_mod.json"

# Point the Cromwell backend config at the service-account key file.
# '|' delimiter because $GOOGLE_APPLICATION_CREDENTIALS is a path with '/'.
sed -r "s|__SERVICE_ACCOUNT__|$GOOGLE_APPLICATION_CREDENTIALS|g" variantstore_cromwell_tests/local-with-gcs.conf > "$WORKING_DIR/set_up.conf"
echo "Updated local_backend.conf with service account"

echo "Running ImportArrayManifest WDL through cromwell"
ln -fs "$WORKING_DIR/gatk/scripts/variantstore_wdl/ImportArrayManifest.wdl"
sudo java -Dconfig.file="$WORKING_DIR/set_up.conf" -jar "$CROMWELL_JAR" run "$WORKING_DIR/gatk/scripts/variantstore_wdl/ImportArrayManifest.wdl" -i "$WORKING_DIR/import_array_manifest_test_mod.json" -m "$WORKING_DIR/test_import_manifest_wdl.metadata"
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,150 @@ | ||
version 1.0 | ||
|
||
# Imports an Illumina extended array manifest into a BigQuery probe_info table:
# first reshapes the manifest into a headerless ingest CSV, then loads it.
workflow ImportArrayManifest {

  input {
    File extended_manifest_csv
    File manifest_schema_json
    String project_id
    String dataset_name
    String? table_name

    Int? preemptible_tries
    String? docker
  }

  # Fall back to a released GATK image when the caller does not supply one.
  String docker_final = select_first([docker, "us.gcr.io/broad-gatk/gatk:4.1.7.0"])

  # Reshape the extended manifest into the column order the schema expects.
  call CreateManifestCsv {
    input:
      extended_manifest_csv = extended_manifest_csv,
      preemptible_tries = preemptible_tries,
      docker = docker_final
  }

  # Load the processed CSV into BigQuery.
  call LoadManifest {
    input:
      project_id = project_id,
      dataset_name = dataset_name,
      table_name = table_name,
      manifest_schema_json = manifest_schema_json,
      manifest_csv = CreateManifestCsv.manifest_csv,
      preemptible_tries = preemptible_tries,
      docker = docker_final
  }

  output {
    File manifest_csv = CreateManifestCsv.manifest_csv
    File manifest_ingest_csv = CreateManifestCsv.manifest_ingest_csv
    File manifest_sub_csv = CreateManifestCsv.manifest_sub_csv
    File manifest_proc_csv = CreateManifestCsv.manifest_proc_csv
  }
}
|
||
# Loads a processed manifest CSV into a BigQuery table, creating the
# dataset and table first if they do not already exist.
task LoadManifest {
  input {
    String project_id
    String dataset_name
    String? table_name
    File manifest_csv
    File manifest_schema_json
    # runtime
    Int? preemptible_tries
    String docker
    # String to add command for testing only. Can be ignored otherwise.
    String? for_testing_only
  }

  # Destination table, e.g. "<dataset>.probe_info" when no name is given.
  String ingest_table = dataset_name + "." + select_first([table_name, "probe_info"])

  parameter_meta {
    manifest_schema_json: {
      localization_optional: false
    }
  }

  command <<<
    # Tolerate non-zero exits while probing for the dataset/table:
    # a failing "bq ls"/"bq show" just means we need to create it.
    set +e
    ~{for_testing_only}
    if ! bq ls --project_id ~{project_id} ~{dataset_name} > /dev/null ; then
      echo "making dataset ~{project_id}.~{dataset_name}"
      bq mk --project_id=~{project_id} ~{dataset_name}
    fi
    if ! bq show --project_id ~{project_id} ~{ingest_table} > /dev/null ; then
      echo "making table ~{ingest_table}"
      # create a site info table and load - schema and TSV header need to be the same order
      bq --location=US mk --project_id=~{project_id} ~{ingest_table} ~{manifest_schema_json}
    fi
    set -e

    bq load --location=US --project_id=~{project_id} --null_marker "null" --source_format=CSV ~{ingest_table} ~{manifest_csv} ~{manifest_schema_json}
  >>>
  runtime {
    docker: docker
    memory: "4 GB"
    disks: "local-disk " + 20 + " HDD"
    preemptible: select_first([preemptible_tries, 5])
    cpu: 2
  }
}
|
||
# Converts an Illumina extended array manifest into a headerless,
# probe-numbered CSV whose column order matches the BigQuery schema.
task CreateManifestCsv {
  input {
    File extended_manifest_csv

    # runtime
    Int? preemptible_tries
    String docker
  }

  # ~2.5x the input for the intermediate copies made below, plus headroom.
  Int disk_size = ceil(size(extended_manifest_csv, "GB") * 2.5) + 20

  meta {
    # Fixed typos in the original description ("tsv"/"imort"): the task
    # produces comma-separated files for import into BigQuery.
    description: "Creates a csv file for import into BigQuery"
  }
  parameter_meta {
    extended_manifest_csv: {
      localization_optional: false
    }
  }
  command <<<
    set -e

    TMP_SORTED="manifest_ingest_sorted.csv"
    TMP_SUB="manifest_ingest_sub.csv"
    TMP_PROC="manifest_ingest_processed.csv"
    TMP="manifest_ingest.csv"

    # put integers in front of the chromosomes that are not numbered so that they end up ordered by X, Y and MT
    sed 's/,X,/,23X,/g; s/,Y,/,24Y,/g; s/,MT,/,25MT,/g' ~{extended_manifest_csv} > $TMP_SUB

    # sort the probes by chrom, position and then name so there is a specific ordering when we assign integers
    sort -t , -k23n,23 -k24n,24 -k2,2 $TMP_SUB > $TMP_SORTED

    # checking for != "build37Flag" skips the header row (we don't want that numbered)
    # only process rows with 29 fields - this skips some header info fields
    # also skip entries that are flagged, not matched or have index conflict
    awk -F ',' 'NF==29 && ($29!="ILLUMINA_FLAGGED" && $29!="INDEL_NOT_MATCHED" && $29!="INDEL_CONFLICT" && $29!="build37Flag") { flag=$29; if ($29=="PASS") flag="null"; print ++id","$2","$9","$23","$24","$25","$26","$27","flag }' $TMP_SORTED > $TMP_PROC

    # remove the integer prefixes for chromosomes X, Y and MT
    sed 's/,23X,/,X,/g; s/,24Y,/,Y,/g; s/,25MT,/,MT,/g' $TMP_PROC > $TMP

    echo "created file for ingest $TMP"
  >>>
  runtime {
    docker: docker
    memory: "4 GB"
    disks: "local-disk " + disk_size + " HDD"
    preemptible: select_first([preemptible_tries, 5])
    cpu: 2
  }
  output {
    File manifest_csv = "manifest_ingest.csv"
    File manifest_ingest_csv = "manifest_ingest_sorted.csv"
    File manifest_sub_csv = "manifest_ingest_sub.csv"
    File manifest_proc_csv = "manifest_ingest_processed.csv"
  }
}
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
[ | ||
{ | ||
"description": "[DESCRIPTION]", | ||
"name": "ProbeId", | ||
"type": "Integer", | ||
"mode": "Required" | ||
}, | ||
{ | ||
"description": "[DESCRIPTION]", | ||
"name": "Name", | ||
"type": "String", | ||
"mode": "Required" | ||
}, | ||
{ | ||
"description": "[DESCRIPTION]", | ||
"name": "GenomeBuild", | ||
"type": "Integer", | ||
"mode": "Required" | ||
}, | ||
{ | ||
"description": "[DESCRIPTION]", | ||
"name": "Chr", | ||
"type": "String", | ||
"mode": "Nullable" | ||
}, | ||
{ | ||
"description": "[DESCRIPTION]", | ||
"name": "Position", | ||
"type": "Integer", | ||
"mode": "Nullable" | ||
}, | ||
{ | ||
"description": "[DESCRIPTION]", | ||
"name": "Ref", | ||
"type": "String", | ||
"mode": "Nullable" | ||
}, | ||
{ | ||
"description": "[DESCRIPTION]", | ||
"name": "AlleleA", | ||
"type": "String", | ||
"mode": "Nullable" | ||
}, | ||
{ | ||
"description": "[DESCRIPTION]", | ||
"name": "AlleleB", | ||
"type": "String", | ||
"mode": "Nullable" | ||
}, | ||
{ | ||
"description": "[DESCRIPTION]", | ||
"name": "build37Flag", | ||
"type": "String", | ||
"mode": "Nullable" | ||
} | ||
] |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
Illumina, Inc. | ||
[Heading], | ||
Descriptor File Name,name.bpm | ||
Assay Format,Infinium LCG | ||
Date Manufactured,09/28/2020 | ||
CreateExtendedIlluminaManifest.version,1.5 | ||
Target Build,37 | ||
Target Reference File,/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta | ||
Cluster File,/files/ClusterFile.egt | ||
dbSNP File,/seq/references/Homo_sapiens_assembly19/v1/variant_calling/vqsr_resources/WGS/v2/dbsnp_138.b37.vcf.gz | ||
Supported Build,36 | ||
Supported Reference File,/seq/references/Homo_sapiens_assembly19/v1/arrays/human_b36_both.fasta | ||
Supported Chain File,/seq/references/Homo_sapiens_assembly19/v1/arrays/b36ToHg19.broad.over.chain | ||
Loci Count ,2 | ||
[Assay], | ||
IlmnID,Name,IlmnStrand,SNP,AddressA_ID,AlleleA_ProbeSeq,AddressB_ID,AlleleB_ProbeSeq,GenomeBuild,Chr,MapInfo,Ploidy,Species,Source,SourceVersion,SourceStrand,SourceSeq,TopGenomicSeq,BeadSetID,Exp_Clusters,RefStrand,Intensity_Only,build37Chr,build37Pos,build37RefAllele,build37AlleleA,build37AlleleB,build37Rsid,build37Flag | ||
1:5700115-A-T-0_T_R_0000000000,1:5700115-A-T,TOP,[T/A],000000000,TCAATCTATATTACCCACCACAGAGCAGAAGGCATACCACATGATTTCTG,,,37,1,5700115,diploid,Homo sapiens,PAGE,0,BOT,tcaatctatattacccaccacagagcagaaggcataccacatgatttctgatttctgct[T/A]tcaatctatattacccaccacagagcagaaggcataccacatgatttctgatttctgct,tcaatctatattacccaccacagagcagaaggcataccacatgatttctgatttctgct[A/T]tcaatctatattacccaccacagagcagaaggcataccacatgatttctgatttctgct,0000,3,-,0,1,5700115,A,A,T,,PASS | ||
1:5700116-C-G-0_T_R_0000000000,1:5700116-C-G,TOP,[C/G],000000000,TCAATCTATATTACCCACCACAGAGCAGAAGGCATACCACATGATTTCTG,,,37,1,5700116,diploid,Homo sapiens,PAGE,0,BOT,tcaatctatattacccaccacagagcagaaggcataccacatgatttctgatttctgct[G/C]tcaatctatattacccaccacagagcagaaggcataccacatgatttctgatttctgct,tcaatctatattacccaccacagagcagaaggcataccacatgatttctgatttctgct[C/G]tcaatctatattacccaccacagagcagaaggcataccacatgatttctgatttctgct,0000,3,-,0,1,5700116,C,C,G,,PASS |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is this just copied from elsewhere? Seems like there should be a gatk-test google project...
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is the GATK test project AFAIK. It's used in the BigQueryUtils tests (in GATK).