Skip to content

Commit

Permalink
Add load lock file to prevent accidental re-loading of data to BQ (#7138
Browse files Browse the repository at this point in the history
)

* set lock at start of ImportGenomes run and check uuid at crucial steps
* use google docker for lockfile tasks, rename to LOCKFILE, add instruction for removing lockfile
* auth as SA if needed for lock and release tasks
  • Loading branch information
mmorgantaylor committed Apr 6, 2021
1 parent 8771530 commit ba9e606
Showing 1 changed file with 160 additions and 5 deletions.
165 changes: 160 additions & 5 deletions scripts/variantstore/wdl/ImportGenomes.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,13 @@ workflow ImportGenomes {
String docker_final = select_first([docker, "us.gcr.io/broad-gatk/gatk:4.1.7.0"])
call SetLock {
input:
output_directory = output_directory,
service_account_json = service_account_json,
preemptible_tries = preemptible_tries
}

call GetMaxTableId {
input:
sample_map = sample_map
Expand Down Expand Up @@ -87,7 +94,8 @@ workflow ImportGenomes {
output_directory = output_directory,
gatk_override = gatk_override,
docker = docker_final,
preemptible_tries = preemptible_tries
preemptible_tries = preemptible_tries,
run_uuid = SetLock.run_uuid
}
}

Expand All @@ -104,7 +112,8 @@ workflow ImportGenomes {
table_creation_done = CreateMetadataTables.done,
tsv_creation_done = CreateImportTsvs.done,
service_account_json = service_account_json,
docker = docker_final
docker = docker_final,
run_uuid = SetLock.run_uuid
}
}
Expand All @@ -121,7 +130,8 @@ workflow ImportGenomes {
table_creation_done = CreatePetTables.done,
tsv_creation_done = CreateImportTsvs.done,
service_account_json = service_account_json,
docker = docker_final
docker = docker_final,
run_uuid = SetLock.run_uuid
}
}

Expand All @@ -138,9 +148,128 @@ workflow ImportGenomes {
table_creation_done = CreateVetTables.done,
tsv_creation_done = CreateImportTsvs.done,
service_account_json = service_account_json,
docker = docker_final
docker = docker_final,
run_uuid = SetLock.run_uuid
}
}

call ReleaseLock {
input:
run_uuid = SetLock.run_uuid,
output_directory = output_directory,
load_metadata_done = LoadMetadataTable.done,
load_pet_done = LoadPetTable.done,
load_vet_done = LoadVetTable.done,
service_account_json = service_account_json,
preemptible_tries = preemptible_tries
}
}

# we create a lock file in the output directory with a uuid for this run of ImportGenomes.
# other tasks (TSV creation, bq load) check that the lock file exists and contains the run_uuid
# specific to this task.
task SetLock {
meta {
volatile: true
}

input {
String output_directory
File? service_account_json

# runtime
Int? preemptible_tries
}

String has_service_account_file = if (defined(service_account_json)) then 'true' else 'false'

command <<<
set -x
set -e

if [ ~{has_service_account_file} = 'true' ]; then
gcloud auth activate-service-account --key-file='~{service_account_json}'
fi

# generate uuid for this run
RUN_UUID=$(dbus-uuidgen)
echo $RUN_UUID | tee RUN_UUID_STRING

DIR="~{output_directory}/"

# check for existing lock file
LOCKFILE="LOCKFILE"
HAS_LOCKFILE=$(gsutil ls "${DIR}${LOCKFILE}" | wc -l)
if [ $HAS_LOCKFILE -gt 0 ]; then
echo "ERROR: lock file in place. Check whether another run of ImportGenomes with this output directory is in progress or a previous run had an error. If you would like to proceed, run `gsutil rm ${DIR}${LOCKFILE}` and re-run the workflow." 1>&2
exit 1
else # put the lock file in place
echo "Setting lock file with UUID ${RUN_UUID}"
echo $RUN_UUID > $LOCKFILE
gsutil cp $LOCKFILE "${DIR}${LOCKFILE}" || { echo "Error uploading lockfile to ${DIR}${LOCKFILE}" 1>&2 ; exit 1; }
fi
>>>

runtime {
docker: "gcr.io/google.com/cloudsdktool/cloud-sdk:305.0.0"
memory: "1 GB"
disks: "local-disk 10 HDD"
preemptible: select_first([preemptible_tries, 5])
cpu: 1
}

output {
String run_uuid = read_string("RUN_UUID_STRING")
}
}

task ReleaseLock {
meta {
volatile: true
}

input {
String run_uuid
String output_directory
Array[String] load_metadata_done
Array[String] load_pet_done
Array[String] load_vet_done
File? service_account_json

# runtime
Int? preemptible_tries
}

String has_service_account_file = if (defined(service_account_json)) then 'true' else 'false'

command <<<
set -x
set -e

if [ ~{has_service_account_file} = 'true' ]; then
gcloud auth activate-service-account --key-file='~{service_account_json}'
fi


LOCKFILE="~{output_directory}/LOCKFILE"
EXISTING_LOCK_ID=$(gsutil cat ${LOCKFILE})
CURRENT_RUN_ID="~{run_uuid}"

if [ ${EXISTING_LOCK_ID} = ${CURRENT_RUN_ID} ]; then
gsutil rm $LOCKFILE
else
echo "ERROR: found mismatched lockfile containing run ${EXISTING_LOCK_ID}, which does not match this run ${CURRENT_RUN_ID}." 1>&2
exit 1
fi
>>>

runtime {
docker: "gcr.io/google.com/cloudsdktool/cloud-sdk:305.0.0"
memory: "1 GB"
disks: "local-disk 10 HDD"
preemptible: select_first([preemptible_tries, 5])
cpu: 1
}
}

task GetMaxTableId {
Expand Down Expand Up @@ -186,6 +315,7 @@ task CreateImportTsvs {
String docker

String? for_testing_only
String run_uuid
}

Int disk_size = if defined(drop_state) then 30 else 75
Expand Down Expand Up @@ -220,6 +350,16 @@ task CreateImportTsvs {
gsutil cp ~{input_vcf} .
gsutil cp ~{input_vcf_index} .
fi

# check for existence of the correct lockfile
LOCKFILE="~{output_directory}/LOCKFILE"
EXISTING_LOCK_ID=$(gsutil cat ${LOCKFILE}) || { echo "Error retrieving lockfile from ${LOCKFILE}" 1>&2 ; exit 1; }
CURRENT_RUN_ID="~{run_uuid}"

if [ ${EXISTING_LOCK_ID} != ${CURRENT_RUN_ID} ]; then
echo "ERROR: found mismatched lockfile containing run ${EXISTING_LOCK_ID}, which does not match this run ${CURRENT_RUN_ID}." 1>&2
exit 1
fi

gatk --java-options "-Xmx7000m" CreateVariantIngestFiles \
-V ~{updated_input_vcf} \
Expand Down Expand Up @@ -342,6 +482,7 @@ task LoadTable {
File? service_account_json
String table_creation_done
Array[String] tsv_creation_done
String run_uuid

String docker
}
Expand All @@ -357,6 +498,17 @@ task LoadTable {
gcloud config set project ~{project_id}
fi

DIR="~{storage_location}/~{datatype}_tsvs/"
# check for existence of the correct lockfile
LOCKFILE="~{storage_location}/LOCKFILE"
EXISTING_LOCK_ID=$(gsutil cat ${LOCKFILE}) || { echo "Error retrieving lockfile from ${LOCKFILE}" 1>&2 ; exit 1; }
CURRENT_RUN_ID="~{run_uuid}"

if [ "${EXISTING_LOCK_ID}" != "${CURRENT_RUN_ID}" ]; then
echo "ERROR: found mismatched lockfile containing run ${EXISTING_LOCK_ID}, which does not match this run ${CURRENT_RUN_ID}." 1>&2
exit 1
fi

DIR="~{storage_location}/~{datatype}_tsvs/"

printf -v PADDED_TABLE_ID "%03d" ~{table_id}
Expand All @@ -379,7 +531,6 @@ task LoadTable {
else
echo "no ${FILES} files to process in $DIR"
fi

>>>

runtime {
Expand All @@ -389,5 +540,9 @@ task LoadTable {
preemptible: 0
cpu: 1
}

output {
String done = "true"
}
}

0 comments on commit ba9e606

Please sign in to comment.