
Add task for VAT validation #4 #7363

Merged: 4 commits, Jul 23, 2021
Changes from 3 commits
1 change: 1 addition & 0 deletions .dockstore.yml
@@ -106,6 +106,7 @@ workflows:
filters:
branches:
- ah_var_store
- rsa_add_vat_val_4
- name: MitochondriaPipeline
subclass: WDL
primaryDescriptorPath: /scripts/mitochondria_m2_wdl/MitochondriaPipeline.wdl
136 changes: 102 additions & 34 deletions scripts/variantstore/wdl/GvsValidateVAT.wdl
@@ -35,11 +35,69 @@ workflow GvsValidateVatTable {
last_modified_timestamp = GetBQTableLastModifiedDatetime.last_modified_timestamp
}

# once there is more than one check, they will be gathered into this workflow output, in the format
# [{ValidationRule1: "PASS/FAIL Extra info from this test"},
# {ValidationRule2: "PASS/FAIL Extra info from this test"}]
call SchemaOnlyOneRowPerNullTranscript {
input:
query_project_id = query_project_id,
fq_vat_table = fq_vat_table,
service_account_json_path = service_account_json_path,
last_modified_timestamp = GetBQTableLastModifiedDatetime.last_modified_timestamp
}

output {
Array[Map[String, String]] validation_results = [EnsureVatTableHasVariants.result, SpotCheckForExpectedTranscripts.result, SchemaOnlyOneRowPerNullTranscript.result]
}
}


task GetBQTableLastModifiedDatetime {
# because this is being used to determine if the data has changed, never use the call cache
meta {
volatile: true
}

input {
String query_project
String fq_table
String? service_account_json_path
}

String has_service_account_file = if (defined(service_account_json_path)) then 'true' else 'false'
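# flatten the optional key path into a literal 'true'/'false' so the bash command below can branch on it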

# ------------------------------------------------
# try to get the last modified date for the table in question; fail if something comes back from BigQuery
# that isn't in the right format (e.g. an error)
command <<<
set -e

if [ ~{has_service_account_file} = 'true' ]; then
gsutil cp ~{service_account_json_path} local.service_account.json
gcloud auth activate-service-account --key-file=local.service_account.json
gcloud config set project ~{query_project}
fi

echo "project_id = ~{query_project}" > ~/.bigqueryrc

# bq needs the project name to be separated from the dataset by a colon
DATASET_TABLE_COLON=$(echo ~{fq_table} | sed 's/\./:/')
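# e.g. my-project.my_dataset.vat_table -> my-project:my_dataset.vat_table (the sed replaces only the first '.')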

LASTMODIFIED=$(bq --location=US --project_id=~{query_project} --format=json show ${DATASET_TABLE_COLON} | python3 -c "import sys, json; print(json.load(sys.stdin)['lastModifiedTime']);")
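# lastModifiedTime is reported as a string of epoch milliseconds, so anything non-numeric (e.g. an error payload) falls through to the failure branch below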
if [[ $LASTMODIFIED =~ ^[0-9]+$ ]]; then
echo $LASTMODIFIED
else
exit 1
fi
>>>

output {
String last_modified_timestamp = read_string(stdout())
}

runtime {
docker: "gcr.io/google.com/cloudsdktool/cloud-sdk:305.0.0"
memory: "3 GB"
disks: "local-disk 10 HDD"
preemptible: 3
cpu: 1
}
}
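For reference, a minimal standalone sketch of what this task's command boils down to (project, dataset, and table names here are hypothetical):

    # `bq show` wants project:dataset.table; --format=json returns the table metadata as JSON
    bq --location=US --format=json show my-project:my_dataset.vat_table \
      | python3 -c "import sys, json; print(json.load(sys.stdin)['lastModifiedTime'])"
    # prints e.g. 1626900000000 (milliseconds since the Unix epoch)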

@@ -54,6 +112,7 @@ task EnsureVatTableHasVariants {
String has_service_account_file = if (defined(service_account_json_path)) then 'true' else 'false'

command <<<
set -e
if [ ~{has_service_account_file} = 'true' ]; then
gsutil cp ~{service_account_json_path} local.service_account.json
gcloud auth activate-service-account --key-file=local.service_account.json
@@ -104,6 +163,8 @@ task SpotCheckForExpectedTranscripts {
String has_service_account_file = if (defined(service_account_json_path)) then 'true' else 'false'

command <<<
set -e

if [ ~{has_service_account_file} = 'true' ]; then
gsutil cp ~{service_account_json_path} local.service_account.json
gcloud auth activate-service-account --key-file=local.service_account.json
@@ -135,10 +196,11 @@ task SpotCheckForExpectedTranscripts {
if [[ $NUMRESULTS = "0" ]]; then
echo "PASS: The VAT table ~{fq_vat_table} only has the expected transcripts at the tested location ('IGFLR1' and 'AD000671.2' in chromosome 19, between positions 35,740,407 - 35,740,469)." > validation_results.txt
else
echo "FAIL: The VAT table ~{fq_vat_table} had unexpected transcripts at the tested location: [csv output follows] " > validation_results.txt
echo "FAIL: The VAT table ~{fq_vat_table} had unexpected transcripts at the tested location: [csv output follows] " > validation_results.txt
cat bq_query_output.csv >> validation_results.txt
fi
>>>

# ------------------------------------------------
# Runtime settings:
runtime {
@@ -148,61 +210,67 @@ task SpotCheckForExpectedTranscripts {
cpu: "1"
disks: "local-disk 100 HDD"
}
# ------------------------------------------------
# Output: {"Name of validation rule": "PASS/FAIL plus additional validation results"}

output {
Map[String, String] result = {"SpotCheckForExpectedTranscripts": read_string('validation_results.txt')}
}
}
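The SQL behind this spot check is collapsed out of the diff above; as a rough sketch of the pattern only (table path, column names, and the NOT IN form are assumptions inferred from the PASS/FAIL messages, not the actual query):

    bq query --nouse_legacy_sql --format=csv 'SELECT contig, position, gene_symbol
    FROM `my-project.my_dataset.vat_table`
    WHERE contig = "chr19"
    AND position BETWEEN 35740407 AND 35740469
    AND gene_symbol NOT IN ("IGFLR1", "AD000671.2")' > bq_query_output.csv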


task SchemaOnlyOneRowPerNullTranscript {
input {
String query_project_id
String fq_vat_table
String? service_account_json_path
String last_modified_timestamp
}

String has_service_account_file = if (defined(service_account_json_path)) then 'true' else 'false'

command <<<
set -e

if [ ~{has_service_account_file} = 'true' ]; then
gsutil cp ~{service_account_json_path} local.service_account.json
gcloud auth activate-service-account --key-file=local.service_account.json
gcloud config set project ~{query_project_id}
fi
echo "project_id = ~{query_project_id}" > ~/.bigqueryrc

echo "project_id = ~{query_project}" > ~/.bigqueryrc
bq query --nouse_legacy_sql --project_id=~{query_project_id} --format=csv 'SELECT * FROM (SELECT
vid,
COUNT(vid) AS num_rows
FROM
~{fq_vat_table}
WHERE
transcript_source is NULL AND
transcript is NULL
GROUP BY vid) null_transcripts
WHERE num_rows > 1' > bq_variant_count.csv
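# the subselect above is just one way to express this; an equivalent (hypothetical) formulation
# groups and filters in a single step:
#   SELECT vid, COUNT(vid) AS num_rows
#   FROM ~{fq_vat_table}
#   WHERE transcript_source IS NULL AND transcript IS NULL
#   GROUP BY vid
#   HAVING COUNT(vid) > 1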

# get number of lines in bq query output
NUMRESULTS=$(awk 'END{print NR}' bq_variant_count.csv)
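# NB: bq's csv output is empty when the query returns no rows; otherwise it contains a header line plus one line per offending vid, so any nonzero count means a violation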

# if the result of the query has any rows, that means there were vids with null transcripts and multiple
# rows in the VAT, which should not be the case
if [[ $NUMRESULTS = "0" ]]; then
echo "PASS: The VAT table ~{fq_vat_table} only has 1 row per vid with a null transcript" > validation_results.txt
else
echo "FAIL: The VAT table ~{fq_vat_table} had at least one vid with a null transcript and more than one row: [csv output follows] " > validation_results.txt
cat bq_variant_count.csv >> validation_results.txt
fi
>>>


# ------------------------------------------------
# Runtime settings:
runtime {
docker: "gcr.io/google.com/cloudsdktool/cloud-sdk:305.0.0"
memory: "3 GB"
disks: "local-disk 10 HDD"
memory: "1 GB"
preemptible: 3
cpu: "1"
disks: "local-disk 100 HDD"
}
# ------------------------------------------------
# Output: {"Name of validation rule": "PASS/FAIL plus additional validation results"}
output {
Map[String, String] result = {"SchemaOnlyOneRowPerNullTranscript": read_string('validation_results.txt')}
}
}