Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add preliminary labels to queries [VS-381] #7902

Merged
12 commits merged on Jun 17, 2022
2 changes: 1 addition & 1 deletion .dockstore.yml
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,7 @@ workflows:
branches:
- master
- ah_var_store
- rsa_add_prelim_labels
- name: GvsCalculatePrecisionAndSensitivity
subclass: WDL
primaryDescriptorPath: /scripts/variantstore/wdl/GvsCalculatePrecisionAndSensitivity.wdl
Expand All @@ -201,7 +202,6 @@ workflows:
- master
- ah_var_store
- vs_447_fixup_non_fq_invocations
- rsa_use_internal_project
- name: GvsIngestTieout
subclass: WDL
primaryDescriptorPath: /scripts/variantstore/wdl/GvsIngestTieout.wdl
Expand Down
10 changes: 6 additions & 4 deletions scripts/variantstore/wdl/GvsAssignIds.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,8 @@ task AssignIds {

Int samples_per_table = 4000
String has_service_account_file = if (defined(service_account_json_path)) then 'true' else 'false'
# add labels for DSP Cloud Cost Control Labeling and Reporting
String bq_labels = "--label service:gvs --label team:variants --label managedby:assign_ids"

command <<<
set -e
Expand Down Expand Up @@ -132,20 +134,20 @@ task AssignIds {
bq load --project_id=~{project_id} ~{dataset_name}.sample_id_assignment_lock $NAMES_FILE "sample_name:STRING"

# add sample_name to sample_info_table
bq --project_id=~{project_id} query --use_legacy_sql=false \
bq --project_id=~{project_id} query --use_legacy_sql=false ~{bq_labels} \
'INSERT into `~{dataset_name}.~{sample_info_table}` (sample_name, is_control) select sample_name, ~{samples_are_controls} from `~{dataset_name}.sample_id_assignment_lock` m where m.sample_name not in (SELECT sample_name FROM `~{dataset_name}.~{sample_info_table}`)'

# get the current maximum id, or 0 if there are none
bq --project_id=~{project_id} query --format=csv --use_legacy_sql=false 'SELECT IFNULL(MAX(sample_id),0) FROM `~{dataset_name}.~{sample_info_table}`' > maxid
bq --project_id=~{project_id} query --format=csv --use_legacy_sql=false ~{bq_labels} 'SELECT IFNULL(MAX(sample_id),0) FROM `~{dataset_name}.~{sample_info_table}`' > maxid
offset=$(tail -1 maxid)

# perform actual id assignment
bq --project_id=~{project_id} query --format=csv --use_legacy_sql=false --parameter=offset:INTEGER:$offset \
bq --project_id=~{project_id} query --format=csv --use_legacy_sql=false ~{bq_labels} --parameter=offset:INTEGER:$offset \
'UPDATE `~{dataset_name}.~{sample_info_table}` m SET m.sample_id = id_assign.id FROM (SELECT sample_name, @offset + ROW_NUMBER() OVER(order by sample_name) as id FROM `~{dataset_name}.~{sample_info_table}` WHERE sample_id IS NULL) id_assign WHERE m.sample_name = id_assign.sample_name;'

# retrieve the list of assigned ids and samples to update the datamodel
echo "entity:sample_id,gvs_id" > update.tsv
bq --project_id=~{project_id} query --format=csv --use_legacy_sql=false -n $num_samples --parameter=offset:INTEGER:$offset \
bq --project_id=~{project_id} query --format=csv --use_legacy_sql=false ~{bq_labels} -n $num_samples --parameter=offset:INTEGER:$offset \
'SELECT sample_name, sample_id from `~{dataset_name}.~{sample_info_table}` WHERE sample_id >= @offset' > update.tsv
cat update.tsv | sed -e 's/sample_id/gvs_id/' -e 's/sample_name/entity:sample_id/' -e 's/,/\t/g' > gvs_ids.tsv

Expand Down
8 changes: 6 additions & 2 deletions scripts/variantstore/wdl/GvsCreateAltAllele.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,8 @@ task GetVetTableNames {
}

String has_service_account_file = if (defined(service_account_json_path)) then 'true' else 'false'
# add labels for DSP Cloud Cost Control Labeling and Reporting
String bq_labels = "--label service:gvs --label team:variants --label managedby:create_alt_allele"

command <<<
set -e
Expand All @@ -77,7 +79,7 @@ task GetVetTableNames {
fi

echo "project_id = ~{project_id}" > ~/.bigqueryrc
bq query --location=US --project_id=~{project_id} --format=csv --use_legacy_sql=false \
bq query --location=US --project_id=~{project_id} --format=csv --use_legacy_sql=false ~{bq_labels} \
'SELECT table_name FROM `~{project_id}.~{dataset_name}.INFORMATION_SCHEMA.TABLES` WHERE table_name LIKE "vet_%" ORDER BY table_name' > vet_tables.csv

# remove the header row from the CSV file
Expand Down Expand Up @@ -109,6 +111,8 @@ task CreateAltAlleleTable {
}

String has_service_account_file = if (defined(service_account_json_path)) then 'true' else 'false'
# add labels for DSP Cloud Cost Control Labeling and Reporting
String bq_labels = "--label service:gvs --label team:variants --label managedby:create_alt_allele"

command <<<
set -e
Expand All @@ -120,7 +124,7 @@ task CreateAltAlleleTable {
fi

echo "project_id = ~{project_id}" > ~/.bigqueryrc
bq query --location=US --project_id=~{project_id} --format=csv --use_legacy_sql=false \
bq query --location=US --project_id=~{project_id} --format=csv --use_legacy_sql=false ~{bq_labels} \
'CREATE OR REPLACE TABLE `~{project_id}.~{dataset_name}.alt_allele` (
location INT64,
sample_id INT64,
Expand Down
8 changes: 6 additions & 2 deletions scripts/variantstore/wdl/GvsExtractCallset.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,8 @@ task ValidateFilterSetName {
}

String has_service_account_file = if (defined(service_account_json_path)) then 'true' else 'false'
# add labels for DSP Cloud Cost Control Labeling and Reporting
String bq_labels = "--label service:gvs --label team:variants --label managedby:extract_callset"

command <<<
set -ex
Expand All @@ -236,7 +238,7 @@ task ValidateFilterSetName {

echo "project_id = ~{query_project}" > ~/.bigqueryrc

OUTPUT=$(bq --location=US --project_id=~{query_project} --format=csv query --use_legacy_sql=false "SELECT filter_set_name as available_filter_set_names FROM \`~{data_project}.~{data_dataset}.filter_set_info\` GROUP BY filter_set_name")
OUTPUT=$(bq --location=US --project_id=~{query_project} --format=csv query --use_legacy_sql=false ~{bq_labels} "SELECT filter_set_name as available_filter_set_names FROM \`~{data_project}.~{data_dataset}.filter_set_info\` GROUP BY filter_set_name")
FILTERSETS=${OUTPUT#"available_filter_set_names"}

if [[ $FILTERSETS =~ "~{filter_set_name}" ]]; then
Expand Down Expand Up @@ -472,6 +474,8 @@ task GenerateSampleListFile {
}

String has_service_account_file = if (defined(service_account_json_path)) then 'true' else 'false'
# add labels for DSP Cloud Cost Control Labeling and Reporting
String bq_labels = "--label service:gvs --label team:variants --label managedby:extract_callset"

command <<<
set -e
Expand All @@ -487,7 +491,7 @@ task GenerateSampleListFile {

echo "project_id = ~{query_project}" > ~/.bigqueryrc

bq --location=US --project_id=~{query_project} --format=csv query --use_legacy_sql=false "SELECT sample_name FROM ~{fq_samples_to_extract_table}" | sed 1d > sample-name-list.txt
bq --location=US --project_id=~{query_project} --format=csv query --use_legacy_sql=false ~{bq_labels} "SELECT sample_name FROM ~{fq_samples_to_extract_table}" | sed 1d > sample-name-list.txt

if [ -n "$OUTPUT_GCS_DIR" ]; then
gsutil cp sample-name-list.txt ${OUTPUT_GCS_DIR}/
Expand Down
10 changes: 7 additions & 3 deletions scripts/variantstore/wdl/GvsImportGenomes.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -250,6 +250,8 @@ task SetIsLoadedColumn {
}

String has_service_account_file = if (defined(service_account_json_path)) then 'true' else 'false'
# add labels for DSP Cloud Cost Control Labeling and Reporting
String bq_labels = "--label service:gvs --label team:variants --label managedby:import_genomes"

command <<<
set -ex
Expand All @@ -263,7 +265,7 @@ task SetIsLoadedColumn {
echo "project_id = ~{project_id}" > ~/.bigqueryrc

# set is_loaded to true if there is a corresponding vet table partition with rows for that sample_id
bq --location=US --project_id=~{project_id} query --format=csv --use_legacy_sql=false \
bq --location=US --project_id=~{project_id} query --format=csv --use_legacy_sql=false ~{bq_labels} \
'UPDATE `~{dataset_name}.sample_info` SET is_loaded = true WHERE sample_id IN (SELECT CAST(partition_id AS INT64) from `~{dataset_name}.INFORMATION_SCHEMA.PARTITIONS` WHERE partition_id NOT LIKE "__%" AND total_logical_bytes > 0 AND table_name LIKE "vet_%") OR sample_id IN (SELECT sample_id FROM `~{dataset_name}.sample_load_status` GROUP BY 1 HAVING COUNT(1) = 2)'
>>>
runtime {
Expand Down Expand Up @@ -296,6 +298,8 @@ task GetUningestedSampleIds {
Int samples_per_table = 4000
String has_service_account_file = if (defined(service_account_json_path)) then 'true' else 'false'
Int num_samples = length(external_sample_names)
# add labels for DSP Cloud Cost Control Labeling and Reporting
String bq_labels = "--label service:gvs --label team:variants --label managedby:import_genomes"

command <<<
set -ex
Expand Down Expand Up @@ -325,7 +329,7 @@ task GetUningestedSampleIds {
bq load --project_id=~{project_id} ${TEMP_TABLE} $NAMES_FILE "sample_name:STRING"

# get the current maximum id, or 0 if there are none
bq --project_id=~{project_id} query --format=csv --use_legacy_sql=false \
bq --project_id=~{project_id} query --format=csv --use_legacy_sql=false ~{bq_labels} \
"SELECT IFNULL(MIN(sample_id),0) as min, IFNULL(MAX(sample_id),0) as max FROM \`~{dataset_name}.~{table_name}\` AS samples JOIN \`${TEMP_TABLE}\` AS temp ON samples.sample_name=temp.sample_name" > results

# prep for being able to return min table id
Expand All @@ -342,7 +346,7 @@ task GetUningestedSampleIds {
python3 -c "from math import ceil; print(ceil($min_sample_id/~{samples_per_table}))" > min_sample_id

# get sample map of samples that haven't been loaded yet
bq --project_id=~{project_id} query --format=csv --use_legacy_sql=false -n ~{num_samples} \
bq --project_id=~{project_id} query --format=csv --use_legacy_sql=false ~{bq_labels} -n ~{num_samples} \
"SELECT sample_id, samples.sample_name FROM \`~{dataset_name}.~{table_name}\` AS samples JOIN \`${TEMP_TABLE}\` AS temp ON samples.sample_name=temp.sample_name WHERE samples.sample_id NOT IN (SELECT sample_id FROM \`~{dataset_name}.sample_load_status\` WHERE status='FINISHED')" > sample_map

cut -d, -f1 sample_map > gvs_ids
Expand Down
6 changes: 6 additions & 0 deletions scripts/variantstore/wdl/GvsUtils.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -287,6 +287,12 @@ task BuildGATKJarAndCreateDataset {

bq mk --project_id="gvs-internal" "$dataset"

# add labels for DSP Cloud Cost Control Labeling and Reporting
bq update --set_label service:gvs gvs-internal:$dataset
bq update --set_label team:variants gvs-internal:$dataset
bq update --set_label env:dev gvs-internal:$dataset
bq update --set_label managedby:build_gatk_jar_and_create_dataset gvs-internal:$dataset

echo -n "$dataset" > dataset.txt
>>>

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -246,6 +246,9 @@ def make_extract_table(control_samples,
if not (bool(re.match(r"[a-z0-9_-]+$", key)) & bool(re.match(r"[a-z0-9_-]+$", value))):
raise ValueError(f"label key or value did not pass validation--format should be 'key1=val1, key2=val2'")

# add labels for DSP Cloud Cost Control Labeling and Reporting
query_labels.update({'service':'gvs','team':'variants','managedby':'prepare_ranges_callset'})

rsasch marked this conversation as resolved.
#Default QueryJobConfig will be merged into job configs passed in
#but if a specific default config is being updated (eg labels), new config must be added
#to the client._default_query_job_config that already exists
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@

def populate_alt_allele_table(query_project, vet_table_name, fq_dataset, sa_key_path):
global client
default_config = QueryJobConfig(priority="INTERACTIVE", use_query_cache=True)
# add labels for DSP Cloud Cost Control Labeling and Reporting to default_config
default_config = QueryJobConfig(priority="INTERACTIVE", use_query_cache=True, labels={'service':'gvs','team':'variants','managedby':'create_alt_allele'})

if sa_key_path:
credentials = service_account.Credentials.from_service_account_file(
Expand Down