Add preliminary labels to queries [VS-381] (#7902)
rsasch authored and RoriCremer committed Jun 29, 2022
1 parent 18ce129 commit 91d5e96
Showing 9 changed files with 38 additions and 15 deletions.
1 change: 0 additions & 1 deletion .dockstore.yml
@@ -202,7 +202,6 @@ workflows:
        - master
        - ah_var_store
        - vs_447_fixup_non_fq_invocations
-       - rsa_use_internal_project
    - name: GvsIngestTieout
      subclass: WDL
      primaryDescriptorPath: /scripts/variantstore/wdl/GvsIngestTieout.wdl
10 changes: 6 additions & 4 deletions scripts/variantstore/wdl/GvsAssignIds.wdl
@@ -97,6 +97,8 @@ task AssignIds {

  Int samples_per_table = 4000
  String has_service_account_file = if (defined(service_account_json_path)) then 'true' else 'false'
+ # add labels for DSP Cloud Cost Control Labeling and Reporting
+ String bq_labels = "--label service:gvs --label team:variants --label managedby:assign_ids"

  command <<<
  set -e
@@ -137,20 +139,20 @@ task AssignIds {
  bq load --project_id=~{project_id} ~{dataset_name}.sample_id_assignment_lock $NAMES_FILE "sample_name:STRING"

  # add sample_name to sample_info_table
- bq --project_id=~{project_id} query --use_legacy_sql=false \
+ bq --project_id=~{project_id} query --use_legacy_sql=false ~{bq_labels} \
  'INSERT into `~{dataset_name}.~{sample_info_table}` (sample_name, is_control) select sample_name, ~{samples_are_controls} from `~{dataset_name}.sample_id_assignment_lock` m where m.sample_name not in (SELECT sample_name FROM `~{dataset_name}.~{sample_info_table}`)'

  # get the current maximum id, or 0 if there are none
- bq --project_id=~{project_id} query --format=csv --use_legacy_sql=false 'SELECT IFNULL(MAX(sample_id),0) FROM `~{dataset_name}.~{sample_info_table}`' > maxid
+ bq --project_id=~{project_id} query --format=csv --use_legacy_sql=false ~{bq_labels} 'SELECT IFNULL(MAX(sample_id),0) FROM `~{dataset_name}.~{sample_info_table}`' > maxid
  offset=$(tail -1 maxid)

  # perform actual id assignment
- bq --project_id=~{project_id} query --format=csv --use_legacy_sql=false --parameter=offset:INTEGER:$offset \
+ bq --project_id=~{project_id} query --format=csv --use_legacy_sql=false ~{bq_labels} --parameter=offset:INTEGER:$offset \
  'UPDATE `~{dataset_name}.~{sample_info_table}` m SET m.sample_id = id_assign.id FROM (SELECT sample_name, @offset + ROW_NUMBER() OVER(order by sample_name) as id FROM `~{dataset_name}.~{sample_info_table}` WHERE sample_id IS NULL) id_assign WHERE m.sample_name = id_assign.sample_name;'

  # retrieve the list of assigned ids and samples to update the datamodel
  echo "entity:sample_id,gvs_id" > update.tsv
- bq --project_id=~{project_id} query --format=csv --use_legacy_sql=false -n $num_samples --parameter=offset:INTEGER:$offset \
+ bq --project_id=~{project_id} query --format=csv --use_legacy_sql=false ~{bq_labels} -n $num_samples --parameter=offset:INTEGER:$offset \
  'SELECT sample_name, sample_id from `~{dataset_name}.~{sample_info_table}` WHERE sample_id >= @offset' > update.tsv
  cat update.tsv | sed -e 's/sample_id/gvs_id/' -e 's/sample_name/entity:sample_id/' -e 's/,/\t/g' > gvs_ids.tsv
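
Note on the mechanics: ~{bq_labels} interpolates into each bq invocation as three repeated --label key:value flags, so every query job this task (and the similarly updated tasks below) issues is tagged for DSP cost reporting. Below is a minimal sketch of the same labeling, plus the task's @offset binding, via the BigQuery Python client; the project and table names are hypothetical placeholders, not part of this change.

from google.cloud import bigquery

# Sketch only: label a single query job the way the ~{bq_labels} flags do,
# and bind the @offset parameter like --parameter=offset:INTEGER:$offset.
client = bigquery.Client(project="my-project")  # placeholder project
job_config = bigquery.QueryJobConfig(
    labels={"service": "gvs", "team": "variants", "managedby": "assign_ids"},
    query_parameters=[bigquery.ScalarQueryParameter("offset", "INT64", 0)],
)
job = client.query(
    "SELECT sample_name, sample_id FROM `my_dataset.sample_info` "  # placeholder table
    "WHERE sample_id >= @offset",
    job_config=job_config,
)
rows = list(job.result())  # the completed job carries the labels in its metadata

Labels attached this way surface on the job resource and in the billing export, which is what makes per-task cost grouping possible.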
10 changes: 7 additions & 3 deletions scripts/variantstore/wdl/GvsCreateAltAllele.wdl
@@ -66,6 +66,8 @@ task GetVetTableNames {
  }

  String has_service_account_file = if (defined(service_account_json_path)) then 'true' else 'false'
+ # add labels for DSP Cloud Cost Control Labeling and Reporting
+ String bq_labels = "--label service:gvs --label team:variants --label managedby:create_alt_allele"

  command <<<
  set -e
@@ -77,7 +79,7 @@
  fi

  echo "project_id = ~{project_id}" > ~/.bigqueryrc
- bq query --location=US --project_id=~{project_id} --format=csv --use_legacy_sql=false \
+ bq query --location=US --project_id=~{project_id} --format=csv --use_legacy_sql=false ~{bq_labels} \
  'SELECT table_name FROM `~{project_id}.~{dataset_name}.INFORMATION_SCHEMA.TABLES` WHERE table_name LIKE "vet_%" ORDER BY table_name' > vet_tables.csv

  # remove the header row from the CSV file
@@ -109,6 +111,8 @@ task CreateAltAlleleTable {
  }

  String has_service_account_file = if (defined(service_account_json_path)) then 'true' else 'false'
+ # add labels for DSP Cloud Cost Control Labeling and Reporting
+ String bq_labels = "--label service:gvs --label team:variants --label managedby:create_alt_allele"

  command <<<
  set -e
@@ -120,7 +124,7 @@
  fi

  echo "project_id = ~{project_id}" > ~/.bigqueryrc
- bq query --location=US --project_id=~{project_id} --format=csv --use_legacy_sql=false \
+ bq query --location=US --project_id=~{project_id} --format=csv --use_legacy_sql=false ~{bq_labels} \
  'CREATE OR REPLACE TABLE `~{project_id}.~{dataset_name}.alt_allele` (
  location INT64,
  sample_id INT64,
@@ -196,7 +200,7 @@ task PopulateAltAlleleTable {
  $SERVICE_ACCOUNT_STANZA
  >>>
  runtime {
- docker: "us.gcr.io/broad-dsde-methods/variantstore:ah_var_store_2022_05_13"
+ docker: "us.gcr.io/broad-dsde-methods/variantstore:ah_var_store_2022_06_15"
  memory: "3 GB"
  disks: "local-disk 10 HDD"
  cpu: 1
8 changes: 6 additions & 2 deletions scripts/variantstore/wdl/GvsExtractCallset.wdl
@@ -224,6 +224,8 @@ task ValidateFilterSetName {
  }

  String has_service_account_file = if (defined(service_account_json_path)) then 'true' else 'false'
+ # add labels for DSP Cloud Cost Control Labeling and Reporting
+ String bq_labels = "--label service:gvs --label team:variants --label managedby:extract_callset"

  command <<<
  set -ex
@@ -236,7 +238,7 @@

  echo "project_id = ~{query_project}" > ~/.bigqueryrc

- OUTPUT=$(bq --location=US --project_id=~{query_project} --format=csv query --use_legacy_sql=false "SELECT filter_set_name as available_filter_set_names FROM \`~{data_project}.~{data_dataset}.filter_set_info\` GROUP BY filter_set_name")
+ OUTPUT=$(bq --location=US --project_id=~{query_project} --format=csv query --use_legacy_sql=false ~{bq_labels} "SELECT filter_set_name as available_filter_set_names FROM \`~{data_project}.~{data_dataset}.filter_set_info\` GROUP BY filter_set_name")
  FILTERSETS=${OUTPUT#"available_filter_set_names"}

  if [[ $FILTERSETS =~ "~{filter_set_name}" ]]; then
@@ -472,6 +474,8 @@ task GenerateSampleListFile {
  }

  String has_service_account_file = if (defined(service_account_json_path)) then 'true' else 'false'
+ # add labels for DSP Cloud Cost Control Labeling and Reporting
+ String bq_labels = "--label service:gvs --label team:variants --label managedby:extract_callset"

  command <<<
  set -e
@@ -487,7 +491,7 @@

  echo "project_id = ~{query_project}" > ~/.bigqueryrc

- bq --location=US --project_id=~{query_project} --format=csv query --use_legacy_sql=false "SELECT sample_name FROM ~{fq_samples_to_extract_table}" | sed 1d > sample-name-list.txt
+ bq --location=US --project_id=~{query_project} --format=csv query --use_legacy_sql=false ~{bq_labels} "SELECT sample_name FROM ~{fq_samples_to_extract_table}" | sed 1d > sample-name-list.txt

  if [ -n "$OUTPUT_GCS_DIR" ]; then
  gsutil cp sample-name-list.txt ${OUTPUT_GCS_DIR}/
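
For context on how these job labels get consumed: billed bytes can later be grouped by label through the region-qualified INFORMATION_SCHEMA.JOBS views. A hedged sketch, assuming the US region, a placeholder project name, and sufficient permissions on the view:

from google.cloud import bigquery

# Sketch only: sum bytes billed over the last 7 days per "managedby" label.
client = bigquery.Client(project="my-project")  # placeholder project
sql = """
    SELECT l.value AS managedby, SUM(total_bytes_billed) AS bytes_billed
    FROM `region-us`.INFORMATION_SCHEMA.JOBS_BY_PROJECT, UNNEST(labels) AS l
    WHERE l.key = 'managedby'
      AND creation_time > TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 7 DAY)
    GROUP BY 1
"""
for row in client.query(sql).result():
    print(row.managedby, row.bytes_billed)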
10 changes: 7 additions & 3 deletions scripts/variantstore/wdl/GvsImportGenomes.wdl
@@ -250,6 +250,8 @@ task SetIsLoadedColumn {
  }

  String has_service_account_file = if (defined(service_account_json_path)) then 'true' else 'false'
+ # add labels for DSP Cloud Cost Control Labeling and Reporting
+ String bq_labels = "--label service:gvs --label team:variants --label managedby:import_genomes"

  command <<<
  set -ex
@@ -263,7 +265,7 @@
  echo "project_id = ~{project_id}" > ~/.bigqueryrc

  # set is_loaded to true if there is a corresponding vet table partition with rows for that sample_id
- bq --location=US --project_id=~{project_id} query --format=csv --use_legacy_sql=false \
+ bq --location=US --project_id=~{project_id} query --format=csv --use_legacy_sql=false ~{bq_labels} \
  'UPDATE `~{dataset_name}.sample_info` SET is_loaded = true WHERE sample_id IN (SELECT CAST(partition_id AS INT64) from `~{dataset_name}.INFORMATION_SCHEMA.PARTITIONS` WHERE partition_id NOT LIKE "__%" AND total_logical_bytes > 0 AND table_name LIKE "vet_%") OR sample_id IN (SELECT sample_id FROM `~{dataset_name}.sample_load_status` GROUP BY 1 HAVING COUNT(1) = 2)'
  >>>
  runtime {
@@ -296,6 +298,8 @@ task GetUningestedSampleIds {
  Int samples_per_table = 4000
  String has_service_account_file = if (defined(service_account_json_path)) then 'true' else 'false'
  Int num_samples = length(external_sample_names)
+ # add labels for DSP Cloud Cost Control Labeling and Reporting
+ String bq_labels = "--label service:gvs --label team:variants --label managedby:import_genomes"

  command <<<
  set -ex
@@ -325,7 +329,7 @@ task GetUningestedSampleIds {
  bq load --project_id=~{project_id} ${TEMP_TABLE} $NAMES_FILE "sample_name:STRING"

  # get the current maximum id, or 0 if there are none
- bq --project_id=~{project_id} query --format=csv --use_legacy_sql=false \
+ bq --project_id=~{project_id} query --format=csv --use_legacy_sql=false ~{bq_labels} \
  "SELECT IFNULL(MIN(sample_id),0) as min, IFNULL(MAX(sample_id),0) as max FROM \`~{dataset_name}.~{table_name}\` AS samples JOIN \`${TEMP_TABLE}\` AS temp ON samples.sample_name=temp.sample_name" > results

  # prep for being able to return min table id
@@ -342,7 +346,7 @@
  python3 -c "from math import ceil; print(ceil($min_sample_id/~{samples_per_table}))" > min_sample_id

  # get sample map of samples that haven't been loaded yet
- bq --project_id=~{project_id} query --format=csv --use_legacy_sql=false -n ~{num_samples} \
+ bq --project_id=~{project_id} query --format=csv --use_legacy_sql=false ~{bq_labels} -n ~{num_samples} \
  "SELECT sample_id, samples.sample_name FROM \`~{dataset_name}.~{table_name}\` AS samples JOIN \`${TEMP_TABLE}\` AS temp ON samples.sample_name=temp.sample_name WHERE samples.sample_id NOT IN (SELECT sample_id FROM \`~{dataset_name}.sample_load_status\` WHERE status='FINISHED')" > sample_map

  cut -d, -f1 sample_map > gvs_ids
2 changes: 1 addition & 1 deletion scripts/variantstore/wdl/GvsPrepareRangesCallset.wdl
@@ -112,7 +112,7 @@ task PrepareRangesCallsetTask {
  }

  runtime {
- docker: "us.gcr.io/broad-dsde-methods/variantstore:ah_var_store_2022_05_16"
+ docker: "us.gcr.io/broad-dsde-methods/variantstore:ah_var_store_2022_06_15"
  memory: "3 GB"
  disks: "local-disk 100 HDD"
  bootDiskSizeGb: 15
6 changes: 6 additions & 0 deletions scripts/variantstore/wdl/GvsUtils.wdl
@@ -287,6 +287,12 @@ task BuildGATKJarAndCreateDataset {

  bq mk --project_id="gvs-internal" "$dataset"

+ # add labels for DSP Cloud Cost Control Labeling and Reporting
+ bq update --set_label service:gvs gvs-internal:$dataset
+ bq update --set_label team:variants gvs-internal:$dataset
+ bq update --set_label environment:dev gvs-internal:$dataset
+ bq update --set_label managedby:build_gatk_jar_and_create_dataset gvs-internal:$dataset
+
  echo -n "$dataset" > dataset.txt
  >>>
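
Unlike the per-query --label flags above, these bq update --set_label calls tag the dataset itself, so storage as well as compute can be attributed. A minimal sketch of the equivalent through the BigQuery Python client; "my_dataset" stands in for the generated dataset name:

from google.cloud import bigquery

# Sketch only: set dataset-level labels, mirroring the four bq update calls.
client = bigquery.Client(project="gvs-internal")
dataset = client.get_dataset("gvs-internal.my_dataset")  # placeholder dataset
dataset.labels = {
    "service": "gvs",
    "team": "variants",
    "environment": "dev",
    "managedby": "build_gatk_jar_and_create_dataset",
}
client.update_dataset(dataset, ["labels"])  # send only the labels field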
@@ -246,6 +246,9 @@ def make_extract_table(control_samples,
  if not (bool(re.match(r"[a-z0-9_-]+$", key)) & bool(re.match(r"[a-z0-9_-]+$", value))):
  raise ValueError(f"label key or value did not pass validation--format should be 'key1=val1, key2=val2'")

+ # add labels for DSP Cloud Cost Control Labeling and Reporting
+ query_labels.update({'service':'gvs','team':'variants','managedby':'prepare_ranges_callset'})
+
  #Default QueryJobConfig will be merged into job configs passed in
  #but if a specific default config is being updated (eg labels), new config must be added
  #to the client._default_query_job_config that already exists
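
The comment in the surrounding code is the key detail: the BigQuery client merges its default QueryJobConfig into any per-query config, so labels set once on the default apply to every query the client runs. A minimal sketch of that merging behavior, with a placeholder project name:

from google.cloud import bigquery

# Sketch only: labels on the client's default QueryJobConfig ride along on
# every query job, with no per-call plumbing needed.
default_config = bigquery.QueryJobConfig(
    labels={"service": "gvs", "team": "variants", "managedby": "prepare_ranges_callset"}
)
client = bigquery.Client(
    project="my-project",  # placeholder project
    default_query_job_config=default_config,
)
job = client.query("SELECT 1")  # no job_config passed
job.result()  # the job was submitted with the default labels merged in

The same pattern appears in populate_alt_allele_table below, where the labels are passed directly when the default_config is constructed.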
@@ -13,7 +13,8 @@

  def populate_alt_allele_table(query_project, vet_table_name, fq_dataset, sa_key_path):
  global client
- default_config = QueryJobConfig(priority="INTERACTIVE", use_query_cache=True)
+ # add labels for DSP Cloud Cost Control Labeling and Reporting to default_config
+ default_config = QueryJobConfig(priority="INTERACTIVE", use_query_cache=True, labels={'service':'gvs','team':'variants','managedby':'create_alt_allele'})

  if sa_key_path:
  credentials = service_account.Credentials.from_service_account_file(
