diff --git a/.dockstore.yml b/.dockstore.yml
index 29176a0d675..13e77a7b7c6 100644
--- a/.dockstore.yml
+++ b/.dockstore.yml
@@ -202,7 +202,6 @@ workflows:
         - master
         - ah_var_store
         - vs_447_fixup_non_fq_invocations
-        - rsa_use_internal_project
   - name: GvsIngestTieout
     subclass: WDL
     primaryDescriptorPath: /scripts/variantstore/wdl/GvsIngestTieout.wdl
diff --git a/scripts/variantstore/wdl/GvsAssignIds.wdl b/scripts/variantstore/wdl/GvsAssignIds.wdl
index 4bc7976b8e7..214e911779b 100644
--- a/scripts/variantstore/wdl/GvsAssignIds.wdl
+++ b/scripts/variantstore/wdl/GvsAssignIds.wdl
@@ -97,6 +97,8 @@ task AssignIds {
 
   Int samples_per_table = 4000
   String has_service_account_file = if (defined(service_account_json_path)) then 'true' else 'false'
+  # add labels for DSP Cloud Cost Control Labeling and Reporting
+  String bq_labels = "--label service:gvs --label team:variants --label managedby:assign_ids"
 
   command <<<
     set -e
@@ -137,20 +139,20 @@
     bq load --project_id=~{project_id} ~{dataset_name}.sample_id_assignment_lock $NAMES_FILE "sample_name:STRING"
 
     # add sample_name to sample_info_table
-    bq --project_id=~{project_id} query --use_legacy_sql=false \
+    bq --project_id=~{project_id} query --use_legacy_sql=false ~{bq_labels} \
      'INSERT into `~{dataset_name}.~{sample_info_table}` (sample_name, is_control) select sample_name, ~{samples_are_controls} from `~{dataset_name}.sample_id_assignment_lock` m where m.sample_name not in (SELECT sample_name FROM `~{dataset_name}.~{sample_info_table}`)'
 
     # get the current maximum id, or 0 if there are none
-    bq --project_id=~{project_id} query --format=csv --use_legacy_sql=false 'SELECT IFNULL(MAX(sample_id),0) FROM `~{dataset_name}.~{sample_info_table}`' > maxid
+    bq --project_id=~{project_id} query --format=csv --use_legacy_sql=false ~{bq_labels} 'SELECT IFNULL(MAX(sample_id),0) FROM `~{dataset_name}.~{sample_info_table}`' > maxid
     offset=$(tail -1 maxid)
 
     # perform actual id assignment
-    bq --project_id=~{project_id} query --format=csv --use_legacy_sql=false --parameter=offset:INTEGER:$offset \
+    bq --project_id=~{project_id} query --format=csv --use_legacy_sql=false ~{bq_labels} --parameter=offset:INTEGER:$offset \
      'UPDATE `~{dataset_name}.~{sample_info_table}` m SET m.sample_id = id_assign.id FROM (SELECT sample_name, @offset + ROW_NUMBER() OVER(order by sample_name) as id FROM `~{dataset_name}.~{sample_info_table}` WHERE sample_id IS NULL) id_assign WHERE m.sample_name = id_assign.sample_name;'
 
     # retrieve the list of assigned ids and samples to update the datamodel
     echo "entity:sample_id,gvs_id" > update.tsv
-    bq --project_id=~{project_id} query --format=csv --use_legacy_sql=false -n $num_samples --parameter=offset:INTEGER:$offset \
+    bq --project_id=~{project_id} query --format=csv --use_legacy_sql=false ~{bq_labels} -n $num_samples --parameter=offset:INTEGER:$offset \
      'SELECT sample_name, sample_id from `~{dataset_name}.~{sample_info_table}` WHERE sample_id >= @offset' > update.tsv
 
     cat update.tsv | sed -e 's/sample_id/gvs_id/' -e 's/sample_name/entity:sample_id/' -e 's/,/\t/g' > gvs_ids.tsv
diff --git a/scripts/variantstore/wdl/GvsCreateAltAllele.wdl b/scripts/variantstore/wdl/GvsCreateAltAllele.wdl
index c009889873f..89a0e728c06 100644
--- a/scripts/variantstore/wdl/GvsCreateAltAllele.wdl
+++ b/scripts/variantstore/wdl/GvsCreateAltAllele.wdl
@@ -66,6 +66,8 @@ task GetVetTableNames {
   }
 
   String has_service_account_file = if (defined(service_account_json_path)) then 'true' else 'false'
+  # add labels for DSP Cloud Cost Control Labeling and Reporting
+ String bq_labels = "--label service:gvs --label team:variants --label managedby:create_alt_allele" command <<< set -e @@ -77,7 +79,7 @@ task GetVetTableNames { fi echo "project_id = ~{project_id}" > ~/.bigqueryrc - bq query --location=US --project_id=~{project_id} --format=csv --use_legacy_sql=false \ + bq query --location=US --project_id=~{project_id} --format=csv --use_legacy_sql=false ~{bq_labels} \ 'SELECT table_name FROM `~{project_id}.~{dataset_name}.INFORMATION_SCHEMA.TABLES` WHERE table_name LIKE "vet_%" ORDER BY table_name' > vet_tables.csv # remove the header row from the CSV file @@ -109,6 +111,8 @@ task CreateAltAlleleTable { } String has_service_account_file = if (defined(service_account_json_path)) then 'true' else 'false' + # add labels for DSP Cloud Cost Control Labeling and Reporting + String bq_labels = "--label service:gvs --label team:variants --label managedby:create_alt_allele" command <<< set -e @@ -120,7 +124,7 @@ task CreateAltAlleleTable { fi echo "project_id = ~{project_id}" > ~/.bigqueryrc - bq query --location=US --project_id=~{project_id} --format=csv --use_legacy_sql=false \ + bq query --location=US --project_id=~{project_id} --format=csv --use_legacy_sql=false ~{bq_labels} \ 'CREATE OR REPLACE TABLE `~{project_id}.~{dataset_name}.alt_allele` ( location INT64, sample_id INT64, @@ -196,7 +200,7 @@ task PopulateAltAlleleTable { $SERVICE_ACCOUNT_STANZA >>> runtime { - docker: "us.gcr.io/broad-dsde-methods/variantstore:ah_var_store_2022_05_13" + docker: "us.gcr.io/broad-dsde-methods/variantstore:ah_var_store_2022_06_15" memory: "3 GB" disks: "local-disk 10 HDD" cpu: 1 diff --git a/scripts/variantstore/wdl/GvsExtractCallset.wdl b/scripts/variantstore/wdl/GvsExtractCallset.wdl index 9d2ad81ae54..074dad2774c 100644 --- a/scripts/variantstore/wdl/GvsExtractCallset.wdl +++ b/scripts/variantstore/wdl/GvsExtractCallset.wdl @@ -224,6 +224,8 @@ task ValidateFilterSetName { } String has_service_account_file = if (defined(service_account_json_path)) then 'true' else 'false' + # add labels for DSP Cloud Cost Control Labeling and Reporting + String bq_labels = "--label service:gvs --label team:variants --label managedby:extract_callset" command <<< set -ex @@ -236,7 +238,7 @@ task ValidateFilterSetName { echo "project_id = ~{query_project}" > ~/.bigqueryrc - OUTPUT=$(bq --location=US --project_id=~{query_project} --format=csv query --use_legacy_sql=false "SELECT filter_set_name as available_filter_set_names FROM \`~{data_project}.~{data_dataset}.filter_set_info\` GROUP BY filter_set_name") + OUTPUT=$(bq --location=US --project_id=~{query_project} --format=csv query --use_legacy_sql=false ~{bq_labels} "SELECT filter_set_name as available_filter_set_names FROM \`~{data_project}.~{data_dataset}.filter_set_info\` GROUP BY filter_set_name") FILTERSETS=${OUTPUT#"available_filter_set_names"} if [[ $FILTERSETS =~ "~{filter_set_name}" ]]; then @@ -472,6 +474,8 @@ task GenerateSampleListFile { } String has_service_account_file = if (defined(service_account_json_path)) then 'true' else 'false' + # add labels for DSP Cloud Cost Control Labeling and Reporting + String bq_labels = "--label service:gvs --label team:variants --label managedby:extract_callset" command <<< set -e @@ -487,7 +491,7 @@ task GenerateSampleListFile { echo "project_id = ~{query_project}" > ~/.bigqueryrc - bq --location=US --project_id=~{query_project} --format=csv query --use_legacy_sql=false "SELECT sample_name FROM ~{fq_samples_to_extract_table}" | sed 1d > sample-name-list.txt + bq --location=US 
--project_id=~{query_project} --format=csv query --use_legacy_sql=false ~{bq_labels} "SELECT sample_name FROM ~{fq_samples_to_extract_table}" | sed 1d > sample-name-list.txt if [ -n "$OUTPUT_GCS_DIR" ]; then gsutil cp sample-name-list.txt ${OUTPUT_GCS_DIR}/ diff --git a/scripts/variantstore/wdl/GvsImportGenomes.wdl b/scripts/variantstore/wdl/GvsImportGenomes.wdl index c1790e9dc75..efd20d3efee 100644 --- a/scripts/variantstore/wdl/GvsImportGenomes.wdl +++ b/scripts/variantstore/wdl/GvsImportGenomes.wdl @@ -250,6 +250,8 @@ task SetIsLoadedColumn { } String has_service_account_file = if (defined(service_account_json_path)) then 'true' else 'false' + # add labels for DSP Cloud Cost Control Labeling and Reporting + String bq_labels = "--label service:gvs --label team:variants --label managedby:import_genomes" command <<< set -ex @@ -263,7 +265,7 @@ task SetIsLoadedColumn { echo "project_id = ~{project_id}" > ~/.bigqueryrc # set is_loaded to true if there is a corresponding vet table partition with rows for that sample_id - bq --location=US --project_id=~{project_id} query --format=csv --use_legacy_sql=false \ + bq --location=US --project_id=~{project_id} query --format=csv --use_legacy_sql=false ~{bq_labels} \ 'UPDATE `~{dataset_name}.sample_info` SET is_loaded = true WHERE sample_id IN (SELECT CAST(partition_id AS INT64) from `~{dataset_name}.INFORMATION_SCHEMA.PARTITIONS` WHERE partition_id NOT LIKE "__%" AND total_logical_bytes > 0 AND table_name LIKE "vet_%") OR sample_id IN (SELECT sample_id FROM `~{dataset_name}.sample_load_status` GROUP BY 1 HAVING COUNT(1) = 2)' >>> runtime { @@ -296,6 +298,8 @@ task GetUningestedSampleIds { Int samples_per_table = 4000 String has_service_account_file = if (defined(service_account_json_path)) then 'true' else 'false' Int num_samples = length(external_sample_names) + # add labels for DSP Cloud Cost Control Labeling and Reporting + String bq_labels = "--label service:gvs --label team:variants --label managedby:import_genomes" command <<< set -ex @@ -325,7 +329,7 @@ task GetUningestedSampleIds { bq load --project_id=~{project_id} ${TEMP_TABLE} $NAMES_FILE "sample_name:STRING" # get the current maximum id, or 0 if there are none - bq --project_id=~{project_id} query --format=csv --use_legacy_sql=false \ + bq --project_id=~{project_id} query --format=csv --use_legacy_sql=false ~{bq_labels} \ "SELECT IFNULL(MIN(sample_id),0) as min, IFNULL(MAX(sample_id),0) as max FROM \`~{dataset_name}.~{table_name}\` AS samples JOIN \`${TEMP_TABLE}\` AS temp ON samples.sample_name=temp.sample_name" > results # prep for being able to return min table id @@ -342,7 +346,7 @@ task GetUningestedSampleIds { python3 -c "from math import ceil; print(ceil($min_sample_id/~{samples_per_table}))" > min_sample_id # get sample map of samples that haven't been loaded yet - bq --project_id=~{project_id} query --format=csv --use_legacy_sql=false -n ~{num_samples} \ + bq --project_id=~{project_id} query --format=csv --use_legacy_sql=false ~{bq_labels} -n ~{num_samples} \ "SELECT sample_id, samples.sample_name FROM \`~{dataset_name}.~{table_name}\` AS samples JOIN \`${TEMP_TABLE}\` AS temp ON samples.sample_name=temp.sample_name WHERE samples.sample_id NOT IN (SELECT sample_id FROM \`~{dataset_name}.sample_load_status\` WHERE status='FINISHED')" > sample_map cut -d, -f1 sample_map > gvs_ids diff --git a/scripts/variantstore/wdl/GvsPrepareRangesCallset.wdl b/scripts/variantstore/wdl/GvsPrepareRangesCallset.wdl index 1372b594953..7b75d5c0fdb 100644 --- 
+++ b/scripts/variantstore/wdl/GvsPrepareRangesCallset.wdl
@@ -112,7 +112,7 @@ task PrepareRangesCallsetTask {
   }
 
   runtime {
-    docker: "us.gcr.io/broad-dsde-methods/variantstore:ah_var_store_2022_05_16"
+    docker: "us.gcr.io/broad-dsde-methods/variantstore:ah_var_store_2022_06_15"
     memory: "3 GB"
     disks: "local-disk 100 HDD"
     bootDiskSizeGb: 15
diff --git a/scripts/variantstore/wdl/GvsUtils.wdl b/scripts/variantstore/wdl/GvsUtils.wdl
index 6fbe511a037..330db440b7f 100644
--- a/scripts/variantstore/wdl/GvsUtils.wdl
+++ b/scripts/variantstore/wdl/GvsUtils.wdl
@@ -287,6 +287,12 @@
 
     bq mk --project_id="gvs-internal" "$dataset"
 
+    # add labels for DSP Cloud Cost Control Labeling and Reporting
+    bq update --set_label service:gvs gvs-internal:$dataset
+    bq update --set_label team:variants gvs-internal:$dataset
+    bq update --set_label environment:dev gvs-internal:$dataset
+    bq update --set_label managedby:build_gatk_jar_and_create_dataset gvs-internal:$dataset
+
     echo -n "$dataset" > dataset.txt
   >>>
 
diff --git a/scripts/variantstore/wdl/extract/create_ranges_cohort_extract_data_table.py b/scripts/variantstore/wdl/extract/create_ranges_cohort_extract_data_table.py
index 47a52e19415..7d69d060800 100644
--- a/scripts/variantstore/wdl/extract/create_ranges_cohort_extract_data_table.py
+++ b/scripts/variantstore/wdl/extract/create_ranges_cohort_extract_data_table.py
@@ -246,6 +246,9 @@ def make_extract_table(control_samples,
             if not (bool(re.match(r"[a-z0-9_-]+$", key)) & bool(re.match(r"[a-z0-9_-]+$", value))):
                 raise ValueError(f"label key or value did not pass validation--format should be 'key1=val1, key2=val2'")
 
+    # add labels for DSP Cloud Cost Control Labeling and Reporting
+    query_labels.update({'service':'gvs','team':'variants','managedby':'prepare_ranges_callset'})
+
     #Default QueryJobConfig will be merged into job configs passed in
     #but if a specific default config is being updated (eg labels), new config must be added
     #to the client._default_query_job_config that already exists
diff --git a/scripts/variantstore/wdl/extract/populate_alt_allele_table.py b/scripts/variantstore/wdl/extract/populate_alt_allele_table.py
index e7d3dec6d03..b3d39dba7fd 100644
--- a/scripts/variantstore/wdl/extract/populate_alt_allele_table.py
+++ b/scripts/variantstore/wdl/extract/populate_alt_allele_table.py
@@ -13,7 +13,8 @@ def populate_alt_allele_table(query_project,
                               vet_table_name,
                               fq_dataset,
                               sa_key_path):
     global client
-    default_config = QueryJobConfig(priority="INTERACTIVE", use_query_cache=True)
+    # add labels for DSP Cloud Cost Control Labeling and Reporting to default_config
+    default_config = QueryJobConfig(priority="INTERACTIVE", use_query_cache=True, labels={'service':'gvs','team':'variants','managedby':'create_alt_allele'})
     if sa_key_path:
         credentials = service_account.Credentials.from_service_account_file(
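
For reference, this patch attaches the same cost-attribution labels through two mechanisms: the bq CLI takes repeated "--label key:value" flags on queries (and "bq update --set_label" for datasets), while the Python scripts pass a labels dict through QueryJobConfig, which the comment in create_ranges_cohort_extract_data_table.py notes is merged into each job's config. Below is a minimal sketch, not part of the patch, showing how a labeled query would be issued with the google-cloud-bigquery client; the project name and the managedby value are placeholders, and it assumes the library is installed and default credentials are available.

    # Illustration only -- not part of the patch above.
    # Assumes google-cloud-bigquery is installed and application-default
    # credentials exist; "my-project" and "example_script" are placeholders.
    from google.cloud import bigquery
    from google.cloud.bigquery import QueryJobConfig

    # Keys and values must be lowercase letters, digits, underscores, or
    # hyphens, matching the validation regex the patch adds in
    # create_ranges_cohort_extract_data_table.py.
    labels = {'service': 'gvs', 'team': 'variants', 'managedby': 'example_script'}

    # Setting a default job config labels every query issued through this
    # client, the same pattern populate_alt_allele_table.py uses for its
    # default_config.
    client = bigquery.Client(project='my-project',
                             default_query_job_config=QueryJobConfig(labels=labels))

    job = client.query('SELECT 1 AS one')
    job.result()       # wait for the job to finish
    print(job.labels)  # the job carries the cost-attribution labels

Labeled jobs can then be grouped by these keys in billing exports, which appears to be the point of the distinct managedby value per workflow: it lets BigQuery spend be broken out by the GVS workflow that issued each query.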