Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add preliminary labels to queries [VS-381] #7902

Merged
12 commits merged on Jun 17, 2022
2 changes: 1 addition & 1 deletion .dockstore.yml
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,7 @@ workflows:
branches:
- master
- ah_var_store
- rsa_add_prelim_labels
- name: GvsCalculatePrecisionAndSensitivity
subclass: WDL
primaryDescriptorPath: /scripts/variantstore/wdl/GvsCalculatePrecisionAndSensitivity.wdl
Expand All @@ -201,7 +202,6 @@ workflows:
- master
- ah_var_store
- vs_447_fixup_non_fq_invocations
- rsa_use_internal_project
- name: GvsIngestTieout
subclass: WDL
primaryDescriptorPath: /scripts/variantstore/wdl/GvsIngestTieout.wdl
Expand Down
10 changes: 6 additions & 4 deletions scripts/variantstore/wdl/GvsAssignIds.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,8 @@ task AssignIds {

Int samples_per_table = 4000
String has_service_account_file = if (defined(service_account_json_path)) then 'true' else 'false'
# add labels for DSP Cloud Cost Control Labeling and Reporting
String bq_labels = "--label service:gvs --label team:variants --label managedby:assign_ids"

command <<<
set -e
Expand Down Expand Up @@ -132,20 +134,20 @@ task AssignIds {
bq load --project_id=~{project_id} ~{dataset_name}.sample_id_assignment_lock $NAMES_FILE "sample_name:STRING"

# add sample_name to sample_info_table
bq --project_id=~{project_id} query --use_legacy_sql=false \
bq --project_id=~{project_id} query --use_legacy_sql=false ~{bq_labels} \
'INSERT into `~{dataset_name}.~{sample_info_table}` (sample_name, is_control) select sample_name, ~{samples_are_controls} from `~{dataset_name}.sample_id_assignment_lock` m where m.sample_name not in (SELECT sample_name FROM `~{dataset_name}.~{sample_info_table}`)'

# get the current maximum id, or 0 if there are none
bq --project_id=~{project_id} query --format=csv --use_legacy_sql=false 'SELECT IFNULL(MAX(sample_id),0) FROM `~{dataset_name}.~{sample_info_table}`' > maxid
bq --project_id=~{project_id} query --format=csv --use_legacy_sql=false ~{bq_labels} 'SELECT IFNULL(MAX(sample_id),0) FROM `~{dataset_name}.~{sample_info_table}`' > maxid
offset=$(tail -1 maxid)

# perform actual id assignment
bq --project_id=~{project_id} query --format=csv --use_legacy_sql=false --parameter=offset:INTEGER:$offset \
bq --project_id=~{project_id} query --format=csv --use_legacy_sql=false ~{bq_labels} --parameter=offset:INTEGER:$offset \
'UPDATE `~{dataset_name}.~{sample_info_table}` m SET m.sample_id = id_assign.id FROM (SELECT sample_name, @offset + ROW_NUMBER() OVER(order by sample_name) as id FROM `~{dataset_name}.~{sample_info_table}` WHERE sample_id IS NULL) id_assign WHERE m.sample_name = id_assign.sample_name;'

# retrieve the list of assigned ids and samples to update the datamodel
echo "entity:sample_id,gvs_id" > update.tsv
bq --project_id=~{project_id} query --format=csv --use_legacy_sql=false -n $num_samples --parameter=offset:INTEGER:$offset \
bq --project_id=~{project_id} query --format=csv --use_legacy_sql=false ~{bq_labels} -n $num_samples --parameter=offset:INTEGER:$offset \
'SELECT sample_name, sample_id from `~{dataset_name}.~{sample_info_table}` WHERE sample_id >= @offset' > update.tsv
cat update.tsv | sed -e 's/sample_id/gvs_id/' -e 's/sample_name/entity:sample_id/' -e 's/,/\t/g' > gvs_ids.tsv

Expand Down
8 changes: 6 additions & 2 deletions scripts/variantstore/wdl/GvsCreateAltAllele.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,8 @@ task GetVetTableNames {
}

String has_service_account_file = if (defined(service_account_json_path)) then 'true' else 'false'
# add labels for DSP Cloud Cost Control Labeling and Reporting
String bq_labels = "--label service:gvs --label team:variants --label managedby:create_alt_allele"

command <<<
set -e
Expand All @@ -77,7 +79,7 @@ task GetVetTableNames {
fi

echo "project_id = ~{project_id}" > ~/.bigqueryrc
bq query --location=US --project_id=~{project_id} --format=csv --use_legacy_sql=false \
bq query --location=US --project_id=~{project_id} --format=csv --use_legacy_sql=false ~{bq_labels} \
'SELECT table_name FROM `~{project_id}.~{dataset_name}.INFORMATION_SCHEMA.TABLES` WHERE table_name LIKE "vet_%" ORDER BY table_name' > vet_tables.csv

# remove the header row from the CSV file
Expand Down Expand Up @@ -109,6 +111,8 @@ task CreateAltAlleleTable {
}

String has_service_account_file = if (defined(service_account_json_path)) then 'true' else 'false'
# add labels for DSP Cloud Cost Control Labeling and Reporting
String bq_labels = "--label service:gvs --label team:variants --label managedby:create_alt_allele"

command <<<
set -e
Expand All @@ -120,7 +124,7 @@ task CreateAltAlleleTable {
fi

echo "project_id = ~{project_id}" > ~/.bigqueryrc
bq query --location=US --project_id=~{project_id} --format=csv --use_legacy_sql=false \
bq query --location=US --project_id=~{project_id} --format=csv --use_legacy_sql=false ~{bq_labels} \
'CREATE OR REPLACE TABLE `~{project_id}.~{dataset_name}.alt_allele` (
location INT64,
sample_id INT64,
Expand Down
8 changes: 6 additions & 2 deletions scripts/variantstore/wdl/GvsExtractCallset.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,8 @@ task ValidateFilterSetName {
}

String has_service_account_file = if (defined(service_account_json_path)) then 'true' else 'false'
# add labels for DSP Cloud Cost Control Labeling and Reporting
String bq_labels = "--label service:gvs --label team:variants --label managedby:extract_callset"

command <<<
set -ex
Expand All @@ -236,7 +238,7 @@ task ValidateFilterSetName {

echo "project_id = ~{query_project}" > ~/.bigqueryrc

OUTPUT=$(bq --location=US --project_id=~{query_project} --format=csv query --use_legacy_sql=false "SELECT filter_set_name as available_filter_set_names FROM \`~{data_project}.~{data_dataset}.filter_set_info\` GROUP BY filter_set_name")
OUTPUT=$(bq --location=US --project_id=~{query_project} --format=csv query --use_legacy_sql=false ~{bq_labels} "SELECT filter_set_name as available_filter_set_names FROM \`~{data_project}.~{data_dataset}.filter_set_info\` GROUP BY filter_set_name")
FILTERSETS=${OUTPUT#"available_filter_set_names"}

if [[ $FILTERSETS =~ "~{filter_set_name}" ]]; then
Expand Down Expand Up @@ -472,6 +474,8 @@ task GenerateSampleListFile {
}

String has_service_account_file = if (defined(service_account_json_path)) then 'true' else 'false'
# add labels for DSP Cloud Cost Control Labeling and Reporting
String bq_labels = "--label service:gvs --label team:variants --label managedby:extract_callset"

command <<<
set -e
Expand All @@ -487,7 +491,7 @@ task GenerateSampleListFile {

echo "project_id = ~{query_project}" > ~/.bigqueryrc

bq --location=US --project_id=~{query_project} --format=csv query --use_legacy_sql=false "SELECT sample_name FROM ~{fq_samples_to_extract_table}" | sed 1d > sample-name-list.txt
bq --location=US --project_id=~{query_project} --format=csv query --use_legacy_sql=false ~{bq_labels} "SELECT sample_name FROM ~{fq_samples_to_extract_table}" | sed 1d > sample-name-list.txt

if [ -n "$OUTPUT_GCS_DIR" ]; then
gsutil cp sample-name-list.txt ${OUTPUT_GCS_DIR}/
Expand Down
10 changes: 7 additions & 3 deletions scripts/variantstore/wdl/GvsImportGenomes.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -250,6 +250,8 @@ task SetIsLoadedColumn {
}

String has_service_account_file = if (defined(service_account_json_path)) then 'true' else 'false'
# add labels for DSP Cloud Cost Control Labeling and Reporting
String bq_labels = "--label service:gvs --label team:variants --label managedby:import_genomes"

command <<<
set -ex
Expand All @@ -263,7 +265,7 @@ task SetIsLoadedColumn {
echo "project_id = ~{project_id}" > ~/.bigqueryrc

# set is_loaded to true if there is a corresponding vet table partition with rows for that sample_id
bq --location=US --project_id=~{project_id} query --format=csv --use_legacy_sql=false \
bq --location=US --project_id=~{project_id} query --format=csv --use_legacy_sql=false ~{bq_labels} \
'UPDATE `~{dataset_name}.sample_info` SET is_loaded = true WHERE sample_id IN (SELECT CAST(partition_id AS INT64) from `~{dataset_name}.INFORMATION_SCHEMA.PARTITIONS` WHERE partition_id NOT LIKE "__%" AND total_logical_bytes > 0 AND table_name LIKE "vet_%") OR sample_id IN (SELECT sample_id FROM `~{dataset_name}.sample_load_status` GROUP BY 1 HAVING COUNT(1) = 2)'
>>>
runtime {
Expand Down Expand Up @@ -296,6 +298,8 @@ task GetUningestedSampleIds {
Int samples_per_table = 4000
String has_service_account_file = if (defined(service_account_json_path)) then 'true' else 'false'
Int num_samples = length(external_sample_names)
# add labels for DSP Cloud Cost Control Labeling and Reporting
String bq_labels = "--label service:gvs --label team:variants --label managedby:import_genomes"

command <<<
set -ex
Expand Down Expand Up @@ -325,7 +329,7 @@ task GetUningestedSampleIds {
bq load --project_id=~{project_id} ${TEMP_TABLE} $NAMES_FILE "sample_name:STRING"

# get the current maximum id, or 0 if there are none
bq --project_id=~{project_id} query --format=csv --use_legacy_sql=false \
bq --project_id=~{project_id} query --format=csv --use_legacy_sql=false ~{bq_labels} \
"SELECT IFNULL(MIN(sample_id),0) as min, IFNULL(MAX(sample_id),0) as max FROM \`~{dataset_name}.~{table_name}\` AS samples JOIN \`${TEMP_TABLE}\` AS temp ON samples.sample_name=temp.sample_name" > results

# prep for being able to return min table id
Expand All @@ -342,7 +346,7 @@ task GetUningestedSampleIds {
python3 -c "from math import ceil; print(ceil($min_sample_id/~{samples_per_table}))" > min_sample_id

# get sample map of samples that haven't been loaded yet
bq --project_id=~{project_id} query --format=csv --use_legacy_sql=false -n ~{num_samples} \
bq --project_id=~{project_id} query --format=csv --use_legacy_sql=false ~{bq_labels} -n ~{num_samples} \
"SELECT sample_id, samples.sample_name FROM \`~{dataset_name}.~{table_name}\` AS samples JOIN \`${TEMP_TABLE}\` AS temp ON samples.sample_name=temp.sample_name WHERE samples.sample_id NOT IN (SELECT sample_id FROM \`~{dataset_name}.sample_load_status\` WHERE status='FINISHED')" > sample_map

cut -d, -f1 sample_map > gvs_ids
Expand Down
6 changes: 6 additions & 0 deletions scripts/variantstore/wdl/GvsUtils.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -287,6 +287,12 @@ task BuildGATKJarAndCreateDataset {

bq mk --project_id="gvs-internal" "$dataset"

# add labels for DSP Cloud Cost Control Labeling and Reporting
bq update --set_label service:gvs gvs-internal:$dataset
bq update --set_label team:variants gvs-internal:$dataset
bq update --set_label env:dev gvs-internal:$dataset
bq update --set_label managedby:build_gatk_jar_and_create_dataset gvs-internal:$dataset

echo -n "$dataset" > dataset.txt
>>>

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -246,6 +246,9 @@ def make_extract_table(control_samples,
if not (bool(re.match(r"[a-z0-9_-]+$", key)) & bool(re.match(r"[a-z0-9_-]+$", value))):
raise ValueError(f"label key or value did not pass validation--format should be 'key1=val1, key2=val2'")

# add labels for DSP Cloud Cost Control Labeling and Reporting
query_labels.update({'service':'gvs','team':'variants','managedby':'prepare_ranges_callset'})

rsasch marked this conversation as resolved.
#Default QueryJobConfig will be merged into job configs passed in
#but if a specific default config is being updated (eg labels), new config must be added
#to the client._default_query_job_config that already exists
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@

def populate_alt_allele_table(query_project, vet_table_name, fq_dataset, sa_key_path):
global client
default_config = QueryJobConfig(priority="INTERACTIVE", use_query_cache=True)
# add labels for DSP Cloud Cost Control Labeling and Reporting to default_config
default_config = QueryJobConfig(priority="INTERACTIVE", use_query_cache=True, labels={'service':'gvs','team':'variants','managedby':'create_alt_allele'})

if sa_key_path:
credentials = service_account.Credentials.from_service_account_file(
Expand Down