updates to ImportGenomes and LoadBigQueryData #7112

Merged: 21 commits, Mar 2, 2021
1 change: 1 addition & 0 deletions .dockstore.yml
@@ -57,6 +57,7 @@ workflows:
branches:
- master
- ah_var_store
- mmt_vcf_inputs
- name: funcotator
subclass: WDL
primaryDescriptorPath: /scripts/funcotator_wdl/funcotator.wdl
9 changes: 4 additions & 5 deletions scripts/variantstore/wdl/ImportGenomes.example.inputs.json
@@ -6,13 +6,12 @@
"ImportGenomes.drop_state": "SIXTY",
"ImportGenomes.gatk_override": "gs://broad-dsp-spec-ops/kcibul/gatk-package-4.1.8.1-153-g9c3b338-SNAPSHOT-local.jar",

"ImportGenomes.input_vcfs": "this.samples.hg38_reblocked_gvcf",

"ImportGenomes.project_id": "spec-ops-aou",
"ImportGenomes.dataset_name": "kc_import_test1",

"ImportGenomes.sample_map": "gs://fc-50fc04ca-572b-4cba-82d5-4af10722cdc7/test_sample_map.csv",

"ImportGenomes.input_vcfs_list": "gs://fc-50fc04ca-572b-4cba-82d5-4af10722cdc7/input_vcfs.txt",

"ImportGenomes.output_directory": "gs://fc-50fc04ca-572b-4cba-82d5-4af10722cdc7/testrun2_importdir"
"ImportGenomes.output_directory": "gs://fc-50fc04ca-572b-4cba-82d5-4af10722cdc7/testrun2_importdir",
"ImportGenomes.sample_map": "gs://fc-50fc04ca-572b-4cba-82d5-4af10722cdc7/test_sample_map.csv"
}

207 changes: 189 additions & 18 deletions scripts/variantstore/wdl/ImportGenomes.wdl
@@ -1,11 +1,9 @@
version 1.0

import "https://api.firecloud.org/ga4gh/v1/tools/synthetic-microarray-gen:LoadBigQueryData/versions/7/plain-WDL/descriptor" as LoadBigQueryData

workflow ImportGenomes {

input {
File input_vcfs_list
Array[File] input_vcfs
Array[File]? input_metrics
File interval_list
String output_directory
@@ -24,13 +22,13 @@ workflow ImportGenomes {
}

String docker_final = select_first([docker, "us.gcr.io/broad-gatk/gatk:4.1.7.0"])
Array[String] input_vcfs = read_lines(input_vcfs_list)

call GetMaxTableId {
input:
sample_map = sample_map
}

# create the pet, vet, and metadata TSVs to be imported to BQ
scatter (i in range(length(input_vcfs))) {
if (defined(input_metrics)) {
File input_metric = select_first([input_metrics])[i]
@@ -51,46 +49,94 @@
}
}

call LoadBigQueryData.LoadBigQueryData as LoadMetadataTsvs {
input:
done = CreateImportTsvs.done,
# CreateTables requires GetMaxTableId and CreateImportTsvs to have completed
call CreateTables as CreateMetadataTables {
input:
tsv_creation_done = CreateImportTsvs.done,
project_id = project_id,
dataset_name = dataset_name,
storage_location = output_directory,
datatype = "metadata",
numbered = "false",
partitioned = "false",
max_table_id = GetMaxTableId.max_table_id,
schema = metadata_schema,
numbered = "false",
partitioned = "false",
uuid = "",
[Review thread on `uuid = ""`]
mmorgantaylor (author): @kcibul @ahaessly should we include uuid as an optional input to the WDL? Or rip this option (to prepend tables with a uuid) out altogether? This is currently just hardcoded to nothing.
Contributor: I haven't used the UUID piece before. I think it was from earlier testing, but now I would just create a new dataset instead of tables with a prefix. Remove it? (@ahaessly wdyt?)
Contributor: This was definitely used for automated integration testing. I think Megan added it. If we wanted to add a uuid to the dataset, I think we would need to create that dataset outside of this wdl. But we should be able to do that in the test itself. Assuming we are not running that integration test, I would say let's go ahead and remove it.
mmorgantaylor (author): ok, let's keep it for now until we decide what we're doing with integration testing.

preemptible_tries = preemptible_tries,
docker = docker_final
}

call LoadBigQueryData.LoadBigQueryData as LoadPetTsvs {
input:
done = CreateImportTsvs.done,
call CreateTables as CreatePetTables {
input:
tsv_creation_done = CreateImportTsvs.done,
project_id = project_id,
dataset_name = dataset_name,
storage_location = output_directory,
datatype = "pet",
max_table_id = GetMaxTableId.max_table_id,
schema = pet_schema,
numbered = "false",
partitioned = "false",
uuid = "",
preemptible_tries = preemptible_tries,
docker = docker_final
}

call LoadBigQueryData.LoadBigQueryData as LoadVetTsvs {
input:
done = CreateImportTsvs.done,
call CreateTables as CreateVetTables {
input:
tsv_creation_done = CreateImportTsvs.done,
project_id = project_id,
dataset_name = dataset_name,
storage_location = output_directory,
datatype = "vet",
max_table_id = GetMaxTableId.max_table_id,
schema = vet_schema,
numbered = "false",
partitioned = "false",
uuid = "",
preemptible_tries = preemptible_tries,
docker = docker_final
}

# LoadTable requires CreateTables and CreateImportTsvs to be completed
scatter (table_dir_files_str in CreateMetadataTables.table_dir_files_list) {
call LoadTable as LoadMetadataTable {
input:
tsv_creation_done = CreateImportTsvs.done,
table_dir_files_str = table_dir_files_str,
project_id = project_id,
schema = metadata_schema,
load = "true",
preemptible_tries = preemptible_tries,
docker = docker_final
}
}

scatter (table_dir_files_str in CreatePetTables.table_dir_files_list) {
call LoadTable as LoadPetTable {
input:
tsv_creation_done = CreateImportTsvs.done,
table_dir_files_str = table_dir_files_str,
project_id = project_id,
schema = pet_schema,
load = "true",
preemptible_tries = preemptible_tries,
docker = docker_final
}
}

scatter (table_dir_files_str in CreateVetTables.table_dir_files_list) {
call LoadTable as LoadVetTable {
input:
tsv_creation_done = CreateImportTsvs.done,
table_dir_files_str = table_dir_files_str,
project_id = project_id,
schema = vet_schema,
load = "true",
preemptible_tries = preemptible_tries,
docker = docker_final
}
}
}

task GetMaxTableId {
@@ -133,8 +179,6 @@ task CreateImportTsvs {
Int? preemptible_tries
File? gatk_override
String docker

String? for_testing_only
}

Int multiplier = if defined(drop_state) then 4 else 10
@@ -157,7 +201,6 @@
export TMPDIR=/tmp

export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk_override}
~{for_testing_only}

gatk --java-options "-Xmx7000m" CreateVariantIngestFiles \
-V ~{input_vcf} \
@@ -185,3 +228,131 @@ task CreateImportTsvs {
}
}

# Creates all the tables necessary for the LoadData operation
# As an optimization, I also generate a (table, dir, files) csv file which contains
# most of the inputs necessary for the following LoadTable task.
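# e.g. one row might look like (hypothetical values): my_dataset.pet,gs://my-bucket/pet_tsvs/,pet_001_*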
task CreateTables {
meta {
volatile: true
}

input {
String project_id
String dataset_name
String storage_location
[Review thread on `String storage_location`]
Contributor: I don't think this is used anymore?
mmorgantaylor (author): nice catch!

String datatype
Int max_table_id
File schema
String numbered
String partitioned
String uuid
Array[String] tsv_creation_done
[Review thread on `Array[String] tsv_creation_done`]
Contributor: what does this do?
mmorgantaylor (author): by requiring this as an input, this ensures that CreateImportTsvs runs before CreateTables can start. (Removed here but added to LoadTable.)
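The pattern described above, as a minimal self-contained WDL sketch (the task names, outputs, and docker image are illustrative, not part of this PR): the downstream call consumes the upstream calls' output array purely to create a dependency edge, so it is scheduled only after every upstream shard has finished.

version 1.0

workflow OrderedSteps {
    scatter (i in range(3)) {
        call StepA { input: idx = i }
    }
    # StepB has no real data dependency on StepA, but consuming StepA.done
    # forces it to wait for all StepA shards.
    call StepB { input: upstream_done = StepA.done }
}

task StepA {
    input {
        Int idx
    }
    command <<<
        echo "shard ~{idx}"
    >>>
    output {
        String done = "true"
    }
    runtime {
        docker: "ubuntu:20.04"
    }
}

task StepB {
    input {
        Array[String] upstream_done  # unused except as a dependency edge
    }
    command <<<
        echo "all upstream shards finished"
    >>>
    runtime {
        docker: "ubuntu:20.04"
    }
}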


# runtime
Int? preemptible_tries
String docker
}

command <<<
set -x
set -e

DIR="~{storage_location}/~{datatype}_tsvs/"

for TABLE_ID in $(seq 1 ~{max_table_id}); do
PARTITION_STRING=""
if [ ~{partitioned} == "true" ]; then
let "PARTITION_START=(${TABLE_ID}-1)*4000+1"
let "PARTITION_END=$PARTITION_START+3999"
let "PARTITION_STEP=1"
PARTITION_FIELD="sample_id"
PARTITION_STRING="--range_partitioning=$PARTITION_FIELD,$PARTITION_START,$PARTITION_END,$PARTITION_STEP"
fi

printf -v PADDED_TABLE_ID "_%03d" ${TABLE_ID}
FILES="~{datatype}${PADDED_TABLE_ID}_*"

NUM_FILES=$(gsutil ls $DIR$FILES | wc -l)

# create the table
PREFIX=""
if [ -n "~{uuid}" ]; then
PREFIX="~{uuid}_"
fi

if [ $NUM_FILES -gt 0 ]; then
if [ ~{numbered} != "true" ]; then
PADDED_TABLE_ID="" # override table id to an empty string (it was already used above to build the FILES glob)
fi

TABLE="~{dataset_name}.${PREFIX}~{datatype}${PADDED_TABLE_ID}"

# Check that the table has not been created yet
set +e
bq show --project_id ~{project_id} $TABLE > /dev/null
BQ_SHOW_RC=$?
set -e
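# bq show exits non-zero when the table does not exist yet, so only create it on a non-zero RC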
if [ $BQ_SHOW_RC -ne 0 ]; then
echo "making table $TABLE"
bq --location=US mk ${PARTITION_STRING} --project_id=~{project_id} $TABLE ~{schema}
fi

echo "$TABLE,$DIR,$FILES" >> table_dir_files.csv
else
echo "no ${FILES} files to process"
fi
done
>>>

output {
Array[String] table_dir_files_list = read_lines("table_dir_files.csv")
}

runtime {
docker: docker
memory: "3 GB"
disks: "local-disk 10 HDD"
preemptible: select_first([preemptible_tries, 5])
cpu: 1
}
}

task LoadTable {
meta {
volatile: true
}

input {
String table_dir_files_str
String project_id
File schema
String load
Array[String] tsv_creation_done

Int? preemptible_tries
String docker
}

command <<<
TABLE=$(echo ~{table_dir_files_str} | cut -d, -f1)
DIR=$(echo ~{table_dir_files_str} | cut -d, -f2)
FILES=$(echo ~{table_dir_files_str} | cut -d, -f3)
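# table_dir_files_str is one row of the csv written by CreateTables, e.g. (hypothetical values): my_dataset.pet,gs://my-bucket/pet_tsvs/,pet_001_*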

# load should be false if using Google Storage Transfer so that the tables will be created by this script, but no data will be uploaded.
if [ ~{load} = true ]; then
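# source_format=CSV with -F "\t" ingests the tab-separated files written by CreateImportTsvs; --skip_leading_rows=1 skips each file's header row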
bq load --location=US --project_id=~{project_id} --skip_leading_rows=1 --source_format=CSV -F "\t" $TABLE $DIR$FILES ~{schema} || exit 1
echo "ingested ${FILES} file from $DIR into table $TABLE"
gsutil mv $DIR$FILES ${DIR}done/
else
echo "${FILES} will be ingested from $DIR by Google Storage Transfer"
fi
>>>

runtime {
docker: docker
memory: "3 GB"
disks: "local-disk 10 HDD"
preemptible: select_first([preemptible_tries, 5])
cpu: 1
}
}