broadinstitute · rsasch · Jun 3, 2022 · May 18, 2022 · May 18, 2022 · May 18, 2022
diff --git a/scripts/variantstore/wdl/GvsImportGenomes.wdl b/scripts/variantstore/wdl/GvsImportGenomes.wdl
@@ -36,7 +36,7 @@ workflow GvsImportGenomes {
     }
   }
 
-  call GetSampleIds {
+  call GetUningestedSampleIds {
     input:
       dataset_name = dataset_name,
       project_id = project_id,
@@ -45,28 +45,30 @@ workflow GvsImportGenomes {
       service_account_json_path = service_account_json_path
   }
 
-  call CheckForDuplicateData {
+  call CurateInputLists {
     input:
       dataset_name = dataset_name,
       project_id = project_id,
-      sample_names = external_sample_names,
+      input_vcf_index_list = write_lines(input_vcf_indexes),
+      input_vcf_list = write_lines(input_vcfs),
+      input_sample_name_list = write_lines(external_sample_names),
+      input_samples_to_be_loaded_map = GetUningestedSampleIds.sample_map,
       service_account_json_path = service_account_json_path
   }
 
   call CreateFOFNs {
     input:
       batch_size = load_data_batch_size,
-      input_vcf_index_list = write_lines(input_vcf_indexes),
-      input_vcf_list = write_lines(input_vcfs),
-      sample_name_list = write_lines(external_sample_names),
+      input_vcf_index_list = CurateInputLists.input_vcf_indexes,
+      input_vcf_list = CurateInputLists.input_vcfs,
+      sample_name_list = CurateInputLists.sample_name_list,
   }
 
   scatter (i in range(length(CreateFOFNs.vcf_batch_vcf_fofns))) {
     call LoadData {
       input:
         dataset_name = dataset_name,
         project_id = project_id,
-        duplicate_check_passed = CheckForDuplicateData.done,
         skip_loading_vqsr_fields = skip_loading_vqsr_fields,
         drop_state = "FORTY",
         drop_state_includes_greater_than = false,
@@ -77,7 +79,7 @@ workflow GvsImportGenomes {
         load_data_preemptible_override = load_data_preemptible_override,
         load_data_maxretries_override = load_data_maxretries_override,
         sample_names = read_lines(CreateFOFNs.vcf_sample_name_fofns[i]),
-        sample_map = GetSampleIds.sample_map,
+        sample_map = GetUningestedSampleIds.sample_map,
         service_account_json_path = service_account_json_path,
     }
   }
@@ -96,78 +98,6 @@ workflow GvsImportGenomes {
   }
 }
 
-task CheckForDuplicateData {
-  input {
-    String dataset_name
-    String project_id
-
-    Array[String] sample_names
-
-    String? service_account_json_path
-  }
-
-  String has_service_account_file = if (defined(service_account_json_path)) then 'true' else 'false'
-  Int num_samples = length(sample_names)
-
-  meta {
-    volatile: true
-  }
-
-  command <<<
-    set -e
-
-    if [ ~{has_service_account_file} = 'true' ]; then
-      gsutil cp ~{service_account_json_path} local.service_account.json
-      gcloud auth activate-service-account --key-file=local.service_account.json
-      gcloud config set project ~{project_id}
-    fi
-
-    echo "project_id = ~{project_id}" > ~/.bigqueryrc
-
-    INFO_SCHEMA_TABLE="~{dataset_name}.INFORMATION_SCHEMA.PARTITIONS"
-    TEMP_TABLE="~{dataset_name}.sample_dupe_check"
-    SAMPLE_INFO_TABLE="~{dataset_name}.sample_info"
-
-    # create a temp table with the sample_names
-    bq --project_id=~{project_id} mk ${TEMP_TABLE} "sample_name:STRING"
-    NAMES_FILE=~{write_lines(sample_names)}
-    bq load --project_id=~{project_id} ${TEMP_TABLE} $NAMES_FILE "sample_name:STRING"
-
-    # check the INFORMATION_SCHEMA.PARTITIONS table to see if any of input sample names/ids have data loaded into their partitions
-    # this returns the list of sample names that do already have data loaded
-    echo "WITH items as (SELECT s.sample_id, s.sample_name, s.is_loaded, s.withdrawn FROM \`${TEMP_TABLE}\` t left outer join \`${SAMPLE_INFO_TABLE}\` s on (s.sample_name = t.sample_name)) " >> query.sql
-    echo "SELECT i.sample_name FROM \`${INFO_SCHEMA_TABLE}\` p JOIN items i ON (p.partition_id = CAST(i.sample_id AS STRING)) WHERE p.total_logical_bytes > 0 AND (table_name like 'ref_ranges_%' OR table_name like 'vet_%')" >> query.sql
-    echo "UNION DISTINCT "  >> query.sql
-    echo "SELECT i.sample_name FROM items i WHERE i.is_loaded = True "  >> query.sql
-    echo "UNION DISTINCT "  >> query.sql
-    echo "SELECT i.sample_name FROM items i WHERE i.sample_id IN (SELECT sample_id FROM \`~{dataset_name}.sample_load_status\`) "  >> query.sql
-
-
-    cat query.sql | bq --location=US --project_id=~{project_id} query --format=csv -n ~{num_samples} --use_legacy_sql=false | sed -e '/sample_name/d' > duplicates
-
-      # remove the temp table
-      bq --project_id=~{project_id} rm -f -t ${TEMP_TABLE}
-
-    # true if there is data in results
-    if [ -s duplicates ]; then
-      echo "ERROR: Trying to load samples that have already been loaded"
-      cat duplicates
-      exit 1
-    fi
-  >>>
-  runtime {
-    docker: "gcr.io/google.com/cloudsdktool/cloud-sdk:305.0.0"
-    memory: "1 GB"
-    disks: "local-disk 10 HDD"
-    preemptible: 5
-    cpu: 1
-  }
-  output {
-    Boolean done = true
-    File? duplicates = "duplicates"
-  }
-}
-
 task CreateFOFNs {
   input {
     Int batch_size
@@ -204,7 +134,6 @@ task LoadData {
     String dataset_name
     String project_id
 
-    Boolean duplicate_check_passed
     Array[File] input_vcf_indexes
     Array[File] input_vcfs
     File interval_list
@@ -304,8 +233,6 @@ task LoadData {
   }
 }
 
-
-
 task SetIsLoadedColumn {
   meta {
     volatile: true
@@ -349,7 +276,7 @@ task SetIsLoadedColumn {
   }
 }
 
-task GetSampleIds {
+task GetUningestedSampleIds {
   meta {
     volatile: true
   }
@@ -378,20 +305,19 @@ task GetSampleIds {
     echo "project_id = ~{project_id}" > ~/.bigqueryrc
 
     # create temp table with the sample_names and load external sample names into temp table -- make sure it doesn't exist already
-     set +e
-     TEMP_TABLE="~{dataset_name}.sample_names_to_load"
-     bq show --project_id ~{project_id} ${TEMP_TABLE} > /dev/null
-     BQ_SHOW_RC=$?
-     set -e
-
-     # if there is already a table of sample names or something else is wrong, bail
-     if [ $BQ_SHOW_RC -eq 0 ]; then
-       echo "There is already a list of sample names. This may need manual cleanup. Exiting"
-       exit 1
-     fi
+    set +e
+    TEMP_TABLE="~{dataset_name}.sample_names_to_load"
+    bq show --project_id ~{project_id} ${TEMP_TABLE} > /dev/null
+    BQ_SHOW_RC=$?
+    set -e
+
+    # if there is already a table of sample names or something else is wrong, bail
+    if [ $BQ_SHOW_RC -eq 0 ]; then
+      echo "There is already a list of sample names. This may need manual cleanup. Exiting"
+      exit 1
+    fi
 
     echo "Creating the external sample name list table ${TEMP_TABLE}"
-    TEMP_TABLE="~{dataset_name}.sample_names_to_load"
     bq --project_id=~{project_id} mk ${TEMP_TABLE} "sample_name:STRING"
     NAMES_FILE=~{write_lines(external_sample_names)}
     bq load --project_id=~{project_id} ${TEMP_TABLE} $NAMES_FILE "sample_name:STRING"
@@ -413,8 +339,9 @@ task GetSampleIds {
     python3 -c "from math import ceil; print(ceil($max_sample_id/~{samples_per_table}))" > max_sample_id
     python3 -c "from math import ceil; print(ceil($min_sample_id/~{samples_per_table}))" > min_sample_id
 
+    # get sample map of samples that haven't been loaded yet
     bq --project_id=~{project_id} query --format=csv --use_legacy_sql=false -n ~{num_samples} \
-      "SELECT sample_id, samples.sample_name FROM \`~{dataset_name}.~{table_name}\` AS samples JOIN \`${TEMP_TABLE}\` AS temp ON samples.sample_name=temp.sample_name" > sample_map
+      "SELECT sample_id, samples.sample_name FROM \`~{dataset_name}.~{table_name}\` AS samples JOIN \`${TEMP_TABLE}\` AS temp ON samples.sample_name=temp.sample_name WHERE samples.sample_id NOT IN (SELECT sample_id FROM \`~{dataset_name}.sample_load_status\` WHERE status='FINISHED')" > sample_map
 
     cut -d, -f1 sample_map > gvs_ids
 
@@ -435,3 +362,45 @@ task GetSampleIds {
     File gvs_ids = "gvs_ids"
   }
 }
+
+task CurateInputLists {  # Runs curate_input_array_files.py and exposes the three curated list files it writes
+  input {
+    String dataset_name  # NOTE(review): not referenced in the command below
+    String project_id  # NOTE(review): not referenced in the command below
+    File input_vcf_index_list  # one VCF index path per line (the workflow passes write_lines(input_vcf_indexes))
+    File input_vcf_list  # one VCF path per line (the workflow passes write_lines(input_vcfs))
+    File input_samples_to_be_loaded_map  # the workflow passes GetUningestedSampleIds.sample_map
+    File input_sample_name_list  # one sample name per line (the workflow passes write_lines(external_sample_names))
+    # optional path of a service account key; when defined it is copied with gsutil and activated
+    String? service_account_json_path
+  }
+  # rendered as the string 'true' or 'false' so the shell test in the command can compare it
+  String has_service_account_file = if (defined(service_account_json_path)) then 'true' else 'false'
+  command <<<
+    set -ex  # stop at the first failing command and echo every command that runs
+    if [ ~{has_service_account_file} = 'true' ]; then
+      gsutil cp ~{service_account_json_path} local.service_account.json
+      gcloud auth activate-service-account --key-file=local.service_account.json
+    fi
+    # the script writes its output_* files with relative names, i.e. into the working directory
+    python3 /app/curate_input_array_files.py --sample_map_to_be_loaded_file_name ~{input_samples_to_be_loaded_map} \
+                                             --sample_name_list_file_name ~{input_sample_name_list} \
+                                             --vcf_list_file_name ~{input_vcf_list} \
+                                             --vcf_index_list_file_name  ~{input_vcf_index_list} \
+                                             --output_files True
+  >>>
+  runtime {
+    docker: "us.gcr.io/broad-dsde-methods/variantstore:rsa_skip_samples_20220602"  # image has to contain /app/curate_input_array_files.py
+    memory: "3 GB"
+    disks: "local-disk 100 HDD"
+    bootDiskSizeGb: 15
+    preemptible: 3
+    cpu: 1
+  }
+  # the file names below are the ones curate_input_array_files.py writes when --output_files is true
+  output {
+    File input_vcf_indexes = "output_vcf_index_list_file"
+    File input_vcfs = "output_vcf_list_file"
+    File sample_name_list = "output_sample_name_list_file"
+  }
+}
diff --git a/scripts/variantstore/wdl/extract/Dockerfile b/scripts/variantstore/wdl/extract/Dockerfile
@@ -14,6 +14,7 @@ COPY alt_allele_positions.sql /app
 COPY alt_allele_temp_function.sql /app
 COPY utils.py /app
 COPY add_max_as_vqslod.py /app
+COPY curate_input_array_files.py /app
 COPY scale_xy_bed_values.py /app
 
 WORKDIR /app
diff --git a/scripts/variantstore/wdl/extract/curate_input_array_files.py b/scripts/variantstore/wdl/extract/curate_input_array_files.py
@@ -0,0 +1,110 @@
+# -*- coding: utf-8 -*-
+"""Curate the GvsImportGenomes input lists, keeping only samples that still need loading."""
+import numpy as np
+from contextlib import contextmanager
+import argparse
+
+SAMPLE_MAP_TO_BE_LOADED_FILE_SUFFIX = "samples_to_be_loaded_map_file"
+SAMPLE_NAME_FILE_SUFFIX = "sample_name_list_file"
+VCF_FILE_SUFFIX = "vcf_list_file"
+VCF_INDEX_FILE_SUFFIX = "vcf_index_list_file"
+
+
+@contextmanager
+def handle_file_error(file_name):
+    """Name the input file that could not be opened, then re-raise the error.
+
+    Every input is required, so the failure must stop the script: continuing
+    with empty lists would write empty output files that look like a valid result.
+    """
+    try:
+        yield
+    except OSError:
+        print(f"ERROR: required file named '{file_name}' does not exist.")
+        raise
+
+
+def str_to_bool(value):
+    """Parse a command-line boolean; argparse's type=bool treats any non-empty string as True."""
+    normalized = str(value).strip().lower()
+    if normalized in ("true", "t", "yes", "y", "1"):
+        return True
+    if normalized in ("false", "f", "no", "n", "0"):
+        return False
+    raise argparse.ArgumentTypeError(f"expected a boolean value, got '{value}'")
+
+
+def curate_input_arrays(sample_map_to_be_loaded_file_name,
+                        sample_name_list_file_name,
+                        vcf_list_file_name,
+                        vcf_index_list_file_name,
+                        output_files):
+    """Drop every input row whose sample name is not in the to-be-loaded sample map.
+
+    The sample name, VCF and VCF index files are parallel lists with one entry per
+    line. The sample map is comma-separated with the sample name in its second column.
+
+    :param sample_map_to_be_loaded_file_name: CSV of sample_id,sample_name rows to load
+    :param sample_name_list_file_name: file with one sample name per line
+    :param vcf_list_file_name: file with one VCF path per line
+    :param vcf_index_list_file_name: file with one VCF index path per line
+    :param output_files: truthy to write 'output_*' files, falsy to return the lists
+    :return: None when files are written, otherwise a dict with the keys
+             'sample_names_array', 'vcf_array' and 'vcf_indexes_array'
+    :raises OSError: if an input file cannot be opened
+    """
+    # ndmin keeps one-line files as real arrays; without it loadtxt returns a 0-d
+    # array and len() below raises TypeError when a single sample is supplied
+    with handle_file_error(sample_map_to_be_loaded_file_name):
+        sample_map_to_be_loaded_array = np.loadtxt(sample_map_to_be_loaded_file_name, dtype=str, delimiter=",", ndmin=2)
+    with handle_file_error(vcf_list_file_name):
+        vcf_array = np.loadtxt(vcf_list_file_name, dtype=str, ndmin=1)
+    with handle_file_error(vcf_index_list_file_name):
+        vcf_indexes_array = np.loadtxt(vcf_index_list_file_name, dtype=str, ndmin=1)
+    with handle_file_error(sample_name_list_file_name):
+        sample_names_array = np.loadtxt(sample_name_list_file_name, dtype=str, ndmin=1)
+
+    # compare against the sample_name column only, so a sample_id value can never
+    # be mistaken for a sample name; an empty map means nothing is left to load
+    if sample_map_to_be_loaded_array.size:
+        names_to_load = set(sample_map_to_be_loaded_array[:, 1])
+    else:
+        names_to_load = set()
+
+    # use sample_names_array to figure out which index "rows" to delete
+    rows_to_delete = {i for i, name in enumerate(sample_names_array) if name not in names_to_load}
+
+    # re-create input arrays without the "rows" to delete
+    vcf_array = [entry for i, entry in enumerate(vcf_array) if i not in rows_to_delete]
+    vcf_indexes_array = [entry for i, entry in enumerate(vcf_indexes_array) if i not in rows_to_delete]
+    sample_names_array = [entry for i, entry in enumerate(sample_names_array) if i not in rows_to_delete]
+
+    if output_files:
+        print(f"Creating 'output_{SAMPLE_NAME_FILE_SUFFIX}', 'output_{VCF_FILE_SUFFIX}' and 'output_{VCF_INDEX_FILE_SUFFIX}'.")
+        np.savetxt(f"output_{SAMPLE_NAME_FILE_SUFFIX}", sample_names_array, fmt='%s')
+        np.savetxt(f"output_{VCF_FILE_SUFFIX}", vcf_array, fmt='%s')
+        np.savetxt(f"output_{VCF_INDEX_FILE_SUFFIX}", vcf_indexes_array, fmt='%s')
+        return None
+
+    return {
+        'sample_names_array': sample_names_array,
+        'vcf_array': vcf_array,
+        'vcf_indexes_array': vcf_indexes_array,
+    }
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(allow_abbrev=False, description='Curate GvsImportGenomes arrays to remove duplicate samples')
+
+    parser.add_argument('--sample_map_to_be_loaded_file_name',type=str, help='name of sample_map file', required=False, default=f"input_{SAMPLE_MAP_TO_BE_LOADED_FILE_SUFFIX}")
+    parser.add_argument('--sample_name_list_file_name',type=str, help='name of sample name list file', required=False, default=f"input_{SAMPLE_NAME_FILE_SUFFIX}")
+    parser.add_argument('--vcf_list_file_name',type=str, help='name of VCF list file', required=False, default=f"input_{VCF_FILE_SUFFIX}")
+    parser.add_argument('--vcf_index_list_file_name',type=str, help='name of VCF index list file', required=False, default=f"input_{VCF_INDEX_FILE_SUFFIX}")
+    parser.add_argument('--output_files',type=str_to_bool, help='true (default): outputs are files; false: outputs are arrays', required=False, default=True)
+    args = parser.parse_args()
+
+    curate_input_arrays(args.sample_map_to_be_loaded_file_name,
+                        args.sample_name_list_file_name,
+                        args.vcf_list_file_name,
+                        args.vcf_index_list_file_name,
+                        args.output_files)
diff --git a/scripts/variantstore/wdl/extract/curate_input_array_test_files/input_sample_name_list_file b/scripts/variantstore/wdl/extract/curate_input_array_test_files/input_sample_name_list_file
@@ -0,0 +1,10 @@
+ERS4367795
+ERS4367796
+ERS4367797
+ERS4367798
+ERS4367799
+ERS4367800
+ERS4367801
+ERS4367803
+ERS4367804
+ERS4367805
diff --git a/...ariantstore/wdl/extract/curate_input_array_test_files/input_samples_to_be_loaded_map_file b/...ariantstore/wdl/extract/curate_input_array_test_files/input_samples_to_be_loaded_map_file
@@ -0,0 +1,8 @@
+sample_id,sample_name
+9,ERS4367804
+7,ERS4367801
+4,ERS4367798
+6,ERS4367800
+10,ERS4367805
+2,ERS4367796
+1,ERS4367795
diff --git a/scripts/variantstore/wdl/extract/curate_input_array_test_files/input_vcf_index_list_file b/scripts/variantstore/wdl/extract/curate_input_array_test_files/input_vcf_index_list_file
@@ -0,0 +1,10 @@
+gs://fc-e2f6ffa2-4033-4517-98fc-889bee4cc7a6/5e6b194b-5f69-40f2-a6de-f4f3f80ce05a/ReblockGVCF/702cdbf7-0666-4ee5-b889-91ba0ffa90bd/call-Reblock/HG00405.haplotypeCalls.er.raw.vcf.gz.rb.g.vcf.gz.tbi
+gs://fc-e2f6ffa2-4033-4517-98fc-889bee4cc7a6/5e6b194b-5f69-40f2-a6de-f4f3f80ce05a/ReblockGVCF/59ee6ce9-b8d0-4e39-8cc7-8908c6daf87c/call-Reblock/HG00408.haplotypeCalls.er.raw.vcf.gz.rb.g.vcf.gz.tbi
+gs://fc-e2f6ffa2-4033-4517-98fc-889bee4cc7a6/5e6b194b-5f69-40f2-a6de-f4f3f80ce05a/ReblockGVCF/22604e48-c97b-4709-bb0c-aeeef2891177/call-Reblock/HG00418.haplotypeCalls.er.raw.vcf.gz.rb.g.vcf.gz.tbi
+gs://fc-e2f6ffa2-4033-4517-98fc-889bee4cc7a6/5e6b194b-5f69-40f2-a6de-f4f3f80ce05a/ReblockGVCF/16c7c3a2-aa82-42ca-904f-8f0eebc21507/call-Reblock/attempt-2/HG00420.haplotypeCalls.er.raw.vcf.gz.rb.g.vcf.gz.tbi
+gs://fc-e2f6ffa2-4033-4517-98fc-889bee4cc7a6/5e6b194b-5f69-40f2-a6de-f4f3f80ce05a/ReblockGVCF/ee0fbdc6-4d59-4733-a40b-e3fd51b8daea/call-Reblock/HG00423.haplotypeCalls.er.raw.vcf.gz.rb.g.vcf.gz.tbi
+gs://fc-e2f6ffa2-4033-4517-98fc-889bee4cc7a6/5e6b194b-5f69-40f2-a6de-f4f3f80ce05a/ReblockGVCF/1acc4d0b-812d-4539-9a3b-841c3413d057/call-Reblock/HG00427.haplotypeCalls.er.raw.vcf.gz.rb.g.vcf.gz.tbi
+gs://fc-e2f6ffa2-4033-4517-98fc-889bee4cc7a6/5e6b194b-5f69-40f2-a6de-f4f3f80ce05a/ReblockGVCF/47091691-f665-4a2c-bc6a-b4b5d57fa222/call-Reblock/HG00429.haplotypeCalls.er.raw.vcf.gz.rb.g.vcf.gz.tbi
+gs://fc-e2f6ffa2-4033-4517-98fc-889bee4cc7a6/5e6b194b-5f69-40f2-a6de-f4f3f80ce05a/ReblockGVCF/26523d6c-5bae-4486-915d-f4ee3a969420/call-Reblock/HG00444.haplotypeCalls.er.raw.vcf.gz.rb.g.vcf.gz.tbi
+gs://fc-e2f6ffa2-4033-4517-98fc-889bee4cc7a6/5e6b194b-5f69-40f2-a6de-f4f3f80ce05a/ReblockGVCF/117697bf-fe61-4c18-85c3-162c706c9037/call-Reblock/attempt-2/HG00447.haplotypeCalls.er.raw.vcf.gz.rb.g.vcf.gz.tbi
+gs://fc-e2f6ffa2-4033-4517-98fc-889bee4cc7a6/5e6b194b-5f69-40f2-a6de-f4f3f80ce05a/ReblockGVCF/d5dbd8dc-bbd1-484a-b4ea-94c02ed896d0/call-Reblock/HG00450.haplotypeCalls.er.raw.vcf.gz.rb.g.vcf.gz.tbi
diff --git a/scripts/variantstore/wdl/extract/curate_input_array_test_files/input_vcf_list_file b/scripts/variantstore/wdl/extract/curate_input_array_test_files/input_vcf_list_file
@@ -0,0 +1,10 @@
+gs://fc-e2f6ffa2-4033-4517-98fc-889bee4cc7a6/5e6b194b-5f69-40f2-a6de-f4f3f80ce05a/ReblockGVCF/702cdbf7-0666-4ee5-b889-91ba0ffa90bd/call-Reblock/HG00405.haplotypeCalls.er.raw.vcf.gz.rb.g.vcf.gz
+gs://fc-e2f6ffa2-4033-4517-98fc-889bee4cc7a6/5e6b194b-5f69-40f2-a6de-f4f3f80ce05a/ReblockGVCF/59ee6ce9-b8d0-4e39-8cc7-8908c6daf87c/call-Reblock/HG00408.haplotypeCalls.er.raw.vcf.gz.rb.g.vcf.gz
+gs://fc-e2f6ffa2-4033-4517-98fc-889bee4cc7a6/5e6b194b-5f69-40f2-a6de-f4f3f80ce05a/ReblockGVCF/22604e48-c97b-4709-bb0c-aeeef2891177/call-Reblock/HG00418.haplotypeCalls.er.raw.vcf.gz.rb.g.vcf.gz
+gs://fc-e2f6ffa2-4033-4517-98fc-889bee4cc7a6/5e6b194b-5f69-40f2-a6de-f4f3f80ce05a/ReblockGVCF/16c7c3a2-aa82-42ca-904f-8f0eebc21507/call-Reblock/attempt-2/HG00420.haplotypeCalls.er.raw.vcf.gz.rb.g.vcf.gz
+gs://fc-e2f6ffa2-4033-4517-98fc-889bee4cc7a6/5e6b194b-5f69-40f2-a6de-f4f3f80ce05a/ReblockGVCF/ee0fbdc6-4d59-4733-a40b-e3fd51b8daea/call-Reblock/HG00423.haplotypeCalls.er.raw.vcf.gz.rb.g.vcf.gz
+gs://fc-e2f6ffa2-4033-4517-98fc-889bee4cc7a6/5e6b194b-5f69-40f2-a6de-f4f3f80ce05a/ReblockGVCF/1acc4d0b-812d-4539-9a3b-841c3413d057/call-Reblock/HG00427.haplotypeCalls.er.raw.vcf.gz.rb.g.vcf.gz
+gs://fc-e2f6ffa2-4033-4517-98fc-889bee4cc7a6/5e6b194b-5f69-40f2-a6de-f4f3f80ce05a/ReblockGVCF/47091691-f665-4a2c-bc6a-b4b5d57fa222/call-Reblock/HG00429.haplotypeCalls.er.raw.vcf.gz.rb.g.vcf.gz
+gs://fc-e2f6ffa2-4033-4517-98fc-889bee4cc7a6/5e6b194b-5f69-40f2-a6de-f4f3f80ce05a/ReblockGVCF/26523d6c-5bae-4486-915d-f4ee3a969420/call-Reblock/HG00444.haplotypeCalls.er.raw.vcf.gz.rb.g.vcf.gz
+gs://fc-e2f6ffa2-4033-4517-98fc-889bee4cc7a6/5e6b194b-5f69-40f2-a6de-f4f3f80ce05a/ReblockGVCF/117697bf-fe61-4c18-85c3-162c706c9037/call-Reblock/attempt-2/HG00447.haplotypeCalls.er.raw.vcf.gz.rb.g.vcf.gz
+gs://fc-e2f6ffa2-4033-4517-98fc-889bee4cc7a6/5e6b194b-5f69-40f2-a6de-f4f3f80ce05a/ReblockGVCF/d5dbd8dc-bbd1-484a-b4ea-94c02ed896d0/call-Reblock/HG00450.haplotypeCalls.er.raw.vcf.gz.rb.g.vcf.gz
diff --git a/...riantstore/wdl/extract/curate_input_array_test_files/output_sample_name_list_file_correct b/...riantstore/wdl/extract/curate_input_array_test_files/output_sample_name_list_file_correct
@@ -0,0 +1,7 @@
+ERS4367795
+ERS4367796
+ERS4367798
+ERS4367800
+ERS4367801
+ERS4367804
+ERS4367805
diff --git a/...variantstore/wdl/extract/curate_input_array_test_files/output_vcf_index_list_file_correct b/...variantstore/wdl/extract/curate_input_array_test_files/output_vcf_index_list_file_correct
@@ -0,0 +1,7 @@
+gs://fc-e2f6ffa2-4033-4517-98fc-889bee4cc7a6/5e6b194b-5f69-40f2-a6de-f4f3f80ce05a/ReblockGVCF/702cdbf7-0666-4ee5-b889-91ba0ffa90bd/call-Reblock/HG00405.haplotypeCalls.er.raw.vcf.gz.rb.g.vcf.gz.tbi
+gs://fc-e2f6ffa2-4033-4517-98fc-889bee4cc7a6/5e6b194b-5f69-40f2-a6de-f4f3f80ce05a/ReblockGVCF/59ee6ce9-b8d0-4e39-8cc7-8908c6daf87c/call-Reblock/HG00408.haplotypeCalls.er.raw.vcf.gz.rb.g.vcf.gz.tbi
+gs://fc-e2f6ffa2-4033-4517-98fc-889bee4cc7a6/5e6b194b-5f69-40f2-a6de-f4f3f80ce05a/ReblockGVCF/16c7c3a2-aa82-42ca-904f-8f0eebc21507/call-Reblock/attempt-2/HG00420.haplotypeCalls.er.raw.vcf.gz.rb.g.vcf.gz.tbi
+gs://fc-e2f6ffa2-4033-4517-98fc-889bee4cc7a6/5e6b194b-5f69-40f2-a6de-f4f3f80ce05a/ReblockGVCF/1acc4d0b-812d-4539-9a3b-841c3413d057/call-Reblock/HG00427.haplotypeCalls.er.raw.vcf.gz.rb.g.vcf.gz.tbi
+gs://fc-e2f6ffa2-4033-4517-98fc-889bee4cc7a6/5e6b194b-5f69-40f2-a6de-f4f3f80ce05a/ReblockGVCF/47091691-f665-4a2c-bc6a-b4b5d57fa222/call-Reblock/HG00429.haplotypeCalls.er.raw.vcf.gz.rb.g.vcf.gz.tbi
+gs://fc-e2f6ffa2-4033-4517-98fc-889bee4cc7a6/5e6b194b-5f69-40f2-a6de-f4f3f80ce05a/ReblockGVCF/117697bf-fe61-4c18-85c3-162c706c9037/call-Reblock/attempt-2/HG00447.haplotypeCalls.er.raw.vcf.gz.rb.g.vcf.gz.tbi
+gs://fc-e2f6ffa2-4033-4517-98fc-889bee4cc7a6/5e6b194b-5f69-40f2-a6de-f4f3f80ce05a/ReblockGVCF/d5dbd8dc-bbd1-484a-b4ea-94c02ed896d0/call-Reblock/HG00450.haplotypeCalls.er.raw.vcf.gz.rb.g.vcf.gz.tbi