Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add option to not prepare __REF_DATA table to Prepare [VS-697] #8079

Merged
merged 13 commits into from
Nov 7, 2022
3 changes: 0 additions & 3 deletions .dockstore.yml
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,6 @@ workflows:
branches:
- master
- ah_var_store
- gg_VS-637_RespectSampleLoadStatusFinishedFlag
- name: GvsPrepareRangesCallset
subclass: WDL
primaryDescriptorPath: /scripts/variantstore/wdl/GvsPrepareRangesCallset.wdl
Expand Down Expand Up @@ -230,15 +229,13 @@ workflows:
branches:
- master
- ah_var_store
- vs_655_avro_extract_warn_on_bad_filter_name
- name: GvsCallsetStatistics
subclass: WDL
primaryDescriptorPath: /scripts/variantstore/wdl/GvsCallsetStatistics.wdl
filters:
branches:
- master
- ah_var_store
- vs_655_avro_extract_warn_on_bad_filter_name
- name: MitochondriaPipeline
subclass: WDL
primaryDescriptorPath: /scripts/mitochondria_m2_wdl/MitochondriaPipeline.wdl
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -264,7 +264,7 @@ task Add_AS_MAX_VQSLOD_ToVcf {
File input_vcf
String output_basename

String docker = "us.gcr.io/broad-dsde-methods/variantstore:2022-10-25-alpine"
String docker = "us.gcr.io/broad-dsde-methods/variantstore:2022-11-07-v2-alpine"
Int cpu = 1
Int memory_mb = 3500
Int disk_size_gb = ceil(2*size(input_vcf, "GiB")) + 50
Expand Down
2 changes: 1 addition & 1 deletion scripts/variantstore/wdl/GvsCallsetCost.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ task WorkflowComputeCosts {
>>>

runtime {
docker: "us.gcr.io/broad-dsde-methods/variantstore:2022-10-25-alpine"
docker: "us.gcr.io/broad-dsde-methods/variantstore:2022-11-07-v2-alpine"
}

output {
Expand Down
2 changes: 1 addition & 1 deletion scripts/variantstore/wdl/GvsCreateVAT.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ task MakeSubpopulationFilesAndReadSchemaFiles {
# ------------------------------------------------
# Runtime settings:
runtime {
docker: "us.gcr.io/broad-dsde-methods/variantstore:2022-10-25-alpine"
docker: "us.gcr.io/broad-dsde-methods/variantstore:2022-11-07-v2-alpine"
memory: "1 GB"
preemptible: 3
cpu: "1"
Expand Down
4 changes: 2 additions & 2 deletions scripts/variantstore/wdl/GvsCreateVATAnnotations.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,7 @@ task ExtractAnAcAfFromVCF {
# ------------------------------------------------
# Runtime settings:
runtime {
docker: "us.gcr.io/broad-dsde-methods/variantstore:2022-10-25-alpine"
docker: "us.gcr.io/broad-dsde-methods/variantstore:2022-11-07-v2-alpine"
maxRetries: 3
memory: "16 GB"
preemptible: 3
Expand Down Expand Up @@ -291,7 +291,7 @@ task PrepAnnotationJson {
# ------------------------------------------------
# Runtime settings:
runtime {
docker: "us.gcr.io/broad-dsde-methods/variantstore:2022-10-25-alpine"
docker: "us.gcr.io/broad-dsde-methods/variantstore:2022-11-07-v2-alpine"
memory: "8 GB"
preemptible: 5
cpu: "1"
Expand Down
2 changes: 1 addition & 1 deletion scripts/variantstore/wdl/GvsExtractAvroFilesForHail.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -286,7 +286,7 @@ task GenerateHailScripts {
File hail_create_vat_inputs_script = 'hail_create_vat_inputs.py'
}
runtime {
docker: "us.gcr.io/broad-dsde-methods/variantstore:2022-10-25-alpine"
docker: "us.gcr.io/broad-dsde-methods/variantstore:2022-11-07-v2-alpine"
disks: "local-disk 500 HDD"
}
}
2 changes: 1 addition & 1 deletion scripts/variantstore/wdl/GvsPopulateAltAllele.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -243,7 +243,7 @@ task PopulateAltAlleleTable {
done
>>>
runtime {
docker: "us.gcr.io/broad-dsde-methods/variantstore:2022-10-25-alpine"
docker: "us.gcr.io/broad-dsde-methods/variantstore:2022-11-07-v2-alpine"
memory: "3 GB"
disks: "local-disk 10 HDD"
cpu: 1
Expand Down
11 changes: 8 additions & 3 deletions scripts/variantstore/wdl/GvsPrepareRangesCallset.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ workflow GvsPrepareCallset {

Array[String]? query_labels
File? sample_names_to_extract
Boolean only_output_vet_tables = false
}

String full_extract_prefix = if (control_samples) then "~{extract_table_prefix}_controls" else extract_table_prefix
Expand All @@ -38,7 +39,8 @@ workflow GvsPrepareCallset {
fq_temp_table_dataset = fq_temp_table_dataset,
fq_destination_dataset = fq_destination_dataset,
temp_table_ttl_in_hours = 72,
control_samples = control_samples
control_samples = control_samples,
only_output_vet_tables = only_output_vet_tables
}

output {
Expand All @@ -61,6 +63,7 @@ task PrepareRangesCallsetTask {
String fq_destination_dataset
Array[String]? query_labels
Int temp_table_ttl_in_hours = 24
Boolean only_output_vet_tables
}
meta {
# All kinds of BQ reading happening in the referenced Python script.
Expand Down Expand Up @@ -98,14 +101,16 @@ task PrepareRangesCallsetTask {
--query_project ~{query_project} \
~{sep=" " query_label_args} \
--fq_sample_mapping_table ~{fq_sample_mapping_table} \
--ttl ~{temp_table_ttl_in_hours}
--ttl ~{temp_table_ttl_in_hours} \
~{true="--only_output_vet_tables True" false='' only_output_vet_tables}
rsasch marked this conversation as resolved.
Show resolved Hide resolved

>>>
output {
String fq_cohort_extract_table_prefix = "~{fq_destination_dataset}.~{destination_cohort_table_prefix}" # implementation detail of create_ranges_cohort_extract_data_table.py
}

runtime {
docker: "us.gcr.io/broad-dsde-methods/variantstore:2022-10-25-alpine"
docker: "us.gcr.io/broad-dsde-methods/variantstore:2022-11-07-v2-alpine"
memory: "3 GB"
disks: "local-disk 100 HDD"
bootDiskSizeGb: 15
Expand Down
2 changes: 1 addition & 1 deletion scripts/variantstore/wdl/GvsUtils.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -347,7 +347,7 @@ task ScaleXYBedValues {
}

runtime {
docker: "us.gcr.io/broad-dsde-methods/variantstore:2022-10-25-alpine"
docker: "us.gcr.io/broad-dsde-methods/variantstore:2022-11-07-v2-alpine"
maxRetries: 3
memory: "7 GB"
preemptible: 3
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -72,13 +72,19 @@ def load_sample_names(sample_names_to_extract, fq_temp_table_dataset):
return fq_sample_table


def get_all_sample_ids(fq_destination_table_samples):
sql = f"select sample_id from `{fq_destination_table_samples}`"
def get_all_sample_ids(fq_destination_table_samples, only_output_vet_tables, fq_sample_mapping_table):
    """Return the sorted list of sample_ids that should drive the extract.

    When ``only_output_vet_tables`` is set, the ``__SAMPLES`` destination table
    is never created, so sample ids are read directly from the sample mapping
    table, excluding control samples and samples with a non-null ``withdrawn``
    value. Otherwise the ids come from the already-populated destination
    samples table.

    Args:
        fq_destination_table_samples: fully-qualified ``__SAMPLES`` destination table name.
        only_output_vet_tables: if True, query the mapping table instead of the
            destination samples table.
        fq_sample_mapping_table: fully-qualified sample mapping table name.

    Returns:
        Sorted list of ``sample_id`` values.
    """
    if only_output_vet_tables:
        # NOTE(review): assumes the mapping table carries `is_control` and
        # `withdrawn` columns — confirm this matches the filtering applied by
        # create_extract_samples_table when the __SAMPLES table IS built.
        sql = f"select sample_id from `{fq_sample_mapping_table}` WHERE is_control = false AND withdrawn IS NULL"
        sample_table = fq_sample_mapping_table
    else:
        sql = f"select sample_id from `{fq_destination_table_samples}`"
        sample_table = fq_destination_table_samples

    query_return = utils.execute_with_retry(client, "read cohort sample table", sql)
    # Record the BQ job so the script-level JOBS bookkeeping sees this query too.
    JOBS.append({'job': query_return['job'], 'label': query_return['label']})
    sample_ids = [row.sample_id for row in list(query_return['results'])]
    sample_ids.sort()
    print(f"Discovered {len(sample_ids)} samples in {sample_table}...")
    return sample_ids


Expand Down Expand Up @@ -221,8 +227,8 @@ def make_extract_table(call_set_identifier,
fq_destination_dataset,
destination_table_prefix,
fq_sample_mapping_table,
temp_table_ttl_hours
):
temp_table_ttl_hours,
only_output_vet_tables):
try:
fq_destination_table_ref_data = f"{fq_destination_dataset}.{destination_table_prefix}__REF_DATA"
fq_destination_table_vet_data = f"{fq_destination_dataset}.{destination_table_prefix}__VET_DATA"
Expand Down Expand Up @@ -281,18 +287,19 @@ def make_extract_table(call_set_identifier,
# drive the extract. If this script was explicitly given a list of sample names then it should create the
# cohort from those samples without regard to `withdrawn` on the `sample_info` table, otherwise only include
# samples with a null `withdrawn` date in the cohort.
create_extract_samples_table(control_samples, fq_destination_table_samples, fq_sample_name_table,
if not only_output_vet_tables:
create_extract_samples_table(control_samples, fq_destination_table_samples, fq_sample_name_table,
fq_sample_mapping_table, honor_withdrawn=not sample_names_to_extract)

# pull the sample ids back down
sample_ids = get_all_sample_ids(fq_destination_table_samples)
print(f"Discovered {len(sample_ids)} samples in {fq_destination_table_samples}...")
sample_ids = get_all_sample_ids(fq_destination_table_samples, only_output_vet_tables, fq_sample_mapping_table)

# create the tables for extract data
create_final_extract_ref_table(fq_destination_table_ref_data)
create_final_extract_vet_table(fq_destination_table_vet_data)
# create and populate the tables for extract data
if not only_output_vet_tables:
create_final_extract_ref_table(fq_destination_table_ref_data)
populate_final_extract_table_with_ref(fq_ranges_dataset, fq_destination_table_ref_data, sample_ids)

populate_final_extract_table_with_ref(fq_ranges_dataset, fq_destination_table_ref_data, sample_ids)
create_final_extract_vet_table(fq_destination_table_vet_data)
populate_final_extract_table_with_vet(fq_ranges_dataset, fq_destination_table_vet_data, sample_ids)

finally:
Expand Down Expand Up @@ -325,6 +332,8 @@ def make_extract_table(call_set_identifier,
parser.add_argument('--max_tables',type=int, help='Maximum number of vet/ref ranges tables to consider', required=False,
default=250)
parser.add_argument('--ttl', type=int, help='Temp table TTL in hours', required=False, default=72)
parser.add_argument('--only_output_vet_tables', type=bool,
help='Only create __VET_DATA table, skip __REF_DATA and __SAMPLES tables', required=False, default=False)

sample_args = parser.add_mutually_exclusive_group(required=True)
sample_args.add_argument('--sample_names_to_extract', type=str,
Expand All @@ -348,4 +357,5 @@ def make_extract_table(call_set_identifier,
args.fq_destination_dataset,
args.destination_cohort_table_prefix,
args.fq_sample_mapping_table,
args.ttl)
args.ttl,
args.only_output_vet_tables)