diff --git a/scripts/variantstore/wdl/GvsCallsetStatistics.wdl b/scripts/variantstore/wdl/GvsCallsetStatistics.wdl index 72a3928bc53..85fd890354a 100644 --- a/scripts/variantstore/wdl/GvsCallsetStatistics.wdl +++ b/scripts/variantstore/wdl/GvsCallsetStatistics.wdl @@ -190,11 +190,6 @@ task CreateTables { "name": "singleton", "type": "INT64", "mode": "NULLABLE" - }, - { - "name": "pass_qc", - "type": "INT64", - "mode": "NULLABLE" } ] FIN @@ -219,160 +214,40 @@ task CreateTables { "type": "INT64", "mode": "NULLABLE" }, - { - "name": "m_del_count", - "type": "FLOAT64", - "mode": "NULLABLE" - }, - { - "name": "mad_del_count", - "type": "FLOAT64", - "mode": "NULLABLE" - }, - { - "name": "pass_del_count", - "type": "BOOL", - "mode": "NULLABLE" - }, { "name": "ins_count", "type": "INT64", "mode": "NULLABLE" }, - { - "name": "m_ins_count", - "type": "FLOAT64", - "mode": "NULLABLE" - }, - { - "name": "mad_ins_count", - "type": "FLOAT64", - "mode": "NULLABLE" - }, - { - "name": "pass_ins_count", - "type": "BOOL", - "mode": "NULLABLE" - }, { "name": "snp_count", "type": "INT64", "mode": "NULLABLE" }, - { - "name": "m_snp_count", - "type": "FLOAT64", - "mode": "NULLABLE" - }, - { - "name": "mad_snp_count", - "type": "FLOAT64", - "mode": "NULLABLE" - }, - { - "name": "pass_snp_count", - "type": "BOOL", - "mode": "NULLABLE" - }, { "name": "singleton", "type": "INT64", "mode": "NULLABLE" }, - { - "name": "m_singleton", - "type": "FLOAT64", - "mode": "NULLABLE" - }, - { - "name": "mad_singleton", - "type": "FLOAT64", - "mode": "NULLABLE" - }, - { - "name": "pass_singleton", - "type": "BOOL", - "mode": "NULLABLE" - }, { "name": "ins_del_ratio", "type": "FLOAT64", "mode": "NULLABLE" }, - { - "name": "m_ins_del_ratio", - "type": "FLOAT64", - "mode": "NULLABLE" - }, - { - "name": "mad_ins_del_ratio", - "type": "FLOAT64", - "mode": "NULLABLE" - }, - { - "name": "pass_ins_del_ratio", - "type": "BOOL", - "mode": "NULLABLE" - }, { "name": "ti_tv_ratio", "type": "FLOAT64", "mode": "NULLABLE" }, - { - "name": "m_ti_tv_ratio", - "type": "FLOAT64", - "mode": "NULLABLE" - }, - { - "name": "mad_ti_tv_ratio", - "type": "FLOAT64", - "mode": "NULLABLE" - }, - { - "name": "pass_ti_tv_ratio", - "type": "BOOL", - "mode": "NULLABLE" - }, { "name": "snp_het_homvar_ratio", "type": "FLOAT64", "mode": "NULLABLE" }, - { - "name": "m_snp_het_homvar_ratio", - "type": "FLOAT64", - "mode": "NULLABLE" - }, - { - "name": "mad_snp_het_homvar_ratio", - "type": "FLOAT64", - "mode": "NULLABLE" - }, - { - "name": "pass_snp_het_homvar_ratio", - "type": "BOOL", - "mode": "NULLABLE" - }, { "name": "indel_het_homvar_ratio", "type": "FLOAT64", "mode": "NULLABLE" - }, - { - "name": "m_indel_het_homvar_ratio", - "type": "FLOAT64", - "mode": "NULLABLE" - }, - { - "name": "mad_indel_het_homvar_ratio", - "type": "FLOAT64", - "mode": "NULLABLE" - }, - { - "name": "pass_indel_het_homvar_ratio", - "type": "BOOL", - "mode": "NULLABLE" } ] FIN @@ -490,8 +365,7 @@ task CollectMetricsForChromosome { snp_homvar_count, indel_het_count, indel_homvar_count, - singleton, - pass_qc + singleton ) SELECT "~{filter_set_name}" filter_set_name, sample_id, @@ -506,8 +380,7 @@ task CollectMetricsForChromosome { SUM(CASE WHEN type = "snp" AND gt_type = "homvar" THEN 1 ELSE 0 END) snp_homvar_count, SUM(CASE WHEN type IN ("ins","del") AND gt_type = "het" THEN 1 ELSE 0 END) indel_het_count, SUM(CASE WHEN type IN ("ins","del") AND gt_type = "homvar" THEN 1 ELSE 0 END) indel_homvar_count, - COUNTIF(not in_gnomad) singleton, - null AS pass_qc + COUNTIF(not in_gnomad) singleton FROM ( SELECT sample_id, type(ref, alt, call_GT) as type, @@ -575,8 +448,7 @@ task AggregateMetricsAcrossChromosomes { snp_homvar_count, indel_het_count, indel_homvar_count, - singleton, - pass_qc + singleton ) SELECT "~{filter_set_name}" filter_set_name, sample_id, @@ -590,8 +462,7 @@ task AggregateMetricsAcrossChromosomes { SUM(snp_homvar_count) snp_homvar_count, SUM(indel_het_count) indel_het_count, SUM(indel_homvar_count) indel_homvar_count, - SUM(singleton) singleton, - null AS pass_qc + SUM(singleton) singleton FROM `~{project_id}.~{dataset_name}.~{metrics_table}` GROUP BY 1,2 ' @@ -640,102 +511,31 @@ task CollectStatistics { sample_id, sample_name, del_count, - m_del_count, - mad_del_count, - pass_del_count, ins_count, - m_ins_count, - mad_ins_count, - pass_ins_count, snp_count, - m_snp_count, - mad_snp_count, - pass_snp_count, singleton, - m_singleton, - mad_singleton, - pass_singleton, ins_del_ratio, - m_ins_del_ratio, - mad_ins_del_ratio, - pass_ins_del_ratio, ti_tv_ratio, - m_ti_tv_ratio, - mad_ti_tv_ratio, - pass_ti_tv_ratio, snp_het_homvar_ratio, - m_snp_het_homvar_ratio, - mad_snp_het_homvar_ratio, - pass_snp_het_homvar_ratio, - indel_het_homvar_ratio, - m_indel_het_homvar_ratio, - mad_indel_het_homvar_ratio, - pass_indel_het_homvar_ratio + indel_het_homvar_ratio ) - WITH fss AS ( - SELECT *, - (ins_count / del_count) as ins_del_ratio, - (ti_count / tv_count) as ti_tv_ratio, - (snp_het_count / snp_homvar_count) snp_het_homvar_ratio, - (indel_het_count / indel_homvar_count) as indel_het_homvar_ratio - FROM `~{project_id}.~{dataset_name}.~{aggregate_metrics_table}` - WHERE filter_set_name = "~{filter_set_name}"), - medians AS ( - SELECT - `bqutil`.fn.median(ARRAY_AGG(del_count IGNORE NULLS)) as m_del_count, - `bqutil`.fn.median(ARRAY_AGG(ins_count IGNORE NULLS)) as m_ins_count, - `bqutil`.fn.median(ARRAY_AGG(snp_count IGNORE NULLS)) as m_snp_count, - `bqutil`.fn.median(ARRAY_AGG(singleton IGNORE NULLS)) as m_singleton, - `bqutil`.fn.median(ARRAY_AGG(ins_del_ratio IGNORE NULLS)) as m_ins_del_ratio, - `bqutil`.fn.median(ARRAY_AGG(ti_tv_ratio IGNORE NULLS)) as m_ti_tv_ratio, - `bqutil`.fn.median(ARRAY_AGG(snp_het_homvar_ratio IGNORE NULLS)) as m_snp_het_homvar_ratio, - `bqutil`.fn.median(ARRAY_AGG(indel_het_homvar_ratio IGNORE NULLS)) as m_indel_het_homvar_ratio - FROM fss), - mads AS ( - SELECT - `bqutil`.fn.median(ARRAY_AGG(ABS(del_count - m_del_count) IGNORE NULLS)) as mad_del_count, - `bqutil`.fn.median(ARRAY_AGG(ABS(ins_count - m_ins_count) IGNORE NULLS)) as mad_ins_count, - `bqutil`.fn.median(ARRAY_AGG(ABS(snp_count - m_snp_count) IGNORE NULLS)) as mad_snp_count, - `bqutil`.fn.median(ARRAY_AGG(ABS(singleton - m_singleton) IGNORE NULLS)) as mad_singleton, - `bqutil`.fn.median(ARRAY_AGG(ABS(ins_del_ratio - m_ins_del_ratio) IGNORE NULLS)) as mad_ins_del_ratio, - `bqutil`.fn.median(ARRAY_AGG(ABS(ti_tv_ratio - m_ti_tv_ratio) IGNORE NULLS)) as mad_ti_tv_ratio, - `bqutil`.fn.median(ARRAY_AGG(ABS(snp_het_homvar_ratio - m_snp_het_homvar_ratio) IGNORE NULLS)) as mad_snp_het_homvar_ratio, - `bqutil`.fn.median(ARRAY_AGG(ABS(indel_het_homvar_ratio - m_indel_het_homvar_ratio) IGNORE NULLS)) as mad_indel_het_homvar_ratio - FROM fss - CROSS JOIN medians - WHERE filter_set_name = "~{filter_set_name}") SELECT - fss.sample_id, - si.sample_name, - del_count, m_del_count, mad_del_count, - CASE WHEN del_count BETWEEN m_del_count - 4*mad_del_count AND m_del_count + 4*mad_del_count THEN true ELSE false END pass_del_count, - - ins_count, m_ins_count, mad_ins_count, - CASE WHEN ins_count BETWEEN m_ins_count - 4*mad_ins_count AND m_ins_count + 4*mad_ins_count THEN true ELSE false END pass_ins_count, - - snp_count, m_snp_count, mad_snp_count, - CASE WHEN snp_count BETWEEN m_snp_count - 4*mad_snp_count AND m_snp_count + 4*mad_snp_count THEN true ELSE false END pass_snp_count, - - singleton, m_singleton, mad_singleton, - CASE WHEN singleton BETWEEN m_singleton - 8*mad_singleton AND m_singleton + 8*mad_singleton THEN true ELSE false END pass_singleton, - - ins_del_ratio, m_ins_del_ratio, mad_ins_del_ratio, - CASE WHEN ins_del_ratio BETWEEN m_ins_del_ratio - 4*mad_ins_del_ratio AND m_ins_del_ratio + 4*mad_ins_del_ratio THEN true ELSE false END pass_ins_del_ratio, - - ti_tv_ratio, m_ti_tv_ratio, mad_ti_tv_ratio, - CASE WHEN ti_tv_ratio BETWEEN m_ti_tv_ratio - 4*mad_ti_tv_ratio AND m_ti_tv_ratio + 4*mad_ti_tv_ratio THEN true ELSE false END pass_ti_tv_ratio, - - snp_het_homvar_ratio, m_snp_het_homvar_ratio, mad_snp_het_homvar_ratio, - CASE WHEN snp_het_homvar_ratio BETWEEN m_snp_het_homvar_ratio - 4*mad_snp_het_homvar_ratio AND m_snp_het_homvar_ratio + 4*mad_snp_het_homvar_ratio THEN true ELSE false END pass_snp_het_homvar_ratio, - - indel_het_homvar_ratio, m_indel_het_homvar_ratio, mad_indel_het_homvar_ratio, - CASE WHEN indel_het_homvar_ratio BETWEEN m_indel_het_homvar_ratio - 4*mad_indel_het_homvar_ratio AND m_indel_het_homvar_ratio + 4*mad_indel_het_homvar_ratio THEN true ELSE false END pass_indel_het_homvar_ratio, - FROM fss - JOIN `~{project_id}.~{dataset_name}.~{extract_prefix}__SAMPLES` si ON (fss.sample_id = si.sample_id) - CROSS JOIN medians - CROSS JOIN mads - order by 1 + amt.sample_id, + si.sample_name, + del_count, + ins_count, + snp_count, + singleton, + (ins_count / del_count) as ins_del_ratio, + (ti_count / tv_count) as ti_tv_ratio, + (snp_het_count / snp_homvar_count) snp_het_homvar_ratio, + (indel_het_count / indel_homvar_count) as indel_het_homvar_ratio + FROM `~{project_id}.~{dataset_name}.~{aggregate_metrics_table}` amt + JOIN `~{project_id}.~{dataset_name}.sample_info` si ON (amt.sample_id = si.sample_id) + WHERE amt.filter_set_name = "~{filter_set_name}" + AND si.withdrawn IS NULL + ORDER BY 1 ' >>> @@ -764,7 +564,7 @@ task ExportToCSV { bq query --nouse_legacy_sql --project_id=~{project_id} --format=csv --max_rows 1000000000 ' - SELECT * from `~{project_id}.~{dataset_name}.~{statistics_table}` + SELECT * FROM `~{project_id}.~{dataset_name}.~{statistics_table}` ORDER BY SAMPLE_NAME ' > '~{statistics_table}.csv' >>>