Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

VS-698 Remove unnecessary columns from Call set statistics #8073

Merged
Merged 6 commits on Oct 26, 2022
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
242 changes: 21 additions & 221 deletions scripts/variantstore/wdl/GvsCallsetStatistics.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -190,11 +190,6 @@ task CreateTables {
"name": "singleton",
"type": "INT64",
"mode": "NULLABLE"
},
{
"name": "pass_qc",
"type": "INT64",
"mode": "NULLABLE"
}
]
FIN
Expand All @@ -219,160 +214,40 @@ task CreateTables {
"type": "INT64",
"mode": "NULLABLE"
},
{
"name": "m_del_count",
"type": "FLOAT64",
"mode": "NULLABLE"
},
{
"name": "mad_del_count",
"type": "FLOAT64",
"mode": "NULLABLE"
},
{
"name": "pass_del_count",
"type": "BOOL",
"mode": "NULLABLE"
},
{
"name": "ins_count",
"type": "INT64",
"mode": "NULLABLE"
},
{
"name": "m_ins_count",
"type": "FLOAT64",
"mode": "NULLABLE"
},
{
"name": "mad_ins_count",
"type": "FLOAT64",
"mode": "NULLABLE"
},
{
"name": "pass_ins_count",
"type": "BOOL",
"mode": "NULLABLE"
},
{
"name": "snp_count",
"type": "INT64",
"mode": "NULLABLE"
},
{
"name": "m_snp_count",
"type": "FLOAT64",
"mode": "NULLABLE"
},
{
"name": "mad_snp_count",
"type": "FLOAT64",
"mode": "NULLABLE"
},
{
"name": "pass_snp_count",
"type": "BOOL",
"mode": "NULLABLE"
},
{
"name": "singleton",
"type": "INT64",
"mode": "NULLABLE"
},
{
"name": "m_singleton",
"type": "FLOAT64",
"mode": "NULLABLE"
},
{
"name": "mad_singleton",
"type": "FLOAT64",
"mode": "NULLABLE"
},
{
"name": "pass_singleton",
"type": "BOOL",
"mode": "NULLABLE"
},
{
"name": "ins_del_ratio",
"type": "FLOAT64",
"mode": "NULLABLE"
},
{
"name": "m_ins_del_ratio",
"type": "FLOAT64",
"mode": "NULLABLE"
},
{
"name": "mad_ins_del_ratio",
"type": "FLOAT64",
"mode": "NULLABLE"
},
{
"name": "pass_ins_del_ratio",
"type": "BOOL",
"mode": "NULLABLE"
},
{
"name": "ti_tv_ratio",
"type": "FLOAT64",
"mode": "NULLABLE"
},
{
"name": "m_ti_tv_ratio",
"type": "FLOAT64",
"mode": "NULLABLE"
},
{
"name": "mad_ti_tv_ratio",
"type": "FLOAT64",
"mode": "NULLABLE"
},
{
"name": "pass_ti_tv_ratio",
"type": "BOOL",
"mode": "NULLABLE"
},
{
"name": "snp_het_homvar_ratio",
"type": "FLOAT64",
"mode": "NULLABLE"
},
{
"name": "m_snp_het_homvar_ratio",
"type": "FLOAT64",
"mode": "NULLABLE"
},
{
"name": "mad_snp_het_homvar_ratio",
"type": "FLOAT64",
"mode": "NULLABLE"
},
{
"name": "pass_snp_het_homvar_ratio",
"type": "BOOL",
"mode": "NULLABLE"
},
{
"name": "indel_het_homvar_ratio",
"type": "FLOAT64",
"mode": "NULLABLE"
},
{
"name": "m_indel_het_homvar_ratio",
"type": "FLOAT64",
"mode": "NULLABLE"
},
{
"name": "mad_indel_het_homvar_ratio",
"type": "FLOAT64",
"mode": "NULLABLE"
},
{
"name": "pass_indel_het_homvar_ratio",
"type": "BOOL",
"mode": "NULLABLE"
}
]
FIN
Expand Down Expand Up @@ -490,8 +365,7 @@ task CollectMetricsForChromosome {
snp_homvar_count,
indel_het_count,
indel_homvar_count,
singleton,
pass_qc
singleton
)
SELECT "~{filter_set_name}" filter_set_name,
sample_id,
Expand All @@ -506,8 +380,7 @@ task CollectMetricsForChromosome {
SUM(CASE WHEN type = "snp" AND gt_type = "homvar" THEN 1 ELSE 0 END) snp_homvar_count,
SUM(CASE WHEN type IN ("ins","del") AND gt_type = "het" THEN 1 ELSE 0 END) indel_het_count,
SUM(CASE WHEN type IN ("ins","del") AND gt_type = "homvar" THEN 1 ELSE 0 END) indel_homvar_count,
COUNTIF(not in_gnomad) singleton,
null AS pass_qc
COUNTIF(not in_gnomad) singleton
FROM (
SELECT sample_id,
type(ref, alt, call_GT) as type,
Expand Down Expand Up @@ -575,8 +448,7 @@ task AggregateMetricsAcrossChromosomes {
snp_homvar_count,
indel_het_count,
indel_homvar_count,
singleton,
pass_qc
singleton
)
SELECT "~{filter_set_name}" filter_set_name,
sample_id,
Expand All @@ -590,8 +462,7 @@ task AggregateMetricsAcrossChromosomes {
SUM(snp_homvar_count) snp_homvar_count,
SUM(indel_het_count) indel_het_count,
SUM(indel_homvar_count) indel_homvar_count,
SUM(singleton) singleton,
null AS pass_qc
SUM(singleton) singleton
FROM `~{project_id}.~{dataset_name}.~{metrics_table}` GROUP BY 1,2

'
Expand Down Expand Up @@ -640,102 +511,31 @@ task CollectStatistics {
sample_id,
sample_name,
del_count,
m_del_count,
mad_del_count,
pass_del_count,
ins_count,
m_ins_count,
mad_ins_count,
pass_ins_count,
snp_count,
m_snp_count,
mad_snp_count,
pass_snp_count,
singleton,
m_singleton,
mad_singleton,
pass_singleton,
ins_del_ratio,
m_ins_del_ratio,
mad_ins_del_ratio,
pass_ins_del_ratio,
ti_tv_ratio,
m_ti_tv_ratio,
mad_ti_tv_ratio,
pass_ti_tv_ratio,
snp_het_homvar_ratio,
m_snp_het_homvar_ratio,
mad_snp_het_homvar_ratio,
pass_snp_het_homvar_ratio,
indel_het_homvar_ratio,
m_indel_het_homvar_ratio,
mad_indel_het_homvar_ratio,
pass_indel_het_homvar_ratio
indel_het_homvar_ratio
)

WITH fss AS (
SELECT *,
(ins_count / del_count) as ins_del_ratio,
(ti_count / tv_count) as ti_tv_ratio,
(snp_het_count / snp_homvar_count) snp_het_homvar_ratio,
(indel_het_count / indel_homvar_count) as indel_het_homvar_ratio
FROM `~{project_id}.~{dataset_name}.~{aggregate_metrics_table}`
WHERE filter_set_name = "~{filter_set_name}"),
medians AS (
SELECT
`bqutil`.fn.median(ARRAY_AGG(del_count IGNORE NULLS)) as m_del_count,
`bqutil`.fn.median(ARRAY_AGG(ins_count IGNORE NULLS)) as m_ins_count,
`bqutil`.fn.median(ARRAY_AGG(snp_count IGNORE NULLS)) as m_snp_count,
`bqutil`.fn.median(ARRAY_AGG(singleton IGNORE NULLS)) as m_singleton,
`bqutil`.fn.median(ARRAY_AGG(ins_del_ratio IGNORE NULLS)) as m_ins_del_ratio,
`bqutil`.fn.median(ARRAY_AGG(ti_tv_ratio IGNORE NULLS)) as m_ti_tv_ratio,
`bqutil`.fn.median(ARRAY_AGG(snp_het_homvar_ratio IGNORE NULLS)) as m_snp_het_homvar_ratio,
`bqutil`.fn.median(ARRAY_AGG(indel_het_homvar_ratio IGNORE NULLS)) as m_indel_het_homvar_ratio
FROM fss),
mads AS (
SELECT
`bqutil`.fn.median(ARRAY_AGG(ABS(del_count - m_del_count) IGNORE NULLS)) as mad_del_count,
`bqutil`.fn.median(ARRAY_AGG(ABS(ins_count - m_ins_count) IGNORE NULLS)) as mad_ins_count,
`bqutil`.fn.median(ARRAY_AGG(ABS(snp_count - m_snp_count) IGNORE NULLS)) as mad_snp_count,
`bqutil`.fn.median(ARRAY_AGG(ABS(singleton - m_singleton) IGNORE NULLS)) as mad_singleton,
`bqutil`.fn.median(ARRAY_AGG(ABS(ins_del_ratio - m_ins_del_ratio) IGNORE NULLS)) as mad_ins_del_ratio,
`bqutil`.fn.median(ARRAY_AGG(ABS(ti_tv_ratio - m_ti_tv_ratio) IGNORE NULLS)) as mad_ti_tv_ratio,
`bqutil`.fn.median(ARRAY_AGG(ABS(snp_het_homvar_ratio - m_snp_het_homvar_ratio) IGNORE NULLS)) as mad_snp_het_homvar_ratio,
`bqutil`.fn.median(ARRAY_AGG(ABS(indel_het_homvar_ratio - m_indel_het_homvar_ratio) IGNORE NULLS)) as mad_indel_het_homvar_ratio
FROM fss
CROSS JOIN medians
WHERE filter_set_name = "~{filter_set_name}")
SELECT
fss.sample_id,
si.sample_name,
del_count, m_del_count, mad_del_count,
CASE WHEN del_count BETWEEN m_del_count - 4*mad_del_count AND m_del_count + 4*mad_del_count THEN true ELSE false END pass_del_count,

ins_count, m_ins_count, mad_ins_count,
CASE WHEN ins_count BETWEEN m_ins_count - 4*mad_ins_count AND m_ins_count + 4*mad_ins_count THEN true ELSE false END pass_ins_count,

snp_count, m_snp_count, mad_snp_count,
CASE WHEN snp_count BETWEEN m_snp_count - 4*mad_snp_count AND m_snp_count + 4*mad_snp_count THEN true ELSE false END pass_snp_count,

singleton, m_singleton, mad_singleton,
CASE WHEN singleton BETWEEN m_singleton - 8*mad_singleton AND m_singleton + 8*mad_singleton THEN true ELSE false END pass_singleton,

ins_del_ratio, m_ins_del_ratio, mad_ins_del_ratio,
CASE WHEN ins_del_ratio BETWEEN m_ins_del_ratio - 4*mad_ins_del_ratio AND m_ins_del_ratio + 4*mad_ins_del_ratio THEN true ELSE false END pass_ins_del_ratio,

ti_tv_ratio, m_ti_tv_ratio, mad_ti_tv_ratio,
CASE WHEN ti_tv_ratio BETWEEN m_ti_tv_ratio - 4*mad_ti_tv_ratio AND m_ti_tv_ratio + 4*mad_ti_tv_ratio THEN true ELSE false END pass_ti_tv_ratio,

snp_het_homvar_ratio, m_snp_het_homvar_ratio, mad_snp_het_homvar_ratio,
CASE WHEN snp_het_homvar_ratio BETWEEN m_snp_het_homvar_ratio - 4*mad_snp_het_homvar_ratio AND m_snp_het_homvar_ratio + 4*mad_snp_het_homvar_ratio THEN true ELSE false END pass_snp_het_homvar_ratio,

indel_het_homvar_ratio, m_indel_het_homvar_ratio, mad_indel_het_homvar_ratio,
CASE WHEN indel_het_homvar_ratio BETWEEN m_indel_het_homvar_ratio - 4*mad_indel_het_homvar_ratio AND m_indel_het_homvar_ratio + 4*mad_indel_het_homvar_ratio THEN true ELSE false END pass_indel_het_homvar_ratio,
FROM fss
JOIN `~{project_id}.~{dataset_name}.~{extract_prefix}__SAMPLES` si ON (fss.sample_id = si.sample_id)
CROSS JOIN medians
CROSS JOIN mads
order by 1
amt.sample_id,
si.sample_name,
del_count,
ins_count,
snp_count,
singleton,
(ins_count / del_count) as ins_del_ratio,
(ti_count / tv_count) as ti_tv_ratio,
(snp_het_count / snp_homvar_count) snp_het_homvar_ratio,
(indel_het_count / indel_homvar_count) as indel_het_homvar_ratio
FROM `~{project_id}.~{dataset_name}.~{aggregate_metrics_table}` amt
JOIN `~{project_id}.~{dataset_name}.sample_info` si ON (amt.sample_id = si.sample_id)
WHERE amt.filter_set_name = "~{filter_set_name}"
AND si.withdrawn IS NULL
ORDER BY 1

'
>>>
Expand Down Expand Up @@ -764,7 +564,7 @@ task ExportToCSV {

bq query --nouse_legacy_sql --project_id=~{project_id} --format=csv --max_rows 1000000000 '

SELECT * from `~{project_id}.~{dataset_name}.~{statistics_table}`
SELECT * FROM `~{project_id}.~{dataset_name}.~{statistics_table}` ORDER BY SAMPLE_NAME

' > '~{statistics_table}.csv'
>>>
Expand Down