From 831830aa1ba46e0323afbd0f96fcae58d954d2a2 Mon Sep 17 00:00:00 2001 From: Manuel Holtgrewe Date: Thu, 16 Nov 2023 09:06:59 +0100 Subject: [PATCH] feat: adding 50bp filter for REF/ALT on clinvar-sv import (#293) (#294) --- src/clinvar_sv/cli/import.rs | 21 +++++++++++++++++++ ...cli__query__test__smoke_query_var_all.snap | 20 ------------------ .../clinvar-sv-grch37.tsv.db/000016.sst | 2 +- .../clinvar-sv-grch37.tsv.db/000018.sst | 4 ++-- .../clinvar-sv-grch37.tsv.db/000020.sst | 4 ++-- .../clinvar-sv-grch37.tsv.db/IDENTITY | 2 +- tests/clinvar-sv/clinvar-sv-grch37.tsv.db/LOG | 4 ++-- .../clinvar-sv-grch37.tsv.db/MANIFEST-000005 | 2 +- 8 files changed, 30 insertions(+), 29 deletions(-) diff --git a/src/clinvar_sv/cli/import.rs b/src/clinvar_sv/cli/import.rs index 0cdfac87..0e394840 100644 --- a/src/clinvar_sv/cli/import.rs +++ b/src/clinvar_sv/cli/import.rs @@ -21,6 +21,9 @@ pub struct Args { #[arg(long)] pub path_out_rocksdb: String, + /// Minimal VCF REF/ALT length to consider as SV. + #[arg(long, default_value_t = 50)] + pub min_var_size: u32, /// Name of the column family to import into. #[arg(long, default_value = "clinvar-sv")] pub cf_name: String, @@ -89,6 +92,23 @@ fn jsonl_import( outer_stop, } = sequence_location; + if let (Some(reference_allele_vcf), Some(alternate_allee_vcf)) = + (reference_allele_vcf.as_ref(), alternate_allele_vcf.as_ref()) + { + if reference_allele_vcf.len() < args.min_var_size as usize + && alternate_allee_vcf.len() < args.min_var_size as usize + { + tracing::debug!( + "skipping line because of short REF/ALT: {}/{}: {}>{}", + &vcv, + &rcv, + reference_allele_vcf, + alternate_allee_vcf, + ); + continue; + } + } + let (start, stop, inner_start, inner_stop, outer_start, outer_stop) = if let (Some(start), Some(stop)) = (start, stop) { ( @@ -267,6 +287,7 @@ mod test { String::from("tests/clinvar-sv/clinvar-variants-grch37-strucvars.jsonl"), ], path_out_rocksdb: format!("{}", tmp_dir.join("out-rocksdb").display()), + min_var_size: 50, cf_name: String::from("clinvar-sv"), cf_name_by_rcv: String::from("clinvar-sv-by-rcv"), path_wal_dir: None, diff --git a/src/clinvar_sv/cli/snapshots/annonars__clinvar_sv__cli__query__test__smoke_query_var_all.snap b/src/clinvar_sv/cli/snapshots/annonars__clinvar_sv__cli__query__test__smoke_query_var_all.snap index 8dc0ddca..8ad20402 100644 --- a/src/clinvar_sv/cli/snapshots/annonars__clinvar_sv__cli__query__test__smoke_query_var_all.snap +++ b/src/clinvar_sv/cli/snapshots/annonars__clinvar_sv__cli__query__test__smoke_query_var_all.snap @@ -2,26 +2,6 @@ source: src/clinvar_sv/cli/query.rs expression: "&out_data" --- -{"release":"GRCh37","chromosome":"22","start":41320486,"stop":41320486,"reference":"G","alternative":"T","inner_start":null,"inner_stop":null,"outer_start":null,"outer_stop":null,"variant_type":5,"vcv":"VCV000000051","reference_assertions":[{"rcv":"RCV000000068","title":"NM_022098.4(XPNPEP3):c.1357G>T (p.Gly453Cys) AND Nephronophthisis-like nephropathy 1","clinical_significance":0,"review_status":5}]} -{"release":"GRCh37","chromosome":"22","start":41305199,"stop":41305202,"reference":"TCAAA","alternative":"T","inner_start":null,"inner_stop":null,"outer_start":null,"outer_stop":null,"variant_type":0,"vcv":"VCV000000052","reference_assertions":[{"rcv":"RCV000000069","title":"NM_022098.4(XPNPEP3):c.931_934del (p.Asn311fs) AND Nephronophthisis-like nephropathy 1","clinical_significance":0,"review_status":5}]} -{"release":"GRCh37","chromosome":"15","start":49048486,"stop":49048486,"reference":"G","alternative":"A","inner_start":null,"inner_stop":null,"outer_start":null,"outer_stop":null,"variant_type":5,"vcv":"VCV000000056","reference_assertions":[{"rcv":"RCV000000073","title":"NM_001194998.2(CEP152):c.2959C>T (p.Arg987Ter) AND Microcephaly 9, primary, autosomal recessive","clinical_significance":1,"review_status":3}]} -{"release":"GRCh37","chromosome":"19","start":45315576,"stop":45315576,"reference":"C","alternative":"T","inner_start":null,"inner_stop":null,"outer_start":null,"outer_stop":null,"variant_type":5,"vcv":"VCV000000443","reference_assertions":[{"rcv":"RCV000000472","title":"NM_005581.5(BCAM):c.361C>T (p.Arg121Ter) AND BLOOD GROUP--LUTHERAN NULL","clinical_significance":0,"review_status":5}]} -{"release":"GRCh37","chromosome":"3","start":98304466,"stop":98304466,"reference":"G","alternative":"A","inner_start":null,"inner_stop":null,"outer_start":null,"outer_stop":null,"variant_type":5,"vcv":"VCV000000451","reference_assertions":[{"rcv":"RCV000000480","title":"NM_000097.7(CPOX):c.991C>T (p.Arg331Trp) AND Coproporphyria","clinical_significance":0,"review_status":5}]} -{"release":"GRCh37","chromosome":"3","start":98311840,"stop":98311860,"reference":"TACCTGTGCCAGAGCCTGGCAC","alternative":"T","inner_start":null,"inner_stop":null,"outer_start":null,"outer_stop":null,"variant_type":0,"vcv":"VCV000000455","reference_assertions":[{"rcv":"RCV000000484","title":"NM_000097.7(CPOX):c.489_509del (p.Cys164_Val170del) AND Coproporphyria","clinical_significance":0,"review_status":5}]} -{"release":"GRCh37","chromosome":"3","start":98307627,"stop":98307627,"reference":"G","alternative":"C","inner_start":null,"inner_stop":null,"outer_start":null,"outer_stop":null,"variant_type":5,"vcv":"VCV000000456","reference_assertions":[{"rcv":"RCV000000485","title":"NM_000097.7(CPOX):c.883C>G (p.His295Asp) AND Coproporphyria","clinical_significance":0,"review_status":5}]} -{"release":"GRCh37","chromosome":"3","start":98300248,"stop":98300248,"reference":"T","alternative":"C","inner_start":null,"inner_stop":null,"outer_start":null,"outer_stop":null,"variant_type":5,"vcv":"VCV000000457","reference_assertions":[{"rcv":"RCV000000486","title":"NM_000097.7(CPOX):c.1277+3A>G AND Harderoporphyria","clinical_significance":0,"review_status":5}]} -{"release":"GRCh37","chromosome":"3","start":98309933,"stop":98309933,"reference":"G","alternative":"A","inner_start":null,"inner_stop":null,"outer_start":null,"outer_stop":null,"variant_type":5,"vcv":"VCV000000460","reference_assertions":[{"rcv":"RCV000000489","title":"NM_000097.7(CPOX):c.623C>T (p.Ser208Phe) AND Coproporphyria","clinical_significance":0,"review_status":5}]} -{"release":"GRCh37","chromosome":"3","start":98304475,"stop":98304475,"reference":"G","alternative":"A","inner_start":null,"inner_stop":null,"outer_start":null,"outer_stop":null,"variant_type":5,"vcv":"VCV000000461","reference_assertions":[{"rcv":"RCV000000490","title":"NM_000097.7(CPOX):c.982C>T (p.Arg328Cys) AND Coproporphyria","clinical_significance":0,"review_status":5}]} -{"release":"GRCh37","chromosome":"3","start":98307653,"stop":98307654,"reference":"G","alternative":"GT","inner_start":null,"inner_stop":null,"outer_start":null,"outer_stop":null,"variant_type":1,"vcv":"VCV000000462","reference_assertions":[{"rcv":"RCV000000491","title":"NM_000097.7(CPOX):c.856dup (p.Thr286fs) AND Coproporphyria","clinical_significance":0,"review_status":5}]} -{"release":"GRCh37","chromosome":"3","start":98307675,"stop":98307675,"reference":"C","alternative":"G","inner_start":null,"inner_stop":null,"outer_start":null,"outer_stop":null,"variant_type":5,"vcv":"VCV000000463","reference_assertions":[{"rcv":"RCV000000492","title":"NM_000097.7(CPOX):c.835G>C (p.Gly279Arg) AND Coproporphyria, digenic","clinical_significance":0,"review_status":5}]} -{"release":"GRCh37","chromosome":"11","start":112104201,"stop":112104214,"reference":"AGTTCTTCCTGTAGG","alternative":"A","inner_start":null,"inner_stop":null,"outer_start":null,"outer_stop":null,"variant_type":0,"vcv":"VCV000000478","reference_assertions":[{"rcv":"RCV000000507","title":"NM_000317.3(PTS):c.361_374del (p.Val121fs) AND Hyperphenylalaninemia, bh4-deficient, a, due to partial pts deficiency","clinical_significance":0,"review_status":5}]} -{"release":"GRCh37","chromosome":"11","start":112099372,"stop":112099372,"reference":"A","alternative":"G","inner_start":null,"inner_stop":null,"outer_start":null,"outer_stop":null,"variant_type":5,"vcv":"VCV000000482","reference_assertions":[{"rcv":"RCV000000511","title":"NM_000317.3(PTS):c.139A>G (p.Asn47Asp) AND Hyperphenylalaninemia, bh4-deficient, a, due to partial pts deficiency","clinical_significance":0,"review_status":5}]} -{"release":"GRCh37","chromosome":"4","start":17503409,"stop":17503410,"reference":"G","alternative":"GGTA","inner_start":null,"inner_stop":null,"outer_start":null,"outer_stop":null,"variant_type":1,"vcv":"VCV000000489","reference_assertions":[{"rcv":"RCV000000518","title":"NM_000320.3(QDPR):c.366_368dup (p.Thr123dup) AND Dihydropteridine reductase deficiency","clinical_significance":0,"review_status":5}]} -{"release":"GRCh37","chromosome":"4","start":17503456,"stop":17503456,"reference":"A","alternative":"C","inner_start":null,"inner_stop":null,"outer_start":null,"outer_stop":null,"variant_type":5,"vcv":"VCV000000491","reference_assertions":[{"rcv":"RCV000000520","title":"NM_000320.3(QDPR):c.322T>G (p.Trp108Gly) AND Dihydropteridine reductase deficiency","clinical_significance":0,"review_status":5}]} -{"release":"GRCh37","chromosome":"4","start":17510986,"stop":17510986,"reference":"A","alternative":"G","inner_start":null,"inner_stop":null,"outer_start":null,"outer_stop":null,"variant_type":5,"vcv":"VCV000000492","reference_assertions":[{"rcv":"RCV000000521","title":"NM_000320.3(QDPR):c.106T>C (p.Trp36Arg) AND Dihydropteridine reductase deficiency","clinical_significance":0,"review_status":5}]} -{"release":"GRCh37","chromosome":"4","start":17493951,"stop":17493951,"reference":"T","alternative":"C","inner_start":null,"inner_stop":null,"outer_start":null,"outer_stop":null,"variant_type":5,"vcv":"VCV000000494","reference_assertions":[{"rcv":"RCV000000523","title":"NM_000320.3(QDPR):c.449A>G (p.Tyr150Cys) AND Dihydropteridine reductase deficiency","clinical_significance":0,"review_status":5}]} -{"release":"GRCh37","chromosome":"4","start":17506027,"stop":17506027,"reference":"C","alternative":"T","inner_start":null,"inner_stop":null,"outer_start":null,"outer_stop":null,"variant_type":5,"vcv":"VCV000000495","reference_assertions":[{"rcv":"RCV000000524","title":"NM_000320.3(QDPR):c.270G>A (p.Trp90Ter) AND Dihydropteridine reductase deficiency","clinical_significance":0,"review_status":5}]} -{"release":"GRCh37","chromosome":"6","start":117198947,"stop":117198947,"reference":"A","alternative":"G","inner_start":null,"inner_stop":null,"outer_start":null,"outer_stop":null,"variant_type":5,"vcv":"VCV000000497","reference_assertions":[{"rcv":"RCV000000526","title":"NM_173560.4(RFX6):c.224-12A>G AND Hypoplastic pancreas-intestinal atresia-hypoplastic gallbalder syndrome","clinical_significance":0,"review_status":5}]} {"release":"GRCh37","chromosome":"X","start":155210040,"stop":155242832,"reference":null,"alternative":null,"inner_start":155210040,"inner_stop":null,"outer_start":155242832,"outer_stop":null,"variant_type":0,"vcv":"VCV000057500","reference_assertions":[{"rcv":"RCV000051210","title":"GRCh38/hg38 Xq28(chrX:155980375-156013167)x0 AND See cases","clinical_significance":4,"review_status":3}]} {"release":"GRCh37","chromosome":"7","start":64691936,"stop":64866073,"reference":null,"alternative":null,"inner_start":64691936,"inner_stop":null,"outer_start":64866073,"outer_stop":null,"variant_type":0,"vcv":"VCV000057566","reference_assertions":[{"rcv":"RCV000051294","title":"GRCh38/hg38 7q11.21(chr7:65231558-65401160)x1 AND See cases","clinical_significance":2,"review_status":3}]} {"release":"GRCh37","chromosome":"22","start":34150132,"stop":34182300,"reference":null,"alternative":null,"inner_start":34150132,"inner_stop":null,"outer_start":34182300,"outer_stop":null,"variant_type":0,"vcv":"VCV000057627","reference_assertions":[{"rcv":"RCV000051361","title":"GRCh38/hg38 22q12.3(chr22:33754145-33786313)x1 AND See cases","clinical_significance":0,"review_status":3}]} diff --git a/tests/clinvar-sv/clinvar-sv-grch37.tsv.db/000016.sst b/tests/clinvar-sv/clinvar-sv-grch37.tsv.db/000016.sst index 503f94dc..06280635 100644 --- a/tests/clinvar-sv/clinvar-sv-grch37.tsv.db/000016.sst +++ b/tests/clinvar-sv/clinvar-sv-grch37.tsv.db/000016.sst @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:955b9d98e8efb84a51442b2eb21a479889d78afa32f220f3a5abb316a26fef48 +oid sha256:82811cf371ff89263417aabae17538ed847fc8fe2861e3310193b8676516b86d size 1343 diff --git a/tests/clinvar-sv/clinvar-sv-grch37.tsv.db/000018.sst b/tests/clinvar-sv/clinvar-sv-grch37.tsv.db/000018.sst index 4a739f46..0e87fd7d 100644 --- a/tests/clinvar-sv/clinvar-sv-grch37.tsv.db/000018.sst +++ b/tests/clinvar-sv/clinvar-sv-grch37.tsv.db/000018.sst @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e6a2d493d8417536a7876861cf5727eedca7bdd8ed416be0f23082a308e9b88b -size 3611 +oid sha256:a6157d921a9988e42bb8b8ef3ccbc4e5fea921716312bb906cf40fad43e476dd +size 2264 diff --git a/tests/clinvar-sv/clinvar-sv-grch37.tsv.db/000020.sst b/tests/clinvar-sv/clinvar-sv-grch37.tsv.db/000020.sst index db3799e5..07c9df5d 100644 --- a/tests/clinvar-sv/clinvar-sv-grch37.tsv.db/000020.sst +++ b/tests/clinvar-sv/clinvar-sv-grch37.tsv.db/000020.sst @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2c7ef4ce2213b1e661dfec367a5518b166c9e3a43c4243b4e08a936e2508284d -size 1523 +oid sha256:6ee3cee6848cf52e72358a14c69dfbe0716b9555bc4e68f358f3b5597fc4ca6c +size 1426 diff --git a/tests/clinvar-sv/clinvar-sv-grch37.tsv.db/IDENTITY b/tests/clinvar-sv/clinvar-sv-grch37.tsv.db/IDENTITY index a73953de..086bf3c2 100644 --- a/tests/clinvar-sv/clinvar-sv-grch37.tsv.db/IDENTITY +++ b/tests/clinvar-sv/clinvar-sv-grch37.tsv.db/IDENTITY @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:12d727116c14ee392e21edfc15a73a9e85863b8011302b8d958aa1e8dec1649a +oid sha256:449dc89920196e68ef164b81aeabe23460356f7530e2d810cee0acc152d4c028 size 36 diff --git a/tests/clinvar-sv/clinvar-sv-grch37.tsv.db/LOG b/tests/clinvar-sv/clinvar-sv-grch37.tsv.db/LOG index 1327c720..ba181fd8 100644 --- a/tests/clinvar-sv/clinvar-sv-grch37.tsv.db/LOG +++ b/tests/clinvar-sv/clinvar-sv-grch37.tsv.db/LOG @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e56f0334c668df1396f3d090b4982540650959846d8daeb69827cebc9ddaa1c3 -size 81675 +oid sha256:4d87b6f636f3f550628898a8a5fe0a2aed5daa8c13f31785cc5aeb357ee6846f +size 82242 diff --git a/tests/clinvar-sv/clinvar-sv-grch37.tsv.db/MANIFEST-000005 b/tests/clinvar-sv/clinvar-sv-grch37.tsv.db/MANIFEST-000005 index 430f45fd..c9257c74 100644 --- a/tests/clinvar-sv/clinvar-sv-grch37.tsv.db/MANIFEST-000005 +++ b/tests/clinvar-sv/clinvar-sv-grch37.tsv.db/MANIFEST-000005 @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0e4cbc23e793f4feb52bbf7f60f68825faa7e9248c7f6689738e02f22236ba6a +oid sha256:361fdbab758631233e7a71147e1a2b68d4046c7dff6c9ac7a4795424b8a146aa size 982