From 9c386d7b97464204b5ec6067e9cab5c924ff8dbb Mon Sep 17 00:00:00 2001 From: Manuel Holtgrewe Date: Tue, 8 Oct 2024 12:29:54 +0200 Subject: [PATCH] fix: percent encoding clinvar annotation (#553) --- Cargo.lock | 1 + Cargo.toml | 11 ++++++----- src/annotate/seqvars/mod.rs | 29 ++++++++++++++++++++++++++++- 3 files changed, 35 insertions(+), 6 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 31909ed8..9a9d9259 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2653,6 +2653,7 @@ dependencies = [ "pbjson", "pbjson-build", "pbjson-types", + "percent-encoding", "pprof", "pretty_assertions", "procfs", diff --git a/Cargo.toml b/Cargo.toml index 58a8a388..0b06499f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -38,10 +38,12 @@ byte-unit = "5.1" chrono = "0.4" clap-verbosity-flag = "2.2" clap = { version = "4.5", features = ["derive"] } +coz = "0.1.3" csv = "1.3" derivative = "2.2" derive_builder = { version = "0.20", features = ["clippy"] } derive-new = "0.7.0" +dhat = "0.3.3" enumflags2 = { version = "0.7.10", features = ["serde"] } env_logger = "0.11" flate2 = "1.0" @@ -54,7 +56,11 @@ jsonl = "4.0" log = "0.4" nom = "7.1" nutype = { version = "0.5.0", features = ["serde"] } +once_cell = "1.20.1" parse-display = "0.10" +pbjson = "0.7" +pbjson-types = "0.7" +percent-encoding = "2.3" procfs = "0.16" prost = "0.13.3" quick_cache = "0.6.9" @@ -76,11 +82,6 @@ tracing-subscriber = "0.3" tracing = { version = "0.1", features = ["log"] } uuid = { version = "1.9", features = ["fast-rng", "serde"] } zstd = "0.13" -pbjson = "0.7" -pbjson-types = "0.7" -coz = "0.1.3" -dhat = "0.3.3" -once_cell = "1.20.1" [dependencies.noodles] version = "0.77.0" diff --git a/src/annotate/seqvars/mod.rs b/src/annotate/seqvars/mod.rs index 24aed22a..99bc9730 100644 --- a/src/annotate/seqvars/mod.rs +++ b/src/annotate/seqvars/mod.rs @@ -1615,6 +1615,28 @@ pub struct ClinvarAnnotator { db: DBWithThreadMode, } +/// Helper code for percent encoding of strings. +/// +/// cf. 
https://github.com/varfish-org/varfish-server-worker/issues/485 +mod vcf_encoding { + use percent_encoding::{utf8_percent_encode, AsciiSet, PercentEncode, CONTROLS}; + + // § 1.2 "Character encoding, non-printable characters and characters with special meaning" (2023-08-23) + const PERCENT_ENCODE_SET: &AsciiSet = &CONTROLS + .add(b':') + .add(b';') + .add(b'=') + .add(b'%') + .add(b',') + .add(b'\r') + .add(b'\n') + .add(b'\t'); + + pub(super) fn percent_encode(s: &str) -> PercentEncode<'_> { + utf8_percent_encode(s, PERCENT_ENCODE_SET) + } +} + impl ClinvarAnnotator { pub fn new(db: DBWithThreadMode) -> Self { Self { db } @@ -1674,7 +1696,12 @@ impl ClinvarAnnotator { Some(field::Value::Array(field::value::Array::String( clinvar_germline_classifications .into_iter() - .map(Some) + .map(|value| { + // Manually encode until the following is fixed. + // + // https://github.com/varfish-org/varfish-server-worker/issues/485 + Some(vcf_encoding::percent_encode(&value).to_string()) + }) .collect::>(), ))), );