From f82494198c26f7b18d21bf69a62c4a4b3db89b24 Mon Sep 17 00:00:00 2001 From: Manuel Holtgrewe Date: Wed, 31 May 2023 16:08:32 +0200 Subject: [PATCH] feat: storing TSV lines as string to reduce storage size (#57) --- src/tsv/cli/import/mod.rs | 6 ++---- src/tsv/cli/query.rs | 10 ++++++---- tests/tsv/example/data.tsv.gz.db/000014.sst | 4 ++-- tests/tsv/example/data.tsv.gz.db/000016.sst | 4 ++-- tests/tsv/example/data.tsv.gz.db/IDENTITY | 2 +- tests/tsv/example/data.tsv.gz.db/LOG | 4 ++-- tests/tsv/example/data.tsv.gz.db/MANIFEST-000005 | 4 ++-- tests/tsv/example/data.tsv.gz.db/OPTIONS-000009 | 4 ++-- tests/tsv/example/data.tsv.gz.db/OPTIONS-000011 | 4 ++-- 9 files changed, 21 insertions(+), 21 deletions(-) diff --git a/src/tsv/cli/import/mod.rs b/src/tsv/cli/import/mod.rs index 72946c5c..6c0cb91d 100644 --- a/src/tsv/cli/import/mod.rs +++ b/src/tsv/cli/import/mod.rs @@ -79,23 +79,21 @@ pub fn process_tsv_line( db: &rocksdb::DBWithThreadMode, cf_data: &std::sync::Arc, ) -> Result<(), anyhow::Error> { - let line = line; let values = ctx.line_to_values(line)?; let values = values.iter().collect::>(); let var = ctx.values_to_var(&values)?; if let Some(var) = var.as_ref() { let key: Vec = var.clone().into(); - let value = ctx.encode_values(&values)?; tracing::trace!( "putting for var = {:?}, key = {:?}, value = {:?}", &var, &key, - &value + &line.as_bytes() ); - db.put_cf(cf_data, key, value)?; + db.put_cf(cf_data, key, line.as_bytes())?; } else { tracing::trace!("skipping line: {:?}", &line); } diff --git a/src/tsv/cli/query.rs b/src/tsv/cli/query.rs index 336ac658..27eb16cd 100644 --- a/src/tsv/cli/query.rs +++ b/src/tsv/cli/query.rs @@ -154,7 +154,8 @@ fn query_for_variant( let raw_value = db .get_cf(&cf_data, key)? .ok_or_else(|| anyhow::anyhow!("could not find variant in database"))?; - let values = ctx.decode_values(&raw_value)?; + let line = std::str::from_utf8(raw_value.as_slice())?; + let values = ctx.line_to_values(line)?; Ok(values) } @@ -229,8 +230,8 @@ pub fn run(common: &common::cli::Args, args: &Args) -> Result<(), anyhow::Error> // Iterate over all variants until we are behind stop. while iter.valid() { - if let Some(value) = iter.value() { - tracing::trace!("iterator at {:?} => {:?}", &iter.key(), &value); + if let Some(line_raw) = iter.value() { + tracing::trace!("iterator at {:?} => {:?}", &iter.key(), &line_raw); if let Some(stop) = stop.as_ref() { let iter_key = iter.key().unwrap(); let iter_pos: keys::Pos = iter_key.into(); @@ -240,7 +241,8 @@ pub fn run(common: &common::cli::Args, args: &Args) -> Result<(), anyhow::Error> } } - let values = ctx.decode_values(value)?; + let line = std::str::from_utf8(line_raw)?; + let values = ctx.line_to_values(line)?; print_values(&mut out_writer, args.out_format, &meta, values)?; iter.next(); } else { diff --git a/tests/tsv/example/data.tsv.gz.db/000014.sst b/tests/tsv/example/data.tsv.gz.db/000014.sst index f5d2fafb..0eab9581 100644 --- a/tests/tsv/example/data.tsv.gz.db/000014.sst +++ b/tests/tsv/example/data.tsv.gz.db/000014.sst @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:921f0cba893131f55458962b3e5c1d63bd6310bf13e7a5278ef7e29b1f0b9876 -size 1491 +oid sha256:68a63d77b7cc5ac1be7c681654faa4dcd6fd6d6982d7aa606ad639e0f53bc021 +size 1507 diff --git a/tests/tsv/example/data.tsv.gz.db/000016.sst b/tests/tsv/example/data.tsv.gz.db/000016.sst index 65d02afc..156a2076 100644 --- a/tests/tsv/example/data.tsv.gz.db/000016.sst +++ b/tests/tsv/example/data.tsv.gz.db/000016.sst @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6b0f6c71619ac026fc6e08943c886f2893ef91063c2866ca1f97f5cc5c7a5f1a -size 1213 +oid sha256:e04e0725aa79e411bb109ee9b0f6f139c27ec1b1e91e5c38aa48d3060a619c51 +size 1211 diff --git a/tests/tsv/example/data.tsv.gz.db/IDENTITY b/tests/tsv/example/data.tsv.gz.db/IDENTITY index 20c4ca66..984dab87 100644 --- a/tests/tsv/example/data.tsv.gz.db/IDENTITY +++ b/tests/tsv/example/data.tsv.gz.db/IDENTITY @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:80dc4cfd38f3b671e57311a136418eaa9efd36c0d736d421693dd8f1df335fe7 +oid sha256:a8827ed095e73220604adece26d77d3ac32ba75dfe3936b16847b401dd642557 size 36 diff --git a/tests/tsv/example/data.tsv.gz.db/LOG b/tests/tsv/example/data.tsv.gz.db/LOG index 6e376cd0..c63c33e1 100644 --- a/tests/tsv/example/data.tsv.gz.db/LOG +++ b/tests/tsv/example/data.tsv.gz.db/LOG @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5b41c8c796e3d35d8146c5135edcdcd420ce504e35d26a73518f8d43c7b055d3 -size 61922 +oid sha256:d11532a911006a9994d5ee4a73519c687b581d44d74ddbb5afefcac9f4323954 +size 62197 diff --git a/tests/tsv/example/data.tsv.gz.db/MANIFEST-000005 b/tests/tsv/example/data.tsv.gz.db/MANIFEST-000005 index fb58f017..32ece643 100644 --- a/tests/tsv/example/data.tsv.gz.db/MANIFEST-000005 +++ b/tests/tsv/example/data.tsv.gz.db/MANIFEST-000005 @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5603615b42dec440148a740fec019aa1ce494f35dab305c0f22a5a04a7e5e3c4 -size 658 +oid sha256:a69597f6f5d25269527304e9510dfe7a834a7ddea64c907ce5bba630ae10c503 +size 660 diff --git a/tests/tsv/example/data.tsv.gz.db/OPTIONS-000009 b/tests/tsv/example/data.tsv.gz.db/OPTIONS-000009 index cc5acc38..980aac71 100644 --- a/tests/tsv/example/data.tsv.gz.db/OPTIONS-000009 +++ b/tests/tsv/example/data.tsv.gz.db/OPTIONS-000009 @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a4be9b1220e95144635b1e9799869b4b1ba1e4c7b9e89dd798b46c0d4db8982e -size 15443 +oid sha256:ee6dd4f8e2aa31e8d89ef081023ad364bb49140da2d490da687b3991aeb58c3f +size 15371 diff --git a/tests/tsv/example/data.tsv.gz.db/OPTIONS-000011 b/tests/tsv/example/data.tsv.gz.db/OPTIONS-000011 index cc5acc38..980aac71 100644 --- a/tests/tsv/example/data.tsv.gz.db/OPTIONS-000011 +++ b/tests/tsv/example/data.tsv.gz.db/OPTIONS-000011 @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a4be9b1220e95144635b1e9799869b4b1ba1e4c7b9e89dd798b46c0d4db8982e -size 15443 +oid sha256:ee6dd4f8e2aa31e8d89ef081023ad364bb49140da2d490da687b3991aeb58c3f +size 15371