Skip to content

Commit

Permalink
feat: port over assembly info from bioutils (#8)
Browse files Browse the repository at this point in the history
  • Loading branch information
holtgrewe committed Feb 17, 2023
1 parent 8e81536 commit daae525
Show file tree
Hide file tree
Showing 32 changed files with 198 additions and 0 deletions.
1 change: 1 addition & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
src/static_data/**/*.json* filter=lfs diff=lfs merge=lfs -text
5 changes: 5 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,5 +5,10 @@ edition = "2021"

[dependencies]
anyhow = "1.0.69"
enum-map = "2.4.2"
flate2 = "1.0.25"
lazy_static = "1.4.0"
nom = "7.1.3"
pretty_assertions = "1.3.0"
serde = { version = "1.0.152", features = ["derive"] }
serde_json = "1.0.93"
1 change: 1 addition & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
pub mod parser;
pub mod static_data;

pub fn add(left: usize, right: usize) -> usize {
left + right
Expand Down
3 changes: 3 additions & 0 deletions src/static_data/_data/CHM1_1.0.json.gz
Git LFS file not shown
3 changes: 3 additions & 0 deletions src/static_data/_data/CHM1_1.1.json.gz
Git LFS file not shown
3 changes: 3 additions & 0 deletions src/static_data/_data/GRCh37.json.gz
Git LFS file not shown
3 changes: 3 additions & 0 deletions src/static_data/_data/GRCh37.p10.json.gz
Git LFS file not shown
3 changes: 3 additions & 0 deletions src/static_data/_data/GRCh37.p11.json.gz
Git LFS file not shown
3 changes: 3 additions & 0 deletions src/static_data/_data/GRCh37.p12.json.gz
Git LFS file not shown
3 changes: 3 additions & 0 deletions src/static_data/_data/GRCh37.p13.json.gz
Git LFS file not shown
3 changes: 3 additions & 0 deletions src/static_data/_data/GRCh37.p2.json.gz
Git LFS file not shown
3 changes: 3 additions & 0 deletions src/static_data/_data/GRCh37.p5.json.gz
Git LFS file not shown
3 changes: 3 additions & 0 deletions src/static_data/_data/GRCh37.p9.json.gz
Git LFS file not shown
3 changes: 3 additions & 0 deletions src/static_data/_data/GRCh38.json.gz
Git LFS file not shown
3 changes: 3 additions & 0 deletions src/static_data/_data/GRCh38.p1.json.gz
Git LFS file not shown
3 changes: 3 additions & 0 deletions src/static_data/_data/GRCh38.p10.json.gz
Git LFS file not shown
3 changes: 3 additions & 0 deletions src/static_data/_data/GRCh38.p11.json.gz
Git LFS file not shown
3 changes: 3 additions & 0 deletions src/static_data/_data/GRCh38.p12.json.gz
Git LFS file not shown
3 changes: 3 additions & 0 deletions src/static_data/_data/GRCh38.p2.json.gz
Git LFS file not shown
3 changes: 3 additions & 0 deletions src/static_data/_data/GRCh38.p3.json.gz
Git LFS file not shown
3 changes: 3 additions & 0 deletions src/static_data/_data/GRCh38.p4.json.gz
Git LFS file not shown
3 changes: 3 additions & 0 deletions src/static_data/_data/GRCh38.p5.json.gz
Git LFS file not shown
3 changes: 3 additions & 0 deletions src/static_data/_data/GRCh38.p6.json.gz
Git LFS file not shown
3 changes: 3 additions & 0 deletions src/static_data/_data/GRCh38.p7.json.gz
Git LFS file not shown
3 changes: 3 additions & 0 deletions src/static_data/_data/GRCh38.p8.json.gz
Git LFS file not shown
3 changes: 3 additions & 0 deletions src/static_data/_data/GRCh38.p9.json.gz
Git LFS file not shown
36 changes: 36 additions & 0 deletions src/static_data/_data/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Download the assembly description files (gzip-compressed JSON) from the
# bioutils repository into this directory.
#
# Usage: `make` (or `make all`) fetches every file listed in ASSEMBLIES that
# is not present yet.

# Names of the assemblies to fetch; each maps to `<name>.json.gz`.
ASSEMBLIES := \
    CHM1_1.0 \
    CHM1_1.1 \
    GRCh37 \
    GRCh37.p10 \
    GRCh37.p11 \
    GRCh37.p12 \
    GRCh37.p13 \
    GRCh37.p2 \
    GRCh37.p5 \
    GRCh37.p9 \
    GRCh38 \
    GRCh38.p10 \
    GRCh38.p11 \
    GRCh38.p12 \
    GRCh38.p1 \
    GRCh38.p2 \
    GRCh38.p3 \
    GRCh38.p4 \
    GRCh38.p5 \
    GRCh38.p6 \
    GRCh38.p7 \
    GRCh38.p8 \
    GRCh38.p9 \
    NCBI33 \
    NCBI34 \
    NCBI35 \
    NCBI36

JSONS := $(addsuffix .json.gz,${ASSEMBLIES})

# `wget -O` creates the output file even when the download fails; remove such
# partial targets so that a later run retries instead of treating them as
# up to date.
.DELETE_ON_ERROR:

# `all` is a command name, not a file to be built.
.PHONY: all
all: $(JSONS)

# Fetch a single assembly file; `$@` is the target name, e.g. `GRCh37.json.gz`.
%.json.gz:
	wget -O $@ https://github.com/biocommons/bioutils/raw/main/src/bioutils/_data/assemblies/$@
3 changes: 3 additions & 0 deletions src/static_data/_data/NCBI33.json.gz
Git LFS file not shown
3 changes: 3 additions & 0 deletions src/static_data/_data/NCBI34.json.gz
Git LFS file not shown
3 changes: 3 additions & 0 deletions src/static_data/_data/NCBI35.json.gz
Git LFS file not shown
3 changes: 3 additions & 0 deletions src/static_data/_data/NCBI36.json.gz
Git LFS file not shown
74 changes: 74 additions & 0 deletions src/static_data/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
//! Static data.

use std::io::Read;

use enum_map::{enum_map, Enum, EnumMap};
use flate2::read::GzDecoder;
use serde::Deserialize;

// NOTE: the files below are tracked via Git LFS (see `.gitattributes`); they
// are embedded into the binary at compile time, so the LFS objects need to be
// checked out before building (otherwise whatever is on disk gets embedded).

/// Gzip-compressed JSON with the GRCh37 assembly info, embedded at compile time.
const GRCH37_JSON_GZ: &[u8] = include_bytes!("_data/GRCh37.json.gz");
/// Gzip-compressed JSON with the GRCh38 assembly info, embedded at compile time.
const GRCH38_JSON_GZ: &[u8] = include_bytes!("_data/GRCh38.json.gz");

/// Genome assemblies for which assembly info is embedded in this crate.
///
/// Used as the key type of `ASSEMBLY_INFOS`.  The type is `Copy` so that a
/// value can be used for indexing (which takes the key by value) more than
/// once, and comparable/hashable so it can be used in assertions and as a
/// key in standard collections.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Deserialize, Enum)]
pub enum Assembly {
    /// The GRCh37 assembly (backed by `_data/GRCh37.json.gz`).
    Grch37,
    /// The GRCh38 assembly (backed by `_data/GRCh38.json.gz`).
    Grch38,
}

impl Assembly {
    /// Deserialize assembly info from embedded compressed JSON.
    ///
    /// Selects the embedded payload for `self`, gunzips it into a string and
    /// parses that string as an `AssemblyInfo`.
    ///
    /// # Panics
    ///
    /// Panics if the embedded payload cannot be decompressed into UTF-8 text
    /// or if the JSON does not match the `AssemblyInfo` structure.  The data
    /// is compiled into the binary, so either case indicates a broken build
    /// input rather than a runtime condition callers could recover from.
    fn load_assembly_info(&self) -> AssemblyInfo {
        let payload = match self {
            Assembly::Grch37 => GRCH37_JSON_GZ,
            Assembly::Grch38 => GRCH38_JSON_GZ,
        };

        let mut decoder = GzDecoder::new(payload);
        let mut json = String::new();
        decoder.read_to_string(&mut json).unwrap_or_else(|e| {
            panic!(
                "embedded assembly data for {:?} is not gzip-compressed UTF-8: {}",
                self, e
            )
        });

        serde_json::from_str::<AssemblyInfo>(&json).unwrap_or_else(|e| {
            panic!(
                "embedded assembly data for {:?} is not valid assembly info JSON: {}",
                self, e
            )
        })
    }
}

/// One entry of the `sequences` list of an assembly info JSON document.
///
/// The field names are used verbatim as JSON keys (no serde renames are
/// configured) and every field is required when deserializing.
#[derive(Debug, Clone, PartialEq, Eq, Deserialize)]
pub struct Sequence {
    /// Alternative names of the sequence.
    pub aliases: Vec<String>,
    /// Value of the `assembly_unit` key.
    pub assembly_unit: String,
    /// Value of the `genbank_ac` key (presumably the GenBank accession of
    /// the sequence -- confirm against the data).
    pub genbank_ac: String,
    /// Length of the sequence (presumably in bases -- confirm against the
    /// data).
    pub length: usize,
    /// Name of the sequence.
    pub name: String,
    /// Value of the `refseq_ac` key (presumably the RefSeq accession of the
    /// sequence -- confirm against the data).
    pub refseq_ac: String,
    /// Value of the `relationship` key.
    pub relationship: String,
    /// Value of the `sequence_role` key.
    pub sequence_role: String,
}

/// Top-level structure of an assembly info JSON document.
///
/// The field names are used verbatim as JSON keys (no serde renames are
/// configured) and every field is required when deserializing.
#[derive(Debug, Clone, PartialEq, Eq, Deserialize)]
pub struct AssemblyInfo {
    /// Value of the `date` key, kept as the raw string from the JSON.
    pub date: String,
    /// Free-text description of the assembly.
    pub description: String,
    /// Value of the `genbank_ac` key (presumably the GenBank accession of
    /// the assembly -- confirm against the data).
    pub genbank_ac: String,
    /// Name of the assembly.
    pub name: String,
    /// Value of the `refseq_ac` key (presumably the RefSeq accession of the
    /// assembly -- confirm against the data).
    pub refseq_ac: String,
    /// The sequences that are part of the assembly.
    pub sequences: Vec<Sequence>,
    /// Value of the `submitter` key.
    pub submitter: String,
}

lazy_static::lazy_static! {
    /// Provide information about the assemblies.
    ///
    /// The map is built on first access by decompressing and parsing the
    /// embedded JSON of every `Assembly` variant.
    pub static ref ASSEMBLY_INFOS: EnumMap<Assembly, AssemblyInfo> = enum_map! {
        // The keys are `match` patterns and must be fully qualified: a bare
        // `Grch37` is not in scope as a variant here and would be a catch-all
        // binding, mapping *every* assembly to the GRCh37 data.
        Assembly::Grch37 => Assembly::Grch37.load_assembly_info(),
        Assembly::Grch38 => Assembly::Grch38.load_assembly_info(),
    };
}

#[cfg(test)]
mod test {
    use pretty_assertions::assert_eq;

    use crate::static_data::{Assembly, ASSEMBLY_INFOS};

    /// Each assembly must resolve to its own data set; the two assemblies
    /// differ in their number of sequences, which catches mix-ups of the keys.
    #[test]
    fn smoke() {
        assert_eq!(ASSEMBLY_INFOS[Assembly::Grch37].sequences.len(), 92);
        assert_eq!(ASSEMBLY_INFOS[Assembly::Grch38].sequences.len(), 455);
    }
}

0 comments on commit daae525

Please sign in to comment.