diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 00000000..811b0394 --- /dev/null +++ b/.gitattributes @@ -0,0 +1 @@ +tests/data/db/create/txs/latest/** filter=lfs diff=lfs merge=lfs -text diff --git a/.github/actions/install-flatbuffers/action.yml b/.github/actions/install-flatbuffers/action.yml new file mode 100644 index 00000000..ccfba173 --- /dev/null +++ b/.github/actions/install-flatbuffers/action.yml @@ -0,0 +1,38 @@
+name: install-flatbuffers
+description: Install flatbuffers
+
+runs:
+  using: "composite"
+  steps:
+    # Restore a previous installation, if any.  The pinned flatbuffers tag is
+    # part of the key so that bumping the tag below cannot reuse a stale `flatc`.
+    - name: Cache flatbuffers installation
+      id: cache-flatbuffers-installation
+      uses: actions/cache@v3
+      env:
+        cache-name: cache-install-flatbuffers
+      with:
+        path: ~/.local/share/flatbuffers
+        key: ${{ runner.os }}-build-${{ env.cache-name }}-v22.12.06
+        restore-keys: |
+          ${{ runner.os }}-build-
+          ${{ runner.os }}-
+
+    # Build and install from source unless the cache step reported an exact hit.
+    - if: ${{ steps.cache-flatbuffers-installation.outputs.cache-hit != 'true' }}
+      name: Install flatbuffers
+      shell: bash
+      run: |
+        mkdir -p utils/var
+        cd utils/var
+        # Keep this tag in sync with the cache key above.
+        git clone --depth 1 --branch v22.12.06 https://github.com/google/flatbuffers.git
+        cd flatbuffers
+        cmake -G "Unix Makefiles" -DCMAKE_INSTALL_PREFIX=$HOME/.local/share/flatbuffers
+        make
+        ./flattests
+        sudo make install
+        export PATH=$PATH:$HOME/.local/share/flatbuffers/bin
+        flatc --version
+
+    # `export PATH` above only lasts for that step; register the directory for
+    # all following steps of the calling job.
+    - name: Make flatc available in PATH
+      shell: bash
+      run: |
+        echo "$HOME/.local/share/flatbuffers/bin" >> "$GITHUB_PATH"
diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 0b6a2ffd..250b8ab2 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -22,8 +22,13 @@ jobs: override: true components: rustfmt + - name: Install flatbuffers + uses: ./.github/actions/install-flatbuffers + - name: Check format run: | + flatc -o target/flatbuffers --rust src/world.fbs + rustfmt target/flatbuffers/world_generated.rs cargo fmt -- --check Linting: @@ -41,6 +46,9 @@ jobs: override: true components: clippy + - name: Install flatbuffers +
uses: ./.github/actions/install-flatbuffers + # - uses: Swatinem/rust-cache@v1.3.0 # Enable caching of the 'librocksdb-sys' crate by additionally caching the # 'librocksdb-sys' src directory which is managed by cargo @@ -86,6 +94,9 @@ jobs: toolchain: stable override: true + - name: Install flatbuffers + uses: ./.github/actions/install-flatbuffers + # - uses: Swatinem/rust-cache@v1.3.0 # Enable caching of the 'librocksdb-sys' crate by additionally caching the # 'librocksdb-sys' src directory which is managed by cargo diff --git a/.gitignore b/.gitignore index 4cbda408..89a3338a 100644 --- a/.gitignore +++ b/.gitignore @@ -7,9 +7,6 @@ *.lock +## Flatbuffers -# Added by cargo -# -# already existing elements were commented out - -#/target +utils/var diff --git a/Cargo.toml b/Cargo.toml index 5d9bb2de..c4de0533 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,18 +14,29 @@ name = "mehari" [dependencies] anyhow = "1.0.69" byteorder = "1.4.3" +byte-unit = "4.0.18" clap = { version = "4.1.8", features = ["derive"] } clap-verbosity-flag = "2.0.0" csv = "1.2.0" -hgvs = "0.2.0" +flatbuffers = "23.1.21" +flate2 = "1.0.25" +hgvs = "0.3.1" lazy_static = "1.4.0" log = "0.4.17" noodles = { version = "0.33.0", features = ["vcf", "bcf", "csi", "fasta", "bgzf", "tabix"] } noodles-util = { version = "0.5.0", features = ["noodles-bcf", "noodles-bgzf", "noodles-vcf", "variant"] } +procfs = "0.15.1" rocksdb = "0.20.1" +seqrepo = "0.2.3" serde = { version = "1.0.152", features = ["derive"] } +serde_json = "1.0.94" tracing = { version = "0.1.37", features = ["log"] } tracing-subscriber = "0.3.16" +indicatif = "0.17.3" +thousands = "0.2.0" + +[build-dependencies] +flatc-rust = "0.2.0" [dev-dependencies] pretty_assertions = "1.3.0" diff --git a/README.md b/README.md index c759f33a..833aa0c3 100644 --- a/README.md +++ b/README.md @@ -113,3 +113,57 @@ prepare \ $base/GRCh38/gnomAD_genomes/r3.1.1/download/gnomad.genomes.r3.1.1.sites.chrY.vcf.bgz \ 
tests/data/db/create/seqvar_freqs/xy-38/gnomad.genomes.r3.1.1.sites.chrY.vcf ```
+
+## Building tx database
+
+
+```
+cd hgvs-rs-data
+
+seqrepo --root-directory seqrepo-data/master init
+
+mkdir -p mirror/ftp.ncbi.nih.gov/refseq/H_sapiens/mRNA_Prot
+cd !$
+wget https://ftp.ncbi.nih.gov/refseq/H_sapiens/mRNA_Prot/human.files.installed
+parallel -j 16 'wget https://ftp.ncbi.nih.gov/refseq/H_sapiens/mRNA_Prot/{}' ::: $(cut -f 2 human.files.installed | grep fna)
+cd -
+
+mkdir -p mirror/ftp.ensembl.org/pub/release-108/fasta/homo_sapiens/cdna
+cd !$
+wget https://ftp.ensembl.org/pub/release-108/fasta/homo_sapiens/cdna/Homo_sapiens.GRCh38.cdna.all.fa.gz
+cd -
+mkdir -p mirror/ftp.ensembl.org/pub/release-108/fasta/homo_sapiens/ncrna
+cd !$
+wget https://ftp.ensembl.org/pub/release-108/fasta/homo_sapiens/ncrna/Homo_sapiens.GRCh38.ncrna.fa.gz
+cd -
+mkdir -p mirror/ftp.ensembl.org/pub/grch37/release-108/fasta/homo_sapiens/cdna/
+cd !$
+wget https://ftp.ensembl.org/pub/grch37/release-108/fasta/homo_sapiens/cdna/Homo_sapiens.GRCh37.cdna.all.fa.gz
+cd -
+mkdir -p mirror/ftp.ensembl.org/pub/grch37/release-108/fasta/homo_sapiens/ncrna/
+cd !$
+wget https://ftp.ensembl.org/pub/grch37/release-108/fasta/homo_sapiens/ncrna/Homo_sapiens.GRCh37.ncrna.fa.gz
+cd -
+
+seqrepo --root-directory seqrepo-data/master load -n NCBI $(find mirror/ftp.ncbi.nih.gov -name '*.fna.gz' | sort)
+seqrepo --root-directory seqrepo-data/master load -n ENSEMBL $(find mirror/ftp.ensembl.org -name '*.fa.gz' | sort)
+
+cd ../mehari
+
+cargo run --release -- \
+    -v \
+    db create txs \
+    --path-out /tmp/txs-out.bin \
+    --path-cdot-json ../cdot-0.2.12.ensembl.grch37_grch38.json.gz \
+    --path-cdot-json ../cdot-0.2.12.refseq.grch37_grch38.json.gz \
+    --path-seqrepo-instance ../hgvs-rs-data/seqrepo-data/master/master
+```
+
+## Development Setup
+
+You will need a recent version of flatbuffers, e.g.:
+
+```
+# bash utils/install-flatbuffers.sh
+# export PATH=$PATH:$HOME/.local/share/flatbuffers/bin
+```
diff --git
a/build.rs b/build.rs new file mode 100644 index 00000000..d7bed186 --- /dev/null +++ b/build.rs @@ -0,0 +1,13 @@
+// The custom build script, needed as we use flatbuffers.
+
+use std::path::Path;
+
+// Runs `flatc` (through the `flatc_rust` crate) on `src/world.fbs` and writes the
+// generated code to `target/flatbuffers/`; panics with the message "flatc" on failure.
+fn main() {
+    // Tell cargo that this script depends on the schema file.
+    println!("cargo:rerun-if-changed=src/world.fbs");
+    flatc_rust::run(flatc_rust::Args {
+        inputs: &[Path::new("src/world.fbs")],
+        out_dir: Path::new("target/flatbuffers/"),
+        ..Default::default()
+    })
+    .expect("flatc");
+}
diff --git a/src/common.rs b/src/common.rs index 9c5ffa9c..ae5736ec 100644 --- a/src/common.rs +++ b/src/common.rs @@ -1,5 +1,6 @@ //! Commonly used code. +use byte_unit::Byte; use clap::Parser; use clap_verbosity_flag::{InfoLevel, Verbosity}; @@ -10,3 +11,13 @@ pub struct Args { #[clap(flatten)] pub verbose: Verbosity, }
+
+/// Helper to print the current memory resident set size via `tracing`.
+///
+/// The value is computed as the `rss` field of this process' `procfs` stat record
+/// multiplied by `procfs::page_size()` and emitted at debug level.
+///
+/// This is a diagnostics-only helper, so it never panics: if the process information
+/// cannot be read, a debug message saying so is emitted instead.
+pub fn trace_rss_now() {
+    let rss_pages = procfs::process::Process::myself()
+        .and_then(|me| me.stat())
+        .map(|stat| stat.rss);
+    match rss_pages {
+        Ok(pages) => tracing::debug!(
+            "RSS now: {}",
+            Byte::from_bytes((pages * procfs::page_size()) as u128).get_appropriate_unit(true)
+        ),
+        Err(e) => tracing::debug!("RSS now: unavailable ({})", e),
+    }
+}
diff --git a/src/db/create/txs.rs b/src/db/create/txs.rs index 69a8d570..d2d6c605 100644 --- a/src/db/create/txs.rs +++ b/src/db/create/txs.rs @@ -1,13 +1,684 @@ //! Transcript database.
+use std::collections::HashSet; +use std::path::Path; +use std::{collections::HashMap, fs::File, io::Write, path::PathBuf, time::Instant}; + use clap::Parser; +use flatbuffers::FlatBufferBuilder; +use hgvs::data::cdot::json::models::{self, BioType}; +use indicatif::{ProgressBar, ProgressStyle}; +use seqrepo::{AliasOrSeqId, Interface, SeqRepo}; +use thousands::Separable; + +use crate::common::trace_rss_now; +use crate::world_flatbuffers::mehari::{ + ExonAlignment, ExonAlignmentArgs, GeneToTxId, GeneToTxIdArgs, GenomeAlignment, + GenomeAlignmentArgs, GenomeBuild, SequenceDb, SequenceDbArgs, Strand, Transcript, + TranscriptArgs, TranscriptBiotype, TranscriptDb, TranscriptDbArgs, TranscriptTag, + TxSeqDatabase, TxSeqDatabaseArgs, +}; + +lazy_static::lazy_static! { + /// Progress bar style to use. + pub static ref PROGRESS_STYLE: ProgressStyle = ProgressStyle::with_template( + "[{elapsed_precise}] [{wide_bar:.cyan/blue}] {human_pos}/{human_len} ({eta})", + ) + .unwrap(); +} /// Command line arguments for `db create txs` sub command. #[derive(Parser, Debug)] -#[command(about = "Construct mehari transcripts database", long_about = None)] -pub struct Args {} +#[command(about = "Construct mehari transcripts and sequence database", long_about = None)] +pub struct Args { + /// Path to output flatbuffers file to write to. + #[arg(long)] + pub path_out: PathBuf, + /// Paths to the cdot JSON transcripts to import. + #[arg(long, required = true)] + pub path_cdot_json: Vec, + /// Path to the seqrepo instance directory to use. + #[arg(long)] + pub path_seqrepo_instance: PathBuf, +} + +/// Load and extract from cdot JSON. +fn load_and_extract( + json_path: &Path, + transcript_ids_for_gene: &mut HashMap>, + genes: &mut HashMap, + transcripts: &mut HashMap, +) -> Result<(), anyhow::Error> { + tracing::info!("Loading cdot transcripts from {:?}", json_path); + let start = Instant::now(); + let models::Container { + genes: c_genes, + transcripts: c_txs, + .. 
+ } = if json_path.ends_with(".gz") { + serde_json::from_reader(flate2::bufread::GzDecoder::new(std::io::BufReader::new( + std::fs::File::open(json_path)?, + )))? + } else { + serde_json::from_reader(std::io::BufReader::new(std::fs::File::open(json_path)?))? + }; + tracing::info!( + "loading / deserializing {} genes and {} transcripts from cdot took {:?}", + c_genes.len().separate_with_commas(), + c_txs.len().separate_with_commas(), + start.elapsed() + ); + + let start = Instant::now(); + c_genes + .values() + .filter(|gene| { + gene.gene_symbol.is_some() + && !gene.gene_symbol.as_ref().unwrap().is_empty() + && gene.map_location.is_some() + && !gene.map_location.as_ref().unwrap().is_empty() + && gene.hgnc.is_some() + && !gene.hgnc.as_ref().unwrap().is_empty() + }) + .for_each(|gene| { + let gene_symbol = gene.gene_symbol.as_ref().unwrap().clone(); + transcript_ids_for_gene + .entry(gene_symbol.clone()) + .or_insert(Vec::new()); + genes.insert(gene_symbol, gene.clone()); + }); + tracing::info!( + "Processed {} genes; total gene count: {}", + c_genes.len().separate_with_commas(), + genes.len().separate_with_commas() + ); + tracing::debug!( + "some 10 genes: {:?}", + genes.keys().take(10).collect::>() + ); + + tracing::info!("Processing transcripts"); + c_txs + .values() + .filter(|tx| { + tx.gene_name.is_some() + && !tx.gene_name.as_ref().unwrap().is_empty() + && genes.contains_key(tx.gene_name.as_ref().unwrap()) + }) + .for_each(|tx| { + let gene_name = tx.gene_name.as_ref().unwrap(); + transcript_ids_for_gene + .get_mut(gene_name) + .unwrap_or_else(|| panic!("tx {:?} for unknown gene {:?}", tx.id, gene_name)) + .push(tx.id.clone()); + transcripts.insert(tx.id.clone(), tx.clone()); + }); + tracing::info!( + "Processed {} genes; total transcript count: {}", + c_txs.len().separate_with_commas(), + transcripts.len().separate_with_commas() + ); + tracing::info!("extracting datastructures took {:?}", start.elapsed()); + Ok(()) +} + +/// Perform flatbuffers file 
construction. +fn build_flatbuffers( + path_out: &Path, + seqrepo: SeqRepo, + tx_data: TranscriptData, + is_silent: bool, +) -> Result<(), anyhow::Error> { + let TranscriptData { + genes, + transcripts, + transcript_ids_for_gene, + } = tx_data; + + tracing::info!("Constructing flatbuffers file ..."); + trace_rss_now(); + let mut builder = FlatBufferBuilder::new(); + + // Construct sequence database. + tracing::info!(" Constructing sequence database ..."); + let mut tx_skipped_noseq = HashSet::new(); // skipped because of missing sequence + let seq_db = { + // Insert into flatbuffer and keep track of pointers in `Vec`s. + let mut aliases = Vec::new(); + let mut aliases_idx = Vec::new(); + let mut seqs = Vec::new(); + let pb = if is_silent { + ProgressBar::hidden() + } else { + ProgressBar::new(transcripts.len() as u64) + }; + pb.set_style(PROGRESS_STYLE.clone()); + for tx_id in transcripts.keys() { + pb.inc(1); + let res_seq = seqrepo.fetch_sequence(&AliasOrSeqId::Alias { + value: tx_id.clone(), + namespace: None, + }); + let seq = if let Ok(seq) = res_seq { + seq + } else { + tx_skipped_noseq.insert(tx_id.clone()); + continue; + }; + + aliases.push(builder.create_shared_string(tx_id.as_str())); + aliases_idx.push(seqs.len() as u32); + let tx_seq = builder.create_shared_string(&seq); + seqs.push(tx_seq); + } + pb.finish_and_clear(); + // Convert these `Vec`s to flatbuffer vectors. + let aliases = builder.create_vector(aliases.as_slice()); + let aliases_idx = builder.create_vector(aliases_idx.as_slice()); + let seqs = builder.create_vector(seqs.as_slice()); + // Finalize by creating `SequenceDb`. + SequenceDb::create( + &mut builder, + &SequenceDbArgs { + aliases: Some(aliases), + aliases_idx: Some(aliases_idx), + seqs: Some(seqs), + }, + ) + }; + tracing::info!( + " ... 
done constructing sequence database (no seq for {} transcripts, will be skipped)", + tx_skipped_noseq.len().separate_with_commas() + ); + + trace_rss_now(); + + tracing::info!(" Creating transcript records for each gene..."); + let flat_txs = { + let gene_symbols = { + let mut gene_symbols: Vec<_> = genes.keys().cloned().collect(); + gene_symbols.sort(); + gene_symbols + }; + let mut flat_txs = Vec::new(); + // For each gene (in lexicographic symbol order) ... + for gene_symbol in &gene_symbols { + let gene = genes.get(gene_symbol).unwrap(); + let tx_ids = transcript_ids_for_gene + .get(gene_symbol.as_str()) + .unwrap_or_else(|| panic!("No transcripts for gene {:?}", &gene_symbol)); + let tx_ids = tx_ids + .iter() + .filter(|tx_id| !tx_skipped_noseq.contains(*tx_id)) + .collect::>(); + if tx_ids.is_empty() { + tracing::debug!( + "Skipping gene {} as all transcripts have been removed.", + gene_symbol + ); + continue; + } + + // ... for each transcript of the gene ... + for tx_id in tx_ids { + let tx_model = transcripts + .get(tx_id) + .unwrap_or_else(|| panic!("No transcript model for id {:?}", tx_id)); + // ... 
build genome alignment for each genome release: + let mut genome_alignments = Vec::new(); + for (genome_build, alignment) in &tx_model.genome_builds { + // obtain basic properties + let genome_build = match genome_build.as_ref() { + "GRCh37" => GenomeBuild::Grch37, + "GRCh38" => GenomeBuild::Grch38, + _ => panic!("Unknown genome build {:?}", genome_build), + }; + let contig = Some(builder.create_shared_string(&alignment.contig)); + let cds_start = alignment.cds_start.unwrap_or(-1); + let cds_end = alignment.cds_end.unwrap_or(-1); + let strand = match alignment.strand { + models::Strand::Plus => Strand::Plus, + models::Strand::Minus => Strand::Minus, + }; + // and construct vector of all exons + let exons: Vec<_> = alignment + .exons + .iter() + .map(|exon| { + let cigar = Some(builder.create_shared_string(&exon.cigar)); + ExonAlignment::create( + &mut builder, + &ExonAlignmentArgs { + alt_start_i: exon.alt_start_i, + alt_end_i: exon.alt_end_i, + ord: exon.ord, + alt_cds_start_i: exon.alt_cds_start_i, + alt_cds_end_i: exon.alt_end_i, + cigar, + }, + ) + }) + .collect(); + let exons = Some(builder.create_vector(exons.as_slice())); + // and finally push the genome alignment + genome_alignments.push(GenomeAlignment::create( + &mut builder, + &GenomeAlignmentArgs { + genome_build, + contig, + cds_start, + cds_end, + strand, + exons, + }, + )); + } + + // Now, just obtain the basic properties and create a new transcript using the + // flatbuffers builder. 
+ let id = Some(builder.create_shared_string(tx_id)); + let gene_name = + Some(builder.create_shared_string(gene.gene_symbol.as_ref().unwrap())); + let gene_id = Some(builder.create_shared_string(gene.hgnc.as_ref().unwrap())); + let biotype = if gene + .biotype + .as_ref() + .unwrap() + .contains(&BioType::ProteinCoding) + { + TranscriptBiotype::Coding + } else { + TranscriptBiotype::NonCoding + }; + let mut tags = 0u8; + if let Some(tag) = tx_model.tag.as_ref() { + for t in tag { + tags |= match t { + models::Tag::Basic => TranscriptTag::Basic.0 as u8, + models::Tag::EnsemblCanonical => { + TranscriptTag::EnsemblCanonical.0 as u8 + } + models::Tag::ManeSelect => TranscriptTag::ManeSelect.0 as u8, + models::Tag::ManePlusClinical => { + TranscriptTag::ManePlusClinical.0 as u8 + } + models::Tag::RefSeqSelect => TranscriptTag::RefSeqSelect.0 as u8, + } + } + } + let protein = tx_model + .protein + .as_ref() + .map(|protein| builder.create_shared_string(protein)); + let start_codon = tx_model.start_codon.unwrap_or(-1); + let stop_codon = tx_model.stop_codon.unwrap_or(-1); + let genome_alignments = Some(builder.create_vector(genome_alignments.as_slice())); + + flat_txs.push(Transcript::create( + &mut builder, + &TranscriptArgs { + id, + gene_name, + gene_id, + biotype, + tags, + protein, + start_codon, + stop_codon, + genome_alignments, + }, + )); + } + } + + builder.create_vector(flat_txs.as_slice()) + }; + tracing::info!(" ... done creating transcripts"); + + trace_rss_now(); + + // Build mapping of gene HGNC symbol to transcript IDs. 
+ tracing::info!(" Build gene symbol to transcript ID mapping ..."); + let gene_to_tx = { + let items = transcript_ids_for_gene + .iter() + .map(|(gene_name, tx_ids)| { + let gene_name = Some(builder.create_shared_string(gene_name)); + let tx_ids = tx_ids + .iter() + .map(|s| builder.create_shared_string(s)) + .collect::>(); + let tx_ids = Some(builder.create_vector(&tx_ids)); + GeneToTxId::create(&mut builder, &GeneToTxIdArgs { gene_name, tx_ids }) + }) + .collect::>(); + builder.create_vector(&items) + }; + tracing::info!(" ... done building gene symbol to transcript ID mapping"); + + trace_rss_now(); + + // Compose transcript database from transcripts and gene to transcript mapping. + tracing::info!(" Composing transcript database ..."); + let tx_db = TranscriptDb::create( + &mut builder, + &TranscriptDbArgs { + transcripts: Some(flat_txs), + gene_to_tx: Some(gene_to_tx), + }, + ); + tracing::info!(" ... done composing transcript database"); + + trace_rss_now(); + + // Compose the final transcript and sequence database. + tracing::info!(" Constructing final tx and seq database ..."); + let tx_seq_db = TxSeqDatabase::create( + &mut builder, + &TxSeqDatabaseArgs { + tx_db: Some(tx_db), + seq_db: Some(seq_db), + }, + ); + tracing::info!(" ... done constructing final tx and seq database"); + + trace_rss_now(); + + // Write out the final transcript and sequence database. + tracing::info!(" Writing out final database ..."); + builder.finish_minimal(tx_seq_db); + let mut output_file = File::create(path_out)?; + output_file.write_all(builder.finished_data())?; + output_file.flush()?; + tracing::info!(" ... done writing out final database"); + + trace_rss_now(); + + tracing::info!("... done with constructing flatbuffers file"); + Ok(()) +} + +/// Data as loaded from cdot after processing. +struct TranscriptData { + pub genes: HashMap, + pub transcripts: HashMap, + pub transcript_ids_for_gene: HashMap>, +} + +/// Filter transcripts for gene. 
+/// +/// We employ the following rules: +/// +/// - Remove redundant transcripts with the same identifier and pick only the +/// transcripts that have the highest version number for one assembly. +/// - Do not pick any `XM_`/`XR_` (NCBI predicted only) transcripts. +/// - Do not pick any `NR_` transcripts when there are coding `NM_` transcripts. +fn filter_transcripts(tx_data: TranscriptData) -> TranscriptData { + tracing::info!("Filtering transcripts ..."); + let start = Instant::now(); + + let TranscriptData { + genes, + transcripts, + transcript_ids_for_gene, + } = tx_data; + + let mut chosen = HashSet::new(); + let transcript_ids_for_gene = transcript_ids_for_gene + .into_iter() + .map(|(gene_symbol, prev_tx_ids)| { + // Split off transcript versions from accessions and look for NM transcript. + let mut seen_nm = false; + let mut versioned: Vec<_> = prev_tx_ids + .iter() + .map(|tx_id| { + if tx_id.starts_with("NM_") { + seen_nm = true; + } + let s: Vec<_> = tx_id.split('.').collect(); + (s[0], s[1].parse::().expect("invalid version")) + }) + .collect(); + // Sort descendingly by version. + versioned.sort_by(|a, b| b.1.cmp(&a.1)); + + // Build `next_tx_ids`. 
+ let mut seen_ac = HashSet::new(); + let mut next_tx_ids = Vec::new(); + for (ac, version) in versioned { + let full_ac = format!("{}.{}", &ac, version); + let ac = ac.to_string(); + + let releases = transcripts + .get(&full_ac) + .map(|tx| tx.genome_builds.keys().cloned().collect::>()) + .unwrap_or_default(); + + for release in releases { + #[allow(clippy::if_same_then_else)] + if seen_ac.contains(&(ac.clone(), release.clone())) { + continue; // skip, already have later version + } else if ac.starts_with("NR_") && seen_nm { + continue; // skip NR transcript as we have NM one + } else if ac.starts_with('X') { + continue; // skip XR/XM transcript + } else { + next_tx_ids.push(full_ac.clone()); + seen_ac.insert((ac.clone(), release)); + } + } + } + + next_tx_ids.sort(); + next_tx_ids.dedup(); + chosen.extend(next_tx_ids.iter().cloned()); + + (gene_symbol, next_tx_ids) + }) + .filter(|(_, next_tx_ids)| !next_tx_ids.is_empty()) + .collect(); + + let transcripts: HashMap<_, _> = transcripts + .into_iter() + .filter(|(key, _)| chosen.contains(key)) + .collect(); + + let genes = genes + .into_iter() + .filter(|(key, _)| transcripts.contains_key(key)) + .collect(); + + tracing::info!("... done filtering transcripts in {:?}", start.elapsed()); + TranscriptData { + genes, + transcripts, + transcript_ids_for_gene, + } +} + +/// Create file-backed `SeqRepo`. +fn open_seqrepo(args: &Args) -> Result { + tracing::info!("Opening seqrepo..."); + let start = Instant::now(); + let seqrepo = PathBuf::from(&args.path_seqrepo_instance); + let path = seqrepo + .parent() + .ok_or(anyhow::anyhow!( + "Could not get parent from {:?}", + &args.path_seqrepo_instance + ))? + .to_str() + .unwrap() + .to_string(); + let instance = seqrepo + .file_name() + .ok_or(anyhow::anyhow!( + "Could not get basename from {:?}", + &args.path_seqrepo_instance + ))? + .to_str() + .unwrap() + .to_string(); + let seqrepo = SeqRepo::new(path, &instance)?; + tracing::info!("... 
seqrepo opened in {:?}", start.elapsed()); + Ok(seqrepo) +} + +/// Load the cdot JSON files. +fn load_cdot_files(args: &Args) -> Result { + tracing::info!("Loading cdot JSON files ..."); + let start = Instant::now(); + let mut genes = HashMap::new(); + let mut transcripts = HashMap::new(); + let mut transcript_ids_for_gene = HashMap::new(); + for json_path in &args.path_cdot_json { + load_and_extract( + json_path, + &mut transcript_ids_for_gene, + &mut genes, + &mut transcripts, + )?; + } + tracing::info!( + "... done loading cdot JSON files in {:?} -- #genes = {}, #transcripts = {}, #transcript_ids_for_gene = {}", + start.elapsed(), + genes.len().separate_with_commas(), + transcripts.len().separate_with_commas(), + transcript_ids_for_gene.len().separate_with_commas() + ); + + Ok(TranscriptData { + genes, + transcripts, + transcript_ids_for_gene, + }) +} /// Main entry point for `db create txs` sub command. -pub fn run(_common: &crate::common::Args, _args: &Args) -> Result<(), anyhow::Error> { - todo!() +pub fn run(common: &crate::common::Args, args: &Args) -> Result<(), anyhow::Error> { + tracing::info!( + "Building transcript and sequence database file\ncommon args: {:#?}\nargs: {:#?}", + common, + args + ); + + // Open seqrepo, + let seqrepo = open_seqrepo(args)?; + // then load cdot files, + let tx_data = load_cdot_files(args)?; + // then remove redundant onces, and + let tx_data = filter_transcripts(tx_data); + // finally build flatbuffers file. 
+    build_flatbuffers(&args.path_out, seqrepo, tx_data, common.verbose.is_silent())?;
+
+    tracing::info!("Done building transcript and sequence database file");
+    Ok(())
+}
+
+#[cfg(test)]
+pub mod test {
+    use std::collections::HashMap;
+    use std::path::{Path, PathBuf};
+
+    use clap_verbosity_flag::Verbosity;
+    use pretty_assertions::assert_eq;
+    use temp_testdir::TempDir;
+
+    use crate::common::Args as CommonArgs;
+    use crate::db::create::txs::TranscriptData;
+
+    use super::{filter_transcripts, load_and_extract, run, Args};
+
+    /// Loads the BRCA1 excerpt of the RefSeq cdot file and checks the transcript IDs
+    /// of the gene before and after `filter_transcripts`.
+    #[test]
+    fn filter_transcripts_brca1() -> Result<(), anyhow::Error> {
+        let mut genes = HashMap::new();
+        let mut transcripts = HashMap::new();
+        let mut transcript_ids_for_gene = HashMap::new();
+        load_and_extract(
+            Path::new("tests/data/db/create/txs/cdot-0.2.12.refseq.grch37_grch38.brca1.json"),
+            &mut transcript_ids_for_gene,
+            &mut genes,
+            &mut transcripts,
+        )?;
+
+        let tx_data = TranscriptData {
+            genes,
+            transcripts,
+            transcript_ids_for_gene,
+        };
+
+        // Before filtering: all versions, including `NR_` and `XM_` transcripts.
+        assert_eq!(
+            &tx_data
+                .transcript_ids_for_gene
+                .get("BRCA1")
+                .unwrap()
+                .iter()
+                .map(|s| s.as_str())
+                .collect::<Vec<_>>(),
+            &vec![
+                "NM_007294.3",
+                "NM_007294.4",
+                "NM_007297.3",
+                "NM_007297.4",
+                "NM_007298.3",
+                "NM_007299.3",
+                "NM_007299.4",
+                "NM_007300.3",
+                "NM_007300.4",
+                "NR_027676.1",
+                "NR_027676.2",
+                "XM_006722029.1",
+                "XM_006722030.1",
+                "XM_006722031.1",
+                "XM_006722032.1",
+                "XM_006722033.1",
+                "XM_006722034.1",
+                "XM_006722035.1",
+                "XM_006722036.1",
+                "XM_006722037.1",
+                "XM_006722038.1",
+                "XM_006722039.1",
+                "XM_006722040.1",
+                "XM_006722041.1"
+            ]
+        );
+        // After filtering: only the latest version of each `NM_` transcript.
+        let filtered = filter_transcripts(tx_data);
+        assert_eq!(
+            &filtered
+                .transcript_ids_for_gene
+                .get("BRCA1")
+                .unwrap()
+                .iter()
+                .map(|s| s.as_str())
+                .collect::<Vec<_>>(),
+            &vec![
+                "NM_007294.4",
+                "NM_007297.4",
+                "NM_007298.3",
+                "NM_007299.4",
+                "NM_007300.4"
+            ]
+        );
+        Ok(())
+    }
+
+    /// Smoke test: runs the whole sub command on the BRCA1 test data, writing into a
+    /// temporary directory, and only checks that it returns without error.
+    #[test]
+    fn run_smoke() -> Result<(), anyhow::Error> {
+        let tmp_dir = TempDir::default();
+
+        let common_args = CommonArgs {
+            verbose: Verbosity::new(0, 1),
+        };
+        let args = Args {
+            path_out: tmp_dir.join("out.bin"),
+            path_cdot_json: vec![PathBuf::from(
+                "tests/data/db/create/txs/cdot-0.2.12.refseq.grch37_grch38.brca1.json",
+            )],
+            path_seqrepo_instance: PathBuf::from("tests/data/db/create/txs/latest"),
+        };
+
+        run(&common_args, &args)?;
+
+        Ok(())
+    }
 }
diff --git a/src/main.rs b/src/main.rs index ba849816..a2e8a002 100644 --- a/src/main.rs +++ b/src/main.rs @@ -4,6 +4,17 @@ pub mod annotate; pub mod common; pub mod db; +#[allow( + non_snake_case, + unused_imports, + clippy::extra_unused_lifetimes, + clippy::missing_safety_doc, + clippy::derivable_impls, + clippy::size_of_in_element_count +)] +#[path = "../target/flatbuffers/world_generated.rs"] +pub mod world_flatbuffers; + use clap::{command, Args, Parser, Subcommand}; #[derive(Debug, Parser)] diff --git a/src/world.fbs b/src/world.fbs new file mode 100644 index 00000000..6211a26c --- /dev/null +++ b/src/world.fbs @@ -0,0 +1,121 @@ +namespace Mehari; + +// Stores long array of sequences with an "index" of sequence names to their +// index. +// +// The fields `aliases` and `aliases_idx` have the same length and `aliases_idx[i]` +// stores the index into `seqs` for the sequence `aliases[i]`. In other words. +// `seqs[aliases_idx[i]]` stores the sequence for `aliases[i]`. +table SequenceDb { + // The sequence aliases, cf. `aliases_idx`. + aliases: [string]; + // The corresponding index in `seqs`, cf. `aliases`. + aliases_idx: [uint]; + // The corresponding sequences. + seqs: [string]; +} + +// Mapping from gene to transcript ID. +table GeneToTxId { + // Gene HGNC symbol; serves as gene identifier. + gene_name: string; + // Vector of all transcript IDs. + tx_ids: [string]; +} + +// Container for the transcript-related database. +table TranscriptDb { + // Vector of all transcripts. + transcripts: [Transcript]; + // Mapping from gene ID to vector of all transcript IDs.
+    gene_to_tx: [GeneToTxId];
+}
+
+// Enumeration for `Transcript::biotype`.
+enum TranscriptBiotype: byte {
+    Coding,
+    NonCoding,
+}
+
+// Bit values for the transcript tags.
+//
+// Each value is a distinct power of two so that several tags can be OR-ed into
+// `Transcript::tags`.
+enum TranscriptTag: byte {
+    Basic = 1,
+    EnsemblCanonical = 2,
+    ManeSelect = 4,
+    ManePlusClinical = 8,
+    RefSeqSelect = 16,
+}
+
+// Store information about a transcript.
+table Transcript {
+    // Transcript accession with version, e.g., `"NM_007294.3"` or `"ENST00000461574.1"` for BRCA1.
+    id: string;
+    // HGNC symbol, e.g., `"BRCA1"`
+    gene_name: string;
+    // HGNC gene identifier, e.g., `"1100"` for BRCA1.
+    gene_id: string;
+    // Transcript biotype.
+    biotype: TranscriptBiotype;
+    // Transcript flags, values from `TranscriptTag`, stored as OR-ed ubyte values.
+    tags: ubyte;
+    // Identifier of the corresponding protein.
+    protein: string;
+    // CDS start codon.
+    start_codon: int;
+    // CDS stop codon.
+    stop_codon: int;
+    // Alignments on the different genome builds.
+    genome_alignments: [GenomeAlignment];
+}
+
+// Enumeration for the known genome builds.
+enum GenomeBuild: byte {
+    Grch37,
+    Grch38,
+}
+
+// Enumeration for the two strands of the genome.
+enum Strand: byte {
+    Plus,
+    Minus,
+}
+
+// Store information about a transcript aligning to a genome.
+table GenomeAlignment {
+    // The genome build identifier.
+    genome_build: GenomeBuild;
+    // Accession of the contig sequence.
+    contig: string;
+    // CDS start position, `-1` to indicate `None`.
+    cds_start: int;
+    // CDS end position, `-1` to indicate `None`.
+    cds_end: int;
+    // The strand.
+    strand: Strand;
+    // Exons of the alignment.
+    exons: [ExonAlignment];
+}
+
+// Store the alignment of one exon to the reference.
+table ExonAlignment {
+    // Start position on reference.
+    alt_start_i: int;
+    // End position on reference.
+    alt_end_i: int;
+    // Exon number.
+    ord: int;
+    // CDS start coordinate.
+    alt_cds_start_i: int;
+    // CDS end coordinate.
+    alt_cds_end_i: int;
+    // CIGAR string of alignment, empty indicates full matches.
+ cigar: string; +} + +// Database of transcripts with sequences. +table TxSeqDatabase { + /// Store transcripts with their aliases. + tx_db: TranscriptDb; + /// Store sequence with their aliases. + seq_db: SequenceDb; +} diff --git a/tests/data/db/create/txs/bootstrap.sh b/tests/data/db/create/txs/bootstrap.sh new file mode 100644 index 00000000..7f90d986 --- /dev/null +++ b/tests/data/db/create/txs/bootstrap.sh @@ -0,0 +1,55 @@ +#!/usr/bin/bash + +# Setup Logging ------------------------------------------------------------- + +log() +{ + >&2 echo $@ +} + +debug() +{ + [[ "${VERBOSE-0}" -ne 0 ]] && >&2 echo $@ +} + +set -euo pipefail + +if [[ "${VERBOSE-0}" -ne 0 ]]; then + set -x +fi + +# Initialization ------------------------------------------------------------ + +if [[ "$#" -ne 2 ]]; then + log "USAGE: bootstrap.sh SEQREPO INSTANCE" + log "" + log "Set VERBOSE=1 to increase verbosity." + exit 1 +fi + +# path to the directory where the script resides. +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) + +# SeqRepo source directory. +SRC=$1 + +# SeqRepo instance. +INSTANCE=$2 + +# Destination directory. 
+DST=$SCRIPT_DIR + +# Import SQLite database ---------------------------------------------------- + +rm -rf $DST/latest brca1.fasta + +seqrepo --root-directory $DST init --instance-name latest + +gene=brca1 +tx="NM_007294.4 NM_007297.4 NM_007298.3 NM_007299.4 NM_007300.4" + +seqrepo --root-directory $SRC export -i $INSTANCE $tx \ +| sed -e 's/NCBI://g' -e 's/ refseq:.*//g' \ +> $DST/$gene.fasta + +seqrepo --root-directory $DST load --instance-name latest --namespace refseq $DST/$gene.fasta diff --git a/tests/data/db/create/txs/brca1.fasta b/tests/data/db/create/txs/brca1.fasta new file mode 100644 index 00000000..d5fdcbc8 --- /dev/null +++ b/tests/data/db/create/txs/brca1.fasta @@ -0,0 +1,255 @@ +>NM_007300.4 +GCTGAGACTTCCTGGACGGGGGACAGGCTGTGGGGTTTCTCAGATAACTGGGCCCCTGCGCTCAGGAGGCCTTCACCCTCTGCTCTGGGTAAAGTTCATT +GGAACAGAAAGAAATGGATTTATCTGCTCTTCGCGTTGAAGAAGTACAAAATGTCATTAATGCTATGCAGAAAATCTTAGAGTGTCCCATCTGTCTGGAG +TTGATCAAGGAACCTGTCTCCACAAAGTGTGACCACATATTTTGCAAATTTTGCATGCTGAAACTTCTCAACCAGAAGAAAGGGCCTTCACAGTGTCCTT +TATGTAAGAATGATATAACCAAAAGGAGCCTACAAGAAAGTACGAGATTTAGTCAACTTGTTGAAGAGCTATTGAAAATCATTTGTGCTTTTCAGCTTGA +CACAGGTTTGGAGTATGCAAACAGCTATAATTTTGCAAAAAAGGAAAATAACTCTCCTGAACATCTAAAAGATGAAGTTTCTATCATCCAAAGTATGGGC +TACAGAAACCGTGCCAAAAGACTTCTACAGAGTGAACCCGAAAATCCTTCCTTGCAGGAAACCAGTCTCAGTGTCCAACTCTCTAACCTTGGAACTGTGA +GAACTCTGAGGACAAAGCAGCGGATACAACCTCAAAAGACGTCTGTCTACATTGAATTGGGATCTGATTCTTCTGAAGATACCGTTAATAAGGCAACTTA +TTGCAGTGTGGGAGATCAAGAATTGTTACAAATCACCCCTCAAGGAACCAGGGATGAAATCAGTTTGGATTCTGCAAAAAAGGCTGCTTGTGAATTTTCT +GAGACGGATGTAACAAATACTGAACATCATCAACCCAGTAATAATGATTTGAACACCACTGAGAAGCGTGCAGCTGAGAGGCATCCAGAAAAGTATCAGG +GTAGTTCTGTTTCAAACTTGCATGTGGAGCCATGTGGCACAAATACTCATGCCAGCTCATTACAGCATGAGAACAGCAGTTTATTACTCACTAAAGACAG +AATGAATGTAGAAAAGGCTGAATTCTGTAATAAAAGCAAACAGCCTGGCTTAGCAAGGAGCCAACATAACAGATGGGCTGGAAGTAAGGAAACATGTAAT +GATAGGCGGACTCCCAGCACAGAAAAAAAGGTAGATCTGAATGCTGATCCCCTGTGTGAGAGAAAAGAATGGAATAAGCAGAAACTGCCATGCTCAGAGA 
+ATCCTAGAGATACTGAAGATGTTCCTTGGATAACACTAAATAGCAGCATTCAGAAAGTTAATGAGTGGTTTTCCAGAAGTGATGAACTGTTAGGTTCTGA +TGACTCACATGATGGGGAGTCTGAATCAAATGCCAAAGTAGCTGATGTATTGGACGTTCTAAATGAGGTAGATGAATATTCTGGTTCTTCAGAGAAAATA +GACTTACTGGCCAGTGATCCTCATGAGGCTTTAATATGTAAAAGTGAAAGAGTTCACTCCAAATCAGTAGAGAGTAATATTGAAGACAAAATATTTGGGA +AAACCTATCGGAAGAAGGCAAGCCTCCCCAACTTAAGCCATGTAACTGAAAATCTAATTATAGGAGCATTTGTTACTGAGCCACAGATAATACAAGAGCG +TCCCCTCACAAATAAATTAAAGCGTAAAAGGAGACCTACATCAGGCCTTCATCCTGAGGATTTTATCAAGAAAGCAGATTTGGCAGTTCAAAAGACTCCT +GAAATGATAAATCAGGGAACTAACCAAACGGAGCAGAATGGTCAAGTGATGAATATTACTAATAGTGGTCATGAGAATAAAACAAAAGGTGATTCTATTC +AGAATGAGAAAAATCCTAACCCAATAGAATCACTCGAAAAAGAATCTGCTTTCAAAACGAAAGCTGAACCTATAAGCAGCAGTATAAGCAATATGGAACT +CGAATTAAATATCCACAATTCAAAAGCACCTAAAAAGAATAGGCTGAGGAGGAAGTCTTCTACCAGGCATATTCATGCGCTTGAACTAGTAGTCAGTAGA +AATCTAAGCCCACCTAATTGTACTGAATTGCAAATTGATAGTTGTTCTAGCAGTGAAGAGATAAAGAAAAAAAAGTACAACCAAATGCCAGTCAGGCACA +GCAGAAACCTACAACTCATGGAAGGTAAAGAACCTGCAACTGGAGCCAAGAAGAGTAACAAGCCAAATGAACAGACAAGTAAAAGACATGACAGCGATAC +TTTCCCAGAGCTGAAGTTAACAAATGCACCTGGTTCTTTTACTAAGTGTTCAAATACCAGTGAACTTAAAGAATTTGTCAATCCTAGCCTTCCAAGAGAA +GAAAAAGAAGAGAAACTAGAAACAGTTAAAGTGTCTAATAATGCTGAAGACCCCAAAGATCTCATGTTAAGTGGAGAAAGGGTTTTGCAAACTGAAAGAT +CTGTAGAGAGTAGCAGTATTTCATTGGTACCTGGTACTGATTATGGCACTCAGGAAAGTATCTCGTTACTGGAAGTTAGCACTCTAGGGAAGGCAAAAAC +AGAACCAAATAAATGTGTGAGTCAGTGTGCAGCATTTGAAAACCCCAAGGGACTAATTCATGGTTGTTCCAAAGATAATAGAAATGACACAGAAGGCTTT +AAGTATCCATTGGGACATGAAGTTAACCACAGTCGGGAAACAAGCATAGAAATGGAAGAAAGTGAACTTGATGCTCAGTATTTGCAGAATACATTCAAGG +TTTCAAAGCGCCAGTCATTTGCTCCGTTTTCAAATCCAGGAAATGCAGAAGAGGAATGTGCAACATTCTCTGCCCACTCTGGGTCCTTAAAGAAACAAAG +TCCAAAAGTCACTTTTGAATGTGAACAAAAGGAAGAAAATCAAGGAAAGAATGAGTCTAATATCAAGCCTGTACAGACAGTTAATATCACTGCAGGCTTT +CCTGTGGTTGGTCAGAAAGATAAGCCAGTTGATAATGCCAAATGTAGTATCAAAGGAGGCTCTAGGTTTTGTCTATCATCTCAGTTCAGAGGCAACGAAA +CTGGACTCATTACTCCAAATAAACATGGACTTTTACAAAACCCATATCGTATACCACCACTTTTTCCCATCAAGTCATTTGTTAAAACTAAATGTAAGAA 
+AAATCTGCTAGAGGAAAACTTTGAGGAACATTCAATGTCACCTGAAAGAGAAATGGGAAATGAGAACATTCCAAGTACAGTGAGCACAATTAGCCGTAAT +AACATTAGAGAAAATGTTTTTAAAGAAGCCAGCTCAAGCAATATTAATGAAGTAGGTTCCAGTACTAATGAAGTGGGCTCCAGTATTAATGAAATAGGTT +CCAGTGATGAAAACATTCAAGCAGAACTAGGTAGAAACAGAGGGCCAAAATTGAATGCTATGCTTAGATTAGGGGTTTTGCAACCTGAGGTCTATAAACA +AAGTCTTCCTGGAAGTAATTGTAAGCATCCTGAAATAAAAAAGCAAGAATATGAAGAAGTAGTTCAGACTGTTAATACAGATTTCTCTCCATATCTGATT +TCAGATAACTTAGAACAGCCTATGGGAAGTAGTCATGCATCTCAGGTTTGTTCTGAGACACCTGATGACCTGTTAGATGATGGTGAAATAAAGGAAGATA +CTAGTTTTGCTGAAAATGACATTAAGGAAAGTTCTGCTGTTTTTAGCAAAAGCGTCCAGAAAGGAGAGCTTAGCAGGAGTCCTAGCCCTTTCACCCATAC +ACATTTGGCTCAGGGTTACCGAAGAGGGGCCAAGAAATTAGAGTCCTCAGAAGAGAACTTATCTAGTGAGGATGAAGAGCTTCCCTGCTTCCAACACTTG +TTATTTGGTAAAGTAAACAATATACCTTCTCAGTCTACTAGGCATAGCACCGTTGCTACCGAGTGTCTGTCTAAGAACACAGAGGAGAATTTATTATCAT +TGAAGAATAGCTTAAATGACTGCAGTAACCAGGTAATATTGGCAAAGGCATCTCAGGAACATCACCTTAGTGAGGAAACAAAATGTTCTGCTAGCTTGTT +TTCTTCACAGTGCAGTGAATTGGAAGACTTGACTGCAAATACAAACACCCAGGATCCTTTCTTGATTGGTTCTTCCAAACAAATGAGGCATCAGTCTGAA +AGCCAGGGAGTTGGTCTGAGTGACAAGGAATTGGTTTCAGATGATGAAGAAAGAGGAACGGGCTTGGAAGAAAATAATCAAGAAGAGCAAAGCATGGATT +CAAACTTAGGTGAAGCAGCATCTGGGTGTGAGAGTGAAACAAGCGTCTCTGAAGACTGCTCAGGGCTATCCTCTCAGAGTGACATTTTAACCACTCAGCA +GAGGGATACCATGCAACATAACCTGATAAAGCTCCAGCAGGAAATGGCTGAACTAGAAGCTGTGTTAGAACAGCATGGGAGCCAGCCTTCTAACAGCTAC +CCTTCCATCATAAGTGACTCTTCTGCCCTTGAGGACCTGCGAAATCCAGAACAAAGCACATCAGAAAAAGATTCGCATATACATGGCCAAAGGAACAACT +CCATGTTTTCTAAAAGGCCTAGAGAACATATATCAGTATTAACTTCACAGAAAAGTAGTGAATACCCTATAAGCCAGAATCCAGAAGGCCTTTCTGCTGA +CAAGTTTGAGGTGTCTGCAGATAGTTCTACCAGTAAAAATAAAGAACCAGGAGTGGAAAGGTCATCCCCTTCTAAATGCCCATCATTAGATGATAGGTGG +TACATGCACAGTTGCTCTGGGAGTCTTCAGAATAGAAACTACCCATCTCAAGAGGAGCTCATTAAGGTTGTTGATGTGGAGGAGCAACAGCTGGAAGAGT +CTGGGCCACACGATTTGACGGAAACATCTTACTTGCCAAGGCAAGATCTAGAGGGAACCCCTTACCTGGAATCTGGAATCAGCCTCTTCTCTGATGACCC +TGAATCTGATCCTTCTGAAGACAGAGCCCCAGAGTCAGCTCGTGTTGGCAACATACCATCTTCAACCTCTGCATTGAAAGTTCCCCAATTGAAAGTTGCA 
+GAATCTGCCCAGAGTCCAGCTGCTGCTCATACTACTGATACTGCTGGGTATAATGCAATGGAAGAAAGTGTGAGCAGGGAGAAGCCAGAATTGACAGCTT +CAACAGAAAGGGTCAACAAAAGAATGTCCATGGTGGTGTCTGGCCTGACCCCAGAAGAATTTATGCTCGTGTACAAGTTTGCCAGAAAACACCACATCAC +TTTAACTAATCTAATTACTGAAGAGACTACTCATGTTGTTATGAAAACAGATGCTGAGTTTGTGTGTGAACGGACACTGAAATATTTTCTAGGAATTGCG +GGAGGAAAATGGGTAGTTAGCTATTTCTGGGTGACCCAGTCTATTAAAGAAAGAAAAATGCTGAATGAGCATGATTTTGAAGTCAGAGGAGATGTGGTCA +ATGGAAGAAACCACCAAGGTCCAAAGCGAGCAAGAGAATCCCAGGACAGAAAGATCTTCAGGGGGCTAGAAATCTGTTGCTATGGGCCCTTCACCAACAT +GCCCACAGATCAACTGGAATGGATGGTACAGCTGTGTGGTGCTTCTGTGGTGAAGGAGCTTTCATCATTCACCCTTGGCACAGGTGTCCACCCAATTGTG +GTTGTGCAGCCAGATGCCTGGACAGAGGACAATGGCTTCCATGCAATTGGGCAGATGTGTGAGGCACCTGTGGTGACCCGAGAGTGGGTGTTGGACAGTG +TAGCACTCTACCAGTGCCAGGAGCTGGACACCTACCTGATACCCCAGATCCCCCACAGCCACTACTGACTGCAGCCAGCCACAGGTACAGAGCCACAGGA +CCCCAAGAATGAGCTTACAAAGTGGCCTTTCCAGGCCCTGGGAGCTCCTCTCACTCTTCAGTCCTTCTACTGTCCTGGCTACTAAATATTTTATGTACAT +CAGCCTGAAAAGGACTTCTGGCTATGCAAGGGTCCCTTAAAGATTTTCTGCTTGAAGTCTCCCTTGGAAATCTGCCATGAGCACAAAATTATGGTAATTT +TTCACCTGAGAAGATTTTAAAACCATTTAAACGCCACCAATTGAGCAAGATGCTGATTCATTATTTATCAGCCCTATTCTTTCTATTCAGGCTGTTGTTG +GCTTAGGGCTGGAAGCACAGAGTGGCTTGGCCTCAAGAGAATAGCTGGTTTCCCTAAGTTTACTTCTCTAAAACCCTGTGTTCACAAAGGCAGAGAGTCA +GACCCTTCAATGGAAGGAGAGTGCTTGGGATCGATTATGTGACTTAAAGTCAGAATAGTCCTTGGGCAGTTCTCAAATGTTGGAGTGGAACATTGGGGAG +GAAATTCTGAGGCAGGTATTAGAAATGAAAAGGAAACTTGAAACCTGGGCATGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCAAGGTGGGCA +GATCACTGGAGGTCAGGAGTTCGAAACCAGCCTGGCCAACATGGTGAAACCCCATCTCTACTAAAAATACAGAAATTAGCCGGTCATGGTGGTGGACACC +TGTAATCCCAGCTACTCAGGTGGCTAAGGCAGGAGAATCACTTCAGCCCGGGAGGTGGAGGTTGCAGTGAGCCAAGATCATACCACGGCACTCCAGCCTG +GGTGACAGTGAGACTGTGGCTCAAAAAAAAAAAAAAAAAAAGGAAAATGAAACTAGAAGAGATTTCTAAAAGTCTGAGATATATTTGCTAGATTTCTAAA +GAATGTGTTCTAAAACAGCAGAAGATTTTCAAGAACCGGTTTCCAAAGACAGTCTTCTAATTCCTCATTAGTAATAAGTAAAATGTTTATTGTTGTAGCT +CTGGTATATAATCCATTCCTCTTAAAATATAAGACCTCTGGCATGAATATTTCATATCTATAAAATGACAGATCCCACCAGGAAGGAAGCTGTTGCTTTC 
+TTTGAGGTGATTTTTTTCCTTTGCTCCCTGTTGCTGAAACCATACAGCTTCATAAATAATTTTGCTTGCTGAAGGAAGAAAAAGTGTTTTTCATAAACCC +ATTATCCAGGACTGTTTATAGCTGTTGGAAGGACTAGGTCTTCCCTAGCCCCCCCAGTGTGCAAGGGCAGTGAAGACTTGATTGTACAAAATACGTTTTG +TAAATGTTGTGCTGTTAACACTGCAAATAAACTTGGTAGCAAACACTTCCA +>NM_007294.4 +GCTGAGACTTCCTGGACGGGGGACAGGCTGTGGGGTTTCTCAGATAACTGGGCCCCTGCGCTCAGGAGGCCTTCACCCTCTGCTCTGGGTAAAGTTCATT +GGAACAGAAAGAAATGGATTTATCTGCTCTTCGCGTTGAAGAAGTACAAAATGTCATTAATGCTATGCAGAAAATCTTAGAGTGTCCCATCTGTCTGGAG +TTGATCAAGGAACCTGTCTCCACAAAGTGTGACCACATATTTTGCAAATTTTGCATGCTGAAACTTCTCAACCAGAAGAAAGGGCCTTCACAGTGTCCTT +TATGTAAGAATGATATAACCAAAAGGAGCCTACAAGAAAGTACGAGATTTAGTCAACTTGTTGAAGAGCTATTGAAAATCATTTGTGCTTTTCAGCTTGA +CACAGGTTTGGAGTATGCAAACAGCTATAATTTTGCAAAAAAGGAAAATAACTCTCCTGAACATCTAAAAGATGAAGTTTCTATCATCCAAAGTATGGGC +TACAGAAACCGTGCCAAAAGACTTCTACAGAGTGAACCCGAAAATCCTTCCTTGCAGGAAACCAGTCTCAGTGTCCAACTCTCTAACCTTGGAACTGTGA +GAACTCTGAGGACAAAGCAGCGGATACAACCTCAAAAGACGTCTGTCTACATTGAATTGGGATCTGATTCTTCTGAAGATACCGTTAATAAGGCAACTTA +TTGCAGTGTGGGAGATCAAGAATTGTTACAAATCACCCCTCAAGGAACCAGGGATGAAATCAGTTTGGATTCTGCAAAAAAGGCTGCTTGTGAATTTTCT +GAGACGGATGTAACAAATACTGAACATCATCAACCCAGTAATAATGATTTGAACACCACTGAGAAGCGTGCAGCTGAGAGGCATCCAGAAAAGTATCAGG +GTAGTTCTGTTTCAAACTTGCATGTGGAGCCATGTGGCACAAATACTCATGCCAGCTCATTACAGCATGAGAACAGCAGTTTATTACTCACTAAAGACAG +AATGAATGTAGAAAAGGCTGAATTCTGTAATAAAAGCAAACAGCCTGGCTTAGCAAGGAGCCAACATAACAGATGGGCTGGAAGTAAGGAAACATGTAAT +GATAGGCGGACTCCCAGCACAGAAAAAAAGGTAGATCTGAATGCTGATCCCCTGTGTGAGAGAAAAGAATGGAATAAGCAGAAACTGCCATGCTCAGAGA +ATCCTAGAGATACTGAAGATGTTCCTTGGATAACACTAAATAGCAGCATTCAGAAAGTTAATGAGTGGTTTTCCAGAAGTGATGAACTGTTAGGTTCTGA +TGACTCACATGATGGGGAGTCTGAATCAAATGCCAAAGTAGCTGATGTATTGGACGTTCTAAATGAGGTAGATGAATATTCTGGTTCTTCAGAGAAAATA +GACTTACTGGCCAGTGATCCTCATGAGGCTTTAATATGTAAAAGTGAAAGAGTTCACTCCAAATCAGTAGAGAGTAATATTGAAGACAAAATATTTGGGA +AAACCTATCGGAAGAAGGCAAGCCTCCCCAACTTAAGCCATGTAACTGAAAATCTAATTATAGGAGCATTTGTTACTGAGCCACAGATAATACAAGAGCG 
+TCCCCTCACAAATAAATTAAAGCGTAAAAGGAGACCTACATCAGGCCTTCATCCTGAGGATTTTATCAAGAAAGCAGATTTGGCAGTTCAAAAGACTCCT +GAAATGATAAATCAGGGAACTAACCAAACGGAGCAGAATGGTCAAGTGATGAATATTACTAATAGTGGTCATGAGAATAAAACAAAAGGTGATTCTATTC +AGAATGAGAAAAATCCTAACCCAATAGAATCACTCGAAAAAGAATCTGCTTTCAAAACGAAAGCTGAACCTATAAGCAGCAGTATAAGCAATATGGAACT +CGAATTAAATATCCACAATTCAAAAGCACCTAAAAAGAATAGGCTGAGGAGGAAGTCTTCTACCAGGCATATTCATGCGCTTGAACTAGTAGTCAGTAGA +AATCTAAGCCCACCTAATTGTACTGAATTGCAAATTGATAGTTGTTCTAGCAGTGAAGAGATAAAGAAAAAAAAGTACAACCAAATGCCAGTCAGGCACA +GCAGAAACCTACAACTCATGGAAGGTAAAGAACCTGCAACTGGAGCCAAGAAGAGTAACAAGCCAAATGAACAGACAAGTAAAAGACATGACAGCGATAC +TTTCCCAGAGCTGAAGTTAACAAATGCACCTGGTTCTTTTACTAAGTGTTCAAATACCAGTGAACTTAAAGAATTTGTCAATCCTAGCCTTCCAAGAGAA +GAAAAAGAAGAGAAACTAGAAACAGTTAAAGTGTCTAATAATGCTGAAGACCCCAAAGATCTCATGTTAAGTGGAGAAAGGGTTTTGCAAACTGAAAGAT +CTGTAGAGAGTAGCAGTATTTCATTGGTACCTGGTACTGATTATGGCACTCAGGAAAGTATCTCGTTACTGGAAGTTAGCACTCTAGGGAAGGCAAAAAC +AGAACCAAATAAATGTGTGAGTCAGTGTGCAGCATTTGAAAACCCCAAGGGACTAATTCATGGTTGTTCCAAAGATAATAGAAATGACACAGAAGGCTTT +AAGTATCCATTGGGACATGAAGTTAACCACAGTCGGGAAACAAGCATAGAAATGGAAGAAAGTGAACTTGATGCTCAGTATTTGCAGAATACATTCAAGG +TTTCAAAGCGCCAGTCATTTGCTCCGTTTTCAAATCCAGGAAATGCAGAAGAGGAATGTGCAACATTCTCTGCCCACTCTGGGTCCTTAAAGAAACAAAG +TCCAAAAGTCACTTTTGAATGTGAACAAAAGGAAGAAAATCAAGGAAAGAATGAGTCTAATATCAAGCCTGTACAGACAGTTAATATCACTGCAGGCTTT +CCTGTGGTTGGTCAGAAAGATAAGCCAGTTGATAATGCCAAATGTAGTATCAAAGGAGGCTCTAGGTTTTGTCTATCATCTCAGTTCAGAGGCAACGAAA +CTGGACTCATTACTCCAAATAAACATGGACTTTTACAAAACCCATATCGTATACCACCACTTTTTCCCATCAAGTCATTTGTTAAAACTAAATGTAAGAA +AAATCTGCTAGAGGAAAACTTTGAGGAACATTCAATGTCACCTGAAAGAGAAATGGGAAATGAGAACATTCCAAGTACAGTGAGCACAATTAGCCGTAAT +AACATTAGAGAAAATGTTTTTAAAGAAGCCAGCTCAAGCAATATTAATGAAGTAGGTTCCAGTACTAATGAAGTGGGCTCCAGTATTAATGAAATAGGTT +CCAGTGATGAAAACATTCAAGCAGAACTAGGTAGAAACAGAGGGCCAAAATTGAATGCTATGCTTAGATTAGGGGTTTTGCAACCTGAGGTCTATAAACA +AAGTCTTCCTGGAAGTAATTGTAAGCATCCTGAAATAAAAAAGCAAGAATATGAAGAAGTAGTTCAGACTGTTAATACAGATTTCTCTCCATATCTGATT 
+TCAGATAACTTAGAACAGCCTATGGGAAGTAGTCATGCATCTCAGGTTTGTTCTGAGACACCTGATGACCTGTTAGATGATGGTGAAATAAAGGAAGATA +CTAGTTTTGCTGAAAATGACATTAAGGAAAGTTCTGCTGTTTTTAGCAAAAGCGTCCAGAAAGGAGAGCTTAGCAGGAGTCCTAGCCCTTTCACCCATAC +ACATTTGGCTCAGGGTTACCGAAGAGGGGCCAAGAAATTAGAGTCCTCAGAAGAGAACTTATCTAGTGAGGATGAAGAGCTTCCCTGCTTCCAACACTTG +TTATTTGGTAAAGTAAACAATATACCTTCTCAGTCTACTAGGCATAGCACCGTTGCTACCGAGTGTCTGTCTAAGAACACAGAGGAGAATTTATTATCAT +TGAAGAATAGCTTAAATGACTGCAGTAACCAGGTAATATTGGCAAAGGCATCTCAGGAACATCACCTTAGTGAGGAAACAAAATGTTCTGCTAGCTTGTT +TTCTTCACAGTGCAGTGAATTGGAAGACTTGACTGCAAATACAAACACCCAGGATCCTTTCTTGATTGGTTCTTCCAAACAAATGAGGCATCAGTCTGAA +AGCCAGGGAGTTGGTCTGAGTGACAAGGAATTGGTTTCAGATGATGAAGAAAGAGGAACGGGCTTGGAAGAAAATAATCAAGAAGAGCAAAGCATGGATT +CAAACTTAGGTGAAGCAGCATCTGGGTGTGAGAGTGAAACAAGCGTCTCTGAAGACTGCTCAGGGCTATCCTCTCAGAGTGACATTTTAACCACTCAGCA +GAGGGATACCATGCAACATAACCTGATAAAGCTCCAGCAGGAAATGGCTGAACTAGAAGCTGTGTTAGAACAGCATGGGAGCCAGCCTTCTAACAGCTAC +CCTTCCATCATAAGTGACTCTTCTGCCCTTGAGGACCTGCGAAATCCAGAACAAAGCACATCAGAAAAAGCAGTATTAACTTCACAGAAAAGTAGTGAAT +ACCCTATAAGCCAGAATCCAGAAGGCCTTTCTGCTGACAAGTTTGAGGTGTCTGCAGATAGTTCTACCAGTAAAAATAAAGAACCAGGAGTGGAAAGGTC +ATCCCCTTCTAAATGCCCATCATTAGATGATAGGTGGTACATGCACAGTTGCTCTGGGAGTCTTCAGAATAGAAACTACCCATCTCAAGAGGAGCTCATT +AAGGTTGTTGATGTGGAGGAGCAACAGCTGGAAGAGTCTGGGCCACACGATTTGACGGAAACATCTTACTTGCCAAGGCAAGATCTAGAGGGAACCCCTT +ACCTGGAATCTGGAATCAGCCTCTTCTCTGATGACCCTGAATCTGATCCTTCTGAAGACAGAGCCCCAGAGTCAGCTCGTGTTGGCAACATACCATCTTC +AACCTCTGCATTGAAAGTTCCCCAATTGAAAGTTGCAGAATCTGCCCAGAGTCCAGCTGCTGCTCATACTACTGATACTGCTGGGTATAATGCAATGGAA +GAAAGTGTGAGCAGGGAGAAGCCAGAATTGACAGCTTCAACAGAAAGGGTCAACAAAAGAATGTCCATGGTGGTGTCTGGCCTGACCCCAGAAGAATTTA +TGCTCGTGTACAAGTTTGCCAGAAAACACCACATCACTTTAACTAATCTAATTACTGAAGAGACTACTCATGTTGTTATGAAAACAGATGCTGAGTTTGT +GTGTGAACGGACACTGAAATATTTTCTAGGAATTGCGGGAGGAAAATGGGTAGTTAGCTATTTCTGGGTGACCCAGTCTATTAAAGAAAGAAAAATGCTG +AATGAGCATGATTTTGAAGTCAGAGGAGATGTGGTCAATGGAAGAAACCACCAAGGTCCAAAGCGAGCAAGAGAATCCCAGGACAGAAAGATCTTCAGGG 
+GGCTAGAAATCTGTTGCTATGGGCCCTTCACCAACATGCCCACAGATCAACTGGAATGGATGGTACAGCTGTGTGGTGCTTCTGTGGTGAAGGAGCTTTC +ATCATTCACCCTTGGCACAGGTGTCCACCCAATTGTGGTTGTGCAGCCAGATGCCTGGACAGAGGACAATGGCTTCCATGCAATTGGGCAGATGTGTGAG +GCACCTGTGGTGACCCGAGAGTGGGTGTTGGACAGTGTAGCACTCTACCAGTGCCAGGAGCTGGACACCTACCTGATACCCCAGATCCCCCACAGCCACT +ACTGACTGCAGCCAGCCACAGGTACAGAGCCACAGGACCCCAAGAATGAGCTTACAAAGTGGCCTTTCCAGGCCCTGGGAGCTCCTCTCACTCTTCAGTC +CTTCTACTGTCCTGGCTACTAAATATTTTATGTACATCAGCCTGAAAAGGACTTCTGGCTATGCAAGGGTCCCTTAAAGATTTTCTGCTTGAAGTCTCCC +TTGGAAATCTGCCATGAGCACAAAATTATGGTAATTTTTCACCTGAGAAGATTTTAAAACCATTTAAACGCCACCAATTGAGCAAGATGCTGATTCATTA +TTTATCAGCCCTATTCTTTCTATTCAGGCTGTTGTTGGCTTAGGGCTGGAAGCACAGAGTGGCTTGGCCTCAAGAGAATAGCTGGTTTCCCTAAGTTTAC +TTCTCTAAAACCCTGTGTTCACAAAGGCAGAGAGTCAGACCCTTCAATGGAAGGAGAGTGCTTGGGATCGATTATGTGACTTAAAGTCAGAATAGTCCTT +GGGCAGTTCTCAAATGTTGGAGTGGAACATTGGGGAGGAAATTCTGAGGCAGGTATTAGAAATGAAAAGGAAACTTGAAACCTGGGCATGGTGGCTCACG +CCTGTAATCCCAGCACTTTGGGAGGCCAAGGTGGGCAGATCACTGGAGGTCAGGAGTTCGAAACCAGCCTGGCCAACATGGTGAAACCCCATCTCTACTA +AAAATACAGAAATTAGCCGGTCATGGTGGTGGACACCTGTAATCCCAGCTACTCAGGTGGCTAAGGCAGGAGAATCACTTCAGCCCGGGAGGTGGAGGTT +GCAGTGAGCCAAGATCATACCACGGCACTCCAGCCTGGGTGACAGTGAGACTGTGGCTCAAAAAAAAAAAAAAAAAAAGGAAAATGAAACTAGAAGAGAT +TTCTAAAAGTCTGAGATATATTTGCTAGATTTCTAAAGAATGTGTTCTAAAACAGCAGAAGATTTTCAAGAACCGGTTTCCAAAGACAGTCTTCTAATTC +CTCATTAGTAATAAGTAAAATGTTTATTGTTGTAGCTCTGGTATATAATCCATTCCTCTTAAAATATAAGACCTCTGGCATGAATATTTCATATCTATAA +AATGACAGATCCCACCAGGAAGGAAGCTGTTGCTTTCTTTGAGGTGATTTTTTTCCTTTGCTCCCTGTTGCTGAAACCATACAGCTTCATAAATAATTTT +GCTTGCTGAAGGAAGAAAAAGTGTTTTTCATAAACCCATTATCCAGGACTGTTTATAGCTGTTGGAAGGACTAGGTCTTCCCTAGCCCCCCCAGTGTGCA +AGGGCAGTGAAGACTTGATTGTACAAAATACGTTTTGTAAATGTTGTGCTGTTAACACTGCAAATAAACTTGGTAGCAAACACTTCCA +>NM_007297.4 +GCTGAGACTTCCTGGACGGGGGACAGGCTGTGGGGTTTCTCAGATAACTGGGCCCCTGCGCTCAGGAGGCCTTCACCCTCTGCTCTGGTTCATTGGAACA +GAAAGAAATGGATTTATCTGCTCTTCGCGTTGAAGAAGTACAAAATGTCATTAATGCTATGCAGAAAATCTTAGAGTGTCCCATCTGATTTTGCATGCTG 
+AAACTTCTCAACCAGAAGAAAGGGCCTTCACAGTGTCCTTTATGTAAGAATGATATAACCAAAAGGAGCCTACAAGAAAGTACGAGATTTAGTCAACTTG +TTGAAGAGCTATTGAAAATCATTTGTGCTTTTCAGCTTGACACAGGTTTGGAGTATGCAAACAGCTATAATTTTGCAAAAAAGGAAAATAACTCTCCTGA +ACATCTAAAAGATGAAGTTTCTATCATCCAAAGTATGGGCTACAGAAACCGTGCCAAAAGACTTCTACAGAGTGAACCCGAAAATCCTTCCTTGCAGGAA +ACCAGTCTCAGTGTCCAACTCTCTAACCTTGGAACTGTGAGAACTCTGAGGACAAAGCAGCGGATACAACCTCAAAAGACGTCTGTCTACATTGAATTGG +GATCTGATTCTTCTGAAGATACCGTTAATAAGGCAACTTATTGCAGTGTGGGAGATCAAGAATTGTTACAAATCACCCCTCAAGGAACCAGGGATGAAAT +CAGTTTGGATTCTGCAAAAAAGGCTGCTTGTGAATTTTCTGAGACGGATGTAACAAATACTGAACATCATCAACCCAGTAATAATGATTTGAACACCACT +GAGAAGCGTGCAGCTGAGAGGCATCCAGAAAAGTATCAGGGTAGTTCTGTTTCAAACTTGCATGTGGAGCCATGTGGCACAAATACTCATGCCAGCTCAT +TACAGCATGAGAACAGCAGTTTATTACTCACTAAAGACAGAATGAATGTAGAAAAGGCTGAATTCTGTAATAAAAGCAAACAGCCTGGCTTAGCAAGGAG +CCAACATAACAGATGGGCTGGAAGTAAGGAAACATGTAATGATAGGCGGACTCCCAGCACAGAAAAAAAGGTAGATCTGAATGCTGATCCCCTGTGTGAG +AGAAAAGAATGGAATAAGCAGAAACTGCCATGCTCAGAGAATCCTAGAGATACTGAAGATGTTCCTTGGATAACACTAAATAGCAGCATTCAGAAAGTTA +ATGAGTGGTTTTCCAGAAGTGATGAACTGTTAGGTTCTGATGACTCACATGATGGGGAGTCTGAATCAAATGCCAAAGTAGCTGATGTATTGGACGTTCT +AAATGAGGTAGATGAATATTCTGGTTCTTCAGAGAAAATAGACTTACTGGCCAGTGATCCTCATGAGGCTTTAATATGTAAAAGTGAAAGAGTTCACTCC +AAATCAGTAGAGAGTAATATTGAAGACAAAATATTTGGGAAAACCTATCGGAAGAAGGCAAGCCTCCCCAACTTAAGCCATGTAACTGAAAATCTAATTA +TAGGAGCATTTGTTACTGAGCCACAGATAATACAAGAGCGTCCCCTCACAAATAAATTAAAGCGTAAAAGGAGACCTACATCAGGCCTTCATCCTGAGGA +TTTTATCAAGAAAGCAGATTTGGCAGTTCAAAAGACTCCTGAAATGATAAATCAGGGAACTAACCAAACGGAGCAGAATGGTCAAGTGATGAATATTACT +AATAGTGGTCATGAGAATAAAACAAAAGGTGATTCTATTCAGAATGAGAAAAATCCTAACCCAATAGAATCACTCGAAAAAGAATCTGCTTTCAAAACGA +AAGCTGAACCTATAAGCAGCAGTATAAGCAATATGGAACTCGAATTAAATATCCACAATTCAAAAGCACCTAAAAAGAATAGGCTGAGGAGGAAGTCTTC +TACCAGGCATATTCATGCGCTTGAACTAGTAGTCAGTAGAAATCTAAGCCCACCTAATTGTACTGAATTGCAAATTGATAGTTGTTCTAGCAGTGAAGAG +ATAAAGAAAAAAAAGTACAACCAAATGCCAGTCAGGCACAGCAGAAACCTACAACTCATGGAAGGTAAAGAACCTGCAACTGGAGCCAAGAAGAGTAACA 
+AGCCAAATGAACAGACAAGTAAAAGACATGACAGCGATACTTTCCCAGAGCTGAAGTTAACAAATGCACCTGGTTCTTTTACTAAGTGTTCAAATACCAG +TGAACTTAAAGAATTTGTCAATCCTAGCCTTCCAAGAGAAGAAAAAGAAGAGAAACTAGAAACAGTTAAAGTGTCTAATAATGCTGAAGACCCCAAAGAT +CTCATGTTAAGTGGAGAAAGGGTTTTGCAAACTGAAAGATCTGTAGAGAGTAGCAGTATTTCATTGGTACCTGGTACTGATTATGGCACTCAGGAAAGTA +TCTCGTTACTGGAAGTTAGCACTCTAGGGAAGGCAAAAACAGAACCAAATAAATGTGTGAGTCAGTGTGCAGCATTTGAAAACCCCAAGGGACTAATTCA +TGGTTGTTCCAAAGATAATAGAAATGACACAGAAGGCTTTAAGTATCCATTGGGACATGAAGTTAACCACAGTCGGGAAACAAGCATAGAAATGGAAGAA +AGTGAACTTGATGCTCAGTATTTGCAGAATACATTCAAGGTTTCAAAGCGCCAGTCATTTGCTCCGTTTTCAAATCCAGGAAATGCAGAAGAGGAATGTG +CAACATTCTCTGCCCACTCTGGGTCCTTAAAGAAACAAAGTCCAAAAGTCACTTTTGAATGTGAACAAAAGGAAGAAAATCAAGGAAAGAATGAGTCTAA +TATCAAGCCTGTACAGACAGTTAATATCACTGCAGGCTTTCCTGTGGTTGGTCAGAAAGATAAGCCAGTTGATAATGCCAAATGTAGTATCAAAGGAGGC +TCTAGGTTTTGTCTATCATCTCAGTTCAGAGGCAACGAAACTGGACTCATTACTCCAAATAAACATGGACTTTTACAAAACCCATATCGTATACCACCAC +TTTTTCCCATCAAGTCATTTGTTAAAACTAAATGTAAGAAAAATCTGCTAGAGGAAAACTTTGAGGAACATTCAATGTCACCTGAAAGAGAAATGGGAAA +TGAGAACATTCCAAGTACAGTGAGCACAATTAGCCGTAATAACATTAGAGAAAATGTTTTTAAAGAAGCCAGCTCAAGCAATATTAATGAAGTAGGTTCC +AGTACTAATGAAGTGGGCTCCAGTATTAATGAAATAGGTTCCAGTGATGAAAACATTCAAGCAGAACTAGGTAGAAACAGAGGGCCAAAATTGAATGCTA +TGCTTAGATTAGGGGTTTTGCAACCTGAGGTCTATAAACAAAGTCTTCCTGGAAGTAATTGTAAGCATCCTGAAATAAAAAAGCAAGAATATGAAGAAGT +AGTTCAGACTGTTAATACAGATTTCTCTCCATATCTGATTTCAGATAACTTAGAACAGCCTATGGGAAGTAGTCATGCATCTCAGGTTTGTTCTGAGACA +CCTGATGACCTGTTAGATGATGGTGAAATAAAGGAAGATACTAGTTTTGCTGAAAATGACATTAAGGAAAGTTCTGCTGTTTTTAGCAAAAGCGTCCAGA +AAGGAGAGCTTAGCAGGAGTCCTAGCCCTTTCACCCATACACATTTGGCTCAGGGTTACCGAAGAGGGGCCAAGAAATTAGAGTCCTCAGAAGAGAACTT +ATCTAGTGAGGATGAAGAGCTTCCCTGCTTCCAACACTTGTTATTTGGTAAAGTAAACAATATACCTTCTCAGTCTACTAGGCATAGCACCGTTGCTACC +GAGTGTCTGTCTAAGAACACAGAGGAGAATTTATTATCATTGAAGAATAGCTTAAATGACTGCAGTAACCAGGTAATATTGGCAAAGGCATCTCAGGAAC +ATCACCTTAGTGAGGAAACAAAATGTTCTGCTAGCTTGTTTTCTTCACAGTGCAGTGAATTGGAAGACTTGACTGCAAATACAAACACCCAGGATCCTTT 
+CTTGATTGGTTCTTCCAAACAAATGAGGCATCAGTCTGAAAGCCAGGGAGTTGGTCTGAGTGACAAGGAATTGGTTTCAGATGATGAAGAAAGAGGAACG +GGCTTGGAAGAAAATAATCAAGAAGAGCAAAGCATGGATTCAAACTTAGGTGAAGCAGCATCTGGGTGTGAGAGTGAAACAAGCGTCTCTGAAGACTGCT +CAGGGCTATCCTCTCAGAGTGACATTTTAACCACTCAGCAGAGGGATACCATGCAACATAACCTGATAAAGCTCCAGCAGGAAATGGCTGAACTAGAAGC +TGTGTTAGAACAGCATGGGAGCCAGCCTTCTAACAGCTACCCTTCCATCATAAGTGACTCTTCTGCCCTTGAGGACCTGCGAAATCCAGAACAAAGCACA +TCAGAAAAAGCAGTATTAACTTCACAGAAAAGTAGTGAATACCCTATAAGCCAGAATCCAGAAGGCCTTTCTGCTGACAAGTTTGAGGTGTCTGCAGATA +GTTCTACCAGTAAAAATAAAGAACCAGGAGTGGAAAGGTCATCCCCTTCTAAATGCCCATCATTAGATGATAGGTGGTACATGCACAGTTGCTCTGGGAG +TCTTCAGAATAGAAACTACCCATCTCAAGAGGAGCTCATTAAGGTTGTTGATGTGGAGGAGCAACAGCTGGAAGAGTCTGGGCCACACGATTTGACGGAA +ACATCTTACTTGCCAAGGCAAGATCTAGAGGGAACCCCTTACCTGGAATCTGGAATCAGCCTCTTCTCTGATGACCCTGAATCTGATCCTTCTGAAGACA +GAGCCCCAGAGTCAGCTCGTGTTGGCAACATACCATCTTCAACCTCTGCATTGAAAGTTCCCCAATTGAAAGTTGCAGAATCTGCCCAGAGTCCAGCTGC +TGCTCATACTACTGATACTGCTGGGTATAATGCAATGGAAGAAAGTGTGAGCAGGGAGAAGCCAGAATTGACAGCTTCAACAGAAAGGGTCAACAAAAGA +ATGTCCATGGTGGTGTCTGGCCTGACCCCAGAAGAATTTATGCTCGTGTACAAGTTTGCCAGAAAACACCACATCACTTTAACTAATCTAATTACTGAAG +AGACTACTCATGTTGTTATGAAAACAGATGCTGAGTTTGTGTGTGAACGGACACTGAAATATTTTCTAGGAATTGCGGGAGGAAAATGGGTAGTTAGCTA +TTTCTGGGTGACCCAGTCTATTAAAGAAAGAAAAATGCTGAATGAGCATGATTTTGAAGTCAGAGGAGATGTGGTCAATGGAAGAAACCACCAAGGTCCA +AAGCGAGCAAGAGAATCCCAGGACAGAAAGATCTTCAGGGGGCTAGAAATCTGTTGCTATGGGCCCTTCACCAACATGCCCACAGATCAACTGGAATGGA +TGGTACAGCTGTGTGGTGCTTCTGTGGTGAAGGAGCTTTCATCATTCACCCTTGGCACAGGTGTCCACCCAATTGTGGTTGTGCAGCCAGATGCCTGGAC +AGAGGACAATGGCTTCCATGCAATTGGGCAGATGTGTGAGGCACCTGTGGTGACCCGAGAGTGGGTGTTGGACAGTGTAGCACTCTACCAGTGCCAGGAG +CTGGACACCTACCTGATACCCCAGATCCCCCACAGCCACTACTGACTGCAGCCAGCCACAGGTACAGAGCCACAGGACCCCAAGAATGAGCTTACAAAGT +GGCCTTTCCAGGCCCTGGGAGCTCCTCTCACTCTTCAGTCCTTCTACTGTCCTGGCTACTAAATATTTTATGTACATCAGCCTGAAAAGGACTTCTGGCT +ATGCAAGGGTCCCTTAAAGATTTTCTGCTTGAAGTCTCCCTTGGAAATCTGCCATGAGCACAAAATTATGGTAATTTTTCACCTGAGAAGATTTTAAAAC 
+CATTTAAACGCCACCAATTGAGCAAGATGCTGATTCATTATTTATCAGCCCTATTCTTTCTATTCAGGCTGTTGTTGGCTTAGGGCTGGAAGCACAGAGT +GGCTTGGCCTCAAGAGAATAGCTGGTTTCCCTAAGTTTACTTCTCTAAAACCCTGTGTTCACAAAGGCAGAGAGTCAGACCCTTCAATGGAAGGAGAGTG +CTTGGGATCGATTATGTGACTTAAAGTCAGAATAGTCCTTGGGCAGTTCTCAAATGTTGGAGTGGAACATTGGGGAGGAAATTCTGAGGCAGGTATTAGA +AATGAAAAGGAAACTTGAAACCTGGGCATGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCAAGGTGGGCAGATCACTGGAGGTCAGGAGTTCG +AAACCAGCCTGGCCAACATGGTGAAACCCCATCTCTACTAAAAATACAGAAATTAGCCGGTCATGGTGGTGGACACCTGTAATCCCAGCTACTCAGGTGG +CTAAGGCAGGAGAATCACTTCAGCCCGGGAGGTGGAGGTTGCAGTGAGCCAAGATCATACCACGGCACTCCAGCCTGGGTGACAGTGAGACTGTGGCTCA +AAAAAAAAAAAAAAAAAAGGAAAATGAAACTAGAAGAGATTTCTAAAAGTCTGAGATATATTTGCTAGATTTCTAAAGAATGTGTTCTAAAACAGCAGAA +GATTTTCAAGAACCGGTTTCCAAAGACAGTCTTCTAATTCCTCATTAGTAATAAGTAAAATGTTTATTGTTGTAGCTCTGGTATATAATCCATTCCTCTT +AAAATATAAGACCTCTGGCATGAATATTTCATATCTATAAAATGACAGATCCCACCAGGAAGGAAGCTGTTGCTTTCTTTGAGGTGATTTTTTTCCTTTG +CTCCCTGTTGCTGAAACCATACAGCTTCATAAATAATTTTGCTTGCTGAAGGAAGAAAAAGTGTTTTTCATAAACCCATTATCCAGGACTGTTTATAGCT +GTTGGAAGGACTAGGTCTTCCCTAGCCCCCCCAGTGTGCAAGGGCAGTGAAGACTTGATTGTACAAAATACGTTTTGTAAATGTTGTGCTGTTAACACTG +CAAATAAACTTGGTAGCAAACACTTCCA +>NM_007299.4 +GCTGAGACTTCCTGGACGGGGGACAGGCTGTGGGGTTTCTCAGATAACTGGGCCCCTGCGCTCAGGAGGCCTTCACCCTCTGCTCTGGTTCATTGGAACA +GAAAGAAATGGATTTATCTGCTCTTCGCGTTGAAGAAGTACAAAATGTCATTAATGCTATGCAGAAAATCTTAGAGTGTCCCATCTGTCTGGAGTTGATC +AAGGAACCTGTCTCCACAAAGTGTGACCACATATTTTGCAAATTTTGCATGCTGAAACTTCTCAACCAGAAGAAAGGGCCTTCACAGTGTCCTTTATGTA +AGAATGATATAACCAAAAGGAGCCTACAAGAAAGTACGAGATTTAGTCAACTTGTTGAAGAGCTATTGAAAATCATTTGTGCTTTTCAGCTTGACACAGG +TTTGGAGTATGCAAACAGCTATAATTTTGCAAAAAAGGAAAATAACTCTCCTGAACATCTAAAAGATGAAGTTTCTATCATCCAAAGTATGGGCTACAGA +AACCGTGCCAAAAGACTTCTACAGAGTGAACCCGAAAATCCTTCCTTGCAGGAAACCAGTCTCAGTGTCCAACTCTCTAACCTTGGAACTGTGAGAACTC +TGAGGACAAAGCAGCGGATACAACCTCAAAAGACGTCTGTCTACATTGAATTGGGATCTGATTCTTCTGAAGATACCGTTAATAAGGCAACTTATTGCAG +TGTGGGAGATCAAGAATTGTTACAAATCACCCCTCAAGGAACCAGGGATGAAATCAGTTTGGATTCTGCAAAAAAGGCTGCTTGTGAATTTTCTGAGACG 
+GATGTAACAAATACTGAACATCATCAACCCAGTAATAATGATTTGAACACCACTGAGAAGCGTGCAGCTGAGAGGCATCCAGAAAAGTATCAGGGTGAAG +CAGCATCTGGGTGTGAGAGTGAAACAAGCGTCTCTGAAGACTGCTCAGGGCTATCCTCTCAGAGTGACATTTTAACCACTCAGCAGAGGGATACCATGCA +ACATAACCTGATAAAGCTCCAGCAGGAAATGGCTGAACTAGAAGCTGTGTTAGAACAGCATGGGAGCCAGCCTTCTAACAGCTACCCTTCCATCATAAGT +GACTCTTCTGCCCTTGAGGACCTGCGAAATCCAGAACAAAGCACATCAGAAAAAGTATTAACTTCACAGAAAAGTAGTGAATACCCTATAAGCCAGAATC +CAGAAGGCCTTTCTGCTGACAAGTTTGAGGTGTCTGCAGATAGTTCTACCAGTAAAAATAAAGAACCAGGAGTGGAAAGGTCATCCCCTTCTAAATGCCC +ATCATTAGATGATAGGTGGTACATGCACAGTTGCTCTGGGAGTCTTCAGAATAGAAACTACCCATCTCAAGAGGAGCTCATTAAGGTTGTTGATGTGGAG +GAGCAACAGCTGGAAGAGTCTGGGCCACACGATTTGACGGAAACATCTTACTTGCCAAGGCAAGATCTAGAGGGAACCCCTTACCTGGAATCTGGAATCA +GCCTCTTCTCTGATGACCCTGAATCTGATCCTTCTGAAGACAGAGCCCCAGAGTCAGCTCGTGTTGGCAACATACCATCTTCAACCTCTGCATTGAAAGT +TCCCCAATTGAAAGTTGCAGAATCTGCCCAGAGTCCAGCTGCTGCTCATACTACTGATACTGCTGGGTATAATGCAATGGAAGAAAGTGTGAGCAGGGAG +AAGCCAGAATTGACAGCTTCAACAGAAAGGGTCAACAAAAGAATGTCCATGGTGGTGTCTGGCCTGACCCCAGAAGAATTTATGCTCGTGTACAAGTTTG +CCAGAAAACACCACATCACTTTAACTAATCTAATTACTGAAGAGACTACTCATGTTGTTATGAAAACAGATGCTGAGTTTGTGTGTGAACGGACACTGAA +ATATTTTCTAGGAATTGCGGGAGGAAAATGGGTAGTTAGCTATTTCTGGGTGACCCAGTCTATTAAAGAAAGAAAAATGCTGAATGAGCATGATTTTGAA +GTCAGAGGAGATGTGGTCAATGGAAGAAACCACCAAGGTCCAAAGCGAGCAAGAGAATCCCAGGACAGAAAGATCTTCAGGGGGCTAGAAATCTGTTGCT +ATGGGCCCTTCACCAACATGCCCACAGGGTGTCCACCCAATTGTGGTTGTGCAGCCAGATGCCTGGACAGAGGACAATGGCTTCCATGCAATTGGGCAGA +TGTGTGAGGCACCTGTGGTGACCCGAGAGTGGGTGTTGGACAGTGTAGCACTCTACCAGTGCCAGGAGCTGGACACCTACCTGATACCCCAGATCCCCCA +CAGCCACTACTGACTGCAGCCAGCCACAGGTACAGAGCCACAGGACCCCAAGAATGAGCTTACAAAGTGGCCTTTCCAGGCCCTGGGAGCTCCTCTCACT +CTTCAGTCCTTCTACTGTCCTGGCTACTAAATATTTTATGTACATCAGCCTGAAAAGGACTTCTGGCTATGCAAGGGTCCCTTAAAGATTTTCTGCTTGA +AGTCTCCCTTGGAAATCTGCCATGAGCACAAAATTATGGTAATTTTTCACCTGAGAAGATTTTAAAACCATTTAAACGCCACCAATTGAGCAAGATGCTG +ATTCATTATTTATCAGCCCTATTCTTTCTATTCAGGCTGTTGTTGGCTTAGGGCTGGAAGCACAGAGTGGCTTGGCCTCAAGAGAATAGCTGGTTTCCCT 
+AAGTTTACTTCTCTAAAACCCTGTGTTCACAAAGGCAGAGAGTCAGACCCTTCAATGGAAGGAGAGTGCTTGGGATCGATTATGTGACTTAAAGTCAGAA +TAGTCCTTGGGCAGTTCTCAAATGTTGGAGTGGAACATTGGGGAGGAAATTCTGAGGCAGGTATTAGAAATGAAAAGGAAACTTGAAACCTGGGCATGGT +GGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCAAGGTGGGCAGATCACTGGAGGTCAGGAGTTCGAAACCAGCCTGGCCAACATGGTGAAACCCCAT +CTCTACTAAAAATACAGAAATTAGCCGGTCATGGTGGTGGACACCTGTAATCCCAGCTACTCAGGTGGCTAAGGCAGGAGAATCACTTCAGCCCGGGAGG +TGGAGGTTGCAGTGAGCCAAGATCATACCACGGCACTCCAGCCTGGGTGACAGTGAGACTGTGGCTCAAAAAAAAAAAAAAAAAAAGGAAAATGAAACTA +GAAGAGATTTCTAAAAGTCTGAGATATATTTGCTAGATTTCTAAAGAATGTGTTCTAAAACAGCAGAAGATTTTCAAGAACCGGTTTCCAAAGACAGTCT +TCTAATTCCTCATTAGTAATAAGTAAAATGTTTATTGTTGTAGCTCTGGTATATAATCCATTCCTCTTAAAATATAAGACCTCTGGCATGAATATTTCAT +ATCTATAAAATGACAGATCCCACCAGGAAGGAAGCTGTTGCTTTCTTTGAGGTGATTTTTTTCCTTTGCTCCCTGTTGCTGAAACCATACAGCTTCATAA +ATAATTTTGCTTGCTGAAGGAAGAAAAAGTGTTTTTCATAAACCCATTATCCAGGACTGTTTATAGCTGTTGGAAGGACTAGGTCTTCCCTAGCCCCCCC +AGTGTGCAAGGGCAGTGAAGACTTGATTGTACAAAATACGTTTTGTAAATGTTGTGCTGTTAACACTGCAAATAAACTTGGTAGCAAACACTTCCA diff --git a/tests/data/db/create/txs/cdot-0.2.12.refseq.grch37_grch38.brca1.json b/tests/data/db/create/txs/cdot-0.2.12.refseq.grch37_grch38.brca1.json new file mode 100644 index 00000000..9f7cc3d4 --- /dev/null +++ b/tests/data/db/create/txs/cdot-0.2.12.refseq.grch37_grch38.brca1.json @@ -0,0 +1,6986 @@ +{ + "transcripts": { + "NM_007294.3": { + "biotype": [ + "protein_coding" + ], + "gene_name": "BRCA1", + "gene_version": "672", + "genome_builds": { + "GRCh37": { + "cds_end": 41276113, + "cds_start": 41197694, + "contig": "NC_000017.10", + "exons": [ + [ + 41196311, + 41197819, + 22, + 5700, + 7207, + null + ], + [ + 41199659, + 41199720, + 21, + 5639, + 5699, + null + ], + [ + 41201137, + 41201211, + 20, + 5565, + 5638, + null + ], + [ + 41203079, + 41203134, + 19, + 5510, + 5564, + null + ], + [ + 41209068, + 41209152, + 18, + 5426, + 5509, + null + ], + [ + 41215349, + 41215390, + 17, + 5385, + 5425, + null + ], + [ + 41215890, + 41215968, + 16, + 5307, + 5384, + 
null + ], + [ + 41219624, + 41219712, + 15, + 5219, + 5306, + null + ], + [ + 41222944, + 41223255, + 14, + 4908, + 5218, + null + ], + [ + 41226347, + 41226538, + 13, + 4717, + 4907, + null + ], + [ + 41228504, + 41228631, + 12, + 4590, + 4716, + null + ], + [ + 41234420, + 41234592, + 11, + 4418, + 4589, + null + ], + [ + 41242960, + 41243049, + 10, + 4329, + 4417, + null + ], + [ + 41243451, + 41246877, + 9, + 903, + 4328, + null + ], + [ + 41247862, + 41247939, + 8, + 826, + 902, + null + ], + [ + 41249260, + 41249306, + 7, + 780, + 825, + null + ], + [ + 41251791, + 41251897, + 6, + 674, + 779, + null + ], + [ + 41256138, + 41256278, + 5, + 534, + 673, + null + ], + [ + 41256884, + 41256973, + 4, + 445, + 533, + null + ], + [ + 41258472, + 41258550, + 3, + 367, + 444, + null + ], + [ + 41267742, + 41267796, + 2, + 313, + 366, + null + ], + [ + 41276033, + 41276132, + 1, + 214, + 312, + null + ], + [ + 41277287, + 41277500, + 0, + 1, + 213, + null + ] + ], + "strand": "-", + "url": "http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.105/GFF/ref_GRCh37.p13_top_level.gff3.gz" + }, + "GRCh38": { + "cds_end": 43124096, + "cds_start": 43045677, + "contig": "NC_000017.11", + "exons": [ + [ + 43044294, + 43045802, + 22, + 5700, + 7207, + null + ], + [ + 43047642, + 43047703, + 21, + 5639, + 5699, + null + ], + [ + 43049120, + 43049194, + 20, + 5565, + 5638, + null + ], + [ + 43051062, + 43051117, + 19, + 5510, + 5564, + null + ], + [ + 43057051, + 43057135, + 18, + 5426, + 5509, + null + ], + [ + 43063332, + 43063373, + 17, + 5385, + 5425, + null + ], + [ + 43063873, + 43063951, + 16, + 5307, + 5384, + null + ], + [ + 43067607, + 43067695, + 15, + 5219, + 5306, + null + ], + [ + 43070927, + 43071238, + 14, + 4908, + 5218, + null + ], + [ + 43074330, + 43074521, + 13, + 4717, + 4907, + null + ], + [ + 43076487, + 43076614, + 12, + 4590, + 4716, + null + ], + [ + 43082403, + 43082575, + 11, + 4418, + 4589, + null + ], + [ + 
43090943, + 43091032, + 10, + 4329, + 4417, + null + ], + [ + 43091434, + 43094860, + 9, + 903, + 4328, + null + ], + [ + 43095845, + 43095922, + 8, + 826, + 902, + null + ], + [ + 43097243, + 43097289, + 7, + 780, + 825, + null + ], + [ + 43099774, + 43099880, + 6, + 674, + 779, + null + ], + [ + 43104121, + 43104261, + 5, + 534, + 673, + null + ], + [ + 43104867, + 43104956, + 4, + 445, + 533, + null + ], + [ + 43106455, + 43106533, + 3, + 367, + 444, + null + ], + [ + 43115725, + 43115779, + 2, + 313, + 366, + null + ], + [ + 43124016, + 43124115, + 1, + 214, + 312, + null + ], + [ + 43125270, + 43125483, + 0, + 1, + 213, + null + ] + ], + "strand": "-", + "url": "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20190607/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.gff.gz" + } + }, + "hgnc": "1100", + "id": "NM_007294.3", + "protein": "NP_009225.1", + "start_codon": 232, + "stop_codon": 5824, + "tag": "RefSeq Select" + }, + "NM_007294.4": { + "biotype": [ + "protein_coding" + ], + "gene_name": "BRCA1", + "gene_version": "672", + "genome_builds": { + "GRCh37": { + "cds_end": 41276113, + "cds_start": 41197694, + "contig": "NC_000017.10", + "exons": [ + [ + 41196311, + 41197819, + 22, + 5581, + 7088, + null + ], + [ + 41199659, + 41199720, + 21, + 5520, + 5580, + null + ], + [ + 41201137, + 41201211, + 20, + 5446, + 5519, + null + ], + [ + 41203079, + 41203134, + 19, + 5391, + 5445, + null + ], + [ + 41209068, + 41209152, + 18, + 5307, + 5390, + null + ], + [ + 41215349, + 41215390, + 17, + 5266, + 5306, + null + ], + [ + 41215890, + 41215968, + 16, + 5188, + 5265, + null + ], + [ + 41219624, + 41219712, + 15, + 5100, + 5187, + null + ], + [ + 41222944, + 41223255, + 14, + 4789, + 5099, + null + ], + [ + 41226347, + 41226538, + 13, + 4598, + 4788, + null + ], + [ + 41228504, + 41228631, + 12, + 4471, + 4597, + null + ], + [ + 41234420, + 41234592, + 11, + 4299, + 4470, + null + ], + [ + 41242960, + 41243049, + 10, + 4210, + 
4298, + null + ], + [ + 41243451, + 41246877, + 9, + 784, + 4209, + null + ], + [ + 41247862, + 41247939, + 8, + 707, + 783, + null + ], + [ + 41249260, + 41249306, + 7, + 661, + 706, + null + ], + [ + 41251791, + 41251897, + 6, + 555, + 660, + null + ], + [ + 41256138, + 41256278, + 5, + 415, + 554, + null + ], + [ + 41256884, + 41256973, + 4, + 326, + 414, + null + ], + [ + 41258472, + 41258550, + 3, + 248, + 325, + null + ], + [ + 41267742, + 41267796, + 2, + 194, + 247, + null + ], + [ + 41276033, + 41276132, + 1, + 95, + 193, + null + ], + [ + 41277287, + 41277381, + 0, + 1, + 94, + null + ] + ], + "strand": "-", + "url": "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/105.20220307/GCF_000001405.25_GRCh37.p13/GCF_000001405.25_GRCh37.p13_genomic.gff.gz" + }, + "GRCh38": { + "cds_end": 43124096, + "cds_start": 43045677, + "contig": "NC_000017.11", + "exons": [ + [ + 43044294, + 43045802, + 22, + 5581, + 7088, + null + ], + [ + 43047642, + 43047703, + 21, + 5520, + 5580, + null + ], + [ + 43049120, + 43049194, + 20, + 5446, + 5519, + null + ], + [ + 43051062, + 43051117, + 19, + 5391, + 5445, + null + ], + [ + 43057051, + 43057135, + 18, + 5307, + 5390, + null + ], + [ + 43063332, + 43063373, + 17, + 5266, + 5306, + null + ], + [ + 43063873, + 43063951, + 16, + 5188, + 5265, + null + ], + [ + 43067607, + 43067695, + 15, + 5100, + 5187, + null + ], + [ + 43070927, + 43071238, + 14, + 4789, + 5099, + null + ], + [ + 43074330, + 43074521, + 13, + 4598, + 4788, + null + ], + [ + 43076487, + 43076614, + 12, + 4471, + 4597, + null + ], + [ + 43082403, + 43082575, + 11, + 4299, + 4470, + null + ], + [ + 43090943, + 43091032, + 10, + 4210, + 4298, + null + ], + [ + 43091434, + 43094860, + 9, + 784, + 4209, + null + ], + [ + 43095845, + 43095922, + 8, + 707, + 783, + null + ], + [ + 43097243, + 43097289, + 7, + 661, + 706, + null + ], + [ + 43099774, + 43099880, + 6, + 555, + 660, + null + ], + [ + 43104121, + 43104261, + 5, + 415, + 554, + null + ], + 
[ + 43104867, + 43104956, + 4, + 326, + 414, + null + ], + [ + 43106455, + 43106533, + 3, + 248, + 325, + null + ], + [ + 43115725, + 43115779, + 2, + 194, + 247, + null + ], + [ + 43124016, + 43124115, + 1, + 95, + 193, + null + ], + [ + 43125270, + 43125364, + 0, + 1, + 94, + null + ] + ], + "strand": "-", + "url": "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/110/GCF_000001405.40_GRCh38.p14/GCF_000001405.40_GRCh38.p14_genomic.gff.gz" + } + }, + "hgnc": "1100", + "id": "NM_007294.4", + "protein": "NP_009225.1", + "start_codon": 113, + "stop_codon": 5705, + "tag": "MANE Select" + }, + "NM_007297.3": { + "biotype": [ + "protein_coding" + ], + "gene_name": "BRCA1", + "gene_version": "672", + "genome_builds": { + "GRCh37": { + "cds_end": 41258543, + "cds_start": 41197694, + "contig": "NC_000017.10", + "exons": [ + [ + 41196311, + 41197819, + 21, + 5608, + 7115, + null + ], + [ + 41199659, + 41199720, + 20, + 5547, + 5607, + null + ], + [ + 41201137, + 41201211, + 19, + 5473, + 5546, + null + ], + [ + 41203079, + 41203134, + 18, + 5418, + 5472, + null + ], + [ + 41209068, + 41209152, + 17, + 5334, + 5417, + null + ], + [ + 41215349, + 41215390, + 16, + 5293, + 5333, + null + ], + [ + 41215890, + 41215968, + 15, + 5215, + 5292, + null + ], + [ + 41219624, + 41219712, + 14, + 5127, + 5214, + null + ], + [ + 41222944, + 41223255, + 13, + 4816, + 5126, + null + ], + [ + 41226347, + 41226538, + 12, + 4625, + 4815, + null + ], + [ + 41228504, + 41228631, + 11, + 4498, + 4624, + null + ], + [ + 41234420, + 41234592, + 10, + 4326, + 4497, + null + ], + [ + 41242960, + 41243049, + 9, + 4237, + 4325, + null + ], + [ + 41243451, + 41246877, + 8, + 811, + 4236, + null + ], + [ + 41247862, + 41247939, + 7, + 734, + 810, + null + ], + [ + 41249260, + 41249306, + 6, + 688, + 733, + null + ], + [ + 41251791, + 41251897, + 5, + 582, + 687, + null + ], + [ + 41256138, + 41256278, + 4, + 442, + 581, + null + ], + [ + 41256884, + 41256973, + 3, + 353, + 441, + null + 
], + [ + 41258472, + 41258550, + 2, + 275, + 352, + null + ], + [ + 41276033, + 41276132, + 1, + 176, + 274, + null + ], + [ + 41277293, + 41277468, + 0, + 1, + 175, + null + ] + ], + "strand": "-", + "url": "http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.105/GFF/ref_GRCh37.p13_top_level.gff3.gz" + }, + "GRCh38": { + "cds_end": 43106526, + "cds_start": 43045677, + "contig": "NC_000017.11", + "exons": [ + [ + 43044294, + 43045802, + 21, + 5608, + 7115, + null + ], + [ + 43047642, + 43047703, + 20, + 5547, + 5607, + null + ], + [ + 43049120, + 43049194, + 19, + 5473, + 5546, + null + ], + [ + 43051062, + 43051117, + 18, + 5418, + 5472, + null + ], + [ + 43057051, + 43057135, + 17, + 5334, + 5417, + null + ], + [ + 43063332, + 43063373, + 16, + 5293, + 5333, + null + ], + [ + 43063873, + 43063951, + 15, + 5215, + 5292, + null + ], + [ + 43067607, + 43067695, + 14, + 5127, + 5214, + null + ], + [ + 43070927, + 43071238, + 13, + 4816, + 5126, + null + ], + [ + 43074330, + 43074521, + 12, + 4625, + 4815, + null + ], + [ + 43076487, + 43076614, + 11, + 4498, + 4624, + null + ], + [ + 43082403, + 43082575, + 10, + 4326, + 4497, + null + ], + [ + 43090943, + 43091032, + 9, + 4237, + 4325, + null + ], + [ + 43091434, + 43094860, + 8, + 811, + 4236, + null + ], + [ + 43095845, + 43095922, + 7, + 734, + 810, + null + ], + [ + 43097243, + 43097289, + 6, + 688, + 733, + null + ], + [ + 43099774, + 43099880, + 5, + 582, + 687, + null + ], + [ + 43104121, + 43104261, + 4, + 442, + 581, + null + ], + [ + 43104867, + 43104956, + 3, + 353, + 441, + null + ], + [ + 43106455, + 43106533, + 2, + 275, + 352, + null + ], + [ + 43124016, + 43124115, + 1, + 176, + 274, + null + ], + [ + 43125276, + 43125451, + 0, + 1, + 175, + null + ] + ], + "strand": "-", + "url": "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109/GCF_000001405.38_GRCh38.p12/GCF_000001405.38_GRCh38.p12_genomic.gff.gz" + } + }, + "hgnc": "1100", + "id": 
"NM_007297.3", + "protein": "NP_009228.2", + "start_codon": 281, + "stop_codon": 5732 + }, + "NM_007297.4": { + "biotype": [ + "protein_coding" + ], + "gene_name": "BRCA1", + "gene_version": "672", + "genome_builds": { + "GRCh37": { + "cds_end": 41258543, + "cds_start": 41197694, + "contig": "NC_000017.10", + "exons": [ + [ + 41196311, + 41197819, + 21, + 5521, + 7028, + null + ], + [ + 41199659, + 41199720, + 20, + 5460, + 5520, + null + ], + [ + 41201137, + 41201211, + 19, + 5386, + 5459, + null + ], + [ + 41203079, + 41203134, + 18, + 5331, + 5385, + null + ], + [ + 41209068, + 41209152, + 17, + 5247, + 5330, + null + ], + [ + 41215349, + 41215390, + 16, + 5206, + 5246, + null + ], + [ + 41215890, + 41215968, + 15, + 5128, + 5205, + null + ], + [ + 41219624, + 41219712, + 14, + 5040, + 5127, + null + ], + [ + 41222944, + 41223255, + 13, + 4729, + 5039, + null + ], + [ + 41226347, + 41226538, + 12, + 4538, + 4728, + null + ], + [ + 41228504, + 41228631, + 11, + 4411, + 4537, + null + ], + [ + 41234420, + 41234592, + 10, + 4239, + 4410, + null + ], + [ + 41242960, + 41243049, + 9, + 4150, + 4238, + null + ], + [ + 41243451, + 41246877, + 8, + 724, + 4149, + null + ], + [ + 41247862, + 41247939, + 7, + 647, + 723, + null + ], + [ + 41249260, + 41249306, + 6, + 601, + 646, + null + ], + [ + 41251791, + 41251897, + 5, + 495, + 600, + null + ], + [ + 41256138, + 41256278, + 4, + 355, + 494, + null + ], + [ + 41256884, + 41256973, + 3, + 266, + 354, + null + ], + [ + 41258472, + 41258550, + 2, + 188, + 265, + null + ], + [ + 41276033, + 41276132, + 1, + 89, + 187, + null + ], + [ + 41277293, + 41277381, + 0, + 1, + 88, + null + ] + ], + "strand": "-", + "url": "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/105.20220307/GCF_000001405.25_GRCh37.p13/GCF_000001405.25_GRCh37.p13_genomic.gff.gz" + }, + "GRCh38": { + "cds_end": 43106526, + "cds_start": 43045677, + "contig": "NC_000017.11", + "exons": [ + [ + 43044294, + 43045802, + 21, + 5521, + 7028, + 
null + ], + [ + 43047642, + 43047703, + 20, + 5460, + 5520, + null + ], + [ + 43049120, + 43049194, + 19, + 5386, + 5459, + null + ], + [ + 43051062, + 43051117, + 18, + 5331, + 5385, + null + ], + [ + 43057051, + 43057135, + 17, + 5247, + 5330, + null + ], + [ + 43063332, + 43063373, + 16, + 5206, + 5246, + null + ], + [ + 43063873, + 43063951, + 15, + 5128, + 5205, + null + ], + [ + 43067607, + 43067695, + 14, + 5040, + 5127, + null + ], + [ + 43070927, + 43071238, + 13, + 4729, + 5039, + null + ], + [ + 43074330, + 43074521, + 12, + 4538, + 4728, + null + ], + [ + 43076487, + 43076614, + 11, + 4411, + 4537, + null + ], + [ + 43082403, + 43082575, + 10, + 4239, + 4410, + null + ], + [ + 43090943, + 43091032, + 9, + 4150, + 4238, + null + ], + [ + 43091434, + 43094860, + 8, + 724, + 4149, + null + ], + [ + 43095845, + 43095922, + 7, + 647, + 723, + null + ], + [ + 43097243, + 43097289, + 6, + 601, + 646, + null + ], + [ + 43099774, + 43099880, + 5, + 495, + 600, + null + ], + [ + 43104121, + 43104261, + 4, + 355, + 494, + null + ], + [ + 43104867, + 43104956, + 3, + 266, + 354, + null + ], + [ + 43106455, + 43106533, + 2, + 188, + 265, + null + ], + [ + 43124016, + 43124115, + 1, + 89, + 187, + null + ], + [ + 43125276, + 43125364, + 0, + 1, + 88, + null + ] + ], + "strand": "-", + "url": "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/110/GCF_000001405.40_GRCh38.p14/GCF_000001405.40_GRCh38.p14_genomic.gff.gz" + } + }, + "hgnc": "1100", + "id": "NM_007297.4", + "protein": "NP_009228.2", + "start_codon": 194, + "stop_codon": 5645 + }, + "NM_007298.3": { + "biotype": [ + "protein_coding" + ], + "gene_name": "BRCA1", + "gene_version": "672", + "genome_builds": { + "GRCh37": { + "cds_end": 41276113, + "cds_start": 41197694, + "contig": "NC_000017.10", + "exons": [ + [ + 41196311, + 41197819, + 21, + 2175, + 3682, + null + ], + [ + 41199659, + 41199720, + 20, + 2114, + 2174, + null + ], + [ + 41201137, + 41201211, + 19, + 2040, + 2113, + null + ], + 
[ + 41203079, + 41203134, + 18, + 1985, + 2039, + null + ], + [ + 41209068, + 41209152, + 17, + 1901, + 1984, + null + ], + [ + 41215349, + 41215390, + 16, + 1860, + 1900, + null + ], + [ + 41215890, + 41215968, + 15, + 1782, + 1859, + null + ], + [ + 41219624, + 41219712, + 14, + 1694, + 1781, + null + ], + [ + 41222944, + 41223255, + 13, + 1383, + 1693, + null + ], + [ + 41226347, + 41226538, + 12, + 1192, + 1382, + null + ], + [ + 41228504, + 41228628, + 11, + 1068, + 1191, + null + ], + [ + 41234420, + 41234592, + 10, + 896, + 1067, + null + ], + [ + 41242960, + 41243049, + 9, + 807, + 895, + null + ], + [ + 41246760, + 41246877, + 8, + 690, + 806, + null + ], + [ + 41247862, + 41247939, + 7, + 613, + 689, + null + ], + [ + 41249260, + 41249306, + 6, + 567, + 612, + null + ], + [ + 41251791, + 41251897, + 5, + 461, + 566, + null + ], + [ + 41256138, + 41256278, + 4, + 321, + 460, + null + ], + [ + 41256884, + 41256973, + 3, + 232, + 320, + null + ], + [ + 41258472, + 41258550, + 2, + 154, + 231, + null + ], + [ + 41267742, + 41267796, + 1, + 100, + 153, + null + ], + [ + 41276033, + 41276132, + 0, + 1, + 99, + null + ] + ], + "strand": "-", + "url": "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/105.20220307/GCF_000001405.25_GRCh37.p13/GCF_000001405.25_GRCh37.p13_genomic.gff.gz" + }, + "GRCh38": { + "cds_end": 43124096, + "cds_start": 43045677, + "contig": "NC_000017.11", + "exons": [ + [ + 43044294, + 43045802, + 21, + 2175, + 3682, + null + ], + [ + 43047642, + 43047703, + 20, + 2114, + 2174, + null + ], + [ + 43049120, + 43049194, + 19, + 2040, + 2113, + null + ], + [ + 43051062, + 43051117, + 18, + 1985, + 2039, + null + ], + [ + 43057051, + 43057135, + 17, + 1901, + 1984, + null + ], + [ + 43063332, + 43063373, + 16, + 1860, + 1900, + null + ], + [ + 43063873, + 43063951, + 15, + 1782, + 1859, + null + ], + [ + 43067607, + 43067695, + 14, + 1694, + 1781, + null + ], + [ + 43070927, + 43071238, + 13, + 1383, + 1693, + null + ], + [ + 
43074330, + 43074521, + 12, + 1192, + 1382, + null + ], + [ + 43076487, + 43076611, + 11, + 1068, + 1191, + null + ], + [ + 43082403, + 43082575, + 10, + 896, + 1067, + null + ], + [ + 43090943, + 43091032, + 9, + 807, + 895, + null + ], + [ + 43094743, + 43094860, + 8, + 690, + 806, + null + ], + [ + 43095845, + 43095922, + 7, + 613, + 689, + null + ], + [ + 43097243, + 43097289, + 6, + 567, + 612, + null + ], + [ + 43099774, + 43099880, + 5, + 461, + 566, + null + ], + [ + 43104121, + 43104261, + 4, + 321, + 460, + null + ], + [ + 43104867, + 43104956, + 3, + 232, + 320, + null + ], + [ + 43106455, + 43106533, + 2, + 154, + 231, + null + ], + [ + 43115725, + 43115779, + 1, + 100, + 153, + null + ], + [ + 43124016, + 43124115, + 0, + 1, + 99, + null + ] + ], + "strand": "-", + "url": "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/110/GCF_000001405.40_GRCh38.p14/GCF_000001405.40_GRCh38.p14_genomic.gff.gz" + } + }, + "hgnc": "1100", + "id": "NM_007298.3", + "protein": "NP_009229.2", + "start_codon": 19, + "stop_codon": 2299 + }, + "NM_007299.3": { + "biotype": [ + "protein_coding" + ], + "gene_name": "BRCA1", + "gene_version": "672", + "genome_builds": { + "GRCh37": { + "cds_end": 41276113, + "cds_start": 41197800, + "contig": "NC_000017.10", + "exons": [ + [ + 41196311, + 41197819, + 21, + 2276, + 3783, + null + ], + [ + 41199659, + 41199720, + 20, + 2215, + 2275, + null + ], + [ + 41203079, + 41203134, + 19, + 2160, + 2214, + null + ], + [ + 41209068, + 41209152, + 18, + 2076, + 2159, + null + ], + [ + 41215349, + 41215390, + 17, + 2035, + 2075, + null + ], + [ + 41215890, + 41215968, + 16, + 1957, + 2034, + null + ], + [ + 41219624, + 41219712, + 15, + 1869, + 1956, + null + ], + [ + 41222944, + 41223255, + 14, + 1558, + 1868, + null + ], + [ + 41226347, + 41226538, + 13, + 1367, + 1557, + null + ], + [ + 41228504, + 41228628, + 12, + 1243, + 1366, + null + ], + [ + 41234420, + 41234592, + 11, + 1071, + 1242, + null + ], + [ + 41242960, + 
41243049, + 10, + 982, + 1070, + null + ], + [ + 41246760, + 41246877, + 9, + 865, + 981, + null + ], + [ + 41247862, + 41247939, + 8, + 788, + 864, + null + ], + [ + 41249260, + 41249306, + 7, + 742, + 787, + null + ], + [ + 41251791, + 41251897, + 6, + 636, + 741, + null + ], + [ + 41256138, + 41256278, + 5, + 496, + 635, + null + ], + [ + 41256884, + 41256973, + 4, + 407, + 495, + null + ], + [ + 41258472, + 41258550, + 3, + 329, + 406, + null + ], + [ + 41267742, + 41267796, + 2, + 275, + 328, + null + ], + [ + 41276033, + 41276132, + 1, + 176, + 274, + null + ], + [ + 41277293, + 41277468, + 0, + 1, + 175, + null + ] + ], + "strand": "-", + "url": "http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.105/GFF/ref_GRCh37.p13_top_level.gff3.gz" + }, + "GRCh38": { + "cds_end": 43124096, + "cds_start": 43045783, + "contig": "NC_000017.11", + "exons": [ + [ + 43044294, + 43045802, + 21, + 2276, + 3783, + null + ], + [ + 43047642, + 43047703, + 20, + 2215, + 2275, + null + ], + [ + 43051062, + 43051117, + 19, + 2160, + 2214, + null + ], + [ + 43057051, + 43057135, + 18, + 2076, + 2159, + null + ], + [ + 43063332, + 43063373, + 17, + 2035, + 2075, + null + ], + [ + 43063873, + 43063951, + 16, + 1957, + 2034, + null + ], + [ + 43067607, + 43067695, + 15, + 1869, + 1956, + null + ], + [ + 43070927, + 43071238, + 14, + 1558, + 1868, + null + ], + [ + 43074330, + 43074521, + 13, + 1367, + 1557, + null + ], + [ + 43076487, + 43076611, + 12, + 1243, + 1366, + null + ], + [ + 43082403, + 43082575, + 11, + 1071, + 1242, + null + ], + [ + 43090943, + 43091032, + 10, + 982, + 1070, + null + ], + [ + 43094743, + 43094860, + 9, + 865, + 981, + null + ], + [ + 43095845, + 43095922, + 8, + 788, + 864, + null + ], + [ + 43097243, + 43097289, + 7, + 742, + 787, + null + ], + [ + 43099774, + 43099880, + 6, + 636, + 741, + null + ], + [ + 43104121, + 43104261, + 5, + 496, + 635, + null + ], + [ + 43104867, + 43104956, + 4, + 407, + 495, + null 
+ ], + [ + 43106455, + 43106533, + 3, + 329, + 406, + null + ], + [ + 43115725, + 43115779, + 2, + 275, + 328, + null + ], + [ + 43124016, + 43124115, + 1, + 176, + 274, + null + ], + [ + 43125276, + 43125451, + 0, + 1, + 175, + null + ] + ], + "strand": "-", + "url": "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109/GCF_000001405.38_GRCh38.p12/GCF_000001405.38_GRCh38.p12_genomic.gff.gz" + } + }, + "hgnc": "1100", + "id": "NM_007299.3", + "protein": "NP_009230.2", + "start_codon": 194, + "stop_codon": 2294 + }, + "NM_007299.4": { + "biotype": [ + "protein_coding" + ], + "gene_name": "BRCA1", + "gene_version": "672", + "genome_builds": { + "GRCh37": { + "cds_end": 41276113, + "cds_start": 41197800, + "contig": "NC_000017.10", + "exons": [ + [ + 41196311, + 41197819, + 21, + 2189, + 3696, + null + ], + [ + 41199659, + 41199720, + 20, + 2128, + 2188, + null + ], + [ + 41203079, + 41203134, + 19, + 2073, + 2127, + null + ], + [ + 41209068, + 41209152, + 18, + 1989, + 2072, + null + ], + [ + 41215349, + 41215390, + 17, + 1948, + 1988, + null + ], + [ + 41215890, + 41215968, + 16, + 1870, + 1947, + null + ], + [ + 41219624, + 41219712, + 15, + 1782, + 1869, + null + ], + [ + 41222944, + 41223255, + 14, + 1471, + 1781, + null + ], + [ + 41226347, + 41226538, + 13, + 1280, + 1470, + null + ], + [ + 41228504, + 41228628, + 12, + 1156, + 1279, + null + ], + [ + 41234420, + 41234592, + 11, + 984, + 1155, + null + ], + [ + 41242960, + 41243049, + 10, + 895, + 983, + null + ], + [ + 41246760, + 41246877, + 9, + 778, + 894, + null + ], + [ + 41247862, + 41247939, + 8, + 701, + 777, + null + ], + [ + 41249260, + 41249306, + 7, + 655, + 700, + null + ], + [ + 41251791, + 41251897, + 6, + 549, + 654, + null + ], + [ + 41256138, + 41256278, + 5, + 409, + 548, + null + ], + [ + 41256884, + 41256973, + 4, + 320, + 408, + null + ], + [ + 41258472, + 41258550, + 3, + 242, + 319, + null + ], + [ + 41267742, + 41267796, + 2, + 188, + 241, + null + ], + [ + 41276033, + 
41276132, + 1, + 89, + 187, + null + ], + [ + 41277293, + 41277381, + 0, + 1, + 88, + null + ] + ], + "strand": "-", + "url": "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/105.20220307/GCF_000001405.25_GRCh37.p13/GCF_000001405.25_GRCh37.p13_genomic.gff.gz" + }, + "GRCh38": { + "cds_end": 43124096, + "cds_start": 43045783, + "contig": "NC_000017.11", + "exons": [ + [ + 43044294, + 43045802, + 21, + 2189, + 3696, + null + ], + [ + 43047642, + 43047703, + 20, + 2128, + 2188, + null + ], + [ + 43051062, + 43051117, + 19, + 2073, + 2127, + null + ], + [ + 43057051, + 43057135, + 18, + 1989, + 2072, + null + ], + [ + 43063332, + 43063373, + 17, + 1948, + 1988, + null + ], + [ + 43063873, + 43063951, + 16, + 1870, + 1947, + null + ], + [ + 43067607, + 43067695, + 15, + 1782, + 1869, + null + ], + [ + 43070927, + 43071238, + 14, + 1471, + 1781, + null + ], + [ + 43074330, + 43074521, + 13, + 1280, + 1470, + null + ], + [ + 43076487, + 43076611, + 12, + 1156, + 1279, + null + ], + [ + 43082403, + 43082575, + 11, + 984, + 1155, + null + ], + [ + 43090943, + 43091032, + 10, + 895, + 983, + null + ], + [ + 43094743, + 43094860, + 9, + 778, + 894, + null + ], + [ + 43095845, + 43095922, + 8, + 701, + 777, + null + ], + [ + 43097243, + 43097289, + 7, + 655, + 700, + null + ], + [ + 43099774, + 43099880, + 6, + 549, + 654, + null + ], + [ + 43104121, + 43104261, + 5, + 409, + 548, + null + ], + [ + 43104867, + 43104956, + 4, + 320, + 408, + null + ], + [ + 43106455, + 43106533, + 3, + 242, + 319, + null + ], + [ + 43115725, + 43115779, + 2, + 188, + 241, + null + ], + [ + 43124016, + 43124115, + 1, + 89, + 187, + null + ], + [ + 43125276, + 43125364, + 0, + 1, + 88, + null + ] + ], + "strand": "-", + "url": "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/110/GCF_000001405.40_GRCh38.p14/GCF_000001405.40_GRCh38.p14_genomic.gff.gz" + } + }, + "hgnc": "1100", + "id": "NM_007299.4", + "protein": "NP_009230.2", + "start_codon": 107, + 
"stop_codon": 2207 + }, + "NM_007300.3": { + "biotype": [ + "protein_coding" + ], + "gene_name": "BRCA1", + "gene_version": "672", + "genome_builds": { + "GRCh37": { + "cds_end": 41276113, + "cds_start": 41197694, + "contig": "NC_000017.10", + "exons": [ + [ + 41196311, + 41197819, + 23, + 5763, + 7270, + null + ], + [ + 41199659, + 41199720, + 22, + 5702, + 5762, + null + ], + [ + 41201137, + 41201211, + 21, + 5628, + 5701, + null + ], + [ + 41203079, + 41203134, + 20, + 5573, + 5627, + null + ], + [ + 41209068, + 41209152, + 19, + 5489, + 5572, + null + ], + [ + 41215349, + 41215390, + 18, + 5448, + 5488, + null + ], + [ + 41215890, + 41215968, + 17, + 5370, + 5447, + null + ], + [ + 41219624, + 41219712, + 16, + 5282, + 5369, + null + ], + [ + 41222944, + 41223255, + 15, + 4971, + 5281, + null + ], + [ + 41226347, + 41226538, + 14, + 4780, + 4970, + null + ], + [ + 41228504, + 41228628, + 13, + 4656, + 4779, + null + ], + [ + 41231350, + 41231416, + 12, + 4590, + 4655, + null + ], + [ + 41234420, + 41234592, + 11, + 4418, + 4589, + null + ], + [ + 41242960, + 41243049, + 10, + 4329, + 4417, + null + ], + [ + 41243451, + 41246877, + 9, + 903, + 4328, + null + ], + [ + 41247862, + 41247939, + 8, + 826, + 902, + null + ], + [ + 41249260, + 41249306, + 7, + 780, + 825, + null + ], + [ + 41251791, + 41251897, + 6, + 674, + 779, + null + ], + [ + 41256138, + 41256278, + 5, + 534, + 673, + null + ], + [ + 41256884, + 41256973, + 4, + 445, + 533, + null + ], + [ + 41258472, + 41258550, + 3, + 367, + 444, + null + ], + [ + 41267742, + 41267796, + 2, + 313, + 366, + null + ], + [ + 41276033, + 41276132, + 1, + 214, + 312, + null + ], + [ + 41277287, + 41277500, + 0, + 1, + 213, + null + ] + ], + "strand": "-", + "url": "http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.105/GFF/ref_GRCh37.p13_top_level.gff3.gz" + }, + "GRCh38": { + "cds_end": 43124096, + "cds_start": 43045677, + "contig": "NC_000017.11", + "exons": [ + [ + 
43044294, + 43045802, + 23, + 5763, + 7270, + null + ], + [ + 43047642, + 43047703, + 22, + 5702, + 5762, + null + ], + [ + 43049120, + 43049194, + 21, + 5628, + 5701, + null + ], + [ + 43051062, + 43051117, + 20, + 5573, + 5627, + null + ], + [ + 43057051, + 43057135, + 19, + 5489, + 5572, + null + ], + [ + 43063332, + 43063373, + 18, + 5448, + 5488, + null + ], + [ + 43063873, + 43063951, + 17, + 5370, + 5447, + null + ], + [ + 43067607, + 43067695, + 16, + 5282, + 5369, + null + ], + [ + 43070927, + 43071238, + 15, + 4971, + 5281, + null + ], + [ + 43074330, + 43074521, + 14, + 4780, + 4970, + null + ], + [ + 43076487, + 43076611, + 13, + 4656, + 4779, + null + ], + [ + 43079333, + 43079399, + 12, + 4590, + 4655, + null + ], + [ + 43082403, + 43082575, + 11, + 4418, + 4589, + null + ], + [ + 43090943, + 43091032, + 10, + 4329, + 4417, + null + ], + [ + 43091434, + 43094860, + 9, + 903, + 4328, + null + ], + [ + 43095845, + 43095922, + 8, + 826, + 902, + null + ], + [ + 43097243, + 43097289, + 7, + 780, + 825, + null + ], + [ + 43099774, + 43099880, + 6, + 674, + 779, + null + ], + [ + 43104121, + 43104261, + 5, + 534, + 673, + null + ], + [ + 43104867, + 43104956, + 4, + 445, + 533, + null + ], + [ + 43106455, + 43106533, + 3, + 367, + 444, + null + ], + [ + 43115725, + 43115779, + 2, + 313, + 366, + null + ], + [ + 43124016, + 43124115, + 1, + 214, + 312, + null + ], + [ + 43125270, + 43125483, + 0, + 1, + 213, + null + ] + ], + "strand": "-", + "url": "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109/GCF_000001405.38_GRCh38.p12/GCF_000001405.38_GRCh38.p12_genomic.gff.gz" + } + }, + "hgnc": "1100", + "id": "NM_007300.3", + "protein": "NP_009231.2", + "start_codon": 232, + "stop_codon": 5887 + }, + "NM_007300.4": { + "biotype": [ + "protein_coding" + ], + "gene_name": "BRCA1", + "gene_version": "672", + "genome_builds": { + "GRCh37": { + "cds_end": 41276113, + "cds_start": 41197694, + "contig": "NC_000017.10", + "exons": [ + [ + 41196311, + 
41197819, + 23, + 5644, + 7151, + null + ], + [ + 41199659, + 41199720, + 22, + 5583, + 5643, + null + ], + [ + 41201137, + 41201211, + 21, + 5509, + 5582, + null + ], + [ + 41203079, + 41203134, + 20, + 5454, + 5508, + null + ], + [ + 41209068, + 41209152, + 19, + 5370, + 5453, + null + ], + [ + 41215349, + 41215390, + 18, + 5329, + 5369, + null + ], + [ + 41215890, + 41215968, + 17, + 5251, + 5328, + null + ], + [ + 41219624, + 41219712, + 16, + 5163, + 5250, + null + ], + [ + 41222944, + 41223255, + 15, + 4852, + 5162, + null + ], + [ + 41226347, + 41226538, + 14, + 4661, + 4851, + null + ], + [ + 41228504, + 41228628, + 13, + 4537, + 4660, + null + ], + [ + 41231350, + 41231416, + 12, + 4471, + 4536, + null + ], + [ + 41234420, + 41234592, + 11, + 4299, + 4470, + null + ], + [ + 41242960, + 41243049, + 10, + 4210, + 4298, + null + ], + [ + 41243451, + 41246877, + 9, + 784, + 4209, + null + ], + [ + 41247862, + 41247939, + 8, + 707, + 783, + null + ], + [ + 41249260, + 41249306, + 7, + 661, + 706, + null + ], + [ + 41251791, + 41251897, + 6, + 555, + 660, + null + ], + [ + 41256138, + 41256278, + 5, + 415, + 554, + null + ], + [ + 41256884, + 41256973, + 4, + 326, + 414, + null + ], + [ + 41258472, + 41258550, + 3, + 248, + 325, + null + ], + [ + 41267742, + 41267796, + 2, + 194, + 247, + null + ], + [ + 41276033, + 41276132, + 1, + 95, + 193, + null + ], + [ + 41277287, + 41277381, + 0, + 1, + 94, + null + ] + ], + "strand": "-", + "url": "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/105.20220307/GCF_000001405.25_GRCh37.p13/GCF_000001405.25_GRCh37.p13_genomic.gff.gz" + }, + "GRCh38": { + "cds_end": 43124096, + "cds_start": 43045677, + "contig": "NC_000017.11", + "exons": [ + [ + 43044294, + 43045802, + 23, + 5644, + 7151, + null + ], + [ + 43047642, + 43047703, + 22, + 5583, + 5643, + null + ], + [ + 43049120, + 43049194, + 21, + 5509, + 5582, + null + ], + [ + 43051062, + 43051117, + 20, + 5454, + 5508, + null + ], + [ + 43057051, + 
43057135, + 19, + 5370, + 5453, + null + ], + [ + 43063332, + 43063373, + 18, + 5329, + 5369, + null + ], + [ + 43063873, + 43063951, + 17, + 5251, + 5328, + null + ], + [ + 43067607, + 43067695, + 16, + 5163, + 5250, + null + ], + [ + 43070927, + 43071238, + 15, + 4852, + 5162, + null + ], + [ + 43074330, + 43074521, + 14, + 4661, + 4851, + null + ], + [ + 43076487, + 43076611, + 13, + 4537, + 4660, + null + ], + [ + 43079333, + 43079399, + 12, + 4471, + 4536, + null + ], + [ + 43082403, + 43082575, + 11, + 4299, + 4470, + null + ], + [ + 43090943, + 43091032, + 10, + 4210, + 4298, + null + ], + [ + 43091434, + 43094860, + 9, + 784, + 4209, + null + ], + [ + 43095845, + 43095922, + 8, + 707, + 783, + null + ], + [ + 43097243, + 43097289, + 7, + 661, + 706, + null + ], + [ + 43099774, + 43099880, + 6, + 555, + 660, + null + ], + [ + 43104121, + 43104261, + 5, + 415, + 554, + null + ], + [ + 43104867, + 43104956, + 4, + 326, + 414, + null + ], + [ + 43106455, + 43106533, + 3, + 248, + 325, + null + ], + [ + 43115725, + 43115779, + 2, + 194, + 247, + null + ], + [ + 43124016, + 43124115, + 1, + 95, + 193, + null + ], + [ + 43125270, + 43125364, + 0, + 1, + 94, + null + ] + ], + "strand": "-", + "url": "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/110/GCF_000001405.40_GRCh38.p14/GCF_000001405.40_GRCh38.p14_genomic.gff.gz" + } + }, + "hgnc": "1100", + "id": "NM_007300.4", + "protein": "NP_009231.2", + "start_codon": 113, + "stop_codon": 5768 + }, + "NR_027676.1": { + "biotype": [ + "non_coding" + ], + "gene_name": "BRCA1", + "gene_version": "672", + "genome_builds": { + "GRCh37": { + "contig": "NC_000017.10", + "exons": [ + [ + 41196311, + 41197819, + 22, + 5604, + 7111, + null + ], + [ + 41199659, + 41199720, + 21, + 5543, + 5603, + null + ], + [ + 41201137, + 41201211, + 20, + 5469, + 5542, + null + ], + [ + 41203079, + 41203134, + 19, + 5414, + 5468, + null + ], + [ + 41209068, + 41209152, + 18, + 5330, + 5413, + null + ], + [ + 41215349, + 
41215390, + 17, + 5289, + 5329, + null + ], + [ + 41215890, + 41215968, + 16, + 5211, + 5288, + null + ], + [ + 41219624, + 41219712, + 15, + 5123, + 5210, + null + ], + [ + 41222944, + 41223255, + 14, + 4812, + 5122, + null + ], + [ + 41226347, + 41226538, + 13, + 4621, + 4811, + null + ], + [ + 41228504, + 41228631, + 12, + 4494, + 4620, + null + ], + [ + 41234420, + 41234592, + 11, + 4322, + 4493, + null + ], + [ + 41242960, + 41243049, + 10, + 4233, + 4321, + null + ], + [ + 41243451, + 41246877, + 9, + 807, + 4232, + null + ], + [ + 41247862, + 41247939, + 8, + 730, + 806, + null + ], + [ + 41249260, + 41249306, + 7, + 684, + 729, + null + ], + [ + 41251791, + 41251894, + 6, + 581, + 683, + null + ], + [ + 41256138, + 41256278, + 5, + 441, + 580, + null + ], + [ + 41256884, + 41256973, + 4, + 352, + 440, + null + ], + [ + 41258494, + 41258550, + 3, + 296, + 351, + null + ], + [ + 41267742, + 41267796, + 2, + 242, + 295, + null + ], + [ + 41276033, + 41276132, + 1, + 143, + 241, + null + ], + [ + 41277198, + 41277340, + 0, + 1, + 142, + null + ] + ], + "strand": "-", + "url": "http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.105/GFF/ref_GRCh37.p13_top_level.gff3.gz" + }, + "GRCh38": { + "contig": "NC_000017.11", + "exons": [ + [ + 43044294, + 43045802, + 22, + 5604, + 7111, + null + ], + [ + 43047642, + 43047703, + 21, + 5543, + 5603, + null + ], + [ + 43049120, + 43049194, + 20, + 5469, + 5542, + null + ], + [ + 43051062, + 43051117, + 19, + 5414, + 5468, + null + ], + [ + 43057051, + 43057135, + 18, + 5330, + 5413, + null + ], + [ + 43063332, + 43063373, + 17, + 5289, + 5329, + null + ], + [ + 43063873, + 43063951, + 16, + 5211, + 5288, + null + ], + [ + 43067607, + 43067695, + 15, + 5123, + 5210, + null + ], + [ + 43070927, + 43071238, + 14, + 4812, + 5122, + null + ], + [ + 43074330, + 43074521, + 13, + 4621, + 4811, + null + ], + [ + 43076487, + 43076614, + 12, + 4494, + 4620, + null + ], + [ + 43082403, + 
43082575, + 11, + 4322, + 4493, + null + ], + [ + 43090943, + 43091032, + 10, + 4233, + 4321, + null + ], + [ + 43091434, + 43094860, + 9, + 807, + 4232, + null + ], + [ + 43095845, + 43095922, + 8, + 730, + 806, + null + ], + [ + 43097243, + 43097289, + 7, + 684, + 729, + null + ], + [ + 43099774, + 43099877, + 6, + 581, + 683, + null + ], + [ + 43104121, + 43104261, + 5, + 441, + 580, + null + ], + [ + 43104867, + 43104956, + 4, + 352, + 440, + null + ], + [ + 43106477, + 43106533, + 3, + 296, + 351, + null + ], + [ + 43115725, + 43115779, + 2, + 242, + 295, + null + ], + [ + 43124016, + 43124115, + 1, + 143, + 241, + null + ], + [ + 43125181, + 43125323, + 0, + 1, + 142, + null + ] + ], + "strand": "-", + "url": "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/109.20190607/GCF_000001405.39_GRCh38.p13/GCF_000001405.39_GRCh38.p13_genomic.gff.gz" + } + }, + "hgnc": "1100", + "id": "NR_027676.1" + }, + "NR_027676.2": { + "biotype": [ + "non_coding" + ], + "gene_name": "BRCA1", + "gene_version": "672", + "genome_builds": { + "GRCh37": { + "contig": "NC_000017.10", + "exons": [ + [ + 41196311, + 41197819, + 22, + 5645, + 7152, + null + ], + [ + 41199659, + 41199720, + 21, + 5584, + 5644, + null + ], + [ + 41201137, + 41201211, + 20, + 5510, + 5583, + null + ], + [ + 41203079, + 41203134, + 19, + 5455, + 5509, + null + ], + [ + 41209068, + 41209152, + 18, + 5371, + 5454, + null + ], + [ + 41215349, + 41215390, + 17, + 5330, + 5370, + null + ], + [ + 41215890, + 41215968, + 16, + 5252, + 5329, + null + ], + [ + 41219624, + 41219712, + 15, + 5164, + 5251, + null + ], + [ + 41222944, + 41223255, + 14, + 4853, + 5163, + null + ], + [ + 41226347, + 41226538, + 13, + 4662, + 4852, + null + ], + [ + 41228504, + 41228631, + 12, + 4535, + 4661, + null + ], + [ + 41234420, + 41234592, + 11, + 4363, + 4534, + null + ], + [ + 41242960, + 41243049, + 10, + 4274, + 4362, + null + ], + [ + 41243451, + 41246877, + 9, + 848, + 4273, + null + ], + [ + 41247862, + 
41247939, + 8, + 771, + 847, + null + ], + [ + 41249260, + 41249306, + 7, + 725, + 770, + null + ], + [ + 41251791, + 41251894, + 6, + 622, + 724, + null + ], + [ + 41256138, + 41256278, + 5, + 482, + 621, + null + ], + [ + 41256884, + 41256973, + 4, + 393, + 481, + null + ], + [ + 41258494, + 41258550, + 3, + 337, + 392, + null + ], + [ + 41267742, + 41267796, + 2, + 283, + 336, + null + ], + [ + 41276033, + 41276132, + 1, + 184, + 282, + null + ], + [ + 41277198, + 41277381, + 0, + 1, + 183, + null + ] + ], + "strand": "-", + "url": "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/105.20220307/GCF_000001405.25_GRCh37.p13/GCF_000001405.25_GRCh37.p13_genomic.gff.gz" + }, + "GRCh38": { + "contig": "NC_000017.11", + "exons": [ + [ + 43044294, + 43045802, + 22, + 5645, + 7152, + null + ], + [ + 43047642, + 43047703, + 21, + 5584, + 5644, + null + ], + [ + 43049120, + 43049194, + 20, + 5510, + 5583, + null + ], + [ + 43051062, + 43051117, + 19, + 5455, + 5509, + null + ], + [ + 43057051, + 43057135, + 18, + 5371, + 5454, + null + ], + [ + 43063332, + 43063373, + 17, + 5330, + 5370, + null + ], + [ + 43063873, + 43063951, + 16, + 5252, + 5329, + null + ], + [ + 43067607, + 43067695, + 15, + 5164, + 5251, + null + ], + [ + 43070927, + 43071238, + 14, + 4853, + 5163, + null + ], + [ + 43074330, + 43074521, + 13, + 4662, + 4852, + null + ], + [ + 43076487, + 43076614, + 12, + 4535, + 4661, + null + ], + [ + 43082403, + 43082575, + 11, + 4363, + 4534, + null + ], + [ + 43090943, + 43091032, + 10, + 4274, + 4362, + null + ], + [ + 43091434, + 43094860, + 9, + 848, + 4273, + null + ], + [ + 43095845, + 43095922, + 8, + 771, + 847, + null + ], + [ + 43097243, + 43097289, + 7, + 725, + 770, + null + ], + [ + 43099774, + 43099877, + 6, + 622, + 724, + null + ], + [ + 43104121, + 43104261, + 5, + 482, + 621, + null + ], + [ + 43104867, + 43104956, + 4, + 393, + 481, + null + ], + [ + 43106477, + 43106533, + 3, + 337, + 392, + null + ], + [ + 43115725, + 
43115779, + 2, + 283, + 336, + null + ], + [ + 43124016, + 43124115, + 1, + 184, + 282, + null + ], + [ + 43125181, + 43125364, + 0, + 1, + 183, + null + ] + ], + "strand": "-", + "url": "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/110/GCF_000001405.40_GRCh38.p14/GCF_000001405.40_GRCh38.p14_genomic.gff.gz" + } + }, + "hgnc": "1100", + "id": "NR_027676.2" + }, + "XM_006722029.1": { + "biotype": [], + "gene_name": "BRCA1", + "gene_version": "672", + "genome_builds": { + "GRCh38": { + "cds_end": 43124096, + "cds_start": 43045677, + "contig": "NC_000017.11", + "exons": [ + [ + 43044294, + 43045802, + 22, + 5661, + 7168, + null + ], + [ + 43047642, + 43047703, + 21, + 5600, + 5660, + null + ], + [ + 43049120, + 43049194, + 20, + 5526, + 5599, + null + ], + [ + 43051062, + 43051117, + 19, + 5471, + 5525, + null + ], + [ + 43057051, + 43057135, + 18, + 5387, + 5470, + null + ], + [ + 43063332, + 43063373, + 17, + 5346, + 5386, + null + ], + [ + 43063873, + 43063951, + 16, + 5268, + 5345, + null + ], + [ + 43067607, + 43067695, + 15, + 5180, + 5267, + null + ], + [ + 43070927, + 43071238, + 14, + 4869, + 5179, + null + ], + [ + 43074330, + 43074521, + 13, + 4678, + 4868, + null + ], + [ + 43076487, + 43076614, + 12, + 4551, + 4677, + null + ], + [ + 43082403, + 43082575, + 11, + 4379, + 4550, + null + ], + [ + 43090943, + 43091032, + 10, + 4290, + 4378, + null + ], + [ + 43091434, + 43094860, + 9, + 864, + 4289, + null + ], + [ + 43095845, + 43095922, + 8, + 787, + 863, + null + ], + [ + 43097243, + 43097289, + 7, + 741, + 786, + null + ], + [ + 43099774, + 43099880, + 6, + 635, + 740, + null + ], + [ + 43104121, + 43104261, + 5, + 495, + 634, + null + ], + [ + 43104867, + 43104956, + 4, + 406, + 494, + null + ], + [ + 43106455, + 43106533, + 3, + 328, + 405, + null + ], + [ + 43115725, + 43115779, + 2, + 274, + 327, + null + ], + [ + 43124016, + 43124115, + 1, + 175, + 273, + null + ], + [ + 43125276, + 43125450, + 0, + 1, + 174, + null + ] + ], + 
"strand": "-", + "url": "http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.106/GFF/ref_GRCh38_top_level.gff3.gz" + } + }, + "hgnc": "1100", + "id": "XM_006722029.1", + "protein": "XP_006722092.1", + "start_codon": 193, + "stop_codon": 5785 + }, + "XM_006722030.1": { + "biotype": [], + "gene_name": "BRCA1", + "gene_version": "672", + "genome_builds": { + "GRCh38": { + "cds_end": 43124096, + "cds_start": 43045677, + "contig": "NC_000017.11", + "exons": [ + [ + 43044294, + 43045802, + 22, + 5629, + 7136, + null + ], + [ + 43047642, + 43047703, + 21, + 5568, + 5628, + null + ], + [ + 43049120, + 43049194, + 20, + 5494, + 5567, + null + ], + [ + 43051062, + 43051117, + 19, + 5439, + 5493, + null + ], + [ + 43057051, + 43057135, + 18, + 5355, + 5438, + null + ], + [ + 43063332, + 43063373, + 17, + 5314, + 5354, + null + ], + [ + 43063873, + 43063951, + 16, + 5236, + 5313, + null + ], + [ + 43067607, + 43067695, + 15, + 5148, + 5235, + null + ], + [ + 43070927, + 43071238, + 14, + 4837, + 5147, + null + ], + [ + 43074330, + 43074521, + 13, + 4646, + 4836, + null + ], + [ + 43076487, + 43076614, + 12, + 4519, + 4645, + null + ], + [ + 43082403, + 43082575, + 11, + 4347, + 4518, + null + ], + [ + 43090943, + 43091032, + 10, + 4258, + 4346, + null + ], + [ + 43091434, + 43094860, + 9, + 832, + 4257, + null + ], + [ + 43095845, + 43095922, + 8, + 755, + 831, + null + ], + [ + 43097243, + 43097289, + 7, + 709, + 754, + null + ], + [ + 43099774, + 43099880, + 6, + 603, + 708, + null + ], + [ + 43104121, + 43104261, + 5, + 463, + 602, + null + ], + [ + 43104867, + 43104956, + 4, + 374, + 462, + null + ], + [ + 43106455, + 43106533, + 3, + 296, + 373, + null + ], + [ + 43115725, + 43115779, + 2, + 242, + 295, + null + ], + [ + 43124016, + 43124115, + 1, + 143, + 241, + null + ], + [ + 43125181, + 43125323, + 0, + 1, + 142, + null + ] + ], + "strand": "-", + "url": 
"http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.106/GFF/ref_GRCh38_top_level.gff3.gz" + } + }, + "hgnc": "1100", + "id": "XM_006722030.1", + "protein": "XP_006722093.1", + "start_codon": 161, + "stop_codon": 5753 + }, + "XM_006722031.1": { + "biotype": [], + "gene_name": "BRCA1", + "gene_version": "672", + "genome_builds": { + "GRCh38": { + "cds_end": 43124096, + "cds_start": 43045677, + "contig": "NC_000017.11", + "exons": [ + [ + 43044294, + 43045802, + 22, + 6185, + 7692, + null + ], + [ + 43047642, + 43047703, + 21, + 6124, + 6184, + null + ], + [ + 43049120, + 43049194, + 20, + 6050, + 6123, + null + ], + [ + 43051062, + 43051117, + 19, + 5995, + 6049, + null + ], + [ + 43057051, + 43057135, + 18, + 5911, + 5994, + null + ], + [ + 43063332, + 43063373, + 17, + 5870, + 5910, + null + ], + [ + 43063873, + 43063951, + 16, + 5792, + 5869, + null + ], + [ + 43067607, + 43067695, + 15, + 5704, + 5791, + null + ], + [ + 43070927, + 43071238, + 14, + 5393, + 5703, + null + ], + [ + 43074330, + 43074521, + 13, + 5202, + 5392, + null + ], + [ + 43076487, + 43076614, + 12, + 5075, + 5201, + null + ], + [ + 43082403, + 43082575, + 11, + 4903, + 5074, + null + ], + [ + 43090943, + 43091032, + 10, + 4814, + 4902, + null + ], + [ + 43091434, + 43094860, + 9, + 1388, + 4813, + null + ], + [ + 43095845, + 43095922, + 8, + 1311, + 1387, + null + ], + [ + 43097243, + 43097289, + 7, + 1265, + 1310, + null + ], + [ + 43099774, + 43099880, + 6, + 1159, + 1264, + null + ], + [ + 43104121, + 43104261, + 5, + 1019, + 1158, + null + ], + [ + 43104867, + 43104956, + 4, + 930, + 1018, + null + ], + [ + 43106455, + 43106533, + 3, + 852, + 929, + null + ], + [ + 43115725, + 43115779, + 2, + 798, + 851, + null + ], + [ + 43124016, + 43124115, + 1, + 699, + 797, + null + ], + [ + 43124736, + 43125434, + 0, + 1, + 698, + null + ] + ], + "strand": "-", + "url": 
"http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.106/GFF/ref_GRCh38_top_level.gff3.gz" + } + }, + "hgnc": "1100", + "id": "XM_006722031.1", + "protein": "XP_006722094.1", + "start_codon": 717, + "stop_codon": 6309 + }, + "XM_006722032.1": { + "biotype": [], + "gene_name": "BRCA1", + "gene_version": "672", + "genome_builds": { + "GRCh38": { + "cds_end": 43124096, + "cds_start": 43045677, + "contig": "NC_000017.11", + "exons": [ + [ + 43044294, + 43045802, + 22, + 5610, + 7117, + null + ], + [ + 43047642, + 43047703, + 21, + 5549, + 5609, + null + ], + [ + 43049120, + 43049194, + 20, + 5475, + 5548, + null + ], + [ + 43051062, + 43051117, + 19, + 5420, + 5474, + null + ], + [ + 43057051, + 43057135, + 18, + 5336, + 5419, + null + ], + [ + 43063332, + 43063373, + 17, + 5295, + 5335, + null + ], + [ + 43063873, + 43063951, + 16, + 5217, + 5294, + null + ], + [ + 43067607, + 43067695, + 15, + 5129, + 5216, + null + ], + [ + 43070927, + 43071238, + 14, + 4818, + 5128, + null + ], + [ + 43074330, + 43074521, + 13, + 4627, + 4817, + null + ], + [ + 43076487, + 43076614, + 12, + 4500, + 4626, + null + ], + [ + 43082403, + 43082575, + 11, + 4328, + 4499, + null + ], + [ + 43090943, + 43091032, + 10, + 4239, + 4327, + null + ], + [ + 43091434, + 43094860, + 9, + 813, + 4238, + null + ], + [ + 43095845, + 43095922, + 8, + 736, + 812, + null + ], + [ + 43097243, + 43097289, + 7, + 690, + 735, + null + ], + [ + 43099774, + 43099877, + 6, + 587, + 689, + null + ], + [ + 43104121, + 43104261, + 5, + 447, + 586, + null + ], + [ + 43104867, + 43104956, + 4, + 358, + 446, + null + ], + [ + 43106455, + 43106533, + 3, + 280, + 357, + null + ], + [ + 43115725, + 43115779, + 2, + 226, + 279, + null + ], + [ + 43124016, + 43124115, + 1, + 127, + 225, + null + ], + [ + 43125270, + 43125396, + 0, + 1, + 126, + null + ] + ], + "strand": "-", + "url": 
"http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.106/GFF/ref_GRCh38_top_level.gff3.gz" + } + }, + "hgnc": "1100", + "id": "XM_006722032.1", + "protein": "XP_006722095.1", + "start_codon": 145, + "stop_codon": 5734 + }, + "XM_006722033.1": { + "biotype": [], + "gene_name": "BRCA1", + "gene_version": "672", + "genome_builds": { + "GRCh38": { + "cds_end": 43124096, + "cds_start": 43045677, + "contig": "NC_000017.11", + "exons": [ + [ + 43044294, + 43045802, + 22, + 5610, + 7117, + null + ], + [ + 43047642, + 43047703, + 21, + 5549, + 5609, + null + ], + [ + 43049120, + 43049194, + 20, + 5475, + 5548, + null + ], + [ + 43051062, + 43051117, + 19, + 5420, + 5474, + null + ], + [ + 43057051, + 43057135, + 18, + 5336, + 5419, + null + ], + [ + 43063332, + 43063373, + 17, + 5295, + 5335, + null + ], + [ + 43063873, + 43063951, + 16, + 5217, + 5294, + null + ], + [ + 43067607, + 43067695, + 15, + 5129, + 5216, + null + ], + [ + 43070927, + 43071238, + 14, + 4818, + 5128, + null + ], + [ + 43074330, + 43074521, + 13, + 4627, + 4817, + null + ], + [ + 43076487, + 43076611, + 12, + 4503, + 4626, + null + ], + [ + 43082403, + 43082575, + 11, + 4331, + 4502, + null + ], + [ + 43090943, + 43091032, + 10, + 4242, + 4330, + null + ], + [ + 43091434, + 43094860, + 9, + 816, + 4241, + null + ], + [ + 43095845, + 43095922, + 8, + 739, + 815, + null + ], + [ + 43097243, + 43097289, + 7, + 693, + 738, + null + ], + [ + 43099774, + 43099880, + 6, + 587, + 692, + null + ], + [ + 43104121, + 43104261, + 5, + 447, + 586, + null + ], + [ + 43104867, + 43104956, + 4, + 358, + 446, + null + ], + [ + 43106455, + 43106533, + 3, + 280, + 357, + null + ], + [ + 43115725, + 43115779, + 2, + 226, + 279, + null + ], + [ + 43124016, + 43124115, + 1, + 127, + 225, + null + ], + [ + 43125270, + 43125396, + 0, + 1, + 126, + null + ] + ], + "strand": "-", + "url": 
"http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.106/GFF/ref_GRCh38_top_level.gff3.gz" + } + }, + "hgnc": "1100", + "id": "XM_006722033.1", + "protein": "XP_006722096.1", + "start_codon": 145, + "stop_codon": 5734 + }, + "XM_006722034.1": { + "biotype": [], + "gene_name": "BRCA1", + "gene_version": "672", + "genome_builds": { + "GRCh38": { + "cds_end": 43124096, + "cds_start": 43045677, + "contig": "NC_000017.11", + "exons": [ + [ + 43044294, + 43045802, + 22, + 5610, + 7117, + null + ], + [ + 43047642, + 43047703, + 21, + 5549, + 5609, + null + ], + [ + 43049120, + 43049194, + 20, + 5475, + 5548, + null + ], + [ + 43051062, + 43051117, + 19, + 5420, + 5474, + null + ], + [ + 43057051, + 43057135, + 18, + 5336, + 5419, + null + ], + [ + 43063332, + 43063373, + 17, + 5295, + 5335, + null + ], + [ + 43063873, + 43063951, + 16, + 5217, + 5294, + null + ], + [ + 43067607, + 43067695, + 15, + 5129, + 5216, + null + ], + [ + 43070927, + 43071238, + 14, + 4818, + 5128, + null + ], + [ + 43074330, + 43074521, + 13, + 4627, + 4817, + null + ], + [ + 43076487, + 43076614, + 12, + 4500, + 4626, + null + ], + [ + 43082403, + 43082572, + 11, + 4331, + 4499, + null + ], + [ + 43090943, + 43091032, + 10, + 4242, + 4330, + null + ], + [ + 43091434, + 43094860, + 9, + 816, + 4241, + null + ], + [ + 43095845, + 43095922, + 8, + 739, + 815, + null + ], + [ + 43097243, + 43097289, + 7, + 693, + 738, + null + ], + [ + 43099774, + 43099880, + 6, + 587, + 692, + null + ], + [ + 43104121, + 43104261, + 5, + 447, + 586, + null + ], + [ + 43104867, + 43104956, + 4, + 358, + 446, + null + ], + [ + 43106455, + 43106533, + 3, + 280, + 357, + null + ], + [ + 43115725, + 43115779, + 2, + 226, + 279, + null + ], + [ + 43124016, + 43124115, + 1, + 127, + 225, + null + ], + [ + 43125270, + 43125396, + 0, + 1, + 126, + null + ] + ], + "strand": "-", + "url": 
"http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.106/GFF/ref_GRCh38_top_level.gff3.gz" + } + }, + "hgnc": "1100", + "id": "XM_006722034.1", + "protein": "XP_006722097.1", + "start_codon": 145, + "stop_codon": 5734 + }, + "XM_006722035.1": { + "biotype": [], + "gene_name": "BRCA1", + "gene_version": "672", + "genome_builds": { + "GRCh38": { + "cds_end": 43106479, + "cds_start": 43045677, + "contig": "NC_000017.11", + "exons": [ + [ + 43044294, + 43045802, + 22, + 5622, + 7129, + null + ], + [ + 43047642, + 43047703, + 21, + 5561, + 5621, + null + ], + [ + 43049120, + 43049194, + 20, + 5487, + 5560, + null + ], + [ + 43051062, + 43051117, + 19, + 5432, + 5486, + null + ], + [ + 43057051, + 43057135, + 18, + 5348, + 5431, + null + ], + [ + 43063332, + 43063373, + 17, + 5307, + 5347, + null + ], + [ + 43063873, + 43063951, + 16, + 5229, + 5306, + null + ], + [ + 43067607, + 43067695, + 15, + 5141, + 5228, + null + ], + [ + 43070927, + 43071238, + 14, + 4830, + 5140, + null + ], + [ + 43074330, + 43074521, + 13, + 4639, + 4829, + null + ], + [ + 43076487, + 43076614, + 12, + 4512, + 4638, + null + ], + [ + 43082403, + 43082575, + 11, + 4340, + 4511, + null + ], + [ + 43090943, + 43091032, + 10, + 4251, + 4339, + null + ], + [ + 43091434, + 43094860, + 9, + 825, + 4250, + null + ], + [ + 43095845, + 43095922, + 8, + 748, + 824, + null + ], + [ + 43097243, + 43097289, + 7, + 702, + 747, + null + ], + [ + 43099774, + 43099880, + 6, + 596, + 701, + null + ], + [ + 43104121, + 43104261, + 5, + 456, + 595, + null + ], + [ + 43104867, + 43104956, + 4, + 367, + 455, + null + ], + [ + 43106477, + 43106533, + 3, + 311, + 366, + null + ], + [ + 43115725, + 43115779, + 2, + 257, + 310, + null + ], + [ + 43124016, + 43124115, + 1, + 158, + 256, + null + ], + [ + 43125276, + 43125433, + 0, + 1, + 157, + null + ] + ], + "strand": "-", + "url": 
"http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.106/GFF/ref_GRCh38_top_level.gff3.gz" + } + }, + "hgnc": "1100", + "id": "XM_006722035.1", + "protein": "XP_006722098.1", + "start_codon": 364, + "stop_codon": 5746 + }, + "XM_006722036.1": { + "biotype": [], + "gene_name": "BRCA1", + "gene_version": "672", + "genome_builds": { + "GRCh38": { + "cds_end": 43104181, + "cds_start": 43045677, + "contig": "NC_000017.11", + "exons": [ + [ + 43044294, + 43045802, + 19, + 5201, + 6708, + null + ], + [ + 43047642, + 43047703, + 18, + 5140, + 5200, + null + ], + [ + 43049120, + 43049194, + 17, + 5066, + 5139, + null + ], + [ + 43051062, + 43051117, + 16, + 5011, + 5065, + null + ], + [ + 43057051, + 43057135, + 15, + 4927, + 5010, + null + ], + [ + 43063332, + 43063373, + 14, + 4886, + 4926, + null + ], + [ + 43063873, + 43063951, + 13, + 4808, + 4885, + null + ], + [ + 43067607, + 43067695, + 12, + 4720, + 4807, + null + ], + [ + 43070927, + 43071238, + 11, + 4409, + 4719, + null + ], + [ + 43074330, + 43074521, + 10, + 4218, + 4408, + null + ], + [ + 43076487, + 43076614, + 9, + 4091, + 4217, + null + ], + [ + 43082403, + 43082575, + 8, + 3919, + 4090, + null + ], + [ + 43090943, + 43091032, + 7, + 3830, + 3918, + null + ], + [ + 43091434, + 43094860, + 6, + 404, + 3829, + null + ], + [ + 43095845, + 43095922, + 5, + 327, + 403, + null + ], + [ + 43097243, + 43097289, + 4, + 281, + 326, + null + ], + [ + 43099774, + 43099880, + 3, + 175, + 280, + null + ], + [ + 43104121, + 43104261, + 2, + 35, + 174, + null + ], + [ + 43104867, + 43104886, + 1, + 16, + 34, + null + ], + [ + 43106477, + 43106492, + 0, + 1, + 15, + null + ] + ], + "strand": "-", + "url": "http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.106/GFF/ref_GRCh38_top_level.gff3.gz" + } + }, + "hgnc": "1100", + "id": "XM_006722036.1", + "protein": "XP_006722099.1", + "start_codon": 114, + "stop_codon": 5325 + }, + 
"XM_006722037.1": { + "biotype": [], + "gene_name": "BRCA1", + "gene_version": "672", + "genome_builds": { + "GRCh38": { + "cds_end": 43124096, + "cds_start": 43045677, + "contig": "NC_000017.11", + "exons": [ + [ + 43044294, + 43045802, + 22, + 2304, + 3811, + null + ], + [ + 43047642, + 43047703, + 21, + 2243, + 2303, + null + ], + [ + 43049120, + 43049194, + 20, + 2169, + 2242, + null + ], + [ + 43051062, + 43051117, + 19, + 2114, + 2168, + null + ], + [ + 43057051, + 43057135, + 18, + 2030, + 2113, + null + ], + [ + 43063332, + 43063373, + 17, + 1989, + 2029, + null + ], + [ + 43063873, + 43063951, + 16, + 1911, + 1988, + null + ], + [ + 43067607, + 43067695, + 15, + 1823, + 1910, + null + ], + [ + 43070927, + 43071238, + 14, + 1512, + 1822, + null + ], + [ + 43074330, + 43074521, + 13, + 1321, + 1511, + null + ], + [ + 43076487, + 43076614, + 12, + 1194, + 1320, + null + ], + [ + 43082403, + 43082575, + 11, + 1022, + 1193, + null + ], + [ + 43090943, + 43091032, + 10, + 933, + 1021, + null + ], + [ + 43094743, + 43094860, + 9, + 816, + 932, + null + ], + [ + 43095845, + 43095922, + 8, + 739, + 815, + null + ], + [ + 43097243, + 43097289, + 7, + 693, + 738, + null + ], + [ + 43099774, + 43099880, + 6, + 587, + 692, + null + ], + [ + 43104121, + 43104261, + 5, + 447, + 586, + null + ], + [ + 43104867, + 43104956, + 4, + 358, + 446, + null + ], + [ + 43106455, + 43106533, + 3, + 280, + 357, + null + ], + [ + 43115725, + 43115779, + 2, + 226, + 279, + null + ], + [ + 43124016, + 43124115, + 1, + 127, + 225, + null + ], + [ + 43125270, + 43125396, + 0, + 1, + 126, + null + ] + ], + "strand": "-", + "url": "http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.106/GFF/ref_GRCh38_top_level.gff3.gz" + } + }, + "hgnc": "1100", + "id": "XM_006722037.1", + "protein": "XP_006722100.1", + "start_codon": 145, + "stop_codon": 2428 + }, + "XM_006722038.1": { + "biotype": [], + "gene_name": "BRCA1", + "gene_version": "672", + 
"genome_builds": { + "GRCh38": { + "cds_end": 43124096, + "cds_start": 43045677, + "contig": "NC_000017.11", + "exons": [ + [ + 43044294, + 43045802, + 22, + 2301, + 3808, + null + ], + [ + 43047642, + 43047703, + 21, + 2240, + 2300, + null + ], + [ + 43049120, + 43049194, + 20, + 2166, + 2239, + null + ], + [ + 43051062, + 43051117, + 19, + 2111, + 2165, + null + ], + [ + 43057051, + 43057135, + 18, + 2027, + 2110, + null + ], + [ + 43063332, + 43063373, + 17, + 1986, + 2026, + null + ], + [ + 43063873, + 43063951, + 16, + 1908, + 1985, + null + ], + [ + 43067607, + 43067695, + 15, + 1820, + 1907, + null + ], + [ + 43070927, + 43071238, + 14, + 1509, + 1819, + null + ], + [ + 43074330, + 43074521, + 13, + 1318, + 1508, + null + ], + [ + 43076487, + 43076614, + 12, + 1191, + 1317, + null + ], + [ + 43082403, + 43082575, + 11, + 1019, + 1190, + null + ], + [ + 43090943, + 43091032, + 10, + 930, + 1018, + null + ], + [ + 43094743, + 43094860, + 9, + 813, + 929, + null + ], + [ + 43095845, + 43095922, + 8, + 736, + 812, + null + ], + [ + 43097243, + 43097289, + 7, + 690, + 735, + null + ], + [ + 43099774, + 43099877, + 6, + 587, + 689, + null + ], + [ + 43104121, + 43104261, + 5, + 447, + 586, + null + ], + [ + 43104867, + 43104956, + 4, + 358, + 446, + null + ], + [ + 43106455, + 43106533, + 3, + 280, + 357, + null + ], + [ + 43115725, + 43115779, + 2, + 226, + 279, + null + ], + [ + 43124016, + 43124115, + 1, + 127, + 225, + null + ], + [ + 43125270, + 43125396, + 0, + 1, + 126, + null + ] + ], + "strand": "-", + "url": "http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.106/GFF/ref_GRCh38_top_level.gff3.gz" + } + }, + "hgnc": "1100", + "id": "XM_006722038.1", + "protein": "XP_006722101.1", + "start_codon": 145, + "stop_codon": 2425 + }, + "XM_006722039.1": { + "biotype": [], + "gene_name": "BRCA1", + "gene_version": "672", + "genome_builds": { + "GRCh38": { + "cds_end": 43124096, + "cds_start": 43045677, + "contig": 
"NC_000017.11", + "exons": [ + [ + 43044294, + 43045802, + 22, + 2329, + 3836, + null + ], + [ + 43047642, + 43047703, + 21, + 2268, + 2328, + null + ], + [ + 43049120, + 43049194, + 20, + 2194, + 2267, + null + ], + [ + 43051062, + 43051117, + 19, + 2139, + 2193, + null + ], + [ + 43057051, + 43057135, + 18, + 2055, + 2138, + null + ], + [ + 43063332, + 43063373, + 17, + 2014, + 2054, + null + ], + [ + 43063873, + 43063951, + 16, + 1936, + 2013, + null + ], + [ + 43067607, + 43067695, + 15, + 1848, + 1935, + null + ], + [ + 43070927, + 43071238, + 14, + 1537, + 1847, + null + ], + [ + 43074330, + 43074521, + 13, + 1346, + 1536, + null + ], + [ + 43076487, + 43076614, + 12, + 1219, + 1345, + null + ], + [ + 43082403, + 43082575, + 11, + 1047, + 1218, + null + ], + [ + 43090943, + 43091032, + 10, + 958, + 1046, + null + ], + [ + 43094743, + 43094860, + 9, + 841, + 957, + null + ], + [ + 43095845, + 43095922, + 8, + 764, + 840, + null + ], + [ + 43097243, + 43097289, + 7, + 718, + 763, + null + ], + [ + 43099774, + 43099877, + 6, + 615, + 717, + null + ], + [ + 43104121, + 43104261, + 5, + 475, + 614, + null + ], + [ + 43104867, + 43104956, + 4, + 386, + 474, + null + ], + [ + 43106455, + 43106533, + 3, + 308, + 385, + null + ], + [ + 43115725, + 43115779, + 2, + 254, + 307, + null + ], + [ + 43124016, + 43124115, + 1, + 155, + 253, + null + ], + [ + 43125276, + 43125430, + 0, + 1, + 154, + null + ] + ], + "strand": "-", + "url": "http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.106/GFF/ref_GRCh38_top_level.gff3.gz" + } + }, + "hgnc": "1100", + "id": "XM_006722039.1", + "protein": "XP_006722102.1", + "start_codon": 173, + "stop_codon": 2453 + }, + "XM_006722040.1": { + "biotype": [], + "gene_name": "BRCA1", + "gene_version": "672", + "genome_builds": { + "GRCh38": { + "cds_end": 43124096, + "cds_start": 43045677, + "contig": "NC_000017.11", + "exons": [ + [ + 43044294, + 43045802, + 22, + 2301, + 3808, + null + ], + [ + 
43047642, + 43047703, + 21, + 2240, + 2300, + null + ], + [ + 43049120, + 43049194, + 20, + 2166, + 2239, + null + ], + [ + 43051062, + 43051117, + 19, + 2111, + 2165, + null + ], + [ + 43057051, + 43057135, + 18, + 2027, + 2110, + null + ], + [ + 43063332, + 43063373, + 17, + 1986, + 2026, + null + ], + [ + 43063873, + 43063951, + 16, + 1908, + 1985, + null + ], + [ + 43067607, + 43067695, + 15, + 1820, + 1907, + null + ], + [ + 43070927, + 43071238, + 14, + 1509, + 1819, + null + ], + [ + 43074330, + 43074521, + 13, + 1318, + 1508, + null + ], + [ + 43076487, + 43076611, + 12, + 1194, + 1317, + null + ], + [ + 43082403, + 43082575, + 11, + 1022, + 1193, + null + ], + [ + 43090943, + 43091032, + 10, + 933, + 1021, + null + ], + [ + 43094743, + 43094860, + 9, + 816, + 932, + null + ], + [ + 43095845, + 43095922, + 8, + 739, + 815, + null + ], + [ + 43097243, + 43097289, + 7, + 693, + 738, + null + ], + [ + 43099774, + 43099880, + 6, + 587, + 692, + null + ], + [ + 43104121, + 43104261, + 5, + 447, + 586, + null + ], + [ + 43104867, + 43104956, + 4, + 358, + 446, + null + ], + [ + 43106455, + 43106533, + 3, + 280, + 357, + null + ], + [ + 43115725, + 43115779, + 2, + 226, + 279, + null + ], + [ + 43124016, + 43124115, + 1, + 127, + 225, + null + ], + [ + 43125270, + 43125396, + 0, + 1, + 126, + null + ] + ], + "strand": "-", + "url": "http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.106/GFF/ref_GRCh38_top_level.gff3.gz" + } + }, + "hgnc": "1100", + "id": "XM_006722040.1", + "protein": "XP_006722103.1", + "start_codon": 145, + "stop_codon": 2425 + }, + "XM_006722041.1": { + "biotype": [], + "gene_name": "BRCA1", + "gene_version": "672", + "genome_builds": { + "GRCh38": { + "cds_end": 43124096, + "cds_start": 43045677, + "contig": "NC_000017.11", + "exons": [ + [ + 43044294, + 43045802, + 20, + 2178, + 3685, + null + ], + [ + 43047642, + 43047703, + 19, + 2117, + 2177, + null + ], + [ + 43049120, + 43049194, + 18, + 2043, 
+ 2116, + null + ], + [ + 43051062, + 43051117, + 17, + 1988, + 2042, + null + ], + [ + 43057051, + 43057135, + 16, + 1904, + 1987, + null + ], + [ + 43063332, + 43063373, + 15, + 1863, + 1903, + null + ], + [ + 43063873, + 43063951, + 14, + 1785, + 1862, + null + ], + [ + 43067607, + 43067695, + 13, + 1697, + 1784, + null + ], + [ + 43070927, + 43071238, + 12, + 1386, + 1696, + null + ], + [ + 43074330, + 43074521, + 11, + 1195, + 1385, + null + ], + [ + 43076487, + 43076614, + 10, + 1068, + 1194, + null + ], + [ + 43082403, + 43082575, + 9, + 896, + 1067, + null + ], + [ + 43090943, + 43091032, + 8, + 807, + 895, + null + ], + [ + 43094743, + 43094860, + 7, + 690, + 806, + null + ], + [ + 43099774, + 43099877, + 6, + 587, + 689, + null + ], + [ + 43104121, + 43104261, + 5, + 447, + 586, + null + ], + [ + 43104867, + 43104956, + 4, + 358, + 446, + null + ], + [ + 43106455, + 43106533, + 3, + 280, + 357, + null + ], + [ + 43115725, + 43115779, + 2, + 226, + 279, + null + ], + [ + 43124016, + 43124115, + 1, + 127, + 225, + null + ], + [ + 43125270, + 43125396, + 0, + 1, + 126, + null + ] + ], + "strand": "-", + "url": "http://ftp.ncbi.nlm.nih.gov/genomes/archive/old_refseq/Homo_sapiens/ARCHIVE/ANNOTATION_RELEASE.106/GFF/ref_GRCh38_top_level.gff3.gz" + } + }, + "hgnc": "1100", + "id": "XM_006722041.1", + "protein": "XP_006722104.1", + "start_codon": 145, + "stop_codon": 2302 + } + }, + "cdot_version": "0.2.12", + "genome_builds": [ + "GRCh37", + "GRCh38" + ], + "genes": { + "672": { + "aliases": "BRCAI, BRCC1, BROVCA1, FANCS, IRIS, PNCA4, PPP1R53, PSCP, RNF53", + "biotype": "non_coding,protein_coding", + "description": "BRCA1 DNA repair associated", + "gene_symbol": "BRCA1", + "hgnc": "1100", + "map_location": "17q21.31", + "summary": "This gene encodes a 190 kD nuclear phosphoprotein that plays a role in maintaining genomic stability, and it also acts as a tumor suppressor. The BRCA1 gene contains 22 exons spanning about 110 kb of DNA. 
The encoded protein combines with other tumor suppressors, DNA damage sensors, and signal transducers to form a large multi-subunit protein complex known as the BRCA1-associated genome surveillance complex (BASC). This gene product associates with RNA polymerase II, and through the C-terminal domain, also interacts with histone deacetylase complexes. This protein thus plays a role in transcription, DNA repair of double-stranded breaks, and recombination. Mutations in this gene are responsible for approximately 40% of inherited breast cancers and more than 80% of inherited breast and ovarian cancers. Alternative splicing plays a role in modulating the subcellular localization and physiological function of this gene. Many alternatively spliced transcript variants, some of which are disease-associated mutations, have been described for this gene, but the full-length natures of only some of these variants has been described. A related pseudogene, which is also located on chromosome 17, has been identified. 
[provided by RefSeq, May 2020]", + "url": "https://ftp.ncbi.nlm.nih.gov/genomes/all/annotation_releases/9606/110/GCF_000001405.40_GRCh38.p14/GCF_000001405.40_GRCh38.p14_genomic.gff.gz" + } + } +} diff --git a/tests/data/db/create/txs/latest/aliases.sqlite3 b/tests/data/db/create/txs/latest/aliases.sqlite3 new file mode 100644 index 00000000..969dc45e --- /dev/null +++ b/tests/data/db/create/txs/latest/aliases.sqlite3 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7b82616f947c5ab317b447bba8f2bc073878c8db7f818b3fd7eae94a07464e9 +size 69632 diff --git a/tests/data/db/create/txs/latest/sequences/2023/0314/1048/1678790932.4032044.fa.bgz b/tests/data/db/create/txs/latest/sequences/2023/0314/1048/1678790932.4032044.fa.bgz new file mode 100644 index 00000000..58991574 --- /dev/null +++ b/tests/data/db/create/txs/latest/sequences/2023/0314/1048/1678790932.4032044.fa.bgz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d518239c0db3e532b0ff7e2e5aa3d431bb075bbfe402928c5b42190a30bee85 +size 3925 diff --git a/tests/data/db/create/txs/latest/sequences/2023/0314/1048/1678790932.4032044.fa.bgz.fai b/tests/data/db/create/txs/latest/sequences/2023/0314/1048/1678790932.4032044.fa.bgz.fai new file mode 100644 index 00000000..3eed443e --- /dev/null +++ b/tests/data/db/create/txs/latest/sequences/2023/0314/1048/1678790932.4032044.fa.bgz.fai @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:765f4a74149ef310426ea57eb6f5eedfaf1ad23e351b9c51640f5d23c7aa36fa +size 204 diff --git a/tests/data/db/create/txs/latest/sequences/2023/0314/1048/1678790932.4032044.fa.bgz.gzi b/tests/data/db/create/txs/latest/sequences/2023/0314/1048/1678790932.4032044.fa.bgz.gzi new file mode 100644 index 00000000..a2b71017 --- /dev/null +++ b/tests/data/db/create/txs/latest/sequences/2023/0314/1048/1678790932.4032044.fa.bgz.gzi @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:af5570f5a1810b7af78caf4bc70a660f0df51e42baf91d4de5b2328de0e83dfc +size 8 diff --git a/tests/data/db/create/txs/latest/sequences/db.sqlite3 b/tests/data/db/create/txs/latest/sequences/db.sqlite3 new file mode 100644 index 00000000..b7a3790c --- /dev/null +++ b/tests/data/db/create/txs/latest/sequences/db.sqlite3 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7be74272990851cc4e06183e3c7d31cf29625877a9b8f33837cc19452df66cb2 +size 61440 diff --git a/utils/install-flatbuffers.sh b/utils/install-flatbuffers.sh new file mode 100644 index 00000000..d03010bf --- /dev/null +++ b/utils/install-flatbuffers.sh @@ -0,0 +1,18 @@ +#!/usr/bin/bash + +# Will install into ~/.local/share/flatbuffers, so make sure to add the following +# to your PATH: ~/.local/share/flatbuffers/bin +# +# Will go into ./utils/var for cloning/building. + +mkdir -p utils/var +cd utils/var + +git clone https://github.com/google/flatbuffers.git +cd flatbuffers +git checkout v22.12.06 +cmake -G "Unix Makefiles" -DCMAKE_INSTALL_PREFIX=$HOME/.local/share/flatbuffers +make +./flattests +make install +flatc --version