Commit

Merge pull request #27 from databio/dev

Hierarchical tokenizers

nleroy917 authored Jul 29, 2024
2 parents 506dabe + f636780 commit 8c0811a
Showing 46 changed files with 3,496 additions and 671 deletions.
32 changes: 32 additions & 0 deletions .github/workflows/codecov.yml
@@ -0,0 +1,32 @@
name: Compute coverage

on:
  push:
    branches: [ "master" ]
  pull_request:
    branches: [ "master" ]

env:
  CARGO_TERM_COLOR: always

jobs:
  coverage:
    runs-on: ubuntu-latest
    env:
      CARGO_TERM_COLOR: always
    steps:
      - uses: actions/checkout@v4
      - name: Install Rust
        run: rustup update stable
      - name: Install cargo-llvm-cov
        uses: taiki-e/install-action@cargo-llvm-cov
      - name: Generate code coverage
        run: cargo llvm-cov --all-features --workspace --lcov --output-path lcov.info
        working-directory: ./gtars
      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v3
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
          files: lcov.info
          fail_ci_if_error: true
          working-directory: ./gtars
22 changes: 22 additions & 0 deletions .github/workflows/tests.yml
@@ -0,0 +1,22 @@
name: Run tests

on:
  push:
    branches: [ "master" ]
  pull_request:
    branches: [ "master" ]

env:
  CARGO_TERM_COLOR: always

jobs:
  run:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Build
        run: cargo build --verbose
        working-directory: ./gtars
      - name: Run tests
        run: cargo test --verbose
        working-directory: ./gtars
3 changes: 3 additions & 0 deletions README.md
@@ -1,3 +1,6 @@
[![codecov](https://codecov.io/gh/databio/gtars/branch/master/graph/badge.svg)](https://codecov.io/gh/databio/gtars)
[![crates.io](https://img.shields.io/crates/v/gtars?&logo=rust)](https://crates.io/crates/gtars)

<h1 align="center">
<img src="gtars/docs/logo.svg" alt="gtars logo" height="100px">
</h1>
2 changes: 1 addition & 1 deletion bindings/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "gtars-py"
version = "0.0.14"
version = "0.0.15"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
167 changes: 166 additions & 1 deletion bindings/gtars/tokenizers/__init__.pyi
@@ -220,7 +220,7 @@ class TokenizedRegionSet:
class TreeTokenizer:
    def __new__(cls, path: str) -> TreeTokenizer:
        """
        Construct a new TreeTokenize from a universe file.
        Construct a new TreeTokenizer from a universe file.
        :param path: The path to the universe file. This should be a BED file.
        """
@@ -348,6 +348,13 @@ class TreeTokenizer:
        """
        The universe object.
        """

    def export(self, path: str):
        """
        Export the tokenizer configuration to a file.
        :param path: The path to the output file.
        """

    def __call__(self, regions: List[Region]) -> TokenizedRegionSet:
        """
@@ -383,4 +390,162 @@ class FragmentTokenizer:
        :param file_path: The path to the file containing fragments.
        :param out_path: The path to the output file. If None, the output is written to the standard output.
        :param filter: A list of chromosomes to filter. If None, all chromosomes are included.
        """

class MetaTokenizer:
    def __new__(cls, path: str) -> MetaTokenizer:
        """
        Construct a new MetaTokenizer from a universe file.
        :param path: The path to the universe file. This should be a BED file.
        """

    def unknown_token(self) -> Region:
        """
        Get the unknown token.
        """

    def padding_token(self) -> Region:
        """
        Get the padding token.
        """

    def mask_token(self) -> Region:
        """
        Get the mask token.
        """

    def cls_token(self) -> Region:
        """
        Get the CLS token.
        """

    def bos_token(self) -> Region:
        """
        Get the BOS token.
        """

    def eos_token(self) -> Region:
        """
        Get the EOS token.
        """

    def sep_token(self) -> Region:
        """
        Get the SEP token.
        """

    def unknown_token_id(self) -> int:
        """
        Get the ID of the unknown token.
        """

    def padding_token_id(self) -> int:
        """
        Get the ID of the padding token.
        """

    def mask_token_id(self) -> int:
        """
        Get the ID of the mask token.
        """

    def cls_token_id(self) -> int:
        """
        Get the ID of the CLS token.
        """

    def bos_token_id(self) -> int:
        """
        Get the ID of the BOS token.
        """

    def eos_token_id(self) -> int:
        """
        Get the ID of the EOS token.
        """

    def sep_token_id(self) -> int:
        """
        Get the ID of the SEP token.
        """

    def vocab_size(self) -> int:
        """
        Get the vocabulary size.
        """

    def tokenize(self, regions: List[Region]) -> List[Region]:
        """
        Tokenize a list of regions. This will only return the tokenized regions.
        :param regions: The regions to tokenize.
        :return: The tokenized regions as a list.
        """

    def tokenize_bed_file(self, path: str) -> List[Region]:
        """
        Tokenize a BED file directly.
        :param path: The path to the BED file.
        :return: The tokenized regions as a list.
        """

    def encode(self, regions: List[Region]) -> List[int]:
        """
        Encode a list of regions. This will return the integer representation of the tokenized regions.
        :param regions: The regions to encode.
        :return: The integer representation of the tokenized regions.
        """

    def decode(self, ids: List[int]) -> List[Region]:
        """
        Decode a list of integer representations of the tokenized regions.
        :param ids: The integer representations of the tokenized regions.
        :return: The decoded regions.
        """

    def vocab(self) -> List[Tuple[Region, int]]:
        """
        Get the vocabulary.
        :return: The vocabulary as a list of tuples.
        """

    @property
    def universe(self) -> Universe:
        """
        The universe object.
        """

    def export(self, path: str):
        """
        Export the tokenizer configuration to a file.
        :param path: The path to the output file.
        """

    def __call__(self, regions: List[Region]) -> TokenizedRegionSet:
        """
        Tokenize a list of regions.
        :param regions: The regions to tokenize.
        :return: A TokenizedRegionSet object.
        """

    def __len__(self) -> int:
        """
        Get the vocabulary size.
        """

    def __repr__(self) -> str:
        """
        Get a string representation of the tokenizer.
        """
2 changes: 1 addition & 1 deletion bindings/src/ailist/mod.rs
@@ -3,7 +3,7 @@ use pyo3::{prelude::*, pyclass};

use crate::models::PyInterval;

#[pyclass(name = "AIList")]
#[pyclass(name = "AIList", module="gtars.ailist")]
struct PyAIList {
    ailist: AIList,
}
2 changes: 1 addition & 1 deletion bindings/src/models/interval.rs
@@ -1,6 +1,6 @@
use pyo3::prelude::*;

#[pyclass(name = "Interval")]
#[pyclass(name = "Interval", module="gtars.models")]
pub struct PyInterval {
    #[pyo3(get, set)]
    pub start: u32,
4 changes: 2 additions & 2 deletions bindings/src/models/region.rs
@@ -9,7 +9,7 @@ use gtars::common::models::region::Region;

use crate::models::PyUniverse;

#[pyclass(name = "Region")]
#[pyclass(name = "Region", module="gtars.models")]
#[derive(Clone, Debug, Hash, Eq, PartialEq)]
pub struct PyRegion {
    pub chr: String,
@@ -75,7 +75,7 @@ impl PyRegion {
    }
}

#[pyclass(name = "TokenizedRegion")]
#[pyclass(name = "TokenizedRegion", module="gtars.models")]
#[derive(Clone, Debug)]
pub struct PyTokenizedRegion {
    pub id: u32,
6 changes: 3 additions & 3 deletions bindings/src/models/region_set.rs
@@ -10,7 +10,7 @@ use gtars::common::utils::extract_regions_from_bed_file;

use crate::models::{PyRegion, PyTokenizedRegion, PyUniverse};

#[pyclass(name = "RegionSet")]
#[pyclass(name = "RegionSet", module="gtars.models")]
#[derive(Clone, Debug)]
pub struct PyRegionSet {
    pub regions: Vec<PyRegion>,
@@ -85,7 +85,7 @@ impl PyRegionSet {
    }
}

#[pyclass(name = "TokenizedRegionSet")]
#[pyclass(name = "TokenizedRegionSet", module="gtars.models")]
#[derive(Clone, Debug)]
pub struct PyTokenizedRegionSet {
    pub ids: Vec<u32>,
@@ -123,7 +123,7 @@ impl PyTokenizedRegionSet {
            Ok(self
                .ids
                .iter()
                .map(|id| self.universe.borrow(py).id_to_region[&id].clone())
                .map(|id| self.universe.borrow(py).id_to_region[id].clone())
                .collect())
        })
    }
2 changes: 1 addition & 1 deletion bindings/src/models/universe.rs
@@ -7,7 +7,7 @@ use anyhow::Result;
use crate::models::PyRegion;
use gtars::common::models::Universe;

#[pyclass(name = "Universe")]
#[pyclass(name = "Universe", module="gtars.models")]
#[derive(Clone, Debug)]
pub struct PyUniverse {
    pub regions: Vec<PyRegion>,
48 changes: 48 additions & 0 deletions bindings/src/tokenizers/builder.rs
@@ -0,0 +1,48 @@
// TODO: stil a work in progress
use pyo3::prelude::*;

use anyhow::Result;

use std::path::Path;


use gtars::tokenizers::TokenizerConfig;

use super::{
    PyMetaTokenizer,
    PyTreeTokenizer
};

#[pyclass(name="TokenizerBuilder")]
pub struct PyTokenizerBuilder;

#[pymethods]
impl PyTokenizerBuilder {

    #[classmethod]
    pub fn from_toml(path: String) -> Result<PyObject> {
        let config = TokenizerConfig::new(Path::new(&path))?;

        match config.tokenizer_type {
            Some(tokenizer_type) => {
                match tokenizer_type.as_str() {
                    "tree" => {
                        let t = PyTreeTokenizer::new(path)?;
                        t.to_object()
                    },
                    "meta" => {
                        PyMetaTokenizer::new(path)
                    },
                    _ => {
                        anyhow::bail!("Tokenizer type {} not supported", tokenizer_type)
                    }
                }
            },
            None => {
                println!("No tokenizer type found in config file. Instantiating a default TreeTokenizer. Note that this may lead to unexpected behavior.");
                PyTreeTokenizer::new(path)
            }
        };

    }
}
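
The builder above selects a tokenizer from the tokenizer_type field of a TOML config. A rough Python-side sketch of the same dispatch logic follows; the "tokenizer_type" key name and the flat config layout are assumptions inferred from the Rust code, not a documented format.

import tomllib  # stdlib TOML parser, Python 3.11+

from gtars.tokenizers import MetaTokenizer, TreeTokenizer

def tokenizer_from_toml(path: str):
    # Mirror of PyTokenizerBuilder::from_toml: choose a tokenizer class
    # based on the (assumed) "tokenizer_type" key in the TOML config.
    with open(path, "rb") as f:
        config = tomllib.load(f)

    tokenizer_type = config.get("tokenizer_type")
    if tokenizer_type is None:
        # Same fallback as the Rust code: default to a TreeTokenizer.
        return TreeTokenizer(path)
    if tokenizer_type == "tree":
        return TreeTokenizer(path)
    if tokenizer_type == "meta":
        return MetaTokenizer(path)
    raise ValueError(f"Tokenizer type {tokenizer_type} not supported")
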
2 changes: 1 addition & 1 deletion bindings/src/tokenizers/fragments_tokenizer.rs
@@ -5,7 +5,7 @@ use pyo3::prelude::*;
use super::PyTokenizedRegionSet;
use super::PyUniverse;

#[pyclass(name = "FragmentTokenizer")]
#[pyclass(name = "FragmentTokenizer", module="gtars.tokenizers")]
pub struct PyFragmentTokenizer {
    pub tokenizer: gtars::tokenizers::FragmentTokenizer<TreeTokenizer>,
    pub universe: Py<PyUniverse>, // this is a Py-wrapped version self.tokenizer.universe for performance reasons