diff --git a/Cargo.toml b/Cargo.toml index 61a3741..edf3307 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -33,11 +33,11 @@ unicode-segmentation = "1.10.1" criterion = "0.5.1" fake = "2.9.1" insta = { version = "1.34.0", features = ["glob", "yaml"] } +more-asserts = "0.3.1" tokenizers = { version = ">=0.13.3, <0.16.0", default-features = false, features = [ "onig", "http", ] } -more-asserts = "0.3.1" [[bench]] name = "chunk_size" diff --git a/benches/chunk_size.rs b/benches/chunk_size.rs index 87f366c..8841ad7 100644 --- a/benches/chunk_size.rs +++ b/benches/chunk_size.rs @@ -1,65 +1,62 @@ use std::fs; use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion}; -use text_splitter::TextSplitter; -use tiktoken_rs::cl100k_base; +use text_splitter::{Characters, TextSplitter}; +use tiktoken_rs::{cl100k_base, CoreBPE}; use tokenizers::Tokenizer; -fn criterion_benchmark(c: &mut Criterion) { - let mut group = c.benchmark_group("characters"); - - let splitter = TextSplitter::default(); - let text = fs::read_to_string("tests/inputs/text/romeo_and_juliet.txt").unwrap(); - - for chunk_size in (5..17).map(|n| 2usize.pow(n)) { - group.bench_with_input( - BenchmarkId::from_parameter(chunk_size), - &chunk_size, - |b, &chunk_size| b.iter(|| splitter.chunks(&text, chunk_size).collect::<Vec<_>>()), - ); - } - - group.finish(); +#[allow(clippy::large_enum_variant)] +enum Splitter { + Characters(TextSplitter<Characters>), + Huggingface(TextSplitter<Tokenizer>), + Tiktoken(TextSplitter<CoreBPE>), +} -fn huggingface_benchmark(c: &mut Criterion) { - let mut group = c.benchmark_group("huggingface"); - - let splitter = TextSplitter::new(Tokenizer::from_pretrained("bert-base-cased", None).unwrap()); - let text = fs::read_to_string("tests/inputs/text/romeo_and_juliet.txt").unwrap(); +impl Splitter { + fn name(&self) -> &str { + match self { + Splitter::Characters(_) => "Characters", + Splitter::Huggingface(_) => "Huggingface", + Splitter::Tiktoken(_) => "Tiktoken", + } + } - for chunk_size in (5..17).map(|n| 
2usize.pow(n)) { - group.bench_with_input( - BenchmarkId::from_parameter(chunk_size), - &chunk_size, - |b, &chunk_size| b.iter(|| splitter.chunks(&text, chunk_size).collect::<Vec<_>>()), - ); + fn iter() -> [Self; 3] { + [ + Self::Characters(TextSplitter::default()), + Self::Huggingface(TextSplitter::new( + Tokenizer::from_pretrained("bert-base-cased", None).unwrap(), + )), + Self::Tiktoken(TextSplitter::new(cl100k_base().unwrap())), + ] } - group.finish(); + fn chunks<'text>(&self, text: &'text str, chunk_size: usize) -> Vec<&'text str> { + match self { + Splitter::Characters(splitter) => splitter.chunks(text, chunk_size).collect(), + Splitter::Huggingface(splitter) => splitter.chunks(text, chunk_size).collect(), + Splitter::Tiktoken(splitter) => splitter.chunks(text, chunk_size).collect(), + } + } } -fn tiktoken_benchmark(c: &mut Criterion) { - let mut group = c.benchmark_group("tiktoken"); - - let splitter = TextSplitter::new(cl100k_base().unwrap()); - let text = fs::read_to_string("tests/inputs/text/romeo_and_juliet.txt").unwrap(); - - for chunk_size in (5..17).map(|n| 2usize.pow(n)) { - group.bench_with_input( - BenchmarkId::from_parameter(chunk_size), - &chunk_size, - |b, &chunk_size| b.iter(|| splitter.chunks(&text, chunk_size).collect::<Vec<_>>()), - ); +fn criterion_benchmark(c: &mut Criterion) { + for filename in ["romeo_and_juliet", "room_with_a_view"] { + let mut group = c.benchmark_group(filename); + let text = fs::read_to_string(format!("tests/inputs/text/{filename}.txt")).unwrap(); + + for splitter in Splitter::iter() { + for chunk_size in (5..17).map(|n| 2usize.pow(n)) { + group.bench_with_input( + BenchmarkId::new(splitter.name(), chunk_size), + &chunk_size, + |b, &chunk_size| b.iter(|| splitter.chunks(&text, chunk_size)), + ); + } + } + group.finish(); } - - group.finish(); } -criterion_group!( - benches, - criterion_benchmark, - huggingface_benchmark, - tiktoken_benchmark -); +criterion_group!(benches, criterion_benchmark); criterion_main!(benches);