Skip to content

Commit

Permalink
Benchmark both files
Browse files (browse the repository at this point in the history)
  • Loading branch information
benbrandt committed Dec 23, 2023
1 parent 0023b65 commit bfee51f
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 50 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -33,11 +33,11 @@ unicode-segmentation = "1.10.1"
criterion = "0.5.1"
fake = "2.9.1"
insta = { version = "1.34.0", features = ["glob", "yaml"] }
more-asserts = "0.3.1"
tokenizers = { version = ">=0.13.3, <0.16.0", default-features = false, features = [
"onig",
"http",
] }
more-asserts = "0.3.1"

[[bench]]
name = "chunk_size"
Expand Down
95 changes: 46 additions & 49 deletions benches/chunk_size.rs
Original file line number Diff line number Diff line change
@@ -1,65 +1,62 @@
use std::fs;

use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};
use text_splitter::TextSplitter;
use tiktoken_rs::cl100k_base;
use text_splitter::{Characters, TextSplitter};
use tiktoken_rs::{cl100k_base, CoreBPE};
use tokenizers::Tokenizer;

fn criterion_benchmark(c: &mut Criterion) {
let mut group = c.benchmark_group("characters");

let splitter = TextSplitter::default();
let text = fs::read_to_string("tests/inputs/text/romeo_and_juliet.txt").unwrap();

for chunk_size in (5..17).map(|n| 2usize.pow(n)) {
group.bench_with_input(
BenchmarkId::from_parameter(chunk_size),
&chunk_size,
|b, &chunk_size| b.iter(|| splitter.chunks(&text, chunk_size).collect::<Vec<_>>()),
);
}

group.finish();
/// The splitter configurations exercised by this benchmark: one variant per
/// `TextSplitter` type parameter, so differently-typed splitters can be held
/// in a single array and driven by the same loop.
// NOTE(review): the lint is silenced instead of boxing a variant — presumably
// the tokenizer-backed splitters are much larger than the character-based one
// and the extra indirection is not wanted in a benchmark; confirm.
#[allow(clippy::large_enum_variant)]
enum Splitter {
/// Splitter parameterised by `Characters`.
Characters(TextSplitter<Characters>),
/// Splitter parameterised by a Hugging Face `Tokenizer`.
Huggingface(TextSplitter<Tokenizer>),
/// Splitter parameterised by a tiktoken `CoreBPE`.
Tiktoken(TextSplitter<CoreBPE>),
}

fn huggingface_benchmark(c: &mut Criterion) {
let mut group = c.benchmark_group("huggingface");

let splitter = TextSplitter::new(Tokenizer::from_pretrained("bert-base-cased", None).unwrap());
let text = fs::read_to_string("tests/inputs/text/romeo_and_juliet.txt").unwrap();
impl Splitter {
fn name(&self) -> &str {
match self {
Splitter::Characters(_) => "Characters",
Splitter::Huggingface(_) => "Huggingface",
Splitter::Tiktoken(_) => "Tiktoken",
}
}

for chunk_size in (5..17).map(|n| 2usize.pow(n)) {
group.bench_with_input(
BenchmarkId::from_parameter(chunk_size),
&chunk_size,
|b, &chunk_size| b.iter(|| splitter.chunks(&text, chunk_size).collect::<Vec<_>>()),
);
/// Builds one instance of every splitter variant, in the order
/// characters, Hugging Face, tiktoken.
///
/// # Panics
///
/// Panics if the `bert-base-cased` tokenizer or the `cl100k_base` encoding
/// cannot be loaded.
fn iter() -> [Self; 3] {
    let characters = Self::Characters(TextSplitter::default());

    let bert = Tokenizer::from_pretrained("bert-base-cased", None).unwrap();
    let huggingface = Self::Huggingface(TextSplitter::new(bert));

    let bpe = cl100k_base().unwrap();
    let tiktoken = Self::Tiktoken(TextSplitter::new(bpe));

    [characters, huggingface, tiktoken]
}

group.finish();
fn chunks<'text>(&self, text: &'text str, chunk_size: usize) -> Vec<&'text str> {
match self {
Splitter::Characters(splitter) => splitter.chunks(text, chunk_size).collect(),
Splitter::Huggingface(splitter) => splitter.chunks(text, chunk_size).collect(),
Splitter::Tiktoken(splitter) => splitter.chunks(text, chunk_size).collect(),
}
}
}

fn tiktoken_benchmark(c: &mut Criterion) {
let mut group = c.benchmark_group("tiktoken");

let splitter = TextSplitter::new(cl100k_base().unwrap());
let text = fs::read_to_string("tests/inputs/text/romeo_and_juliet.txt").unwrap();

for chunk_size in (5..17).map(|n| 2usize.pow(n)) {
group.bench_with_input(
BenchmarkId::from_parameter(chunk_size),
&chunk_size,
|b, &chunk_size| b.iter(|| splitter.chunks(&text, chunk_size).collect::<Vec<_>>()),
);
/// Benchmarks every splitter against each sample text.
///
/// One Criterion group is created per input file; within a group, each
/// splitter is measured at chunk sizes 2^5 through 2^16, identified by
/// `BenchmarkId::new(<splitter name>, <chunk size>)`.
///
/// # Panics
///
/// Panics if a splitter cannot be constructed or an input file cannot be read.
fn criterion_benchmark(c: &mut Criterion) {
    // Building the splitters does not depend on the input text, so do it once
    // instead of once per file.
    let splitters = Splitter::iter();

    for filename in ["romeo_and_juliet", "room_with_a_view"] {
        let mut group = c.benchmark_group(filename);
        let path = format!("tests/inputs/text/{filename}.txt");
        let text = fs::read_to_string(&path)
            .unwrap_or_else(|err| panic!("failed to read {path}: {err}"));

        for splitter in &splitters {
            for chunk_size in (5..17).map(|n| 2usize.pow(n)) {
                group.bench_with_input(
                    BenchmarkId::new(splitter.name(), chunk_size),
                    &chunk_size,
                    |b, &chunk_size| b.iter(|| splitter.chunks(&text, chunk_size)),
                );
            }
        }

        // `group` is scoped to this iteration, so it is finished here and
        // nowhere else; a second `finish()` after the loop would not compile.
        group.finish();
    }
}

criterion_group!(
benches,
criterion_benchmark,
huggingface_benchmark,
tiktoken_benchmark
);
criterion_group!(benches, criterion_benchmark);
criterion_main!(benches);

0 comments on commit bfee51f

Please sign in to comment.