
Commit

feat!: special tokens encoded by default
Special tokens are now also encoded by both Hugging Face and Tiktoken tokenizers. This is closer to the default behavior on the Python side, and ensures that if a model adds tokens at the beginning or end of a sequence, they are accounted for as well.
benbrandt committed Dec 14, 2024
1 parent 919fba6 commit 0377d87
Showing 13 changed files with 4,000 additions and 3,267 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,11 @@
 # Changelog
 
+## v0.21.0
+
+### Breaking Changes
+
+- Special tokens are now also encoded by both Hugging Face and Tiktoken tokenizers. This is closer to the default behavior on the Python side, and ensures that if a model adds tokens at the beginning or end of a sequence, they are accounted for as well.
+
 ## v0.20.0
 
 ### Breaking Changes
4 changes: 2 additions & 2 deletions Cargo.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion Cargo.toml
@@ -2,7 +2,7 @@
 members = ["bindings/*"]
 
 [workspace.package]
-version = "0.20.0"
+version = "0.21.0"
 authors = ["Ben Brandt <[email protected]>"]
 edition = "2021"
 description = "Split text into semantic chunks, up to a desired chunk size. Supports calculating length by characters and tokens, and is callable from Rust and Python."
13 changes: 8 additions & 5 deletions src/chunk_size/huggingface.rs
@@ -33,7 +33,7 @@ impl ChunkSizer for &Tokenizer {
     /// encounters text it can't tokenize.
     fn size(&self, chunk: &str) -> usize {
         let encoding = self
-            .encode(chunk, false)
+            .encode(chunk, true)
             .expect("Unable to tokenize the following string {chunk}");
 
         let pad_id = self.get_padding().map(|params| params.pad_id);
@@ -61,7 +61,8 @@ mod tests {
     fn returns_size() {
         let tokenizer = Tokenizer::from_pretrained("bert-base-cased", None).unwrap();
         let size = tokenizer.size(" An apple a");
-        assert_eq!(size, 3);
+        // Bert has a beginning and end token
+        assert_eq!(size, 5);
     }
 
     #[test]
@@ -77,7 +78,8 @@ mod tests {
     fn handles_padding() {
         let tokenizer = Tokenizer::from_pretrained("thenlper/gte-small", None).unwrap();
         let size = tokenizer.size("An apple a");
-        assert_eq!(size, 3);
+        // Has a beginning and end token
+        assert_eq!(size, 5);
     }
 
     #[test]
@@ -87,8 +89,9 @@ mod tests {
 
         // Need to ensure chunk is large enough to cause Encoding overflows.
         assert_eq!(
-            tokenizer.size("An apple a day keeps the doctor away.".repeat(100).as_str()),
-            900
+            tokenizer.size(" An apple a day keeps the doctor away".repeat(16).as_str()),
+            // Overflows at 128, with special tokens at beginning and end of each section of tokens
+            132
        );
    }
}
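The Hugging Face test expectations above grow by two because `encode(chunk, true)` now counts the special tokens that BERT-style models wrap around every sequence ([CLS] at the start, [SEP] at the end). A dependency-free toy sketch of that accounting (illustrative function, not the real `tokenizers` API):

```rust
// Toy sketch, not the real `tokenizers` API: shows why the expected
// sizes in the tests above grow by two once special tokens are counted.
fn size(content_tokens: usize, add_special_tokens: bool) -> usize {
    // BERT-style models wrap each sequence as [CLS] ... [SEP],
    // adding one token at the start and one at the end.
    if add_special_tokens {
        content_tokens + 2
    } else {
        content_tokens
    }
}

fn main() {
    // " An apple a" tokenizes to 3 content tokens with bert-base-cased.
    assert_eq!(size(3, false), 3); // old behavior: encode(chunk, false)
    assert_eq!(size(3, true), 5); // new behavior: encode(chunk, true)
    println!("3 content tokens -> {} with special tokens", size(3, true));
}
```

The same accounting explains the overflow test: assuming the repeated sentence tokenizes to 8 tokens, 16 repetitions yield 128 content tokens, the encoding overflows into two sections, and each section receives its own pair of special tokens, giving 128 + 2 × 2 = 132.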
2 changes: 1 addition & 1 deletion src/chunk_size/tiktoken.rs
@@ -5,7 +5,7 @@ use crate::ChunkSizer;
 impl ChunkSizer for &CoreBPE {
     /// Returns the number of tokens in a given text after tokenization.
     fn size(&self, chunk: &str) -> usize {
-        self.encode_ordinary(chunk).len()
+        self.encode_with_special_tokens(chunk).len()
     }
 }
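For Tiktoken, `encode_ordinary` treats special-token text like any other text, while `encode_with_special_tokens` collapses each recognized marker to a single id, so the two can report different lengths for the same chunk. A dependency-free toy illustrating the difference (stand-in logic, not the real `tiktoken-rs` API):

```rust
const END_OF_TEXT: &str = "<|endoftext|>";

// Toy stand-in for BPE, not the real `tiktoken-rs` API:
// pretend every run of up to 4 bytes is one token.
fn encode_ordinary_len(text: &str) -> usize {
    // Special-token text is tokenized like ordinary text.
    text.as_bytes().chunks(4).count()
}

fn encode_with_special_tokens_len(text: &str) -> usize {
    // Each recognized marker collapses to a single token id;
    // the text around it is tokenized ordinarily.
    text.split(END_OF_TEXT)
        .map(encode_ordinary_len)
        .sum::<usize>()
        + text.matches(END_OF_TEXT).count()
}

fn main() {
    let chunk = "hi<|endoftext|>";
    // 15 bytes -> 4 toy tokens when the marker is treated as plain text.
    assert_eq!(encode_ordinary_len(chunk), 4);
    // "hi" (1 toy token) + one special token id.
    assert_eq!(encode_with_special_tokens_len(chunk), 2);
}
```

The direction of the difference depends on the text: markers shrink to one token, but models that prepend or append sequence-level special tokens grow the count, which is exactly what the chunk sizer now needs to account for.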
1,172 changes: 655 additions & 517 deletions tests/snapshots/snapshots__romeo_and_juliet_Tokenizers_trim_32.snap

Large diffs are not rendered by default.

Large diffs are not rendered by default.

1,172 changes: 655 additions & 517 deletions tests/snapshots/snapshots__romeo_and_juliet_Tokenizers_trim_false_32.snap

Large diffs are not rendered by default.

Large diffs are not rendered by default.

2,392 changes: 1,307 additions & 1,085 deletions tests/snapshots/snapshots__room_with_a_view_Tokenizers_trim_32.snap

Large diffs are not rendered by default.

Large diffs are not rendered by default.

2,392 changes: 1,307 additions & 1,085 deletions tests/snapshots/snapshots__room_with_a_view_Tokenizers_trim_false_32.snap

Large diffs are not rendered by default.

Large diffs are not rendered by default.
