Skip to content

Commit

Permalink
ensure tokenizer sizing takes special characters into account
Browse files Browse the repository at this point in the history
  • Loading branch information
Jeadie committed Nov 27, 2024
1 parent ccb88e9 commit b33e474
Showing 1 changed file with 1 addition and 1 deletion.
2 changes: 1 addition & 1 deletion src/chunk_size/huggingface.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ impl ChunkSizer for &Tokenizer {
/// encounters text it can't tokenize.
fn size(&self, chunk: &str) -> usize {
let encoding = self
.encode(chunk, false)
.encode(chunk, true)
.expect("Unable to tokenize the following string {chunk}");

let pad_id = self.get_padding().map(|params| params.pad_id);
Expand Down

0 comments on commit b33e474

Please sign in to comment.