From 0fc34dd76184c1563762777a5f1f3cb5139418f0 Mon Sep 17 00:00:00 2001 From: Ben Brandt Date: Tue, 7 May 2024 12:05:16 -0700 Subject: [PATCH] fix: Max encoded offset optimization was potentially using the wrong byte offset Potentially the fallback max byte offset was larger than it should have been. Now it properly uses whichever is smaller, either from the fallback or the semantic level. --- CHANGELOG.md | 4 ++ Cargo.lock | 4 +- Cargo.toml | 2 +- src/lib.rs | 5 +- ...mark_spec_markdown_Characters_trim_16.snap | 9 ++- ...markdown_Characters_trim_16_overlap_8.snap | 7 ++- ..._flavored_markdown_Characters_trim_16.snap | 63 ++++++++++--------- ...markdown_Characters_trim_16_overlap_8.snap | 28 +++++---- ...ub_flavored_markdown_TikToken_trim_16.snap | 4 +- 9 files changed, 74 insertions(+), 52 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9f51e573..46d40a1a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,9 @@ # Changelog +## v0.13.1 + +Fix a bug in the fallback logic to make sure we are still respecting the maximum bytes we should be searching in. Again, this only affects Markdown splitting at very small sizes. + ## v0.13.0 ### What's New / Breaking Changes diff --git a/Cargo.lock b/Cargo.lock index 7fa3c60c..ceae75fa 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1864,7 +1864,7 @@ dependencies = [ [[package]] name = "semantic-text-splitter" -version = "0.13.0" +version = "0.13.1" dependencies = [ "auto_enums", "pyo3", @@ -2100,7 +2100,7 @@ dependencies = [ [[package]] name = "text-splitter" -version = "0.13.0" +version = "0.13.1" dependencies = [ "ahash", "auto_enums", diff --git a/Cargo.toml b/Cargo.toml index 7b3c0856..93aca37b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,7 +2,7 @@ members = ["bindings/*"] [workspace.package] -version = "0.13.0" +version = "0.13.1" authors = ["Ben Brandt "] edition = "2021" description = "Split text into semantic chunks, up to a desired chunk size. Supports calculating length by characters and tokens, and is callable from Rust and Python." diff --git a/src/lib.rs b/src/lib.rs index bc31483d..3911e88c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -401,7 +401,10 @@ where }), ); - let max_encoded_offset = fallback_max_encoded_offset.or(max_encoded_offset); + let max_encoded_offset = match (fallback_max_encoded_offset, max_encoded_offset) { + (Some(fallback), Some(max)) => Some(fallback.min(max)), + (fallback, max) => fallback.or(max), + }; let sections = semantic_level .unwrap_or(FallbackLevel::Char) diff --git a/tests/snapshots/text_splitter_snapshots__commonmark_spec_markdown_Characters_trim_16.snap b/tests/snapshots/text_splitter_snapshots__commonmark_spec_markdown_Characters_trim_16.snap index 780b158f..87d7f3f0 100644 --- a/tests/snapshots/text_splitter_snapshots__commonmark_spec_markdown_Characters_trim_16.snap +++ b/tests/snapshots/text_splitter_snapshots__commonmark_spec_markdown_Characters_trim_16.snap @@ -14644,7 +14644,8 @@ expression: chunks - "=-)" - list_item - paragraph -- "\"Qui *quodsi" +- "\"Qui" +- "*quodsi" - "iracundia*\"" - "-> list_item" - "-> paragraph" @@ -14902,7 +14903,8 @@ expression: chunks - "=-)" - "-> list_item" - "-> paragraph" -- "\"Qui *quodsi" +- "\"Qui" +- "*quodsi" - "iracundia*\"\n```" - "The fourth line," - "``` markdown" @@ -14939,7 +14941,8 @@ expression: chunks - "=-)" - list_item - paragraph -- "\"Qui *quodsi" +- "\"Qui" +- "*quodsi" - "iracundia*\"" - "-> list_item" - "-> paragraph" diff --git a/tests/snapshots/text_splitter_snapshots__commonmark_spec_markdown_Characters_trim_16_overlap_8.snap b/tests/snapshots/text_splitter_snapshots__commonmark_spec_markdown_Characters_trim_16_overlap_8.snap index 689ad770..f1794e7b 100644 --- a/tests/snapshots/text_splitter_snapshots__commonmark_spec_markdown_Characters_trim_16_overlap_8.snap +++ b/tests/snapshots/text_splitter_snapshots__commonmark_spec_markdown_Characters_trim_16_overlap_8.snap @@ -4545,7 +4545,7 @@ expression: chunks - "````````````````" - "```````` example" - ~~~~ ruby -- ruby startline=3 +- ruby startline= - "=3 $%@#$" - def foo(x) - " return 3\nend" @@ -11096,7 +11096,7 @@ expression: chunks - "baz`\n." - "

foo" - code>foo bar -- bar baz< +- bar baz - "

" - "````````````````" - "````````````````" @@ -18599,6 +18599,7 @@ expression: chunks - "=-)" - list_item - paragraph +- "\"Qui" - "\"Qui *quodsi" - quodsi iracundia - "*\"" @@ -18917,6 +18918,7 @@ expression: chunks - "=-)" - "-> list_item" - "-> paragraph" +- "\"Qui" - "\"Qui *quodsi" - quodsi iracundia - "*\"\n```" @@ -18961,6 +18963,7 @@ expression: chunks - "=-)" - list_item - paragraph +- "\"Qui" - "\"Qui *quodsi" - quodsi iracundia - "*\"" diff --git a/tests/snapshots/text_splitter_snapshots__github_flavored_markdown_Characters_trim_16.snap b/tests/snapshots/text_splitter_snapshots__github_flavored_markdown_Characters_trim_16.snap index d90bfbbf..93491be6 100644 --- a/tests/snapshots/text_splitter_snapshots__github_flavored_markdown_Characters_trim_16.snap +++ b/tests/snapshots/text_splitter_snapshots__github_flavored_markdown_Characters_trim_16.snap @@ -661,10 +661,11 @@ expression: chunks - "3021" - namespace - "MyApplication\n{" -- "[Obsolete(\"...\")" -- "]" -- "class Program :" -- "IInterface\n {" +- "[Obsolete(\"..." +- "\")]" +- class Program +- ": IInterface" +- "{" - public static - List - JustDoIt(int @@ -672,11 +673,11 @@ expression: chunks - Console.WriteLin - "e($\"Hello {Name}" - "!\");" -- return new List< -- "int>(new int[] {" -- "1, 2, 3 })" -- " }\n }" -- "}\n```" +- return +- new List( +- "new int[] { 1, 2" +- ", 3 })\n }" +- " }\n}\n```" - "```css" - "@font-face {" - "font-family:" @@ -704,14 +705,14 @@ expression: chunks - initHighlight( - "block, cls) {" - "try {" -- if (cls.search(/ -- "\\bno\\-highlight\\" -- b/) != -1) +- if (cls.search( +- "/\\bno\\-highlight" +- "\\b/) != -1)" - return process( - "block, true," - "0x0F) +" -- "` class=\"${cls}\"" -- "`;" +- "` class=" +- "\"${cls}\"`;" - "} catch (e) {" - /* handle - "exception */\n }" @@ -723,9 +724,9 @@ expression: chunks - checkCondition( - "classes[i]) ===" - undefined) -- "console.log('" -- "undefined');\n }" -- "}" +- console.log( +- "'undefined');" +- " }\n}" - export $ - initHighlight; - "```" @@ -746,8 +747,8 @@ expression: chunks - "Factory\n{" - abstract - function test(); -- public static $ -- st1 = 1; +- public static +- $st1 = 1; - "const ME = \"Yo\";" - var $list = NULL - ; @@ -763,8 +764,9 @@ expression: chunks - "= array(), $uri" - "= 'http')\n {" - echo __METHOD__; -- "$uri = explode('" -- ":', $uri, 0b10);" +- $uri = explode( +- "':', $uri, 0b10)" +- ; - $schemeSpecific - "= isset($uri[1])" - "? $uri[1] : '';" @@ -776,19 +778,20 @@ expression: chunks - if (! - ctype_alnum($ - "scheme)) {" -- throw new +- throw +- new - Zend_Uri_Excepti - "on('Illegal" - "scheme');" - "}" -- $this->var = 0 - -- "self::$st;" -- $this->list = -- "list(Array(\"1\"=>" -- "2, 2=>self::ME," -- "3 => \\Location\\" -- "Web\\URI::class))" -- ; +- $this->var +- "= 0 - self::$st;" +- $this->list +- "= list(Array(\"1\"" +- "=> 2, 2=>self::" +- "ME, 3 => \\" +- "Location\\Web\\URI" +- "::class));" - "return [" - "'uri' => $uri," - "'value' => null," diff --git a/tests/snapshots/text_splitter_snapshots__github_flavored_markdown_Characters_trim_16_overlap_8.snap b/tests/snapshots/text_splitter_snapshots__github_flavored_markdown_Characters_trim_16_overlap_8.snap index d1013db7..130bcd9d 100644 --- a/tests/snapshots/text_splitter_snapshots__github_flavored_markdown_Characters_trim_16_overlap_8.snap +++ b/tests/snapshots/text_splitter_snapshots__github_flavored_markdown_Characters_trim_16_overlap_8.snap @@ -818,9 +818,10 @@ expression: chunks - "414, 3021" - namespace - " MyApplication\n{" -- "[Obsolete(\"...\")" +- "[Obsolete(\"..." - "(\"...\")]" -- "class Program :" +- class Program +- "Program :" - ": IInterface" - "{" - public static @@ -832,6 +833,7 @@ expression: chunks - "WriteLine($\"" - "($\"Hello {Name}!" - "{Name}!\");" +- return - return new List< - List(new - "int>(new int[] {" @@ -867,7 +869,7 @@ expression: chunks - $initHighlight( - "(block, cls) {" - "try {" -- if (cls.search(/ +- if (cls.search( - "(/\\bno\\-" - "/\\bno\\-highlight" - "\\b/) != -1)" @@ -875,6 +877,7 @@ expression: chunks - "process(block," - "(block, true," - ", true, 0x0F) +" +- "` class=" - "` class=\"${cls}\"" - "\"${cls}\"`;" - "} catch (e) {" @@ -890,7 +893,7 @@ expression: chunks - "(classes[i]) ===" - "]) === undefined" - ) -- "console.log('" +- console.log( - "('undefined');" - " }\n}" - export $ @@ -914,7 +917,7 @@ expression: chunks - " Factory\n{" - abstract - function test(); -- public static $ +- public static - static $st1 = 1; - "const ME = \"Yo\";" - var $list = NULL @@ -933,7 +936,8 @@ expression: chunks - "(), $uri = 'http" - " = 'http')\n {" - echo __METHOD__; -- "$uri = explode('" +- $uri = explode( +- "explode(':', $" - "(':', $uri, 0b10" - ", 0b10);" - $schemeSpecific @@ -948,15 +952,17 @@ expression: chunks - if (! - ctype_alnum($ - "($scheme)) {" +- throw - throw new - Zend_Uri_Excepti - "_Exception('" - "('Illegal scheme" - "scheme');" - "}" -- $this->var = 0 - -- "= 0 - self::$st;" -- $this->list = +- $this->var +- "->var = 0 - self" +- "0 - self::$st;" +- $this->list - "->list = list(" - "= list(Array(\"1\"" - "(\"1\"=> 2, 2=>" @@ -1089,9 +1095,9 @@ expression: chunks - "| Right-aligned" - "-aligned |" - "| :--- |" -- "| :---:" +- "| :--" - ":---: |" -- "| ---:" +- "| -" - "---: |" - "| git status |" - "| git status" diff --git a/tests/snapshots/text_splitter_snapshots__github_flavored_markdown_TikToken_trim_16.snap b/tests/snapshots/text_splitter_snapshots__github_flavored_markdown_TikToken_trim_16.snap index bedf9531..f2d01b07 100644 --- a/tests/snapshots/text_splitter_snapshots__github_flavored_markdown_TikToken_trim_16.snap +++ b/tests/snapshots/text_splitter_snapshots__github_flavored_markdown_TikToken_trim_16.snap @@ -245,8 +245,8 @@ expression: chunks - "ctype_alnum($scheme)) {" - " throw new Zend_Uri_Exception('Illegal scheme');\n }" - "$this->var = 0 - self::$st;" -- "$this->list = list(Array(\"1\"=> 2, 2=>self" -- "::ME, 3 => \\Location\\Web\\URI::class));" +- "$this->list = list(Array(\"1\"=> 2, 2=>" +- "self::ME, 3 => \\Location\\Web\\URI::class));" - " return [\n 'uri' => $uri," - " 'value' => null,\n ];\n }\n}" - "echo URI::ME . URI::$st1;"