From 094fb359f79c940c4f865621f9b0d804ba9d6a27 Mon Sep 17 00:00:00 2001 From: Quinton Miller Date: Tue, 9 Apr 2024 06:26:44 +0800 Subject: [PATCH 1/5] Optimize `String#index(Char)` and `#rindex(Char)` for invalid UTF-8 --- src/string.cr | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/string.cr b/src/string.cr index 4004f0d34929..0662ece5aaf9 100644 --- a/src/string.cr +++ b/src/string.cr @@ -3334,8 +3334,8 @@ class String # ``` def index(search : Char, offset = 0) : Int32? # If it's ASCII we can delegate to slice - if search.ascii? && single_byte_optimizable? - return to_slice.fast_index(search.ord.to_u8, offset) + if single_byte_optimizable? + return search.ascii? ? to_slice.fast_index(search.ord.to_u8, offset) : nil end offset += size if offset < 0 @@ -3445,8 +3445,8 @@ class String # ``` def rindex(search : Char, offset = size - 1) # If it's ASCII we can delegate to slice - if search.ascii? && single_byte_optimizable? - return to_slice.rindex(search.ord.to_u8, offset) + if single_byte_optimizable? + return search.ascii? ? to_slice.rindex(search.ord.to_u8, offset) : nil end offset += size if offset < 0 From e86631e3981aaa8b883b0b31aeadf0b99b34b53c Mon Sep 17 00:00:00 2001 From: Quinton Miller Date: Tue, 9 Apr 2024 15:40:39 +0800 Subject: [PATCH 2/5] Update src/string.cr MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Johannes Müller --- src/string.cr | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/string.cr b/src/string.cr index 0662ece5aaf9..da787c066538 100644 --- a/src/string.cr +++ b/src/string.cr @@ -3335,7 +3335,11 @@ class String def index(search : Char, offset = 0) : Int32? # If it's ASCII we can delegate to slice if single_byte_optimizable? - return search.ascii? ? to_slice.fast_index(search.ord.to_u8, offset) : nil + # With `single_byte_optimizable?` there are only ASCII characters and invalid UTF-8 byte + # sequences and we can immediately reject any non-ASCII codepoint. + return unless search.ascii? + + return to_slice.fast_index(search.ord.to_u8, offset) end offset += size if offset < 0 From 599cef3c6b1f51e50210ad078cb1ebed5e6c418e Mon Sep 17 00:00:00 2001 From: Quinton Miller Date: Tue, 9 Apr 2024 15:40:55 +0800 Subject: [PATCH 3/5] Update src/string.cr MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Johannes Müller --- src/string.cr | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/string.cr b/src/string.cr index da787c066538..79acf186ca03 100644 --- a/src/string.cr +++ b/src/string.cr @@ -3450,7 +3450,11 @@ class String def rindex(search : Char, offset = size - 1) # If it's ASCII we can delegate to slice if single_byte_optimizable? - return search.ascii? ? to_slice.rindex(search.ord.to_u8, offset) : nil + # With `single_byte_optimizable?` there are only ASCII characters and invalid UTF-8 byte + # sequences and we can immediately reject any non-ASCII codepoint. + return unless search.ascii? + + return to_slice.fast_rindex(search.ord.to_u8, offset) end offset += size if offset < 0 From 7e2e90162666613c28375a12deb7a9a37c3b1489 Mon Sep 17 00:00:00 2001 From: Quinton Miller Date: Tue, 9 Apr 2024 16:54:54 +0800 Subject: [PATCH 4/5] Update src/string.cr Co-authored-by: Sijawusz Pur Rahnama --- src/string.cr | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/string.cr b/src/string.cr index 79acf186ca03..ee7e4d1e98a7 100644 --- a/src/string.cr +++ b/src/string.cr @@ -3454,7 +3454,7 @@ class String # sequences and we can immediately reject any non-ASCII codepoint. return unless search.ascii? - return to_slice.fast_rindex(search.ord.to_u8, offset) + return to_slice.rindex(search.ord.to_u8, offset) end offset += size if offset < 0 From 30b90f47638eb1a2c04b35557d9960d5740d2cfd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20M=C3=BCller?= Date: Tue, 16 Apr 2024 12:55:22 +0200 Subject: [PATCH 5/5] `crystal tool format` --- src/string.cr | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/string.cr b/src/string.cr index ee7e4d1e98a7..89c46ca38352 100644 --- a/src/string.cr +++ b/src/string.cr @@ -3338,7 +3338,7 @@ class String # With `single_byte_optimizable?` there are only ASCII characters and invalid UTF-8 byte # sequences and we can immediately reject any non-ASCII codepoint. return unless search.ascii? - + return to_slice.fast_index(search.ord.to_u8, offset) end @@ -3453,7 +3453,7 @@ class String # With `single_byte_optimizable?` there are only ASCII characters and invalid UTF-8 byte # sequences and we can immediately reject any non-ASCII codepoint. return unless search.ascii? - + return to_slice.rindex(search.ord.to_u8, offset) end