From fe7c4c8f32c2125edf566d186a7689828c824774 Mon Sep 17 00:00:00 2001 From: Peter Arato Date: Wed, 10 May 2023 16:02:41 -0400 Subject: [PATCH] Add `String#bytesplice` with tests. --- CHANGELOG.md | 1 + spec/ruby/core/string/bytesplice_spec.rb | 130 ++++++++++++++++++ spec/truffle/methods/String.txt | 1 + src/main/ruby/truffleruby/core/string.rb | 48 +++++++ .../core/truffle/polyglot_methods.rb | 4 + .../core/truffle/string_operations.rb | 13 ++ 6 files changed, 197 insertions(+) create mode 100644 spec/ruby/core/string/bytesplice_spec.rb diff --git a/CHANGELOG.md b/CHANGELOG.md index f54a756e90be..d1d7969d10dd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,7 @@ Compatibility: * Add support for array pattern matching. This is opt-in via `--pattern-matching` since pattern matching is not fully supported yet. (#2683, @razetime). * Fix `Array#[]` with `ArithmeticSequence` argument when step is negative (@itarato). * Fix `Range#size` and return `nil` for beginningless Range when end isn't Numeric (@rwstauner). +* Add `String#bytesplice` (@itarato). Performance: diff --git a/spec/ruby/core/string/bytesplice_spec.rb b/spec/ruby/core/string/bytesplice_spec.rb new file mode 100644 index 000000000000..c499ffce4b8b --- /dev/null +++ b/spec/ruby/core/string/bytesplice_spec.rb @@ -0,0 +1,130 @@ +# -*- encoding: utf-8 -*- +require_relative '../../spec_helper' + +describe "String#bytesplice" do + ruby_version_is "3.2" do + it "raises IndexError when index is less than -size" do + -> { "hello".bytesplice(-6, 0, "xxx") }.should raise_error(IndexError, "index -6 out of string") + end + + it "raises IndexError when index is greater or equal to size" do + -> { "hello".bytesplice(6, 0, "xxx") }.should raise_error(IndexError, "index 6 out of string") + end + + it "replaces with integer indices" do + "hello".bytesplice(-5, 0, "xxx").should == "xxxhello" + "hello".bytesplice(0, 0, "xxx").should == "xxxhello" + "hello".bytesplice(0, 1, "xxx").should == "xxxello" + "hello".bytesplice(0, 5, "xxx").should == "xxx" + "hello".bytesplice(0, 6, "xxx").should == "xxx" + end + + it "raises RangeError when range left boundary is less than -size" do + -> { "hello".bytesplice(-6...-6, "xxx") }.should raise_error(RangeError, "-6...-6 out of range") + end + + it "replaces with ranges" do + "hello".bytesplice(-5...-5, "xxx").should == "xxxhello" + "hello".bytesplice(0...0, "xxx").should == "xxxhello" + "hello".bytesplice(0..0, "xxx").should == "xxxello" + "hello".bytesplice(0...1, "xxx").should == "xxxello" + "hello".bytesplice(0..1, "xxx").should == "xxxllo" + "hello".bytesplice(0..-1, "xxx").should == "xxx" + "hello".bytesplice(0...5, "xxx").should == "xxx" + "hello".bytesplice(0...6, "xxx").should == "xxx" + end + + it "raises TypeError when integer index is provided without length argument" do + -> { "hello".bytesplice(0, "xxx") }.should raise_error(TypeError, "wrong argument type Integer (expected Range)") + end + + it "replaces on an empty string" do + "".bytesplice(0, 0, "").should == "" + "".bytesplice(0, 0, "xxx").should == "xxx" + end + + it "mutates self" do + s = "hello" + s.bytesplice(2, 1, "xxx").should == "hexxxlo" + s.should == "hexxxlo" + end + + it "raises when string is frozen" do + s = "hello".freeze + -> { s.bytesplice(2, 1, "xxx") }.should raise_error(FrozenError, "can't modify frozen String: \"hello\"") + end + end +end + +describe "String#bytesplice with multibyte characters" do + ruby_version_is "3.2" do + it "raises IndexError when index is out of byte size boundary" do + -> { "こんにちは".bytesplice(-16, 0, "xxx") }.should raise_error(IndexError, "index -16 out of string") + end + + it "raises IndexError when index is not on a codepoint boundary" do + -> { "こんにちは".bytesplice(1, 0, "xxx") }.should raise_error(IndexError, "offset 1 does not land on character boundary") + end + + it "raises IndexError when length is not matching the codepoint boundary" do + -> { "こんにちは".bytesplice(0, 1, "xxx") }.should raise_error(IndexError, "offset 1 does not land on character boundary") + -> { "こんにちは".bytesplice(0, 2, "xxx") }.should raise_error(IndexError, "offset 2 does not land on character boundary") + end + + it "replaces with integer indices" do + "こんにちは".bytesplice(-15, 0, "xxx").should == "xxxこんにちは" + "こんにちは".bytesplice(0, 0, "xxx").should == "xxxこんにちは" + "こんにちは".bytesplice(0, 3, "xxx").should == "xxxんにちは" + "こんにちは".bytesplice(3, 3, "はは").should == "こははにちは" + "こんにちは".bytesplice(15, 0, "xxx").should == "こんにちはxxx" + end + + it "replaces with range" do + "こんにちは".bytesplice(-15...-16, "xxx").should == "xxxこんにちは" + "こんにちは".bytesplice(0...0, "xxx").should == "xxxこんにちは" + "こんにちは".bytesplice(0..2, "xxx").should == "xxxんにちは" + "こんにちは".bytesplice(0...3, "xxx").should == "xxxんにちは" + "こんにちは".bytesplice(0..5, "xxx").should == "xxxにちは" + "こんにちは".bytesplice(0..-1, "xxx").should == "xxx" + "こんにちは".bytesplice(0...15, "xxx").should == "xxx" + "こんにちは".bytesplice(0...18, "xxx").should == "xxx" + end + + it "treats negative length for range as 0" do + "こんにちは".bytesplice(0...-100, "xxx").should == "xxxこんにちは" + "こんにちは".bytesplice(3...-100, "xxx").should == "こxxxんにちは" + "こんにちは".bytesplice(-15...-100, "xxx").should == "xxxこんにちは" + end + + it "raises when ranges not match codepoint boundaries" do + -> { "こんにちは".bytesplice(0..0, "x") }.should raise_error(IndexError, "offset 1 does not land on character boundary") + -> { "こんにちは".bytesplice(0..1, "x") }.should raise_error(IndexError, "offset 2 does not land on character boundary") + # Begin is incorrect + -> { "こんにちは".bytesplice(-4..-1, "x") }.should raise_error(IndexError, "offset 11 does not land on character boundary") + -> { "こんにちは".bytesplice(-5..-1, "x") }.should raise_error(IndexError, "offset 10 does not land on character boundary") + # End is incorrect + -> { "こんにちは".bytesplice(-3..-2, "x") }.should raise_error(IndexError, "offset 14 does not land on character boundary") + -> { "こんにちは".bytesplice(-3..-3, "x") }.should raise_error(IndexError, "offset 13 does not land on character boundary") + end + + it "deals with a different encoded argument" do + s = "こんにちは" + s.encoding.should == Encoding::UTF_8 + sub = "xxxxxx" + sub.force_encoding(Encoding::US_ASCII) + + result = s.bytesplice(0, 3, sub) + result.should == "xxxxxxんにちは" + result.encoding.should == Encoding::UTF_8 + + s = "xxxxxx" + s.force_encoding(Encoding::US_ASCII) + sub = "こんにちは" + sub.encoding.should == Encoding::UTF_8 + + result = s.bytesplice(0, 3, sub) + result.should == "こんにちはxxx" + result.encoding.should == Encoding::UTF_8 + end + end +end diff --git a/spec/truffle/methods/String.txt b/spec/truffle/methods/String.txt index 490c1c20945c..1d28de3dcb87 100644 --- a/spec/truffle/methods/String.txt +++ b/spec/truffle/methods/String.txt @@ -15,6 +15,7 @@ b bytes bytesize byteslice +bytesplice capitalize capitalize! casecmp diff --git a/src/main/ruby/truffleruby/core/string.rb b/src/main/ruby/truffleruby/core/string.rb index 8bbafba99426..ce64b6b89b61 100644 --- a/src/main/ruby/truffleruby/core/string.rb +++ b/src/main/ruby/truffleruby/core/string.rb @@ -70,6 +70,54 @@ def byteslice(index_or_range, length = undefined) byteslice index, length end + def bytesplice(*args) + is_range = Primitive.is_a?(args[0], Range) + + if args.size == 3 + start = Primitive.rb_to_int(args[0]) + start += bytesize if start < 0 + + len = Primitive.rb_to_int(args[1]) + str = StringValue(args[2]) + elsif args.size == 2 + unless is_range + raise(TypeError, "wrong argument type #{Primitive.class(args[0])} (expected Range)") + end + + start, len = Primitive.range_normalized_start_length(args[0], bytesize) + len = 0 if len < 0 + str = StringValue(args[1]) + else + raise(ArgumentError, "wrong number of arguments (given #{args.size}, expected 2..3)") + end + + if len < 0 + raise(IndexError, "negative length #{len}") + end + + if bytesize < start || start < 0 + if is_range + raise(RangeError, "#{args[0]} out of range") + else + raise(IndexError, "index #{args[0]} out of string") + end + end + + len = bytesize - start if len > bytesize - start + finish = start + len + + unless Truffle::StringOperations.on_codepoint_boundary?(self, start) + raise(IndexError, "offset #{start} does not land on character boundary") + end + unless Truffle::StringOperations.on_codepoint_boundary?(self, finish) + raise(IndexError, "offset #{finish} does not land on character boundary") + end + + Primitive.check_mutable_string(self) + enc = Primitive.encoding_ensure_compatible_str(self, str) + Primitive.string_splice(self, str, start, len, enc) + end + def self.try_convert(obj) Truffle::Type.try_convert obj, String, :to_str end diff --git a/src/main/ruby/truffleruby/core/truffle/polyglot_methods.rb b/src/main/ruby/truffleruby/core/truffle/polyglot_methods.rb index 617ac9343744..6d4ba4677457 100644 --- a/src/main/ruby/truffleruby/core/truffle/polyglot_methods.rb +++ b/src/main/ruby/truffleruby/core/truffle/polyglot_methods.rb @@ -80,6 +80,10 @@ def byteslice(...) to_s.byteslice(...) end + def bytesplice(...) + to_s.bytesplice(...) + end + def capitalize(...) to_s.capitalize(...) end diff --git a/src/main/ruby/truffleruby/core/truffle/string_operations.rb b/src/main/ruby/truffleruby/core/truffle/string_operations.rb index 8e6cee160eb6..3277343c50df 100644 --- a/src/main/ruby/truffleruby/core/truffle/string_operations.rb +++ b/src/main/ruby/truffleruby/core/truffle/string_operations.rb @@ -411,5 +411,18 @@ def self.assign_regexp(string, index, count, replacement) Primitive.string_splice(string, replacement, bi, bs, enc) end + + def self.on_codepoint_boundary?(string, byte_pos) + char_pos = Primitive.byte_index_to_character_index(string, byte_pos) + adjusted_byte_pos = if char_pos >= string.size + # Handle index overflow cases. + # @see com.oracle.truffle.api.strings.ByteIndexToCodePointIndexNode#execute for details. + string.bytesize + else + Primitive.character_index_to_byte_index(string, char_pos) + end + + byte_pos == adjusted_byte_pos + end end end