From 09b7fb980d99dbd02144e4f679cff0e1d4fa7739 Mon Sep 17 00:00:00 2001
From: NAITOH Jun <naitoh@gmail.com>
Date: Wed, 3 Jan 2024 17:18:42 +0900
Subject: [PATCH] use StringScanner with baseparser

[Why]
Using StringScanner avoids copying the unconsumed remainder of the buffer (`@buffer = $'`) after every consuming match, which speeds up parsing.
---
 lib/rexml/parseexception.rb     |  2 +-
 lib/rexml/parsers/baseparser.rb | 35 +++++++----------
 lib/rexml/source.rb             | 69 +++++++++++++++++++--------------
 3 files changed, 56 insertions(+), 50 deletions(-)

diff --git a/lib/rexml/parseexception.rb b/lib/rexml/parseexception.rb
index 7b16cd1a..23c23e2f 100644
--- a/lib/rexml/parseexception.rb
+++ b/lib/rexml/parseexception.rb
@@ -29,7 +29,7 @@ def to_s
         err << "\nLine: #{line}\n"
         err << "Position: #{position}\n"
         err << "Last 80 unconsumed characters:\n"
-        err << @source.buffer[0..80].force_encoding("ASCII-8BIT").gsub(/\n/, ' ')
+        err << @source.scanner.rest[0..80].force_encoding("ASCII-8BIT").gsub(/\n/, ' ')
       end
 
       err
diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb
index 305b1207..88bb35af 100644
--- a/lib/rexml/parsers/baseparser.rb
+++ b/lib/rexml/parsers/baseparser.rb
@@ -96,7 +96,7 @@ class BaseParser
       ENTITYDEF = "(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))"
       PEDECL = "<!ENTITY\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
       GEDECL = "<!ENTITY\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
-      ENTITYDECL = /\s*(?:#{GEDECL})|(?:#{PEDECL})/um
+      ENTITYDECL = /\s*(?:#{GEDECL})|\s*(?:#{PEDECL})/um
 
       NOTATIONDECL_START = /\A\s*<!NOTATION/um
       EXTERNAL_ID_PUBLIC = /\A\s*PUBLIC\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}\s*/um
@@ -194,8 +194,6 @@ def pull_event
         end
         return [ :end_document ] if empty?
         return @stack.shift if @stack.size > 0
-        #STDERR.puts @source.encoding
-        #STDERR.puts "BUFFER = #{@source.buffer.inspect}"
         if @document_status == nil
           word = @source.match( /\A((?:\s+)|(?:<[^>]*>))/um )
           word = word[1] unless word.nil?
@@ -259,7 +257,7 @@ def pull_event
           else
             @document_status = :after_doctype
             if @source.encoding == "UTF-8"
-              @source.buffer.force_encoding(::Encoding::UTF_8)
+              @source.scanner.string = @source.scanner.rest.force_encoding(::Encoding::UTF_8)
             end
           end
         end
@@ -274,8 +272,8 @@ def pull_event
             return [ :elementdecl, @source.match( ELEMENTDECL_PATTERN, true )[1] ]
 
           when ENTITY_START
-            match = @source.match( ENTITYDECL, true ).to_a.compact
-            match[0] = :entitydecl
+            match = @source.match( ENTITYDECL, true )
+            match = match.nil? ? [:entitydecl] : [:entitydecl, *match.captures.compact.reject(&:empty?)]
             ref = false
             if match[1] == '%'
               ref = true
@@ -349,9 +347,9 @@ def pull_event
           @source.match(/\A\s*/um, true)
         end
         begin
-          @source.read if @source.buffer.size<2
-          if @source.buffer[0] == ?<
-            if @source.buffer[1] == ?/
+          @source.read if @source.scanner.rest.size<2
+          if @source.scanner.rest[0] == ?<
+            if @source.scanner.rest[1] == ?/
               @nsstack.shift
               last_tag = @tags.pop
               md = @source.match( CLOSE_MATCH, true )
@@ -365,9 +363,8 @@ def pull_event
                 raise REXML::ParseException.new(message, @source)
               end
               return [ :end_element, last_tag ]
-            elsif @source.buffer[1] == ?!
+            elsif @source.scanner.rest[1] == ?!
               md = @source.match(/\A(\s*[^>]*>)/um)
-              #STDERR.puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}"
               raise REXML::ParseException.new("Malformed node", @source) unless md
               if md[0][2] == ?-
                 md = @source.match( COMMENT_PATTERN, true )
@@ -384,7 +381,7 @@ def pull_event
               end
               raise REXML::ParseException.new( "Declarations can only occur "+
                 "in the doctype declaration.", @source)
-            elsif @source.buffer[1] == ??
+            elsif @source.scanner.rest[1] == ??
               return process_instruction
             else
               # Get the next tag
@@ -392,6 +389,7 @@ def pull_event
               unless md
                 raise REXML::ParseException.new("malformed XML: missing tag start", @source)
               end
+              tag = md[1]
               @document_status = :in_element
               prefixes = Set.new
               prefixes << md[2] if md[2]
@@ -405,23 +403,20 @@ def pull_event
               end
 
               if closed
-                @closed = md[1]
+                @closed = tag
                 @nsstack.shift
               else
-                @tags.push( md[1] )
+                @tags.push( tag )
               end
-              return [ :start_element, md[1], attributes ]
+              return [ :start_element, tag, attributes ]
             end
           else
             md = @source.match( TEXT_PATTERN, true )
+            text = md[1]
             if md[0].length == 0
               @source.match( /(\s+)/, true )
             end
-            #STDERR.puts "GOT #{md[1].inspect}" unless md[0].length == 0
-            #return [ :text, "" ] if md[0].length == 0
-            # unnormalized = Text::unnormalize( md[1], self )
-            # return PullEvent.new( :text, md[1], unnormalized )
-            return [ :text, md[1] ]
+            return [ :text, text ]
           end
         rescue REXML::UndefinedNamespaceException
           raise
diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb
index 54eea54b..aeef7c35 100644
--- a/lib/rexml/source.rb
+++ b/lib/rexml/source.rb
@@ -31,7 +31,8 @@ def SourceFactory::create_from(arg)
   class Source
     include Encoding
     # The current buffer (what we're going to read next)
-    attr_reader :buffer
+    attr_reader :scanner
+
     # The line number of the last consumed text
     attr_reader :line
     attr_reader :encoding
@@ -41,7 +42,8 @@ class Source
     # @param encoding if non-null, sets the encoding of the source to this
     # value, overriding all encoding detection
     def initialize(arg, encoding=nil)
-      @orig = @buffer = arg
+      @orig = arg
+      @scanner = StringScanner.new(@orig)
       if encoding
         self.encoding = encoding
       else
@@ -62,53 +64,56 @@ def read
     end
 
     def match(pattern, cons=false)
-      md = pattern.match(@buffer)
-      @buffer = $' if cons and md
-      return md
+      if cons
+        @scanner.scan(pattern)
+      else
+        @scanner.check(pattern)
+      end
+      @scanner.matched? ? @scanner : nil
     end
 
     # @return true if the Source is exhausted
     def empty?
-      @buffer == ""
+      @scanner.eos?
     end
 
     # @return the current line in the source
     def current_line
       lines = @orig.split
-      res = lines.grep @buffer[0..30]
+      res = lines.grep @scanner.rest[0..30]
       res = res[-1] if res.kind_of? Array
       lines.index( res ) if res
     end
 
     private
     def detect_encoding
-      buffer_encoding = @buffer.encoding
+      orig_encoding = @orig.encoding
       detected_encoding = "UTF-8"
       begin
-        @buffer.force_encoding("ASCII-8BIT")
-        if @buffer[0, 2] == "\xfe\xff"
-          @buffer[0, 2] = ""
+        @orig.force_encoding("ASCII-8BIT")
+        if @orig[0, 2] == "\xfe\xff"
+          @orig[0, 2] = ""
           detected_encoding = "UTF-16BE"
-        elsif @buffer[0, 2] == "\xff\xfe"
-          @buffer[0, 2] = ""
+        elsif @orig[0, 2] == "\xff\xfe"
+          @orig[0, 2] = ""
           detected_encoding = "UTF-16LE"
-        elsif @buffer[0, 3] == "\xef\xbb\xbf"
-          @buffer[0, 3] = ""
+        elsif @orig[0, 3] == "\xef\xbb\xbf"
+          @orig[0, 3] = ""
           detected_encoding = "UTF-8"
         end
       ensure
-        @buffer.force_encoding(buffer_encoding)
+        @orig.force_encoding(orig_encoding)
       end
       self.encoding = detected_encoding
     end
 
     def encoding_updated
       if @encoding != 'UTF-8'
-        @buffer = decode(@buffer)
+        @scanner.string = decode(@scanner.rest)
         @to_utf = true
       else
         @to_utf = false
-        @buffer.force_encoding ::Encoding::UTF_8
+        @scanner.string = @scanner.rest.force_encoding(::Encoding::UTF_8)
       end
     end
   end
@@ -131,7 +136,7 @@ def initialize(arg, block_size=500, encoding=nil)
       end
 
       if !@to_utf and
-          @buffer.respond_to?(:force_encoding) and
+          @orig.respond_to?(:force_encoding) and
           @source.respond_to?(:external_encoding) and
           @source.external_encoding != ::Encoding::UTF_8
         @force_utf8 = true
@@ -142,26 +147,32 @@ def initialize(arg, block_size=500, encoding=nil)
 
     def read
       begin
-        @buffer << readline
+        @scanner << readline
       rescue Exception, NameError
         @source = nil
       end
     end
 
     def match( pattern, cons=false )
-      rv = pattern.match(@buffer)
-      @buffer = $' if cons and rv
-      while !rv and @source
+      if cons
+        @scanner.scan(pattern)
+      else
+        @scanner.check(pattern)
+      end
+      while !@scanner.matched? and @source
         begin
-          @buffer << readline
-          rv = pattern.match(@buffer)
-          @buffer = $' if cons and rv
+          @scanner << readline
+          if cons
+            @scanner.scan(pattern)
+          else
+            @scanner.check(pattern)
+          end
         rescue
           @source = nil
         end
       end
-      rv.taint if RUBY_VERSION < '2.7'
-      rv
+      @scanner.taint if RUBY_VERSION < '2.7'
+      @scanner.matched? ? @scanner : nil
     end
 
     def empty?
@@ -218,7 +229,7 @@ def encoding_updated
         @source.set_encoding(@encoding, @encoding)
       end
       @line_break = encode(">")
-      @pending_buffer, @buffer = @buffer, ""
+      @pending_buffer, @scanner.string = @scanner.rest, ""
       @pending_buffer.force_encoding(@encoding)
       super
     end