From 09b7fb980d99dbd02144e4f679cff0e1d4fa7739 Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Wed, 3 Jan 2024 17:18:42 +0900 Subject: [PATCH] use StringScanner with baseparser [Why] Using StringScanner reduces the string copying process and speeds up the process. --- lib/rexml/parseexception.rb | 2 +- lib/rexml/parsers/baseparser.rb | 35 +++++++---------- lib/rexml/source.rb | 69 +++++++++++++++++++-------------- 3 files changed, 56 insertions(+), 50 deletions(-) diff --git a/lib/rexml/parseexception.rb b/lib/rexml/parseexception.rb index 7b16cd1a..23c23e2f 100644 --- a/lib/rexml/parseexception.rb +++ b/lib/rexml/parseexception.rb @@ -29,7 +29,7 @@ def to_s err << "\nLine: #{line}\n" err << "Position: #{position}\n" err << "Last 80 unconsumed characters:\n" - err << @source.buffer[0..80].force_encoding("ASCII-8BIT").gsub(/\n/, ' ') + err << @source.scanner.rest[0..80].force_encoding("ASCII-8BIT").gsub(/\n/, ' ') end err diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index 305b1207..88bb35af 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -96,7 +96,7 @@ class BaseParser ENTITYDEF = "(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))" PEDECL = "" GEDECL = "" - ENTITYDECL = /\s*(?:#{GEDECL})|(?:#{PEDECL})/um + ENTITYDECL = /\s*(?:#{GEDECL})|\s*(?:#{PEDECL})/um NOTATIONDECL_START = /\A\s* 0 - #STDERR.puts @source.encoding - #STDERR.puts "BUFFER = #{@source.buffer.inspect}" if @document_status == nil word = @source.match( /\A((?:\s+)|(?:<[^>]*>))/um ) word = word[1] unless word.nil? @@ -259,7 +257,7 @@ def pull_event else @document_status = :after_doctype if @source.encoding == "UTF-8" - @source.buffer.force_encoding(::Encoding::UTF_8) + @source.scanner.string = @source.scanner.rest.force_encoding(::Encoding::UTF_8) end end end @@ -274,8 +272,8 @@ def pull_event return [ :elementdecl, @source.match( ELEMENTDECL_PATTERN, true )[1] ] when ENTITY_START - match = @source.match( ENTITYDECL, true ).to_a.compact - match[0] = :entitydecl + match = @source.match( ENTITYDECL, true ) + match = match.nil? ? [:entitydecl] : [:entitydecl, *match.captures.compact.reject(&:empty?)] ref = false if match[1] == '%' ref = true @@ -349,9 +347,9 @@ def pull_event @source.match(/\A\s*/um, true) end begin - @source.read if @source.buffer.size<2 - if @source.buffer[0] == ?< - if @source.buffer[1] == ?/ + @source.read if @source.scanner.rest.size<2 + if @source.scanner.rest[0] == ?< + if @source.scanner.rest[1] == ?/ @nsstack.shift last_tag = @tags.pop md = @source.match( CLOSE_MATCH, true ) @@ -365,9 +363,8 @@ def pull_event raise REXML::ParseException.new(message, @source) end return [ :end_element, last_tag ] - elsif @source.buffer[1] == ?! + elsif @source.scanner.rest[1] == ?! md = @source.match(/\A(\s*[^>]*>)/um) - #STDERR.puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}" raise REXML::ParseException.new("Malformed node", @source) unless md if md[0][2] == ?- md = @source.match( COMMENT_PATTERN, true ) @@ -384,7 +381,7 @@ def pull_event end raise REXML::ParseException.new( "Declarations can only occur "+ "in the doctype declaration.", @source) - elsif @source.buffer[1] == ?? + elsif @source.scanner.rest[1] == ?? return process_instruction else # Get the next tag @@ -392,6 +389,7 @@ def pull_event unless md raise REXML::ParseException.new("malformed XML: missing tag start", @source) end + tag = md[1] @document_status = :in_element prefixes = Set.new prefixes << md[2] if md[2] @@ -405,23 +403,20 @@ def pull_event end if closed - @closed = md[1] + @closed = tag @nsstack.shift else - @tags.push( md[1] ) + @tags.push( tag ) end - return [ :start_element, md[1], attributes ] + return [ :start_element, tag, attributes ] end else md = @source.match( TEXT_PATTERN, true ) + text = md[1] if md[0].length == 0 @source.match( /(\s+)/, true ) end - #STDERR.puts "GOT #{md[1].inspect}" unless md[0].length == 0 - #return [ :text, "" ] if md[0].length == 0 - # unnormalized = Text::unnormalize( md[1], self ) - # return PullEvent.new( :text, md[1], unnormalized ) - return [ :text, md[1] ] + return [ :text, text ] end rescue REXML::UndefinedNamespaceException raise diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb index 54eea54b..aeef7c35 100644 --- a/lib/rexml/source.rb +++ b/lib/rexml/source.rb @@ -31,7 +31,8 @@ def SourceFactory::create_from(arg) class Source include Encoding # The current buffer (what we're going to read next) - attr_reader :buffer + attr_reader :scanner + # The line number of the last consumed text attr_reader :line attr_reader :encoding @@ -41,7 +42,8 @@ class Source # @param encoding if non-null, sets the encoding of the source to this # value, overriding all encoding detection def initialize(arg, encoding=nil) - @orig = @buffer = arg + @orig = arg + @scanner = StringScanner.new(@orig) if encoding self.encoding = encoding else @@ -62,53 +64,56 @@ def read end def match(pattern, cons=false) - md = pattern.match(@buffer) - @buffer = $' if cons and md - return md + if cons + @scanner.scan(pattern) + else + @scanner.check(pattern) + end + @scanner.matched? ? @scanner : nil end # @return true if the Source is exhausted def empty? - @buffer == "" + @scanner.eos? end # @return the current line in the source def current_line lines = @orig.split - res = lines.grep @buffer[0..30] + res = lines.grep @scanner.rest[0..30] res = res[-1] if res.kind_of? Array lines.index( res ) if res end private def detect_encoding - buffer_encoding = @buffer.encoding + orig_encoding = @orig.encoding detected_encoding = "UTF-8" begin - @buffer.force_encoding("ASCII-8BIT") - if @buffer[0, 2] == "\xfe\xff" - @buffer[0, 2] = "" + @orig.force_encoding("ASCII-8BIT") + if @orig[0, 2] == "\xfe\xff" + @orig[0, 2] = "" detected_encoding = "UTF-16BE" - elsif @buffer[0, 2] == "\xff\xfe" - @buffer[0, 2] = "" + elsif @orig[0, 2] == "\xff\xfe" + @orig[0, 2] = "" detected_encoding = "UTF-16LE" - elsif @buffer[0, 3] == "\xef\xbb\xbf" - @buffer[0, 3] = "" + elsif @orig[0, 3] == "\xef\xbb\xbf" + @orig[0, 3] = "" detected_encoding = "UTF-8" end ensure - @buffer.force_encoding(buffer_encoding) + @orig.force_encoding(orig_encoding) end self.encoding = detected_encoding end def encoding_updated if @encoding != 'UTF-8' - @buffer = decode(@buffer) + @scanner.string = decode(@scanner.rest) @to_utf = true else @to_utf = false - @buffer.force_encoding ::Encoding::UTF_8 + @scanner.string = @scanner.rest.force_encoding(::Encoding::UTF_8) end end end @@ -131,7 +136,7 @@ def initialize(arg, block_size=500, encoding=nil) end if !@to_utf and - @buffer.respond_to?(:force_encoding) and + @orig.respond_to?(:force_encoding) and @source.respond_to?(:external_encoding) and @source.external_encoding != ::Encoding::UTF_8 @force_utf8 = true @@ -142,26 +147,32 @@ def initialize(arg, block_size=500, encoding=nil) def read begin - @buffer << readline + @scanner << readline rescue Exception, NameError @source = nil end end def match( pattern, cons=false ) - rv = pattern.match(@buffer) - @buffer = $' if cons and rv - while !rv and @source + if cons + @scanner.scan(pattern) + else + @scanner.check(pattern) + end + while !@scanner.matched? and @source begin - @buffer << readline - rv = pattern.match(@buffer) - @buffer = $' if cons and rv + @scanner << readline + if cons + @scanner.scan(pattern) + else + @scanner.check(pattern) + end rescue @source = nil end end - rv.taint if RUBY_VERSION < '2.7' - rv + @scanner.taint if RUBY_VERSION < '2.7' + @scanner.matched? ? @scanner : nil end def empty? @@ -218,7 +229,7 @@ def encoding_updated @source.set_encoding(@encoding, @encoding) end @line_break = encode(">") - @pending_buffer, @buffer = @buffer, "" + @pending_buffer, @scanner.string = @scanner.rest, "" @pending_buffer.force_encoding(@encoding) super end