diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index aa4cb9b5..7faeb9ba 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -258,7 +258,7 @@ def pull_event else @document_status = :after_doctype if @source.encoding == "UTF-8" - @source.buffer.force_encoding(::Encoding::UTF_8) + @source.scanner.string = @source.scanner.rest.force_encoding(::Encoding::UTF_8) end end end diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb index 9ad62b6a..b9cbc668 100644 --- a/lib/rexml/source.rb +++ b/lib/rexml/source.rb @@ -30,8 +30,7 @@ def SourceFactory::create_from(arg) # objects and provides consumption of text class Source include Encoding - # The current buffer (what we're going to read next) - attr_reader :buffer + # The current scanner (what we're going to read next) attr_reader :scanner # The line number of the last consumed text attr_reader :line @@ -42,8 +41,8 @@ class Source # @param encoding if non-null, sets the encoding of the source to this # value, overriding all encoding detection def initialize(arg, encoding=nil) - @orig = @buffer = arg - @scanner = StringScanner.new(@buffer) + @orig = arg + @scanner = StringScanner.new(@orig) if encoding self.encoding = encoding else @@ -64,10 +63,8 @@ def read end def match(pattern, cons=false) - @scanner.string = @buffer if cons @scanner.scan(pattern) - @buffer = @scanner.rest if @scanner.matched? else @scanner.check(pattern) end @@ -88,34 +85,35 @@ def current_line end private + def detect_encoding - buffer_encoding = @buffer.encoding + scanner_encoding = @scanner.rest.encoding detected_encoding = "UTF-8" begin - @buffer.force_encoding("ASCII-8BIT") - if @buffer[0, 2] == "\xfe\xff" - @buffer[0, 2] = "" + @scanner.string = @scanner.rest.force_encoding("ASCII-8BIT") + if @scanner.rest[0, 2] == "\xfe\xff" + @scanner.string = @scanner.rest.delete_prefix("\xfe\xff") detected_encoding = "UTF-16BE" - elsif @buffer[0, 2] == "\xff\xfe" - @buffer[0, 2] = "" + elsif @scanner.rest[0, 2] == "\xff\xfe" + @scanner.string = @scanner.rest.delete_prefix("\xff\xfe") detected_encoding = "UTF-16LE" - elsif @buffer[0, 3] == "\xef\xbb\xbf" - @buffer[0, 3] = "" + elsif @scanner.rest[0, 3] == "\xef\xbb\xbf" + @scanner.string = @scanner.rest.delete_prefix("\xef\xbb\xbf") detected_encoding = "UTF-8" end ensure - @buffer.force_encoding(buffer_encoding) + @scanner.string = @scanner.rest.force_encoding(scanner_encoding) end self.encoding = detected_encoding end def encoding_updated if @encoding != 'UTF-8' - @buffer = decode(@buffer) + @scanner.string = decode(@scanner.rest) @to_utf = true else @to_utf = false - @buffer.force_encoding ::Encoding::UTF_8 + @scanner.string = @scanner.rest.force_encoding(::Encoding::UTF_8) end end end @@ -138,7 +136,7 @@ def initialize(arg, block_size=500, encoding=nil) end if !@to_utf and - @buffer.respond_to?(:force_encoding) and + @orig.respond_to?(:force_encoding) and @source.respond_to?(:external_encoding) and @source.external_encoding != ::Encoding::UTF_8 @force_utf8 = true @@ -149,32 +147,26 @@ def initialize(arg, block_size=500, encoding=nil) def read begin - @buffer << readline - @scanner.string = @buffer + @scanner.string = @scanner.rest + readline rescue Exception, NameError @source = nil end end def match( pattern, cons=false ) - @scanner.string = @buffer if cons @scanner.scan(pattern) - @buffer = @scanner.rest if @scanner.matched? else @scanner.check(pattern) end while !@scanner.matched? and @source begin - @buffer << readline - @scanner.string = @buffer + @scanner << readline if cons @scanner.scan(pattern) - @buffer = @scanner.rest if @scanner.matched? else @scanner.check(pattern) end - @buffer = @scanner.rest if cons and @scanner.matched? rescue @source = nil end @@ -237,7 +229,7 @@ def encoding_updated @source.set_encoding(@encoding, @encoding) end @line_break = encode(">") - @pending_buffer, @buffer = @buffer, "" + @pending_buffer, @scanner.string = @scanner.rest, "" @pending_buffer.force_encoding(@encoding) super end