diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index ca29b4a9..935cbd9e 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -50,7 +50,6 @@ class BaseParser DOCTYPE_START = /\A\s*/um - DOCTYPE_PATTERN = /\s*)/um ATTRIBUTE_PATTERN = /\s*(#{QNAME_STR})\s*=\s*(["'])(.*?)\4/um COMMENT_START = /\A/um @@ -69,7 +68,6 @@ class BaseParser STANDALONE = /\bstandalone\s*=\s*["'](.*?)['"]/um ENTITY_START = /\A\s*/um SYSTEMENTITY = /\A\s*(%.*?;)\s*$/um @@ -101,8 +99,9 @@ class BaseParser ENTITYDECL = /\s*(?:#{GEDECL})|(?:#{PEDECL})/um NOTATIONDECL_START = /\A\s*/um - SYSTEM = /\A\s*/um + EXTERNAL_ID_PUBLIC = /\A\s*PUBLIC\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}\s*/um + EXTERNAL_ID_SYSTEM = /\A\s*SYSTEM\s+#{SYSTEMLITERAL}\s*/um + PUBLIC_ID = /\A\s*PUBLIC\s+#{PUBIDLITERAL}\s*/um EREFERENCE = /&(?!#{NAME};)/ @@ -225,24 +224,37 @@ def pull_event when INSTRUCTION_START return process_instruction when DOCTYPE_START - md = @source.match( DOCTYPE_PATTERN, true ) + base_error_message = "Malformed DOCTYPE" + @source.match(DOCTYPE_START, true) @nsstack.unshift(curr_ns=Set.new) - identity = md[1] - close = md[2] - identity =~ IDENTITY - name = $1 - raise REXML::ParseException.new("DOCTYPE is missing a name") if name.nil? - pub_sys = $2.nil? ? nil : $2.strip - long_name = $4.nil? ? nil : $4.strip - uri = $6.nil? ? nil : $6.strip - args = [ :start_doctype, name, pub_sys, long_name, uri ] - if close == ">" + name = parse_name(base_error_message) + if @source.match(/\A\s*\[/um, true) + id = [nil, nil, nil] + @document_status = :in_doctype + elsif @source.match(/\A\s*>/um, true) + id = [nil, nil, nil] @document_status = :after_doctype - @source.read if @source.buffer.size<2 - md = @source.match(/^\s*/um, true) - @stack << [ :end_doctype ] else - @document_status = :in_doctype + id = parse_id(base_error_message, + accept_external_id: true, + accept_public_id: false) + if id[0] == "SYSTEM" + # For backward compatibility + id[1], id[2] = id[2], nil + end + if @source.match(/\A\s*\[/um, true) + @document_status = :in_doctype + elsif @source.match(/\A\s*>/um, true) + @document_status = :after_doctype + else + message = "#{base_error_message}: garbage after external ID" + raise REXML::ParseException.new(message, @source) + end + end + args = [:start_doctype, name, *id] + if @document_status == :after_doctype + @source.match(/\A\s*/um, true) + @stack << [ :end_doctype ] end return args when /^\s+/ @@ -313,27 +325,24 @@ def pull_event end return [ :attlistdecl, element, pairs, contents ] when NOTATIONDECL_START - md = nil - if @source.match( PUBLIC ) - md = @source.match( PUBLIC, true ) - pubid = system = nil - pubid_literal = md[3] - pubid = pubid_literal[1..-2] if pubid_literal # Remove quote - system_literal = md[4] - system = system_literal[1..-2] if system_literal # Remove quote - vals = [md[1], md[2], pubid, system] - elsif @source.match( SYSTEM ) - md = @source.match( SYSTEM, true ) - system = nil - system_literal = md[3] - system = system_literal[1..-2] if system_literal # Remove quote - vals = [md[1], md[2], nil, system] - else - details = notation_decl_invalid_details - message = "Malformed notation declaration: #{details}" + base_error_message = "Malformed notation declaration" + unless @source.match(/\A\s*/um) + message = "#{base_error_message}: name is missing" + else + message = "#{base_error_message}: invalid declaration name" + end + raise REXML::ParseException.new(message, @source) + end + name = parse_name(base_error_message) + id = parse_id(base_error_message, + accept_external_id: true, + accept_public_id: true) + unless @source.match(/\A\s*>/um, true) + message = "#{base_error_message}: garbage before end >" raise REXML::ParseException.new(message, @source) end - return [ :notationdecl, *vals ] + return [:notationdecl, name, *id] when DOCTYPE_END @document_status = :after_doctype @source.match( DOCTYPE_END, true ) @@ -488,6 +497,85 @@ def need_source_encoding_update?(xml_declaration_encoding) true end + def parse_name(base_error_message) + md = @source.match(/\A\s*#{NAME}/um, true) + unless md + if @source.match(/\A\s*\S/um) + message = "#{base_error_message}: invalid name" + else + message = "#{base_error_message}: name is missing" + end + raise REXML::ParseException.new(message, @source) + end + md[1] + end + + def parse_id(base_error_message, + accept_external_id:, + accept_public_id:) + if accept_external_id and (md = @source.match(EXTERNAL_ID_PUBLIC, true)) + pubid = system = nil + pubid_literal = md[1] + pubid = pubid_literal[1..-2] if pubid_literal # Remove quote + system_literal = md[2] + system = system_literal[1..-2] if system_literal # Remove quote + ["PUBLIC", pubid, system] + elsif accept_public_id and (md = @source.match(PUBLIC_ID, true)) + pubid = system = nil + pubid_literal = md[1] + pubid = pubid_literal[1..-2] if pubid_literal # Remove quote + ["PUBLIC", pubid, nil] + elsif accept_external_id and (md = @source.match(EXTERNAL_ID_SYSTEM, true)) + system = nil + system_literal = md[1] + system = system_literal[1..-2] if system_literal # Remove quote + ["SYSTEM", nil, system] + else + details = parse_id_invalid_details(accept_external_id: accept_external_id, + accept_public_id: accept_public_id) + message = "#{base_error_message}: #{details}" + raise REXML::ParseException.new(message, @source) + end + end + + def parse_id_invalid_details(accept_external_id:, + accept_public_id:) + public = /\A\s*PUBLIC/um + system = /\A\s*SYSTEM/um + if (accept_external_id or accept_public_id) and @source.match(/#{public}/um) + if @source.match(/#{public}(?:\s+[^'"]|\s*[\[>])/um) + return "public ID literal is missing" + end + unless @source.match(/#{public}\s+#{PUBIDLITERAL}/um) + return "invalid public ID literal" + end + if accept_public_id + if @source.match(/#{public}\s+#{PUBIDLITERAL}\s+[^'"]/um) + return "system ID literal is missing" + end + unless @source.match(/#{public}\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}/um) + return "invalid system literal" + end + "garbage after system literal" + else + "garbage after public ID literal" + end + elsif accept_external_id and @source.match(/#{system}/um) + if @source.match(/#{system}(?:\s+[^'"]|\s*[\[>])/um) + return "system literal is missing" + end + unless @source.match(/#{system}\s+#{SYSTEMLITERAL}/um) + return "invalid system literal" + end + "garbage after system literal" + else + unless @source.match(/\A\s*(?:PUBLIC|SYSTEM)\s/um) + return "invalid ID type" + end + "ID type is missing" + end + end + def process_instruction match_data = @source.match(INSTRUCTION_PATTERN, true) unless match_data @@ -580,42 +668,6 @@ def parse_attributes(prefixes, curr_ns) end return attributes, closed end - - def notation_decl_invalid_details - name = /#{NOTATIONDECL_START}\s+#{NAME}/um - public = /#{name}\s+PUBLIC/um - system = /#{name}\s+SYSTEM/um - if @source.match(/#{NOTATIONDECL_START}\s*>/um) - return "name is missing" - elsif not @source.match(/#{name}[\s>]/um) - return "invalid name" - elsif @source.match(/#{name}\s*>/um) - return "ID type is missing" - elsif not @source.match(/#{name}\s+(?:PUBLIC|SYSTEM)[\s>]/um) - return "invalid ID type" - elsif @source.match(/#{public}/um) - if @source.match(/#{public}\s*>/um) - return "public ID literal is missing" - elsif not @source.match(/#{public}\s+#{PUBIDLITERAL}/um) - return "invalid public ID literal" - elsif @source.match(/#{public}\s+#{PUBIDLITERAL}[^\s>]/um) - return "garbage after public ID literal" - elsif not @source.match(/#{public}\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}/um) - return "invalid system literal" - elsif not @source.match(/#{public}\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}\s*>/um) - return "garbage after system literal" - end - elsif @source.match(/#{system}/um) - if @source.match(/#{system}\s*>/um) - return "system literal is missing" - elsif not @source.match(/#{system}\s+#{SYSTEMLITERAL}/um) - return "invalid system literal" - elsif not @source.match(/#{system}\s+#{SYSTEMLITERAL}\s*>/um) - return "garbage after system literal" - end - end - "end > is missing" - end end end end diff --git a/test/parse/test_document_type_declaration.rb b/test/parse/test_document_type_declaration.rb index 80f70888..55713909 100644 --- a/test/parse/test_document_type_declaration.rb +++ b/test/parse/test_document_type_declaration.rb @@ -5,17 +5,187 @@ module REXMLTests class TestParseDocumentTypeDeclaration < Test::Unit::TestCase private - def xml(internal_subset) - <<-XML - + def parse(doctype) + REXML::Document.new(<<-XML).doctype +#{doctype} XML end - def parse(internal_subset) - REXML::Document.new(xml(internal_subset)).doctype + class TestName < self + def test_valid + doctype = parse(<<-DOCTYPE) + + DOCTYPE + assert_equal("r", doctype.name) + end + + def test_garbage_plus_before_name_at_line_start + exception = assert_raise(REXML::ParseException) do + parse(<<-DOCTYPE) + + DOCTYPE + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed DOCTYPE: invalid name +Line: 5 +Position: 51 +Last 80 unconsumed characters: ++ r SYSTEM "urn:x-rexml:test" [ ]> + DETAIL + end + end + + class TestExternalID < self + class TestSystem < self + def test_left_bracket_in_system_literal + doctype = parse(<<-DOCTYPE) + + DOCTYPE + assert_equal([ + "r", + "SYSTEM", + nil, + "urn:x-rexml:[test", + ], + [ + doctype.name, + doctype.external_id, + doctype.public, + doctype.system, + ]) + end + + def test_greater_than_in_system_literal + doctype = parse(<<-DOCTYPE) +test" [ +]> + DOCTYPE + assert_equal([ + "r", + "SYSTEM", + nil, + "urn:x-rexml:>test", + ], + [ + doctype.name, + doctype.external_id, + doctype.public, + doctype.system, + ]) + end + + def test_no_literal + exception = assert_raise(REXML::ParseException) do + parse(<<-DOCTYPE) + + DOCTYPE + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed DOCTYPE: system literal is missing +Line: 3 +Position: 26 +Last 80 unconsumed characters: + SYSTEM> + DETAIL + end + + def test_garbage_after_literal + exception = assert_raise(REXML::ParseException) do + parse(<<-DOCTYPE) + + DOCTYPE + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed DOCTYPE: garbage after external ID +Line: 3 +Position: 36 +Last 80 unconsumed characters: +x'> + DETAIL + end + + def test_single_quote + doctype = parse(<<-DOCTYPE) + + DOCTYPE + assert_equal("r\".dtd", doctype.system) + end + + def test_double_quote + doctype = parse(<<-DOCTYPE) + + DOCTYPE + assert_equal("r'.dtd", doctype.system) + end + end + + class TestPublic < self + class TestPublicIDLiteral < self + def test_content_double_quote + exception = assert_raise(REXML::ParseException) do + parse(<<-DOCTYPE) + + DOCTYPE + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed DOCTYPE: invalid public ID literal +Line: 3 +Position: 62 +Last 80 unconsumed characters: + PUBLIC 'double quote " is invalid' "r.dtd"> + DETAIL + end + + def test_single_quote + doctype = parse(<<-DOCTYPE) + + DOCTYPE + assert_equal("public-id-literal", doctype.public) + end + + def test_double_quote + doctype = parse(<<-DOCTYPE) + + DOCTYPE + assert_equal("public'-id-literal", doctype.public) + end + end + + class TestSystemLiteral < self + def test_garbage_after_literal + exception = assert_raise(REXML::ParseException) do + parse(<<-DOCTYPE) + + DOCTYPE + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed DOCTYPE: garbage after external ID +Line: 3 +Position: 65 +Last 80 unconsumed characters: +x'> + DETAIL + end + + def test_single_quote + doctype = parse(<<-DOCTYPE) + + DOCTYPE + assert_equal("system\"-literal", doctype.system) + end + + def test_double_quote + doctype = parse(<<-DOCTYPE) + + DOCTYPE + assert_equal("system'-literal", doctype.system) + end + end + end end class TestMixed < self @@ -45,6 +215,15 @@ def test_notation_attlist assert_equal([REXML::NotationDecl, REXML::AttlistDecl], doctype.children.collect(&:class)) end + + private + def parse(internal_subset) + super(<<-DOCTYPE) + + DOCTYPE + end end end end diff --git a/test/parse/test_notation_declaration.rb b/test/parse/test_notation_declaration.rb index fbd29e2a..19a0536d 100644 --- a/test/parse/test_notation_declaration.rb +++ b/test/parse/test_notation_declaration.rb @@ -50,7 +50,7 @@ def test_invalid_name Line: 5 Position: 74 Last 80 unconsumed characters: - ]> +'> ]> DETAIL end @@ -61,11 +61,11 @@ def test_no_id_type INTERNAL_SUBSET end assert_equal(<<-DETAIL.chomp, exception.to_s) -Malformed notation declaration: ID type is missing +Malformed notation declaration: invalid ID type Line: 5 Position: 77 Last 80 unconsumed characters: - ]> +> ]> DETAIL end @@ -80,7 +80,7 @@ def test_invalid_id_type Line: 5 Position: 85 Last 80 unconsumed characters: - ]> + INVALID> ]> DETAIL end end @@ -98,7 +98,7 @@ def test_no_literal Line: 5 Position: 84 Last 80 unconsumed characters: - ]> + SYSTEM> ]> DETAIL end @@ -109,11 +109,11 @@ def test_garbage_after_literal INTERNAL_SUBSET end assert_equal(<<-DETAIL.chomp, exception.to_s) -Malformed notation declaration: garbage after system literal +Malformed notation declaration: garbage before end > Line: 5 Position: 103 Last 80 unconsumed characters: - ]> +x'> ]> DETAIL end @@ -145,7 +145,7 @@ def test_content_double_quote Line: 5 Position: 129 Last 80 unconsumed characters: - ]> + PUBLIC 'double quote " is invalid' "system-literal"> ]> DETAIL end @@ -172,11 +172,11 @@ def test_garbage_after_literal INTERNAL_SUBSET end assert_equal(<<-DETAIL.chomp, exception.to_s) -Malformed notation declaration: garbage after system literal +Malformed notation declaration: garbage before end > Line: 5 Position: 123 Last 80 unconsumed characters: - ]> +x'> ]> DETAIL end @@ -229,7 +229,7 @@ def test_no_literal Line: 5 Position: 84 Last 80 unconsumed characters: - ]> + PUBLIC> ]> DETAIL end @@ -244,7 +244,7 @@ def test_literal_content_double_quote Line: 5 Position: 128 Last 80 unconsumed characters: - ]> + PUBLIC 'double quote \" is invalid in PubidLiteral'> ]> DETAIL end @@ -255,11 +255,11 @@ def test_garbage_after_literal INTERNAL_SUBSET end assert_equal(<<-DETAIL.chomp, exception.to_s) -Malformed notation declaration: garbage after public ID literal +Malformed notation declaration: garbage before end > Line: 5 Position: 106 Last 80 unconsumed characters: - ]> +x'> ]> DETAIL end