diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb
index ca29b4a9..935cbd9e 100644
--- a/lib/rexml/parsers/baseparser.rb
+++ b/lib/rexml/parsers/baseparser.rb
@@ -50,7 +50,6 @@ class BaseParser
DOCTYPE_START = /\A\s*/um
- DOCTYPE_PATTERN = /\s*)/um
ATTRIBUTE_PATTERN = /\s*(#{QNAME_STR})\s*=\s*(["'])(.*?)\4/um
COMMENT_START = /\A/um
@@ -69,7 +68,6 @@ class BaseParser
STANDALONE = /\bstandalone\s*=\s*["'](.*?)['"]/um
ENTITY_START = /\A\s*/um
SYSTEMENTITY = /\A\s*(%.*?;)\s*$/um
@@ -101,8 +99,9 @@ class BaseParser
ENTITYDECL = /\s*(?:#{GEDECL})|(?:#{PEDECL})/um
NOTATIONDECL_START = /\A\s*/um
- SYSTEM = /\A\s*/um
+ EXTERNAL_ID_PUBLIC = /\A\s*PUBLIC\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}\s*/um # PUBLIC keyword, then a public ID literal, then a system literal; parse_id reads the two literals as captures 1 and 2
+ EXTERNAL_ID_SYSTEM = /\A\s*SYSTEM\s+#{SYSTEMLITERAL}\s*/um # SYSTEM keyword, then a system literal; parse_id reads the literal as capture 1
+ PUBLIC_ID = /\A\s*PUBLIC\s+#{PUBIDLITERAL}\s*/um # PUBLIC keyword with a public ID literal only (no system literal); parse_id reads the literal as capture 1
EREFERENCE = /&(?!#{NAME};)/
@@ -225,24 +224,37 @@ def pull_event
when INSTRUCTION_START
return process_instruction
when DOCTYPE_START
- md = @source.match( DOCTYPE_PATTERN, true )
+ base_error_message = "Malformed DOCTYPE"
+ @source.match(DOCTYPE_START, true)
@nsstack.unshift(curr_ns=Set.new)
- identity = md[1]
- close = md[2]
- identity =~ IDENTITY
- name = $1
- raise REXML::ParseException.new("DOCTYPE is missing a name") if name.nil?
- pub_sys = $2.nil? ? nil : $2.strip
- long_name = $4.nil? ? nil : $4.strip
- uri = $6.nil? ? nil : $6.strip
- args = [ :start_doctype, name, pub_sys, long_name, uri ]
- if close == ">"
+ name = parse_name(base_error_message)
+ if @source.match(/\A\s*\[/um, true)
+ id = [nil, nil, nil]
+ @document_status = :in_doctype
+ elsif @source.match(/\A\s*>/um, true)
+ id = [nil, nil, nil]
@document_status = :after_doctype
- @source.read if @source.buffer.size<2
- md = @source.match(/^\s*/um, true)
- @stack << [ :end_doctype ]
else
- @document_status = :in_doctype
+ id = parse_id(base_error_message,
+ accept_external_id: true,
+ accept_public_id: false)
+ if id[0] == "SYSTEM"
+ # For backward compatibility
+ id[1], id[2] = id[2], nil
+ end
+ if @source.match(/\A\s*\[/um, true)
+ @document_status = :in_doctype
+ elsif @source.match(/\A\s*>/um, true)
+ @document_status = :after_doctype
+ else
+ message = "#{base_error_message}: garbage after external ID"
+ raise REXML::ParseException.new(message, @source)
+ end
+ end
+ args = [:start_doctype, name, *id]
+ if @document_status == :after_doctype
+ @source.match(/\A\s*/um, true)
+ @stack << [ :end_doctype ]
end
return args
when /^\s+/
@@ -313,27 +325,24 @@ def pull_event
end
return [ :attlistdecl, element, pairs, contents ]
when NOTATIONDECL_START
- md = nil
- if @source.match( PUBLIC )
- md = @source.match( PUBLIC, true )
- pubid = system = nil
- pubid_literal = md[3]
- pubid = pubid_literal[1..-2] if pubid_literal # Remove quote
- system_literal = md[4]
- system = system_literal[1..-2] if system_literal # Remove quote
- vals = [md[1], md[2], pubid, system]
- elsif @source.match( SYSTEM )
- md = @source.match( SYSTEM, true )
- system = nil
- system_literal = md[3]
- system = system_literal[1..-2] if system_literal # Remove quote
- vals = [md[1], md[2], nil, system]
- else
- details = notation_decl_invalid_details
- message = "Malformed notation declaration: #{details}"
+ base_error_message = "Malformed notation declaration"
+ unless @source.match(/\A\s*/um)
+ message = "#{base_error_message}: name is missing"
+ else
+ message = "#{base_error_message}: invalid declaration name"
+ end
+ raise REXML::ParseException.new(message, @source)
+ end
+ name = parse_name(base_error_message)
+ id = parse_id(base_error_message,
+ accept_external_id: true,
+ accept_public_id: true)
+ unless @source.match(/\A\s*>/um, true)
+ message = "#{base_error_message}: garbage before end >"
raise REXML::ParseException.new(message, @source)
end
- return [ :notationdecl, *vals ]
+ return [:notationdecl, name, *id]
when DOCTYPE_END
@document_status = :after_doctype
@source.match( DOCTYPE_END, true )
@@ -488,6 +497,85 @@ def need_source_encoding_update?(xml_declaration_encoding)
true
end
+ def parse_name(base_error_message) # Matches optional whitespace plus NAME at the start of @source and returns capture 1; raises REXML::ParseException prefixed with base_error_message otherwise
+ md = @source.match(/\A\s*#{NAME}/um, true) # NOTE(review): the second argument true presumably consumes the matched text - confirm against the source class
+ unless md
+ if @source.match(/\A\s*\S/um) # a non-whitespace character is present, but it does not start a NAME
+ message = "#{base_error_message}: invalid name"
+ else # nothing but whitespace (or nothing at all) could be matched
+ message = "#{base_error_message}: name is missing"
+ end
+ raise REXML::ParseException.new(message, @source)
+ end
+ md[1] # assumes NAME contributes the first capture group - TODO confirm against its definition
+ end
+
+ def parse_id(base_error_message, # Parses an ID at the start of @source and returns [keyword, public_id, system_id]; raises REXML::ParseException prefixed with base_error_message when no accepted form matches
+ accept_external_id:, # when truthy, the EXTERNAL_ID_PUBLIC and EXTERNAL_ID_SYSTEM forms are tried
+ accept_public_id:) # when truthy, the PUBLIC_ID form (public ID literal only) is tried
+ if accept_external_id and (md = @source.match(EXTERNAL_ID_PUBLIC, true)) # tried before PUBLIC_ID, which would also match the leading part of this longer form
+ pubid = system = nil
+ pubid_literal = md[1]
+ pubid = pubid_literal[1..-2] if pubid_literal # Remove quote
+ system_literal = md[2]
+ system = system_literal[1..-2] if system_literal # Remove quote
+ ["PUBLIC", pubid, system]
+ elsif accept_public_id and (md = @source.match(PUBLIC_ID, true))
+ pubid = system = nil
+ pubid_literal = md[1]
+ pubid = pubid_literal[1..-2] if pubid_literal # Remove quote
+ ["PUBLIC", pubid, nil] # no system literal in this form, so the third slot is nil
+ elsif accept_external_id and (md = @source.match(EXTERNAL_ID_SYSTEM, true))
+ system = nil
+ system_literal = md[1]
+ system = system_literal[1..-2] if system_literal # Remove quote
+ ["SYSTEM", nil, system] # no public ID in this form, so the second slot is nil
+ else # none of the accepted forms matched: ask parse_id_invalid_details for the reason and raise
+ details = parse_id_invalid_details(accept_external_id: accept_external_id,
+ accept_public_id: accept_public_id)
+ message = "#{base_error_message}: #{details}"
+ raise REXML::ParseException.new(message, @source)
+ end
+ end
+
+ def parse_id_invalid_details(accept_external_id:, # Returns a short reason string describing why no ID form matched at the start of @source; parse_id appends it to its error message
+ accept_public_id:)
+ public = /\A\s*PUBLIC/um # prefix pattern reused by the PUBLIC checks below
+ system = /\A\s*SYSTEM/um # prefix pattern reused by the SYSTEM checks below
+ if (accept_external_id or accept_public_id) and @source.match(/#{public}/um) # every match in this method omits the second argument; presumably they only look ahead - confirm
+ if @source.match(/#{public}(?:\s+[^'"]|\s*[\[>])/um) # PUBLIC followed by a non-quote character, or directly by [ or >
+ return "public ID literal is missing"
+ end
+ unless @source.match(/#{public}\s+#{PUBIDLITERAL}/um) # something quoted follows PUBLIC but it is not a valid public ID literal
+ return "invalid public ID literal"
+ end
+ if accept_public_id # NOTE(review): the system literal checks below are gated on accept_public_id rather than accept_external_id - confirm this is the intended flag
+ if @source.match(/#{public}\s+#{PUBIDLITERAL}\s+[^'"]/um) # a non-quote character follows the public ID literal
+ return "system ID literal is missing"
+ end
+ unless @source.match(/#{public}\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}/um)
+ return "invalid system literal"
+ end
+ "garbage after system literal"
+ else
+ "garbage after public ID literal"
+ end
+ elsif accept_external_id and @source.match(/#{system}/um)
+ if @source.match(/#{system}(?:\s+[^'"]|\s*[\[>])/um) # SYSTEM followed by a non-quote character, or directly by [ or >
+ return "system literal is missing"
+ end
+ unless @source.match(/#{system}\s+#{SYSTEMLITERAL}/um)
+ return "invalid system literal"
+ end
+ "garbage after system literal"
+ else # reached when neither keyword branch above applied
+ unless @source.match(/\A\s*(?:PUBLIC|SYSTEM)\s/um) # no PUBLIC or SYSTEM keyword followed by whitespace at the start
+ return "invalid ID type"
+ end
+ "ID type is missing"
+ end
+ end
+
def process_instruction
match_data = @source.match(INSTRUCTION_PATTERN, true)
unless match_data
@@ -580,42 +668,6 @@ def parse_attributes(prefixes, curr_ns)
end
return attributes, closed
end
-
- def notation_decl_invalid_details
- name = /#{NOTATIONDECL_START}\s+#{NAME}/um
- public = /#{name}\s+PUBLIC/um
- system = /#{name}\s+SYSTEM/um
- if @source.match(/#{NOTATIONDECL_START}\s*>/um)
- return "name is missing"
- elsif not @source.match(/#{name}[\s>]/um)
- return "invalid name"
- elsif @source.match(/#{name}\s*>/um)
- return "ID type is missing"
- elsif not @source.match(/#{name}\s+(?:PUBLIC|SYSTEM)[\s>]/um)
- return "invalid ID type"
- elsif @source.match(/#{public}/um)
- if @source.match(/#{public}\s*>/um)
- return "public ID literal is missing"
- elsif not @source.match(/#{public}\s+#{PUBIDLITERAL}/um)
- return "invalid public ID literal"
- elsif @source.match(/#{public}\s+#{PUBIDLITERAL}[^\s>]/um)
- return "garbage after public ID literal"
- elsif not @source.match(/#{public}\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}/um)
- return "invalid system literal"
- elsif not @source.match(/#{public}\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}\s*>/um)
- return "garbage after system literal"
- end
- elsif @source.match(/#{system}/um)
- if @source.match(/#{system}\s*>/um)
- return "system literal is missing"
- elsif not @source.match(/#{system}\s+#{SYSTEMLITERAL}/um)
- return "invalid system literal"
- elsif not @source.match(/#{system}\s+#{SYSTEMLITERAL}\s*>/um)
- return "garbage after system literal"
- end
- end
- "end > is missing"
- end
end
end
end
diff --git a/test/parse/test_document_type_declaration.rb b/test/parse/test_document_type_declaration.rb
index 80f70888..55713909 100644
--- a/test/parse/test_document_type_declaration.rb
+++ b/test/parse/test_document_type_declaration.rb
@@ -5,17 +5,187 @@
module REXMLTests
class TestParseDocumentTypeDeclaration < Test::Unit::TestCase
private
- def xml(internal_subset)
- <<-XML
-
+ def parse(doctype) # Interpolates the given text into an XML heredoc, builds a REXML::Document from it and returns that document's doctype
+ REXML::Document.new(<<-XML).doctype
+#{doctype}
XML
end
- def parse(internal_subset)
- REXML::Document.new(xml(internal_subset)).doctype
+ class TestName < self # Tests for the name part of a DOCTYPE declaration
+ def test_valid # Expects the parsed doctype name to be "r"
+ doctype = parse(<<-DOCTYPE)
+
+ DOCTYPE
+ assert_equal("r", doctype.name)
+ end
+
+ def test_garbage_plus_before_name_at_line_start # Expects REXML::ParseException with "Malformed DOCTYPE: invalid name" and the exact location details below
+ exception = assert_raise(REXML::ParseException) do
+ parse(<<-DOCTYPE)
+
+ DOCTYPE
+ end
+ assert_equal(<<-DETAIL.chomp, exception.to_s)
+Malformed DOCTYPE: invalid name
+Line: 5
+Position: 51
+Last 80 unconsumed characters:
++ r SYSTEM "urn:x-rexml:test" [ ]>
+ DETAIL
+ end
+ end
+
+ class TestExternalID < self
+ class TestSystem < self # Tests for DOCTYPE declarations whose expected external_id is "SYSTEM"
+ def test_left_bracket_in_system_literal # Expects name "r", external_id "SYSTEM", public nil and a system value that contains "["
+ doctype = parse(<<-DOCTYPE)
+
+ DOCTYPE
+ assert_equal([
+ "r",
+ "SYSTEM",
+ nil,
+ "urn:x-rexml:[test",
+ ],
+ [
+ doctype.name,
+ doctype.external_id,
+ doctype.public,
+ doctype.system,
+ ])
+ end
+
+ def test_greater_than_in_system_literal # Expects name "r", external_id "SYSTEM", public nil and a system value that contains ">"
+ doctype = parse(<<-DOCTYPE)
+
+ DOCTYPE
+ assert_equal([
+ "r",
+ "SYSTEM",
+ nil,
+ "urn:x-rexml:>test",
+ ],
+ [
+ doctype.name,
+ doctype.external_id,
+ doctype.public,
+ doctype.system,
+ ])
+ end
+
+ def test_no_literal # Expects REXML::ParseException with "Malformed DOCTYPE: system literal is missing" and the exact location details below
+ exception = assert_raise(REXML::ParseException) do
+ parse(<<-DOCTYPE)
+
+ DOCTYPE
+ end
+ assert_equal(<<-DETAIL.chomp, exception.to_s)
+Malformed DOCTYPE: system literal is missing
+Line: 3
+Position: 26
+Last 80 unconsumed characters:
+ SYSTEM>
+ DETAIL
+ end
+
+ def test_garbage_after_literal # Expects REXML::ParseException with "Malformed DOCTYPE: garbage after external ID" and the exact location details below
+ exception = assert_raise(REXML::ParseException) do
+ parse(<<-DOCTYPE)
+
+ DOCTYPE
+ end
+ assert_equal(<<-DETAIL.chomp, exception.to_s)
+Malformed DOCTYPE: garbage after external ID
+Line: 3
+Position: 36
+Last 80 unconsumed characters:
+x'>
+ DETAIL
+ end
+
+ def test_single_quote # Expects the system value to keep an embedded double quote
+ doctype = parse(<<-DOCTYPE)
+
+ DOCTYPE
+ assert_equal("r\".dtd", doctype.system)
+ end
+
+ def test_double_quote # Expects the system value to keep an embedded single quote
+ doctype = parse(<<-DOCTYPE)
+
+ DOCTYPE
+ assert_equal("r'.dtd", doctype.system)
+ end
+ end
+
+ class TestPublic < self # Tests for DOCTYPE declarations that use a PUBLIC external ID
+ class TestPublicIDLiteral < self # Checks on the public ID literal
+ def test_content_double_quote # Expects REXML::ParseException with "Malformed DOCTYPE: invalid public ID literal" and the exact location details below
+ exception = assert_raise(REXML::ParseException) do
+ parse(<<-DOCTYPE)
+
+ DOCTYPE
+ end
+ assert_equal(<<-DETAIL.chomp, exception.to_s)
+Malformed DOCTYPE: invalid public ID literal
+Line: 3
+Position: 62
+Last 80 unconsumed characters:
+ PUBLIC 'double quote " is invalid' "r.dtd">
+ DETAIL
+ end
+
+ def test_single_quote # Expects the public value "public-id-literal"
+ doctype = parse(<<-DOCTYPE)
+
+ DOCTYPE
+ assert_equal("public-id-literal", doctype.public)
+ end
+
+ def test_double_quote # Expects the public value to keep an embedded single quote
+ doctype = parse(<<-DOCTYPE)
+
+ DOCTYPE
+ assert_equal("public'-id-literal", doctype.public)
+ end
+ end
+
+ class TestSystemLiteral < self # Checks on the system literal that follows the public ID literal
+ def test_garbage_after_literal # Expects REXML::ParseException with "Malformed DOCTYPE: garbage after external ID" and the exact location details below
+ exception = assert_raise(REXML::ParseException) do
+ parse(<<-DOCTYPE)
+
+ DOCTYPE
+ end
+ assert_equal(<<-DETAIL.chomp, exception.to_s)
+Malformed DOCTYPE: garbage after external ID
+Line: 3
+Position: 65
+Last 80 unconsumed characters:
+x'>
+ DETAIL
+ end
+
+ def test_single_quote # Expects the system value to keep an embedded double quote
+ doctype = parse(<<-DOCTYPE)
+
+ DOCTYPE
+ assert_equal("system\"-literal", doctype.system)
+ end
+
+ def test_double_quote # Expects the system value to keep an embedded single quote
+ doctype = parse(<<-DOCTYPE)
+
+ DOCTYPE
+ assert_equal("system'-literal", doctype.system)
+ end
+ end
+ end
end
class TestMixed < self
@@ -45,6 +215,15 @@ def test_notation_attlist
assert_equal([REXML::NotationDecl, REXML::AttlistDecl],
doctype.children.collect(&:class))
end
+
+ private
+ def parse(internal_subset) # Delegates to the inherited parse via super with a DOCTYPE heredoc; the heredoc text is not visible here and presumably embeds internal_subset - confirm
+ super(<<-DOCTYPE)
+
+ DOCTYPE
+ end
end
end
end
diff --git a/test/parse/test_notation_declaration.rb b/test/parse/test_notation_declaration.rb
index fbd29e2a..19a0536d 100644
--- a/test/parse/test_notation_declaration.rb
+++ b/test/parse/test_notation_declaration.rb
@@ -50,7 +50,7 @@ def test_invalid_name
Line: 5
Position: 74
Last 80 unconsumed characters:
- ]>
+'> ]>
DETAIL
end
@@ -61,11 +61,11 @@ def test_no_id_type
INTERNAL_SUBSET
end
assert_equal(<<-DETAIL.chomp, exception.to_s)
-Malformed notation declaration: ID type is missing
+Malformed notation declaration: invalid ID type
Line: 5
Position: 77
Last 80 unconsumed characters:
- ]>
+> ]>
DETAIL
end
@@ -80,7 +80,7 @@ def test_invalid_id_type
Line: 5
Position: 85
Last 80 unconsumed characters:
- ]>
+ INVALID> ]>
DETAIL
end
end
@@ -98,7 +98,7 @@ def test_no_literal
Line: 5
Position: 84
Last 80 unconsumed characters:
- ]>
+ SYSTEM> ]>
DETAIL
end
@@ -109,11 +109,11 @@ def test_garbage_after_literal
INTERNAL_SUBSET
end
assert_equal(<<-DETAIL.chomp, exception.to_s)
-Malformed notation declaration: garbage after system literal
+Malformed notation declaration: garbage before end >
Line: 5
Position: 103
Last 80 unconsumed characters:
- ]>
+x'> ]>
DETAIL
end
@@ -145,7 +145,7 @@ def test_content_double_quote
Line: 5
Position: 129
Last 80 unconsumed characters:
- ]>
+ PUBLIC 'double quote " is invalid' "system-literal"> ]>
DETAIL
end
@@ -172,11 +172,11 @@ def test_garbage_after_literal
INTERNAL_SUBSET
end
assert_equal(<<-DETAIL.chomp, exception.to_s)
-Malformed notation declaration: garbage after system literal
+Malformed notation declaration: garbage before end >
Line: 5
Position: 123
Last 80 unconsumed characters:
- ]>
+x'> ]>
DETAIL
end
@@ -229,7 +229,7 @@ def test_no_literal
Line: 5
Position: 84
Last 80 unconsumed characters:
- ]>
+ PUBLIC> ]>
DETAIL
end
@@ -244,7 +244,7 @@ def test_literal_content_double_quote
Line: 5
Position: 128
Last 80 unconsumed characters:
- ]>
+ PUBLIC 'double quote \" is invalid in PubidLiteral'> ]>
DETAIL
end
@@ -255,11 +255,11 @@ def test_garbage_after_literal
INTERNAL_SUBSET
end
assert_equal(<<-DETAIL.chomp, exception.to_s)
-Malformed notation declaration: garbage after public ID literal
+Malformed notation declaration: garbage before end >
Line: 5
Position: 106
Last 80 unconsumed characters:
- ]>
+x'> ]>
DETAIL
end