diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index 8fe287a7..0b48ec14 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -158,6 +158,9 @@ module Private DEFAULT_ENTITIES_PATTERNS[term] = /&#{term};/ end XML_PREFIXED_NAMESPACE = "http://www.w3.org/XML/1998/namespace" + EXTERNAL_ID_PUBLIC_PATTERN = /\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}/um + EXTERNAL_ID_SYSTEM_PATTERN = /\s+#{SYSTEMLITERAL}/um + PUBLIC_ID_PATTERN = /\s+#{PUBIDLITERAL}/um end private_constant :Private @@ -307,7 +310,6 @@ def pull_event @source.ensure_buffer else id = parse_id(base_error_message, - accept_external_id: true, accept_public_id: false) if id[0] == "SYSTEM" # For backward compatibility @@ -409,7 +411,6 @@ def pull_event end name = parse_name(base_error_message) id = parse_id(base_error_message, - accept_external_id: true, accept_public_id: true) @source.skip_spaces unless @source.match?(">", true) @@ -667,68 +668,41 @@ def parse_name(base_error_message) end def parse_id(base_error_message, - accept_external_id:, accept_public_id:) - if accept_external_id and (md = @source.match(EXTERNAL_ID_PUBLIC, true)) - pubid = system = nil - pubid_literal = md[1] - pubid = pubid_literal[1..-2] if pubid_literal # Remove quote - system_literal = md[2] - system = system_literal[1..-2] if system_literal # Remove quote - ["PUBLIC", pubid, system] - elsif accept_public_id and (md = @source.match(PUBLIC_ID, true)) - pubid = system = nil - pubid_literal = md[1] - pubid = pubid_literal[1..-2] if pubid_literal # Remove quote - ["PUBLIC", pubid, nil] - elsif accept_external_id and (md = @source.match(EXTERNAL_ID_SYSTEM, true)) - system = nil - system_literal = md[1] - system = system_literal[1..-2] if system_literal # Remove quote - ["SYSTEM", nil, system] - else - details = parse_id_invalid_details(accept_external_id: accept_external_id, - accept_public_id: accept_public_id) - message = "#{base_error_message}: #{details}" - raise REXML::ParseException.new(message, @source) - end - end - - def parse_id_invalid_details(accept_external_id:, - accept_public_id:) - public = /\A\s*PUBLIC/um - system = /\A\s*SYSTEM/um - if (accept_external_id or accept_public_id) and @source.match?(/#{public}/um) - if @source.match?(/#{public}(?:\s+[^'"]|\s*[\[>])/um) - return "public ID literal is missing" - end - unless @source.match?(/#{public}\s+#{PUBIDLITERAL}/um) - return "invalid public ID literal" - end - if accept_public_id - if @source.match?(/#{public}\s+#{PUBIDLITERAL}\s+[^'"]/um) - return "system ID literal is missing" - end - unless @source.match?(/#{public}\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}/um) - return "invalid system literal" - end - "garbage after system literal" + @source.skip_spaces + if @source.match?("PUBLIC", true) + if (md = @source.match(Private::EXTERNAL_ID_PUBLIC_PATTERN, true)) + pubid = system = nil + pubid_literal = md[1] + pubid = pubid_literal[1..-2] if pubid_literal # Remove quote + system_literal = md[2] + system = system_literal[1..-2] if system_literal # Remove quote + ["PUBLIC", pubid, system] + elsif accept_public_id and (md = @source.match(Private::PUBLIC_ID_PATTERN, true)) + pubid = system = nil + pubid_literal = md[1] + pubid = pubid_literal[1..-2] if pubid_literal # Remove quote + ["PUBLIC", pubid, nil] + elsif @source.match?(/(?:\s+[^'"]|\s*[\[>])/um) + raise REXML::ParseException.new("#{base_error_message}: public ID literal is missing", @source) + elsif !@source.match?(Private::PUBLIC_ID_PATTERN) + raise REXML::ParseException.new("#{base_error_message}: invalid public ID literal", @source) else - "garbage after public ID literal" + raise REXML::ParseException.new("#{base_error_message}: garbage after public ID literal", @source) end - elsif accept_external_id and @source.match?(/#{system}/um) - if @source.match?(/#{system}(?:\s+[^'"]|\s*[\[>])/um) - return "system literal is missing" - end - unless @source.match?(/#{system}\s+#{SYSTEMLITERAL}/um) - return "invalid system literal" + elsif @source.match?("SYSTEM", true) + if (md = @source.match(Private::EXTERNAL_ID_SYSTEM_PATTERN, true)) + system = nil + system_literal = md[1] + system = system_literal[1..-2] if system_literal # Remove quote + ["SYSTEM", nil, system] + elsif @source.match?(/(?:\s+[^'"]|\s*[\[>])/um) + raise REXML::ParseException.new("#{base_error_message}: system literal is missing", @source) + else + raise REXML::ParseException.new("#{base_error_message}: invalid system literal", @source) end - "garbage after system literal" else - unless @source.match?(/\A\s*(?:PUBLIC|SYSTEM)\s/um) - return "invalid ID type" - end - "ID type is missing" + raise REXML::ParseException.new("#{base_error_message}: invalid ID type", @source) end end diff --git a/test/parse/test_document_type_declaration.rb b/test/parse/test_document_type_declaration.rb index d4658b9e..cc37ad3f 100644 --- a/test/parse/test_document_type_declaration.rb +++ b/test/parse/test_document_type_declaration.rb @@ -153,7 +153,22 @@ def test_no_literal Line: 3 Position: 26 Last 80 unconsumed characters: -SYSTEM> +> + DETAIL + end + + def test_garbage_invalid_system_literal + exception = assert_raise(REXML::ParseException) do + parse(<<-DOCTYPE) + DETAIL end @@ -165,10 +180,10 @@ def test_garbage_after_literal end assert_equal(<<-DETAIL.chomp, exception.to_s) Malformed DOCTYPE: garbage after external ID -Line: 3 -Position: 36 +Line: 1 +Position: 29 Last 80 unconsumed characters: -x'> +x'> DETAIL end @@ -200,7 +215,7 @@ def test_content_double_quote Line: 3 Position: 62 Last 80 unconsumed characters: -PUBLIC 'double quote " is invalid' "r.dtd"> + 'double quote " is invalid' "r.dtd"> DETAIL end @@ -220,6 +235,21 @@ def test_double_quote end class TestSystemLiteral < self + def test_garbage_after_public_ID_literal + exception = assert_raise(REXML::ParseException) do + parse(<<-DOCTYPE) + + DOCTYPE + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Malformed DOCTYPE: garbage after public ID literal +Line: 3 +Position: 54 +Last 80 unconsumed characters: + "public-id-literal" 'system> + DETAIL + end + def test_garbage_after_literal exception = assert_raise(REXML::ParseException) do parse(<<-DOCTYPE) diff --git a/test/parse/test_notation_declaration.rb b/test/parse/test_notation_declaration.rb index 9e81b6a4..11914d37 100644 --- a/test/parse/test_notation_declaration.rb +++ b/test/parse/test_notation_declaration.rb @@ -32,10 +32,10 @@ def test_no_name end assert_equal(<<-DETAIL.chomp, exception.to_s) Malformed notation declaration: name is missing -Line: 5 -Position: 72 +Line: 2 +Position: 62 Last 80 unconsumed characters: - ]> + DETAIL end @@ -62,10 +62,10 @@ def test_no_id_type end assert_equal(<<-DETAIL.chomp, exception.to_s) Malformed notation declaration: invalid ID type -Line: 5 -Position: 77 +Line: 2 +Position: 67 Last 80 unconsumed characters: -> ]> +> DETAIL end @@ -77,10 +77,10 @@ def test_invalid_id_type end assert_equal(<<-DETAIL.chomp, exception.to_s) Malformed notation declaration: invalid ID type -Line: 5 -Position: 85 +Line: 2 +Position: 75 Last 80 unconsumed characters: - INVALID> ]> +INVALID> DETAIL end end @@ -98,7 +98,7 @@ def test_no_literal Line: 5 Position: 84 Last 80 unconsumed characters: - SYSTEM> ]> +> ]> DETAIL end @@ -110,10 +110,10 @@ def test_garbage_after_literal end assert_equal(<<-DETAIL.chomp, exception.to_s) Malformed notation declaration: garbage before end > -Line: 5 -Position: 103 +Line: 2 +Position: 93 Last 80 unconsumed characters: -x'> ]> +x'> DETAIL end @@ -145,7 +145,7 @@ def test_content_double_quote Line: 5 Position: 129 Last 80 unconsumed characters: - PUBLIC 'double quote " is invalid' "system-literal"> ]> + 'double quote " is invalid' "system-literal"> ]> DETAIL end @@ -173,10 +173,10 @@ def test_garbage_after_literal end assert_equal(<<-DETAIL.chomp, exception.to_s) Malformed notation declaration: garbage before end > -Line: 5 -Position: 123 +Line: 2 +Position: 113 Last 80 unconsumed characters: -x'> ]> +x'> DETAIL end @@ -229,7 +229,7 @@ def test_no_literal Line: 5 Position: 84 Last 80 unconsumed characters: - PUBLIC> ]> +> ]> DETAIL end @@ -244,7 +244,7 @@ def test_literal_content_double_quote Line: 5 Position: 128 Last 80 unconsumed characters: - PUBLIC 'double quote \" is invalid in PubidLiteral'> ]> + 'double quote \" is invalid in PubidLiteral'> ]> DETAIL end