From a28beb304840fe725f8b1c00d443ac54fb282afa Mon Sep 17 00:00:00 2001 From: Henry Conklin Date: Thu, 11 Feb 2021 09:33:55 -0600 Subject: [PATCH] Add support for numeric XML entities to XMLParser * Add support for decimal numeric entities to String::xml_unescape * Add more error checks to String::xml_unescape * Refactor XMLParser to use String::xml_unescape instead of an internal implementation --- core/io/xml_parser.cpp | 69 +-------------------- core/io/xml_parser.h | 15 ++--- core/ustring.cpp | 63 ++++++++++++++----- main/tests/test_main.cpp | 6 ++ main/tests/test_string.cpp | 40 ++++++++++++ main/tests/test_xml_parser.cpp | 109 +++++++++++++++++++++++++++++++++ main/tests/test_xml_parser.h | 45 ++++++++++++++ 7 files changed, 255 insertions(+), 92 deletions(-) create mode 100644 main/tests/test_xml_parser.cpp create mode 100644 main/tests/test_xml_parser.h diff --git a/core/io/xml_parser.cpp b/core/io/xml_parser.cpp index a4715a229c11..dbb7a4952109 100644 --- a/core/io/xml_parser.cpp +++ b/core/io/xml_parser.cpp @@ -36,63 +36,6 @@ VARIANT_ENUM_CAST(XMLParser::NodeType); -static bool _equalsn(const CharType *str1, const CharType *str2, int len) { - int i; - for (i = 0; i < len && str1[i] && str2[i]; ++i) { - if (str1[i] != str2[i]) { - return false; - } - } - - // if one (or both) of the strings was smaller then they - // are only equal if they have the same length - return (i == len) || (str1[i] == 0 && str2[i] == 0); -} - -String XMLParser::_replace_special_characters(const String &origstr) { - int pos = origstr.find("&"); - int oldPos = 0; - - if (pos == -1) { - return origstr; - } - - String newstr; - - while (pos != -1 && pos < origstr.length() - 2) { - // check if it is one of the special characters - - int specialChar = -1; - for (int i = 0; i < (int)special_characters.size(); ++i) { - const CharType *p = &origstr[pos] + 1; - - if (_equalsn(&special_characters[i][1], p, special_characters[i].length() - 1)) { - specialChar = i; - break; - } - } - - if (specialChar != -1) { - newstr += (origstr.substr(oldPos, pos - oldPos)); - newstr += (special_characters[specialChar][0]); - pos += special_characters[specialChar].length(); - } else { - newstr += (origstr.substr(oldPos, pos - oldPos + 1)); - pos += 1; - } - - // find next & - oldPos = pos; - pos = origstr.find("&", pos); - } - - if (oldPos < origstr.length() - 1) { - newstr += (origstr.substr(oldPos, origstr.length() - oldPos)); - } - - return newstr; -} - static inline bool _is_white_space(char c) { return (c == ' ' || c == '\t' || c == '\n' || c == '\r'); } @@ -116,7 +59,7 @@ bool XMLParser::_set_text(char *start, char *end) { // set current text to the parsed text, and replace xml special characters String s = String::utf8(start, (int)(end - start)); - node_name = _replace_special_characters(s); + node_name = s.xml_unescape(); // current XML node type is text node_type = NODE_TEXT; @@ -321,7 +264,7 @@ void XMLParser::_parse_opening_xml_element() { String s = String::utf8(attributeValueBegin, (int)(attributeValueEnd - attributeValueBegin)); - attr.value = _replace_special_characters(s); + attr.value = s.xml_unescape(); attributes.push_back(attr); } else { // tag is closed directly @@ -579,14 +522,8 @@ int XMLParser::get_current_line() const { } XMLParser::XMLParser() { - data = nullptr; - close(); - special_characters.push_back("&"); - special_characters.push_back("gt;"); - special_characters.push_back("\"quot;"); - special_characters.push_back("'apos;"); } + XMLParser::~XMLParser() { if (data) { memdelete_arr(data); diff --git a/core/io/xml_parser.h b/core/io/xml_parser.h index 8fecc9cd580b..387c46dc1483 100644 --- a/core/io/xml_parser.h +++ b/core/io/xml_parser.h @@ -65,15 +65,13 @@ class XMLParser : public Reference { }; private: - char *data; - char *P; - uint64_t length; - void unescape(String &p_str); - Vector special_characters; + char *data = nullptr; + char *P = nullptr; + uint64_t length = 0; String node_name; - bool node_empty; - NodeType node_type; - uint64_t node_offset; + bool node_empty = false; + NodeType node_type = NODE_NONE; + uint64_t node_offset = 0; struct Attribute { String name; @@ -82,7 +80,6 @@ class XMLParser : public Reference { Vector attributes; - String _replace_special_characters(const String &origstr); bool _set_text(char *start, char *end); void _parse_closing_xml_element(); void _ignore_definition(); diff --git a/core/ustring.cpp b/core/ustring.cpp index 96459722ca7b..4b7b385b36a6 100644 --- a/core/ustring.cpp +++ b/core/ustring.cpp @@ -3549,29 +3549,58 @@ static _FORCE_INLINE_ int _xml_unescape(const CharType *p_src, int p_src_len, Ch if (p_src_len >= 4 && p_src[1] == '#') { CharType c = 0; - - for (int i = 2; i < p_src_len; i++) { - eat = i + 1; - CharType ct = p_src[i]; - if (ct == ';') { - break; - } else if (ct >= '0' && ct <= '9') { - ct = ct - '0'; - } else if (ct >= 'a' && ct <= 'f') { - ct = (ct - 'a') + 10; - } else if (ct >= 'A' && ct <= 'F') { - ct = (ct - 'A') + 10; - } else { - continue; + bool overflow = false; + if (p_src[2] == 'x') { + // Hex entity &#x; + for (int i = 3; i < p_src_len; i++) { + eat = i + 1; + CharType ct = p_src[i]; + if (ct == ';') { + break; + } else if (ct >= '0' && ct <= '9') { + ct = ct - '0'; + } else if (ct >= 'a' && ct <= 'f') { + ct = (ct - 'a') + 10; + } else if (ct >= 'A' && ct <= 'F') { + ct = (ct - 'A') + 10; + } else { + break; + } + if (c > (WCHAR_MAX >> 4)) { + overflow = true; + break; + } + c <<= 4; + c |= ct; + } + } else { + // Decimal entity &#; + for (int i = 2; i < p_src_len; i++) { + eat = i + 1; + CharType ct = p_src[i]; + if (ct == ';' || ct < '0' || ct > '9') { + break; + } + } + if (p_src[eat - 1] == ';') { + int64_t val = String::to_int(p_src + 2, eat - 3); + if (val > 0 && val <= WCHAR_MAX) { + c = (CharType)val; + } else { + overflow = true; + } } - c <<= 4; - c |= ct; } + // Value must be non-zero, in the range of char32_t, + // actually end with ';'. If invalid, leave the entity as-is + if (c == '\0' || overflow || p_src[eat - 1] != ';') { + eat = 1; + c = *p_src; + } if (p_dst) { *p_dst = c; } - } else if (p_src_len >= 4 && p_src[1] == 'g' && p_src[2] == 't' && p_src[3] == ';') { if (p_dst) { *p_dst = '>'; diff --git a/main/tests/test_main.cpp b/main/tests/test_main.cpp index dec01cbce0c5..4c7016f0b0d8 100644 --- a/main/tests/test_main.cpp +++ b/main/tests/test_main.cpp @@ -47,6 +47,7 @@ #include "test_render.h" #include "test_shader_lang.h" #include "test_string.h" +#include "test_xml_parser.h" const char **tests_get_names() { static const char *test_names[] = { @@ -65,6 +66,7 @@ const char **tests_get_names() { "gd_bytecode", "ordered_hash_map", "astar", + "xml_parser", nullptr }; @@ -138,6 +140,10 @@ MainLoop *test_main(String p_test, const List &p_args) { return TestAStar::test(); } + if (p_test == "xml_parser") { + return TestXMLParser::test(); + } + print_line("Unknown test: " + p_test); return nullptr; } diff --git a/main/tests/test_string.cpp b/main/tests/test_string.cpp index 3cb6baf81a3a..e00f5e75c6d2 100644 --- a/main/tests/test_string.cpp +++ b/main/tests/test_string.cpp @@ -1123,6 +1123,45 @@ bool test_35() { return state; } +bool test_36() { +#define CHECK(X) \ + if (!(X)) { \ + OS::get_singleton()->print("\tFAIL at %s\n", #X); \ + return false; \ + } else { \ + OS::get_singleton()->print("\tPASS\n"); \ + } + OS::get_singleton()->print("\n\nTest 36: xml unescape\n"); + // Named entities + String input = ""&'<>"; + CHECK(input.xml_unescape() == "\"&\'<>"); + + // Numeric entities + input = "AB"; + CHECK(input.xml_unescape() == "AB"); + + input = "�&x#0;More text"; + String result = input.xml_unescape(); + // Didn't put in a leading NUL and terminate the string + CHECK(input.length() > 0); + CHECK(input[0] != '\0'); + // Entity should be left as-is if invalid + CHECK(input.xml_unescape() == input); + + // Shouldn't consume without ending in a ';' + input = "B"; + CHECK(input.xml_unescape() == input); + input = "A"; + CHECK(input.xml_unescape() == input); + + // Invalid characters should make the entity ignored + input = "ASomeIrrelevantText;"; + CHECK(input.xml_unescape() == input); + input = "BSomeIrrelevantText;"; + CHECK(input.xml_unescape() == input); + return true; +} + typedef bool (*TestFunc)(); TestFunc test_funcs[] = { @@ -1162,6 +1201,7 @@ TestFunc test_funcs[] = { test_33, test_34, test_35, + test_36, nullptr }; diff --git a/main/tests/test_xml_parser.cpp b/main/tests/test_xml_parser.cpp new file mode 100644 index 000000000000..9ab9136936a5 --- /dev/null +++ b/main/tests/test_xml_parser.cpp @@ -0,0 +1,109 @@ +/*************************************************************************/ +/* test_xml_parser.cpp */ +/*************************************************************************/ +/* This file is part of: */ +/* GODOT ENGINE */ +/* https://godotengine.org */ +/*************************************************************************/ +/* Copyright (c) 2007-2021 Juan Linietsky, Ariel Manzur. */ +/* Copyright (c) 2014-2021 Godot Engine contributors (cf. AUTHORS.md). */ +/* */ +/* Permission is hereby granted, free of charge, to any person obtaining */ +/* a copy of this software and associated documentation files (the */ +/* "Software"), to deal in the Software without restriction, including */ +/* without limitation the rights to use, copy, modify, merge, publish, */ +/* distribute, sublicense, and/or sell copies of the Software, and to */ +/* permit persons to whom the Software is furnished to do so, subject to */ +/* the following conditions: */ +/* */ +/* The above copyright notice and this permission notice shall be */ +/* included in all copies or substantial portions of the Software. */ +/* */ +/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */ +/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */ +/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/ +/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY */ +/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, */ +/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */ +/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +/*************************************************************************/ + +#include "test_xml_parser.h" + +#include "core/os/os.h" + +namespace TestXMLParser { +#define CHECK(X) \ + if (!(X)) { \ + OS::get_singleton()->print("\tFAIL at %s\n", #X); \ + return false; \ + } else { \ + OS::get_singleton()->print("\tPASS\n"); \ + } +bool test_1() { + String source = "\ +\ + Text<AB>\ +"; + Vector buff; + for (int i = 0; i < source.length(); i++) { + buff.push_back(source.get(i)); + } + XMLParser parser; + parser.open_buffer(buff); + + // gets parsed as NODE_UNKNOWN + CHECK(parser.read() == OK); + CHECK(parser.get_node_type() == XMLParser::NodeType::NODE_UNKNOWN); + + CHECK(parser.read() == OK); + CHECK(parser.get_node_type() == XMLParser::NodeType::NODE_ELEMENT); + CHECK(parser.get_node_name() == "top"); + CHECK(parser.has_attribute("attr")); + CHECK(parser.get_attribute_value("attr") == "attr value"); + + CHECK(parser.read() == OK); + CHECK(parser.get_node_type() == XMLParser::NodeType::NODE_TEXT); + CHECK(parser.get_node_data().lstrip(" \t") == "Text"); + + CHECK(parser.read() == OK); + CHECK(parser.get_node_type() == XMLParser::NodeType::NODE_ELEMENT_END); + CHECK(parser.get_node_name() == "top"); + + parser.close(); + return true; +} + +typedef bool (*TestFunc)(); +TestFunc test_funcs[] = { + test_1, + nullptr +}; + +MainLoop *test() { + int count = 0; + int passed = 0; + + while (true) { + if (!test_funcs[count]) { + break; + } + bool pass = test_funcs[count](); + if (pass) { + passed++; + } + OS::get_singleton()->print("\t%s\n", pass ? "PASS" : "FAILED"); + + count++; + } + + OS::get_singleton()->print("\n\n\n"); + OS::get_singleton()->print("*************\n"); + OS::get_singleton()->print("***TOTALS!***\n"); + OS::get_singleton()->print("*************\n"); + + OS::get_singleton()->print("Passed %i of %i tests\n", passed, count); + + return nullptr; +} +} // namespace TestXMLParser diff --git a/main/tests/test_xml_parser.h b/main/tests/test_xml_parser.h new file mode 100644 index 000000000000..97823f927df3 --- /dev/null +++ b/main/tests/test_xml_parser.h @@ -0,0 +1,45 @@ +/*************************************************************************/ +/* test_xml_parser.h */ +/*************************************************************************/ +/* This file is part of: */ +/* GODOT ENGINE */ +/* https://godotengine.org */ +/*************************************************************************/ +/* Copyright (c) 2007-2021 Juan Linietsky, Ariel Manzur. */ +/* Copyright (c) 2014-2021 Godot Engine contributors (cf. AUTHORS.md). */ +/* */ +/* Permission is hereby granted, free of charge, to any person obtaining */ +/* a copy of this software and associated documentation files (the */ +/* "Software"), to deal in the Software without restriction, including */ +/* without limitation the rights to use, copy, modify, merge, publish, */ +/* distribute, sublicense, and/or sell copies of the Software, and to */ +/* permit persons to whom the Software is furnished to do so, subject to */ +/* the following conditions: */ +/* */ +/* The above copyright notice and this permission notice shall be */ +/* included in all copies or substantial portions of the Software. */ +/* */ +/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */ +/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */ +/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/ +/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY */ +/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, */ +/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */ +/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +/*************************************************************************/ + +#ifndef TEST_XML_PARSER_H +#define TEST_XML_PARSER_H + +#include + +#include "core/io/xml_parser.h" +#include "core/os/main_loop.h" +#include "core/ustring.h" +#include "core/vector.h" + +namespace TestXMLParser { + +MainLoop *test(); +} +#endif // TEST_XML_PARSER_H