diff --git a/src/main/cpp/src/ftos_converter.cuh b/src/main/cpp/src/ftos_converter.cuh index c2fa07377c..f2e8ce0006 100644 --- a/src/main/cpp/src/ftos_converter.cuh +++ b/src/main/cpp/src/ftos_converter.cuh @@ -800,15 +800,15 @@ __device__ inline int to_chars(floating_decimal_64 const v, bool const sign, cha if (sign) { result[index++] = '-'; } uint64_t output = v.mantissa; - uint32_t const olength = decimal_length(output); - int32_t exp = v.exponent + static_cast(olength) - 1; + int32_t const olength = decimal_length(output); + int32_t exp = v.exponent + olength - 1; bool scientificNotation = (exp < -3) || (exp >= 7); // Values in the interval [1E-3, 1E7) are special. if (scientificNotation) { // Print in the format x.xxxxxE-yy. - for (uint32_t i = 0; i < olength - 1; ++i) { - uint32_t const c = output % 10; + for (int i = 0; i < olength - 1; ++i) { + int const c = output % 10; output /= 10; result[index + olength - i] = (char)('0' + c); } @@ -845,7 +845,7 @@ __device__ inline int to_chars(floating_decimal_64 const v, bool const sign, cha output /= 10; index++; } - } else if (exp + 1 >= olength) { + } else if (exp + 1 >= static_cast(olength)) { // Decimal dot is after any of the digits. for (int i = 0; i < olength; i++) { result[index + olength - i - 1] = (char)('0' + output % 10); @@ -880,7 +880,7 @@ __device__ inline int d2s_size(floating_decimal_64 const v, bool const sign) if (sign) { index++; } uint64_t output = v.mantissa; - uint32_t const olength = decimal_length(output); + int32_t const olength = decimal_length(output); int32_t exp = v.exponent + static_cast(olength) - 1; bool scientificNotation = (exp < -3) || (exp >= 7); @@ -920,7 +920,7 @@ __device__ inline int to_chars(floating_decimal_32 const v, bool const sign, cha if (sign) { result[index++] = '-'; } uint32_t output = v.mantissa; - uint32_t const olength = decimal_length(output); + int32_t const olength = decimal_length(output); int32_t exp = v.exponent + olength - 1; bool scientificNotation = (exp < -3) || (exp >= 7); @@ -995,7 +995,7 @@ __device__ inline int f2s_size(floating_decimal_32 const v, bool const sign) if (sign) { index++; } uint32_t output = v.mantissa; - uint32_t const olength = decimal_length(output); + int32_t const olength = decimal_length(output); int32_t exp = v.exponent + olength - 1; bool scientificNotation = (exp < -3) || (exp >= 7); @@ -1149,6 +1149,57 @@ __device__ inline int compute_f2s_size(float value) return f2s_size(v, sign); } +//===== special inf handling for json ===== + +__device__ inline int copy_special_str_json(char* const result, + bool const sign, + bool const exponent, + bool const mantissa) +{ + // no NaN in json + if (exponent) { + if (sign) { + memcpy(result, "\"-Infinity\"", 11); + return 11; + } else { + memcpy(result, "\"Infinity\"", 10); + return 10; + } + } + if (sign) { + memcpy(result, "-0.0", 4); + return 4; + } else { + memcpy(result, "0.0", 3); + return 3; + } +} + +__device__ inline int special_str_size_json(bool const sign, + bool const exponent, + bool const mantissa) +{ + // no NaN in json + if (exponent) { return sign + 10; } + return sign + 3; +} + +__device__ inline int d2s_buffered_n_json(double f, char* result) +{ + bool sign = false, special = false; + floating_decimal_64 v = d2d(f, sign, special); + if (special) { return copy_special_str_json(result, sign, v.exponent, v.mantissa); } + return to_chars(v, sign, result); +} + +__device__ inline int compute_d2s_size_json(double value) +{ + bool sign = false, special = false; + floating_decimal_64 v = d2d(value, sign, special); + if (special) { return special_str_size_json(sign, v.exponent, v.mantissa); } + return d2s_size(v, sign); +} + } // namespace //===== APIs ===== @@ -1223,9 +1274,9 @@ __device__ inline int to_formatted_chars(T const v, bool const sign, char* const using U = std::conditional_t, uint32_t, uint64_t>; int index = 0; if (sign) { result[index++] = '-'; } - U output = v.mantissa; - uint32_t const olength = decimal_length(output); - int32_t exp = v.exponent + static_cast(olength) - 1; + U output = v.mantissa; + int32_t const olength = decimal_length(output); + int32_t exp = v.exponent + static_cast(olength) - 1; if (exp < 0) { // Decimal dot is before any of the digits. int index_for_carrier = index; @@ -1291,7 +1342,7 @@ __device__ inline int to_formatted_chars(T const v, bool const sign, char* const } } else { // 0 <= exp < olength - 1 - uint32_t temp_d = digits, tailing_zero = 0; + int32_t temp_d = digits, tailing_zero = 0; if (exp + digits + 1 > olength) { temp_d = olength - exp - 1; tailing_zero = digits - temp_d; @@ -1301,10 +1352,10 @@ __device__ inline int to_formatted_chars(T const v, bool const sign, char* const U integer = rounded_output / pow10; U decimal = rounded_output % pow10; // calculate integer length after format to cover carry case - uint32_t integer_len = decimal_length(integer); - uint32_t formated_integer_len = index + integer_len + (integer_len - 1) / 3; - uint32_t sep_cnt = 0; - int rev_index = 0; + int32_t integer_len = decimal_length(integer); + int32_t formated_integer_len = index + integer_len + (integer_len - 1) / 3; + int32_t sep_cnt = 0; + int rev_index = 0; for (int i = 0; i < integer_len; i++) { if (sep_cnt == 3) { result[formated_integer_len - (rev_index++) - 1] = ','; @@ -1338,9 +1389,9 @@ __device__ inline int format_size(T const v, bool const sign, int digits) using U = std::conditional_t, uint32_t, uint64_t>; int index = 0; if (sign) { index++; } - U output = v.mantissa; - uint32_t const olength = decimal_length(output); - int32_t exp = v.exponent + static_cast(olength) - 1; + U output = v.mantissa; + int32_t const olength = decimal_length(output); + int32_t exp = v.exponent + static_cast(olength) - 1; if (exp < 0) { index += 2 + digits; } else if (exp + 1 >= olength) { @@ -1424,4 +1475,15 @@ __device__ inline int format_float(double value, int digits, bool is_float, char } } +//===== json_parser utility ===== + +__device__ inline int double_normalization(double value, char* output) +{ + if (output == nullptr) { + return compute_d2s_size_json(value); + } else { + return d2s_buffered_n_json(value, output); + } +} + } // namespace spark_rapids_jni::ftos_converter diff --git a/src/main/cpp/src/get_json_object.hpp b/src/main/cpp/src/get_json_object.hpp index 688e986b4d..e5936b0ed7 100644 --- a/src/main/cpp/src/get_json_object.hpp +++ b/src/main/cpp/src/get_json_object.hpp @@ -46,7 +46,7 @@ enum class write_style { raw_style, quoted_style, flatten_style }; * path instruction */ struct path_instruction { - CUDF_HOST_DEVICE inline path_instruction(path_instruction_type _type) : type(_type) {} + __device__ inline path_instruction(path_instruction_type _type) : type(_type) {} path_instruction_type type; @@ -65,22 +65,20 @@ struct path_instruction { template class json_generator { public: - CUDF_HOST_DEVICE json_generator(char* _output) + __device__ json_generator(char* _output) : output(_output), output_len(0), hide_outer_array_tokens(false) { } - CUDF_HOST_DEVICE json_generator() : output(nullptr), output_len(0), hide_outer_array_tokens(false) - { - } - CUDF_HOST_DEVICE json_generator(char* _output, bool _hide_outer_array_tokens) + __device__ json_generator() : output(nullptr), output_len(0), hide_outer_array_tokens(false) {} + __device__ json_generator(char* _output, bool _hide_outer_array_tokens) : output(_output), output_len(0), hide_outer_array_tokens(_hide_outer_array_tokens) { } - CUDF_HOST_DEVICE json_generator(bool _hide_outer_array_tokens) + __device__ json_generator(bool _hide_outer_array_tokens) : output(nullptr), output_len(0), hide_outer_array_tokens(_hide_outer_array_tokens) { } - CUDF_HOST_DEVICE CUDF_HOST_DEVICE json_generator<>& operator=(const json_generator<>& other) + __device__ json_generator<>& operator=(const json_generator<>& other) { this->output = other.output; this->output_len = other.output_len; @@ -95,7 +93,7 @@ class json_generator { // create a nested child generator based on this parent generator // child generator is a view - CUDF_HOST_DEVICE json_generator new_child_generator(bool hide_outer_array_tokens) + __device__ json_generator new_child_generator(bool hide_outer_array_tokens) { if (nullptr == output) { return json_generator(hide_outer_array_tokens); @@ -104,7 +102,7 @@ class json_generator { } } - CUDF_HOST_DEVICE void write_start_array() + __device__ void write_start_array() { if (!hide_outer_array_tokens) { if (output) { *(output + output_len) = '['; } @@ -119,7 +117,7 @@ class json_generator { } } - CUDF_HOST_DEVICE void write_end_array() + __device__ void write_end_array() { if (!hide_outer_array_tokens) { if (output) { *(output + output_len) = ']'; } @@ -132,15 +130,12 @@ class json_generator { } // return true if it's in a array context and it's not writing the first item. - CUDF_HOST_DEVICE bool need_comma() - { - return (array_depth > 0 && !is_first_item[array_depth - 1]); - } + __device__ bool need_comma() { return (array_depth > 0 && !is_first_item[array_depth - 1]); } /** * write comma accroding to current generator state */ - CUDF_HOST_DEVICE void try_write_comma() + __device__ void try_write_comma() { if (need_comma()) { // in array context and writes first item @@ -154,7 +149,7 @@ class json_generator { * object/array, then copy to corresponding matched end object/array. return * false if JSON format is invalid return true if JSON format is valid */ - CUDF_HOST_DEVICE bool copy_current_structure(json_parser<>& parser) + __device__ bool copy_current_structure(json_parser<>& parser) { // first try add comma try_write_comma(); @@ -181,7 +176,7 @@ class json_generator { * then can not return a pointer and length pair (char *, len), * For number token, JSON parser can return a pair (char *, len) */ - CUDF_HOST_DEVICE void write_raw(json_parser<>& parser) + __device__ void write_raw(json_parser<>& parser) { if (array_depth > 0) { is_first_item[array_depth - 1] = false; } @@ -220,9 +215,9 @@ class json_generator { * @param child_block_begin * @param child_block_len */ - CUDF_HOST_DEVICE void write_child_raw_value(char* child_block_begin, - size_t child_block_len, - bool write_outer_array_tokens) + __device__ void write_child_raw_value(char* child_block_begin, + size_t child_block_len, + bool write_outer_array_tokens) { bool insert_comma = need_comma(); @@ -258,7 +253,7 @@ class json_generator { output_len += child_block_len; } - CUDF_HOST_DEVICE void move_forward(char* begin, size_t len, int forward) + __device__ void move_forward(char* begin, size_t len, int forward) { char* pos = begin + len + forward - 1; char* e = begin + forward - 1; @@ -270,19 +265,19 @@ class json_generator { } } - CUDF_HOST_DEVICE void reset() { output_len = 0; } + __device__ void reset() { output_len = 0; } - CUDF_HOST_DEVICE inline size_t get_output_len() const { return output_len; } - CUDF_HOST_DEVICE inline char* get_output_start_position() const { return output; } - CUDF_HOST_DEVICE inline char* get_current_output_position() const { return output + output_len; } + __device__ inline size_t get_output_len() const { return output_len; } + __device__ inline char* get_output_start_position() const { return output; } + __device__ inline char* get_current_output_position() const { return output + output_len; } /** * generator may contain trash output, e.g.: generator writes some output, * then JSON format is invalid, the previous output becomes trash. */ - CUDF_HOST_DEVICE inline void set_output_len_zero() { output_len = 0; } + __device__ inline void set_output_len_zero() { output_len = 0; } - CUDF_HOST_DEVICE inline void set_output_len(size_t len) { output_len = len; } + __device__ inline void set_output_len(size_t len) { output_len = len; } private: char* output; @@ -297,38 +292,38 @@ class json_generator { * path evaluator which can run on both CPU and GPU */ struct path_evaluator { - static CUDF_HOST_DEVICE inline bool path_is_empty(size_t path_size) { return path_size == 0; } + static __device__ inline bool path_is_empty(size_t path_size) { return path_size == 0; } - static CUDF_HOST_DEVICE inline bool path_match_element(path_instruction const* path_ptr, - size_t path_size, - path_instruction_type path_type0) + static __device__ inline bool path_match_element(path_instruction const* path_ptr, + size_t path_size, + path_instruction_type path_type0) { if (path_size < 1) { return false; } return path_ptr[0].type == path_type0; } - static CUDF_HOST_DEVICE inline bool path_match_elements(path_instruction const* path_ptr, - size_t path_size, - path_instruction_type path_type0, - path_instruction_type path_type1) + static __device__ inline bool path_match_elements(path_instruction const* path_ptr, + size_t path_size, + path_instruction_type path_type0, + path_instruction_type path_type1) { if (path_size < 2) { return false; } return path_ptr[0].type == path_type0 && path_ptr[1].type == path_type1; } - static CUDF_HOST_DEVICE inline bool path_match_elements(path_instruction const* path_ptr, - size_t path_size, - path_instruction_type path_type0, - path_instruction_type path_type1, - path_instruction_type path_type2, - path_instruction_type path_type3) + static __device__ inline bool path_match_elements(path_instruction const* path_ptr, + size_t path_size, + path_instruction_type path_type0, + path_instruction_type path_type1, + path_instruction_type path_type2, + path_instruction_type path_type3) { if (path_size < 4) { return false; } return path_ptr[0].type == path_type0 && path_ptr[1].type == path_type1 && path_ptr[2].type == path_type2 && path_ptr[3].type == path_type3; } - static CUDF_HOST_DEVICE inline thrust::tuple path_match_subscript_index( + static __device__ inline thrust::tuple path_match_subscript_index( path_instruction const* path_ptr, size_t path_size) { auto match = path_match_elements( @@ -340,7 +335,7 @@ struct path_evaluator { } } - static CUDF_HOST_DEVICE inline thrust::tuple path_match_named( + static __device__ inline thrust::tuple path_match_named( path_instruction const* path_ptr, size_t path_size) { auto match = path_match_element(path_ptr, path_size, path_instruction_type::NAMED); @@ -351,8 +346,8 @@ struct path_evaluator { } } - static CUDF_HOST_DEVICE inline thrust::tuple - path_match_subscript_index_subscript_wildcard(path_instruction const* path_ptr, size_t path_size) + static __device__ inline thrust::tuple path_match_subscript_index_subscript_wildcard( + path_instruction const* path_ptr, size_t path_size) { auto match = path_match_elements(path_ptr, path_size, @@ -375,7 +370,7 @@ struct path_evaluator { * is not human friendly. * */ - // static CUDF_HOST_DEVICE bool evaluate_path(json_parser<>& p, + // static __device__ bool evaluate_path(json_parser<>& p, // json_generator<>& g, // write_style style, // path_instruction const* path_ptr, @@ -651,11 +646,11 @@ struct path_evaluator { * This function is rewritten from above commented recursive function. * this function is equivalent to the above commented recursive function. */ - static CUDF_HOST_DEVICE bool evaluate_path(json_parser<>& p, - json_generator<>& root_g, - write_style root_style, - path_instruction const* root_path_ptr, - int root_path_size) + static __device__ bool evaluate_path(json_parser<>& p, + json_generator<>& root_g, + write_style root_style, + path_instruction const* root_path_ptr, + int root_path_size) { // manually maintained context stack in lieu of calling evaluate_path recursively. struct context { @@ -685,7 +680,7 @@ struct path_evaluator { // used to save child JSON generator for case path 8 json_generator<> child_g; - CUDF_HOST_DEVICE context() + __device__ context() : token(json_token::INIT), case_path(-1), g(json_generator<>()), @@ -695,12 +690,12 @@ struct path_evaluator { { } - CUDF_HOST_DEVICE context(json_token _token, - int _case_path, - json_generator<> _g, - write_style _style, - path_instruction const* _path_ptr, - int _path_size) + __device__ context(json_token _token, + int _case_path, + json_generator<> _g, + write_style _style, + path_instruction const* _path_ptr, + int _path_size) : token(_token), case_path(_case_path), g(_g), @@ -710,7 +705,7 @@ struct path_evaluator { { } - CUDF_HOST_DEVICE context& operator=(const context& other) + __device__ context& operator=(const context& other) { token = other.token; case_path = other.case_path; diff --git a/src/main/cpp/src/json_parser.hpp b/src/main/cpp/src/json_parser.hpp index 9eb6f25926..9e9f203ad8 100644 --- a/src/main/cpp/src/json_parser.hpp +++ b/src/main/cpp/src/json_parser.hpp @@ -15,6 +15,9 @@ */ #pragma once +#include "ftos_converter.cuh" + +#include #include #include @@ -180,8 +183,7 @@ template class json_parser { public: - CUDF_HOST_DEVICE inline json_parser(char const* const _json_start_pos, - cudf::size_type const _json_len) + __device__ inline json_parser(char const* const _json_start_pos, cudf::size_type const _json_len) : json_start_pos(_json_start_pos), json_end_pos(_json_start_pos + _json_len), curr_pos(_json_start_pos) @@ -192,12 +194,12 @@ class json_parser { /** * is current position EOF */ - CUDF_HOST_DEVICE inline bool eof(char const* pos) { return pos >= json_end_pos; } + __device__ inline bool eof(char const* pos) { return pos >= json_end_pos; } /** * is hex digits: 0-9, A-F, a-f */ - CUDF_HOST_DEVICE inline bool is_hex_digit(char c) + __device__ inline bool is_hex_digit(char c) { return (c >= '0' && c <= '9') || (c >= 'A' && c <= 'F') || (c >= 'a' && c <= 'f'); } @@ -205,12 +207,12 @@ class json_parser { /** * is 0 to 9 digit */ - CUDF_HOST_DEVICE inline bool is_digit(char c) { return (c >= '0' && c <= '9'); } + __device__ inline bool is_digit(char c) { return (c >= '0' && c <= '9'); } /** * is white spaces: ' ', '\t', '\n' '\r' */ - CUDF_HOST_DEVICE inline bool is_whitespace(char c) + __device__ inline bool is_whitespace(char c) { return c == ' ' || c == '\t' || c == '\n' || c == '\r'; } @@ -218,7 +220,7 @@ class json_parser { /** * skips 4 characters: ' ', '\t', '\n' '\r' */ - CUDF_HOST_DEVICE inline void skip_whitespaces(char const*& pos) + __device__ inline void skip_whitespaces(char const*& pos) { while (!eof(pos) && is_whitespace(*pos)) { pos++; @@ -228,7 +230,7 @@ class json_parser { /** * check current char, if it's expected, then plus the position */ - CUDF_HOST_DEVICE inline bool try_skip(char const*& pos, char expected) + __device__ inline bool try_skip(char const*& pos, char expected) { if (!eof(pos) && *pos == expected) { pos++; @@ -241,7 +243,7 @@ class json_parser { * try to push current context into stack * if nested depth exceeds limitation, return false */ - CUDF_HOST_DEVICE inline bool try_push_context(json_token token) + __device__ inline bool try_push_context(json_token token) { if (stack_size < max_json_nesting_depth) { push_context(token); @@ -254,7 +256,7 @@ class json_parser { /** * record the nested state into stack: JSON object or JSON array */ - CUDF_HOST_DEVICE inline void push_context(json_token token) + __device__ inline void push_context(json_token token) { bool v = json_token::START_OBJECT == token ? true : false; context_stack[stack_size++] = v; @@ -265,17 +267,17 @@ class json_parser { * true is object, false is array * only has two contexts: object or array */ - CUDF_HOST_DEVICE inline bool is_object_context() { return context_stack[stack_size - 1]; } + __device__ inline bool is_object_context() { return context_stack[stack_size - 1]; } /** * pop top context from stack */ - CUDF_HOST_DEVICE inline void pop_curr_context() { stack_size--; } + __device__ inline void pop_curr_context() { stack_size--; } /** * is context stack is empty */ - CUDF_HOST_DEVICE inline bool is_context_stack_empty() { return stack_size == 0; } + __device__ inline bool is_context_stack_empty() { return stack_size == 0; } /** * parse the first value token from current position @@ -285,7 +287,7 @@ class json_parser { * current token is string/num/true/false/null if current value is terminal * current token is ERROR if parse failed */ - CUDF_HOST_DEVICE inline void parse_first_token_in_value() + __device__ inline void parse_first_token_in_value() { // already checked eof char c = *curr_pos; @@ -342,7 +344,7 @@ class json_parser { /** * parse ' quoted string */ - CUDF_HOST_DEVICE inline void parse_single_quoted_string() + __device__ inline void parse_single_quoted_string() { auto [success, end_char_pos] = try_parse_single_quoted_string(curr_pos, nullptr, nullptr, nullptr, write_style::unescaped); @@ -357,7 +359,7 @@ class json_parser { /** * parse " quoted string */ - CUDF_HOST_DEVICE inline void parse_double_quoted_string() + __device__ inline void parse_double_quoted_string() { auto [success, end_char_pos] = try_parse_double_quoted_string(curr_pos, nullptr, nullptr, nullptr, write_style::unescaped); @@ -383,7 +385,7 @@ class json_parser { * @return whether passed successfully and the end position of parsed str * */ - CUDF_HOST_DEVICE inline std::pair try_parse_string( + __device__ inline std::pair try_parse_string( char const* str_pos, char const* to_match_str_pos, char const* const to_match_str_end, @@ -416,7 +418,7 @@ class json_parser { * not copy * */ - CUDF_HOST_DEVICE inline std::pair try_parse_single_quoted_string( + __device__ inline std::pair try_parse_single_quoted_string( char const* str_pos, char const* to_match_str_pos, char const* const to_match_str_end, @@ -445,7 +447,7 @@ class json_parser { * not copy * */ - CUDF_HOST_DEVICE inline std::pair try_parse_double_quoted_string( + __device__ inline std::pair try_parse_double_quoted_string( char const* str_pos, char const* to_match_str_pos, char const* const to_match_str_end, @@ -464,7 +466,7 @@ class json_parser { /** * transform int value from [0, 15] to hex char */ - CUDF_HOST_DEVICE inline char to_hex_char(unsigned int v) + __device__ inline char to_hex_char(unsigned int v) { if (v < 10) return '0' + v; @@ -479,7 +481,7 @@ class json_parser { * @param char to be escaped, c should in range [0, 31) * @param[out] escape output */ - CUDF_HOST_DEVICE inline int escape_char(unsigned char c, char* output) + __device__ inline int escape_char(unsigned char c, char* output) { switch (c) { case 8: @@ -573,7 +575,7 @@ class json_parser { * @param copy_destination copy unescaped str to destination, nullptr means do * not copy */ - CUDF_HOST_DEVICE inline std::pair try_parse_quoted_string( + __device__ inline std::pair try_parse_quoted_string( char const* str_pos, char const quote_char, char const* to_match_str_pos, @@ -675,9 +677,9 @@ class json_parser { return std::make_pair(false, nullptr); } - CUDF_HOST_DEVICE inline bool try_match_char(char const*& char_pos, - char const* const char_end_pos, - char c) + __device__ inline bool try_match_char(char const*& char_pos, + char const* const char_end_pos, + char c) { if (nullptr != char_pos) { if (char_pos < char_end_pos && *char_pos == c) { @@ -696,11 +698,11 @@ class json_parser { * skip the HEX chars in \u HEX HEX HEX HEX. * @return positive escaped ASCII value if success, -1 otherwise */ - CUDF_HOST_DEVICE inline bool try_skip_escape_part(char const*& str_pos, - char const*& to_match_str_pos, - char const* const to_match_str_end, - char*& copy_dest, - write_style w_style) + __device__ inline bool try_skip_escape_part(char const*& str_pos, + char const*& to_match_str_pos, + char const* const to_match_str_end, + char*& copy_dest, + write_style w_style) { // already skipped the first '\' // try skip second part @@ -832,7 +834,7 @@ class json_parser { * : ~ ["\\\u0000-\u001F] * ; */ - CUDF_HOST_DEVICE inline bool try_skip_safe_code_point(char const*& str_pos, char c) + __device__ inline bool try_skip_safe_code_point(char const*& str_pos, char c) { // 1 the char is not quoted(' or ") char, here satisfy, do not need to check // again @@ -852,7 +854,7 @@ class json_parser { /** * convert chars 0-9, a-f, A-F to int value */ - CUDF_HOST_DEVICE inline uint8_t hex_value(char c) + __device__ inline uint8_t hex_value(char c) { if (c >= '0' && c <= '9') return c - '0'; if (c >= 'a' && c <= 'f') return c - 'a' + 10; @@ -863,7 +865,7 @@ class json_parser { /** * parse four HEX chars to unsigned int */ - CUDF_HOST_DEVICE inline cudf::char_utf8 parse_code_point(char const* p) + __device__ inline cudf::char_utf8 parse_code_point(char const* p) { cudf::char_utf8 v = 0; for (size_t i = 0; i < 4; i++) { @@ -878,7 +880,7 @@ class json_parser { * @param character Single character * @return Number of bytes */ - CUDF_HOST_DEVICE cudf::size_type bytes_in_char_utf8(cudf::char_utf8 character) + __device__ cudf::size_type bytes_in_char_utf8(cudf::char_utf8 character) { return 1 + static_cast((character & 0x0000'FF00u) > 0) + static_cast((character & 0x00FF'0000u) > 0) + @@ -891,7 +893,7 @@ class json_parser { * @param unchr Character code-point to convert. * @return Single UTF-8 character. */ - CUDF_HOST_DEVICE cudf::char_utf8 codepoint_to_utf8(uint32_t unchr) + __device__ cudf::char_utf8 codepoint_to_utf8(uint32_t unchr) { cudf::char_utf8 utf8 = 0; if (unchr < 0x0000'0080) { @@ -926,7 +928,7 @@ class json_parser { * @param[out] str Output array. * @return The number of bytes in the character */ - CUDF_HOST_DEVICE cudf::size_type from_char_utf8(cudf::char_utf8 character, char* str) + __device__ cudf::size_type from_char_utf8(cudf::char_utf8 character, char* str) { cudf::size_type const chr_width = bytes_in_char_utf8(character); for (cudf::size_type idx = 0; idx < chr_width; ++idx) { @@ -940,10 +942,10 @@ class json_parser { * try skip 4 HEX chars * in pattern: '\\' 'u' HEX HEX HEX HEX, it's a code point of unicode */ - CUDF_HOST_DEVICE bool try_skip_unicode(char const*& str_pos, - char const*& to_match_str_pos, - char const* const to_match_str_end, - char*& copy_dest) + __device__ bool try_skip_unicode(char const*& str_pos, + char const*& to_match_str_pos, + char const* const to_match_str_end, + char*& copy_dest) { // already parsed u bool is_success = try_skip_hex(str_pos) && try_skip_hex(str_pos) && try_skip_hex(str_pos) && @@ -983,7 +985,7 @@ class json_parser { /** * try skip HEX */ - CUDF_HOST_DEVICE inline bool try_skip_hex(char const*& str_pos) + __device__ inline bool try_skip_hex(char const*& str_pos) { if (!eof(str_pos) && is_hex_digit(*str_pos)) { str_pos++; @@ -1016,7 +1018,7 @@ class json_parser { * invalid number: 0., 0e, 0E * */ - CUDF_HOST_DEVICE inline void parse_number() + __device__ inline void parse_number() { // reset the float parts float_integer_len = 0; @@ -1051,7 +1053,7 @@ class json_parser { * verify max number length if enabled * e.g.: -1.23e-456, int len is 1, fraction len is 2, exp digits len is 3 */ - CUDF_HOST_DEVICE inline bool check_max_num_len() + __device__ inline bool check_max_num_len() { // exp part contains + or - sign char, do not count the exp sign int exp_digit_len = float_exp_len; @@ -1068,7 +1070,7 @@ class json_parser { /** * verify max string length if enabled */ - CUDF_HOST_DEVICE inline bool check_string_max_utf8_bytes() + __device__ inline bool check_string_max_utf8_bytes() { return // disabled str len check @@ -1082,7 +1084,7 @@ class json_parser { * * @param[out] is_float, if contains `.` or `e`, set true */ - CUDF_HOST_DEVICE inline bool try_unsigned_number(bool& is_float) + __device__ inline bool try_unsigned_number(bool& is_float) { if (!eof(curr_pos)) { char c = *curr_pos; @@ -1113,7 +1115,7 @@ class json_parser { * parse: ('.' [0-9]+)? EXP? * @param[is_float] is float */ - CUDF_HOST_DEVICE inline bool parse_number_from_fraction(bool& is_float) + __device__ inline bool parse_number_from_fraction(bool& is_float) { // parse fraction if (try_skip(curr_pos, '.')) { @@ -1138,7 +1140,7 @@ class json_parser { * parse: [0-9]* * skip zero or more [0-9] */ - CUDF_HOST_DEVICE inline int skip_zero_or_more_digits() + __device__ inline int skip_zero_or_more_digits() { int digits = 0; while (!eof(curr_pos)) { @@ -1158,7 +1160,7 @@ class json_parser { * try skip one or more [0-9] * @param[out] len: skipped num of digits */ - CUDF_HOST_DEVICE inline bool try_skip_one_or_more_digits(int& len) + __device__ inline bool try_skip_one_or_more_digits(int& len) { if (!eof(curr_pos) && is_digit(*curr_pos)) { curr_pos++; @@ -1174,7 +1176,7 @@ class json_parser { * parse [eE][+-]?[0-9]+ * @param[out] exp_len exp len */ - CUDF_HOST_DEVICE inline bool try_parse_exp() + __device__ inline bool try_parse_exp() { // already parsed [eE] @@ -1196,7 +1198,7 @@ class json_parser { /** * parse true */ - CUDF_HOST_DEVICE inline void parse_true() + __device__ inline void parse_true() { // already parsed 't' if (try_skip(curr_pos, 'r') && try_skip(curr_pos, 'u') && try_skip(curr_pos, 'e')) { @@ -1209,7 +1211,7 @@ class json_parser { /** * parse false */ - CUDF_HOST_DEVICE inline void parse_false() + __device__ inline void parse_false() { // already parsed 'f' if (try_skip(curr_pos, 'a') && try_skip(curr_pos, 'l') && try_skip(curr_pos, 's') && @@ -1223,7 +1225,7 @@ class json_parser { /** * parse null */ - CUDF_HOST_DEVICE inline void parse_null() + __device__ inline void parse_null() { // already parsed 'n' if (try_skip(curr_pos, 'u') && try_skip(curr_pos, 'l') && try_skip(curr_pos, 'l')) { @@ -1236,7 +1238,7 @@ class json_parser { /** * parse the key string in key:value pair */ - CUDF_HOST_DEVICE inline void parse_field_name() + __device__ inline void parse_field_name() { auto [success, end_char_pos] = try_parse_string(curr_pos, nullptr, nullptr, nullptr, write_style::unescaped); @@ -1254,8 +1256,8 @@ class json_parser { * @param[out] has_comma_before_token has comma before next token * @param[out] has_colon_before_token has colon before next token */ - CUDF_HOST_DEVICE inline json_token parse_next_token(bool& has_comma_before_token, - bool& has_colon_before_token) + __device__ inline json_token parse_next_token(bool& has_comma_before_token, + bool& has_colon_before_token) { skip_whitespaces(curr_pos); if (!eof(curr_pos)) { @@ -1381,7 +1383,7 @@ class json_parser { * continute parsing, get next token. * The final tokens are ERROR or SUCCESS; */ - CUDF_HOST_DEVICE json_token next_token() + __device__ json_token next_token() { // parse next token bool has_comma_before_token; // no-initialization because of do not care here @@ -1392,12 +1394,12 @@ class json_parser { /** * get current token */ - CUDF_HOST_DEVICE json_token get_current_token() { return curr_token; } + __device__ json_token get_current_token() { return curr_token; } /** * is valid JSON by parsing through all tokens */ - CUDF_HOST_DEVICE bool is_valid() + __device__ bool is_valid() { while (curr_token != json_token::ERROR && curr_token != json_token::SUCCESS) { next_token(); @@ -1410,7 +1412,7 @@ class json_parser { * after this call, the current token is ] or } if token is { or [ * @return true if JSON is valid so far, false otherwise. */ - CUDF_HOST_DEVICE bool try_skip_children() + __device__ bool try_skip_children() { if (curr_token == json_token::ERROR || curr_token == json_token::INIT || curr_token == json_token::SUCCESS) { @@ -1434,7 +1436,7 @@ class json_parser { } } - CUDF_HOST_DEVICE cudf::size_type compute_unescaped_len() { return write_unescaped_text(nullptr); } + __device__ cudf::size_type compute_unescaped_len() { return write_unescaped_text(nullptr); } /** * unescape current token text, then write to destination @@ -1443,7 +1445,7 @@ class json_parser { * writes 6 utf8 bytes: -28 -72 -83 -27 -101 -67 * For number, write verbatim without normalization */ - CUDF_HOST_DEVICE cudf::size_type write_unescaped_text(char* destination) + __device__ cudf::size_type write_unescaped_text(char* destination) { switch (curr_token) { case json_token::VALUE_STRING: @@ -1453,27 +1455,27 @@ class json_parser { current_token_start_pos, nullptr, nullptr, destination, write_style::unescaped); return string_token_utf8_bytes; case json_token::VALUE_NUMBER_INT: - // TODO normalization if needed: - // https://github.com/NVIDIA/spark-rapids/issues/10218 leverage function: - // `get_current_float_parts` + if (number_token_len == 2 && current_token_start_pos[0] == '-' && + current_token_start_pos[1] == '0') { + if (nullptr != destination) *destination++ = '0'; + return 1; + } if (nullptr != destination) { for (cudf::size_type i = 0; i < number_token_len; ++i) { *destination++ = *(current_token_start_pos + i); } } return number_token_len; - case json_token::VALUE_NUMBER_FLOAT: - // TODO normalization: https://github.com/NVIDIA/spark-rapids/issues/10218 + case json_token::VALUE_NUMBER_FLOAT: { + // number normalization: // 0.03E-2 => 0.3E-5; infinity; // 200.000 => 200.0, 351.980 => 351.98, 12345678900000000000.0 // => 1.23456789E19 0.0000000000003 => 3.0E-13; 0.003 => 0.003; 0.0003 // => 3.0E-4 leverage function: `get_current_float_parts` - if (nullptr != destination) { - for (cudf::size_type i = 0; i < number_token_len; ++i) { - *destination++ = *(current_token_start_pos + i); - } - } - return number_token_len; + double d_value = + cudf::strings::detail::stod(cudf::string_view(current_token_start_pos, number_token_len)); + return spark_rapids_jni::ftos_converter::double_normalization(d_value, destination); + } case json_token::VALUE_TRUE: if (nullptr != destination) { *destination++ = 't'; @@ -1525,7 +1527,7 @@ class json_parser { return 0; } - CUDF_HOST_DEVICE cudf::size_type compute_escaped_len() { return write_escaped_text(nullptr); } + __device__ cudf::size_type compute_escaped_len() { return write_escaped_text(nullptr); } /** * escape current token text, then write to destination * e.g.: '"' is a string with 1 char '"', writes out 4 chars '"' '\' '\"' '"' @@ -1533,7 +1535,7 @@ class json_parser { * writes 8 utf8 bytes: '"' -28 -72 -83 -27 -101 -67 '"' * For number, write verbatim without normalization */ - CUDF_HOST_DEVICE cudf::size_type write_escaped_text(char* destination) + __device__ cudf::size_type write_escaped_text(char* destination) { switch (curr_token) { case json_token::VALUE_STRING: @@ -1543,14 +1545,22 @@ class json_parser { current_token_start_pos, nullptr, nullptr, destination, write_style::escaped); return string_token_utf8_bytes + bytes_diff_for_escape_writing; case json_token::VALUE_NUMBER_INT: - case json_token::VALUE_NUMBER_FLOAT: - // number can be copied from JSON string directly + if (number_token_len == 2 && current_token_start_pos[0] == '-' && + current_token_start_pos[1] == '0') { + if (nullptr != destination) *destination++ = '0'; + return 1; + } if (nullptr != destination) { for (cudf::size_type i = 0; i < number_token_len; ++i) { *destination++ = *(current_token_start_pos + i); } } return number_token_len; + case json_token::VALUE_NUMBER_FLOAT: { + double d_value = + cudf::strings::detail::stod(cudf::string_view(current_token_start_pos, number_token_len)); + return spark_rapids_jni::ftos_converter::double_normalization(d_value, destination); + } case json_token::VALUE_TRUE: if (nullptr != destination) { *destination++ = 't'; @@ -1605,7 +1615,7 @@ class json_parser { /** * reset the parser */ - CUDF_HOST_DEVICE void reset() + __device__ void reset() { curr_pos = json_start_pos; curr_token = json_token::INIT; @@ -1615,7 +1625,7 @@ class json_parser { /** * get float parts, current token should be VALUE_NUMBER_FLOAT. */ - CUDF_HOST_DEVICE thrust::tuple + __device__ thrust::tuple get_current_float_parts() { return thrust::make_tuple(float_sign, @@ -1633,7 +1643,7 @@ class json_parser { * return false otherwise, * Note: to_match_str_ptr should not be nullptr */ - CUDF_HOST_DEVICE bool match_current_field_name(cudf::string_view name) + __device__ bool match_current_field_name(cudf::string_view name) { return match_current_field_name(name.data(), name.size_bytes()); } @@ -1641,7 +1651,7 @@ class json_parser { /** * match current field name */ - CUDF_HOST_DEVICE bool match_current_field_name(char const* to_match_str_ptr, cudf::size_type len) + __device__ bool match_current_field_name(char const* to_match_str_ptr, cudf::size_type len) { if (json_token::FIELD_NAME == curr_token) { auto [b, end_pos] = try_parse_string(current_token_start_pos, @@ -1661,7 +1671,7 @@ class json_parser { * reurn true otherwise. * @param[out] copy_to */ - CUDF_HOST_DEVICE thrust::pair copy_current_structure(char* copy_to) + __device__ thrust::pair copy_current_structure(char* copy_to) { switch (curr_token) { case json_token::INIT: diff --git a/src/main/cpp/tests/CMakeLists.txt b/src/main/cpp/tests/CMakeLists.txt index 48cbc4ebdc..e81f86e87a 100644 --- a/src/main/cpp/tests/CMakeLists.txt +++ b/src/main/cpp/tests/CMakeLists.txt @@ -63,11 +63,6 @@ ConfigureTest(DATETIME_REBASE ConfigureTest(ROW_CONVERSION row_conversion.cpp) -ConfigureTest(GET_JSON_OBJECT - json_parser_tests.cpp - get_json_object_tests.cpp - ) - ConfigureTest(HASH hash.cpp) diff --git a/src/main/cpp/tests/get_json_object_tests.cpp b/src/main/cpp/tests/get_json_object_tests.cpp deleted file mode 100644 index 2e2df4f632..0000000000 --- a/src/main/cpp/tests/get_json_object_tests.cpp +++ /dev/null @@ -1,509 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include - -#include - -// defined in json_parser_tests.cpp -void clear_buff(char buf[], std::size_t size); -void assert_start_with(char* buf, std::size_t buf_size, const std::string& prefix); - -struct GetJsonObjectTests : public cudf::test::BaseFixture {}; -using spark_rapids_jni::json_parser; -using spark_rapids_jni::path_instruction_type; -using spark_rapids_jni::detail::path_instruction; - -spark_rapids_jni::json_parser<> get_parser(std::string const& json_str) -{ - return json_parser<>(json_str.data(), json_str.size()); -} - -spark_rapids_jni::detail::json_generator<> get_generator(char* buf) -{ - return spark_rapids_jni::detail::json_generator<>(buf); -} - -spark_rapids_jni::detail::json_generator<> get_nullptr_generator() -{ - return spark_rapids_jni::detail::json_generator<>(nullptr); -} - -bool eval_path(spark_rapids_jni::json_parser<>& p, - spark_rapids_jni::detail::json_generator<>& g, - spark_rapids_jni::detail::path_instruction const* path_ptr, - int path_size) -{ - return spark_rapids_jni::detail::path_evaluator::evaluate_path( - p, g, spark_rapids_jni::detail::write_style::raw_style, path_ptr, path_size); -} - -path_instruction get_subscript_path() { return path_instruction(path_instruction_type::SUBSCRIPT); } - -path_instruction get_wildcard_path() { return path_instruction(path_instruction_type::WILDCARD); } - -path_instruction get_key_path() { return path_instruction(path_instruction_type::KEY); } - -path_instruction get_index_path(int index) -{ - auto p = path_instruction(path_instruction_type::INDEX); - p.index = index; - return p; -} - -path_instruction get_named_path(std::string name) -{ - auto p = path_instruction(path_instruction_type::NAMED); - p.name = cudf::string_view(name.data(), name.size()); - return p; -} - -void test_get_json_object(std::string json, - std::vector paths, - std::string expected) -{ - size_t buf_len = 100 * 1024; - char buf[buf_len]; - clear_buff(buf, buf_len); - - auto p = get_parser(json); - auto g = get_generator(buf); - p.next_token(); - - ASSERT_TRUE(eval_path(p, g, paths.data(), paths.size())); - assert_start_with(buf, buf_len, expected); - - // the following checks generator output size without writes bytes - clear_buff(buf, buf_len); - auto p2 = get_parser(json); - auto g2 = get_nullptr_generator(); - p2.next_token(); - - ASSERT_TRUE(eval_path(p2, g2, paths.data(), paths.size())); - ASSERT_EQ(g2.get_output_len(), expected.size()); -} - -void test_get_json_object_fail(std::string json, std::vector paths) -{ - size_t buf_len = 100 * 1024; - char buf[buf_len]; - clear_buff(buf, buf_len); - - auto p = get_parser(json); - auto g = get_generator(buf); - p.next_token(); - - ASSERT_FALSE(eval_path(p, g, paths.data(), paths.size())); -} - -void test_get_json_object(std::string json, std::string expected) -{ - size_t buf_len = 100 * 1024; - char buf[buf_len]; - clear_buff(buf, buf_len); - - auto p = get_parser(json); - auto g = get_generator(buf); - p.next_token(); - - ASSERT_TRUE(eval_path(p, g, nullptr, 0)); - assert_start_with(buf, buf_len, expected); -} - -static const std::string json_for_test = R"( -{"store":{"fruit":[{"weight":8,"type":"apple"},{"weight":9,"type":"pear"}], -"basket":[[1,2,{"b":"y","a":"x"}],[3,4],[5,6]],"book":[{"author":"Nigel Rees", -"title":"Sayings of the Century","category":"reference","price":8.95}, -{"author":"Herman Melville","title":"Moby Dick","category":"fiction","price":8.99, -"isbn":"0-553-21311-3"},{"author":"J. R. R. Tolkien","title":"The Lord of the Rings", -"category":"fiction","reader":[{"age":25,"name":"bob"},{"age":26,"name":"jack"}], -"price":22.99,"isbn":"0-395-19395-8"}],"bicycle":{"price":19.95,"color":"red"}}, -"email":"amy@only_for_json_udf_test.net","owner":"amy","zip code":"94025", -"fb:testid":"1234"} -)"; - -/** - * Tests from Spark JsonExpressionsSuite - */ -TEST_F(GetJsonObjectTests, NormalTest) -{ - test_get_json_object(" { 'k' : [1, [21, 22, 23], 3] } ", - std::vector{get_key_path(), get_named_path("k")}, - "[1,[21,22,23],3]"); - test_get_json_object(" { 'k' : [1, [21, 22, 23], 3] } ", R"({"k":[1,[21,22,23],3]})"); - test_get_json_object( - " { 'k' : [1, [21, 22, 23], 3] } ", - std::vector{ - get_key_path(), get_named_path("k"), get_subscript_path(), get_wildcard_path()}, - R"([1,[21,22,23],3])"); - test_get_json_object(" { 'k' : [1, [21, 22, 23], 3] } ", - std::vector{get_key_path(), - get_named_path("k"), - get_subscript_path(), - get_wildcard_path(), - get_subscript_path(), - get_wildcard_path()}, - R"([1,21,22,23,3])"); - test_get_json_object(" { 'k' : [1, [21, 22, 23], 3] } ", - std::vector{get_key_path(), - get_named_path("k"), - get_subscript_path(), - get_wildcard_path(), - get_subscript_path(), - get_index_path(0)}, - R"(21)"); - test_get_json_object( - " [[11,12,13], [21, 22, 23], [31, 32, 33]] ", - std::vector{ - get_subscript_path(), get_wildcard_path(), get_subscript_path(), get_index_path(0)}, - R"([11,21,31])"); - test_get_json_object( - " [[11,12,13]] ", - std::vector{ - get_subscript_path(), get_wildcard_path(), get_subscript_path(), get_index_path(0)}, - R"(11)"); - - test_get_json_object( - " [[11,12,13]] ", - std::vector{ - get_subscript_path(), get_wildcard_path(), get_subscript_path(), get_index_path(0)}, - R"(11)"); - - // tests from Spark unit test cases - test_get_json_object( - json_for_test, - std::vector{ - get_key_path(), get_named_path("store"), get_key_path(), get_named_path("bicycle")}, - R"({"price":19.95,"color":"red"})"); - - test_get_json_object( - R"({ "key with spaces": "it works" })", - std::vector{get_key_path(), get_named_path("key with spaces")}, - R"(it works)"); - - std::string e1 = - R"([{"author":"Nigel Rees","title":"Sayings of the Century","category":"reference",)"; - e1 += R"("price":8.95},{"author":"Herman Melville","title":"Moby Dick","category":"fiction",)"; - e1 += R"("price":8.99,"isbn":"0-553-21311-3"},{"author":"J. R. R. Tolkien","title":)"; - e1 += R"("The Lord of the Rings","category":"fiction","reader":[{"age":25,"name":"bob"},)"; - e1 += R"({"age":26,"name":"jack"}],"price":22.99,"isbn":"0-395-19395-8"}])"; - - test_get_json_object( - json_for_test, - std::vector{ - get_key_path(), get_named_path("store"), get_key_path(), get_named_path("book")}, - e1); - - std::string e2 = R"({"author":"Nigel Rees","title":"Sayings of the Century",)"; - e2 += R"("category":"reference","price":8.95})"; - test_get_json_object(json_for_test, - std::vector{get_key_path(), - get_named_path("store"), - get_key_path(), - get_named_path("book"), - get_subscript_path(), - get_index_path(0)}, - e2); - - test_get_json_object(json_for_test, - std::vector{get_key_path(), - get_named_path("store"), - get_key_path(), - get_named_path("book"), - get_subscript_path(), - get_wildcard_path()}, - e1); - - auto e3 = json_for_test; - e3.erase(std::remove(e3.begin(), e3.end(), '\n'), e3.end()); - test_get_json_object(json_for_test, e3); - - test_get_json_object(json_for_test, - std::vector{get_key_path(), - get_named_path("store"), - get_key_path(), - get_named_path("book"), - get_subscript_path(), - get_index_path(0), - get_key_path(), - get_named_path("category")}, - "reference"); - - test_get_json_object(json_for_test, - std::vector{get_key_path(), - get_named_path("store"), - get_key_path(), - get_named_path("book"), - get_subscript_path(), - get_wildcard_path(), - get_key_path(), - get_named_path("category")}, - R"(["reference","fiction","fiction"])"); - - test_get_json_object(json_for_test, - std::vector{get_key_path(), - get_named_path("store"), - get_key_path(), - get_named_path("book"), - get_subscript_path(), - get_wildcard_path(), - get_key_path(), - get_named_path("isbn")}, - R"(["0-553-21311-3","0-395-19395-8"])"); - - // Fix https://github.com/NVIDIA/spark-rapids/issues/10216 - test_get_json_object(json_for_test, - std::vector{get_key_path(), - get_named_path("store"), - get_key_path(), - get_named_path("book"), - get_subscript_path(), - get_wildcard_path(), - get_key_path(), - get_named_path("reader")}, - R"([{"age":25,"name":"bob"},{"age":26,"name":"jack"}])"); - - test_get_json_object(json_for_test, - std::vector{get_key_path(), - get_named_path("store"), - get_key_path(), - get_named_path("basket"), - get_subscript_path(), - get_index_path(0), - get_subscript_path(), - get_index_path(1)}, - R"(2)"); - - test_get_json_object(json_for_test, - std::vector{get_key_path(), - get_named_path("store"), - get_key_path(), - get_named_path("basket"), - get_subscript_path(), - get_wildcard_path()}, - R"([[1,2,{"b":"y","a":"x"}],[3,4],[5,6]])"); - - test_get_json_object(json_for_test, - std::vector{get_key_path(), - get_named_path("store"), - get_key_path(), - get_named_path("basket"), - get_subscript_path(), - get_wildcard_path(), - get_subscript_path(), - get_index_path(0)}, - R"([1,3,5])"); - - test_get_json_object(json_for_test, - std::vector{get_key_path(), - get_named_path("store"), - get_key_path(), - get_named_path("basket"), - get_subscript_path(), - get_index_path(0), - get_subscript_path(), - get_wildcard_path()}, - R"([1,2,{"b":"y","a":"x"}])"); - - test_get_json_object(json_for_test, - std::vector{get_key_path(), - get_named_path("store"), - get_key_path(), - get_named_path("basket"), - get_subscript_path(), - get_wildcard_path(), - get_subscript_path(), - get_wildcard_path()}, - R"([1,2,{"b":"y","a":"x"},3,4,5,6])"); - - test_get_json_object(json_for_test, - std::vector{get_key_path(), - get_named_path("store"), - get_key_path(), - get_named_path("basket"), - get_subscript_path(), - get_index_path(0), - get_subscript_path(), - get_index_path(2), - get_key_path(), - get_named_path("b")}, - R"(y)"); - - // Fix https://github.com/NVIDIA/spark-rapids/issues/10217 - test_get_json_object(json_for_test, - std::vector{get_key_path(), - get_named_path("store"), - get_key_path(), - get_named_path("basket"), - get_subscript_path(), - get_index_path(0), - get_subscript_path(), - get_wildcard_path(), - get_key_path(), - get_named_path("b")}, - R"(["y"])"); - - test_get_json_object(json_for_test, - std::vector{get_key_path(), get_named_path("zip code")}, - R"(94025)"); - - test_get_json_object(json_for_test, - std::vector{get_key_path(), get_named_path("fb:testid")}, - R"(1234)"); - - test_get_json_object( - R"({"a":"b\nc"})", std::vector{get_key_path(), get_named_path("a")}, "b\nc"); - - test_get_json_object( - R"({"a":"b\"c"})", std::vector{get_key_path(), get_named_path("a")}, "b\"c"); - - test_get_json_object_fail( - json_for_test, std::vector{get_key_path(), get_named_path("non_exist_key")}); - - test_get_json_object_fail(json_for_test, - std::vector{get_key_path(), - get_named_path("store"), - get_key_path(), - get_named_path("book"), - get_subscript_path(), - get_index_path(10)}); - - test_get_json_object_fail(json_for_test, - std::vector{get_key_path(), - get_named_path("store"), - get_key_path(), - get_named_path("book"), - get_subscript_path(), - get_index_path(0), - get_key_path(), - get_named_path("non_exist_key")}); - - test_get_json_object_fail(json_for_test, - std::vector{get_key_path(), - get_named_path("store"), - get_key_path(), - get_named_path("basket"), - get_subscript_path(), - get_wildcard_path(), - get_key_path(), - get_named_path("non_exist_key")}); - - std::string bad_json = "\u0000\u0000\u0000A\u0001AAA"; - test_get_json_object_fail(bad_json, - std::vector{get_key_path(), get_named_path("a")}); -} - -/** - * https://github.com/NVIDIA/spark-rapids/issues/10537 - */ -TEST_F(GetJsonObjectTests, TestIssue_10537) -{ - test_get_json_object( - R"({"'a":"v"})", std::vector{get_key_path(), get_named_path("'a")}, "v"); -} - -/** - * https://github.com/NVIDIA/spark-rapids/issues/10218 - */ -TEST_F(GetJsonObjectTests, TestIssue_10218) -{ - test_get_json_object(R"({"a" : "A"})", R"({"a":"A"})"); - test_get_json_object( - R"({'a' : 'A"'})", std::vector{get_key_path(), get_named_path("a")}, "A\""); - test_get_json_object(R"({'a' : 'A"'})", R"({"a":"A\""})"); - test_get_json_object(R"({"a" : "B\'"})", R"({"a":"B'"})"); - test_get_json_object(R"({"a" : "B'"})", R"({"a":"B'"})"); -} - -/** - * https://github.com/NVIDIA/spark-rapids/issues/10196 - * one char '\t' and 2 chars '\\', 't' in field name both are one char '\t' after unescape. - */ -TEST_F(GetJsonObjectTests, TestIssue_10196) -{ - // filed name is 2 chars: 't', '\t'; path is 2 chars: 't', '\t' - // because of allowing control char, '\t' char can be in the string directly without escape - test_get_json_object(" { \"t\t\" : \"t\" } ", - std::vector{get_key_path(), get_named_path("t\t")}, - "t"); - - // filed name is 3 chars: 't', '\\', 't'; path is 2 chars: 't', '\t' - // unescaped filed name is 2 chars: 't', '\t' - test_get_json_object(" { \"t\\t\" : \"t\" } ", - std::vector{get_key_path(), get_named_path("t\t")}, - "t"); - - // filed name is 2 chars: 't', '\t'; path is 2 chars: 't', '\t' - // because of allowing control char, '\t' char can be in the string directly without escape - // According to conventional JSON format, '\t' char can be in string directly without escape - test_get_json_object(" { \"t\t\" : \"t\" } ", - std::vector{get_key_path(), get_named_path("t\t")}, - "t"); - - // filed name is 3 chars: 't', '\\', 't'; path is 2 chars: 't', '\t' - test_get_json_object(" { 't\\t' : 't' } ", - std::vector{get_key_path(), get_named_path("t\t")}, - "t"); - - // path is 3 chars: 't', '\\', 't' - test_get_json_object_fail(" { 't\\t' : 't' } ", - std::vector{get_key_path(), get_named_path("t\\t")}); -} - -/** - * https://github.com/NVIDIA/spark-rapids/issues/10194 - */ -TEST_F(GetJsonObjectTests, TestIssue_10194) -{ - test_get_json_object_fail(R"( {"url":"http://test.com",,} )", - std::vector{get_key_path(), get_named_path("url")}); -} - -/** - * https://github.com/NVIDIA/spark-rapids/issues/9033 - */ -TEST_F(GetJsonObjectTests, TestIssue_9033) -{ - test_get_json_object(" {\"A\": \"B\"} ", - std::vector{get_key_path(), get_named_path("A")}, - "B"); - - test_get_json_object(" {\"A\": \"B\nB\"} ", - std::vector{get_key_path(), get_named_path("A")}, - "B\nB"); - - test_get_json_object(" {\"A\": \"\\u7CFB\\u7D71\"} ", - std::vector{get_key_path(), get_named_path("A")}, - "系統"); - - test_get_json_object(" {\"A\": \"\\u7CFB\t\\u7D71\"} ", - std::vector{get_key_path(), get_named_path("A")}, - "系\t統"); -} - -TEST_F(GetJsonObjectTests, Test_paths_depth_10) -{ - test_get_json_object( - "{\"k1\":{\"k2\":{\"k3\":{\"k4\":{\"k5\":{\"k6\":{\"k7\":{\"k8\":{\"k9\":{\"k10\":\"v10\"}}}}}}" - "}}}}", - std::vector{ - get_key_path(), get_named_path("k1"), get_key_path(), get_named_path("k2"), - get_key_path(), get_named_path("k3"), get_key_path(), get_named_path("k4"), - get_key_path(), get_named_path("k5"), get_key_path(), get_named_path("k6"), - get_key_path(), get_named_path("k7"), get_key_path(), get_named_path("k8"), - get_key_path(), get_named_path("k9"), get_key_path(), get_named_path("k10")}, - "v10"); -} diff --git a/src/main/cpp/tests/json_parser_tests.cpp b/src/main/cpp/tests/json_parser_tests.cpp deleted file mode 100644 index ef282182db..0000000000 --- a/src/main/cpp/tests/json_parser_tests.cpp +++ /dev/null @@ -1,1264 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include - -#include - -struct JsonParserTests : public cudf::test::BaseFixture {}; -using spark_rapids_jni::json_parser; -using spark_rapids_jni::json_token; - -template -std::vector parse(std::string json_str) -{ - json_parser - parser(json_str.data(), json_str.size()); - std::vector tokens; - json_token token = parser.next_token(); - tokens.push_back(token); - while (token != json_token::ERROR && token != json_token::SUCCESS) { - token = parser.next_token(); - tokens.push_back(token); - } - return tokens; -} - -template -void test_basic() -{ - std::vector>> cases = { - std::make_pair( - // test terminal number - std::string{" \r\n\t \r\n\t 1 \r\n\t \r\n\t "}, - std::vector{json_token::VALUE_NUMBER_INT, json_token::SUCCESS}), - std::make_pair( - // test terminal float - std::string{" \r\n\t \r\n\t 1.5 \r\n\t \r\n\t "}, - std::vector{json_token::VALUE_NUMBER_FLOAT, json_token::SUCCESS}), - std::make_pair( - // test terminal string - std::string{" \r\n\t \r\n\t \"abc\" \r\n\t \r\n\t "}, - std::vector{json_token::VALUE_STRING, json_token::SUCCESS}), - std::make_pair( - // test terminal true - std::string{" \r\n\t \r\n\t true \r\n\t \r\n\t "}, - std::vector{json_token::VALUE_TRUE, json_token::SUCCESS}), - std::make_pair( - // test terminal false - std::string{" \r\n\t \r\n\t false \r\n\t \r\n\t "}, - std::vector{json_token::VALUE_FALSE, json_token::SUCCESS}), - std::make_pair( - // test terminal null - std::string{" \r\n\t \r\n\t null \r\n\t \r\n\t "}, - std::vector{json_token::VALUE_NULL, json_token::SUCCESS}), - - std::make_pair( - // test numbers - std::string{R"( - [ - 0, 102, -0, -102, 0.3, -0.3000, 1e-050, -1e-5, 1.0e-5, -1.0010e-050, 1E+5, 1e0, 1E0, 1.3e5, -1e01, 1e00000 - ] - )"}, - std::vector{json_token::START_ARRAY, - json_token::VALUE_NUMBER_INT, - json_token::VALUE_NUMBER_INT, - json_token::VALUE_NUMBER_INT, - json_token::VALUE_NUMBER_INT, - json_token::VALUE_NUMBER_FLOAT, - json_token::VALUE_NUMBER_FLOAT, - json_token::VALUE_NUMBER_FLOAT, - json_token::VALUE_NUMBER_FLOAT, - json_token::VALUE_NUMBER_FLOAT, - json_token::VALUE_NUMBER_FLOAT, - json_token::VALUE_NUMBER_FLOAT, - json_token::VALUE_NUMBER_FLOAT, - json_token::VALUE_NUMBER_FLOAT, - json_token::VALUE_NUMBER_FLOAT, - json_token::VALUE_NUMBER_FLOAT, - json_token::VALUE_NUMBER_FLOAT, - json_token::END_ARRAY, - json_token::SUCCESS}), - std::make_pair( - // test string - std::string{"\"美国,中国\\u12f3\\u113E---abc---\\\", \\/, \\\\, \\b, " - "\\f, \\n, \\r, \\t\""}, - std::vector{json_token::VALUE_STRING, json_token::SUCCESS}), - std::make_pair( - // test empty object - std::string{" { } "}, - std::vector{json_token::START_OBJECT, json_token::END_OBJECT, json_token::SUCCESS}), - std::make_pair( - // test empty array - std::string{" [ ] "}, - std::vector{json_token::START_ARRAY, json_token::END_ARRAY, json_token::SUCCESS}), - std::make_pair( - // test nesting arrays - std::string{R"( - [ - 1 , - [ - 2 , - [ - 3 , - [ - 41 , 42 , 43 - ] - ] - ] - ] - )"}, - std::vector{json_token::START_ARRAY, - json_token::VALUE_NUMBER_INT, - json_token::START_ARRAY, - json_token::VALUE_NUMBER_INT, - json_token::START_ARRAY, - json_token::VALUE_NUMBER_INT, - json_token::START_ARRAY, - json_token::VALUE_NUMBER_INT, - json_token::VALUE_NUMBER_INT, - json_token::VALUE_NUMBER_INT, - json_token::END_ARRAY, - json_token::END_ARRAY, - json_token::END_ARRAY, - json_token::END_ARRAY, - json_token::SUCCESS}), - std::make_pair( - // test nesting objects - std::string{R"( - { - "k1" : "v1" , - "k2" : { - "k3" : { - "k4" : { - "k51" : "v51" , - "k52" : "v52" - } - } - } - } - )"}, - std::vector{json_token::START_OBJECT, - json_token::FIELD_NAME, - json_token::VALUE_STRING, - json_token::FIELD_NAME, - json_token::START_OBJECT, - json_token::FIELD_NAME, - json_token::START_OBJECT, - json_token::FIELD_NAME, - json_token::START_OBJECT, - json_token::FIELD_NAME, - json_token::VALUE_STRING, - json_token::FIELD_NAME, - json_token::VALUE_STRING, - json_token::END_OBJECT, - json_token::END_OBJECT, - json_token::END_OBJECT, - json_token::END_OBJECT, - json_token::SUCCESS}), - std::make_pair( - // test nesting objects and arrays - std::string{R"( - { - "k1" : "v1", - "k2" : [ - 1, { - "k21" : "v21", - "k22" : [1 , 2 , -1.5] - } - ] - } - )"}, - std::vector{json_token::START_OBJECT, - json_token::FIELD_NAME, - json_token::VALUE_STRING, - json_token::FIELD_NAME, - json_token::START_ARRAY, - json_token::VALUE_NUMBER_INT, - json_token::START_OBJECT, - json_token::FIELD_NAME, - json_token::VALUE_STRING, - json_token::FIELD_NAME, - json_token::START_ARRAY, - json_token::VALUE_NUMBER_INT, - json_token::VALUE_NUMBER_INT, - json_token::VALUE_NUMBER_FLOAT, - json_token::END_ARRAY, - json_token::END_OBJECT, - json_token::END_ARRAY, - json_token::END_OBJECT, - json_token::SUCCESS}), - - std::make_pair( - // test invalid string: should have 4 HEX - std::string{"\" \\uFFF \""}, - std::vector{json_token::ERROR}), - std::make_pair( - // test invalid string: invalid HEX 'T' - std::string{"\" \\uTFFF \""}, - std::vector{json_token::ERROR}), - std::make_pair( - // test invalid string: unclosed string - std::string{" \"abc "}, - std::vector{json_token::ERROR}), - std::make_pair( - // test invalid string: have no char after escape char '\' - std::string{"\"\\"}, - std::vector{json_token::ERROR}), - std::make_pair( - // test invalid string: \X is not allowed - std::string{"\" \\X \""}, - std::vector{json_token::ERROR}), - std::make_pair( - // test invalid num - std::string{" +5 "}, - std::vector{json_token::ERROR}), - std::make_pair( - // test invalid num - std::string{" 1. "}, - std::vector{json_token::ERROR}), - std::make_pair( - // test invalid num - std::string{" 1e "}, - std::vector{json_token::ERROR}), - std::make_pair( - // test invalid num - std::string{" 1e- "}, - std::vector{json_token::ERROR}), - std::make_pair( - // test invalid num - std::string{" infinity "}, - std::vector{json_token::ERROR}), - std::make_pair( - // test invalid structure - std::string{" {"}, - std::vector{json_token::START_OBJECT, json_token::ERROR}), - std::make_pair( - // test invalid structure - std::string{" ["}, - std::vector{json_token::START_ARRAY, json_token::ERROR}), - std::make_pair( - // test invalid structure - std::string{" {1} "}, - std::vector{json_token::START_OBJECT, json_token::ERROR}), - std::make_pair( - // test invalid structure - std::string{R"( - {"k",} - )"}, - std::vector{json_token::START_OBJECT, json_token::FIELD_NAME, json_token::ERROR}), - std::make_pair( - // test invalid structure - std::string{R"( - {"k": } - )"}, - std::vector{json_token::START_OBJECT, json_token::FIELD_NAME, json_token::ERROR}), - std::make_pair( - // test invalid structure - std::string{R"( - {"k": 1 :} - )"}, - std::vector{json_token::START_OBJECT, - json_token::FIELD_NAME, - json_token::VALUE_NUMBER_INT, - json_token::ERROR}), - - std::make_pair( - // test invalid structure - std::string{R"( - {"k": 1 , } - )"}, - std::vector{json_token::START_OBJECT, - json_token::FIELD_NAME, - json_token::VALUE_NUMBER_INT, - json_token::ERROR}), - std::make_pair( - // test invalid structure - std::string{R"( - [ 1 : - )"}, - std::vector{json_token::START_ARRAY, json_token::VALUE_NUMBER_INT, json_token::ERROR}), - std::make_pair( - // test invalid structure - std::string{R"( - [ 1, - )"}, - std::vector{json_token::START_ARRAY, json_token::VALUE_NUMBER_INT, json_token::ERROR}), - std::make_pair( - // test invalid null - std::string{" nul "}, - std::vector{json_token::ERROR}), - std::make_pair( - // test invalid false - std::string{" fals "}, - std::vector{json_token::ERROR}), - std::make_pair( - // test invalid true - std::string{" tru "}, - std::vector{json_token::ERROR}), - - }; - for (std::size_t i = 0; i < cases.size(); ++i) { - std::string json_str = cases[i].first; - std::vector expected_tokens = cases[i].second; - std::vector actual_tokens = parse(json_str); - ASSERT_EQ(actual_tokens, expected_tokens); - } -} - -void test_len_limitation() -{ - std::vector v; - v.push_back(" '123456' "); - v.push_back(" 'k\n\\'\\\"56' "); // do not count escape char '\', actual - // has 6 chars: k \n ' " 5 6 - v.push_back(" 123456 "); - v.push_back(" -1.23e-456 "); - - auto error_token = std::vector{json_token::ERROR}; - for (std::size_t i = 0; i < v.size(); ++i) { - std::vector actual_tokens = parse< - // bool single_quote, - true, - // control_char - true, - // max json nesting depth - spark_rapids_jni::curr_max_json_nesting_depth, - // max_string_len - 5, - // max_num_len - 5, - // allow_tailing - true>(v[i]); - // exceed num/str length limits - ASSERT_EQ(actual_tokens, error_token); - } - - v.clear(); - v.push_back(" '12345' "); - v.push_back(" 'k\n\\'\\\"5' "); // do not count escape char '\', - // has 5 chars: k \n ' " 5 - auto expect_str_ret = std::vector{json_token::VALUE_STRING, json_token::SUCCESS}; - for (std::size_t i = 0; i < v.size(); ++i) { - std::vector actual_tokens = parse< - // bool single_quote, - true, - // control_char - true, - // max json nesting depth - spark_rapids_jni::curr_max_json_nesting_depth, - // max_string_len - 5, - // max_num_len - 5, - // allow_tailing - true>(v[i]); - ASSERT_EQ(actual_tokens, expect_str_ret); - } - - v.clear(); - v.push_back(" 12345 "); - v.push_back(" -1.23e-45 "); - for (std::size_t i = 0; i < v.size(); ++i) { - std::vector actual_tokens = parse< - // bool single_quote, - true, - // control_char - false, - // max json nesting depth - spark_rapids_jni::curr_max_json_nesting_depth, - // max_string_len - 5, - // max_num_len - 5, - // allow_tailing - true>(v[i]); - ASSERT_EQ(actual_tokens[1], json_token::SUCCESS); - } -} - -void test_single_double_quote() -{ - std::vector v; - // allow \' \" " in single quote - v.push_back("' \\\' \\\" \" '"); - // allow \' \" ' in double quote - v.push_back("\" \\\' \\\" ' \' \""); // C++ allow \' to represent - // ' in string - auto expect_ret = std::vector{json_token::VALUE_STRING, json_token::SUCCESS}; - for (std::size_t i = 0; i < v.size(); ++i) { - std::vector actual_tokens = parse< - // bool single_quote, - true, - // control_char - false>(v[i]); - ASSERT_EQ(actual_tokens, expect_ret); - } - - v.clear(); - v.push_back("\" \\' \""); // not allow \' when single_quote is disabled - expect_ret = std::vector{json_token::ERROR}; - for (std::size_t i = 0; i < v.size(); ++i) { - std::vector actual_tokens = parse< - // bool single_quote, - false, - // control_char - true>(v[i]); - - ASSERT_EQ(actual_tokens, expect_ret); - } - - v.clear(); - v.push_back("\" ' \\\" \""); // allow ' \" in double quote - expect_ret = std::vector{json_token::VALUE_STRING, json_token::SUCCESS}; - for (std::size_t i = 0; i < v.size(); ++i) { - std::vector actual_tokens = parse< - // bool single_quote, - false, - // control_char - true>(v[i]); - ASSERT_EQ(actual_tokens, expect_ret); - } - - v.clear(); - v.push_back(" 'str' "); // ' is not allowed to quote string - expect_ret = std::vector{json_token::ERROR}; - for (std::size_t i = 0; i < v.size(); ++i) { - std::vector actual_tokens = parse< - // bool single_quote, - false, - // control_char - true>(v[i]); - ASSERT_EQ(actual_tokens, expect_ret); - } -} - -void test_max_nested_len() -{ - std::vector v; - v.push_back("[[[[[]]]]]"); - v.push_back("{'k1':{'k2':{'k3':{'k4':{'k5': 5}}}}}"); - for (std::size_t i = 0; i < v.size(); ++i) { - // set max nested len template value as 5 - std::vector actual_tokens = parse< - // bool single_quote, - true, - // control_char - true, - // max json nesting depth - 5>(v[i]); - ASSERT_EQ(actual_tokens[actual_tokens.size() - 1], json_token::SUCCESS); - } - - v.clear(); - v.push_back("[[[[[[]]]]]]"); - v.push_back("{'k1':{'k2':{'k3':{'k4':{'k5': {'k6': 6}}}}}}"); - for (std::size_t i = 0; i < v.size(); ++i) { - // set max nested len template value as 5 - std::vector actual_tokens = parse< - // bool single_quote, - true, - // control_char - false, - // max json nesting depth - 5>(v[i]); - ASSERT_EQ(actual_tokens[actual_tokens.size() - 1], json_token::ERROR); - } -} - -void test_control_char() -{ - std::vector v; - v.push_back("' \t \n \b '"); // \t \n \b are control chars - for (std::size_t i = 0; i < v.size(); ++i) { - // set max nested len template value as 5 - std::vector actual_tokens = parse<>(v[i]); - ASSERT_EQ(actual_tokens[actual_tokens.size() - 1], json_token::SUCCESS); - } - - for (std::size_t i = 0; i < v.size(); ++i) { - // set max nested len template value as 5 - std::vector actual_tokens = parse< - // bool single_quote, - true, - // control_char - false>(v[i]); - ASSERT_EQ(actual_tokens[actual_tokens.size() - 1], json_token::ERROR); - } -} - -void test_allow_tailing_useless_chars() -{ - std::vector v; - v.push_back(" 0xxxx "); // 0 is valid JSON, tailing xxxx is ignored - // when allow tailing - v.push_back(" {}xxxx "); // tailing xxxx is ignored - for (std::size_t i = 0; i < v.size(); ++i) { - std::vector actual_tokens = parse<>(v[i]); - ASSERT_TRUE(actual_tokens.size() > 0); - ASSERT_EQ(actual_tokens[actual_tokens.size() - 1], json_token::SUCCESS); - } - for (std::size_t i = 0; i < v.size(); ++i) { - std::vector actual_tokens = parse< - // bool single_quote, - true, - // control_char - true, - // max json nesting depth - spark_rapids_jni::curr_max_json_nesting_depth, - // max_string_len - spark_rapids_jni::curr_max_string_utf8_bytes, - // max_num_len - spark_rapids_jni::curr_max_num_len, - // allow_tailing - false>(v[i]); - ASSERT_TRUE(actual_tokens.size() > 0); - ASSERT_EQ(actual_tokens[actual_tokens.size() - 1], json_token::ERROR); - } - - v.clear(); - v.push_back(" 12345xxxxxx "); - v.push_back(" -1.23e-45xxxxx "); - for (std::size_t i = 0; i < v.size(); ++i) { - std::vector actual_tokens = parse< - // bool single_quote, - true, - // control_char - false, - // max json nesting depth - spark_rapids_jni::curr_max_json_nesting_depth, - // max_string_len - 5, - // max_num_len - 5, - // allow_tailing - true>(v[i]); - ASSERT_TRUE(actual_tokens.size() > 0); - ASSERT_EQ(actual_tokens[actual_tokens.size() - 1], json_token::SUCCESS); - } -} - -void test_is_valid() -{ - std::string json_str = " { \"k\" : [1,2,3]} "; - json_parser parser1(json_str.data(), json_str.size()); - ASSERT_TRUE(parser1.is_valid()); - - json_str = " {[1,2, "; - json_parser parser2(json_str.data(), json_str.size()); - ASSERT_FALSE(parser2.is_valid()); -} - -TEST_F(JsonParserTests, NormalTest) -{ - test_basic(); - test_basic(); - test_basic(); - test_basic(); - test_len_limitation(); - test_single_double_quote(); - test_max_nested_len(); - test_control_char(); - test_allow_tailing_useless_chars(); - test_is_valid(); -} - -template -json_parser -get_parser(std::string const& json_str) -{ - return json_parser(json_str.data(), json_str.size()); -} - -TEST_F(JsonParserTests, SkipChildrenForObject) -{ - // test skip for the first { - std::string json = " { 'k1' : 'v1' , 'k2' : { 'k3' : { 'k4' : 'v5' } } } "; - auto parser = get_parser(json); - // can not skip for INIT token - ASSERT_FALSE(parser.try_skip_children()); - ASSERT_EQ(json_token::START_OBJECT, parser.next_token()); - // test skip for tokens: { - ASSERT_TRUE(parser.try_skip_children()); - ASSERT_EQ(json_token::END_OBJECT, parser.get_current_token()); - ASSERT_EQ(json_token::SUCCESS, parser.next_token()); - // can not skip for SUCCESS token - ASSERT_FALSE(parser.try_skip_children()); - - // test skip for tokens: not [ { - parser.reset(); - ASSERT_EQ(json_token::START_OBJECT, parser.next_token()); - ASSERT_EQ(json_token::FIELD_NAME, parser.next_token()); - ASSERT_TRUE(parser.try_skip_children()); - ASSERT_EQ(json_token::FIELD_NAME, parser.get_current_token()); -} - -TEST_F(JsonParserTests, SkipChildrenForArray) -{ - // skip for [ - std::string json = " [ [ [ [ 1, 2, 3 ] ] ] ] "; - auto parser = get_parser(json); - ASSERT_FALSE(parser.try_skip_children()); - ASSERT_EQ(json_token::START_ARRAY, parser.next_token()); - ASSERT_EQ(json_token::START_ARRAY, parser.next_token()); - ASSERT_TRUE(parser.try_skip_children()); - ASSERT_EQ(json_token::END_ARRAY, parser.get_current_token()); - ASSERT_EQ(json_token::END_ARRAY, parser.next_token()); - ASSERT_EQ(json_token::SUCCESS, parser.next_token()); - // can not skip for SUCCESS token - ASSERT_FALSE(parser.try_skip_children()); -} - -TEST_F(JsonParserTests, SkipChildrenInvalid) -{ - std::string json = " invalid "; - auto parser = get_parser(json); - parser.next_token(); - ASSERT_EQ(json_token::ERROR, parser.get_current_token()); - // can not skip for ERROR token - ASSERT_FALSE(parser.try_skip_children()); -} - -void clear_buff(char buf[], std::size_t size) { memset(buf, 0, size); } - -void assert_start_with(char* buf, std::size_t buf_size, const std::string& prefix) -{ - std::string str(buf, buf_size); - ASSERT_EQ(0, str.find(prefix)); - for (std::size_t i = prefix.size(); i < str.size(); i++) { - ASSERT_EQ('\0', str[i]); - } -} - -TEST_F(JsonParserTests, WriteUnescapedStringText) -{ - constexpr std::size_t buf_size = 256; - char buf[buf_size]; - - std::string json = " { 'key123' : 'value123' } "; - auto parser = get_parser(json); - - ASSERT_EQ(json_token::START_OBJECT, parser.next_token()); - clear_buff(buf, buf_size); - ASSERT_EQ(1, parser.write_unescaped_text(buf)); - assert_start_with(buf, buf_size, "{"); - - ASSERT_EQ(json_token::FIELD_NAME, parser.next_token()); - clear_buff(buf, buf_size); - ASSERT_EQ(6, parser.write_unescaped_text(buf)); - assert_start_with(buf, buf_size, "key123"); - - ASSERT_EQ(json_token::VALUE_STRING, parser.next_token()); - clear_buff(buf, buf_size); - ASSERT_EQ(8, parser.write_unescaped_text(buf)); - assert_start_with(buf, buf_size, "value123"); - - ASSERT_EQ(json_token::END_OBJECT, parser.next_token()); - clear_buff(buf, buf_size); - ASSERT_EQ(1, parser.write_unescaped_text(buf)); - assert_start_with(buf, buf_size, "}"); -} - -TEST_F(JsonParserTests, WriteUnescapedNumberText) -{ - constexpr std::size_t buf_size = 256; - char buf[buf_size]; - - std::string json = " [ -12345 , -1.23e-000123 , true , false , null ] "; - auto parser = get_parser(json); - - ASSERT_EQ(json_token::START_ARRAY, parser.next_token()); - clear_buff(buf, buf_size); - ASSERT_EQ(1, parser.write_unescaped_text(buf)); - assert_start_with(buf, buf_size, "["); - - ASSERT_EQ(json_token::VALUE_NUMBER_INT, parser.next_token()); - clear_buff(buf, buf_size); - ASSERT_EQ(6, parser.write_unescaped_text(buf)); - assert_start_with(buf, buf_size, "-12345"); - - ASSERT_EQ(json_token::VALUE_NUMBER_FLOAT, parser.next_token()); - clear_buff(buf, buf_size); - ASSERT_EQ(13, parser.write_unescaped_text(buf)); - assert_start_with(buf, buf_size, "-1.23e-000123"); - - ASSERT_EQ(json_token::VALUE_TRUE, parser.next_token()); - clear_buff(buf, buf_size); - ASSERT_EQ(4, parser.write_unescaped_text(buf)); - assert_start_with(buf, buf_size, "true"); - - ASSERT_EQ(json_token::VALUE_FALSE, parser.next_token()); - clear_buff(buf, buf_size); - ASSERT_EQ(5, parser.write_unescaped_text(buf)); - assert_start_with(buf, buf_size, "false"); - - ASSERT_EQ(json_token::VALUE_NULL, parser.next_token()); - clear_buff(buf, buf_size); - ASSERT_EQ(4, parser.write_unescaped_text(buf)); - assert_start_with(buf, buf_size, "null"); - - ASSERT_EQ(json_token::END_ARRAY, parser.next_token()); - clear_buff(buf, buf_size); - ASSERT_EQ(1, parser.write_unescaped_text(buf)); - assert_start_with(buf, buf_size, "]"); - - ASSERT_EQ(json_token::SUCCESS, parser.next_token()); - clear_buff(buf, buf_size); - ASSERT_EQ(0, parser.write_unescaped_text(buf)); - assert_start_with(buf, buf_size, ""); -} - -TEST_F(JsonParserTests, WriteUnescapedInvalid) -{ - constexpr std::size_t buf_size = 256; - char buf[buf_size]; - - std::string json = " invalid "; - auto parser = get_parser(json); - - ASSERT_EQ(json_token::INIT, parser.get_current_token()); - clear_buff(buf, buf_size); - ASSERT_EQ(0, parser.write_unescaped_text(buf)); - assert_start_with(buf, buf_size, ""); - - ASSERT_EQ(json_token::ERROR, parser.next_token()); - clear_buff(buf, buf_size); - ASSERT_EQ(0, parser.write_unescaped_text(buf)); - assert_start_with(buf, buf_size, ""); -} - -TEST_F(JsonParserTests, WriteUnescapedEscape) -{ - constexpr std::size_t buf_size = 256; - char buf[buf_size]; - // test escape: \", \', \\, \/, \b, \f, \n, \r, \t - std::string json = " '\\\"\\'\\\\\\/\\b\\f\\n\\r\\t\\b' "; - auto parser = get_parser(json); - ASSERT_EQ(json_token::VALUE_STRING, parser.next_token()); - clear_buff(buf, buf_size); - ASSERT_EQ(10, parser.write_unescaped_text(buf)); - assert_start_with(buf, buf_size, "\"\'\\/\b\f\n\r\t\b"); -} - -TEST_F(JsonParserTests, WriteUnescapedUnicode) -{ - // "中国".getBytes(StandardCharsets.UTF_8) is: - // Array(-28, -72, -83, -27, -101, -67) - constexpr std::size_t buf_size = 256; - char buf[buf_size]; - std::string json = " '\\u4e2d\\u56FD' "; // Represents 中国 - auto parser = get_parser(json); - - ASSERT_EQ(json_token::VALUE_STRING, parser.next_token()); - clear_buff(buf, buf_size); - ASSERT_EQ(6, parser.write_unescaped_text(buf)); - assert_start_with(buf, buf_size, "中国"); -} - -TEST_F(JsonParserTests, WriteUnescapedOther) -{ - constexpr std::size_t buf_size = 256; - char buf[buf_size]; - std::string json = " '中国' "; - auto parser = get_parser(json); - - ASSERT_EQ(json_token::VALUE_STRING, parser.next_token()); - clear_buff(buf, buf_size); - ASSERT_EQ(6, parser.write_unescaped_text(buf)); - assert_start_with(buf, buf_size, "中国"); -} - -void assert_ptr_len(char const* actaul_ptr, - cudf::size_type actual_len, - char* expected_ptr, - cudf::size_type expected_len) -{ - ASSERT_EQ(expected_ptr, actaul_ptr); - ASSERT_EQ(expected_len, actual_len); -} - -void assert_float_parts(bool float_sign, - char const* float_integer_pos, - int float_integer_len, - char const* float_fraction_pos, - int float_fraction_len, - char const* float_exp_pos, - int float_exp_len, - bool actual_float_sign, - char const* actual_float_integer_pos, - int actual_float_integer_len, - char const* actual_float_fraction_pos, - int actual_float_fraction_len, - char const* actual_float_exp_pos, - int actual_float_exp_len) -{ - ASSERT_EQ(float_sign, actual_float_sign); - ASSERT_EQ(float_integer_pos, actual_float_integer_pos); - ASSERT_EQ(float_integer_len, actual_float_integer_len); - ASSERT_EQ(float_fraction_pos, actual_float_fraction_pos); - ASSERT_EQ(float_fraction_len, actual_float_fraction_len); - ASSERT_EQ(float_exp_pos, actual_float_exp_pos); - ASSERT_EQ(float_exp_len, actual_float_exp_len); -} - -TEST_F(JsonParserTests, GetFloatParts) -{ - // int part is 123, fraction part is 0345, exp part is -05678 - std::string json = "[-123.0345e-05678] "; - auto parser = get_parser(json); - - ASSERT_EQ(json_token::INIT, parser.get_current_token()); - ASSERT_EQ(json_token::START_ARRAY, parser.next_token()); - - ASSERT_EQ(json_token::VALUE_NUMBER_FLOAT, parser.next_token()); - auto parts = parser.get_current_float_parts(); - assert_float_parts(false, - json.data() + 2, - 3, - json.data() + 6, - 4, - json.data() + 11, - 6, - thrust::get<0>(parts), - thrust::get<1>(parts), - thrust::get<2>(parts), - thrust::get<3>(parts), - thrust::get<4>(parts), - thrust::get<5>(parts), - thrust::get<6>(parts)); -} - -void assert_field_names(std::string json, - std::vector> expected_field_names) -{ - auto parser = get_parser(json); - size_t i = 0; - while (true) { - auto t = parser.next_token(); - if (json_token::SUCCESS == t) { - break; - } else if (json_token::ERROR == t) { - ASSERT_TRUE(false); - } else { - auto opt = expected_field_names[i]; - if (opt.has_value()) { - auto str = opt.value(); - ASSERT_TRUE( - parser.match_current_field_name(str.data(), static_cast(str.size()))); - } else { - ASSERT_FALSE(parser.match_current_field_name(nullptr, 0)); - } - } - i++; - } -} - -TEST_F(JsonParserTests, MatchFieldNameTest) -{ - std::string json; - json = - " { 'k1' : { 'k2' : { 'k3': { 'k4': 4 } } " - " } } "; - assert_field_names(json, - std::vector>{std::nullopt, - "k1", - std::nullopt, - "k2", - std::nullopt, - "k3", - std::nullopt, - "k4", - std::nullopt, - std::nullopt, - std::nullopt, - std::nullopt, - std::nullopt}); -} - -TEST_F(JsonParserTests, WriteEscapedStringText) -{ - constexpr std::size_t buf_size = 256; - char buf[buf_size]; - - std::string json = " { 'key123' : 'value123' } "; - auto parser = get_parser(json); - - ASSERT_EQ(json_token::START_OBJECT, parser.next_token()); - clear_buff(buf, buf_size); - ASSERT_EQ(1, parser.write_escaped_text(buf)); - assert_start_with(buf, buf_size, "{"); - - ASSERT_EQ(json_token::FIELD_NAME, parser.next_token()); - clear_buff(buf, buf_size); - ASSERT_EQ(8, parser.write_escaped_text(buf)); - assert_start_with(buf, buf_size, "\"key123\""); - - ASSERT_EQ(json_token::VALUE_STRING, parser.next_token()); - clear_buff(buf, buf_size); - ASSERT_EQ(10, parser.write_escaped_text(buf)); - assert_start_with(buf, buf_size, "\"value123\""); - - ASSERT_EQ(json_token::END_OBJECT, parser.next_token()); - clear_buff(buf, buf_size); - ASSERT_EQ(1, parser.write_escaped_text(buf)); - assert_start_with(buf, buf_size, "}"); -} - -TEST_F(JsonParserTests, WriteEscapedNumberText) -{ - constexpr std::size_t buf_size = 256; - char buf[buf_size]; - - std::string json = " [ -12345 , -1.23e-000123 , true , false , null ] "; - auto parser = get_parser(json); - - ASSERT_EQ(json_token::START_ARRAY, parser.next_token()); - clear_buff(buf, buf_size); - ASSERT_EQ(1, parser.write_escaped_text(buf)); - assert_start_with(buf, buf_size, "["); - - ASSERT_EQ(json_token::VALUE_NUMBER_INT, parser.next_token()); - clear_buff(buf, buf_size); - ASSERT_EQ(6, parser.write_escaped_text(buf)); - assert_start_with(buf, buf_size, "-12345"); - - ASSERT_EQ(json_token::VALUE_NUMBER_FLOAT, parser.next_token()); - clear_buff(buf, buf_size); - ASSERT_EQ(13, parser.write_escaped_text(buf)); - assert_start_with(buf, buf_size, "-1.23e-000123"); - - ASSERT_EQ(json_token::VALUE_TRUE, parser.next_token()); - clear_buff(buf, buf_size); - ASSERT_EQ(4, parser.write_escaped_text(buf)); - assert_start_with(buf, buf_size, "true"); - - ASSERT_EQ(json_token::VALUE_FALSE, parser.next_token()); - clear_buff(buf, buf_size); - ASSERT_EQ(5, parser.write_escaped_text(buf)); - assert_start_with(buf, buf_size, "false"); - - ASSERT_EQ(json_token::VALUE_NULL, parser.next_token()); - clear_buff(buf, buf_size); - ASSERT_EQ(4, parser.write_escaped_text(buf)); - assert_start_with(buf, buf_size, "null"); - - ASSERT_EQ(json_token::END_ARRAY, parser.next_token()); - clear_buff(buf, buf_size); - ASSERT_EQ(1, parser.write_escaped_text(buf)); - assert_start_with(buf, buf_size, "]"); - - ASSERT_EQ(json_token::SUCCESS, parser.next_token()); - clear_buff(buf, buf_size); - ASSERT_EQ(0, parser.write_escaped_text(buf)); - assert_start_with(buf, buf_size, ""); -} - -TEST_F(JsonParserTests, WriteEscapedInvalid) -{ - constexpr std::size_t buf_size = 256; - char buf[buf_size]; - - std::string json = " invalid "; - auto parser = get_parser(json); - - ASSERT_EQ(json_token::INIT, parser.get_current_token()); - clear_buff(buf, buf_size); - ASSERT_EQ(0, parser.write_escaped_text(buf)); - assert_start_with(buf, buf_size, ""); - - ASSERT_EQ(json_token::ERROR, parser.next_token()); - clear_buff(buf, buf_size); - ASSERT_EQ(0, parser.write_escaped_text(buf)); - assert_start_with(buf, buf_size, ""); -} - -TEST_F(JsonParserTests, WriteEscapedEscape) -{ - constexpr std::size_t buf_size = 256; - char buf[buf_size]; - // test escape: \", \', \\, \/, \b, \f, \n, \r, \t, \b - std::string json = " '\\\"\\'\\\\\\/\\b\\f\\n\\r\\t\\b' "; - auto parser = get_parser(json); - ASSERT_EQ(json_token::VALUE_STRING, parser.next_token()); - clear_buff(buf, buf_size); - ASSERT_EQ(20, parser.write_escaped_text(buf)); - assert_start_with(buf, buf_size, "\"\\\"'\\\\/\\b\\f\\n\\r\\t\\b\""); -} - -TEST_F(JsonParserTests, WriteEscapedUnicode) -{ - // "中国".getBytes(StandardCharsets.UTF_8) is: - // Array(-28, -72, -83, -27, -101, -67) - constexpr std::size_t buf_size = 256; - char buf[buf_size]; - std::string json = " '\\u4e2d\\u56FD' "; // Represents 中国 - auto parser = get_parser(json); - - ASSERT_EQ(json_token::VALUE_STRING, parser.next_token()); - clear_buff(buf, buf_size); - ASSERT_EQ(8, parser.write_escaped_text(buf)); - assert_start_with(buf, buf_size, "\"中国\""); -} - -TEST_F(JsonParserTests, WriteEscapedOther) -{ - constexpr std::size_t buf_size = 256; - char buf[buf_size]; - std::string json = " '中国' "; - auto parser = get_parser(json); - - ASSERT_EQ(json_token::VALUE_STRING, parser.next_token()); - clear_buff(buf, buf_size); - ASSERT_EQ(8, parser.write_escaped_text(buf)); - assert_start_with(buf, buf_size, "\"中国\""); -} - -TEST_F(JsonParserTests, WriteEscapedContralChars) -{ - std::vector> cases = { - std::make_pair(0, "\"\\u0000\""), std::make_pair(1, "\"\\u0001\""), - std::make_pair(2, "\"\\u0002\""), std::make_pair(3, "\"\\u0003\""), - std::make_pair(4, "\"\\u0004\""), std::make_pair(5, "\"\\u0005\""), - std::make_pair(6, "\"\\u0006\""), std::make_pair(7, "\"\\u0007\""), - std::make_pair(8, "\"\\b\""), std::make_pair(9, "\"\\t\""), - std::make_pair(10, "\"\\n\""), std::make_pair(11, "\"\\u000B\""), - std::make_pair(12, "\"\\f\""), std::make_pair(13, "\"\\r\""), - std::make_pair(14, "\"\\u000E\""), std::make_pair(15, "\"\\u000F\""), - std::make_pair(16, "\"\\u0010\""), std::make_pair(17, "\"\\u0011\""), - std::make_pair(18, "\"\\u0012\""), std::make_pair(19, "\"\\u0013\""), - std::make_pair(20, "\"\\u0014\""), std::make_pair(21, "\"\\u0015\""), - std::make_pair(22, "\"\\u0016\""), std::make_pair(23, "\"\\u0017\""), - std::make_pair(24, "\"\\u0018\""), std::make_pair(25, "\"\\u0019\""), - std::make_pair(26, "\"\\u001A\""), std::make_pair(27, "\"\\u001B\""), - std::make_pair(28, "\"\\u001C\""), std::make_pair(29, "\"\\u001D\""), - std::make_pair(30, "\"\\u001E\""), std::make_pair(31, "\"\\u001F\"")}; - for (size_t i = 0; i < cases.size(); ++i) { - constexpr std::size_t buf_size = 256; - char buf[buf_size]; - std::string json = "'"; - json = json + (char)(cases[i].first); - json = json + "'"; - auto parser = get_parser(json); - ASSERT_EQ(json_token::VALUE_STRING, parser.next_token()); - clear_buff(buf, buf_size); - parser.write_escaped_text(buf); - assert_start_with(buf, buf_size, cases[i].second); - } -} - -void testCopyCurrentStructureValid(bool copy_to_nullptr) -{ - constexpr std::size_t buf_size = 256; - char buf[buf_size]; - std::string json = R"( - { - "k1": 1, - 'k2': { - "k3": {} - } - } - )"; - auto parser = get_parser(json); - parser.next_token(); - clear_buff(buf, buf_size); - char* copy_to = copy_to_nullptr ? nullptr : buf; - thrust::pair ret; - ret = parser.copy_current_structure(copy_to); - ASSERT_TRUE(thrust::get<0>(ret)); // copy from the first { - std::string expect = R"({"k1":1,"k2":{"k3":{}}})"; - ASSERT_EQ(thrust::get<1>(ret), expect.size()); - if (!copy_to_nullptr) { assert_start_with(buf, buf_size, expect); } - ASSERT_EQ(parser.get_current_token(), json_token::END_OBJECT); - - parser.reset(); - clear_buff(buf, buf_size); - parser.next_token(); - parser.next_token(); - parser.next_token(); - parser.next_token(); - parser.next_token(); - copy_to = copy_to_nullptr ? nullptr : buf; - ret = parser.copy_current_structure(copy_to); - ASSERT_TRUE(thrust::get<0>(ret)); // copy from the the 2nd { - expect = R"({"k3":{}})"; - ASSERT_EQ(thrust::get<1>(ret), expect.size()); - if (!copy_to_nullptr) { assert_start_with(buf, buf_size, expect); } - ASSERT_EQ(parser.get_current_token(), json_token::END_OBJECT); - - std::string json2 = R"( - [[1,{'k':2},3]] - )"; - auto parser2 = get_parser(json2); - parser2.next_token(); - clear_buff(buf, buf_size); - copy_to = copy_to_nullptr ? nullptr : buf; - ret = parser2.copy_current_structure(copy_to); // copy from the first [ - ASSERT_TRUE(thrust::get<0>(ret)); - expect = R"([[1,{"k":2},3]])"; - ASSERT_EQ(thrust::get<1>(ret), expect.size()); - if (!copy_to_nullptr) { assert_start_with(buf, buf_size, expect); } - ASSERT_EQ(parser2.get_current_token(), json_token::END_ARRAY); - - parser2.reset(); - clear_buff(buf, buf_size); - parser2.next_token(); - parser2.next_token(); - copy_to = copy_to_nullptr ? nullptr : buf; - ret = parser2.copy_current_structure(copy_to); - ASSERT_TRUE(thrust::get<0>(ret)); // copy from the 2nd [ - expect = R"([1,{"k":2},3])"; - ASSERT_EQ(thrust::get<1>(ret), expect.size()); - if (!copy_to_nullptr) { assert_start_with(buf, buf_size, expect); } - ASSERT_EQ(parser2.get_current_token(), json_token::END_ARRAY); - - parser2.reset(); - clear_buff(buf, buf_size); - parser2.next_token(); - parser2.next_token(); - parser2.next_token(); // current token is 1 - copy_to = copy_to_nullptr ? nullptr : buf; - ret = parser2.copy_current_structure(copy_to); - ASSERT_TRUE(thrust::get<0>(ret)); - expect = "1"; - ASSERT_EQ(thrust::get<1>(ret), expect.size()); - if (!copy_to_nullptr) { assert_start_with(buf, buf_size, expect); } - ASSERT_EQ(parser2.get_current_token(), json_token::VALUE_NUMBER_INT); -} - -TEST_F(JsonParserTests, CopyCurrentStructureValid) -{ - testCopyCurrentStructureValid(/* copy_to_nullptr */ false); - testCopyCurrentStructureValid(/* copy_to_nullptr */ true); -} - -TEST_F(JsonParserTests, CopyCurrentStructureValidEscape) -{ - constexpr std::size_t buf_size = 256; - char buf[buf_size]; - std::string json = R"( - {'a' : 'A"'} - )"; - auto parser = get_parser(json); - char* copy_to = buf; - thrust::pair ret; - parser.next_token(); - copy_to = buf; - clear_buff(buf, buf_size); - ret = parser.copy_current_structure(copy_to); - ASSERT_TRUE(thrust::get<0>(ret)); - std::string expect = R"({"a":"A\""})"; - ASSERT_EQ(thrust::get<1>(ret), expect.size()); - assert_start_with(buf, buf_size, expect); -} - -TEST_F(JsonParserTests, CopyCurrentStructureInValid1) -{ - constexpr std::size_t buf_size = 256; - char buf[buf_size]; - std::string json = R"( - [{}] - )"; - auto parser = get_parser(json); - char* copy_to = buf; - thrust::pair ret; - ret = parser.copy_current_structure(copy_to); - ASSERT_FALSE(thrust::get<0>(ret)); // INIT token - parser.next_token(); - parser.next_token(); - parser.next_token(); - copy_to = buf; - ret = parser.copy_current_structure(copy_to); - ASSERT_FALSE(thrust::get<0>(ret)); // for } token - parser.next_token(); - copy_to = buf; - ret = parser.copy_current_structure(copy_to); - ASSERT_FALSE(thrust::get<0>(ret)); // for ] token - parser.next_token(); - copy_to = buf; - ret = parser.copy_current_structure(copy_to); - ASSERT_FALSE(thrust::get<0>(ret)); // for SUCCESS token -} - -TEST_F(JsonParserTests, CopyCurrentStructureInValid2) -{ - constexpr std::size_t buf_size = 256; - char buf[buf_size]; - std::string json = R"( - {'k' : 1} - )"; - auto parser = get_parser(json); - parser.next_token(); - parser.next_token(); - char* copy_to = buf; - thrust::pair ret; - ret = parser.copy_current_structure(copy_to); - ASSERT_FALSE(thrust::get<0>(ret)); // for field name 'k', return false -} - -TEST_F(JsonParserTests, CopyCurrentStructureInValid3) -{ - constexpr std::size_t buf_size = 256; - char buf[buf_size]; - std::string json = R"( - {'k' : 1 - )"; - auto parser = get_parser(json); - parser.next_token(); - char* copy_to = buf; - thrust::pair ret; - ret = parser.copy_current_structure(copy_to); - ASSERT_FALSE(thrust::get<0>(ret)); // for { token, fails because of invalid JSON format -} - -TEST_F(JsonParserTests, CopyCurrentStructureInValid4) -{ - constexpr std::size_t buf_size = 256; - char buf[buf_size]; - std::string json = R"( - invalid - )"; - auto parser = get_parser(json); - parser.next_token(); - char* copy_to = buf; - thrust::pair ret; - ret = parser.copy_current_structure(copy_to); - ASSERT_FALSE(thrust::get<0>(ret)); // for ERROR token -} diff --git a/src/test/java/com/nvidia/spark/rapids/jni/GetJsonObjectTest.java b/src/test/java/com/nvidia/spark/rapids/jni/GetJsonObjectTest.java index 6ff79a4e12..d2dd22c929 100644 --- a/src/test/java/com/nvidia/spark/rapids/jni/GetJsonObjectTest.java +++ b/src/test/java/com/nvidia/spark/rapids/jni/GetJsonObjectTest.java @@ -168,52 +168,68 @@ void getJsonObjectTest_Baidu_get_unexist_field_name() { } } - /** - * query path is : $ + * test escape chars: " in ' pair; ' in " pair */ @Test - void getJsonObjectTest_Baidu_path_is_empty() { + void getJsonObjectTest_Escape() { int paths_num = 0; JSONUtils.PathInstructionJni[] query = new JSONUtils.PathInstructionJni[0]; - String JSON = "[100.0,200.000,351.980]"; - String expectedStr = "[100.0,200.000,351.980]"; + String JSON1 = "{ \"a\": \"A\" }"; + String JSON2 = "{'a':'A\"'}"; + String JSON3 = "{'a':\"B'\"}"; + String JSON4 = "['a','b','\"C\"']"; + + String expectedStr1 = "{\"a\":\"A\"}"; + String expectedStr2 = "{\"a\":\"A\\\"\"}"; + String expectedStr3 = "{\"a\":\"B'\"}"; + String expectedStr4 = "[\"a\",\"b\",\"\\\"C\\\"\"]"; + try ( ColumnVector jsonCv = ColumnVector.fromStrings( - JSON, JSON, JSON, JSON, JSON, JSON, JSON); + JSON1, JSON2, JSON3, JSON4); ColumnVector expected = ColumnVector.fromStrings( - expectedStr, expectedStr, expectedStr, expectedStr, expectedStr, expectedStr, expectedStr); + expectedStr1, expectedStr2, expectedStr3, expectedStr4); ColumnVector actual = JSONUtils.getJsonObject(jsonCv, paths_num, query)) { assertColumnsAreEqual(expected, actual); } } /** - * test escape chars: " in ' pair; ' in " pair + * test number normalizations */ @Test - void getJsonObjectTest_Escape() { + void getJsonObjectTest_Number_Normalization() { int paths_num = 0; JSONUtils.PathInstructionJni[] query = new JSONUtils.PathInstructionJni[0]; - String JSON1 = "{ \"a\": \"A\" }"; - String JSON2 = "{'a':'A\"'}"; - String JSON3 = "{'a':\"B'\"}"; - String JSON4 = "['a','b','\"C\"']"; - - String expectedStr1 = "{\"a\":\"A\"}"; - String expectedStr2 = "{\"a\":\"A\\\"\"}"; - String expectedStr3 = "{\"a\":\"B'\"}"; - String expectedStr4 = "[\"a\",\"b\",\"\\\"C\\\"\"]"; + String JSON1 = "[100.0,200.000,351.980]"; + String JSON2 = "[12345678900000000000.0]"; + String JSON3 = "[0.0]"; + String JSON4 = "[-0.0]"; + String JSON5 = "[-0]"; + String JSON6 = "[12345678999999999999999999]"; + String JSON7 = "[1E308]"; + String JSON8 = "[1.0E309,-1E309,1E5000]"; + + String expectedStr1 = "[100.0,200.0,351.98]"; + String expectedStr2 = "[1.23456789E19]"; + String expectedStr3 = "[0.0]"; + String expectedStr4 = "[-0.0]"; + String expectedStr5 = "[0]"; + String expectedStr6 = "[12345678999999999999999999]"; + String expectedStr7 = "[1.0E308]"; + String expectedStr8 = "[\"Infinity\",\"-Infinity\",\"Infinity\"]"; try ( ColumnVector jsonCv = ColumnVector.fromStrings( - JSON1, JSON2, JSON3, JSON4); + JSON1, JSON2, JSON3, JSON4, JSON5, JSON6, JSON7, JSON8); ColumnVector expected = ColumnVector.fromStrings( - expectedStr1, expectedStr2, expectedStr3, expectedStr4); + expectedStr1, expectedStr2, expectedStr3, expectedStr4, expectedStr5, expectedStr6, expectedStr7, expectedStr8); ColumnVector actual = JSONUtils.getJsonObject(jsonCv, paths_num, query)) { assertColumnsAreEqual(expected, actual); } + } }