-
Notifications
You must be signed in to change notification settings - Fork 1.1k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
7 changed files
with
356 additions
and
28 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
============== | ||
JSON Functions | ||
============== | ||
|
||
JSON Format | ||
----------- | ||
|
||
JSON is a language-independent data format that represents data as | ||
human-readable text. A JSON text can represent a number, a boolean, a | ||
string, an array, an object, or a null, with slightly different grammar. | ||
For instance, a JSON text representing a string must escape all characters | ||
and enclose the string in double quotes, such as ``"123\n"``, whereas a JSON | ||
text representing a number does not need to, such as ``123``. A JSON text | ||
representing an array must enclose the array elements in square brackets, | ||
such as ``[1,2,3]``. More detailed grammar can be found in | ||
`this JSON introduction`_. | ||
|
||
.. _this JSON introduction: https://www.json.org | ||
|
||
JSON Functions | ||
-------------- | ||
|
||
.. spark:function:: get_json_object(jsonString, path) -> varchar | ||
Returns a json object, represented by VARCHAR, from ``jsonString`` by searching ``path``. | ||
Valid ``path`` should start with '$' and then contain "[index]", "['field']" or ".field" | ||
to define a JSON path. Here are some examples: "$.a", "$.a.b", "$[0]['a'].b". Returns | ||
``jsonString`` if ``path`` is "$". Returns NULL if ``jsonString`` or ``path`` is malformed. | ||
Also returns NULL if ``path`` doesn't exist. :: | ||
|
||
SELECT get_json_object('{"a":"b"}', '$.a'); -- 'b' | ||
SELECT get_json_object('{"a":{"b":"c"}}', '$.a'); -- '{"b":"c"}' | ||
SELECT get_json_object('{"a":3}', '$.b'); -- NULL (field not found) | ||
SELECT get_json_object('{"a"-3}', '$.a'); -- NULL (malformed JSON string) | ||
SELECT get_json_object('{"a":3}', '.a'); -- NULL (malformed JSON path) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,195 @@ | ||
/* | ||
* Copyright (c) Facebook, Inc. and its affiliates. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
#include "velox/functions/prestosql/SIMDJsonFunctions.h" | ||
|
||
using namespace simdjson; | ||
|
||
namespace facebook::velox::functions::sparksql { | ||
|
||
template <typename T> | ||
struct SIMDGetJsonObjectFunction { | ||
VELOX_DEFINE_FUNCTION_TYPES(T); | ||
|
||
// ASCII input always produces ASCII result. | ||
static constexpr bool is_default_ascii_behavior = true; | ||
|
||
// Pre-processes the path argument once, ahead of per-row evaluation. When a
// path pointer is supplied and the path has the form "$<rest>" with a
// non-empty <rest>, the part after '$' is normalized via removeSingleQuotes()
// and cached in jsonPath_ so that call() can reuse it. In every other case
// jsonPath_ is left unset.
// NOTE(review): presumably `jsonPath` is non-null only when the path argument
// is a constant -- confirm against the function framework.
FOLLY_ALWAYS_INLINE void initialize(
    const std::vector<TypePtr>& /*inputTypes*/,
    const core::QueryConfig& config,
    const arg_type<Varchar>* /*json*/,
    const arg_type<Varchar>* jsonPath) {
  const bool cacheable = jsonPath != nullptr && jsonPath->size() > 1 &&
      jsonPath->data()[0] == '$';
  if (!cacheable) {
    return;
  }
  // Drop the leading '$'; only the remainder is normalized and cached.
  const std::string_view pathAfterRoot(
      jsonPath->data() + 1, jsonPath->size() - 1);
  jsonPath_ = removeSingleQuotes(pathAfterRoot);
}
|
||
// Evaluates get_json_object for one row.
//
// On success the extracted value is appended to `result` and true is
// returned. False is returned when:
//   - `jsonPath` is empty or does not start with '$';
//   - the path lookup in the parsed document reports an error;
//   - the located value cannot be converted by extractStringResult();
//   - the parser's current location cannot be obtained; or
//   - the character following the extracted value is not a valid JSON
//     terminator (see isValidEndingCharacter()).
// A path of exactly "$" returns the input `json` unchanged.
FOLLY_ALWAYS_INLINE bool call(
    out_type<Varchar>& result,
    const arg_type<Varchar>& json,
    const arg_type<Varchar>& jsonPath) {
  // Spark requires the first char in jsonPath is '$'.
  if (jsonPath.size() < 1 || jsonPath.data()[0] != '$') {
    return false;
  }
  // jsonPath is "$": the whole input is the answer, returned verbatim.
  if (jsonPath.size() == 1) {
    result.append(json);
    return true;
  }
  ParserContext ctx(json.data(), json.size());
  ctx.parseDocument();
  // Use the path cached by initialize() when there is one; otherwise strip
  // the leading '$' and normalize the path for this row.
  auto rawResult = jsonPath_.has_value()
      ? ctx.jsonDoc.at_path(jsonPath_.value().data())
      : ctx.jsonDoc.at_path(
            removeSingleQuotes(
                std::string_view(jsonPath.data() + 1, jsonPath.size() - 1))
                .data());
  if (rawResult.error()) {
    return false;
  }
  auto error = extractStringResult(rawResult, result);
  if (error) {
    return false;
  }

  // The location lookup can fail, in which case the output pointer is never
  // written. Initialize it and check the error so a failed lookup is reported
  // as "no result" instead of dereferencing an indeterminate pointer.
  const char* currentPos = nullptr;
  if (ctx.jsonDoc.current_location().get(currentPos) ||
      currentPos == nullptr) {
    return false;
  }
  return isValidEndingCharacter(currentPos);
}
|
||
private: | ||
// Spark's json path requires a field name to be surrounded by single quotes
// if it is specified in "[]", but the simdjson lib requires it not to be.
// This method removes such single quotes, e.g., converts "['a']['b']" to
// "[a][b]".
//
// Returns the converted path, or the sentinel string "-1" when a "['" has no
// matching "']" (no ']' follows, the character before ']' is not a single
// quote, or the only quote before ']' is the opening one, as in "[']").
FOLLY_ALWAYS_INLINE std::string removeSingleQuotes(
    std::string_view jsonPath) {
  std::string result(jsonPath.data(), jsonPath.size());
  size_t searchFrom = 0;
  while (true) {
    const auto open = result.find("['", searchFrom);
    if (open == std::string::npos) {
      break;
    }
    const auto close = result.find(']', open);
    // `close < open + 3` rejects "[']": there the quote right before ']' is
    // the opening quote itself, and erasing it twice would delete the ']'
    // and silently merge this segment with whatever follows.
    if (close == std::string::npos || close < open + 3 ||
        result[close - 1] != '\'') {
      return "-1";
    }
    // Erase the closing quote first so the opening quote's index stays valid.
    result.erase(close - 1, 1);
    result.erase(open + 1, 1);
    // Two characters were removed before ']', so it now sits at close - 2.
    searchFrom = close - 2;
  }
  return result;
}
|
||
// Converts the value located by the path lookup into text and appends it to
// `result`.
//
// Returns SUCCESS (falsy) when something was appended, or a non-zero
// simdjson::error_code otherwise. Nothing is appended on error.
//
// For number, bool and string types the value is explicitly read with the
// typed getter instead of only streaming `rawResult`. This moves simdjson's
// internal parsing position past the value, so the caller can then check the
// validity of the character that follows it.
FOLLY_ALWAYS_INLINE simdjson::error_code extractStringResult(
    simdjson_result<ondemand::value> rawResult,
    out_type<Varchar>& result) {
  simdjson::error_code error;
  switch (rawResult.type()) {
    case ondemand::json_type::number: {
      switch (rawResult.get_number_type()) {
        case ondemand::number_type::unsigned_integer: {
          uint64_t numberResult;
          error = rawResult.get_uint64().get(numberResult);
          if (!error) {
            // std::to_string is locale-independent and avoids building a
            // stream just to format an integer.
            result.append(std::to_string(numberResult));
          }
          return error;
        }
        case ondemand::number_type::signed_integer: {
          int64_t numberResult;
          error = rawResult.get_int64().get(numberResult);
          if (!error) {
            result.append(std::to_string(numberResult));
          }
          return error;
        }
        case ondemand::number_type::floating_point_number: {
          // get_double() is called to validate the number and advance the
          // parsing position; the text that is emitted is still produced by
          // streaming `rawResult`, not by formatting the parsed double.
          double numberResult;
          error = rawResult.get_double().get(numberResult);
          if (!error) {
            std::stringstream ss;
            ss << rawResult;
            result.append(ss.str());
          }
          return error;
        }
        default:
          VELOX_UNREACHABLE();
      }
    }
    case ondemand::json_type::boolean: {
      bool boolResult;
      error = rawResult.get_bool().get(boolResult);
      if (!error) {
        result.append(boolResult ? "true" : "false");
      }
      return error;
    }
    case ondemand::json_type::string: {
      std::string_view stringResult;
      error = rawResult.get_string().get(stringResult);
      // Append only on success, consistent with the other scalar branches.
      if (!error) {
        result.append(stringResult);
      }
      return error;
    }
    // For nested cases, e.g., for "{"my": {"hello": 10}}", "$.my" locates an
    // object. Objects and arrays are both emitted by streaming the value.
    case ondemand::json_type::object:
    case ondemand::json_type::array: {
      std::stringstream ss;
      ss << rawResult;
      result.append(ss.str());
      return SUCCESS;
    }
    default: {
      // Any remaining type (presumably JSON null -- TODO confirm) yields no
      // result. The specific code is only a generic failure marker here:
      // call() tests the return value for truthiness and nothing else.
      return UNSUPPORTED_ARCHITECTURE;
    }
  }
}
|
||
// This is a simple validation that checks whether the obtained result is
// followed by a valid char, because the ondemand parsing we are using ignores
// json format validation for characters following the current parsing
// position.
//
// Skips any run of ' ', '\r', '\n' and '\t' starting at `currentPos`, then
// returns true iff the next character is ',', '}' or ']'.
//
// NOTE(review): scanning stops at the first non-whitespace byte, so this
// assumes the buffer holds one (e.g. a NUL terminator or padding) at or after
// `currentPos` -- confirm against how the input buffer is allocated.
bool isValidEndingCharacter(const char* currentPos) {
  // These chars can be prior to a valid ending char. An iterative scan
  // guarantees the position advances on every step.
  while (*currentPos == ' ' || *currentPos == '\r' || *currentPos == '\n' ||
         *currentPos == '\t') {
    ++currentPos;
  }
  const char endingChar = *currentPos;
  return endingChar == ',' || endingChar == '}' || endingChar == ']';
}
|
||
std::optional<std::string> jsonPath_; | ||
}; | ||
|
||
} // namespace facebook::velox::functions::sparksql |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.