Skip to content

Commit

Permalink
Initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
PHILO-HE committed Apr 26, 2024
1 parent 32289f9 commit 2d3c983
Show file tree
Hide file tree
Showing 7 changed files with 356 additions and 28 deletions.
35 changes: 35 additions & 0 deletions velox/docs/functions/spark/json.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
==============
JSON Functions
==============

JSON Format
-----------

JSON is a language-independent data format that represents data as
human-readable text. A JSON text can represent a number, a boolean, a
string, an array, an object, or a null, with slightly different grammar.
For instance, a JSON text representing a string must escape all characters
and enclose the string in double quotes, such as ``"123\n"``, whereas a JSON
text representing a number does not need to, such as ``123``. A JSON text
representing an array must enclose the array elements in square brackets,
such as ``[1,2,3]``. More detailed grammar can be found in
`this JSON introduction`_.

.. _this JSON introduction: https://www.json.org

JSON Functions
--------------

.. spark:function:: get_json_object(jsonString, path) -> varchar
Returns a json object, represented by VARCHAR, from ``jsonString`` by searching ``path``.
Valid ``path`` should start with '$' and then contain "[index]", "['field']" or ".field"
to define a JSON path. Here are some examples: "$.a" "$.a.b", "$[0]['a'].b". Returns
``jsonString`` if ``path`` is "$". Returns NULL if ``jsonString`` or ``path`` is malformed.
Also returns NULL if ``path`` doesn't exist. ::

SELECT get_json_object('{"a":"b"}', '$.a'); -- 'b'
SELECT get_json_object('{"a":{"b":"c"}}', '$.a'); -- '{"b":"c"}'
SELECT get_json_object('{"a":3}', '$.b'); -- NULL (not found field)
SELECT get_json_object('{"a"-3}'', '$.a'); -- NULL (malformed JSON string)
SELECT get_json_object('{"a":3}'', '.a'); -- NULL (malformed JSON path)
27 changes: 0 additions & 27 deletions velox/functions/prestosql/benchmarks/JsonExprBenchmark.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -189,8 +189,6 @@ class JsonBenchmark : public velox::functions::test::FunctionBenchmarkBase {
{"folly_json_array_length"});
registerFunction<SIMDJsonArrayLengthFunction, int64_t, Json>(
{"simd_json_array_length"});
registerFunction<JsonExtractScalarFunction, Varchar, Json, Varchar>(
{"folly_json_extract_scalar"});
registerFunction<SIMDJsonExtractScalarFunction, Varchar, Json, Varchar>(
{"simd_json_extract_scalar"});
registerFunction<JsonExtractFunction, Varchar, Json, Varchar>(
Expand Down Expand Up @@ -335,15 +333,6 @@ void SIMDJsonArrayLength(int iter, int vectorSize, int jsonSize) {
benchmark.runWithJson(iter, vectorSize, "simd_json_array_length", json);
}

void FollyJsonExtractScalar(int iter, int vectorSize, int jsonSize) {
folly::BenchmarkSuspender suspender;
JsonBenchmark benchmark;
auto json = benchmark.prepareData(jsonSize);
suspender.dismiss();
benchmark.runWithJsonExtract(
iter, vectorSize, "folly_json_extract_scalar", json, "$.key[7].k1");
}

void SIMDJsonExtractScalar(int iter, int vectorSize, int jsonSize) {
folly::BenchmarkSuspender suspender;
JsonBenchmark benchmark;
Expand Down Expand Up @@ -510,43 +499,27 @@ BENCHMARK_RELATIVE_NAMED_PARAM(
BENCHMARK_DRAW_LINE();

BENCHMARK_DRAW_LINE();
BENCHMARK_NAMED_PARAM(FollyJsonExtractScalar, 100_iters_10bytes_size, 100, 10);
BENCHMARK_RELATIVE_NAMED_PARAM(
SIMDJsonExtractScalar,
100_iters_10bytes_size,
100,
10);
BENCHMARK_DRAW_LINE();

BENCHMARK_NAMED_PARAM(
FollyJsonExtractScalar,
100_iters_100bytes_size,
100,
100);
BENCHMARK_RELATIVE_NAMED_PARAM(
SIMDJsonExtractScalar,
100_iters_100bytes_size,
100,
100);
BENCHMARK_DRAW_LINE();

BENCHMARK_NAMED_PARAM(
FollyJsonExtractScalar,
100_iters_1000bytes_size,
100,
1000);
BENCHMARK_RELATIVE_NAMED_PARAM(
SIMDJsonExtractScalar,
100_iters_1000bytes_size,
100,
1000);
BENCHMARK_DRAW_LINE();

BENCHMARK_NAMED_PARAM(
FollyJsonExtractScalar,
100_iters_10000bytes_size,
100,
10000);
BENCHMARK_RELATIVE_NAMED_PARAM(
SIMDJsonExtractScalar,
100_iters_10000bytes_size,
Expand Down
3 changes: 2 additions & 1 deletion velox/functions/sparksql/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,8 @@ target_link_libraries(
velox_functions_spark_specialforms
velox_is_null_functions
velox_functions_util
Folly::folly)
Folly::folly
simdjson)

set_property(TARGET velox_functions_spark PROPERTY JOB_POOL_COMPILE
high_memory_pool)
Expand Down
4 changes: 4 additions & 0 deletions velox/functions/sparksql/Register.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
#include "velox/functions/sparksql/RegexFunctions.h"
#include "velox/functions/sparksql/RegisterArithmetic.h"
#include "velox/functions/sparksql/RegisterCompare.h"
#include "velox/functions/sparksql/SIMDJsonFunctions.h"
#include "velox/functions/sparksql/Size.h"
#include "velox/functions/sparksql/SparkPartitionId.h"
#include "velox/functions/sparksql/String.h"
Expand Down Expand Up @@ -162,6 +163,9 @@ void registerFunctions(const std::string& prefix) {

registerRegexpReplace(prefix);

registerFunction<SIMDGetJsonObjectFunction, Varchar, Varchar, Varchar>(
{prefix + "get_json_object"});

// Register string functions.
registerFunction<sparksql::ChrFunction, Varchar, int64_t>({prefix + "chr"});
registerFunction<AsciiFunction, int32_t, Varchar>({prefix + "ascii"});
Expand Down
195 changes: 195 additions & 0 deletions velox/functions/sparksql/SIMDJsonFunctions.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,195 @@
/*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "velox/functions/prestosql/SIMDJsonFunctions.h"

using namespace simdjson;

namespace facebook::velox::functions::sparksql {

template <typename T>
struct SIMDGetJsonObjectFunction {
VELOX_DEFINE_FUNCTION_TYPES(T);

// ASCII input always produces ASCII result.
static constexpr bool is_default_ascii_behavior = true;

FOLLY_ALWAYS_INLINE void initialize(
const std::vector<TypePtr>& /*inputTypes*/,
const core::QueryConfig& config,
const arg_type<Varchar>* /*json*/,
const arg_type<Varchar>* jsonPath) {
if (jsonPath != nullptr) {
if (jsonPath->size() > 1 && jsonPath->data()[0] == '$') {
jsonPath_ = removeSingleQuotes(
std::string_view(jsonPath->data() + 1, jsonPath->size() - 1));
}
}
}

FOLLY_ALWAYS_INLINE bool call(
out_type<Varchar>& result,
const arg_type<Varchar>& json,
const arg_type<Varchar>& jsonPath) {
// Spark requires the first char in jsonPath is '$'.
if (jsonPath.size() < 1 || jsonPath.data()[0] != '$') {
return false;
}
// jsonPath is "$".
if (jsonPath.size() == 1) {
result.append(json);
return true;
}
ParserContext ctx(json.data(), json.size());
ctx.parseDocument();
auto rawResult = jsonPath_.has_value()
? ctx.jsonDoc.at_path(jsonPath_.value().data())
: ctx.jsonDoc.at_path(
removeSingleQuotes(
std::string_view(jsonPath.data() + 1, jsonPath.size() - 1))
.data());
if (rawResult.error()) {
return false;
}
auto error = extractStringResult(rawResult, result);
if (error) {
return false;
}

const char* currentPos;
ctx.jsonDoc.current_location().get(currentPos);
return isValidEndingCharacter(currentPos);
}

private:
// Spark's json path requires field name surrounded by single quotes if it is
// specified in "[]". But simdjson lib requires not. This method just removes
// such single quotes, e.g., converts "['a']['b']" to "[a][b]".
FOLLY_ALWAYS_INLINE std::string removeSingleQuotes(
std::string_view jsonPath) {
std::string result(jsonPath.data(), jsonPath.size());
size_t pairEnd = 0;
while (true) {
auto pairBegin = result.find("['", pairEnd);
if (pairBegin == std::string::npos) {
break;
}
pairEnd = result.find("]", pairBegin);
if (pairEnd == std::string::npos || result[pairEnd - 1] != '\'') {
return "-1";
}
result.erase(pairEnd - 1, 1);
result.erase(pairBegin + 1, 1);
pairEnd -= 2;
}
return result;
}

FOLLY_ALWAYS_INLINE simdjson::error_code extractStringResult(
simdjson_result<ondemand::value> rawResult,
out_type<Varchar>& result) {
simdjson::error_code error;
std::stringstream ss;
switch (rawResult.type()) {
// For number and bool types, we need to explicitly get the value
// for specific types instead of using `ss << rawResult`. Thus, we
// can make simdjson's internal parsing position moved and then we
// can check the validity of ending character.
case ondemand::json_type::number: {
switch (rawResult.get_number_type()) {
case ondemand::number_type::unsigned_integer: {
uint64_t numberResult;
error = rawResult.get_uint64().get(numberResult);
if (!error) {
ss << numberResult;
result.append(ss.str());
}
return error;
}
case ondemand::number_type::signed_integer: {
int64_t numberResult;
error = rawResult.get_int64().get(numberResult);
if (!error) {
ss << numberResult;
result.append(ss.str());
}
return error;
}
case ondemand::number_type::floating_point_number: {
double numberResult;
error = rawResult.get_double().get(numberResult);
if (!error) {
ss << rawResult;
result.append(ss.str());
}
return error;
}
default:
VELOX_UNREACHABLE();
}
}
case ondemand::json_type::boolean: {
bool boolResult;
error = rawResult.get_bool().get(boolResult);
if (!error) {
result.append(boolResult ? "true" : "false");
}
return error;
}
case ondemand::json_type::string: {
std::string_view stringResult;
error = rawResult.get_string().get(stringResult);
result.append(stringResult);
return error;
}
case ondemand::json_type::object: {
// For nested case, e.g., for "{"my": {"hello": 10}}", "$.my" will
// return an object type.
ss << rawResult;
result.append(ss.str());
return SUCCESS;
}
case ondemand::json_type::array: {
ss << rawResult;
result.append(ss.str());
return SUCCESS;
}
default: {
return UNSUPPORTED_ARCHITECTURE;
}
}
}

// This is a simple validation by checking whether the obtained result is
// followed by valid char. Because ondemand parsing we are using ignores json
// format validation for characters following the current parsing position.
bool isValidEndingCharacter(const char* currentPos) {
char endingChar = *currentPos;
if (endingChar == ',' || endingChar == '}' || endingChar == ']') {
return true;
}
// These chars can be prior to a valid ending char.
if (endingChar == ' ' || endingChar == '\r' || endingChar == '\n' ||
endingChar == '\t') {
return isValidEndingCharacter(currentPos++);
}
return false;
}

std::optional<std::string> jsonPath_;
};

} // namespace facebook::velox::functions::sparksql
1 change: 1 addition & 0 deletions velox/functions/sparksql/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ add_executable(
ElementAtTest.cpp
HashTest.cpp
InTest.cpp
JsonFunctionsTest.cpp
LeastGreatestTest.cpp
MakeDecimalTest.cpp
MakeTimestampTest.cpp
Expand Down
Loading

0 comments on commit 2d3c983

Please sign in to comment.