Initial commit

facebookincubator · Apr 26, 2024 · 2d3c983 · 2d3c983
1 parent 32289f9
commit 2d3c983
Show file tree

Hide file tree

Showing 7 changed files with 356 additions and 28 deletions.
diff --git a/velox/docs/functions/spark/json.rst b/velox/docs/functions/spark/json.rst
@@ -0,0 +1,35 @@
+==============
+JSON Functions
+==============
+
+JSON Format
+-----------
+
+JSON is a language-independent data format that represents data as
+human-readable text. A JSON text can represent a number, a boolean, a
+string, an array, an object, or a null, with slightly different grammar.
+For instance, a JSON text representing a string must escape all characters
+and enclose the string in double quotes, such as ``"123\n"``, whereas a JSON
+text representing a number does not need to, such as ``123``. A JSON text
+representing an array must enclose the array elements in square brackets,
+such as ``[1,2,3]``. More detailed grammar can be found in
+`this JSON introduction`_.
+
+.. _this JSON introduction: https://www.json.org
+
+JSON Functions
+--------------
+
+.. spark:function:: get_json_object(jsonString, path) -> varchar
+
+    Returns a json object, represented by VARCHAR, from ``jsonString`` by searching ``path``.
+    Valid ``path`` should start with '$' and then contain "[index]", "['field']" or ".field"
+    to define a JSON path. Here are some examples: "$.a" "$.a.b", "$[0]['a'].b". Returns
+    ``jsonString`` if ``path`` is "$". Returns NULL if ``jsonString`` or ``path`` is malformed.
+    Also returns NULL if ``path`` doesn't exist. ::
+
+        SELECT get_json_object('{"a":"b"}', '$.a'); -- 'b'
+        SELECT get_json_object('{"a":{"b":"c"}}', '$.a'); -- '{"b":"c"}'
+        SELECT get_json_object('{"a":3}', '$.b'); -- NULL (not found field)
+        SELECT get_json_object('{"a"-3}'', '$.a'); -- NULL (malformed JSON string)
+        SELECT get_json_object('{"a":3}'', '.a'); -- NULL (malformed JSON path)
diff --git a/velox/functions/prestosql/benchmarks/JsonExprBenchmark.cpp b/velox/functions/prestosql/benchmarks/JsonExprBenchmark.cpp
@@ -189,8 +189,6 @@ class JsonBenchmark : public velox::functions::test::FunctionBenchmarkBase {
         {"folly_json_array_length"});
     registerFunction<SIMDJsonArrayLengthFunction, int64_t, Json>(
         {"simd_json_array_length"});
-    registerFunction<JsonExtractScalarFunction, Varchar, Json, Varchar>(
-        {"folly_json_extract_scalar"});
     registerFunction<SIMDJsonExtractScalarFunction, Varchar, Json, Varchar>(
         {"simd_json_extract_scalar"});
     registerFunction<JsonExtractFunction, Varchar, Json, Varchar>(
@@ -335,15 +333,6 @@ void SIMDJsonArrayLength(int iter, int vectorSize, int jsonSize) {
   benchmark.runWithJson(iter, vectorSize, "simd_json_array_length", json);
 }
 
-void FollyJsonExtractScalar(int iter, int vectorSize, int jsonSize) {
-  folly::BenchmarkSuspender suspender;
-  JsonBenchmark benchmark;
-  auto json = benchmark.prepareData(jsonSize);
-  suspender.dismiss();
-  benchmark.runWithJsonExtract(
-      iter, vectorSize, "folly_json_extract_scalar", json, "$.key[7].k1");
-}
-
 void SIMDJsonExtractScalar(int iter, int vectorSize, int jsonSize) {
   folly::BenchmarkSuspender suspender;
   JsonBenchmark benchmark;
@@ -510,43 +499,27 @@ BENCHMARK_RELATIVE_NAMED_PARAM(
 BENCHMARK_DRAW_LINE();
 
 BENCHMARK_DRAW_LINE();
-BENCHMARK_NAMED_PARAM(FollyJsonExtractScalar, 100_iters_10bytes_size, 100, 10);
 BENCHMARK_RELATIVE_NAMED_PARAM(
     SIMDJsonExtractScalar,
     100_iters_10bytes_size,
     100,
     10);
 BENCHMARK_DRAW_LINE();
 
-BENCHMARK_NAMED_PARAM(
-    FollyJsonExtractScalar,
-    100_iters_100bytes_size,
-    100,
-    100);
 BENCHMARK_RELATIVE_NAMED_PARAM(
     SIMDJsonExtractScalar,
     100_iters_100bytes_size,
     100,
     100);
 BENCHMARK_DRAW_LINE();
 
-BENCHMARK_NAMED_PARAM(
-    FollyJsonExtractScalar,
-    100_iters_1000bytes_size,
-    100,
-    1000);
 BENCHMARK_RELATIVE_NAMED_PARAM(
     SIMDJsonExtractScalar,
     100_iters_1000bytes_size,
     100,
     1000);
 BENCHMARK_DRAW_LINE();
 
-BENCHMARK_NAMED_PARAM(
-    FollyJsonExtractScalar,
-    100_iters_10000bytes_size,
-    100,
-    10000);
 BENCHMARK_RELATIVE_NAMED_PARAM(
     SIMDJsonExtractScalar,
     100_iters_10000bytes_size,

diff --git a/velox/functions/sparksql/CMakeLists.txt b/velox/functions/sparksql/CMakeLists.txt
@@ -49,7 +49,8 @@ target_link_libraries(
   velox_functions_spark_specialforms
   velox_is_null_functions
   velox_functions_util
-  Folly::folly)
+  Folly::folly
+  simdjson)
 
 set_property(TARGET velox_functions_spark PROPERTY JOB_POOL_COMPILE
                                                    high_memory_pool)

diff --git a/velox/functions/sparksql/Register.cpp b/velox/functions/sparksql/Register.cpp
@@ -39,6 +39,7 @@
 #include "velox/functions/sparksql/RegexFunctions.h"
 #include "velox/functions/sparksql/RegisterArithmetic.h"
 #include "velox/functions/sparksql/RegisterCompare.h"
+#include "velox/functions/sparksql/SIMDJsonFunctions.h"
 #include "velox/functions/sparksql/Size.h"
 #include "velox/functions/sparksql/SparkPartitionId.h"
 #include "velox/functions/sparksql/String.h"
@@ -162,6 +163,9 @@ void registerFunctions(const std::string& prefix) {
 
   registerRegexpReplace(prefix);
 
+  registerFunction<SIMDGetJsonObjectFunction, Varchar, Varchar, Varchar>(
+      {prefix + "get_json_object"});
+
   // Register string functions.
   registerFunction<sparksql::ChrFunction, Varchar, int64_t>({prefix + "chr"});
   registerFunction<AsciiFunction, int32_t, Varchar>({prefix + "ascii"});

diff --git a/velox/functions/sparksql/SIMDJsonFunctions.h b/velox/functions/sparksql/SIMDJsonFunctions.h
@@ -0,0 +1,195 @@
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "velox/functions/prestosql/SIMDJsonFunctions.h"
+
+using namespace simdjson;
+
+namespace facebook::velox::functions::sparksql {
+
+template <typename T>
+struct SIMDGetJsonObjectFunction {
+  VELOX_DEFINE_FUNCTION_TYPES(T);
+
+  // ASCII input always produces ASCII result.
+  static constexpr bool is_default_ascii_behavior = true;
+
+  FOLLY_ALWAYS_INLINE void initialize(
+      const std::vector<TypePtr>& /*inputTypes*/,
+      const core::QueryConfig& config,
+      const arg_type<Varchar>* /*json*/,
+      const arg_type<Varchar>* jsonPath) {
+    if (jsonPath != nullptr) {
+      if (jsonPath->size() > 1 && jsonPath->data()[0] == '$') {
+        jsonPath_ = removeSingleQuotes(
+            std::string_view(jsonPath->data() + 1, jsonPath->size() - 1));
+      }
+    }
+  }
+
+  FOLLY_ALWAYS_INLINE bool call(
+      out_type<Varchar>& result,
+      const arg_type<Varchar>& json,
+      const arg_type<Varchar>& jsonPath) {
+    // Spark requires the first char in jsonPath is '$'.
+    if (jsonPath.size() < 1 || jsonPath.data()[0] != '$') {
+      return false;
+    }
+    // jsonPath is "$".
+    if (jsonPath.size() == 1) {
+      result.append(json);
+      return true;
+    }
+    ParserContext ctx(json.data(), json.size());
+    ctx.parseDocument();
+    auto rawResult = jsonPath_.has_value()
+        ? ctx.jsonDoc.at_path(jsonPath_.value().data())
+        : ctx.jsonDoc.at_path(
+              removeSingleQuotes(
+                  std::string_view(jsonPath.data() + 1, jsonPath.size() - 1))
+                  .data());
+    if (rawResult.error()) {
+      return false;
+    }
+    auto error = extractStringResult(rawResult, result);
+    if (error) {
+      return false;
+    }
+
+    const char* currentPos;
+    ctx.jsonDoc.current_location().get(currentPos);
+    return isValidEndingCharacter(currentPos);
+  }
+
+ private:
+  // Spark's json path requires field name surrounded by single quotes if it is
+  // specified in "[]". But simdjson lib requires not. This method just removes
+  // such single quotes, e.g., converts "['a']['b']" to "[a][b]".
+  FOLLY_ALWAYS_INLINE std::string removeSingleQuotes(
+      std::string_view jsonPath) {
+    std::string result(jsonPath.data(), jsonPath.size());
+    size_t pairEnd = 0;
+    while (true) {
+      auto pairBegin = result.find("['", pairEnd);
+      if (pairBegin == std::string::npos) {
+        break;
+      }
+      pairEnd = result.find("]", pairBegin);
+      if (pairEnd == std::string::npos || result[pairEnd - 1] != '\'') {
+        return "-1";
+      }
+      result.erase(pairEnd - 1, 1);
+      result.erase(pairBegin + 1, 1);
+      pairEnd -= 2;
+    }
+    return result;
+  }
+
+  FOLLY_ALWAYS_INLINE simdjson::error_code extractStringResult(
+      simdjson_result<ondemand::value> rawResult,
+      out_type<Varchar>& result) {
+    simdjson::error_code error;
+    std::stringstream ss;
+    switch (rawResult.type()) {
+      // For number and bool types, we need to explicitly get the value
+      // for specific types instead of using `ss << rawResult`. Thus, we
+      // can make simdjson's internal parsing position moved and then we
+      // can check the validity of ending character.
+      case ondemand::json_type::number: {
+        switch (rawResult.get_number_type()) {
+          case ondemand::number_type::unsigned_integer: {
+            uint64_t numberResult;
+            error = rawResult.get_uint64().get(numberResult);
+            if (!error) {
+              ss << numberResult;
+              result.append(ss.str());
+            }
+            return error;
+          }
+          case ondemand::number_type::signed_integer: {
+            int64_t numberResult;
+            error = rawResult.get_int64().get(numberResult);
+            if (!error) {
+              ss << numberResult;
+              result.append(ss.str());
+            }
+            return error;
+          }
+          case ondemand::number_type::floating_point_number: {
+            double numberResult;
+            error = rawResult.get_double().get(numberResult);
+            if (!error) {
+              ss << rawResult;
+              result.append(ss.str());
+            }
+            return error;
+          }
+          default:
+            VELOX_UNREACHABLE();
+        }
+      }
+      case ondemand::json_type::boolean: {
+        bool boolResult;
+        error = rawResult.get_bool().get(boolResult);
+        if (!error) {
+          result.append(boolResult ? "true" : "false");
+        }
+        return error;
+      }
+      case ondemand::json_type::string: {
+        std::string_view stringResult;
+        error = rawResult.get_string().get(stringResult);
+        result.append(stringResult);
+        return error;
+      }
+      case ondemand::json_type::object: {
+        // For nested case, e.g., for "{"my": {"hello": 10}}", "$.my" will
+        // return an object type.
+        ss << rawResult;
+        result.append(ss.str());
+        return SUCCESS;
+      }
+      case ondemand::json_type::array: {
+        ss << rawResult;
+        result.append(ss.str());
+        return SUCCESS;
+      }
+      default: {
+        return UNSUPPORTED_ARCHITECTURE;
+      }
+    }
+  }
+
+  // This is a simple validation by checking whether the obtained result is
+  // followed by valid char. Because ondemand parsing we are using ignores json
+  // format validation for characters following the current parsing position.
+  bool isValidEndingCharacter(const char* currentPos) {
+    char endingChar = *currentPos;
+    if (endingChar == ',' || endingChar == '}' || endingChar == ']') {
+      return true;
+    }
+    // These chars can be prior to a valid ending char.
+    if (endingChar == ' ' || endingChar == '\r' || endingChar == '\n' ||
+        endingChar == '\t') {
+      return isValidEndingCharacter(currentPos++);
+    }
+    return false;
+  }
+
+  std::optional<std::string> jsonPath_;
+};
+
+} // namespace facebook::velox::functions::sparksql
diff --git a/velox/functions/sparksql/tests/CMakeLists.txt b/velox/functions/sparksql/tests/CMakeLists.txt
@@ -30,6 +30,7 @@ add_executable(
   ElementAtTest.cpp
   HashTest.cpp
   InTest.cpp
+  JsonFunctionsTest.cpp
   LeastGreatestTest.cpp
   MakeDecimalTest.cpp
   MakeTimestampTest.cpp