From 557a866ac59c58355c15c5baa817f75a66148412 Mon Sep 17 00:00:00 2001 From: PHILO-HE Date: Wed, 17 Nov 2021 13:58:03 +0800 Subject: [PATCH] Support "return null" if there is an exception in casting string to int, bigint, float or double (#44) * Initial commit * Add global mapping for new funcs * Add to header * Fix args issue * Check in_valid --- cpp/src/gandiva/function_registry_string.cc | 21 ++++ cpp/src/gandiva/gdv_function_stubs.cc | 106 ++++++++++++++++++++ cpp/src/gandiva/gdv_function_stubs.h | 12 +++ 3 files changed, 139 insertions(+) diff --git a/cpp/src/gandiva/function_registry_string.cc b/cpp/src/gandiva/function_registry_string.cc index eae6fc7b51a0b..b4a3157520ddb 100644 --- a/cpp/src/gandiva/function_registry_string.cc +++ b/cpp/src/gandiva/function_registry_string.cc @@ -134,18 +134,39 @@ std::vector<NativeFunction> GetStringFunctionRegistry() { NativeFunction("castINT", {}, DataTypeVector{utf8()}, int32(), kResultNullIfNull, "gdv_fn_castINT_utf8", NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors), + + // returns null when the input is null or cannot be parsed + NativeFunction("castINTOrNull", {}, DataTypeVector{utf8()}, int32(), kResultNullInternal, + "gdv_fn_castINT_or_null_utf8", + NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors), NativeFunction("castBIGINT", {}, DataTypeVector{utf8()}, int64(), kResultNullIfNull, "gdv_fn_castBIGINT_utf8", NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors), + + // returns null when the input is null or cannot be parsed + NativeFunction("castBIGINTOrNull", {}, DataTypeVector{utf8()}, int64(), kResultNullInternal, + "gdv_fn_castBIGINT_or_null_utf8", + NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors), NativeFunction("castFLOAT4", {}, DataTypeVector{utf8()}, float32(), kResultNullIfNull, "gdv_fn_castFLOAT4_utf8", NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors), + // returns null when the input is null or cannot be parsed + NativeFunction("castFLOAT4OrNull", {}, DataTypeVector{utf8()}, float32(), + kResultNullInternal,
"gdv_fn_castFLOAT4_or_null_utf8", + NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors), + NativeFunction("castFLOAT8", {}, DataTypeVector{utf8()}, float64(), kResultNullIfNull, "gdv_fn_castFLOAT8_utf8", NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors), + + // returns null when the input is null or cannot be parsed + NativeFunction("castFLOAT8OrNull", {}, DataTypeVector{utf8()}, float64(), + kResultNullInternal, "gdv_fn_castFLOAT8_or_null_utf8", + NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors), + NativeFunction("castVARCHAR", {}, DataTypeVector{int8(), int64()}, utf8(), kResultNullIfNull, "castVARCHAR_int8_int64", NativeFunction::kNeedsContext), diff --git a/cpp/src/gandiva/gdv_function_stubs.cc b/cpp/src/gandiva/gdv_function_stubs.cc index 0886acb10e93d..ed31c55fd5c4c 100644 --- a/cpp/src/gandiva/gdv_function_stubs.cc +++ b/cpp/src/gandiva/gdv_function_stubs.cc @@ -498,6 +498,40 @@ CAST_NUMERIC_FROM_VARBINARY(double, arrow::DoubleType, FLOAT8) #undef CAST_NUMERIC_STRING +#define CAST_NUMERIC_OR_NULL_FROM_STRING(OUT_TYPE, ARROW_TYPE, TYPE_NAME) \ + GANDIVA_EXPORT \ + OUT_TYPE gdv_fn_cast##TYPE_NAME##_or_null_utf8(int64_t context, const char* data, \ + int32_t len, bool in_valid, bool* out_valid) { \ + OUT_TYPE val = 0; \ + *out_valid = true; \ + if (!in_valid) { /* null input => null result */ \ + *out_valid = false; \ + return val; \ + } \ + /* trim leading and trailing ' ' characters before parsing */ \ + int32_t trimmed_len; \ + int32_t start = 0, end = len - 1; \ + while (start <= end && data[start] == ' ') { \ + ++start; \ + } \ + while (end >= start && data[end] == ' ') { \ + --end; \ + } \ + trimmed_len = end - start + 1; \ + const char* trimmed_data = data + start; \ + if (!arrow::internal::ParseValue<ARROW_TYPE>(trimmed_data, trimmed_len, &val)) { \ + *out_valid = false; /* unparsable input => null result */ \ + } \ + return val; \ + } + +CAST_NUMERIC_OR_NULL_FROM_STRING(int32_t, arrow::Int32Type, INT) +CAST_NUMERIC_OR_NULL_FROM_STRING(int64_t, arrow::Int64Type, BIGINT) +CAST_NUMERIC_OR_NULL_FROM_STRING(float, arrow::FloatType, FLOAT4)
+CAST_NUMERIC_OR_NULL_FROM_STRING(double, arrow::DoubleType, FLOAT8) + +#undef CAST_NUMERIC_OR_NULL_FROM_STRING + #define GDV_FN_CAST_VARLEN_TYPE_FROM_TYPE(IN_TYPE, CAST_NAME, ARROW_TYPE) \ GANDIVA_EXPORT \ const char* gdv_fn_cast##CAST_NAME##_##IN_TYPE##_int64( \ @@ -534,6 +568,42 @@ CAST_NUMERIC_FROM_VARBINARY(double, arrow::DoubleType, FLOAT8) return ret; \ } +#define GDV_FN_CAST_VARCHAR_INTEGER(IN_TYPE, ARROW_TYPE) \ + GANDIVA_EXPORT \ + const char* gdv_fn_castVARCHAR_##IN_TYPE##_int64(int64_t context, gdv_##IN_TYPE value, \ + int64_t len, int32_t * out_len) { \ + if (len < 0) { \ + gdv_fn_context_set_error_msg(context, "Buffer length can not be negative"); \ + *out_len = 0; \ + return ""; \ + } \ + if (len == 0) { \ + *out_len = 0; \ + return ""; \ + } \ + arrow::internal::StringFormatter<arrow::ARROW_TYPE> formatter; \ + char* ret = reinterpret_cast<char*>( \ + gdv_fn_context_arena_malloc(context, static_cast<int32_t>(len))); \ + if (ret == nullptr) { \ + gdv_fn_context_set_error_msg(context, "Could not allocate memory"); \ + *out_len = 0; \ + return ""; \ + } \ + arrow::Status status = formatter(value, [&](arrow::util::string_view v) { \ + int64_t size = static_cast<int64_t>(v.size()); \ + *out_len = static_cast<int32_t>(len < size ?
len : size); \ + memcpy(ret, v.data(), *out_len); \ + return arrow::Status::OK(); \ + }); \ + if (!status.ok()) { \ + std::string err = "Could not cast " + std::to_string(value) + " to string"; \ + gdv_fn_context_set_error_msg(context, err.c_str()); \ + *out_len = 0; \ + return ""; \ + } \ + return ret; \ + } + #define GDV_FN_CAST_VARLEN_TYPE_FROM_REAL(IN_TYPE, CAST_NAME, ARROW_TYPE) \ GANDIVA_EXPORT \ const char* gdv_fn_cast##CAST_NAME##_##IN_TYPE##_int64( \ @@ -1379,6 +1449,15 @@ void ExportedStubFunctions::AddMappings(Engine* engine) const { engine->AddGlobalMappingForFunc("gdv_fn_castINT_utf8", types->i32_type(), args, reinterpret_cast<void*>(gdv_fn_castINT_utf8)); + args = {types->i64_type(), // int64_t context_ptr + types->i8_ptr_type(), // const char* data + types->i32_type(), // int32_t len + types->i1_type(), // bool in_valid + types->ptr_type(types->i8_type())}; // bool* out_valid + + engine->AddGlobalMappingForFunc("gdv_fn_castINT_or_null_utf8", types->i32_type(), args, + reinterpret_cast<void*>(gdv_fn_castINT_or_null_utf8)); + args = {types->i64_type(), // int64_t context_ptr types->i8_ptr_type(), // const char* data types->i32_type()}; // int32_t lenr @@ -1386,6 +1465,15 @@ void ExportedStubFunctions::AddMappings(Engine* engine) const { engine->AddGlobalMappingForFunc("gdv_fn_castBIGINT_utf8", types->i64_type(), args, reinterpret_cast<void*>(gdv_fn_castBIGINT_utf8)); + args = {types->i64_type(), // int64_t context_ptr + types->i8_ptr_type(), // const char* data + types->i32_type(), // int32_t len + types->i1_type(), // bool in_valid + types->ptr_type(types->i8_type())}; // bool* out_valid + + engine->AddGlobalMappingForFunc("gdv_fn_castBIGINT_or_null_utf8", types->i64_type(), args, + reinterpret_cast<void*>(gdv_fn_castBIGINT_or_null_utf8)); + args = {types->i64_type(), // int64_t context_ptr types->i8_ptr_type(), // const char* data types->i32_type()}; // int32_t lenr @@ -1393,6 +1481,15 @@ void ExportedStubFunctions::AddMappings(Engine* engine) const {
engine->AddGlobalMappingForFunc("gdv_fn_castFLOAT4_utf8", types->float_type(), args, reinterpret_cast<void*>(gdv_fn_castFLOAT4_utf8)); + args = {types->i64_type(), // int64_t context_ptr + types->i8_ptr_type(), // const char* data + types->i32_type(), // int32_t len + types->i1_type(), // bool in_valid + types->ptr_type(types->i8_type())}; // bool* out_valid + + engine->AddGlobalMappingForFunc("gdv_fn_castFLOAT4_or_null_utf8", types->float_type(), args, + reinterpret_cast<void*>(gdv_fn_castFLOAT4_or_null_utf8)); + args = {types->i64_type(), // int64_t context_ptr types->i8_ptr_type(), // const char* data types->i32_type()}; // int32_t lenr @@ -1400,6 +1497,15 @@ void ExportedStubFunctions::AddMappings(Engine* engine) const { engine->AddGlobalMappingForFunc("gdv_fn_castFLOAT8_utf8", types->double_type(), args, reinterpret_cast<void*>(gdv_fn_castFLOAT8_utf8)); + args = {types->i64_type(), // int64_t context_ptr + types->i8_ptr_type(), // const char* data + types->i32_type(), // int32_t len + types->i1_type(), // bool in_valid + types->ptr_type(types->i8_type())}; // bool* out_valid + + engine->AddGlobalMappingForFunc("gdv_fn_castFLOAT8_or_null_utf8", types->double_type(), args, + reinterpret_cast<void*>(gdv_fn_castFLOAT8_or_null_utf8)); + // gdv_fn_castVARCHAR_int32_int64 args = {types->i64_type(), // int64_t execution_context types->i32_type(), // int32_t value diff --git a/cpp/src/gandiva/gdv_function_stubs.h b/cpp/src/gandiva/gdv_function_stubs.h index 5d52fb8fe4083..e652e5c5e97f2 100644 --- a/cpp/src/gandiva/gdv_function_stubs.h +++ b/cpp/src/gandiva/gdv_function_stubs.h @@ -131,15 +131,27 @@ char* gdv_fn_dec_to_string(int64_t context, int64_t x_high, uint64_t x_low, GANDIVA_EXPORT int32_t gdv_fn_castINT_utf8(int64_t context, const char* data, int32_t data_len); +GANDIVA_EXPORT +int32_t gdv_fn_castINT_or_null_utf8(int64_t context, const char* data, int32_t data_len, bool in_valid, bool* out_valid); + GANDIVA_EXPORT int64_t gdv_fn_castBIGINT_utf8(int64_t context, const char* data,
int32_t data_len); +GANDIVA_EXPORT +int64_t gdv_fn_castBIGINT_or_null_utf8(int64_t context, const char* data, int32_t data_len, bool in_valid, bool* out_valid); + GANDIVA_EXPORT float gdv_fn_castFLOAT4_utf8(int64_t context, const char* data, int32_t data_len); +GANDIVA_EXPORT +float gdv_fn_castFLOAT4_or_null_utf8(int64_t context, const char* data, int32_t data_len, bool in_valid, bool* out_valid); + GANDIVA_EXPORT double gdv_fn_castFLOAT8_utf8(int64_t context, const char* data, int32_t data_len); +GANDIVA_EXPORT +double gdv_fn_castFLOAT8_or_null_utf8(int64_t context, const char* data, int32_t data_len, bool in_valid, bool* out_valid); + GANDIVA_EXPORT const char* gdv_fn_castVARCHAR_int32_int64(int64_t context, int32_t value, int64_t len, int32_t* out_len);