From 283b15ffb1ed99008ed9f807bc08eadac3ff5bc5 Mon Sep 17 00:00:00 2001 From: philo Date: Thu, 9 Jun 2022 17:36:08 +0800 Subject: [PATCH 1/5] Initial commit --- cpp/src/gandiva/precompiled/string_ops.cc | 50 ++++++++++++++++++- .../gandiva/precompiled/string_ops_test.cc | 11 ++++ cpp/src/gandiva/precompiled/types.h | 2 + 3 files changed, 62 insertions(+), 1 deletion(-) diff --git a/cpp/src/gandiva/precompiled/string_ops.cc b/cpp/src/gandiva/precompiled/string_ops.cc index bf1532392488a..f938c63219ee4 100644 --- a/cpp/src/gandiva/precompiled/string_ops.cc +++ b/cpp/src/gandiva/precompiled/string_ops.cc @@ -1568,4 +1568,52 @@ const char* url_decoder(gdv_int64 context, const char* input, gdv_int32 input_le return out_str; } -} // extern "C" +FORCE_INLINE +const char* conv(gdv_int64 context, const char* input, gdv_int32 input_len, gdv_int32 from_base, + gdv_int32 to_base, gdv_int32* out_len) { + long intermediate = strtol(input, nullptr, from_base); + char reverse_ret[100]; + int i = 0; + while (intermediate > 0) { + int remainder = intermediate % to_base; + char c; + if (to_base == 16) { + if (remainder < 10) { + c = (char)(remainder - (int)'0'); + } else if (remainder == 10) { + c = 'A'; + } else if (remainder == 11) { + c = 'B'; + } else if (remainder == 12) { + c = 'C'; + } else if (remainder == 13) { + c = 'D'; + } else if (remainder == 14) { + c = 'E'; + } else if (remainder == 15) { + c = 'F'; + } + } else { + c = (char)(remainder + (int)'0'); + } + reverse_ret[i] == c; + intermediate = intermediate / to_base; + i++; + } + *out_len = i; + char ret[*out_len]; + for (int i = 0; i < *out_len; i++) { + ret[i] = reverse_ret[out_len - i - 1]; + } + + char* out_str = reinterpret_cast(gdv_fn_context_arena_malloc(context, *out_len)); + if (ret == nullptr) { + gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string"); + *out_len = 0; + return ""; + } + memcpy(out_str, ret, *out_len); + return out_str; +} + +} // extern "C" \ No newline at end of file diff --git a/cpp/src/gandiva/precompiled/string_ops_test.cc b/cpp/src/gandiva/precompiled/string_ops_test.cc index 0afb48d9aa523..c03da4af3b6fc 100644 --- a/cpp/src/gandiva/precompiled/string_ops_test.cc +++ b/cpp/src/gandiva/precompiled/string_ops_test.cc @@ -1142,4 +1142,15 @@ TEST(TestStringOps, TestURLDecoder) { EXPECT_EQ(std::string(out_str, out_len), exp_str); } +TEST(TestStringOps, TestConv) { + gandiva::ExecutionContext ctx; + uint64_t ctx_ptr = reinterpret_cast(&ctx); + gdv_int32 out_len = 0; + const char* out_str; + + out_str = conv(ctx_ptr, "4", 1, 10, 2); + EXPECT_EQ(out_len, 3); + EXPECT_EQ(std::string(out_str, out_len), "100"); +} + } // namespace gandiva diff --git a/cpp/src/gandiva/precompiled/types.h b/cpp/src/gandiva/precompiled/types.h index 82c3daa6059a1..521685bfbddbd 100644 --- a/cpp/src/gandiva/precompiled/types.h +++ b/cpp/src/gandiva/precompiled/types.h @@ -517,4 +517,6 @@ double castFLOAT8_utf8(int64_t context, const char* data, int32_t len); const char* url_decoder(gdv_int64 context, const char* input, gdv_int32 input_len, gdv_int32* out_len); +const char* conv(gdv_int64 context, const char* input, gdv_int32 input_len, gdv_int32 from_base, + gdv_int32 to_base, gdv_int32* out_len); } // extern "C" From 6e0d099dc9045d5bce318ebf4752a980817ff379 Mon Sep 17 00:00:00 2001 From: philo Date: Thu, 9 Jun 2022 18:41:28 +0800 Subject: [PATCH 2/5] Consider negative input and add more test cases --- cpp/src/gandiva/function_registry_string.cc | 4 +++ cpp/src/gandiva/precompiled/string_ops.cc | 28 +++++++++++----- .../gandiva/precompiled/string_ops_test.cc | 33 ++++++++++++++++++- 3 files changed, 56 insertions(+), 9 deletions(-) diff --git a/cpp/src/gandiva/function_registry_string.cc b/cpp/src/gandiva/function_registry_string.cc index 2f30d084f0014..21bba2606eab0 100644 --- a/cpp/src/gandiva/function_registry_string.cc +++ b/cpp/src/gandiva/function_registry_string.cc @@ -325,6 +325,10 @@ std::vector GetStringFunctionRegistry() { NativeFunction("url_decoder", {}, DataTypeVector{utf8()}, utf8(), kResultNullIfNull, "url_decoder", + NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors), + + NativeFunction("conv", {}, DataTypeVector{utf8(), int32(), int32()}, utf8(), + kResultNullIfNull, "conv", NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors)}; return string_fn_registry_; diff --git a/cpp/src/gandiva/precompiled/string_ops.cc b/cpp/src/gandiva/precompiled/string_ops.cc index f938c63219ee4..117d974d251ba 100644 --- a/cpp/src/gandiva/precompiled/string_ops.cc +++ b/cpp/src/gandiva/precompiled/string_ops.cc @@ -1571,15 +1571,23 @@ const char* url_decoder(gdv_int64 context, const char* input, gdv_int32 input_le FORCE_INLINE const char* conv(gdv_int64 context, const char* input, gdv_int32 input_len, gdv_int32 from_base, gdv_int32 to_base, gdv_int32* out_len) { - long intermediate = strtol(input, nullptr, from_base); - char reverse_ret[100]; + from_base = from_base < 0 ? -from_base : from_base; + to_base = to_base < 0 ? -to_base : to_base; + long decimal_value = strtol(input, nullptr, from_base); + bool is_negative = false; + if (decimal_value < 0) { + is_negative = true; + // Use 0 or positive number. + decimal_value = -decimal_value; + } + char reverse_ret[64]; int i = 0; - while (intermediate > 0) { - int remainder = intermediate % to_base; + while (decimal_value > 0) { + int remainder = decimal_value % to_base; char c; if (to_base == 16) { if (remainder < 10) { - c = (char)(remainder - (int)'0'); + c = (char)(remainder + (int)'0'); } else if (remainder == 10) { c = 'A'; } else if (remainder == 11) { @@ -1596,14 +1604,18 @@ const char* conv(gdv_int64 context, const char* input, gdv_int32 input_len, gdv_ } else { c = (char)(remainder + (int)'0'); } - reverse_ret[i] == c; - intermediate = intermediate / to_base; + reverse_ret[i] = c; + i++; + decimal_value = decimal_value / to_base; + } + if (is_negative) { + reverse_ret[i] = '-'; i++; } *out_len = i; char ret[*out_len]; for (int i = 0; i < *out_len; i++) { - ret[i] = reverse_ret[out_len - i - 1]; + ret[i] = reverse_ret[*out_len - i - 1]; } char* out_str = reinterpret_cast(gdv_fn_context_arena_malloc(context, *out_len)); diff --git a/cpp/src/gandiva/precompiled/string_ops_test.cc b/cpp/src/gandiva/precompiled/string_ops_test.cc index c03da4af3b6fc..acbdbb5932873 100644 --- a/cpp/src/gandiva/precompiled/string_ops_test.cc +++ b/cpp/src/gandiva/precompiled/string_ops_test.cc @@ -1148,9 +1148,40 @@ TEST(TestStringOps, TestConv) { gdv_int32 out_len = 0; const char* out_str; - out_str = conv(ctx_ptr, "4", 1, 10, 2); + // 10-base to 2-base + out_str = conv(ctx_ptr, "4", 1, 10, 2, &out_len); EXPECT_EQ(out_len, 3); EXPECT_EQ(std::string(out_str, out_len), "100"); + + // 2-bae to 10-base + out_str = conv(ctx_ptr, "110", 3, 2, 10, &out_len); + EXPECT_EQ(out_len, 1); + EXPECT_EQ(std::string(out_str, out_len), "6"); + + // 10-base to 16-base + out_str = conv(ctx_ptr, "15", 2, 10, 16, &out_len); + EXPECT_EQ(out_len, 1); + EXPECT_EQ(std::string(out_str, out_len), "F"); + + // 36-base to 16-base + out_str = conv(ctx_ptr, "big", 3, 36, 16, &out_len); + EXPECT_EQ(out_len, 4); + EXPECT_EQ(std::string(out_str, out_len), "3A48"); + + // Space is contained in input string. + out_str = conv(ctx_ptr, " 15 ", 2, 10, 16, &out_len); + EXPECT_EQ(out_len, 1); + EXPECT_EQ(std::string(out_str, out_len), "F"); + + // Negative input. + out_str = conv(ctx_ptr, "-15", 3, 10, 16, &out_len); + EXPECT_EQ(out_len, 2); + EXPECT_EQ(std::string(out_str, out_len), "-F"); + + // Negative input and negative base. + out_str = conv(ctx_ptr, "-10", 3, 16, -10, &out_len); + EXPECT_EQ(out_len, 3); + EXPECT_EQ(std::string(out_str, out_len), "-16"); } } // namespace gandiva From d17cb75ee743678166563d01b0a8fb18286b51a2 Mon Sep 17 00:00:00 2001 From: philo Date: Thu, 9 Jun 2022 23:43:54 +0800 Subject: [PATCH 3/5] Use kResultNullInternal and consider more spark test cases --- cpp/src/gandiva/function_registry_string.cc | 2 +- cpp/src/gandiva/precompiled/string_ops.cc | 57 +++++++++++-------- .../gandiva/precompiled/string_ops_test.cc | 36 +++++++++--- cpp/src/gandiva/precompiled/types.h | 6 +- 4 files changed, 66 insertions(+), 35 deletions(-) diff --git a/cpp/src/gandiva/function_registry_string.cc b/cpp/src/gandiva/function_registry_string.cc index 21bba2606eab0..f9cb907f568a1 100644 --- a/cpp/src/gandiva/function_registry_string.cc +++ b/cpp/src/gandiva/function_registry_string.cc @@ -328,7 +328,7 @@ std::vector GetStringFunctionRegistry() { NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors), NativeFunction("conv", {}, DataTypeVector{utf8(), int32(), int32()}, utf8(), - kResultNullIfNull, "conv", + kResultNullInternal, "conv", NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors)}; return string_fn_registry_; diff --git a/cpp/src/gandiva/precompiled/string_ops.cc b/cpp/src/gandiva/precompiled/string_ops.cc index 117d974d251ba..8e4c29f62d26a 100644 --- a/cpp/src/gandiva/precompiled/string_ops.cc +++ b/cpp/src/gandiva/precompiled/string_ops.cc @@ -1569,46 +1569,53 @@ const char* url_decoder(gdv_int64 context, const char* input, gdv_int32 input_le } FORCE_INLINE -const char* conv(gdv_int64 context, const char* input, gdv_int32 input_len, gdv_int32 from_base, - gdv_int32 to_base, gdv_int32* out_len) { +const char* conv(gdv_int64 context, const char* input, gdv_int32 input_len, bool in1_valid, + gdv_int32 from_base, bool in2_valid, gdv_int32 to_base, bool in3_valid, + bool* out_valid, gdv_int32* out_len) { + if (!in1_valid || !in2_valid || !in3_valid) { + *out_len = 0; + *out_valid = false; + return ""; + } + + const int MIN_BASE = 2; + const int MAX_BASE = 36; + if (from_base < MIN_BASE || from_base > MAX_BASE || + fabs(to_base) < MIN_BASE || fabs(to_base) > MAX_BASE) { + *out_len = 0; + *out_valid = false; + return ""; + } + from_base = from_base < 0 ? -from_base : from_base; - to_base = to_base < 0 ? -to_base : to_base; long decimal_value = strtol(input, nullptr, from_base); - bool is_negative = false; - if (decimal_value < 0) { - is_negative = true; + + bool has_negative_mark = false; + if (decimal_value < 0 && to_base < 0) { + has_negative_mark = true; // Use 0 or positive number. decimal_value = -decimal_value; + } else if (decimal_value < 0 && to_base > 0) { + // Use the max value for 64-bit to convert it to positive. + decimal_value = strtol("FFFFFFFFFFFFFFFF", nullptr, 16) + decimal_value -1; } + to_base = to_base < 0 ? -to_base : to_base; + char reverse_ret[64]; int i = 0; while (decimal_value > 0) { int remainder = decimal_value % to_base; char c; - if (to_base == 16) { - if (remainder < 10) { - c = (char)(remainder + (int)'0'); - } else if (remainder == 10) { - c = 'A'; - } else if (remainder == 11) { - c = 'B'; - } else if (remainder == 12) { - c = 'C'; - } else if (remainder == 13) { - c = 'D'; - } else if (remainder == 14) { - c = 'E'; - } else if (remainder == 15) { - c = 'F'; - } - } else { + if (remainder < 10) { c = (char)(remainder + (int)'0'); + } else { + c = (char)(remainder - 10 + (int)'A'); } reverse_ret[i] = c; i++; decimal_value = decimal_value / to_base; } - if (is_negative) { + if (has_negative_mark) { reverse_ret[i] = '-'; i++; } @@ -1622,9 +1629,11 @@ const char* conv(gdv_int64 context, const char* input, gdv_int32 input_len, gdv_ if (ret == nullptr) { gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string"); *out_len = 0; + *out_valid = false; return ""; } memcpy(out_str, ret, *out_len); + *out_valid = true; return out_str; } diff --git a/cpp/src/gandiva/precompiled/string_ops_test.cc b/cpp/src/gandiva/precompiled/string_ops_test.cc index acbdbb5932873..337fcba131b21 100644 --- a/cpp/src/gandiva/precompiled/string_ops_test.cc +++ b/cpp/src/gandiva/precompiled/string_ops_test.cc @@ -1147,41 +1147,61 @@ TEST(TestStringOps, TestConv) { uint64_t ctx_ptr = reinterpret_cast(&ctx); gdv_int32 out_len = 0; const char* out_str; + bool out_valid; // 10-base to 2-base - out_str = conv(ctx_ptr, "4", 1, 10, 2, &out_len); + out_str = conv(ctx_ptr, "4", 1, true, 10, true, 2, true, &out_valid, &out_len); EXPECT_EQ(out_len, 3); + EXPECT_EQ(out_valid, true); EXPECT_EQ(std::string(out_str, out_len), "100"); // 2-bae to 10-base - out_str = conv(ctx_ptr, "110", 3, 2, 10, &out_len); + out_str = conv(ctx_ptr, "110", 3, true, 2, true, 10, true, &out_valid, &out_len); EXPECT_EQ(out_len, 1); EXPECT_EQ(std::string(out_str, out_len), "6"); // 10-base to 16-base - out_str = conv(ctx_ptr, "15", 2, 10, 16, &out_len); + out_str = conv(ctx_ptr, "15", 2, true, 10, true, 16, true, &out_valid, &out_len); EXPECT_EQ(out_len, 1); EXPECT_EQ(std::string(out_str, out_len), "F"); // 36-base to 16-base - out_str = conv(ctx_ptr, "big", 3, 36, 16, &out_len); + out_str = conv(ctx_ptr, "big", 3, true, 36, true, 16, true, &out_valid, &out_len); EXPECT_EQ(out_len, 4); EXPECT_EQ(std::string(out_str, out_len), "3A48"); + // 36-base to 16-base. + std::string input = "9223372036854775807"; + out_str = conv(ctx_ptr, input.c_str(), input.length(), true, 36, true, 16, true, &out_valid, &out_len); + std::string expected_str = "FFFFFFFFFFFFFFFF"; + EXPECT_EQ(out_len, expected_str.length()); + EXPECT_EQ(std::string(out_str, out_len), expected_str); + // Space is contained in input string. - out_str = conv(ctx_ptr, " 15 ", 2, 10, 16, &out_len); + out_str = conv(ctx_ptr, " 15 ", 2, true, 10, true, 16, true, &out_valid, &out_len); EXPECT_EQ(out_len, 1); EXPECT_EQ(std::string(out_str, out_len), "F"); - // Negative input. - out_str = conv(ctx_ptr, "-15", 3, 10, 16, &out_len); + // Negative input and negative to_base. + out_str = conv(ctx_ptr, "-15", 3, true, 10, true, -16, true, &out_valid, &out_len); EXPECT_EQ(out_len, 2); EXPECT_EQ(std::string(out_str, out_len), "-F"); + // Negative input and positive to_base + out_str = conv(ctx_ptr, "-15", 3, true, 10, true, 16, true, &out_valid, &out_len); + EXPECT_EQ(out_len, 16); + EXPECT_EQ(std::string(out_str, out_len), "FFFFFFFFFFFFFFF1"); + // Negative input and negative base. - out_str = conv(ctx_ptr, "-10", 3, 16, -10, &out_len); + out_str = conv(ctx_ptr, "-10", 3, true, 16, true, -10, true, &out_valid, &out_len); EXPECT_EQ(out_len, 3); EXPECT_EQ(std::string(out_str, out_len), "-16"); + + // If there is an invalid digit in the number, the longest + // valid prefix should be converted. + out_str = conv(ctx_ptr, "11abc", 5, true, 10, true, 16, true, &out_valid, &out_len); + EXPECT_EQ(out_len, 1); + EXPECT_EQ(std::string(out_str, out_len), "B"); } } // namespace gandiva diff --git a/cpp/src/gandiva/precompiled/types.h b/cpp/src/gandiva/precompiled/types.h index 521685bfbddbd..e69269cca97ac 100644 --- a/cpp/src/gandiva/precompiled/types.h +++ b/cpp/src/gandiva/precompiled/types.h @@ -517,6 +517,8 @@ double castFLOAT8_utf8(int64_t context, const char* data, int32_t len); const char* url_decoder(gdv_int64 context, const char* input, gdv_int32 input_len, gdv_int32* out_len); -const char* conv(gdv_int64 context, const char* input, gdv_int32 input_len, gdv_int32 from_base, - gdv_int32 to_base, gdv_int32* out_len); +const char* conv(gdv_int64 context, const char* input, gdv_int32 input_len, bool in1_valid, + gdv_int32 from_base, bool in2_valid, gdv_int32 to_base, bool in3_valid, + bool* out_valid, gdv_int32* out_len); + } // extern "C" From e7ea839a71e9d1555e8ea01c0846675e8a37f8fd Mon Sep 17 00:00:00 2001 From: philo Date: Thu, 9 Jun 2022 23:44:35 +0800 Subject: [PATCH 4/5] Use unsigned long in the converting --- cpp/src/gandiva/precompiled/string_ops.cc | 24 ++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/cpp/src/gandiva/precompiled/string_ops.cc b/cpp/src/gandiva/precompiled/string_ops.cc index 8e4c29f62d26a..ae14b2feea123 100644 --- a/cpp/src/gandiva/precompiled/string_ops.cc +++ b/cpp/src/gandiva/precompiled/string_ops.cc @@ -1588,23 +1588,29 @@ const char* conv(gdv_int64 context, const char* input, gdv_int32 input_len, bool } from_base = from_base < 0 ? -from_base : from_base; - long decimal_value = strtol(input, nullptr, from_base); + bool is_negative_input; + unsigned long unsigned_decimal_value; + if (input[0] == '-') { + is_negative_input = true; + unsigned_decimal_value = strtoul(input + 1, nullptr, from_base); + } else { + is_negative_input = false; + unsigned_decimal_value = strtoul(input, nullptr, from_base); + } bool has_negative_mark = false; - if (decimal_value < 0 && to_base < 0) { + if (is_negative_input && to_base < 0) { has_negative_mark = true; - // Use 0 or positive number. - decimal_value = -decimal_value; - } else if (decimal_value < 0 && to_base > 0) { + } else if (is_negative_input && to_base > 0) { // Use the max value for 64-bit to convert it to positive. - decimal_value = strtol("FFFFFFFFFFFFFFFF", nullptr, 16) + decimal_value -1; + unsigned_decimal_value = strtoul("FFFFFFFFFFFFFFFF", nullptr, 16) - unsigned_decimal_value + 1; } to_base = to_base < 0 ? -to_base : to_base; char reverse_ret[64]; int i = 0; - while (decimal_value > 0) { - int remainder = decimal_value % to_base; + while (unsigned_decimal_value > 0) { + int remainder = unsigned_decimal_value % to_base; char c; if (remainder < 10) { c = (char)(remainder + (int)'0'); @@ -1613,7 +1619,7 @@ const char* conv(gdv_int64 context, const char* input, gdv_int32 input_len, bool } reverse_ret[i] = c; i++; - decimal_value = decimal_value / to_base; + unsigned_decimal_value = unsigned_decimal_value / to_base; } if (has_negative_mark) { reverse_ret[i] = '-'; From 38f079195c62a51216bd4a9d1aa01ea1771f2226 Mon Sep 17 00:00:00 2001 From: philo Date: Thu, 9 Jun 2022 23:51:05 +0800 Subject: [PATCH 5/5] Return null for empty input --- cpp/src/gandiva/precompiled/string_ops.cc | 3 ++- cpp/src/gandiva/precompiled/string_ops_test.cc | 4 ++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/cpp/src/gandiva/precompiled/string_ops.cc b/cpp/src/gandiva/precompiled/string_ops.cc index ae14b2feea123..5ae0b178dc424 100644 --- a/cpp/src/gandiva/precompiled/string_ops.cc +++ b/cpp/src/gandiva/precompiled/string_ops.cc @@ -1572,12 +1572,13 @@ FORCE_INLINE const char* conv(gdv_int64 context, const char* input, gdv_int32 input_len, bool in1_valid, gdv_int32 from_base, bool in2_valid, gdv_int32 to_base, bool in3_valid, bool* out_valid, gdv_int32* out_len) { - if (!in1_valid || !in2_valid || !in3_valid) { + if (!in1_valid || !in2_valid || !in3_valid || input_len == 0) { *out_len = 0; *out_valid = false; return ""; } + // Consistent with spark, only support base belonging to [2, 36]. const int MIN_BASE = 2; const int MAX_BASE = 36; if (from_base < MIN_BASE || from_base > MAX_BASE || diff --git a/cpp/src/gandiva/precompiled/string_ops_test.cc b/cpp/src/gandiva/precompiled/string_ops_test.cc index 337fcba131b21..a672c65183541 100644 --- a/cpp/src/gandiva/precompiled/string_ops_test.cc +++ b/cpp/src/gandiva/precompiled/string_ops_test.cc @@ -1202,6 +1202,10 @@ TEST(TestStringOps, TestConv) { out_str = conv(ctx_ptr, "11abc", 5, true, 10, true, 16, true, &out_valid, &out_len); EXPECT_EQ(out_len, 1); EXPECT_EQ(std::string(out_str, out_len), "B"); + + // Should return null for Empty input. + out_str = conv(ctx_ptr, "", 0, true, 10, true, 16, true, &out_valid, &out_len); + EXPECT_EQ(out_valid, false); } } // namespace gandiva