Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Port to_hex/from_hex functions #116

Merged
merged 1 commit into from
Jun 14, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions cpp/src/gandiva/function_registry_string.cc
Original file line number Diff line number Diff line change
Expand Up @@ -327,6 +327,21 @@ std::vector<NativeFunction> GetStringFunctionRegistry() {
kResultNullIfNull, "url_decoder",
NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors),

NativeFunction("to_hex", {"hex"}, DataTypeVector{binary()}, utf8(),
kResultNullIfNull, "to_hex_binary", NativeFunction::kNeedsContext),

NativeFunction("to_hex", {"hex"}, DataTypeVector{utf8()}, utf8(), kResultNullIfNull,
"to_hex_binary", NativeFunction::kNeedsContext),

NativeFunction("to_hex", {"hex"}, DataTypeVector{int64()}, utf8(),
kResultNullIfNull, "to_hex_int64", NativeFunction::kNeedsContext),

NativeFunction("to_hex", {"hex"}, DataTypeVector{int32()}, utf8(),
kResultNullIfNull, "to_hex_int32", NativeFunction::kNeedsContext),

NativeFunction("from_hex", {"unhex"}, DataTypeVector{utf8()}, binary(),
kResultNullIfNull, "from_hex_utf8", NativeFunction::kNeedsContext),

NativeFunction("conv", {}, DataTypeVector{utf8(), int32(), int32()}, utf8(),
kResultNullInternal, "conv",
NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors)};
Expand Down
126 changes: 121 additions & 5 deletions cpp/src/gandiva/precompiled/string_ops.cc
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,17 @@
// under the License.

// String functions
#include "arrow/util/logging.h"
#include "arrow/util/value_parsing.h"

extern "C" {

#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <algorithm>
#include <cinttypes>
#include <climits>
#include <cstdio>
#include <cstdlib>
#include <cstring>

#include "./types.h"

Expand Down Expand Up @@ -1644,4 +1647,117 @@ const char* conv(gdv_int64 context, const char* input, gdv_int32 input_len, bool
return out_str;
}

} // extern "C"
// Gets a binary object and returns its hexadecimal representation. That representation
// maps each byte in the input to a 2-length string containing an upper-case hexadecimal
// number.
// - Examples:
//   - foo -> 666F6F = 66[f] 6F[o] 6F[o]
//   - bar -> 626172 = 62[b] 61[a] 72[r]
//
// Parameters:
//   context  - execution context handle used for arena allocation and error reporting.
//   text     - input bytes (not required to be NUL-terminated).
//   text_len - number of bytes in `text`; a non-positive length yields an empty result.
//   out_len  - receives the length of the returned string (2 * text_len on success).
//
// Returns a buffer allocated from the context arena, or "" (with *out_len == 0) when the
// input is empty or when an error was recorded on the context.
FORCE_INLINE
const char* to_hex_binary(int64_t context, const char* text, int32_t text_len,
                          int32_t* out_len) {
  if (text_len <= 0) {
    *out_len = 0;
    return "";
  }

  // The output needs 2 * text_len characters plus a terminator; refuse inputs for
  // which that size cannot be represented in an int32_t instead of overflowing.
  if (text_len > (INT32_MAX - 1) / 2) {
    gdv_fn_context_set_error_msg(context,
                                 "Input is too large to be converted to a hex string");
    *out_len = 0;
    return "";
  }

  const int32_t hex_len = text_len * 2;
  auto ret = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, hex_len + 1));

  if (ret == nullptr) {
    gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
    *out_len = 0;
    return "";
  }

  static const char kHexDigits[] = "0123456789ABCDEF";

  for (int32_t i = 0; i < text_len; i++) {
    // Go through unsigned char so bytes >= 0x80 are not sign-extended.
    const auto byte = static_cast<unsigned char>(text[i]);
    ret[2 * i] = kHexDigits[byte >> 4];
    ret[2 * i + 1] = kHexDigits[byte & 0x0F];
  }
  // Keep the buffer NUL-terminated, as the snprintf-based implementation did.
  ret[hex_len] = '\0';

  *out_len = hex_len;
  return ret;
}

// Formats a 64-bit integer as upper-case hexadecimal without leading zeros. The value
// is formatted as its unsigned 64-bit bit pattern, so negative inputs produce 16 digits
// (e.g. -1 -> FFFFFFFFFFFFFFFF).
//
// Parameters:
//   context - execution context handle used for arena allocation and error reporting.
//   data    - value to format.
//   out_len - receives the number of characters in the returned string.
//
// Returns a NUL-terminated buffer allocated from the context arena, or "" (with
// *out_len == 0) when the allocation fails.
FORCE_INLINE
const char* to_hex_int64(int64_t context, int64_t data, int32_t* out_len) {
  // Two hex digits per byte, plus one byte for the terminator that snprintf appends.
  const int32_t hex_long_max_size = static_cast<int32_t>(2 * sizeof(int64_t));
  const int32_t buffer_size = hex_long_max_size + 1;
  auto ret = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, buffer_size));

  if (ret == nullptr) {
    gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
    *out_len = 0;
    return "";
  }

  // PRIX64 expects an unsigned argument; the cast keeps the bit pattern unchanged.
  const int written =
      snprintf(ret, buffer_size, "%" PRIX64, static_cast<uint64_t>(data));

  // snprintf returns the number of characters written (excluding the terminator);
  // a negative value signals a formatting failure, reported here as an empty string.
  *out_len = written > 0 ? static_cast<int32_t>(written) : 0;
  return ret;
}

// Formats a 32-bit integer as upper-case hexadecimal without leading zeros. The value
// is formatted as its unsigned 32-bit bit pattern, so negative inputs produce 8 digits
// (e.g. -1 -> FFFFFFFF).
//
// Parameters:
//   context - execution context handle used for arena allocation and error reporting.
//   data    - value to format.
//   out_len - receives the number of characters in the returned string.
//
// Returns a NUL-terminated buffer allocated from the context arena, or "" (with
// *out_len == 0) when the allocation fails.
FORCE_INLINE
const char* to_hex_int32(int64_t context, int32_t data, int32_t* out_len) {
  // Two hex digits per byte, plus one byte for the terminator that snprintf appends.
  const int32_t max_size = static_cast<int32_t>(2 * sizeof(int32_t));
  const int32_t buffer_size = max_size + 1;
  auto ret = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, buffer_size));

  if (ret == nullptr) {
    gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
    *out_len = 0;
    return "";
  }

  // PRIX32 expects an unsigned argument; the cast keeps the bit pattern unchanged.
  const int written =
      snprintf(ret, buffer_size, "%" PRIX32, static_cast<uint32_t>(data));

  // snprintf returns the number of characters written (excluding the terminator);
  // a negative value signals a formatting failure, reported here as an empty string.
  *out_len = written > 0 ? static_cast<int32_t>(written) : 0;
  return ret;
}

// Decodes a hexadecimal string into the bytes it represents: every pair of hex digits
// (upper or lower case) becomes one output byte, e.g. "414243" -> "ABC".
//
// Parameters:
//   context  - execution context handle used for arena allocation and error reporting.
//   text     - hex-encoded input (not required to be NUL-terminated).
//   text_len - number of characters in `text`; a non-positive length yields an empty
//              result.
//   out_len  - receives the number of decoded bytes (text_len / 2 on success).
//
// Returns a buffer allocated from the context arena. On failure (odd length, a
// non-hex character, or allocation failure) an error message is set on the context,
// *out_len is set to 0 and "" is returned.
FORCE_INLINE
const char* from_hex_utf8(int64_t context, const char* text, int32_t text_len,
                          int32_t* out_len) {
  if (text_len <= 0) {
    *out_len = 0;
    return "";
  }

  // the input string should have a length multiple of two
  if (text_len % 2 != 0) {
    gdv_fn_context_set_error_msg(
        context, "Error parsing hex string, length was not a multiple of two.");
    *out_len = 0;
    return "";
  }

  char* ret = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, text_len / 2));

  if (ret == nullptr) {
    gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
    *out_len = 0;
    return "";
  }

  // converting hex encoded string to normal string
  int32_t j = 0;
  for (int32_t i = 0; i < text_len; i += 2) {
    const char b1 = text[i];
    const char b2 = text[i + 1];

    // isxdigit requires a value representable as unsigned char (or EOF); passing a
    // negative plain char (any byte >= 0x80 where char is signed) is undefined
    // behaviour.
    const bool both_hex = isxdigit(static_cast<unsigned char>(b1)) &&
                          isxdigit(static_cast<unsigned char>(b2));
    if (!both_hex) {
      gdv_fn_context_set_error_msg(
          context, "Error parsing hex string, one or more bytes are not valid.");
      *out_len = 0;
      return "";
    }

    // [a-fA-F0-9]: the first digit is the high nibble, the second the low nibble.
    ret[j++] = static_cast<char>(to_binary_from_hex(b1) * 16 + to_binary_from_hex(b2));
  }
  *out_len = j;
  return ret;
}
} // extern "C"
226 changes: 226 additions & 0 deletions cpp/src/gandiva/precompiled/string_ops_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1208,4 +1208,230 @@ TEST(TestStringOps, TestConv) {
EXPECT_EQ(out_valid, false);
}

// Verifies to_hex_binary on empty input, a single byte, and several byte sequences
// (plain text, spaces, line breaks, digits/punctuation, tabs), checking both the
// reported length and the produced hex text.
TEST(TestStringOps, TestToHex) {
  gandiva::ExecutionContext ctx;
  // The function takes the context as a signed 64-bit handle; use the same type here
  // to avoid an implicit signed/unsigned round trip.
  int64_t ctx_ptr = reinterpret_cast<int64_t>(&ctx);

  // Converts `len` bytes of `data` and checks the length and the hex output.
  auto expect_hex = [&](const char* data, int32_t len, const std::string& expected) {
    SCOPED_TRACE(expected);
    int32_t out_len = 0;
    const char* out_str = to_hex_binary(ctx_ptr, data, len, &out_len);
    EXPECT_EQ(out_len, 2 * len);
    EXPECT_EQ(std::string(out_str, out_len), expected);
  };

  char in_str[] = {0x54, 0x65, 0x73, 0x74, 0x53, 0x74, 0x72, 0x69, 0x6E, 0x67};
  expect_hex(in_str, static_cast<int32_t>(sizeof(in_str)), "54657374537472696E67");

  expect_hex("", 0, "");

  char in_str_one_char[] = {0x54};
  expect_hex(in_str_one_char, static_cast<int32_t>(sizeof(in_str_one_char)), "54");

  char in_str_spaces[] = {0x54, 0x65, 0x73, 0x74, 0x20, 0x77, 0x69, 0x74,
                          0x68, 0x20, 0x73, 0x70, 0x61, 0x63, 0x65, 0x73};
  expect_hex(in_str_spaces, static_cast<int32_t>(sizeof(in_str_spaces)),
             "54657374207769746820737061636573");

  char in_str_break_line[] = {0x54, 0x65, 0x78, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68, 0x0A,
                              0x62, 0x72, 0x65, 0x61, 0x6B, 0x20, 0x6C, 0x69, 0x6E, 0x65};
  expect_hex(in_str_break_line, static_cast<int32_t>(sizeof(in_str_break_line)),
             "5465787420776974680A627265616B206C696E65");

  char in_str_with_num[] = {0x54, 0x65, 0x73, 0x74, 0x20, 0x77, 0x69, 0x74, 0x68,
                            0x20, 0x6E, 0x75, 0x6D, 0x62, 0x65, 0x72, 0x73, 0x20,
                            0x31, 0x20, 0x2B, 0x20, 0x31, 0x20, 0x3D, 0x20, 0x32};
  expect_hex(in_str_with_num, static_cast<int32_t>(sizeof(in_str_with_num)),
             "546573742077697468206E756D626572732031202B2031203D2032");

  char in_str_with_tabs[] = {0x09, 0x0A, 0x09, 0x0A, 0x09, 0x0A, 0x09, 0x0A,
                             0x0A, 0x0A, 0x09, 0x20, 0x61, 0x20, 0x6C, 0x65,
                             0x74, 0x74, 0x40, 0x5D, 0x65, 0x72};
  expect_hex(in_str_with_tabs, static_cast<int32_t>(sizeof(in_str_with_tabs)),
             "090A090A090A090A0A0A092061206C657474405D6572");

  // Same bytes as above, supplied through a string literal instead of a char array.
  const char* binary_string =
      "\x09\x0A\x09\x0A\x09\x0A\x09\x0A\x0A\x0A\x09\x20\x61\x20\x6C\x65\x74\x74\x40\x5D"
      "\x65\x72";
  expect_hex(binary_string, 22, "090A090A090A090A0A0A092061206C657474405D6572");
}

// Verifies to_hex_int64 at the extremes of the int64 range and around zero: the
// output has no leading zeros and negative values show their 64-bit bit pattern.
TEST(TestStringOps, TestToHexInt64) {
  gandiva::ExecutionContext ctx;
  // The function takes the context as a signed 64-bit handle; use the same type here
  // to avoid an implicit signed/unsigned round trip.
  int64_t ctx_ptr = reinterpret_cast<int64_t>(&ctx);

  struct Case {
    int64_t input;
    std::string expected;
  };
  const Case cases[] = {
      {INT64_MAX, "7FFFFFFFFFFFFFFF"},
      {INT64_MIN, "8000000000000000"},
      {0, "0"},
      {-0, "0"},
      {-1, "FFFFFFFFFFFFFFFF"},
      {1, "1"},
  };

  for (const auto& test_case : cases) {
    int32_t out_len = 0;
    const char* out_str = to_hex_int64(ctx_ptr, test_case.input, &out_len);
    std::string output = std::string(out_str, out_len);
    EXPECT_FALSE(ctx.has_error());
    EXPECT_EQ(out_len, static_cast<int32_t>(test_case.expected.size()));
    EXPECT_EQ(output, test_case.expected);
    ctx.Reset();
  }
}

// Verifies to_hex_int32 at the extremes of the int32 range and around zero: the
// output has no leading zeros and negative values show their 32-bit bit pattern.
TEST(TestStringOps, TestToHexInt32) {
  gandiva::ExecutionContext ctx;
  // The function takes the context as a signed 64-bit handle; use the same type here
  // to avoid an implicit signed/unsigned round trip.
  int64_t ctx_ptr = reinterpret_cast<int64_t>(&ctx);

  struct Case {
    int32_t input;
    std::string expected;
  };
  const Case cases[] = {
      {INT32_MAX, "7FFFFFFF"},
      {INT32_MIN, "80000000"},
      {0, "0"},
      {-0, "0"},
      {-1, "FFFFFFFF"},
      {1, "1"},
  };

  for (const auto& test_case : cases) {
    int32_t out_len = 0;
    const char* out_str = to_hex_int32(ctx_ptr, test_case.input, &out_len);
    std::string output = std::string(out_str, out_len);
    EXPECT_FALSE(ctx.has_error());
    EXPECT_EQ(out_len, static_cast<int32_t>(test_case.expected.size()));
    EXPECT_EQ(output, test_case.expected);
    ctx.Reset();
  }
}

// Verifies from_hex_utf8: valid hex strings (including mixed-case digits and the
// empty string) decode to the expected bytes without raising an error, while an
// odd-length input and an input containing non-hex characters each report their
// specific error and return an empty result.
TEST(TestStringOps, TestFromHex) {
  gandiva::ExecutionContext ctx;
  // The function takes the context as a signed 64-bit handle; use the same type here
  // to avoid an implicit signed/unsigned round trip.
  gdv_int64 ctx_ptr = reinterpret_cast<gdv_int64>(&ctx);
  gdv_int32 out_len = 0;
  const char* out_str;

  struct Case {
    const char* hex;
    gdv_int32 hex_len;
    std::string expected;
  };
  const Case valid_cases[] = {
      {"414243", 6, "ABC"}, {"", 0, ""},      {"41", 2, "A"},
      {"6d6D", 4, "mm"},    {"6f6d", 4, "om"}, {"4f4D", 4, "OM"},
  };

  for (const auto& test_case : valid_cases) {
    out_str = from_hex_utf8(ctx_ptr, test_case.hex, test_case.hex_len, &out_len);
    EXPECT_FALSE(ctx.has_error());
    EXPECT_EQ(std::string(out_str, out_len), test_case.expected);
  }

  // Odd number of characters: cannot be split into digit pairs.
  out_str = from_hex_utf8(ctx_ptr, "T", 1, &out_len);
  std::string output = std::string(out_str, out_len);
  EXPECT_EQ(output, "");
  EXPECT_THAT(
      ctx.get_error(),
      ::testing::HasSubstr("Error parsing hex string, length was not a multiple of"));
  ctx.Reset();

  // Even length, but contains characters ('\\' and 'x') that are not hex digits.
  out_str = from_hex_utf8(ctx_ptr, "\\x41\\x42\\x43", 12, &out_len);
  output = std::string(out_str, out_len);
  EXPECT_EQ(output, "");
  EXPECT_THAT(
      ctx.get_error(),
      ::testing::HasSubstr("Error parsing hex string, one or more bytes are not valid."));
  ctx.Reset();
}
} // namespace gandiva
Loading