Skip to content

Commit

Permalink
Untangle Spark and Presto hashing functions (#196)
Browse files Browse the repository at this point in the history
Summary:
Pull Request resolved: #196

MD5 and XXHASH functions have similar behavior but not the same
between Spark and Presto. Re-organizing the code to implement the expected
semantic.

Reviewed By: mbasmanova, funrollloops

Differential Revision: D30883888

fbshipit-source-id: 4b9e3816ba7764dda996085cb95372d173199463
  • Loading branch information
pedroerp authored and facebook-github-bot committed Sep 14, 2021
1 parent 93c4848 commit 20fc850
Show file tree
Hide file tree
Showing 10 changed files with 63 additions and 225 deletions.
2 changes: 1 addition & 1 deletion velox/expression/tests/EvalSimplifiedTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@ TEST_F(EvalSimplifiedTest, constantAndInput) {
}

TEST_F(EvalSimplifiedTest, strings) {
runTest("md5(c0)", ROW({"c0"}, {VARCHAR()}));
runTest("lower(upper(c0))", ROW({"c0"}, {VARCHAR()}));
}

TEST_F(EvalSimplifiedTest, doubles) {
Expand Down
2 changes: 1 addition & 1 deletion velox/expression/tests/ExpressionFuzzerMain.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ std::vector<CallableSignature> getAllSignatures() {
std::vector<CallableSignature> functions;

// TODO: Skipping buggy functions for now.
std::unordered_set<std::string> skipFunctions = {"xxhash64", "from_unixtime"};
std::unordered_set<std::string> skipFunctions = {"from_unixtime"};
auto keys = exec::AdaptedVectorFunctions().Keys();

for (const auto& key : keys) {
Expand Down
35 changes: 0 additions & 35 deletions velox/functions/lib/string/StringImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,6 @@
*/
#pragma once

// The cpp library for xxhash requires one of the few macros to be set in
// order to get the library to even work (there's no default mode set).
// This macro forces the hash function to be inlined and is not set by default.
// We do not want to change the external library to set this default behavior.
#define XXH_INLINE_ALL

#include <assert.h>
#include <fmt/format.h>
#include <stdio.h>
Expand All @@ -35,7 +29,6 @@
#include "velox/common/base/Exceptions.h"
#include "velox/common/encode/Base64.h"
#include "velox/external/md5/md5.h"
#include "velox/external/xxhash.h"
#include "velox/functions/lib/string/StringCore.h"

namespace facebook::velox::functions::stringImpl {
Expand Down Expand Up @@ -240,34 +233,6 @@ FOLLY_ALWAYS_INLINE void replaceInPlace(
string.resize(outputSize);
}

/// Extract the hash for a given string
/// Following the implementation in HIVE UDF
/// fbcode/fbjava/hive-udfs/core-udfs/src/main/java/com/facebook/hive/udf/UDFXxhash64.java
template <typename TInString>
FOLLY_ALWAYS_INLINE bool
xxhash64int(int64_t& result, const TInString& input, const int64_t seed = 0) {
// Following the implementation in Hive
// They use utf8Slice constructor which is not necessary for correctness
result = XXH64(input.data(), input.size(), seed);
return true;
}

/// Extract the hash for a given string as string
/// Following the implementation in Presto
/// presto/presto-main/src/main/java/com/facebook/presto/operator/scalar/VarbinaryFunctions.java
template <typename TOutString, typename TInString>
FOLLY_ALWAYS_INLINE bool xxhash64(TOutString& output, const TInString& input) {
// Following the implementation in Presto (seed is set to 0)
int64_t hash;
xxhash64int(hash, input, 0);
static const auto kLen = sizeof(int64_t);

// Resizing output and copy
output.resize(kLen);
std::memcpy(output.data(), &hash, kLen);
return true;
}

/// Compute the MD5 Hash.
template <typename TOutString, typename TInString>
FOLLY_ALWAYS_INLINE bool md5(TOutString& output, const TInString& input) {
Expand Down
19 changes: 3 additions & 16 deletions velox/functions/prestosql/CoreFunctions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,29 +33,16 @@ void registerFunctions() {

registerFunction<udf_rand, double>(EMPTY);

registerUnaryScalar<udf_hash, int64_t>(EMPTY);

registerFunction<udf_json_extract_scalar, Varchar, Varchar, Varchar>();

// Register string functions.
registerFunction<udf_chr, Varchar, int64_t>();
registerFunction<udf_codepoint, int32_t, Varchar>();
registerFunction<
udf_xxhash64int<int64_t, Varchar>,
int64_t,
Varchar,
int64_t>({"xxhash64"});
registerFunction<udf_xxhash64int<int64_t, Varchar>, int64_t, Varchar>(
{"xxhash64"});
registerFunction<udf_xxhash64<Varbinary, Varbinary>, Varbinary, Varbinary>(
{"xxhash64"});

// Register hash functions
registerFunction<udf_xxhash64, Varbinary, Varbinary>({"xxhash64"});
registerFunction<udf_md5<Varbinary, Varbinary>, Varbinary, Varbinary>(
{"md5"});
registerFunction<udf_md5_radix<Varchar, Varchar>, Varchar, Varchar, int32_t>(
{"md5"});
registerFunction<udf_md5_radix<Varchar, Varchar>, Varchar, Varchar, int64_t>(
{"md5"});
registerFunction<udf_md5_radix<Varchar, Varchar>, Varchar, Varchar>({"md5"});

registerFunction<udf_to_hex, Varchar, Varbinary>();
registerFunction<udf_from_hex, Varbinary, Varchar>();
Expand Down
1 change: 1 addition & 0 deletions velox/functions/prestosql/StringFunctions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
#include "velox/vector/FlatVector.h"

namespace facebook::velox::functions {

namespace {

/// Check if the input vector's buffers are single referenced
Expand Down
68 changes: 19 additions & 49 deletions velox/functions/prestosql/StringFunctions.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,16 @@
*/
#pragma once

#define XXH_INLINE_ALL

#include "velox/external/xxhash.h"
#include "velox/functions/Udf.h"
#include "velox/functions/lib/string/StringImpl.h"

namespace facebook::velox::functions {
/**
* chr(n) → varchar
* Returns the Unicode code point n as a single character string.
**/

/// chr(n) → varchar
/// Returns the Unicode code point n as a single character string.
VELOX_UDF_BEGIN(chr)
FOLLY_ALWAYS_INLINE bool call(
out_type<Varchar>& result,
Expand All @@ -32,10 +34,8 @@ FOLLY_ALWAYS_INLINE bool call(
}
VELOX_UDF_END();

/**
* codepoint(string) → integer
* Returns the Unicode code point of the only character of string.
**/
/// codepoint(string) → integer
/// Returns the Unicode code point of the only character of string.
VELOX_UDF_BEGIN(codepoint)
FOLLY_ALWAYS_INLINE bool call(
int32_t& result,
Expand All @@ -45,37 +45,23 @@ FOLLY_ALWAYS_INLINE bool call(
}
VELOX_UDF_END();

/**
* Presto variant
* xxhash64(varbinary) → varbinary
* Return an 8-byte binary to hash64 of input (varbinary such as string)
* Always returns true since this function is not null sensitive
*/
template <typename To, typename From>
/// xxhash64(varbinary) → varbinary
/// Return an 8-byte binary to hash64 of input (varbinary such as string)
VELOX_UDF_BEGIN(xxhash64)
FOLLY_ALWAYS_INLINE
bool call(out_type<To>& result, const arg_type<From>& input) {
return stringImpl::xxhash64(result, input);
}
VELOX_UDF_END();
bool call(out_type<Varbinary>& result, const arg_type<Varbinary>& input) {
// Seed is set to 0.
int64_t hash = XXH64(input.data(), input.size(), 0);
static const auto kLen = sizeof(int64_t);

/**
* HIVE variant
* xxhash64(string) → bigint
* Return an 64-bit integer equal to hash64 of input
* Always returns true since this function is not null sensitive
*/
template <typename To, typename From>
VELOX_UDF_BEGIN(xxhash64int)
FOLLY_ALWAYS_INLINE bool call(
out_type<To>& result,
const arg_type<From>& input,
const int64_t seed = 0) {
return stringImpl::xxhash64int(result, input, seed);
// Resizing output and copy
result.resize(kLen);
std::memcpy(result.data(), &hash, kLen);
return true;
}
VELOX_UDF_END();

// md5(varbinary) → varbinary
/// md5(varbinary) → varbinary
template <typename To, typename From>
VELOX_UDF_BEGIN(md5)
FOLLY_ALWAYS_INLINE
Expand All @@ -84,22 +70,6 @@ FOLLY_ALWAYS_INLINE
}
VELOX_UDF_END();

/**
* md5(varchar, int) → varchar
* generate the md5 in varchar, the result is based on the given radix (we only
* supports radix=16 or radix=10 for now).
**/
template <typename To, typename From>
VELOX_UDF_BEGIN(md5_radix)
FOLLY_ALWAYS_INLINE bool call(
out_type<To>& result,
const arg_type<From>& input,
const int32_t radix = 16) {
stringImpl::md5_radix(result, input, radix);
return true;
}
VELOX_UDF_END();

VELOX_UDF_BEGIN(to_hex)
FOLLY_ALWAYS_INLINE bool call(
out_type<Varbinary>& result,
Expand Down
131 changes: 21 additions & 110 deletions velox/functions/prestosql/tests/StringFunctionsTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -968,47 +968,16 @@ TEST_F(StringFunctionsTest, codePoint) {
}

TEST_F(StringFunctionsTest, md5) {
{
const auto md5 = [&](std::optional<std::string> key,
std::optional<int32_t> radix) {
return evaluateOnce<std::string>("md5(c0, c1)", key, radix);
};
EXPECT_EQ("533f6357e0210e67d91f651bc49e1278", md5("hashme", 16));
EXPECT_EQ("110655053273001216628802061412889137784", md5("hashme", 10));
EXPECT_EQ("d41d8cd98f00b204e9800998ecf8427e", md5("", 16));
EXPECT_EQ("281949768489412648962353822266799178366", md5("", 10));

EXPECT_THROW(
try {
md5("hashme", 2);
} catch (const facebook::velox::VeloxUserError& err) {
EXPECT_NE(
err.message().find(
"Not a valid radix for md5: 2. Supported values are 10 or 16"),
std::string::npos);
throw;
},
facebook::velox::VeloxUserError);
}
const auto md5 = [&](std::optional<std::string> arg) {
return evaluateOnce<std::string, std::string>(
"md5(c0)", {arg}, {VARBINARY()});
};

{
const auto md5 = [&](const std::string& key) {
return evaluateOnce<std::string>(
"md5(c0)",
std::vector<std::optional<StringView>>{StringView(key)},
{VARBINARY()});
};
EXPECT_EQ(hexToDec("533f6357e0210e67d91f651bc49e1278"), md5("hashme"));
EXPECT_EQ(hexToDec("eb2ac5b04180d8d6011a016aeb8f75b3"), md5("Infinity"));
EXPECT_EQ(hexToDec("d41d8cd98f00b204e9800998ecf8427e"), md5(""));

EXPECT_EQ(hexToDec("533f6357e0210e67d91f651bc49e1278"), md5("hashme"));
EXPECT_EQ(hexToDec("D41D8CD98F00B204E9800998ECF8427E"), md5(""));
}

// Test null input
{
auto result = evaluateOnce<std::string>(
"md5(c0)", std::optional<std::string>(std::nullopt));
ASSERT_EQ(result, std::nullopt);
}
EXPECT_EQ(std::nullopt, md5(std::nullopt));
}

void StringFunctionsTest::testReplaceInPlace(
Expand Down Expand Up @@ -1139,80 +1108,22 @@ TEST_F(StringFunctionsTest, controlExprEncodingPropagation) {
test("if(1!=1, lower(C1), lower(C2))", false);
}

void StringFunctionsTest::testXXHash64(
const std::vector<std::tuple<std::string, int64_t, int64_t>>& tests) {
// Creating vectors for input strings and seed values
auto inputString = makeFlatVector<StringView>(tests.size());
auto inputSeed = makeFlatVector<int64_t>(tests.size());
for (int i = 0; i < tests.size(); i++) {
inputString->set(i, StringView(std::get<0>(tests[i])));
inputSeed->set(i, std::get<1>(tests[i]));
}
auto rowVector = makeRowVector({inputString, inputSeed});

// Evaluating the function for each input and seed
auto result = evaluate<FlatVector<int64_t>>("xxhash64(c0, c1)", rowVector);

// Checking the results
for (int32_t i = 0; i < tests.size(); ++i) {
ASSERT_EQ(result->valueAt(i), std::get<2>(tests[i]));
}
}

void StringFunctionsTest::testXXHash64(
const std::vector<std::pair<std::string, int64_t>>& tests,
bool stringVariant) {
auto type = stringVariant ? std::dynamic_pointer_cast<const Type>(VARCHAR())
: VARBINARY();
// Creating vectors for input strings
auto inputString = makeFlatVector<StringView>(tests.size(), type);
for (int i = 0; i < tests.size(); i++) {
inputString->set(i, StringView(tests[i].first));
}
auto rowVector = makeRowVector({inputString});

// Evaluate and compare results
if (stringVariant) {
auto result = evaluate<FlatVector<int64_t>>("xxhash64(c0)", rowVector);
for (int32_t i = 0; i < tests.size(); ++i) {
ASSERT_EQ(result->valueAt(i), tests[i].second);
}
} else {
auto result = evaluate<FlatVector<StringView>>("xxhash64(c0)", rowVector);
for (int32_t i = 0; i < tests.size(); ++i) {
ASSERT_EQ(
std::memcmp(
result->valueAt(i).data(),
&tests[i].second,
sizeof(tests[i].second)),
0);
}
}
}

TEST_F(StringFunctionsTest, xxhash64) {
// The first two cases are borrowed from the original HIVE implementation
// unittests and the last two are corner cases
// fbcode/fbjava/hive-udfs/core-udfs/src/main/java/com/facebook/hive/udf/UDFXxhash64.java
{
std::vector<std::tuple<std::string, int64_t, int64_t>> validInputTest = {
{"hashmes", 0, 4920146668586838293},
{"hashme", 1, 1571629256661355178},
{"", 0, 0xEF46DB3751D8E999}};

testXXHash64(validInputTest);
}

// Similar tests for the Presto variant
{
std::vector<std::pair<std::string, int64_t>> validInputTest = {
{"hashmes", 4920146668586838293}, {"", 0xEF46DB3751D8E999}};
const auto xxhash64 = [&](std::optional<std::string> value) {
return evaluateOnce<std::string, std::string>(
"xxhash64(c0)", {value}, {VARBINARY()});
};

testXXHash64(validInputTest, false);
const auto toVarbinary = [](const int64_t input) {
std::string out;
out.resize(sizeof(input));
std::memcpy(out.data(), &input, sizeof(input));
return out;
};

// Default value seed for string inputs
testXXHash64(validInputTest, true);
}
EXPECT_EQ(std::nullopt, xxhash64(std::nullopt));
EXPECT_EQ(toVarbinary(-1205034819632174695L), xxhash64(""));
EXPECT_EQ(toVarbinary(-443202081618794350L), xxhash64("hashme"));
}

TEST_F(StringFunctionsTest, toHex) {
Expand Down
Loading

0 comments on commit 20fc850

Please sign in to comment.