Skip to content

Commit 0a85cef

Browse files
jvictorhuguenin authored and Pindikura Ravindra committed
ARROW-12943: [Gandiva][C++]Implement MD5 Hive function
Implement the hive function hashMD5 in Gandiva Closes apache#10464 from jvictorhuguenin/feature/implement-md5-hive-function Authored-by: frank400 <j.victorhuguenin2018@gmail.com> Signed-off-by: Pindikura Ravindra <ravindra@dremio.com>
1 parent 6704cdf commit 0a85cef

8 files changed

Lines changed: 552 additions & 71 deletions

File tree

cpp/src/gandiva/function_registry_common.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -229,6 +229,16 @@ typedef std::unordered_map<const FunctionSignature*, const NativeFunction*, KeyH
229229
ARROW_STRINGIFY(gdv_fn_sha256_##TYPE), \
230230
NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors)
231231

232+
// HashMD5 functions for which:
233+
// - NULL handling is of type NULL_NEVER
234+
// - can return errors
235+
//
236+
// The function name includes the base name & input type name, e.g. gdv_fn_md5_float64
237+
#define HASH_MD5_NULL_NEVER(NAME, ALIASES, TYPE) \
238+
NativeFunction(#NAME, {"md5"}, DataTypeVector{TYPE()}, utf8(), kResultNullNever, \
239+
ARROW_STRINGIFY(gdv_fn_md5_##TYPE), \
240+
NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors)
241+
232242
// Iterate the inner macro over all numeric types
233243
#define BASE_NUMERIC_TYPES(INNER, NAME, ALIASES) \
234244
INNER(NAME, ALIASES, int8), INNER(NAME, ALIASES, int16), INNER(NAME, ALIASES, int32), \

cpp/src/gandiva/function_registry_hash.cc

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,9 @@ namespace gandiva {
3838
#define HASH_SHA256_NULL_NEVER_FN(name, ALIASES) \
3939
NUMERIC_BOOL_DATE_VAR_LEN_TYPES(HASH_SHA256_NULL_NEVER, name, ALIASES)
4040

41+
#define HASH_MD5_NULL_NEVER_FN(name, ALIASES) \
42+
NUMERIC_BOOL_DATE_VAR_LEN_TYPES(HASH_MD5_NULL_NEVER, name, ALIASES)
43+
4144
std::vector<NativeFunction> GetHashFunctionRegistry() {
4245
static std::vector<NativeFunction> hash_fn_registry_ = {
4346
HASH32_SAFE_NULL_NEVER_FN(hash, {}),
@@ -55,7 +58,9 @@ std::vector<NativeFunction> GetHashFunctionRegistry() {
5558

5659
HASH_SHA1_NULL_NEVER_FN(hashSHA1, {}),
5760

58-
HASH_SHA256_NULL_NEVER_FN(hashSHA256, {})};
61+
HASH_SHA256_NULL_NEVER_FN(hashSHA256, {}),
62+
63+
HASH_MD5_NULL_NEVER_FN(hashMD5, {})};
5964

6065
return hash_fn_registry_;
6166
}

cpp/src/gandiva/gdv_function_stubs.cc

Lines changed: 283 additions & 42 deletions
Large diffs are not rendered by default.

cpp/src/gandiva/gdv_function_stubs.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,11 @@ const char* gdv_fn_sha256_decimal128(int64_t context, int64_t x_high, uint64_t x
107107
int32_t x_precision, int32_t x_scale,
108108
gdv_boolean x_isvalid, int32_t* out_length);
109109

110+
GANDIVA_EXPORT
111+
const char* gdv_fn_md5_decimal128(int64_t context, int64_t x_high, uint64_t x_low,
112+
int32_t x_precision, int32_t x_scale,
113+
gdv_boolean x_isvalid, int32_t* out_length);
114+
110115
GANDIVA_EXPORT
111116
const char* gdv_fn_sha1_decimal128(int64_t context, int64_t x_high, uint64_t x_low,
112117
int32_t x_precision, int32_t x_scale,

cpp/src/gandiva/hash_utils.cc

Lines changed: 19 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -24,20 +24,28 @@
2424
namespace gandiva {
2525
/// Hashes a generic message using the SHA256 algorithm
2626
GANDIVA_EXPORT
27-
const char* gdv_hash_using_sha256(int64_t context, const void* message,
28-
size_t message_length, int32_t* out_length) {
27+
const char* gdv_sha256_hash(int64_t context, const void* message, size_t message_length,
28+
int32_t* out_length) {
2929
constexpr int sha256_result_length = 64;
30-
return gdv_hash_using_sha(context, message, message_length, EVP_sha256(),
31-
sha256_result_length, out_length);
30+
return gdv_hash_using_openssl(context, message, message_length, EVP_sha256(),
31+
sha256_result_length, out_length);
3232
}
3333

3434
/// Hashes a generic message using the SHA1 algorithm
3535
GANDIVA_EXPORT
36-
const char* gdv_hash_using_sha1(int64_t context, const void* message,
37-
size_t message_length, int32_t* out_length) {
36+
const char* gdv_sha1_hash(int64_t context, const void* message, size_t message_length,
37+
int32_t* out_length) {
3838
constexpr int sha1_result_length = 40;
39-
return gdv_hash_using_sha(context, message, message_length, EVP_sha1(),
40-
sha1_result_length, out_length);
39+
return gdv_hash_using_openssl(context, message, message_length, EVP_sha1(),
40+
sha1_result_length, out_length);
41+
}
42+
43+
GANDIVA_EXPORT
44+
const char* gdv_md5_hash(int64_t context, const void* message, size_t message_length,
45+
int32_t* out_length) {
46+
constexpr int md5_result_length = 32;
47+
return gdv_hash_using_openssl(context, message, message_length, EVP_md5(),
48+
md5_result_length, out_length);
4149
}
4250

4351
/// \brief Hashes a generic message using an OpenSSL digest algorithm (e.g. SHA or MD5).
@@ -46,9 +54,9 @@ const char* gdv_hash_using_sha1(int64_t context, const void* message,
4654
/// the hash. The type of the hash is defined by the
4755
/// \b hash_type \b parameter.
4856
GANDIVA_EXPORT
49-
const char* gdv_hash_using_sha(int64_t context, const void* message,
50-
size_t message_length, const EVP_MD* hash_type,
51-
uint32_t result_buf_size, int32_t* out_length) {
57+
const char* gdv_hash_using_openssl(int64_t context, const void* message,
58+
size_t message_length, const EVP_MD* hash_type,
59+
uint32_t result_buf_size, int32_t* out_length) {
5260
EVP_MD_CTX* md_ctx = EVP_MD_CTX_new();
5361

5462
if (md_ctx == nullptr) {

cpp/src/gandiva/hash_utils.h

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -25,17 +25,21 @@
2525

2626
namespace gandiva {
2727
GANDIVA_EXPORT
28-
const char* gdv_hash_using_sha256(int64_t context, const void* message,
29-
size_t message_length, int32_t* out_length);
28+
const char* gdv_sha256_hash(int64_t context, const void* message, size_t message_length,
29+
int32_t* out_length);
3030

3131
GANDIVA_EXPORT
32-
const char* gdv_hash_using_sha1(int64_t context, const void* message,
33-
size_t message_length, int32_t* out_length);
32+
const char* gdv_sha1_hash(int64_t context, const void* message, size_t message_length,
33+
int32_t* out_length);
3434

3535
GANDIVA_EXPORT
36-
const char* gdv_hash_using_sha(int64_t context, const void* message,
37-
size_t message_length, const EVP_MD* hash_type,
38-
uint32_t result_buf_size, int32_t* out_length);
36+
const char* gdv_hash_using_openssl(int64_t context, const void* message,
37+
size_t message_length, const EVP_MD* hash_type,
38+
uint32_t result_buf_size, int32_t* out_length);
39+
40+
GANDIVA_EXPORT
41+
const char* gdv_md5_hash(int64_t context, const void* message, size_t message_length,
42+
int32_t* out_length);
3943

4044
GANDIVA_EXPORT
4145
uint64_t gdv_double_to_long(double value);

cpp/src/gandiva/hash_utils_test.cc

Lines changed: 78 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ TEST(TestShaHashUtils, TestSha1Numeric) {
4646
for (auto value : values_to_be_hashed) {
4747
int out_length;
4848
const char* sha_1 =
49-
gandiva::gdv_hash_using_sha1(ctx_ptr, &value, sizeof(value), &out_length);
49+
gandiva::gdv_sha1_hash(ctx_ptr, &value, sizeof(value), &out_length);
5050
std::string sha1_as_str(sha_1, out_length);
5151
EXPECT_EQ(sha1_as_str.size(), sha1_size);
5252

@@ -81,7 +81,7 @@ TEST(TestShaHashUtils, TestSha256Numeric) {
8181
for (auto value : values_to_be_hashed) {
8282
int out_length;
8383
const char* sha_256 =
84-
gandiva::gdv_hash_using_sha256(ctx_ptr, &value, sizeof(value), &out_length);
84+
gandiva::gdv_sha256_hash(ctx_ptr, &value, sizeof(value), &out_length);
8585
std::string sha256_as_str(sha_256, out_length);
8686
EXPECT_EQ(sha256_as_str.size(), sha256_size);
8787

@@ -91,6 +91,40 @@ TEST(TestShaHashUtils, TestSha256Numeric) {
9191
}
9292
}
9393

94+
TEST(TestShaHashUtils, TestMD5Numeric) {
95+
gandiva::ExecutionContext ctx;
96+
97+
auto ctx_ptr = reinterpret_cast<int64_t>(&ctx);
98+
99+
std::vector<uint64_t> values_to_be_hashed;
100+
101+
// Generate a list of values for which to obtain the MD5 hash
102+
values_to_be_hashed.push_back(gandiva::gdv_double_to_long(0.0));
103+
values_to_be_hashed.push_back(gandiva::gdv_double_to_long(0.1));
104+
values_to_be_hashed.push_back(gandiva::gdv_double_to_long(0.2));
105+
values_to_be_hashed.push_back(gandiva::gdv_double_to_long(-0.10000001));
106+
values_to_be_hashed.push_back(gandiva::gdv_double_to_long(-0.0000001));
107+
values_to_be_hashed.push_back(gandiva::gdv_double_to_long(1.000000));
108+
values_to_be_hashed.push_back(gandiva::gdv_double_to_long(-0.0000002));
109+
values_to_be_hashed.push_back(gandiva::gdv_double_to_long(0.999999));
110+
111+
// Checks if the hash value is different for each one of the values
112+
std::unordered_set<std::string> md5_values;
113+
114+
int md5_size = 32;
115+
116+
for (auto value : values_to_be_hashed) {
117+
int out_length;
118+
const char* md5 = gandiva::gdv_md5_hash(ctx_ptr, &value, sizeof(value), &out_length);
119+
std::string md5_as_str(md5, out_length);
120+
EXPECT_EQ(md5_as_str.size(), md5_size);
121+
122+
// The value must not already exist in the set of hash results
123+
EXPECT_EQ(md5_values.find(md5_as_str), md5_values.end());
124+
md5_values.insert(md5_as_str);
125+
}
126+
}
127+
94128
TEST(TestShaHashUtils, TestSha1Varlen) {
95129
gandiva::ExecutionContext ctx;
96130

@@ -113,14 +147,14 @@ TEST(TestShaHashUtils, TestSha1Varlen) {
113147
const int sha1_size = 40;
114148
int out_length;
115149

116-
const char* sha_1 = gandiva::gdv_hash_using_sha1(ctx_ptr, first_string.c_str(),
117-
first_string.size(), &out_length);
150+
const char* sha_1 = gandiva::gdv_sha1_hash(ctx_ptr, first_string.c_str(),
151+
first_string.size(), &out_length);
118152
std::string sha1_as_str(sha_1, out_length);
119153
EXPECT_EQ(sha1_as_str.size(), sha1_size);
120154
EXPECT_EQ(sha1_as_str, expected_first_result);
121155

122-
const char* sha_2 = gandiva::gdv_hash_using_sha1(ctx_ptr, second_string.c_str(),
123-
second_string.size(), &out_length);
156+
const char* sha_2 = gandiva::gdv_sha1_hash(ctx_ptr, second_string.c_str(),
157+
second_string.size(), &out_length);
124158
std::string sha2_as_str(sha_2, out_length);
125159
EXPECT_EQ(sha2_as_str.size(), sha1_size);
126160
EXPECT_EQ(sha2_as_str, expected_second_result);
@@ -150,15 +184,49 @@ TEST(TestShaHashUtils, TestSha256Varlen) {
150184
const int sha256_size = 64;
151185
int out_length;
152186

153-
const char* sha_1 = gandiva::gdv_hash_using_sha256(ctx_ptr, first_string.c_str(),
154-
first_string.size(), &out_length);
187+
const char* sha_1 = gandiva::gdv_sha256_hash(ctx_ptr, first_string.c_str(),
188+
first_string.size(), &out_length);
155189
std::string sha1_as_str(sha_1, out_length);
156190
EXPECT_EQ(sha1_as_str.size(), sha256_size);
157191
EXPECT_EQ(sha1_as_str, expected_first_result);
158192

159-
const char* sha_2 = gandiva::gdv_hash_using_sha256(ctx_ptr, second_string.c_str(),
160-
second_string.size(), &out_length);
193+
const char* sha_2 = gandiva::gdv_sha256_hash(ctx_ptr, second_string.c_str(),
194+
second_string.size(), &out_length);
161195
std::string sha2_as_str(sha_2, out_length);
162196
EXPECT_EQ(sha2_as_str.size(), sha256_size);
163197
EXPECT_EQ(sha2_as_str, expected_second_result);
164198
}
199+
200+
TEST(TestShaHashUtils, TestMD5Varlen) {
201+
gandiva::ExecutionContext ctx;
202+
203+
auto ctx_ptr = reinterpret_cast<int64_t>(&ctx);
204+
205+
std::string first_string =
206+
"ði ıntəˈnæʃənəl fəˈnɛtık əsoʊsiˈeıʃnY [ˈʏpsilɔn], Yen [jɛn], Yoga [ˈjoːgɑ]";
207+
208+
std::string second_string =
209+
"ði ıntəˈnæʃənəl fəˈnɛtık əsoʊsiˈeınY [ˈʏpsilɔn], "
210+
"Yen [jɛn], Yoga [ˈjoːgɑ] コンニチハ";
211+
212+
// The expected hashes of the strings were obtained by running the following shell command:
213+
// echo -n <input-string> | openssl dgst -md5
214+
std::string expected_first_result = "a633460644425b44e0e023d6980849cc";
215+
std::string expected_second_result = "407983529dba21e95d95951ccffd30c3";
216+
217+
// Generate the hashes and compare with expected outputs
218+
const int md5_size = 32;
219+
int out_length;
220+
221+
const char* md5_1 = gandiva::gdv_md5_hash(ctx_ptr, first_string.c_str(),
222+
first_string.size(), &out_length);
223+
std::string md5_as_str(md5_1, out_length);
224+
EXPECT_EQ(md5_as_str.size(), md5_size);
225+
EXPECT_EQ(md5_as_str, expected_first_result);
226+
227+
const char* md5_2 = gandiva::gdv_md5_hash(ctx_ptr, second_string.c_str(),
228+
second_string.size(), &out_length);
229+
std::string md5_2_as_str(md5_2, out_length);
230+
EXPECT_EQ(md5_2_as_str.size(), md5_size);
231+
EXPECT_EQ(md5_2_as_str, expected_second_result);
232+
}

0 commit comments

Comments
 (0)