Skip to content

Commit a2ea61b

Browse files
committed
Adapt the convert_from replacement method to substitute each invalid byte with a single character (behavior agreed with the Dremio team)
1 parent 7d4cec0 commit a2ea61b

4 files changed

Lines changed: 23 additions & 6 deletions

File tree

cpp/src/gandiva/function_registry_string.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -204,7 +204,7 @@ std::vector<NativeFunction> GetStringFunctionRegistry() {
204204
utf8(), kResultNullIfNull, "convert_fromUTF8_binary",
205205
NativeFunction::kNeedsContext),
206206

207-
NativeFunction("convert_fromUTF8", {"convert_fromutf8"},
207+
NativeFunction("convert_replaceUTF8", {"convert_replaceutf8"},
208208
DataTypeVector{binary(), utf8()}, utf8(), kResultNullIfNull,
209209
"convert_replace_invalid_fromUTF8_binary",
210210
NativeFunction::kNeedsContext),

cpp/src/gandiva/precompiled/string_ops.cc

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1249,8 +1249,10 @@ const char* convert_fromUTF8_binary(gdv_int64 context, const char* bin_in, gdv_i
12491249
FORCE_INLINE
12501250
const char* convert_replace_invalid_fromUTF8_binary(
12511251
gdv_int64 context, const char* text_in, gdv_int32 text_len,
1252-
const char* char_to_replace, gdv_int32 char_to_replace_len, gdv_int32* out_len) {
1253-
*out_len = text_len * char_to_replace_len;
1252+
const char* char_to_replace, gdv_int32 /*char_to_replace_len*/, gdv_int32* out_len) {
1253+
// convert_replace substitutes every invalid byte with exactly one replacement byte,
1254+
// so the output length will be the same as the input length
1255+
*out_len = text_len;
12541256
char* ret = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, *out_len));
12551257
if (ret == nullptr) {
12561258
gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
@@ -1264,8 +1266,8 @@ const char* convert_replace_invalid_fromUTF8_binary(
12641266
for (int text_index = 0; text_index < text_len; text_index += char_len) {
12651267
char_len = utf8_char_length(text_in[text_index]);
12661268
if (char_len == 0 || text_index + char_len > text_len) {
1267-
memcpy(ret + out_byte_counter, char_to_replace, char_to_replace_len);
1268-
out_byte_counter += char_to_replace_len;
1269+
memcpy(ret + out_byte_counter, char_to_replace, 1);
1270+
out_byte_counter += 1;
12691271
// define char_len = 1 to increase text_index by 1 (as ASCII char fits in 1 byte)
12701272
char_len = 1;
12711273
} else {

cpp/src/gandiva/precompiled/string_ops_test.cc

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,21 @@ TEST(TestStringOps, TestConvertReplaceInvalidUtf8Char) {
129129
EXPECT_EQ(std::string(a_str, a_in_out_len), "ok-a(-a");
130130
EXPECT_FALSE(ctx.has_error());
131131

132+
// invalid utf8 (\xa0 and \xa1 are continuation bytes, so neither can start a utf8 char)
133+
std::string b("ok-\xa0\xa1-valid");
134+
auto b_in_out_len = static_cast<int>(b.length());
135+
const char* b_str = convert_replace_invalid_fromUTF8_binary(
136+
ctx_ptr, b.data(), b_in_out_len, "b", 1, &b_in_out_len);
137+
EXPECT_EQ(std::string(b_str, b_in_out_len), "ok-bb-valid");
138+
EXPECT_FALSE(ctx.has_error());
139+
140+
// full valid utf8
141+
std::string c("all-valid");
142+
auto c_in_out_len = static_cast<int>(c.length());
143+
const char* c_str = convert_replace_invalid_fromUTF8_binary(
144+
ctx_ptr, c.data(), c_in_out_len, "c", 1, &c_in_out_len);
145+
EXPECT_EQ(std::string(c_str, c_in_out_len), "all-valid");
146+
132147
// valid utf8 (महसुस is five Devanagari code points, each a 3-byte utf-8 char)
133148
std::string e("ok-महसुस-valid-new");
134149
auto e_in_out_len = static_cast<int>(e.length());

cpp/src/gandiva/tests/utf8_test.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -556,7 +556,7 @@ TEST_F(TestUtf8, TestConvertUtf8) {
556556
auto node_b = TreeExprBuilder::MakeStringLiteral("z");
557557

558558
auto convert_replace_utf8 =
559-
TreeExprBuilder::MakeFunction("convert_fromUTF8", {node_a, node_b}, utf8());
559+
TreeExprBuilder::MakeFunction("convert_replaceUTF8", {node_a, node_b}, utf8());
560560
auto equals =
561561
TreeExprBuilder::MakeFunction("equal", {convert_replace_utf8, node_c}, boolean());
562562
auto expr = TreeExprBuilder::MakeExpression(equals, res);

0 commit comments

Comments
 (0)