Skip to content

Commit b43a689

Browse files
jpedroantunes and praveenbingo
authored and committed
ARROW-12146: [C++][Gandiva] Implement CONVERT_FROM(expression, replacement char) function
Implement CONVERT_FROM(expression, 'UTF8', replacement char). Converts the byte data in expression to UTF-8. The expression can be a literal string or a field name. Any invalid UTF-8 bytes are replaced with the replacement character. Note: currently only a single-byte replacement character is supported. Closes apache#9844 from jpedroantunes/feature/convert-replace-utf8 and squashes the following commits: bef6eaf <João Pedro> Add optimization for returning original string if no invalid chars were found e7c6a71 <João Pedro> Refactor memcpy unnecessary for single byte 7aac875 <João Pedro> Add handler for cases with 0 char len on replace char 6544583 <João Pedro> Apply proper identation on types.h and string_ops.cc in gandiva c66efb8 <João Pedro> Apply corrections and optimization on convert replace function d815f85 <João Pedro> Add validation for MSBs on convert replace utf8 Gandiva function 8e44d41 <João Pedro> Add validation for defined char length greater than 1 on convert replace a2ea61b <João Pedro> Adapt convert_from method to support single char on replacement (defined with dremio team) 7d4cec0 <João Pedro> Adapt convert_from method to support multiple char on replacement 1a1734b <João Pedro> Change string ops test for defining int variables instead of size_t b96dfc7 <João Pedro> Fix lint problems on string ops and test files 8f9a4bd <João Pedro> Fix identation on string files on gandiva module 875a1dd <João Pedro> Add integration test for convert replace utf8 method 536fd3a <João Pedro> Add definition of convert replace str method to types.h c950c8a <João Pedro> Add base tests for convert replace invalid chars 2a5fe94 <João Pedro> Add base logic for convert replace utf8 invalid chars Authored-by: João Pedro <joaop@simbioseventures.com> Signed-off-by: Praveen <praveen@dremio.com>
1 parent 632b2c1 commit b43a689

5 files changed

Lines changed: 183 additions & 0 deletions

File tree

cpp/src/gandiva/function_registry_string.cc

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -204,6 +204,11 @@ std::vector<NativeFunction> GetStringFunctionRegistry() {
204204
utf8(), kResultNullIfNull, "convert_fromUTF8_binary",
205205
NativeFunction::kNeedsContext),
206206

207+
NativeFunction("convert_replaceUTF8", {"convert_replaceutf8"},
208+
DataTypeVector{binary(), utf8()}, utf8(), kResultNullIfNull,
209+
"convert_replace_invalid_fromUTF8_binary",
210+
NativeFunction::kNeedsContext),
211+
207212
NativeFunction("locate", {"position"}, DataTypeVector{utf8(), utf8(), int32()},
208213
int32(), kResultNullIfNull, "locate_utf8_utf8_int32",
209214
NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors),

cpp/src/gandiva/precompiled/string_ops.cc

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,17 @@ void set_error_for_invalid_utf(int64_t execution_context, char val) {
156156
free(error);
157157
}
158158

159+
// Checks the trailing bytes of a single UTF-8 glyph.
//
// data       - buffer holding the text being inspected
// data_len   - number of bytes the glyph occupies, head byte included (this is the
//              glyph length, not the length of the whole buffer)
// char_index - offset of the glyph's head byte inside data
//
// Returns true when every byte after the head byte has its two most significant
// bits equal to 10, false as soon as one does not. A glyph of length 0 or 1 has
// no trailing bytes and is reported as valid.
//
// No bounds checking happens here: the caller has to guarantee that
// char_index + data_len does not run past the end of data.
FORCE_INLINE
bool validate_utf8_following_bytes(const char* data, int32_t data_len,
                                   int32_t char_index) {
  int offset = 1;
  while (offset < data_len) {
    const bool is_continuation_byte = (data[char_index + offset] & 0xC0) == 0x80;
    if (!is_continuation_byte) {
      return false;
    }
    ++offset;
  }
  return true;
}
169+
159170
// Count the number of utf8 characters
160171
// return 0 for invalid/incomplete input byte sequences
161172
FORCE_INLINE
@@ -1246,6 +1257,59 @@ const char* convert_fromUTF8_binary(gdv_int64 context, const char* bin_in, gdv_i
12461257
return ret;
12471258
}
12481259

1260+
// Replaces every invalid UTF-8 byte of text_in with a single replacement byte.
//
// context             - execution context handle; used to report errors and to obtain
//                       the output buffer through gdv_fn_context_arena_malloc
// text_in / text_len  - input bytes and their count
// char_to_replace     - replacement character; only its first byte is used
// char_to_replace_len - length in bytes of the replacement character
// out_len             - receives the length of the returned string
//
// Behaviour by replacement length:
//   * 0     : the input is returned untouched (nothing is validated or replaced)
//   * 1     : each invalid byte is overwritten with char_to_replace[0]
//   * other : an error is set on the context and an empty string is returned
//
// A glyph is considered invalid when utf8_char_length reports 0 for its head byte,
// when it would extend past the end of the input, or when one of its trailing bytes
// is not a continuation byte. Only the head byte of such a glyph is replaced; the
// scan then resumes at the very next byte.
//
// Because one byte is always swapped for one byte, the output length equals the
// input length. When the input contains no invalid byte the original pointer is
// returned and no buffer is requested at all. The replacement byte itself is not
// validated here.
FORCE_INLINE
const char* convert_replace_invalid_fromUTF8_binary(int64_t context, const char* text_in,
                                                    int32_t text_len,
                                                    const char* char_to_replace,
                                                    int32_t char_to_replace_len,
                                                    int32_t* out_len) {
  if (char_to_replace_len == 0) {
    *out_len = text_len;
    return text_in;
  }
  if (char_to_replace_len != 1) {
    gdv_fn_context_set_error_msg(context, "Replacement of multiple bytes not supported");
    *out_len = 0;
    return "";
  }

  // each invalid byte is replaced by exactly one byte, so the size never changes
  *out_len = text_len;

  // The output buffer is requested lazily, on the first invalid byte, so that fully
  // valid (or empty) input neither allocates nor can fail on allocation.
  char* ret = nullptr;
  // ret[0, flushed_len) is final; input bytes from flushed_len up to the scan
  // position are valid but have not been copied yet
  int32_t flushed_len = 0;
  int32_t char_len;
  for (int32_t text_index = 0; text_index < text_len; text_index += char_len) {
    char_len = utf8_char_length(text_in[text_index]);
    // the remaining-bytes comparison is written as a subtraction so it cannot
    // overflow for inputs whose length is close to INT32_MAX
    const bool is_invalid =
        char_len == 0 || char_len > text_len - text_index ||
        !validate_utf8_following_bytes(text_in, char_len, text_index);
    if (!is_invalid) {
      continue;
    }

    // only the offending byte is consumed; the following bytes are re-examined
    char_len = 1;

    if (ret == nullptr) {
      ret = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, text_len));
      if (ret == nullptr) {
        gdv_fn_context_set_error_msg(context,
                                     "Could not allocate memory for output string");
        *out_len = 0;
        return "";
      }
    }

    // copy the pending run of valid bytes, then write the replacement in place of
    // the invalid byte (input and output offsets are always identical)
    memcpy(ret + flushed_len, text_in + flushed_len, text_index - flushed_len);
    ret[text_index] = char_to_replace[0];
    flushed_len = text_index + 1;
  }

  // nothing was replaced: hand back the caller's buffer
  if (ret == nullptr) {
    return text_in;
  }

  // copy the valid tail that follows the last replaced byte, if any
  if (flushed_len < text_len) {
    memcpy(ret + flushed_len, text_in + flushed_len, text_len - flushed_len);
  }
  return ret;
}
1312+
12491313
// Search for a string within another string
12501314
FORCE_INLINE
12511315
gdv_int32 locate_utf8_utf8(gdv_int64 context, const char* sub_str, gdv_int32 sub_str_len,

cpp/src/gandiva/precompiled/string_ops_test.cc

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,64 @@ TEST(TestStringOps, TestCharLength) {
115115
ctx.Reset();
116116
}
117117

118+
// Exercises convert_replace_invalid_fromUTF8_binary directly: invalid bytes in the
// middle, at the start and at the end of the input, fully valid input, and the
// special replacement lengths (more than one byte -> error, zero bytes -> input
// returned unchanged).
TEST(TestStringOps, TestConvertReplaceInvalidUtf8Char) {
  gandiva::ExecutionContext ctx;
  uint64_t ctx_ptr = reinterpret_cast<gdv_int64>(&ctx);

  // invalid utf8 (xf8 is invalid but x28 is not - x28 = '(')
  std::string a(
      "ok-\xf8\x28"
      "-a");
  auto a_in_out_len = static_cast<int>(a.length());
  const char* a_str = convert_replace_invalid_fromUTF8_binary(
      ctx_ptr, a.data(), a_in_out_len, "a", 1, &a_in_out_len);
  EXPECT_EQ(std::string(a_str, a_in_out_len), "ok-a(-a");
  EXPECT_FALSE(ctx.has_error());

  // invalid utf8 (xa0 and xa1 are invalid)
  std::string b("ok-\xa0\xa1-valid");
  auto b_in_out_len = static_cast<int>(b.length());
  const char* b_str = convert_replace_invalid_fromUTF8_binary(
      ctx_ptr, b.data(), b_in_out_len, "b", 1, &b_in_out_len);
  EXPECT_EQ(std::string(b_str, b_in_out_len), "ok-bb-valid");
  EXPECT_FALSE(ctx.has_error());

  // full valid utf8
  std::string c("all-valid");
  auto c_in_out_len = static_cast<int>(c.length());
  const char* c_str = convert_replace_invalid_fromUTF8_binary(
      ctx_ptr, c.data(), c_in_out_len, "c", 1, &c_in_out_len);
  EXPECT_EQ(std::string(c_str, c_in_out_len), "all-valid");
  EXPECT_FALSE(ctx.has_error());

  // valid utf8 (महसुस is 4-char string, each char of which is likely a multibyte char)
  std::string d("ok-महसुस-valid-new");
  auto d_in_out_len = static_cast<int>(d.length());
  const char* d_str = convert_replace_invalid_fromUTF8_binary(
      ctx_ptr, d.data(), d_in_out_len, "d", 1, &d_in_out_len);
  EXPECT_EQ(std::string(d_str, d_in_out_len), "ok-महसुस-valid-new");
  EXPECT_FALSE(ctx.has_error());

  // full valid utf8, but invalid replacement char length (more than one byte)
  std::string e("all-valid");
  auto e_in_out_len = static_cast<int>(e.length());
  const char* e_str = convert_replace_invalid_fromUTF8_binary(
      ctx_ptr, e.data(), e_in_out_len, "ee", 2, &e_in_out_len);
  EXPECT_EQ(std::string(e_str, e_in_out_len), "");
  EXPECT_TRUE(ctx.has_error());
  ctx.Reset();

  // invalid utf8 with an empty replacement char: the input comes back unchanged
  std::string f("ok-\xa0\xa1-valid");
  auto f_in_out_len = static_cast<int>(f.length());
  const char* f_str = convert_replace_invalid_fromUTF8_binary(
      ctx_ptr, f.data(), f_in_out_len, "", 0, &f_in_out_len);
  EXPECT_EQ(std::string(f_str, f_in_out_len), "ok-\xa0\xa1-valid");
  EXPECT_FALSE(ctx.has_error());

  // invalid byte at the very beginning of the input
  std::string g(
      "\xf8"
      "ok");
  auto g_in_out_len = static_cast<int>(g.length());
  const char* g_str = convert_replace_invalid_fromUTF8_binary(
      ctx_ptr, g.data(), g_in_out_len, "g", 1, &g_in_out_len);
  EXPECT_EQ(std::string(g_str, g_in_out_len), "gok");
  EXPECT_FALSE(ctx.has_error());

  // multibyte sequence cut off by the end of the input (xe0 xa4 lacks its last byte):
  // both remaining bytes are invalid on their own and are replaced one by one
  std::string h("ok-\xe0\xa4");
  auto h_in_out_len = static_cast<int>(h.length());
  const char* h_str = convert_replace_invalid_fromUTF8_binary(
      ctx_ptr, h.data(), h_in_out_len, "h", 1, &h_in_out_len);
  EXPECT_EQ(std::string(h_str, h_in_out_len), "ok-hh");
  EXPECT_FALSE(ctx.has_error());

  ctx.Reset();
}
175+
118176
TEST(TestStringOps, TestCastBoolToVarchar) {
119177
gandiva::ExecutionContext ctx;
120178
uint64_t ctx_ptr = reinterpret_cast<gdv_int64>(&ctx);

cpp/src/gandiva/precompiled/types.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -417,6 +417,12 @@ const char* replace_utf8_utf8_utf8(gdv_int64 context, const char* text,
417417
gdv_int32 from_str_len, const char* to_str,
418418
gdv_int32 to_str_len, gdv_int32* out_len);
419419

420+
// Replaces each invalid UTF-8 byte of text_in (text_len bytes) with the single byte
// found in char_to_replace and stores the length of the returned string in out_len.
// char_to_replace_len must be 0 (input returned as is) or 1; any other value sets an
// error on the context and yields an empty string.
const char* convert_replace_invalid_fromUTF8_binary(
    int64_t context, const char* text_in, int32_t text_len, const char* char_to_replace,
    int32_t char_to_replace_len, int32_t* out_len);
425+
420426
const char* split_part(gdv_int64 context, const char* text, gdv_int32 text_len,
421427
const char* splitter, gdv_int32 split_len, gdv_int32 index,
422428
gdv_int32* out_len);

cpp/src/gandiva/tests/utf8_test.cc

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -539,6 +539,56 @@ TEST_F(TestUtf8, TestVarlenOutput) {
539539
EXPECT_ARROW_ARRAY_EQUALS(exp, outputs.at(0));
540540
}
541541

542+
// End-to-end check of the convert_replaceUTF8 function: the binary column "a" is
// converted with 'z' as replacement character and compared, row by row, against the
// expected utf8 column "c".
TEST_F(TestUtf8, TestConvertUtf8) {
  // schema for input fields
  auto field_a = field("a", arrow::binary());
  auto field_c = field("c", utf8());
  auto schema = arrow::schema({field_a, field_c});

  // output fields
  auto res = field("res", boolean());

  // build expressions.
  auto node_a = TreeExprBuilder::MakeField(field_a);
  auto node_c = TreeExprBuilder::MakeField(field_c);

  // define char to replace
  auto node_b = TreeExprBuilder::MakeStringLiteral("z");

  auto convert_replace_utf8 =
      TreeExprBuilder::MakeFunction("convert_replaceUTF8", {node_a, node_b}, utf8());
  auto equals =
      TreeExprBuilder::MakeFunction("equal", {convert_replace_utf8, node_c}, boolean());
  auto expr = TreeExprBuilder::MakeExpression(equals, res);

  // Build a projector for the expressions.
  // ASSERT (not EXPECT): the projector is dereferenced below, so the test must stop
  // here when it could not be built.
  std::shared_ptr<Projector> projector;
  auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector);
  ASSERT_TRUE(status.ok()) << status.message();

  // Create a row-batch with some sample data
  int num_records = 3;
  auto array_a = MakeArrowArrayUtf8({"ok-\xf8\x28"
                                     "-a",
                                     "all-valid", "ok-\xa0\xa1-valid"},
                                    {true, true, true});

  // expected result of the conversion, bound to field "c"
  auto array_c =
      MakeArrowArrayUtf8({"ok-z(-a", "all-valid", "ok-zz-valid"}, {true, true, true});

  // prepare input record batch
  auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array_a, array_c});

  // Evaluate expression
  // ASSERT (not EXPECT): outputs[0] is read below and is only meaningful when the
  // evaluation succeeded.
  arrow::ArrayVector outputs;
  status = projector->Evaluate(*in_batch, pool_, &outputs);
  ASSERT_TRUE(status.ok()) << status.message();

  auto exp = MakeArrowArrayBool({true, true, true}, {true, true, true});
  // Validate results
  EXPECT_ARROW_ARRAY_EQUALS(exp, outputs[0]);
}
591+
542592
TEST_F(TestUtf8, TestCastVarChar) {
543593
// schema for input fields
544594
auto field_a = field("a", utf8());

0 commit comments

Comments
 (0)