Skip to content

Commit dc90ca3

Browse files
projjal authored and praveenbingo committed
ARROW-7263: [C++][Gandiva] Implemented locate function
Closes apache#5905 from projjal/locate_fn and squashes the following commits: 13d2f59 <Projjal Chanda> addressed review comments 54ec3f1 <Projjal Chanda> Added locate function Authored-by: Projjal Chanda <iam@pchanda.com> Signed-off-by: Praveen <praveen@dremio.com>
1 parent 9197f01 commit dc90ca3

4 files changed

Lines changed: 125 additions & 0 deletions

File tree

cpp/src/gandiva/function_registry_string.cc

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -45,6 +45,8 @@ std::vector<NativeFunction> GetStringFunctionRegistry() {
4545
BINARY_RELATIONAL_SAFE_NULL_IF_NULL_UTF8_FN(starts_with, {}),
4646
BINARY_RELATIONAL_SAFE_NULL_IF_NULL_UTF8_FN(ends_with, {}),
4747

48+
BINARY_UNSAFE_NULL_IF_NULL(locate, {"position"}, utf8, int32),
49+
4850
UNARY_OCTET_LEN_FN(octet_length, {}),
4951
UNARY_OCTET_LEN_FN(bit_length, {}),
5052

@@ -95,6 +97,10 @@ std::vector<NativeFunction> GetStringFunctionRegistry() {
9597
NativeFunction("convert_fromUTF8", {"convert_fromutf8"}, DataTypeVector{binary()},
9698
utf8(), kResultNullIfNull, "convert_fromUTF8_binary",
9799
NativeFunction::kNeedsContext),
100+
101+
NativeFunction("locate", {"position"}, DataTypeVector{utf8(), utf8(), int32()},
102+
int32(), kResultNullIfNull, "locate_utf8_utf8_int32",
103+
NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors),
98104
};
99105

100106
return string_fn_registry_;

cpp/src/gandiva/precompiled/string_ops.cc

Lines changed: 55 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -138,6 +138,29 @@ int32 utf8_length(int64 context, const char* data, int32 data_len) {
138138
return count;
139139
}
140140

141+
// Get the byte position corresponding to a character position for a non-empty utf8
142+
// sequence
143+
FORCE_INLINE
144+
int32 utf8_byte_pos(int64 context, const char* str, int32 str_len, int32 char_pos) {
145+
int char_len = 0;
146+
int byte_index = 0;
147+
for (int32 char_index = 0; char_index < char_pos && byte_index < str_len;
148+
char_index++) {
149+
char_len = utf8_char_length(str[byte_index]);
150+
if (char_len == 0 ||
151+
byte_index + char_len > str_len) { // invalid byte or incomplete glyph
152+
set_error_for_invalid_utf(context, str[byte_index]);
153+
return -1;
154+
}
155+
byte_index += char_len;
156+
}
157+
if (byte_index >= str_len) {
158+
gdv_fn_context_set_error_msg(context, "Invalid character position argument");
159+
return -1;
160+
}
161+
return byte_index;
162+
}
163+
141164
#define UTF8_LENGTH(NAME, TYPE) \
142165
FORCE_INLINE \
143166
int32 NAME##_##TYPE(int64 context, TYPE in, int32 in_len) { \
@@ -410,4 +433,36 @@ const char* convert_fromUTF8_binary(int64 context, const char* bin_in, int32 len
410433
return ret;
411434
}
412435

436+
// Search for a string within another string
437+
FORCE_INLINE
438+
int32 locate_utf8_utf8(int64 context, const char* sub_str, int32 sub_str_len,
439+
const char* str, int32 str_len) {
440+
return locate_utf8_utf8_int32(context, sub_str, sub_str_len, str, str_len, 1);
441+
}
442+
443+
// Search for a string within another string starting at position start-pos (1-indexed)
444+
FORCE_INLINE
445+
int32 locate_utf8_utf8_int32(int64 context, const char* sub_str, int32 sub_str_len,
446+
const char* str, int32 str_len, int32 start_pos) {
447+
if (start_pos < 1) {
448+
gdv_fn_context_set_error_msg(context, "Start position must be greater than 0");
449+
return 0;
450+
}
451+
452+
if (str_len == 0 || sub_str_len == 0) {
453+
return 0;
454+
}
455+
456+
int32 byte_pos = utf8_byte_pos(context, str, str_len, start_pos - 1);
457+
if (byte_pos < 0) {
458+
return 0;
459+
}
460+
for (int32 i = byte_pos; i <= str_len - sub_str_len; ++i) {
461+
if (memcmp(str + i, sub_str, sub_str_len) == 0) {
462+
return utf8_length(context, str, i) + 1;
463+
}
464+
}
465+
return 0;
466+
}
467+
413468
} // extern "C"

cpp/src/gandiva/precompiled/string_ops_test.cc

Lines changed: 58 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -309,4 +309,62 @@ TEST(TestStringOps, TestReverse) {
309309
ctx.Reset();
310310
}
311311

312+
// Exercises locate_utf8_utf8 and locate_utf8_utf8_int32: ASCII and multi-byte
// matches, explicit start positions, empty inputs, and the error paths.
TEST(TestStringOps, TestLocate) {
  gandiva::ExecutionContext ctx;
  uint64_t ctx_ptr = reinterpret_cast<int64>(&ctx);

  // Two-argument form.
  EXPECT_EQ(locate_utf8_utf8(ctx_ptr, "String", 6, "TestString", 10), 5);
  EXPECT_FALSE(ctx.has_error());

  // Explicit start position of 1 gives the same answer.
  EXPECT_EQ(locate_utf8_utf8_int32(ctx_ptr, "String", 6, "TestString", 10, 1), 5);
  EXPECT_FALSE(ctx.has_error());

  // Starting at position 2 skips the match at position 1.
  EXPECT_EQ(locate_utf8_utf8_int32(ctx_ptr, "abc", 3, "abcabc", 6, 2), 4);
  EXPECT_FALSE(ctx.has_error());

  // Multi-byte characters: the result is a character position, not a byte offset.
  EXPECT_EQ(locate_utf8_utf8(ctx_ptr, "çåå", 6, "s†å†emçåå†d", 21), 7);
  EXPECT_FALSE(ctx.has_error());

  // Start position is also counted in characters.
  EXPECT_EQ(locate_utf8_utf8_int32(ctx_ptr, "bar", 3, "†barbar", 9, 3), 5);
  EXPECT_FALSE(ctx.has_error());

  // Empty string to search in: 0, no error.
  EXPECT_EQ(locate_utf8_utf8_int32(ctx_ptr, "sub", 3, "", 0, 1), 0);
  EXPECT_FALSE(ctx.has_error());

  // Empty string to search for: 0, no error.
  EXPECT_EQ(locate_utf8_utf8_int32(ctx_ptr, "", 0, "str", 3, 1), 0);
  EXPECT_FALSE(ctx.has_error());

  // Start position below 1 is rejected.
  EXPECT_EQ(locate_utf8_utf8_int32(ctx_ptr, "bar", 3, "barbar", 6, 0), 0);
  EXPECT_THAT(ctx.get_error(),
              ::testing::HasSubstr("Start position must be greater than 0"));
  ctx.Reset();

  // Start position past the last character is rejected.
  EXPECT_EQ(locate_utf8_utf8_int32(ctx_ptr, "bar", 3, "barbar", 6, 7), 0);
  EXPECT_THAT(ctx.get_error(),
              ::testing::HasSubstr("Invalid character position argument"));
  ctx.Reset();

  // An invalid byte ahead of the start position is reported as a decoding error.
  std::string invalid_utf8(
      "a\xff"
      "c");
  EXPECT_EQ(locate_utf8_utf8_int32(ctx_ptr, "c", 1, invalid_utf8.data(),
                                   static_cast<int>(invalid_utf8.length()), 3),
            0);
  EXPECT_THAT(ctx.get_error(),
              ::testing::HasSubstr(
                  "unexpected byte \\ff encountered while decoding utf8 string"));
  ctx.Reset();
}
312370
} // namespace gandiva

cpp/src/gandiva/precompiled/types.h

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -184,6 +184,12 @@ const char* lower_utf8(int64 context, const char* data, int32 data_len,
184184

185185
const char* reverse_utf8(int64 context, const char* data, int32 data_len,
186186
int32_t* out_len);
187+
188+
// Searches str for sub_str starting from the first character of str.
int32 locate_utf8_utf8(int64 context,
                       const char* sub_str, int32 sub_str_len,
                       const char* str, int32 str_len);
190+
191+
// Searches str for sub_str starting at character position start_pos (1-based).
int32 locate_utf8_utf8_int32(int64 context,
                             const char* sub_str, int32 sub_str_len,
                             const char* str, int32 str_len,
                             int32 start_pos);
187193
} // extern "C"
188194

189195
#endif // PRECOMPILED_TYPES_H

0 commit comments

Comments
 (0)