Skip to content

Commit c270fb1

Browse files
committed
Add base implementation and tests for RPAD functions
1 parent 08d2053 commit c270fb1

3 files changed

Lines changed: 195 additions & 0 deletions

File tree

cpp/src/gandiva/precompiled/string_ops.cc

Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1489,6 +1489,73 @@ const char* lpad(gdv_int64 context, const char* text, gdv_int32 text_len,
14891489
}
14901490
}
14911491

1492+
// Right-pads `text` with repetitions of `fill_text` until the result holds
// `return_length` UTF-8 characters.
//
// context        execution context, used for arena allocation and error reporting
// text/text_len  input string and its length in bytes
// return_length  desired length of the result, in UTF-8 characters
// fill_text/fill_text_len  padding pattern and its length in bytes
// out_len        receives the length of the result in bytes
//
// Returns:
//  - "" (out_len = 0) when `text` is empty, `return_length` <= 0, utf8_length()
//    counts zero characters for `text`, or counts zero for a non-empty `fill_text`;
//  - a prefix of `text` (same pointer, shorter out_len) when `text` has more than
//    `return_length` characters;
//  - `text` itself when it already has `return_length` characters or when there
//    is nothing to pad with;
//  - otherwise an arena-allocated buffer holding `text` followed by the padding.
//    The last repetition of `fill_text` is cut on a character boundary.
FORCE_INLINE
const char* rpad(gdv_int64 context, const char* text, gdv_int32 text_len,
                 gdv_int32 return_length, const char* fill_text, gdv_int32 fill_text_len,
                 gdv_int32* out_len) {
  // if the text length or the defined return length (number of characters to return)
  // is <=0, then return an empty string.
  if (text_len <= 0 || return_length <= 0) {
    *out_len = 0;
    return "";
  }

  // count the number of utf8 characters in the text and in the fill pattern
  const int32_t text_char_count = utf8_length(context, text, text_len);
  const int32_t fill_char_count = utf8_length(context, fill_text, fill_text_len);
  // a zero count for a non-empty input means utf8_length() rejected it
  if (text_char_count == 0 || (fill_text_len > 0 && fill_char_count == 0)) {
    *out_len = 0;
    return "";
  }

  if (return_length < text_char_count) {
    // truncate the text to the requested number of characters
    *out_len = utf8_byte_pos(context, text, text_len, return_length);
    return text;
  }

  if (return_length == text_char_count || fill_text_len <= 0) {
    // already the right length, or there is nothing to pad with (a non-positive
    // fill length would otherwise never make progress in the padding step)
    *out_len = text_len;
    return text;
  }

  // From here on: return_length > text_char_count and fill_char_count > 0.
  const int32_t pad_char_count = return_length - text_char_count;
  const int32_t full_repeats = pad_char_count / fill_char_count;
  const int32_t tail_char_count = pad_char_count % fill_char_count;

  // Byte length of the leading `tail_char_count` characters of fill_text, i.e. the
  // partial repetition that completes the padding.
  int32_t tail_bytes = 0;
  for (int32_t i = 0; i < tail_char_count && tail_bytes < fill_text_len; ++i) {
    tail_bytes += utf8_char_length(fill_text[tail_bytes]);
  }
  if (tail_bytes > fill_text_len) {
    tail_bytes = fill_text_len;
  }

  // `return_length` is a character count, not a byte count: the buffer must be sized
  // in bytes, otherwise multi-byte text/fill overruns the allocation. Compute in
  // 64 bits so that a huge return_length cannot wrap the 32-bit size.
  const int64_t total_len = static_cast<int64_t>(text_len) +
                            static_cast<int64_t>(full_repeats) * fill_text_len +
                            tail_bytes;
  if (total_len > INT32_MAX) {
    gdv_fn_context_set_error_msg(context, "Would overflow maximum output size");
    *out_len = 0;
    return "";
  }

  char* ret = reinterpret_cast<gdv_binary>(
      gdv_fn_context_arena_malloc(context, static_cast<gdv_int32>(total_len)));
  if (ret == nullptr) {
    gdv_fn_context_set_error_msg(context,
                                 "Could not allocate memory for output string");
    *out_len = 0;
    return "";
  }

  // copy the input text, then the whole repetitions, then the partial one
  memcpy(ret, text, text_len);
  char* cursor = ret + text_len;
  for (int32_t i = 0; i < full_repeats; ++i) {
    memcpy(cursor, fill_text, fill_text_len);
    cursor += fill_text_len;
  }
  memcpy(cursor, fill_text, tail_bytes);

  *out_len = static_cast<gdv_int32>(total_len);
  return ret;
}
1557+
1558+
14921559
FORCE_INLINE
14931560
const char* lpad_no_fill_text(gdv_int64 context, const char* text, gdv_int32 text_len,
14941561
gdv_int32 return_length, gdv_int32* out_len) {
@@ -1541,6 +1608,58 @@ const char* lpad_no_fill_text(gdv_int64 context, const char* text, gdv_int32 tex
15411608
}
15421609
}
15431610

1611+
// Right-pads `text` with ASCII spaces until the result holds `return_length`
// UTF-8 characters; `out_len` receives the length of the result in bytes.
//
// Returns "" when `text_len` is 0, `return_length` <= 0, or utf8_length() counts
// zero characters; a prefix of `text` when it is longer than `return_length`
// characters; `text` itself when it is exactly that long; otherwise an
// arena-allocated copy of `text` followed by the missing spaces.
FORCE_INLINE
const char* rpad_no_fill_text(gdv_int64 context, const char* text, gdv_int32 text_len,
                              gdv_int32 return_length, gdv_int32* out_len) {
  // nothing to pad, or nothing requested
  if (text_len == 0 || return_length <= 0) {
    *out_len = 0;
    return "";
  }

  // number of utf8 characters in the input; zero means it could not be counted
  const int32_t num_chars = utf8_length(context, text, text_len);
  if (num_chars == 0) {
    *out_len = 0;
    return "";
  }

  // input is longer than requested: hand back a truncated view of it
  if (return_length < num_chars) {
    *out_len = utf8_byte_pos(context, text, text_len, return_length);
    return text;
  }

  // input already has the requested length: hand it back untouched
  if (return_length == num_chars) {
    *out_len = text_len;
    return text;
  }

  // input is shorter: every missing character is one single-byte space
  const int32_t padded_len = text_len + (return_length - num_chars);
  char* padded =
      reinterpret_cast<gdv_binary>(gdv_fn_context_arena_malloc(context, padded_len));
  if (padded == nullptr) {
    gdv_fn_context_set_error_msg(context,
                                 "Could not allocate memory for output string");
    *out_len = 0;
    return "";
  }

  memcpy(padded, text, text_len);
  for (int32_t pos = text_len; pos < padded_len; ++pos) {
    padded[pos] = ' ';
  }
  *out_len = padded_len;
  return padded;
}
1662+
15441663
FORCE_INLINE
15451664
const char* split_part(gdv_int64 context, const char* text, gdv_int32 text_len,
15461665
const char* delimiter, gdv_int32 delim_len, gdv_int32 index,

cpp/src/gandiva/precompiled/string_ops_test.cc

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -765,6 +765,75 @@ TEST(TestStringOps, TestLpadString) {
765765
EXPECT_EQ(std::string(out_str, out_len), " абвгд");
766766
}
767767

768+
// Exercises rpad (explicit fill text) and rpad_no_fill_text (space fill):
// truncation, exact length, empty/non-positive arguments, whole and partial
// repetitions of the fill text, and multi-byte UTF-8 input and fill.
TEST(TestStringOps, TestRpadString) {
  gandiva::ExecutionContext ctx;
  uint64_t ctx_ptr = reinterpret_cast<gdv_int64>(&ctx);
  gdv_int32 out_len = 0;
  const char* out_str;

  // RPAD function tests - with defined fill pad text
  out_str = rpad(ctx_ptr, "TestString", 10, 4, "fill", 4, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), "Test");

  out_str = rpad(ctx_ptr, "TestString", 10, 10, "fill", 4, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), "TestString");

  out_str = rpad(ctx_ptr, "TestString", 0, 10, "fill", 4, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), "");

  out_str = rpad(ctx_ptr, "TestString", 10, 0, "fill", 4, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), "");

  out_str = rpad(ctx_ptr, "TestString", 10, -500, "fill", 4, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), "");

  out_str = rpad(ctx_ptr, "TestString", 10, 500, "", 0, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), "TestString");

  out_str = rpad(ctx_ptr, "TestString", 10, 18, "Fill", 4, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), "TestStringFillFill");

  out_str = rpad(ctx_ptr, "TestString", 10, 15, "Fill", 4, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), "TestStringFillF");

  out_str = rpad(ctx_ptr, "TestString", 10, 20, "Fill", 4, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), "TestStringFillFillFi");

  // multi-byte text and fill: the result is longer in bytes than in characters
  out_str = rpad(ctx_ptr, "абвгд", 10, 7, "д", 2, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), "абвгддд");

  out_str = rpad(ctx_ptr, "абвгд", 10, 20, "абвгд", 10, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), "абвгдабвгдабвгдабвгд");

  out_str = rpad(ctx_ptr, "hello", 5, 6, "д", 2, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), "helloд");

  // partial repetition of a multi-byte fill must be cut on a character boundary
  out_str = rpad(ctx_ptr, "hello", 5, 7, "абв", 6, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), "helloаб");

  out_str = rpad(ctx_ptr, "hello", 5, 9, "абв", 6, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), "helloабва");

  // RPAD function tests - with NO pad text
  out_str = rpad_no_fill_text(ctx_ptr, "TestString", 10, 4, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), "Test");

  out_str = rpad_no_fill_text(ctx_ptr, "TestString", 10, 10, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), "TestString");

  out_str = rpad_no_fill_text(ctx_ptr, "TestString", 0, 10, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), "");

  out_str = rpad_no_fill_text(ctx_ptr, "TestString", 10, 0, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), "");

  out_str = rpad_no_fill_text(ctx_ptr, "TestString", 10, -500, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), "");

  // The expected padding is built with an explicit count so the number of
  // trailing spaces cannot be silently altered by whitespace normalization.
  out_str = rpad_no_fill_text(ctx_ptr, "TestString", 10, 18, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), std::string("TestString") + std::string(8, ' '));

  out_str = rpad_no_fill_text(ctx_ptr, "TestString", 10, 15, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), std::string("TestString") + std::string(5, ' '));

  out_str = rpad_no_fill_text(ctx_ptr, "абвгд", 10, 7, &out_len);
  EXPECT_EQ(std::string(out_str, out_len), std::string("абвгд") + std::string(2, ' '));
}
836+
768837
TEST(TestStringOps, TestRtrim) {
769838
gandiva::ExecutionContext ctx;
770839
uint64_t ctx_ptr = reinterpret_cast<gdv_int64>(&ctx);

cpp/src/gandiva/precompiled/types.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -411,9 +411,16 @@ const char* lpad(gdv_int64 context, const char* text, gdv_int32 text_len,
411411
gdv_int32 return_length, const char* fill_text, gdv_int32 fill_text_len,
412412
gdv_int32* out_len);
413413

414+
// Right-pads `text` with repetitions of `fill_text` until the result holds
// `return_length` UTF-8 characters; truncates `text` instead when it is longer.
// `text_len` and `fill_text_len` are byte lengths; `out_len` receives the byte
// length of the returned string. Returns an empty string when `text_len` is 0
// or `return_length` <= 0.
const char* rpad(gdv_int64 context, const char* text, gdv_int32 text_len,
415+
gdv_int32 return_length, const char* fill_text, gdv_int32 fill_text_len,
416+
gdv_int32* out_len);
417+
414418
const char* lpad_no_fill_text(gdv_int64 context, const char* text, gdv_int32 text_len,
415419
gdv_int32 return_length, gdv_int32* out_len);
416420

421+
// Right-pads `text` with spaces until the result holds `return_length` UTF-8
// characters; truncates `text` instead when it is longer. `text_len` is a byte
// length; `out_len` receives the byte length of the returned string. Returns an
// empty string when `text_len` is 0 or `return_length` <= 0.
const char* rpad_no_fill_text(gdv_int64 context, const char* text, gdv_int32 text_len,
422+
gdv_int32 return_length, gdv_int32* out_len);
423+
417424
const char* replace_with_max_len_utf8_utf8_utf8(gdv_int64 context, const char* text,
418425
gdv_int32 text_len, const char* from_str,
419426
gdv_int32 from_str_len,

0 commit comments

Comments
 (0)