Skip to content

Commit a3f778c

Browse files
anthonylouisbsb and praveenbingo
authored and committed
ARROW-13190: [C++] [Gandiva] Change behavior of INITCAP function
The current behavior of the INITCAP function is to turn the first character of each word uppercase and leave the others as is. The desired behavior is to turn the first letter uppercase and the others lowercase. Any character other than [lowercase letters](https://www.compart.com/en/unicode/category/Ll), [uppercase letters](https://www.compart.com/en/unicode/category/Lu) and [decimal numbers](https://www.compart.com/en/unicode/category/Nd) should be considered a word separator. That behavior is based on these database systems: - [Oracle](https://docs.oracle.com/cd/B19306_01/server.102/b14200/functions065.htm) - [Postgres](https://w3resource.com/PostgreSQL/initcap-function.php) - [Redshift](https://docs.aws.amazon.com/redshift/latest/dg/r_INITCAP.html) - [Splice Machine](https://doc.splicemachine.com/sqlref_builtinfcns_initcap.html) Closes apache#10604 from anthonylouisbsb/fixbug/fix-initcap-behavior and squashes the following commits: 68a4399 <Anthony Louis> Change call to get_char_len 8e05abe <Anthony Louis> Add force inline option for MSVC compiler 9146c01 <Anthony Louis> Remove GANDIVA_EXPORT for helper functions ca0b0d0 <Anthony Louis> Add FORCE_INLINE in functions 1f4cfc7 <Anthony Louis> Add tests to modified letters 4a1a584 <Anthony Louis> Add more tests for other characters groups 32a2c2d <Anthony Louis> Fix java tests for function 4445e51 <Anthony Louis> Fix tests after changes in function faa2169 <Anthony Louis> Change comments for is space c98db7a <Anthony Louis> Change initcap function behavior Authored-by: Anthony Louis <anthony@simbioseventures.com> Signed-off-by: Praveen <praveen@dremio.com>
1 parent 5fcd4d5 commit a3f778c

4 files changed

Lines changed: 80 additions & 47 deletions

File tree

cpp/src/gandiva/gdv_function_stubs.cc

Lines changed: 35 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -482,7 +482,7 @@ CAST_VARLEN_TYPE_FROM_NUMERIC(VARBINARY)
482482
#undef GDV_FN_CAST_VARCHAR_INTEGER
483483
#undef GDV_FN_CAST_VARCHAR_REAL
484484

485-
GANDIVA_EXPORT
485+
GDV_FORCE_INLINE
486486
int32_t gdv_fn_utf8_char_length(char c) {
487487
if ((signed char)c >= 0) { // 1-byte char (0x00 ~ 0x7F)
488488
return 1;
@@ -497,7 +497,7 @@ int32_t gdv_fn_utf8_char_length(char c) {
497497
return 0;
498498
}
499499

500-
GANDIVA_EXPORT
500+
GDV_FORCE_INLINE
501501
void gdv_fn_set_error_for_invalid_utf8(int64_t execution_context, char val) {
502502
char const* fmt = "unexpected byte \\%02hhx encountered while decoding utf8 string";
503503
int size = static_cast<int>(strlen(fmt)) + 64;
@@ -651,24 +651,27 @@ const char* gdv_fn_lower_utf8(int64_t context, const char* data, int32_t data_le
651651
return out;
652652
}
653653

654-
// Checks if the character is a whitespace by its code point. To check the list
655-
// of the existent whitespaces characters in UTF8, take a look at this link
656-
// https://en.wikipedia.org/wiki/Whitespace_character#Unicode
654+
// Any codepoint, except the ones for lowercase letters, uppercase letters,
655+
// titlecase letters, decimal digits and letter numbers categories will be
656+
// considered as word separators.
657657
//
658658
// The Unicode characters also are divided between categories. This link
659-
// https://en.wikipedia.org/wiki/Unicode_character_property#General_Category shows
659+
// https://www.compart.com/en/unicode/category shows
660660
// more information about characters categories.
661-
GANDIVA_EXPORT
661+
GDV_FORCE_INLINE
662662
bool gdv_fn_is_codepoint_for_space(uint32_t val) {
663663
auto category = utf8proc_category(val);
664664

665-
return category == utf8proc_category_t::UTF8PROC_CATEGORY_ZS ||
666-
category == utf8proc_category_t::UTF8PROC_CATEGORY_ZL ||
667-
category == utf8proc_category_t::UTF8PROC_CATEGORY_ZP;
665+
return category != utf8proc_category_t::UTF8PROC_CATEGORY_LU &&
666+
category != utf8proc_category_t::UTF8PROC_CATEGORY_LL &&
667+
category != utf8proc_category_t::UTF8PROC_CATEGORY_LT &&
668+
category != utf8proc_category_t::UTF8PROC_CATEGORY_NL &&
669+
category != utf8proc_category_t ::UTF8PROC_CATEGORY_ND;
668670
}
669671

670-
// For a given text, initialize the first letter of each word, e.g:
671-
// - "it is a text str" -> "It Is A Text Str"
672+
// For a given text, capitalize the first letter after a word-separator and lowercase
673+
// the others e.g:
674+
// - "IT is a tEXt str" -> "It Is A Text Str"
672675
GANDIVA_EXPORT
673676
const char* gdv_fn_initcap_utf8(int64_t context, const char* data, int32_t data_len,
674677
int32_t* out_len) {
@@ -691,35 +694,38 @@ const char* gdv_fn_initcap_utf8(int64_t context, const char* data, int32_t data_
691694
int32_t out_char_len = 0;
692695
int32_t out_idx = 0;
693696
uint32_t char_codepoint;
697+
698+
// Any character is considered as space, except if it is alphanumeric
694699
bool last_char_was_space = true;
695700

696701
for (int32_t i = 0; i < data_len; i += char_len) {
697-
char_len = gdv_fn_utf8_char_length(data[i]);
698-
// For single byte characters:
699-
// If it is a lowercase ASCII character, set the output to its corresponding uppercase
700-
// character; else, set the output to the read character
701-
if (char_len == 1) {
702+
// An optimization for single byte characters:
703+
if (static_cast<signed char>(data[i]) >= 0) { // 1-byte char (0x00 ~ 0x7F)
704+
char_len = 1;
702705
char cur = data[i];
703706

704707
if (cur >= 0x61 && cur <= 0x7a && last_char_was_space) {
705-
// 'A' - 'Z' : 0x41 - 0x5a
706-
// 'a' - 'z' : 0x61 - 0x7a
708+
// Check if the character is the first one of the word and it is
709+
// lowercase -> 'a' - 'z' : 0x61 - 0x7a.
710+
// Then turn it into uppercase -> 'A' - 'Z' : 0x41 - 0x5a
707711
out[out_idx++] = static_cast<char>(cur - 0x20);
708712
last_char_was_space = false;
713+
} else if (cur >= 0x41 && cur <= 0x5a && !last_char_was_space) {
714+
out[out_idx++] = static_cast<char>(cur + 0x20);
709715
} else {
710-
// Check if the ASCII character is one of these:
711-
// - space : 0x20
712-
// - character tabulation : 0x9
713-
// - line feed : 0xA
714-
// - line tabulation : 0xB
715-
// - form feed : 0xC
716-
// - carriage return : 0xD
717-
last_char_was_space = cur <= 0x20;
716+
// Check if the ASCII character is not an alphanumeric character:
717+
// '0' - '9': 0x30 - 0x39
718+
// 'a' - 'z' : 0x61 - 0x7a
719+
// 'A' - 'Z' : 0x41 - 0x5a
720+
last_char_was_space = (cur < 0x30) || (cur > 0x39 && cur < 0x41) ||
721+
(cur > 0x5a && cur < 0x61) || (cur > 0x7a);
718722
out[out_idx++] = cur;
719723
}
720724
continue;
721725
}
722726

727+
char_len = gdv_fn_utf8_char_length(data[i]);
728+
723729
// Control reaches here when we encounter a multibyte character
724730
const auto* in_char = (const uint8_t*)(data + i);
725731

@@ -738,18 +744,16 @@ const char* gdv_fn_initcap_utf8(int64_t context, const char* data, int32_t data_
738744

739745
int32_t formatted_codepoint;
740746
if (last_char_was_space && !is_char_space) {
741-
// Convert the encoded codepoint to its uppercase codepoint
742747
formatted_codepoint = utf8proc_toupper(char_codepoint);
743748
} else {
744-
// Leave the codepoint as is
745-
formatted_codepoint = char_codepoint;
749+
formatted_codepoint = utf8proc_tolower(char_codepoint);
746750
}
747751

748752
// UTF8Encode advances the pointer by the number of bytes present in the character
749753
auto* out_char = (uint8_t*)(out + out_idx);
750754
uint8_t* out_char_start = out_char;
751755

752-
// Encode the uppercase character
756+
// Encode the character
753757
out_char = arrow::util::UTF8Encode(out_char, formatted_codepoint);
754758

755759
out_char_len = static_cast<int32_t>(out_char - out_char_start);

cpp/src/gandiva/gdv_function_stubs.h

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,17 @@ using gdv_utf8 = char*;
4343
using gdv_binary = char*;
4444
using gdv_day_time_interval = int64_t;
4545

46+
#ifdef GANDIVA_UNIT_TEST
47+
// unit tests may be compiled without O2, so inlining may not happen.
48+
#define GDV_FORCE_INLINE
49+
#else
50+
#ifdef _MSC_VER
51+
#define GDV_FORCE_INLINE __forceinline
52+
#else
53+
#define GDV_FORCE_INLINE inline __attribute__((always_inline))
54+
#endif
55+
#endif
56+
4657
bool gdv_fn_like_utf8_utf8(int64_t ptr, const char* data, int data_len,
4758
const char* pattern, int pattern_len);
4859

@@ -135,9 +146,6 @@ const char* gdv_fn_castVARCHAR_float64_int64(int64_t context, double value, int6
135146
GANDIVA_EXPORT
136147
int32_t gdv_fn_utf8_char_length(char c);
137148

138-
GANDIVA_EXPORT
139-
void gdv_fn_set_error_for_invalid_utf8(int64_t execution_context, char val);
140-
141149
GANDIVA_EXPORT
142150
const char* gdv_fn_upper_utf8(int64_t context, const char* data, int32_t data_len,
143151
int32_t* out_len);
@@ -146,9 +154,6 @@ GANDIVA_EXPORT
146154
const char* gdv_fn_lower_utf8(int64_t context, const char* data, int32_t data_len,
147155
int32_t* out_len);
148156

149-
GANDIVA_EXPORT
150-
bool gdv_fn_is_codepoint_for_space(uint32_t val);
151-
152157
GANDIVA_EXPORT
153158
const char* gdv_fn_initcap_utf8(int64_t context, const char* data, int32_t data_len,
154159
int32_t* out_len);

cpp/src/gandiva/gdv_function_stubs_test.cc

Lines changed: 30 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -546,20 +546,20 @@ TEST(TestGdvFnStubs, TestInitCap) {
546546
EXPECT_EQ(std::string(out_str, out_len), "Asdfj\nHlqf");
547547
EXPECT_FALSE(ctx.has_error());
548548

549-
out_str = gdv_fn_initcap_utf8(ctx_ptr, "s;DCgs,Jo!L", 11, &out_len);
550-
EXPECT_EQ(std::string(out_str, out_len), "S;DCgs,Jo!L");
549+
out_str = gdv_fn_initcap_utf8(ctx_ptr, "s;DCgs,Jo!l", 11, &out_len);
550+
EXPECT_EQ(std::string(out_str, out_len), "S;Dcgs,Jo!L");
551551
EXPECT_FALSE(ctx.has_error());
552552

553553
out_str = gdv_fn_initcap_utf8(ctx_ptr, " mÜNCHEN", 9, &out_len);
554-
EXPECT_EQ(std::string(out_str, out_len), " MÜNCHEN");
554+
EXPECT_EQ(std::string(out_str, out_len), " München");
555555
EXPECT_FALSE(ctx.has_error());
556556

557557
out_str = gdv_fn_initcap_utf8(ctx_ptr, "citroën CaR", 12, &out_len);
558-
EXPECT_EQ(std::string(out_str, out_len), "Citroën CaR");
558+
EXPECT_EQ(std::string(out_str, out_len), "Citroën Car");
559559
EXPECT_FALSE(ctx.has_error());
560560

561561
out_str = gdv_fn_initcap_utf8(ctx_ptr, "ÂbĆDËFgh\néll", 16, &out_len);
562-
EXPECT_EQ(std::string(out_str, out_len), "ÂbĆDËFgh\nÉll");
562+
EXPECT_EQ(std::string(out_str, out_len), "Âbćdëfgh\nÉll");
563563
EXPECT_FALSE(ctx.has_error());
564564

565565
out_str = gdv_fn_initcap_utf8(ctx_ptr, " øhpqršvñ \n\n", 17, &out_len);
@@ -572,7 +572,31 @@ TEST(TestGdvFnStubs, TestInitCap) {
572572
EXPECT_FALSE(ctx.has_error());
573573

574574
out_str = gdv_fn_initcap_utf8(ctx_ptr, "{ÕHP,pqśv}Ń+", 15, &out_len);
575-
EXPECT_EQ(std::string(out_str, out_len), "{ÕHP,pqśv}Ń+");
575+
EXPECT_EQ(std::string(out_str, out_len), "{Õhp,Pqśv}Ń+");
576+
EXPECT_FALSE(ctx.has_error());
577+
578+
out_str = gdv_fn_initcap_utf8(ctx_ptr, "sɦasasdsɦsd\"sdsdɦ", 19, &out_len);
579+
EXPECT_EQ(std::string(out_str, out_len), "Sɦasasdsɦsd\"Sdsdɦ");
580+
EXPECT_FALSE(ctx.has_error());
581+
582+
out_str = gdv_fn_initcap_utf8(ctx_ptr, "mysuperscipt@number²isfine", 27, &out_len);
583+
EXPECT_EQ(std::string(out_str, out_len), "Mysuperscipt@Number²Isfine");
584+
EXPECT_FALSE(ctx.has_error());
585+
586+
out_str = gdv_fn_initcap_utf8(ctx_ptr, "Ő<tŵas̓老ƕɱ¢vIYwށ", 25, &out_len);
587+
EXPECT_EQ(std::string(out_str, out_len), "Ő<Tŵas̓老Ƕɱ¢Viywށ");
588+
EXPECT_FALSE(ctx.has_error());
589+
590+
out_str = gdv_fn_initcap_utf8(ctx_ptr, "ↆcheckↆnumberisspace", 24, &out_len);
591+
EXPECT_EQ(std::string(out_str, out_len), "ↆcheckↆnumberisspace");
592+
EXPECT_FALSE(ctx.has_error());
593+
594+
out_str = gdv_fn_initcap_utf8(ctx_ptr, "testing ᾌTitleᾌcase", 23, &out_len);
595+
EXPECT_EQ(std::string(out_str, out_len), "Testing ᾌtitleᾄcase");
596+
EXPECT_FALSE(ctx.has_error());
597+
598+
out_str = gdv_fn_initcap_utf8(ctx_ptr, "ʳTesting mʳodified", 20, &out_len);
599+
EXPECT_EQ(std::string(out_str, out_len), "ʳTesting MʳOdified");
576600
EXPECT_FALSE(ctx.has_error());
577601

578602
out_str = gdv_fn_initcap_utf8(ctx_ptr, "", 0, &out_len);

java/gandiva/src/test/java/org/apache/arrow/gandiva/evaluator/ProjectorTest.java

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2310,17 +2310,17 @@ public void testInitCap() throws Exception {
23102310
byte[] validity = new byte[]{(byte) 15, 0};
23112311
String[] valuesX = new String[]{
23122312
" øhpqršvñ \n\n",
2313-
"möbelträgerfüße \nmöbelträgerfüße",
2313+
"möbelträger1füße \nmöbelträge'rfüße",
23142314
"ÂbĆDËFgh\néll",
23152315
"citroën CaR",
23162316
"kjk"
23172317
};
23182318

23192319
String[] expected = new String[]{
23202320
" Øhpqršvñ \n\n",
2321-
"Möbelträgerfüße \nMöbelträgerfüße",
2322-
"ÂbĆDËFgh\nÉll",
2323-
"Citroën CaR",
2321+
"Möbelträger1füße \nMöbelträge'Rfüße",
2322+
"Âbćdëfgh\nÉll",
2323+
"Citroën Car",
23242324
null
23252325
};
23262326

0 commit comments

Comments
 (0)