Skip to content

Commit c98db7a

Browse files
Change initcap function behavior
1 parent dd9261e commit c98db7a

1 file changed

Lines changed: 24 additions & 25 deletions

File tree

cpp/src/gandiva/gdv_function_stubs.cc

Lines changed: 24 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -595,9 +595,8 @@ const char* gdv_fn_lower_utf8(int64_t context, const char* data, int32_t data_le
595595
return out;
596596
}
597597

598-
// Checks if the character is a whitespace by its code point. To check the list
599-
// of the existent whitespaces characters in UTF8, take a look at this link
600-
// https://en.wikipedia.org/wiki/Whitespace_character#Unicode
598+
// Any codepoint other than a lowercase letter, an uppercase letter or a decimal
599+
// digit is considered a word separator.
601600
//
602601
// Unicode characters are also divided into categories. This link
603602
// https://en.wikipedia.org/wiki/Unicode_character_property#General_Category shows
@@ -606,13 +605,14 @@ GANDIVA_EXPORT
606605
bool gdv_fn_is_codepoint_for_space(uint32_t val) {
607606
auto category = utf8proc_category(val);
608607

609-
return category == utf8proc_category_t::UTF8PROC_CATEGORY_ZS ||
610-
category == utf8proc_category_t::UTF8PROC_CATEGORY_ZL ||
611-
category == utf8proc_category_t::UTF8PROC_CATEGORY_ZP;
608+
return category != utf8proc_category_t::UTF8PROC_CATEGORY_LU &&
609+
category != utf8proc_category_t::UTF8PROC_CATEGORY_LL &&
610+
category != utf8proc_category_t ::UTF8PROC_CATEGORY_ND;
612611
}
613612

614-
// For a given text, initialize the first letter of each word, e.g:
615-
// - "it is a text str" -> "It Is A Text Str"
613+
// For a given text, capitalize the first letter of each word and lowercase
614+
// the remaining letters, e.g.:
615+
// - "IT is a tEXt str" -> "It Is A Text Str"
616616
GANDIVA_EXPORT
617617
const char* gdv_fn_initcap_utf8(int64_t context, const char* data, int32_t data_len,
618618
int32_t* out_len) {
@@ -635,30 +635,31 @@ const char* gdv_fn_initcap_utf8(int64_t context, const char* data, int32_t data_
635635
int32_t out_char_len = 0;
636636
int32_t out_idx = 0;
637637
uint32_t char_codepoint;
638+
639+
// Every non-alphanumeric character is treated as a space (word separator)
638640
bool last_char_was_space = true;
639641

640642
for (int32_t i = 0; i < data_len; i += char_len) {
641643
char_len = gdv_fn_utf8_char_length(data[i]);
642-
// For single byte characters:
643-
// If it is a lowercase ASCII character, set the output to its corresponding uppercase
644-
// character; else, set the output to the read character
644+
// An optimization for single byte characters:
645645
if (char_len == 1) {
646646
char cur = data[i];
647647

648648
if (cur >= 0x61 && cur <= 0x7a && last_char_was_space) {
649-
// 'A' - 'Z' : 0x41 - 0x5a
650-
// 'a' - 'z' : 0x61 - 0x7a
649+
// Check if the character is the first one of the word and it is
650+
// lowercase -> 'a' - 'z' : 0x61 - 0x7a.
651+
// Then turn it into uppercase -> 'A' - 'Z' : 0x41 - 0x5a
651652
out[out_idx++] = static_cast<char>(cur - 0x20);
652653
last_char_was_space = false;
654+
} else if (cur >= 0x41 && cur <= 0x5a && !last_char_was_space) {
655+
out[out_idx++] = static_cast<char>(cur + 0x20);
653656
} else {
654-
// Check if the ASCII character is one of these:
655-
// - space : 0x20
656-
// - character tabulation : 0x9
657-
// - line feed : 0xA
658-
// - line tabulation : 0xB
659-
// - form feed : 0xC
660-
// - carriage return : 0xD
661-
last_char_was_space = cur <= 0x20;
657+
// Check if the ASCII character is not an alphanumeric character:
658+
// '0' - '9': 0x30 - 0x39
659+
// 'a' - 'z' : 0x61 - 0x7a
660+
// 'A' - 'Z' : 0x41 - 0x5a
661+
last_char_was_space = (cur < 0x30) || (cur > 0x39 && cur < 0x41) ||
662+
(cur > 0x5a && cur < 0x61) || (cur > 0x7a);
662663
out[out_idx++] = cur;
663664
}
664665
continue;
@@ -682,18 +683,16 @@ const char* gdv_fn_initcap_utf8(int64_t context, const char* data, int32_t data_
682683

683684
int32_t formatted_codepoint;
684685
if (last_char_was_space && !is_char_space) {
685-
// Convert the encoded codepoint to its uppercase codepoint
686686
formatted_codepoint = utf8proc_toupper(char_codepoint);
687687
} else {
688-
// Leave the codepoint as is
689-
formatted_codepoint = char_codepoint;
688+
formatted_codepoint = utf8proc_tolower(char_codepoint);
690689
}
691690

692691
// UTF8Encode advances the pointer by the number of bytes present in the character
693692
auto* out_char = (uint8_t*)(out + out_idx);
694693
uint8_t* out_char_start = out_char;
695694

696-
// Encode the uppercase character
695+
// Encode the character
697696
out_char = arrow::util::UTF8Encode(out_char, formatted_codepoint);
698697

699698
out_char_len = static_cast<int32_t>(out_char - out_char_start);

0 commit comments

Comments
 (0)