@@ -595,9 +595,8 @@ const char* gdv_fn_lower_utf8(int64_t context, const char* data, int32_t data_le
595595 return out;
596596}
597597
598- // Checks if the character is a whitespace by its code point. To check the list
599- // of the existent whitespaces characters in UTF8, take a look at this link
600- // https://en.wikipedia.org/wiki/Whitespace_character#Unicode
598+ // Any codepoint other than a lowercase letter, an uppercase letter or a decimal
599+ // digit is treated as a word separator.
601600//
602601// The Unicode characters also are divided between categories. This link
603602// https://en.wikipedia.org/wiki/Unicode_character_property#General_Category shows
@@ -606,13 +605,14 @@ GANDIVA_EXPORT
606605bool gdv_fn_is_codepoint_for_space (uint32_t val) {
607606 auto category = utf8proc_category (val);
608607
609- return category == utf8proc_category_t ::UTF8PROC_CATEGORY_ZS ||
610- category == utf8proc_category_t ::UTF8PROC_CATEGORY_ZL ||
611- category == utf8proc_category_t :: UTF8PROC_CATEGORY_ZP ;
608+ return category != utf8proc_category_t ::UTF8PROC_CATEGORY_LU &&
609+ category != utf8proc_category_t ::UTF8PROC_CATEGORY_LL &&
610+ category != utf8proc_category_t :: UTF8PROC_CATEGORY_ND ;
612611}
613612
614- // For a given text, initialize the first letter of each word, e.g:
615- // - "it is a text str" -> "It Is A Text Str"
613+ // For a given text, capitalize the first letter of each word and lowercase
614+ // the remaining letters, e.g.:
615+ // - "IT is a tEXt str" -> "It Is A Text Str"
616616GANDIVA_EXPORT
617617const char * gdv_fn_initcap_utf8 (int64_t context, const char * data, int32_t data_len,
618618 int32_t * out_len) {
@@ -635,30 +635,31 @@ const char* gdv_fn_initcap_utf8(int64_t context, const char* data, int32_t data_
635635 int32_t out_char_len = 0 ;
636636 int32_t out_idx = 0 ;
637637 uint32_t char_codepoint;
638+
639+ // Any character is considered as space, except if it is alphanumeric
638640 bool last_char_was_space = true ;
639641
640642 for (int32_t i = 0 ; i < data_len; i += char_len) {
641643 char_len = gdv_fn_utf8_char_length (data[i]);
642- // For single byte characters:
643- // If it is a lowercase ASCII character, set the output to its corresponding uppercase
644- // character; else, set the output to the read character
644+ // An optimization for single byte characters:
645645 if (char_len == 1 ) {
646646 char cur = data[i];
647647
648648 if (cur >= 0x61 && cur <= 0x7a && last_char_was_space) {
649- // 'A' - 'Z' : 0x41 - 0x5a
650- // 'a' - 'z' : 0x61 - 0x7a
649+ // Check if the character is the first one of the word and it is
650+ // lowercase -> 'a' - 'z' : 0x61 - 0x7a.
651+ // Then turn it into uppercase -> 'A' - 'Z' : 0x41 - 0x5a
651652 out[out_idx++] = static_cast <char >(cur - 0x20 );
652653 last_char_was_space = false ;
654+ } else if (cur >= 0x41 && cur <= 0x5a && !last_char_was_space) {
655+ out[out_idx++] = static_cast <char >(cur + 0x20 );
653656 } else {
654- // Check if the ASCII character is one of these:
655- // - space : 0x20
656- // - character tabulation : 0x9
657- // - line feed : 0xA
658- // - line tabulation : 0xB
659- // - form feed : 0xC
660- // - carriage return : 0xD
661- last_char_was_space = cur <= 0x20 ;
657+ // Check if the ASCII character is not an alphanumeric character:
658+ // '0' - '9': 0x30 - 0x39
659+ // 'a' - 'z' : 0x61 - 0x7a
660+ // 'A' - 'Z' : 0x41 - 0x5a
661+ last_char_was_space = (cur < 0x30 ) || (cur > 0x39 && cur < 0x41 ) ||
662+ (cur > 0x5a && cur < 0x61 ) || (cur > 0x7a );
662663 out[out_idx++] = cur;
663664 }
664665 continue ;
@@ -682,18 +683,16 @@ const char* gdv_fn_initcap_utf8(int64_t context, const char* data, int32_t data_
682683
683684 int32_t formatted_codepoint;
684685 if (last_char_was_space && !is_char_space) {
685- // Convert the encoded codepoint to its uppercase codepoint
686686 formatted_codepoint = utf8proc_toupper (char_codepoint);
687687 } else {
688- // Leave the codepoint as is
689- formatted_codepoint = char_codepoint;
688+ formatted_codepoint = utf8proc_tolower (char_codepoint);
690689 }
691690
692691 // UTF8Encode advances the pointer by the number of bytes present in the character
693692 auto * out_char = (uint8_t *)(out + out_idx);
694693 uint8_t * out_char_start = out_char;
695694
696- // Encode the uppercase character
695+ // Encode the character
697696 out_char = arrow::util::UTF8Encode (out_char, formatted_codepoint);
698697
699698 out_char_len = static_cast <int32_t >(out_char - out_char_start);
0 commit comments