Change initcap function behavior

anthonylouisbsb · anthonylouisbsb · commit c98db7af64d4 · 2021-07-14T20:32:22.000-03:00
diff --git a/cpp/src/gandiva/gdv_function_stubs.cc b/cpp/src/gandiva/gdv_function_stubs.cc
@@ -595,9 +595,8 @@ const char* gdv_fn_lower_utf8(int64_t context, const char* data, int32_t data_le
   return out;
 }
 
-// Checks if the character is a whitespace by its code point. To check the list
-// of the existent whitespaces characters in UTF8, take a look at this link
-// https://en.wikipedia.org/wiki/Whitespace_character#Unicode
+// Every codepoint is treated as a word separator, except lowercase letters (Ll),
+// uppercase letters (Lu) and decimal digits (Nd).
 //
 // The Unicode characters also are divided between categories. This link
 // https://en.wikipedia.org/wiki/Unicode_character_property#General_Category shows
@@ -606,13 +605,14 @@ GANDIVA_EXPORT
 bool gdv_fn_is_codepoint_for_space(uint32_t val) {
   auto category = utf8proc_category(val);
 
-  return category == utf8proc_category_t::UTF8PROC_CATEGORY_ZS ||
-         category == utf8proc_category_t::UTF8PROC_CATEGORY_ZL ||
-         category == utf8proc_category_t::UTF8PROC_CATEGORY_ZP;
+  return category != utf8proc_category_t::UTF8PROC_CATEGORY_LU &&
+         category != utf8proc_category_t::UTF8PROC_CATEGORY_LL &&
+         category != utf8proc_category_t ::UTF8PROC_CATEGORY_ND;
 }
 
-// For a given text, initialize the first letter of each word, e.g:
-//     - "it is a text str" -> "It Is A Text Str"
+// For a given text, capitalize the first letter of each word and lowercase
+// the remaining letters, e.g.:
+//     - "IT is a tEXt str" -> "It Is A Text Str"
 GANDIVA_EXPORT
 const char* gdv_fn_initcap_utf8(int64_t context, const char* data, int32_t data_len,
                                 int32_t* out_len) {
@@ -635,30 +635,31 @@ const char* gdv_fn_initcap_utf8(int64_t context, const char* data, int32_t data_
   int32_t out_char_len = 0;
   int32_t out_idx = 0;
   uint32_t char_codepoint;
+
+  // Every non-alphanumeric character counts as a space, i.e. as a word separator
   bool last_char_was_space = true;
 
   for (int32_t i = 0; i < data_len; i += char_len) {
     char_len = gdv_fn_utf8_char_length(data[i]);
-    // For single byte characters:
-    // If it is a lowercase ASCII character, set the output to its corresponding uppercase
-    // character; else, set the output to the read character
+    // Fast path for single-byte characters:
     if (char_len == 1) {
       char cur = data[i];
 
       if (cur >= 0x61 && cur <= 0x7a && last_char_was_space) {
-        // 'A' - 'Z' : 0x41 - 0x5a
-        // 'a' - 'z' : 0x61 - 0x7a
+        // The character is the first one of a word and is lowercase
+        // ('a' - 'z' : 0x61 - 0x7a), so convert it to its uppercase
+        // counterpart ('A' - 'Z' : 0x41 - 0x5a) by subtracting 0x20
         out[out_idx++] = static_cast<char>(cur - 0x20);
         last_char_was_space = false;
+      } else if (cur >= 0x41 && cur <= 0x5a && !last_char_was_space) {
+        out[out_idx++] = static_cast<char>(cur + 0x20);
       } else {
-        // Check if the ASCII character is one of these:
-        // - space : 0x20
-        // - character tabulation : 0x9
-        // - line feed : 0xA
-        // - line tabulation : 0xB
-        // - form feed : 0xC
-        // - carriage return : 0xD
-        last_char_was_space = cur <= 0x20;
+        // Check if the ASCII character is not an alphanumeric character:
+        // '0' - '9': 0x30 - 0x39
+        // 'a' - 'z' : 0x61 - 0x7a
+        // 'A' - 'Z' : 0x41 - 0x5a
+        last_char_was_space = (cur < 0x30) || (cur > 0x39 && cur < 0x41) ||
+                              (cur > 0x5a && cur < 0x61) || (cur > 0x7a);
         out[out_idx++] = cur;
       }
       continue;
@@ -682,18 +683,16 @@ const char* gdv_fn_initcap_utf8(int64_t context, const char* data, int32_t data_
 
     int32_t formatted_codepoint;
     if (last_char_was_space && !is_char_space) {
-      // Convert the encoded codepoint to its uppercase codepoint
       formatted_codepoint = utf8proc_toupper(char_codepoint);
     } else {
-      // Leave the codepoint as is
-      formatted_codepoint = char_codepoint;
+      formatted_codepoint = utf8proc_tolower(char_codepoint);
     }
 
     // UTF8Encode advances the pointer by the number of bytes present in the character
     auto* out_char = (uint8_t*)(out + out_idx);
     uint8_t* out_char_start = out_char;
 
-    // Encode the uppercase character
+    // Encode the character
     out_char = arrow::util::UTF8Encode(out_char, formatted_codepoint);
 
     out_char_len = static_cast<int32_t>(out_char - out_char_start);