ARROW-13190: [C++] [Gandiva] Change behavior of INITCAP function

anthonylouisbsb · praveenbingo · commit a3f778cd17cd · 2021-07-15T21:34:49.000+05:30
The current behavior of the INITCAP function is to turn the first character of each word uppercase and leave the others as is. The desired behavior is to turn the first letter of each word uppercase and the others lowercase. Any character other than [lowercase letters](https://www.compart.com/en/unicode/category/Ll), [uppercase letters](https://www.compart.com/en/unicode/category/Lu) and [decimal numbers](https://www.compart.com/en/unicode/category/Nd) should be considered a word separator (the implementation in this change additionally treats titlecase letters and letter numbers as part of a word). That behavior is based on these database systems: - [Oracle](https://docs.oracle.com/cd/B19306_01/server.102/b14200/functions065.htm) - [Postgres](https://w3resource.com/PostgreSQL/initcap-function.php) - [Redshift](https://docs.aws.amazon.com/redshift/latest/dg/r_INITCAP.html) - [Splice Machine](https://doc.splicemachine.com/sqlref_builtinfcns_initcap.html) Closes apache#10604 from anthonylouisbsb/fixbug/fix-initcap-behavior and squashes the following commits: 68a4399 <Anthony Louis> Change call to get_char_len 8e05abe <Anthony Louis> Add force inline option for MSVC compiler 9146c01 <Anthony Louis> Remove GANDIVA_EXPORT for helper functions ca0b0d0 <Anthony Louis> Add FORCE_INLINE in functions 1f4cfc7 <Anthony Louis> Add tests to modified letters 4a1a584 <Anthony Louis> Add more tests for other characters groups 32a2c2d <Anthony Louis> Fix java tests for function 4445e51 <Anthony Louis> Fix tests after changes in function faa2169 <Anthony Louis> Change comments for is space c98db7a <Anthony Louis> Change initcap function behavior Authored-by: Anthony Louis <anthony@simbioseventures.com> Signed-off-by: Praveen <praveen@dremio.com>
diff --git a/cpp/src/gandiva/gdv_function_stubs.cc b/cpp/src/gandiva/gdv_function_stubs.cc
@@ -482,7 +482,7 @@ CAST_VARLEN_TYPE_FROM_NUMERIC(VARBINARY)
 #undef GDV_FN_CAST_VARCHAR_INTEGER
 #undef GDV_FN_CAST_VARCHAR_REAL
 
-GANDIVA_EXPORT
+GDV_FORCE_INLINE
 int32_t gdv_fn_utf8_char_length(char c) {
   if ((signed char)c >= 0) {  // 1-byte char (0x00 ~ 0x7F)
     return 1;
@@ -497,7 +497,7 @@ int32_t gdv_fn_utf8_char_length(char c) {
   return 0;
 }
 
-GANDIVA_EXPORT
+GDV_FORCE_INLINE
 void gdv_fn_set_error_for_invalid_utf8(int64_t execution_context, char val) {
   char const* fmt = "unexpected byte \\%02hhx encountered while decoding utf8 string";
   int size = static_cast<int>(strlen(fmt)) + 64;
@@ -651,24 +651,27 @@ const char* gdv_fn_lower_utf8(int64_t context, const char* data, int32_t data_le
   return out;
 }
 
-// Checks if the character is a whitespace by its code point. To check the list
-// of the existent whitespaces characters in UTF8, take a look at this link
-// https://en.wikipedia.org/wiki/Whitespace_character#Unicode
+// Any codepoint outside the lowercase letter, uppercase letter, titlecase
+// letter, decimal digit and letter number categories is considered a word
+// separator.
 //
 // The Unicode characters also are divided between categories. This link
-// https://en.wikipedia.org/wiki/Unicode_character_property#General_Category shows
+// https://www.compart.com/en/unicode/category shows
 // more information about characters categories.
-GANDIVA_EXPORT
+GDV_FORCE_INLINE
 bool gdv_fn_is_codepoint_for_space(uint32_t val) {
   auto category = utf8proc_category(val);
 
-  return category == utf8proc_category_t::UTF8PROC_CATEGORY_ZS ||
-         category == utf8proc_category_t::UTF8PROC_CATEGORY_ZL ||
-         category == utf8proc_category_t::UTF8PROC_CATEGORY_ZP;
+  return category != utf8proc_category_t::UTF8PROC_CATEGORY_LU &&
+         category != utf8proc_category_t::UTF8PROC_CATEGORY_LL &&
+         category != utf8proc_category_t::UTF8PROC_CATEGORY_LT &&
+         category != utf8proc_category_t::UTF8PROC_CATEGORY_NL &&
+         category != utf8proc_category_t ::UTF8PROC_CATEGORY_ND;
 }
 
-// For a given text, initialize the first letter of each word, e.g:
-//     - "it is a text str" -> "It Is A Text Str"
+// For a given text, capitalize the first letter after a word separator and lowercase
+// the others, e.g.:
+//     - "IT is a tEXt str" -> "It Is A Text Str"
 GANDIVA_EXPORT
 const char* gdv_fn_initcap_utf8(int64_t context, const char* data, int32_t data_len,
                                 int32_t* out_len) {
@@ -691,35 +694,38 @@ const char* gdv_fn_initcap_utf8(int64_t context, const char* data, int32_t data_
   int32_t out_char_len = 0;
   int32_t out_idx = 0;
   uint32_t char_codepoint;
+
+  // Any character is considered as space, except if it is alphanumeric
   bool last_char_was_space = true;
 
   for (int32_t i = 0; i < data_len; i += char_len) {
-    char_len = gdv_fn_utf8_char_length(data[i]);
-    // For single byte characters:
-    // If it is a lowercase ASCII character, set the output to its corresponding uppercase
-    // character; else, set the output to the read character
-    if (char_len == 1) {
+    // An optimization for single byte characters:
+    if (static_cast<signed char>(data[i]) >= 0) {  // 1-byte char (0x00 ~ 0x7F)
+      char_len = 1;
       char cur = data[i];
 
       if (cur >= 0x61 && cur <= 0x7a && last_char_was_space) {
-        // 'A' - 'Z' : 0x41 - 0x5a
-        // 'a' - 'z' : 0x61 - 0x7a
+        // Check if the character is the first one of the word and it is
+        // lowercase -> 'a' - 'z' : 0x61 - 0x7a.
+        // Then turn it into uppercase -> 'A' - 'Z' : 0x41 - 0x5a
         out[out_idx++] = static_cast<char>(cur - 0x20);
         last_char_was_space = false;
+      } else if (cur >= 0x41 && cur <= 0x5a && !last_char_was_space) {
+        out[out_idx++] = static_cast<char>(cur + 0x20);
       } else {
-        // Check if the ASCII character is one of these:
-        // - space : 0x20
-        // - character tabulation : 0x9
-        // - line feed : 0xA
-        // - line tabulation : 0xB
-        // - form feed : 0xC
-        // - carriage return : 0xD
-        last_char_was_space = cur <= 0x20;
+        // Check if the ASCII character is not an alphanumeric character:
+        // '0' - '9': 0x30 - 0x39
+        // 'a' - 'z' : 0x61 - 0x7a
+        // 'A' - 'Z' : 0x41 - 0x5a
+        last_char_was_space = (cur < 0x30) || (cur > 0x39 && cur < 0x41) ||
+                              (cur > 0x5a && cur < 0x61) || (cur > 0x7a);
         out[out_idx++] = cur;
       }
       continue;
     }
 
+    char_len = gdv_fn_utf8_char_length(data[i]);
+
     // Control reaches here when we encounter a multibyte character
     const auto* in_char = (const uint8_t*)(data + i);
 
@@ -738,18 +744,16 @@ const char* gdv_fn_initcap_utf8(int64_t context, const char* data, int32_t data_
 
     int32_t formatted_codepoint;
     if (last_char_was_space && !is_char_space) {
-      // Convert the encoded codepoint to its uppercase codepoint
       formatted_codepoint = utf8proc_toupper(char_codepoint);
     } else {
-      // Leave the codepoint as is
-      formatted_codepoint = char_codepoint;
+      formatted_codepoint = utf8proc_tolower(char_codepoint);
     }
 
     // UTF8Encode advances the pointer by the number of bytes present in the character
     auto* out_char = (uint8_t*)(out + out_idx);
     uint8_t* out_char_start = out_char;
 
-    // Encode the uppercase character
+    // Encode the character
     out_char = arrow::util::UTF8Encode(out_char, formatted_codepoint);
 
     out_char_len = static_cast<int32_t>(out_char - out_char_start);
diff --git a/cpp/src/gandiva/gdv_function_stubs.h b/cpp/src/gandiva/gdv_function_stubs.h
@@ -43,6 +43,17 @@ using gdv_utf8 = char*;
 using gdv_binary = char*;
 using gdv_day_time_interval = int64_t;
 
+#ifdef GANDIVA_UNIT_TEST
+// unit tests may be compiled without O2, so inlining may not happen.
+#define GDV_FORCE_INLINE
+#else
+#ifdef _MSC_VER
+#define GDV_FORCE_INLINE __forceinline
+#else
+#define GDV_FORCE_INLINE inline __attribute__((always_inline))
+#endif
+#endif
+
 bool gdv_fn_like_utf8_utf8(int64_t ptr, const char* data, int data_len,
                            const char* pattern, int pattern_len);
 
@@ -135,9 +146,6 @@ const char* gdv_fn_castVARCHAR_float64_int64(int64_t context, double value, int6
 GANDIVA_EXPORT
 int32_t gdv_fn_utf8_char_length(char c);
 
-GANDIVA_EXPORT
-void gdv_fn_set_error_for_invalid_utf8(int64_t execution_context, char val);
-
 GANDIVA_EXPORT
 const char* gdv_fn_upper_utf8(int64_t context, const char* data, int32_t data_len,
                               int32_t* out_len);
@@ -146,9 +154,6 @@ GANDIVA_EXPORT
 const char* gdv_fn_lower_utf8(int64_t context, const char* data, int32_t data_len,
                               int32_t* out_len);
 
-GANDIVA_EXPORT
-bool gdv_fn_is_codepoint_for_space(uint32_t val);
-
 GANDIVA_EXPORT
 const char* gdv_fn_initcap_utf8(int64_t context, const char* data, int32_t data_len,
                                 int32_t* out_len);
diff --git a/cpp/src/gandiva/gdv_function_stubs_test.cc b/cpp/src/gandiva/gdv_function_stubs_test.cc
@@ -546,20 +546,20 @@ TEST(TestGdvFnStubs, TestInitCap) {
   EXPECT_EQ(std::string(out_str, out_len), "Asdfj\nHlqf");
   EXPECT_FALSE(ctx.has_error());
 
-  out_str = gdv_fn_initcap_utf8(ctx_ptr, "s;DCgs,Jo!L", 11, &out_len);
-  EXPECT_EQ(std::string(out_str, out_len), "S;DCgs,Jo!L");
+  out_str = gdv_fn_initcap_utf8(ctx_ptr, "s;DCgs,Jo!l", 11, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "S;Dcgs,Jo!L");
   EXPECT_FALSE(ctx.has_error());
 
   out_str = gdv_fn_initcap_utf8(ctx_ptr, " mÜNCHEN", 9, &out_len);
-  EXPECT_EQ(std::string(out_str, out_len), " MÜNCHEN");
+  EXPECT_EQ(std::string(out_str, out_len), " München");
   EXPECT_FALSE(ctx.has_error());
 
   out_str = gdv_fn_initcap_utf8(ctx_ptr, "citroën CaR", 12, &out_len);
-  EXPECT_EQ(std::string(out_str, out_len), "Citroën CaR");
+  EXPECT_EQ(std::string(out_str, out_len), "Citroën Car");
   EXPECT_FALSE(ctx.has_error());
 
   out_str = gdv_fn_initcap_utf8(ctx_ptr, "ÂbĆDËFgh\néll", 16, &out_len);
-  EXPECT_EQ(std::string(out_str, out_len), "ÂbĆDËFgh\nÉll");
+  EXPECT_EQ(std::string(out_str, out_len), "Âbćdëfgh\nÉll");
   EXPECT_FALSE(ctx.has_error());
 
   out_str = gdv_fn_initcap_utf8(ctx_ptr, "  øhpqršvñ  \n\n", 17, &out_len);
@@ -572,7 +572,31 @@ TEST(TestGdvFnStubs, TestInitCap) {
   EXPECT_FALSE(ctx.has_error());
 
   out_str = gdv_fn_initcap_utf8(ctx_ptr, "{ÕHP,pqśv}Ń+", 15, &out_len);
-  EXPECT_EQ(std::string(out_str, out_len), "{ÕHP,pqśv}Ń+");
+  EXPECT_EQ(std::string(out_str, out_len), "{Õhp,Pqśv}Ń+");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = gdv_fn_initcap_utf8(ctx_ptr, "sɦasasdsɦsd\"sdsdɦ", 19, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "Sɦasasdsɦsd\"Sdsdɦ");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = gdv_fn_initcap_utf8(ctx_ptr, "mysuperscipt@number²isfine", 27, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "Mysuperscipt@Number²Isfine");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = gdv_fn_initcap_utf8(ctx_ptr, "Ő<tŵas̓老ƕɱ¢vIYwށ", 25, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "Ő<Tŵas̓老Ƕɱ¢Viywށ");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = gdv_fn_initcap_utf8(ctx_ptr, "ↆcheckↆnumberisspace", 24, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "ↆcheckↆnumberisspace");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = gdv_fn_initcap_utf8(ctx_ptr, "testing ᾌTitleᾌcase", 23, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "Testing ᾌtitleᾄcase");
+  EXPECT_FALSE(ctx.has_error());
+
+  out_str = gdv_fn_initcap_utf8(ctx_ptr, "ʳTesting mʳodified", 20, &out_len);
+  EXPECT_EQ(std::string(out_str, out_len), "ʳTesting MʳOdified");
   EXPECT_FALSE(ctx.has_error());
 
   out_str = gdv_fn_initcap_utf8(ctx_ptr, "", 0, &out_len);
diff --git a/java/gandiva/src/test/java/org/apache/arrow/gandiva/evaluator/ProjectorTest.java b/java/gandiva/src/test/java/org/apache/arrow/gandiva/evaluator/ProjectorTest.java
@@ -2310,17 +2310,17 @@ public void testInitCap() throws Exception {
     byte[] validity = new byte[]{(byte) 15, 0};
     String[] valuesX = new String[]{
         "  øhpqršvñ  \n\n",
-        "möbelträgerfüße   \nmöbelträgerfüße",
+        "möbelträger1füße   \nmöbelträge'rfüße",
         "ÂbĆDËFgh\néll",
         "citroën CaR",
         "kjk"
     };
 
     String[] expected = new String[]{
         "  Øhpqršvñ  \n\n",
-        "Möbelträgerfüße   \nMöbelträgerfüße",
-        "ÂbĆDËFgh\nÉll",
-        "Citroën CaR",
+        "Möbelträger1füße   \nMöbelträge'Rfüße",
+        "Âbćdëfgh\nÉll",
+        "Citroën Car",
         null
     };