ARROW-9343: [C++][Gandiva] CastInt/Float from string functions should handle leading/trailing white spaces

projjal · praveenbingo · commit 6074a07b257e · 2020-07-08T14:15:14.000+05:30
Also refactored the code to remove the parse_float helper functions from the execution context. They had been placed there earlier to save the constructor cost, back when a StringConverter object from arrow/util/parsing had to be constructed before it could be used. Now that StringConverter has changed to parametric functions, it's no longer necessary to keep these helpers in the ExecutionContext. Closes apache#7653 from projjal/parsefloat and squashes the following commits: 073061d <Projjal Chanda> added the string to error message; 2b089dd <Projjal Chanda> trim strings before parsing float/int; 14de7e6 <Projjal Chanda> refactored gandiva parse float code. Authored-by: Projjal Chanda <iam@pchanda.com> Signed-off-by: Praveen <praveen@dremio.com>
diff --git a/cpp/src/gandiva/context_helper.cc b/cpp/src/gandiva/context_helper.cc
@@ -50,22 +50,6 @@ void ExportedContextFunctions::AddMappings(Engine* engine) const {
 
   engine->AddGlobalMappingForFunc("gdv_fn_context_arena_reset", types->void_type(), args,
                                   reinterpret_cast<void*>(gdv_fn_context_arena_reset));
-
-  args = {types->i64_type(),                      // int64_t context_ptr
-          types->i8_ptr_type(),                   // const char* data
-          types->i32_type(),                      // int32_t lenr
-          types->ptr_type(types->float_type())};  // float* val
-
-  engine->AddGlobalMappingForFunc("gdv_fn_context_parse_float32", types->i1_type(), args,
-                                  reinterpret_cast<void*>(gdv_fn_context_parse_float32));
-
-  args = {types->i64_type(),                       // int64_t context_ptr
-          types->i8_ptr_type(),                    // const char* data
-          types->i32_type(),                       // int32_t lenr
-          types->ptr_type(types->double_type())};  // double* val
-
-  engine->AddGlobalMappingForFunc("gdv_fn_context_parse_float64", types->i1_type(), args,
-                                  reinterpret_cast<void*>(gdv_fn_context_parse_float64));
 }
 
 }  // namespace gandiva
@@ -89,16 +73,4 @@ void gdv_fn_context_arena_reset(int64_t context_ptr) {
   auto context = reinterpret_cast<gandiva::ExecutionContext*>(context_ptr);
   return context->arena()->Reset();
 }
-
-bool gdv_fn_context_parse_float32(int64_t context_ptr, const char* data, int32_t len,
-                                  float* val) {
-  auto context = reinterpret_cast<gandiva::ExecutionContext*>(context_ptr);
-  return context->parse_float(data, len, val);
-}
-
-bool gdv_fn_context_parse_float64(int64_t context_ptr, const char* data, int32_t len,
-                                  double* val) {
-  auto context = reinterpret_cast<gandiva::ExecutionContext*>(context_ptr);
-  return context->parse_double(data, len, val);
-}
 }
diff --git a/cpp/src/gandiva/execution_context.h b/cpp/src/gandiva/execution_context.h
@@ -19,16 +19,12 @@
 
 #include <memory>
 #include <string>
-#include "arrow/util/value_parsing.h"
 #include "gandiva/simple_arena.h"
 
 namespace gandiva {
 
 /// Execution context during llvm evaluation
 class ExecutionContext {
-  using FloatConverter = arrow::internal::StringConverter<arrow::FloatType>;
-  using DoubleConverter = arrow::internal::StringConverter<arrow::DoubleType>;
-
  public:
   explicit ExecutionContext(arrow::MemoryPool* pool = arrow::default_memory_pool())
       : arena_(pool) {}
@@ -50,14 +46,6 @@ class ExecutionContext {
     arena_.Reset();
   }
 
-  bool parse_float(const char* data, int32_t len, float* val) {
-    return FloatConverter::Convert(data, len, val);
-  }
-
-  bool parse_double(const char* data, int32_t len, double* val) {
-    return DoubleConverter::Convert(data, len, val);
-  }
-
  private:
   std::string error_msg_;
   SimpleArena arena_;
diff --git a/cpp/src/gandiva/function_registry_arithmetic.cc b/cpp/src/gandiva/function_registry_arithmetic.cc
@@ -64,11 +64,6 @@ std::vector<NativeFunction> GetArithmeticFunctionRegistry() {
       UNARY_SAFE_NULL_IF_NULL(castDATE, {}, int32, date32),
       UNARY_SAFE_NULL_IF_NULL(castDATE, {}, date32, date64),
 
-      UNARY_UNSAFE_NULL_IF_NULL(castINT, {}, utf8, int32),
-      UNARY_UNSAFE_NULL_IF_NULL(castBIGINT, {}, utf8, int64),
-      UNARY_UNSAFE_NULL_IF_NULL(castFLOAT4, {}, utf8, float32),
-      UNARY_UNSAFE_NULL_IF_NULL(castFLOAT8, {}, utf8, float64),
-
       // add/sub/multiply/divide/mod
       BINARY_SYMMETRIC_FN(add, {}), BINARY_SYMMETRIC_FN(subtract, {}),
       BINARY_SYMMETRIC_FN(multiply, {}),
diff --git a/cpp/src/gandiva/function_registry_string.cc b/cpp/src/gandiva/function_registry_string.cc
@@ -60,6 +60,11 @@ std::vector<NativeFunction> GetStringFunctionRegistry() {
       UNARY_SAFE_NULL_NEVER_BOOL_FN(isnull, {}),
       UNARY_SAFE_NULL_NEVER_BOOL_FN(isnotnull, {}),
 
+      UNARY_UNSAFE_NULL_IF_NULL(castINT, {}, utf8, int32),
+      UNARY_UNSAFE_NULL_IF_NULL(castBIGINT, {}, utf8, int64),
+      UNARY_UNSAFE_NULL_IF_NULL(castFLOAT4, {}, utf8, float32),
+      UNARY_UNSAFE_NULL_IF_NULL(castFLOAT8, {}, utf8, float64),
+
       NativeFunction("upper", {}, DataTypeVector{utf8()}, utf8(), kResultNullIfNull,
                      "upper_utf8", NativeFunction::kNeedsContext),
 
diff --git a/cpp/src/gandiva/gdv_function_stubs.h b/cpp/src/gandiva/gdv_function_stubs.h
@@ -37,12 +37,6 @@ uint8_t* gdv_fn_context_arena_malloc(int64_t context_ptr, int32_t data_len);
 
 void gdv_fn_context_arena_reset(int64_t context_ptr);
 
-bool gdv_fn_context_parse_float32(int64_t context_ptr, const char* data, int32_t len,
-                                  float* val);
-
-bool gdv_fn_context_parse_float64(int64_t context_ptr, const char* data, int32_t len,
-                                  double* val);
-
 bool in_expr_lookup_int32(int64_t ptr, int32_t value, bool in_validity);
 
 bool in_expr_lookup_int64(int64_t ptr, int64_t value, bool in_validity);
diff --git a/cpp/src/gandiva/precompiled/arithmetic_ops.cc b/cpp/src/gandiva/precompiled/arithmetic_ops.cc
@@ -15,8 +15,6 @@
 // specific language governing permissions and limitations
 // under the License.
 
-#include "arrow/util/value_parsing.h"
-
 extern "C" {
 
 #include <math.h>
@@ -234,36 +232,6 @@ DIV(int64)
 DIV_FLOAT(float32)
 DIV_FLOAT(float64)
 
-#define CAST_INT_FROM_STRING(OUT_TYPE, ARROW_TYPE, TYPE_NAME)                       \
-  FORCE_INLINE                                                                      \
-  gdv_##OUT_TYPE cast##TYPE_NAME##_utf8(int64_t context, const char* data,          \
-                                        int32_t len) {                              \
-    gdv_##OUT_TYPE val = 0;                                                         \
-    if (!arrow::internal::StringConverter<ARROW_TYPE>::Convert(data, len, &val)) {  \
-      gdv_fn_context_set_error_msg(context,                                         \
-                                   "Failed parsing the string to required format"); \
-    }                                                                               \
-    return val;                                                                     \
-  }
-
-CAST_INT_FROM_STRING(int32, arrow::Int32Type, INT)
-CAST_INT_FROM_STRING(int64, arrow::Int64Type, BIGINT)
-
-#define CAST_FLOAT_FROM_STRING(OUT_TYPE, ARROW_TYPE, TYPE_NAME)                     \
-  FORCE_INLINE                                                                      \
-  gdv_##OUT_TYPE cast##TYPE_NAME##_utf8(int64_t context, const char* data,          \
-                                        int32_t len) {                              \
-    gdv_##OUT_TYPE val = 0;                                                         \
-    if (!gdv_fn_context_parse_##OUT_TYPE(context, data, len, &val)) {               \
-      gdv_fn_context_set_error_msg(context,                                         \
-                                   "Failed parsing the string to required format"); \
-    }                                                                               \
-    return val;                                                                     \
-  }
-
-CAST_FLOAT_FROM_STRING(float32, arrow::FloatType, FLOAT4)
-CAST_FLOAT_FROM_STRING(float64, arrow::DoubleType, FLOAT8)
-
 #undef DIV_FLOAT
 
 #undef DATE_FUNCTION
@@ -272,7 +240,5 @@ CAST_FLOAT_FROM_STRING(float64, arrow::DoubleType, FLOAT8)
 #undef NUMERIC_DATE_TYPES
 #undef NUMERIC_FUNCTION
 #undef NUMERIC_TYPES
-#undef CAST_INT_FROM_STRING
-#undef CAST_FLOAT_FROM_STRING
 
 }  // extern "C"
diff --git a/cpp/src/gandiva/precompiled/arithmetic_ops_test.cc b/cpp/src/gandiva/precompiled/arithmetic_ops_test.cc
@@ -101,132 +101,4 @@ TEST(TestArithmeticOps, TestDiv) {
   context.Reset();
 }
 
-TEST(TestArithmeticOps, TestCastINT) {
-  gandiva::ExecutionContext ctx;
-
-  int64_t ctx_ptr = reinterpret_cast<int64_t>(&ctx);
-
-  EXPECT_EQ(castINT_utf8(ctx_ptr, "-45", 3), -45);
-  EXPECT_EQ(castINT_utf8(ctx_ptr, "0", 1), 0);
-  EXPECT_EQ(castINT_utf8(ctx_ptr, "2147483647", 10), 2147483647);
-  EXPECT_EQ(castINT_utf8(ctx_ptr, "02147483647", 11), 2147483647);
-  EXPECT_EQ(castINT_utf8(ctx_ptr, "-2147483648", 11), -2147483648LL);
-  EXPECT_EQ(castINT_utf8(ctx_ptr, "-02147483648", 12), -2147483648LL);
-
-  castINT_utf8(ctx_ptr, "2147483648", 10);
-  EXPECT_THAT(ctx.get_error(),
-              ::testing::HasSubstr("Failed parsing the string to required format"));
-  ctx.Reset();
-
-  castINT_utf8(ctx_ptr, "-2147483649", 11);
-  EXPECT_THAT(ctx.get_error(),
-              ::testing::HasSubstr("Failed parsing the string to required format"));
-  ctx.Reset();
-
-  castINT_utf8(ctx_ptr, "12.34", 5);
-  EXPECT_THAT(ctx.get_error(),
-              ::testing::HasSubstr("Failed parsing the string to required format"));
-  ctx.Reset();
-
-  castINT_utf8(ctx_ptr, "abc", 3);
-  EXPECT_THAT(ctx.get_error(),
-              ::testing::HasSubstr("Failed parsing the string to required format"));
-  ctx.Reset();
-
-  castINT_utf8(ctx_ptr, "", 0);
-  EXPECT_THAT(ctx.get_error(),
-              ::testing::HasSubstr("Failed parsing the string to required format"));
-  ctx.Reset();
-
-  castINT_utf8(ctx_ptr, "-", 1);
-  EXPECT_THAT(ctx.get_error(),
-              ::testing::HasSubstr("Failed parsing the string to required format"));
-  ctx.Reset();
-}
-
-TEST(TestArithmeticOps, TestCastBIGINT) {
-  gandiva::ExecutionContext ctx;
-
-  int64_t ctx_ptr = reinterpret_cast<int64_t>(&ctx);
-
-  EXPECT_EQ(castBIGINT_utf8(ctx_ptr, "-45", 3), -45);
-  EXPECT_EQ(castBIGINT_utf8(ctx_ptr, "0", 1), 0);
-  EXPECT_EQ(castBIGINT_utf8(ctx_ptr, "9223372036854775807", 19), 9223372036854775807LL);
-  EXPECT_EQ(castBIGINT_utf8(ctx_ptr, "09223372036854775807", 20), 9223372036854775807LL);
-  EXPECT_EQ(castBIGINT_utf8(ctx_ptr, "-9223372036854775808", 20),
-            -9223372036854775807LL - 1);
-  EXPECT_EQ(castBIGINT_utf8(ctx_ptr, "-009223372036854775808", 22),
-            -9223372036854775807LL - 1);
-
-  castBIGINT_utf8(ctx_ptr, "9223372036854775808", 19);
-  EXPECT_THAT(ctx.get_error(),
-              ::testing::HasSubstr("Failed parsing the string to required format"));
-  ctx.Reset();
-
-  castBIGINT_utf8(ctx_ptr, "-9223372036854775809", 20);
-  EXPECT_THAT(ctx.get_error(),
-              ::testing::HasSubstr("Failed parsing the string to required format"));
-  ctx.Reset();
-
-  castBIGINT_utf8(ctx_ptr, "12.34", 5);
-  EXPECT_THAT(ctx.get_error(),
-              ::testing::HasSubstr("Failed parsing the string to required format"));
-  ctx.Reset();
-
-  castBIGINT_utf8(ctx_ptr, "abc", 3);
-  EXPECT_THAT(ctx.get_error(),
-              ::testing::HasSubstr("Failed parsing the string to required format"));
-  ctx.Reset();
-
-  castBIGINT_utf8(ctx_ptr, "", 0);
-  EXPECT_THAT(ctx.get_error(),
-              ::testing::HasSubstr("Failed parsing the string to required format"));
-  ctx.Reset();
-
-  castBIGINT_utf8(ctx_ptr, "-", 1);
-  EXPECT_THAT(ctx.get_error(),
-              ::testing::HasSubstr("Failed parsing the string to required format"));
-  ctx.Reset();
-}
-
-TEST(TestArithmeticOps, TestCastFloat4) {
-  gandiva::ExecutionContext ctx;
-
-  int64_t ctx_ptr = reinterpret_cast<int64_t>(&ctx);
-
-  EXPECT_EQ(castFLOAT4_utf8(ctx_ptr, "-45.34", 6), -45.34f);
-  EXPECT_EQ(castFLOAT4_utf8(ctx_ptr, "0", 1), 0.0f);
-  EXPECT_EQ(castFLOAT4_utf8(ctx_ptr, "5", 1), 5.0f);
-
-  castFLOAT4_utf8(ctx_ptr, "", 0);
-  EXPECT_THAT(ctx.get_error(),
-              ::testing::HasSubstr("Failed parsing the string to required format"));
-  ctx.Reset();
-
-  castFLOAT4_utf8(ctx_ptr, "e", 1);
-  EXPECT_THAT(ctx.get_error(),
-              ::testing::HasSubstr("Failed parsing the string to required format"));
-  ctx.Reset();
-}
-
-TEST(TestParseStringHolder, TestCastFloat8) {
-  gandiva::ExecutionContext ctx;
-
-  int64_t ctx_ptr = reinterpret_cast<int64_t>(&ctx);
-
-  EXPECT_EQ(castFLOAT8_utf8(ctx_ptr, "-45.34", 6), -45.34);
-  EXPECT_EQ(castFLOAT8_utf8(ctx_ptr, "0", 1), 0.0);
-  EXPECT_EQ(castFLOAT8_utf8(ctx_ptr, "5", 1), 5.0);
-
-  castFLOAT8_utf8(ctx_ptr, "", 0);
-  EXPECT_THAT(ctx.get_error(),
-              ::testing::HasSubstr("Failed parsing the string to required format"));
-  ctx.Reset();
-
-  castFLOAT8_utf8(ctx_ptr, "e", 1);
-  EXPECT_THAT(ctx.get_error(),
-              ::testing::HasSubstr("Failed parsing the string to required format"));
-  ctx.Reset();
-}
-
 }  // namespace gandiva
diff --git a/cpp/src/gandiva/precompiled/string_ops.cc b/cpp/src/gandiva/precompiled/string_ops.cc
@@ -17,6 +17,8 @@
 
 // String functions
 
+#include "arrow/util/value_parsing.h"
+
 extern "C" {
 
 #include <limits.h>
@@ -672,4 +674,28 @@ const char* replace_utf8_utf8_utf8(gdv_int64 context, const char* text,
                                              out_len);
 }
 
+#define CAST_NUMERIC_FROM_STRING(OUT_TYPE, ARROW_TYPE, TYPE_NAME)                       \
+  FORCE_INLINE                                                                          \
+  gdv_##OUT_TYPE cast##TYPE_NAME##_utf8(int64_t context, const char* data,              \
+                                        int32_t len) {                                  \
+    gdv_##OUT_TYPE val = 0;                                                             \
+    int32_t trimmed_len;                                                                \
+    data = trim_utf8(context, data, len, &trimmed_len);                                 \
+    if (!arrow::internal::StringConverter<ARROW_TYPE>::Convert(data, trimmed_len,       \
+                                                               &val)) {                 \
+      std::string err = "Failed to cast the string " + std::string(data, trimmed_len) + \
+                        " to " #OUT_TYPE;                                               \
+      gdv_fn_context_set_error_msg(context, err.c_str());                               \
+    }                                                                                   \
+    return val;                                                                         \
+  }
+
+CAST_NUMERIC_FROM_STRING(int32, arrow::Int32Type, INT)
+CAST_NUMERIC_FROM_STRING(int64, arrow::Int64Type, BIGINT)
+CAST_NUMERIC_FROM_STRING(float32, arrow::FloatType, FLOAT4)
+CAST_NUMERIC_FROM_STRING(float64, arrow::DoubleType, FLOAT8)
+
+// Retire the helper macro defined above so it does not leak past the four cast instantiations.
+#undef CAST_NUMERIC_FROM_STRING
+
 }  // extern "C"
diff --git a/cpp/src/gandiva/precompiled/string_ops_test.cc b/cpp/src/gandiva/precompiled/string_ops_test.cc