Skip to content

Commit 6074a07

Browse files
projjalpraveenbingo
authored andcommitted
ARROW-9343: [C++][Gandiva] CastInt/Float from string functions should handle leading/trailing white spaces
Also refactored the code to remove the parse_float helper function from context. This was done earlier to save the constructor cost when the stringconverter object from arrow/util/parsing was needed to be constructed first. Now that it has changed to parametric functions, its not needed to keep them in the executioncontext. Closes apache#7653 from projjal/parsefloat and squashes the following commits: 073061d <Projjal Chanda> added the string to error message 2b089dd <Projjal Chanda> trim strings before parsing float/int 14de7e6 <Projjal Chanda> refactored gandiva parse float code Authored-by: Projjal Chanda <iam@pchanda.com> Signed-off-by: Praveen <praveen@dremio.com>
1 parent f24dea2 commit 6074a07

9 files changed

Lines changed: 165 additions & 213 deletions

cpp/src/gandiva/context_helper.cc

Lines changed: 0 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -50,22 +50,6 @@ void ExportedContextFunctions::AddMappings(Engine* engine) const {
5050

5151
engine->AddGlobalMappingForFunc("gdv_fn_context_arena_reset", types->void_type(), args,
5252
reinterpret_cast<void*>(gdv_fn_context_arena_reset));
53-
54-
args = {types->i64_type(), // int64_t context_ptr
55-
types->i8_ptr_type(), // const char* data
56-
types->i32_type(), // int32_t lenr
57-
types->ptr_type(types->float_type())}; // float* val
58-
59-
engine->AddGlobalMappingForFunc("gdv_fn_context_parse_float32", types->i1_type(), args,
60-
reinterpret_cast<void*>(gdv_fn_context_parse_float32));
61-
62-
args = {types->i64_type(), // int64_t context_ptr
63-
types->i8_ptr_type(), // const char* data
64-
types->i32_type(), // int32_t lenr
65-
types->ptr_type(types->double_type())}; // double* val
66-
67-
engine->AddGlobalMappingForFunc("gdv_fn_context_parse_float64", types->i1_type(), args,
68-
reinterpret_cast<void*>(gdv_fn_context_parse_float64));
6953
}
7054

7155
} // namespace gandiva
@@ -89,16 +73,4 @@ void gdv_fn_context_arena_reset(int64_t context_ptr) {
8973
auto context = reinterpret_cast<gandiva::ExecutionContext*>(context_ptr);
9074
return context->arena()->Reset();
9175
}
92-
93-
bool gdv_fn_context_parse_float32(int64_t context_ptr, const char* data, int32_t len,
94-
float* val) {
95-
auto context = reinterpret_cast<gandiva::ExecutionContext*>(context_ptr);
96-
return context->parse_float(data, len, val);
97-
}
98-
99-
bool gdv_fn_context_parse_float64(int64_t context_ptr, const char* data, int32_t len,
100-
double* val) {
101-
auto context = reinterpret_cast<gandiva::ExecutionContext*>(context_ptr);
102-
return context->parse_double(data, len, val);
103-
}
10476
}

cpp/src/gandiva/execution_context.h

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -19,16 +19,12 @@
1919

2020
#include <memory>
2121
#include <string>
22-
#include "arrow/util/value_parsing.h"
2322
#include "gandiva/simple_arena.h"
2423

2524
namespace gandiva {
2625

2726
/// Execution context during llvm evaluation
2827
class ExecutionContext {
29-
using FloatConverter = arrow::internal::StringConverter<arrow::FloatType>;
30-
using DoubleConverter = arrow::internal::StringConverter<arrow::DoubleType>;
31-
3228
public:
3329
explicit ExecutionContext(arrow::MemoryPool* pool = arrow::default_memory_pool())
3430
: arena_(pool) {}
@@ -50,14 +46,6 @@ class ExecutionContext {
5046
arena_.Reset();
5147
}
5248

53-
bool parse_float(const char* data, int32_t len, float* val) {
54-
return FloatConverter::Convert(data, len, val);
55-
}
56-
57-
bool parse_double(const char* data, int32_t len, double* val) {
58-
return DoubleConverter::Convert(data, len, val);
59-
}
60-
6149
private:
6250
std::string error_msg_;
6351
SimpleArena arena_;

cpp/src/gandiva/function_registry_arithmetic.cc

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -64,11 +64,6 @@ std::vector<NativeFunction> GetArithmeticFunctionRegistry() {
6464
UNARY_SAFE_NULL_IF_NULL(castDATE, {}, int32, date32),
6565
UNARY_SAFE_NULL_IF_NULL(castDATE, {}, date32, date64),
6666

67-
UNARY_UNSAFE_NULL_IF_NULL(castINT, {}, utf8, int32),
68-
UNARY_UNSAFE_NULL_IF_NULL(castBIGINT, {}, utf8, int64),
69-
UNARY_UNSAFE_NULL_IF_NULL(castFLOAT4, {}, utf8, float32),
70-
UNARY_UNSAFE_NULL_IF_NULL(castFLOAT8, {}, utf8, float64),
71-
7267
// add/sub/multiply/divide/mod
7368
BINARY_SYMMETRIC_FN(add, {}), BINARY_SYMMETRIC_FN(subtract, {}),
7469
BINARY_SYMMETRIC_FN(multiply, {}),

cpp/src/gandiva/function_registry_string.cc

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,11 @@ std::vector<NativeFunction> GetStringFunctionRegistry() {
6060
UNARY_SAFE_NULL_NEVER_BOOL_FN(isnull, {}),
6161
UNARY_SAFE_NULL_NEVER_BOOL_FN(isnotnull, {}),
6262

63+
UNARY_UNSAFE_NULL_IF_NULL(castINT, {}, utf8, int32),
64+
UNARY_UNSAFE_NULL_IF_NULL(castBIGINT, {}, utf8, int64),
65+
UNARY_UNSAFE_NULL_IF_NULL(castFLOAT4, {}, utf8, float32),
66+
UNARY_UNSAFE_NULL_IF_NULL(castFLOAT8, {}, utf8, float64),
67+
6368
NativeFunction("upper", {}, DataTypeVector{utf8()}, utf8(), kResultNullIfNull,
6469
"upper_utf8", NativeFunction::kNeedsContext),
6570

cpp/src/gandiva/gdv_function_stubs.h

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -37,12 +37,6 @@ uint8_t* gdv_fn_context_arena_malloc(int64_t context_ptr, int32_t data_len);
3737

3838
void gdv_fn_context_arena_reset(int64_t context_ptr);
3939

40-
bool gdv_fn_context_parse_float32(int64_t context_ptr, const char* data, int32_t len,
41-
float* val);
42-
43-
bool gdv_fn_context_parse_float64(int64_t context_ptr, const char* data, int32_t len,
44-
double* val);
45-
4640
bool in_expr_lookup_int32(int64_t ptr, int32_t value, bool in_validity);
4741

4842
bool in_expr_lookup_int64(int64_t ptr, int64_t value, bool in_validity);

cpp/src/gandiva/precompiled/arithmetic_ops.cc

Lines changed: 0 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,6 @@
1515
// specific language governing permissions and limitations
1616
// under the License.
1717

18-
#include "arrow/util/value_parsing.h"
19-
2018
extern "C" {
2119

2220
#include <math.h>
@@ -234,36 +232,6 @@ DIV(int64)
234232
DIV_FLOAT(float32)
235233
DIV_FLOAT(float64)
236234

237-
#define CAST_INT_FROM_STRING(OUT_TYPE, ARROW_TYPE, TYPE_NAME) \
238-
FORCE_INLINE \
239-
gdv_##OUT_TYPE cast##TYPE_NAME##_utf8(int64_t context, const char* data, \
240-
int32_t len) { \
241-
gdv_##OUT_TYPE val = 0; \
242-
if (!arrow::internal::StringConverter<ARROW_TYPE>::Convert(data, len, &val)) { \
243-
gdv_fn_context_set_error_msg(context, \
244-
"Failed parsing the string to required format"); \
245-
} \
246-
return val; \
247-
}
248-
249-
CAST_INT_FROM_STRING(int32, arrow::Int32Type, INT)
250-
CAST_INT_FROM_STRING(int64, arrow::Int64Type, BIGINT)
251-
252-
#define CAST_FLOAT_FROM_STRING(OUT_TYPE, ARROW_TYPE, TYPE_NAME) \
253-
FORCE_INLINE \
254-
gdv_##OUT_TYPE cast##TYPE_NAME##_utf8(int64_t context, const char* data, \
255-
int32_t len) { \
256-
gdv_##OUT_TYPE val = 0; \
257-
if (!gdv_fn_context_parse_##OUT_TYPE(context, data, len, &val)) { \
258-
gdv_fn_context_set_error_msg(context, \
259-
"Failed parsing the string to required format"); \
260-
} \
261-
return val; \
262-
}
263-
264-
CAST_FLOAT_FROM_STRING(float32, arrow::FloatType, FLOAT4)
265-
CAST_FLOAT_FROM_STRING(float64, arrow::DoubleType, FLOAT8)
266-
267235
#undef DIV_FLOAT
268236

269237
#undef DATE_FUNCTION
@@ -272,7 +240,5 @@ CAST_FLOAT_FROM_STRING(float64, arrow::DoubleType, FLOAT8)
272240
#undef NUMERIC_DATE_TYPES
273241
#undef NUMERIC_FUNCTION
274242
#undef NUMERIC_TYPES
275-
#undef CAST_INT_FROM_STRING
276-
#undef CAST_FLOAT_FROM_STRING
277243

278244
} // extern "C"

cpp/src/gandiva/precompiled/arithmetic_ops_test.cc

Lines changed: 0 additions & 128 deletions
Original file line numberDiff line numberDiff line change
@@ -101,132 +101,4 @@ TEST(TestArithmeticOps, TestDiv) {
101101
context.Reset();
102102
}
103103

104-
TEST(TestArithmeticOps, TestCastINT) {
105-
gandiva::ExecutionContext ctx;
106-
107-
int64_t ctx_ptr = reinterpret_cast<int64_t>(&ctx);
108-
109-
EXPECT_EQ(castINT_utf8(ctx_ptr, "-45", 3), -45);
110-
EXPECT_EQ(castINT_utf8(ctx_ptr, "0", 1), 0);
111-
EXPECT_EQ(castINT_utf8(ctx_ptr, "2147483647", 10), 2147483647);
112-
EXPECT_EQ(castINT_utf8(ctx_ptr, "02147483647", 11), 2147483647);
113-
EXPECT_EQ(castINT_utf8(ctx_ptr, "-2147483648", 11), -2147483648LL);
114-
EXPECT_EQ(castINT_utf8(ctx_ptr, "-02147483648", 12), -2147483648LL);
115-
116-
castINT_utf8(ctx_ptr, "2147483648", 10);
117-
EXPECT_THAT(ctx.get_error(),
118-
::testing::HasSubstr("Failed parsing the string to required format"));
119-
ctx.Reset();
120-
121-
castINT_utf8(ctx_ptr, "-2147483649", 11);
122-
EXPECT_THAT(ctx.get_error(),
123-
::testing::HasSubstr("Failed parsing the string to required format"));
124-
ctx.Reset();
125-
126-
castINT_utf8(ctx_ptr, "12.34", 5);
127-
EXPECT_THAT(ctx.get_error(),
128-
::testing::HasSubstr("Failed parsing the string to required format"));
129-
ctx.Reset();
130-
131-
castINT_utf8(ctx_ptr, "abc", 3);
132-
EXPECT_THAT(ctx.get_error(),
133-
::testing::HasSubstr("Failed parsing the string to required format"));
134-
ctx.Reset();
135-
136-
castINT_utf8(ctx_ptr, "", 0);
137-
EXPECT_THAT(ctx.get_error(),
138-
::testing::HasSubstr("Failed parsing the string to required format"));
139-
ctx.Reset();
140-
141-
castINT_utf8(ctx_ptr, "-", 1);
142-
EXPECT_THAT(ctx.get_error(),
143-
::testing::HasSubstr("Failed parsing the string to required format"));
144-
ctx.Reset();
145-
}
146-
147-
TEST(TestArithmeticOps, TestCastBIGINT) {
148-
gandiva::ExecutionContext ctx;
149-
150-
int64_t ctx_ptr = reinterpret_cast<int64_t>(&ctx);
151-
152-
EXPECT_EQ(castBIGINT_utf8(ctx_ptr, "-45", 3), -45);
153-
EXPECT_EQ(castBIGINT_utf8(ctx_ptr, "0", 1), 0);
154-
EXPECT_EQ(castBIGINT_utf8(ctx_ptr, "9223372036854775807", 19), 9223372036854775807LL);
155-
EXPECT_EQ(castBIGINT_utf8(ctx_ptr, "09223372036854775807", 20), 9223372036854775807LL);
156-
EXPECT_EQ(castBIGINT_utf8(ctx_ptr, "-9223372036854775808", 20),
157-
-9223372036854775807LL - 1);
158-
EXPECT_EQ(castBIGINT_utf8(ctx_ptr, "-009223372036854775808", 22),
159-
-9223372036854775807LL - 1);
160-
161-
castBIGINT_utf8(ctx_ptr, "9223372036854775808", 19);
162-
EXPECT_THAT(ctx.get_error(),
163-
::testing::HasSubstr("Failed parsing the string to required format"));
164-
ctx.Reset();
165-
166-
castBIGINT_utf8(ctx_ptr, "-9223372036854775809", 20);
167-
EXPECT_THAT(ctx.get_error(),
168-
::testing::HasSubstr("Failed parsing the string to required format"));
169-
ctx.Reset();
170-
171-
castBIGINT_utf8(ctx_ptr, "12.34", 5);
172-
EXPECT_THAT(ctx.get_error(),
173-
::testing::HasSubstr("Failed parsing the string to required format"));
174-
ctx.Reset();
175-
176-
castBIGINT_utf8(ctx_ptr, "abc", 3);
177-
EXPECT_THAT(ctx.get_error(),
178-
::testing::HasSubstr("Failed parsing the string to required format"));
179-
ctx.Reset();
180-
181-
castBIGINT_utf8(ctx_ptr, "", 0);
182-
EXPECT_THAT(ctx.get_error(),
183-
::testing::HasSubstr("Failed parsing the string to required format"));
184-
ctx.Reset();
185-
186-
castBIGINT_utf8(ctx_ptr, "-", 1);
187-
EXPECT_THAT(ctx.get_error(),
188-
::testing::HasSubstr("Failed parsing the string to required format"));
189-
ctx.Reset();
190-
}
191-
192-
TEST(TestArithmeticOps, TestCastFloat4) {
193-
gandiva::ExecutionContext ctx;
194-
195-
int64_t ctx_ptr = reinterpret_cast<int64_t>(&ctx);
196-
197-
EXPECT_EQ(castFLOAT4_utf8(ctx_ptr, "-45.34", 6), -45.34f);
198-
EXPECT_EQ(castFLOAT4_utf8(ctx_ptr, "0", 1), 0.0f);
199-
EXPECT_EQ(castFLOAT4_utf8(ctx_ptr, "5", 1), 5.0f);
200-
201-
castFLOAT4_utf8(ctx_ptr, "", 0);
202-
EXPECT_THAT(ctx.get_error(),
203-
::testing::HasSubstr("Failed parsing the string to required format"));
204-
ctx.Reset();
205-
206-
castFLOAT4_utf8(ctx_ptr, "e", 1);
207-
EXPECT_THAT(ctx.get_error(),
208-
::testing::HasSubstr("Failed parsing the string to required format"));
209-
ctx.Reset();
210-
}
211-
212-
TEST(TestParseStringHolder, TestCastFloat8) {
213-
gandiva::ExecutionContext ctx;
214-
215-
int64_t ctx_ptr = reinterpret_cast<int64_t>(&ctx);
216-
217-
EXPECT_EQ(castFLOAT8_utf8(ctx_ptr, "-45.34", 6), -45.34);
218-
EXPECT_EQ(castFLOAT8_utf8(ctx_ptr, "0", 1), 0.0);
219-
EXPECT_EQ(castFLOAT8_utf8(ctx_ptr, "5", 1), 5.0);
220-
221-
castFLOAT8_utf8(ctx_ptr, "", 0);
222-
EXPECT_THAT(ctx.get_error(),
223-
::testing::HasSubstr("Failed parsing the string to required format"));
224-
ctx.Reset();
225-
226-
castFLOAT8_utf8(ctx_ptr, "e", 1);
227-
EXPECT_THAT(ctx.get_error(),
228-
::testing::HasSubstr("Failed parsing the string to required format"));
229-
ctx.Reset();
230-
}
231-
232104
} // namespace gandiva

cpp/src/gandiva/precompiled/string_ops.cc

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@
1717

1818
// String functions
1919

20+
#include "arrow/util/value_parsing.h"
21+
2022
extern "C" {
2123

2224
#include <limits.h>
@@ -672,4 +674,28 @@ const char* replace_utf8_utf8_utf8(gdv_int64 context, const char* text,
672674
out_len);
673675
}
674676

677+
#define CAST_NUMERIC_FROM_STRING(OUT_TYPE, ARROW_TYPE, TYPE_NAME) \
678+
FORCE_INLINE \
679+
gdv_##OUT_TYPE cast##TYPE_NAME##_utf8(int64_t context, const char* data, \
680+
int32_t len) { \
681+
gdv_##OUT_TYPE val = 0; \
682+
int32_t trimmed_len; \
683+
data = trim_utf8(context, data, len, &trimmed_len); \
684+
if (!arrow::internal::StringConverter<ARROW_TYPE>::Convert(data, trimmed_len, \
685+
&val)) { \
686+
std::string err = "Failed to cast the string " + std::string(data, trimmed_len) + \
687+
" to " #OUT_TYPE; \
688+
gdv_fn_context_set_error_msg(context, err.c_str()); \
689+
} \
690+
return val; \
691+
}
692+
693+
CAST_NUMERIC_FROM_STRING(int32, arrow::Int32Type, INT)
694+
CAST_NUMERIC_FROM_STRING(int64, arrow::Int64Type, BIGINT)
695+
CAST_NUMERIC_FROM_STRING(float32, arrow::FloatType, FLOAT4)
696+
CAST_NUMERIC_FROM_STRING(float64, arrow::DoubleType, FLOAT8)
697+
698+
#undef CAST_INT_FROM_STRING
699+
#undef CAST_FLOAT_FROM_STRING
700+
675701
} // extern "C"

0 commit comments

Comments
 (0)