Skip to content

Commit c4e53e0

Browse files
jvictorhuguenin authored and praveenbingo committed
ARROW-12388: [C++][Gandiva] Implement cast numbers from varbinary functions in gandiva
Closes apache#10033 from jvictorhuguenin/feature/add-cast-numbers-from-varbinary and squashes the following commits: 63d9635 <frank400> Fix projector bad construction 52bf306 <frank400> Fix checkstyle 6641e1e <frank400> Remove miss placed tests 4bef9c3 <frank400> Fix checkstyle 07c75dd <frank400> Fix unnecessary functions 9a0a32c <frank400> Add tests with hex strings 7fb41bd <frank400> Add initial support for castFLOAT4 and castFLOAT8 for varbinary 71a3265 <frank400> Restructures the castINT and castBIGINT functions implementation 578aac9 <frank400> Fix checkstyle a61388f <frank400> Unify macros used to cast numbers from strings and varbinary 4bf6a53 <frank400> Add java tests 41a147c <frank400> Implement cast varbinary to number types Authored-by: frank400 <j.victorhuguenin2018@gmail.com> Signed-off-by: Praveen <praveen@dremio.com>
1 parent 7ee8edb commit c4e53e0

8 files changed

Lines changed: 391 additions & 3 deletions

File tree

cpp/src/gandiva/function_registry_string.cc

Lines changed: 16 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -102,6 +102,22 @@ std::vector<NativeFunction> GetStringFunctionRegistry() {
102102
kResultNullIfNull, "gdv_fn_castFLOAT8_utf8",
103103
NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors),
104104

105+
NativeFunction("castINT", {}, DataTypeVector{binary()}, int32(), kResultNullIfNull,
106+
"gdv_fn_castINT_varbinary",
107+
NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors),
108+
109+
NativeFunction("castBIGINT", {}, DataTypeVector{binary()}, int64(),
110+
kResultNullIfNull, "gdv_fn_castBIGINT_varbinary",
111+
NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors),
112+
113+
NativeFunction("castFLOAT4", {}, DataTypeVector{binary()}, float32(),
114+
kResultNullIfNull, "gdv_fn_castFLOAT4_varbinary",
115+
NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors),
116+
117+
NativeFunction("castFLOAT8", {}, DataTypeVector{binary()}, float64(),
118+
kResultNullIfNull, "gdv_fn_castFLOAT8_varbinary",
119+
NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors),
120+
105121
NativeFunction("castVARCHAR", {}, DataTypeVector{boolean(), int64()}, utf8(),
106122
kResultNullIfNull, "castVARCHAR_bool_int64",
107123
NativeFunction::kNeedsContext),

cpp/src/gandiva/gdv_function_stubs.cc

Lines changed: 47 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -23,6 +23,7 @@
2323
#include <vector>
2424

2525
#include "arrow/util/base64.h"
26+
#include "arrow/util/double_conversion.h"
2627
#include "arrow/util/formatting.h"
2728
#include "arrow/util/utf8.h"
2829
#include "arrow/util/value_parsing.h"
@@ -364,10 +365,10 @@ const char* gdv_fn_base64_decode_utf8(int64_t context, const char* in, int32_t i
364365
return ret;
365366
}
366367

367-
#define CAST_NUMERIC_FROM_STRING(OUT_TYPE, ARROW_TYPE, TYPE_NAME) \
368+
#define CAST_NUMERIC_FROM_VARLEN_TYPES(OUT_TYPE, ARROW_TYPE, TYPE_NAME, INNER_TYPE) \
368369
GANDIVA_EXPORT \
369-
OUT_TYPE gdv_fn_cast##TYPE_NAME##_utf8(int64_t context, const char* data, \
370-
int32_t len) { \
370+
OUT_TYPE gdv_fn_cast##TYPE_NAME##_##INNER_TYPE(int64_t context, const char* data, \
371+
int32_t len) { \
371372
OUT_TYPE val = 0; \
372373
/* trim leading and trailing spaces */ \
373374
int32_t trimmed_len; \
@@ -388,13 +389,26 @@ const char* gdv_fn_base64_decode_utf8(int64_t context, const char* in, int32_t i
388389
return val; \
389390
}
390391

392+
// Thin wrapper over CAST_NUMERIC_FROM_VARLEN_TYPES that fixes the input-type
// suffix to `utf8`, so each use below emits gdv_fn_cast<TYPE_NAME>_utf8.
//   OUT_TYPE   - C++ type returned by the generated function
//   ARROW_TYPE - Arrow type class forwarded to the underlying macro
//   TYPE_NAME  - name fragment spliced into the generated function name
#define CAST_NUMERIC_FROM_STRING(OUT_TYPE, ARROW_TYPE, TYPE_NAME) \
393+
CAST_NUMERIC_FROM_VARLEN_TYPES(OUT_TYPE, ARROW_TYPE, TYPE_NAME, utf8)
394+
391395
// One generated utf8 cast per supported numeric output type.
CAST_NUMERIC_FROM_STRING(int32_t, arrow::Int32Type, INT)
392396
CAST_NUMERIC_FROM_STRING(int64_t, arrow::Int64Type, BIGINT)
393397
CAST_NUMERIC_FROM_STRING(float, arrow::FloatType, FLOAT4)
394398
CAST_NUMERIC_FROM_STRING(double, arrow::DoubleType, FLOAT8)
395399

396400
// The wrapper is only needed for the instantiations above.
#undef CAST_NUMERIC_FROM_STRING
397401

402+
#define CAST_NUMERIC_FROM_VARBINARY(OUT_TYPE, ARROW_TYPE, TYPE_NAME) \
403+
CAST_NUMERIC_FROM_VARLEN_TYPES(OUT_TYPE, ARROW_TYPE, TYPE_NAME, varbinary)
404+
405+
CAST_NUMERIC_FROM_VARBINARY(int32_t, arrow::Int32Type, INT)
406+
CAST_NUMERIC_FROM_VARBINARY(int64_t, arrow::Int64Type, BIGINT)
407+
CAST_NUMERIC_FROM_VARBINARY(float, arrow::FloatType, FLOAT4)
408+
CAST_NUMERIC_FROM_VARBINARY(double, arrow::DoubleType, FLOAT8)
409+
410+
#undef CAST_NUMERIC_STRING
411+
398412
#define GDV_FN_CAST_VARLEN_TYPE_FROM_INTEGER(IN_TYPE, CAST_NAME, ARROW_TYPE) \
399413
GANDIVA_EXPORT \
400414
const char* gdv_fn_cast##CAST_NAME##_##IN_TYPE##_int64( \
@@ -1056,6 +1070,36 @@ void ExportedStubFunctions::AddMappings(Engine* engine) const {
10561070
"gdv_fn_castVARCHAR_float64_int64", types->i8_ptr_type() /*return_type*/, args,
10571071
reinterpret_cast<void*>(gdv_fn_castVARCHAR_float64_int64));
10581072

1073+
args = {types->i64_type(), // int64_t context_ptr
1074+
types->i8_ptr_type(), // const char* data
1075+
types->i32_type()}; // int32_t len
1076+
1077+
engine->AddGlobalMappingForFunc("gdv_fn_castINT_varbinary", types->i32_type(), args,
1078+
reinterpret_cast<void*>(gdv_fn_castINT_varbinary));
1079+
1080+
args = {types->i64_type(), // int64_t context_ptr
1081+
types->i8_ptr_type(), // const char* data
1082+
types->i32_type()}; // int32_t len
1083+
1084+
engine->AddGlobalMappingForFunc("gdv_fn_castBIGINT_varbinary", types->i64_type(), args,
1085+
reinterpret_cast<void*>(gdv_fn_castBIGINT_varbinary));
1086+
1087+
args = {types->i64_type(), // int64_t context_ptr
1088+
types->i8_ptr_type(), // const char* data
1089+
types->i32_type()}; // int32_t len
1090+
1091+
engine->AddGlobalMappingForFunc("gdv_fn_castFLOAT4_varbinary", types->float_type(),
1092+
args,
1093+
reinterpret_cast<void*>(gdv_fn_castFLOAT4_varbinary));
1094+
1095+
args = {types->i64_type(), // int64_t context_ptr
1096+
types->i8_ptr_type(), // const char* data
1097+
types->i32_type()}; // int32_t len
1098+
1099+
engine->AddGlobalMappingForFunc("gdv_fn_castFLOAT8_varbinary", types->double_type(),
1100+
args,
1101+
reinterpret_cast<void*>(gdv_fn_castFLOAT8_varbinary));
1102+
10591103
// gdv_fn_sha1_int8
10601104
args = {
10611105
types->i64_type(), // context

cpp/src/gandiva/gdv_function_stubs.h

Lines changed: 12 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -157,4 +157,16 @@ const char* gdv_fn_lower_utf8(int64_t context, const char* data, int32_t data_le
157157
GANDIVA_EXPORT
158158
const char* gdv_fn_initcap_utf8(int64_t context, const char* data, int32_t data_len,
159159
int32_t* out_len);
160+
161+
// Casts the in_len bytes at `in`, read as text, to int32.
// `context` is the execution-context handle passed as an integer; a failed cast
// is reported through it as an error message.
// NOTE(review): the value returned on failure is not visible here - confirm
// against the definition before relying on it.
GANDIVA_EXPORT
162+
int32_t gdv_fn_castINT_varbinary(gdv_int64 context, const char* in, int32_t in_len);
163+
164+
// Same contract as gdv_fn_castINT_varbinary, producing an int64.
GANDIVA_EXPORT
165+
int64_t gdv_fn_castBIGINT_varbinary(gdv_int64 context, const char* in, int32_t in_len);
166+
167+
// Same contract as gdv_fn_castINT_varbinary, producing a float.
GANDIVA_EXPORT
168+
float gdv_fn_castFLOAT4_varbinary(gdv_int64 context, const char* in, int32_t in_len);
169+
170+
// Same contract as gdv_fn_castINT_varbinary, producing a double.
GANDIVA_EXPORT
171+
double gdv_fn_castFLOAT8_varbinary(gdv_int64 context, const char* in, int32_t in_len);
160172
}

cpp/src/gandiva/gdv_function_stubs_test.cc

Lines changed: 143 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -623,4 +623,147 @@ TEST(TestGdvFnStubs, TestInitCap) {
623623
"unexpected byte \\e0 encountered while decoding utf8 string"));
624624
ctx.Reset();
625625
}
626+
627+
TEST(TestGdvFnStubs, TestCastVarbinaryINT) {
628+
gandiva::ExecutionContext ctx;
629+
630+
int64_t ctx_ptr = reinterpret_cast<int64_t>(&ctx);
631+
632+
EXPECT_EQ(gdv_fn_castINT_varbinary(ctx_ptr, "-45", 3), -45);
633+
EXPECT_EQ(gdv_fn_castINT_varbinary(ctx_ptr, "0", 1), 0);
634+
EXPECT_EQ(gdv_fn_castINT_varbinary(ctx_ptr, "2147483647", 10), 2147483647);
635+
EXPECT_EQ(gdv_fn_castINT_varbinary(ctx_ptr, "\x32\x33", 2), 23);
636+
EXPECT_EQ(gdv_fn_castINT_varbinary(ctx_ptr, "02147483647", 11), 2147483647);
637+
EXPECT_EQ(gdv_fn_castINT_varbinary(ctx_ptr, "-2147483648", 11), -2147483648LL);
638+
EXPECT_EQ(gdv_fn_castINT_varbinary(ctx_ptr, "-02147483648", 12), -2147483648LL);
639+
EXPECT_EQ(gdv_fn_castINT_varbinary(ctx_ptr, " 12 ", 4), 12);
640+
641+
gdv_fn_castINT_varbinary(ctx_ptr, "2147483648", 10);
642+
EXPECT_THAT(ctx.get_error(),
643+
::testing::HasSubstr("Failed to cast the string 2147483648 to int32"));
644+
ctx.Reset();
645+
646+
gdv_fn_castINT_varbinary(ctx_ptr, "-2147483649", 11);
647+
EXPECT_THAT(ctx.get_error(),
648+
::testing::HasSubstr("Failed to cast the string -2147483649 to int32"));
649+
ctx.Reset();
650+
651+
gdv_fn_castINT_varbinary(ctx_ptr, "12.34", 5);
652+
EXPECT_THAT(ctx.get_error(),
653+
::testing::HasSubstr("Failed to cast the string 12.34 to int32"));
654+
ctx.Reset();
655+
656+
gdv_fn_castINT_varbinary(ctx_ptr, "abc", 3);
657+
EXPECT_THAT(ctx.get_error(),
658+
::testing::HasSubstr("Failed to cast the string abc to int32"));
659+
ctx.Reset();
660+
661+
gdv_fn_castINT_varbinary(ctx_ptr, "", 0);
662+
EXPECT_THAT(ctx.get_error(),
663+
::testing::HasSubstr("Failed to cast the string to int32"));
664+
ctx.Reset();
665+
666+
gdv_fn_castINT_varbinary(ctx_ptr, "-", 1);
667+
EXPECT_THAT(ctx.get_error(),
668+
::testing::HasSubstr("Failed to cast the string - to int32"));
669+
ctx.Reset();
670+
}
671+
672+
TEST(TestGdvFnStubs, TestCastVarbinaryBIGINT) {
673+
gandiva::ExecutionContext ctx;
674+
675+
int64_t ctx_ptr = reinterpret_cast<int64_t>(&ctx);
676+
677+
EXPECT_EQ(gdv_fn_castBIGINT_varbinary(ctx_ptr, "-45", 3), -45);
678+
EXPECT_EQ(gdv_fn_castBIGINT_varbinary(ctx_ptr, "0", 1), 0);
679+
EXPECT_EQ(gdv_fn_castBIGINT_varbinary(ctx_ptr, "9223372036854775807", 19),
680+
9223372036854775807LL);
681+
EXPECT_EQ(gdv_fn_castBIGINT_varbinary(ctx_ptr, "09223372036854775807", 20),
682+
9223372036854775807LL);
683+
EXPECT_EQ(gdv_fn_castBIGINT_varbinary(ctx_ptr, "-9223372036854775808", 20),
684+
-9223372036854775807LL - 1);
685+
EXPECT_EQ(gdv_fn_castBIGINT_varbinary(ctx_ptr, "-009223372036854775808", 22),
686+
-9223372036854775807LL - 1);
687+
EXPECT_EQ(gdv_fn_castBIGINT_varbinary(ctx_ptr, " 12 ", 4), 12);
688+
689+
EXPECT_EQ(gdv_fn_castBIGINT_varbinary(ctx_ptr,
690+
"\x39\x39\x39\x39\x39\x39\x39\x39\x39\x39", 10),
691+
9999999999LL);
692+
693+
gdv_fn_castBIGINT_varbinary(ctx_ptr, "9223372036854775808", 19);
694+
EXPECT_THAT(
695+
ctx.get_error(),
696+
::testing::HasSubstr("Failed to cast the string 9223372036854775808 to int64"));
697+
ctx.Reset();
698+
699+
gdv_fn_castBIGINT_varbinary(ctx_ptr, "-9223372036854775809", 20);
700+
EXPECT_THAT(
701+
ctx.get_error(),
702+
::testing::HasSubstr("Failed to cast the string -9223372036854775809 to int64"));
703+
ctx.Reset();
704+
705+
gdv_fn_castBIGINT_varbinary(ctx_ptr, "12.34", 5);
706+
EXPECT_THAT(ctx.get_error(),
707+
::testing::HasSubstr("Failed to cast the string 12.34 to int64"));
708+
ctx.Reset();
709+
710+
gdv_fn_castBIGINT_varbinary(ctx_ptr, "abc", 3);
711+
EXPECT_THAT(ctx.get_error(),
712+
::testing::HasSubstr("Failed to cast the string abc to int64"));
713+
ctx.Reset();
714+
715+
gdv_fn_castBIGINT_varbinary(ctx_ptr, "", 0);
716+
EXPECT_THAT(ctx.get_error(),
717+
::testing::HasSubstr("Failed to cast the string to int64"));
718+
ctx.Reset();
719+
720+
gdv_fn_castBIGINT_varbinary(ctx_ptr, "-", 1);
721+
EXPECT_THAT(ctx.get_error(),
722+
::testing::HasSubstr("Failed to cast the string - to int64"));
723+
ctx.Reset();
724+
}
725+
726+
TEST(TestGdvFnStubs, TestCastVarbinaryFloat4) {
727+
gandiva::ExecutionContext ctx;
728+
729+
int64_t ctx_ptr = reinterpret_cast<int64_t>(&ctx);
730+
731+
EXPECT_EQ(gdv_fn_castFLOAT4_varbinary(ctx_ptr, "-45.34", 6), -45.34f);
732+
EXPECT_EQ(gdv_fn_castFLOAT4_varbinary(ctx_ptr, "0", 1), 0.0f);
733+
EXPECT_EQ(gdv_fn_castFLOAT4_varbinary(ctx_ptr, "5", 1), 5.0f);
734+
EXPECT_EQ(gdv_fn_castFLOAT4_varbinary(ctx_ptr, " 3.4 ", 5), 3.4f);
735+
EXPECT_EQ(gdv_fn_castFLOAT4_varbinary(ctx_ptr, " \x33\x2E\x34 ", 5), 3.4f);
736+
737+
gdv_fn_castFLOAT4_varbinary(ctx_ptr, "", 0);
738+
EXPECT_THAT(ctx.get_error(),
739+
::testing::HasSubstr("Failed to cast the string to float"));
740+
ctx.Reset();
741+
742+
gdv_fn_castFLOAT4_varbinary(ctx_ptr, "e", 1);
743+
EXPECT_THAT(ctx.get_error(),
744+
::testing::HasSubstr("Failed to cast the string e to float"));
745+
ctx.Reset();
746+
}
747+
748+
TEST(TestGdvFnStubs, TestCastVarbinaryFloat8) {
749+
gandiva::ExecutionContext ctx;
750+
751+
int64_t ctx_ptr = reinterpret_cast<int64_t>(&ctx);
752+
753+
EXPECT_EQ(gdv_fn_castFLOAT8_varbinary(ctx_ptr, "-45.34", 6), -45.34);
754+
EXPECT_EQ(gdv_fn_castFLOAT8_varbinary(ctx_ptr, "0", 1), 0.0);
755+
EXPECT_EQ(gdv_fn_castFLOAT8_varbinary(ctx_ptr, "5", 1), 5.0);
756+
EXPECT_EQ(gdv_fn_castFLOAT8_varbinary(ctx_ptr, " \x33\x2E\x34 ", 5), 3.4);
757+
758+
gdv_fn_castFLOAT8_varbinary(ctx_ptr, "", 0);
759+
EXPECT_THAT(ctx.get_error(),
760+
::testing::HasSubstr("Failed to cast the string to double"));
761+
ctx.Reset();
762+
763+
gdv_fn_castFLOAT8_varbinary(ctx_ptr, "e", 1);
764+
EXPECT_THAT(ctx.get_error(),
765+
::testing::HasSubstr("Failed to cast the string e to double"));
766+
ctx.Reset();
767+
}
768+
626769
} // namespace gandiva

cpp/src/gandiva/precompiled/string_ops.cc

Lines changed: 63 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2052,6 +2052,69 @@ const char* binary_string(gdv_int64 context, const char* text, gdv_int32 text_le
20522052
return ret;
20532053
}
20542054

2055+
#define CAST_INT_BIGINT_VARBINARY(OUT_TYPE, TYPE_NAME) \
2056+
FORCE_INLINE \
2057+
OUT_TYPE \
2058+
cast##TYPE_NAME##_varbinary(gdv_int64 context, const char* in, int32_t in_len) { \
2059+
if (in_len == 0) { \
2060+
gdv_fn_context_set_error_msg(context, "Can't cast an empty string."); \
2061+
return -1; \
2062+
} \
2063+
char sign = in[0]; \
2064+
\
2065+
bool negative = false; \
2066+
if (sign == '-') { \
2067+
negative = true; \
2068+
/* Ignores the sign char in the hexadecimal string */ \
2069+
in++; \
2070+
in_len--; \
2071+
} \
2072+
\
2073+
if (negative && in_len == 0) { \
2074+
gdv_fn_context_set_error_msg(context, \
2075+
"Can't cast hexadecimal with only a minus sign."); \
2076+
return -1; \
2077+
} \
2078+
\
2079+
OUT_TYPE result = 0; \
2080+
int digit; \
2081+
\
2082+
int read_index = 0; \
2083+
while (read_index < in_len) { \
2084+
char c1 = in[read_index]; \
2085+
if (isxdigit(c1)) { \
2086+
digit = to_binary_from_hex(c1); \
2087+
\
2088+
OUT_TYPE next = result * 16 - digit; \
2089+
\
2090+
if (next > result) { \
2091+
gdv_fn_context_set_error_msg(context, "Integer overflow."); \
2092+
return -1; \
2093+
} \
2094+
result = next; \
2095+
read_index++; \
2096+
} else { \
2097+
gdv_fn_context_set_error_msg(context, \
2098+
"The hexadecimal given has invalid characters."); \
2099+
return -1; \
2100+
} \
2101+
} \
2102+
if (!negative) { \
2103+
result *= -1; \
2104+
\
2105+
if (result < 0) { \
2106+
gdv_fn_context_set_error_msg(context, "Integer overflow."); \
2107+
return -1; \
2108+
} \
2109+
} \
2110+
return result; \
2111+
}
2112+
2113+
CAST_INT_BIGINT_VARBINARY(int32_t, INT)
2114+
CAST_INT_BIGINT_VARBINARY(int64_t, BIGINT)
2115+
2116+
#undef CAST_INT_BIGINT_VARBINARY
2117+
20552118
// Produces the binary representation of a string y characters long derived by starting
20562119
// at offset 'x' and considering the defined length 'y'. Notice that the offset index
20572120
// may be a negative number (starting from the end of the string), or a positive number

cpp/src/gandiva/precompiled/string_ops_test.cc

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1683,4 +1683,5 @@ TEST(TestStringOps, TestConvertToBigEndian) {
16831683
}
16841684
#endif
16851685
}
1686+
16861687
} // namespace gandiva

0 commit comments

Comments
 (0)