Skip to content

Commit 7d17a5b

Browse files
authored
apacheGH-15239: [C++][Parquet] Parquet writer writes decimal as int32/64 (apache#15244)
As the parquet [specs](https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#decimal) states, DECIMAL can be used to annotate the following types: - int32: for 1 <= precision <= 9 - int64: for 1 <= precision <= 18; precision < 10 will produce a warning - fixed_len_byte_array: precision is limited by the array size. Length n can store <= floor(log_10(2^(8*n - 1) - 1)) base-10 digits - binary: precision is not limited, but is required. The minimum number of bytes to store the unscaled value should be used. The aim of this patch is to provide a writer option to use int32 to annotate decimal when 1 <= precision <= 9 and int64 when 10 <= precision <= 18. * Closes: apache#15239 Authored-by: Gang Wu <ustcwg@gmail.com> Signed-off-by: Will Jones <willjones127@gmail.com>
1 parent 0da51b7 commit 7d17a5b

6 files changed

Lines changed: 215 additions & 32 deletions

File tree

cpp/src/parquet/arrow/arrow_reader_writer_test.cc

Lines changed: 97 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,6 @@
2525

2626
#include <cstdint>
2727
#include <functional>
28-
#include <iostream>
2928
#include <sstream>
3029
#include <vector>
3130

@@ -3432,6 +3431,25 @@ TEST(ArrowReadWrite, NestedRequiredOuterOptionalDecimal) {
34323431
}
34333432
}
34343433

3434+
TEST(ArrowReadWrite, Decimal256AsInt) {
3435+
using ::arrow::Decimal256;
3436+
using ::arrow::field;
3437+
3438+
auto type = ::arrow::decimal256(8, 4);
3439+
3440+
const char* json = R"(["1.0000", null, "-1.2345", "-1000.5678",
3441+
"-9999.9999", "9999.9999"])";
3442+
auto array = ::arrow::ArrayFromJSON(type, json);
3443+
auto table = ::arrow::Table::Make(::arrow::schema({field("root", type)}), {array});
3444+
3445+
parquet::WriterProperties::Builder builder;
3446+
// Enforce integer type to annotate decimal type
3447+
auto writer_properties = builder.enable_integer_annotate_decimal()->build();
3448+
auto props_store_schema = ArrowWriterProperties::Builder().store_schema()->build();
3449+
3450+
CheckConfiguredRoundtrip(table, table, writer_properties, props_store_schema);
3451+
}
3452+
34353453
class TestNestedSchemaRead : public ::testing::TestWithParam<Repetition::type> {
34363454
protected:
34373455
// make it *3 to make it easily divisible by 3
@@ -4796,5 +4814,83 @@ std::vector<NestedFilterTestCase> GenerateMapFilteredTestCases() {
47964814
INSTANTIATE_TEST_SUITE_P(MapFilteredReads, TestNestedSchemaFilteredReader,
47974815
::testing::ValuesIn(GenerateMapFilteredTestCases()));
47984816

4817+
template <typename TestType>
4818+
class TestIntegerAnnotateDecimalTypeParquetIO : public TestParquetIO<TestType> {
4819+
public:
4820+
void WriteColumn(const std::shared_ptr<Array>& values) {
4821+
auto arrow_schema = ::arrow::schema({::arrow::field("a", values->type())});
4822+
4823+
parquet::WriterProperties::Builder builder;
4824+
// Enforce integer type to annotate decimal type
4825+
auto writer_properties = builder.enable_integer_annotate_decimal()->build();
4826+
std::shared_ptr<SchemaDescriptor> parquet_schema;
4827+
ASSERT_OK_NO_THROW(ToParquetSchema(arrow_schema.get(), *writer_properties,
4828+
*default_arrow_writer_properties(),
4829+
&parquet_schema));
4830+
4831+
this->sink_ = CreateOutputStream();
4832+
auto schema_node = std::static_pointer_cast<GroupNode>(parquet_schema->schema_root());
4833+
4834+
std::unique_ptr<FileWriter> writer;
4835+
ASSERT_OK_NO_THROW(FileWriter::Make(
4836+
::arrow::default_memory_pool(),
4837+
ParquetFileWriter::Open(this->sink_, schema_node, writer_properties),
4838+
arrow_schema, default_arrow_writer_properties(), &writer));
4839+
ASSERT_OK_NO_THROW(writer->NewRowGroup(values->length()));
4840+
ASSERT_OK_NO_THROW(writer->WriteColumnChunk(*values));
4841+
ASSERT_OK_NO_THROW(writer->Close());
4842+
}
4843+
4844+
void ReadAndCheckSingleDecimalColumnFile(const Array& values) {
4845+
std::shared_ptr<Array> out;
4846+
std::unique_ptr<FileReader> reader;
4847+
this->ReaderFromSink(&reader);
4848+
this->ReadSingleColumnFile(std::move(reader), &out);
4849+
4850+
// Reader always read values as DECIMAL128 type
4851+
ASSERT_EQ(out->type()->id(), ::arrow::Type::DECIMAL128);
4852+
4853+
if (values.type()->id() == ::arrow::Type::DECIMAL128) {
4854+
AssertArraysEqual(values, *out);
4855+
} else {
4856+
auto& expected_values = dynamic_cast<const ::arrow::Decimal256Array&>(values);
4857+
auto& read_values = dynamic_cast<const ::arrow::Decimal128Array&>(*out);
4858+
ASSERT_EQ(expected_values.length(), read_values.length());
4859+
ASSERT_EQ(expected_values.null_count(), read_values.null_count());
4860+
ASSERT_EQ(expected_values.length(), read_values.length());
4861+
for (int64_t i = 0; i < expected_values.length(); ++i) {
4862+
ASSERT_EQ(expected_values.IsNull(i), read_values.IsNull(i));
4863+
if (!expected_values.IsNull(i)) {
4864+
ASSERT_EQ(::arrow::Decimal256(expected_values.Value(i)).ToString(0),
4865+
::arrow::Decimal128(read_values.Value(i)).ToString(0));
4866+
}
4867+
}
4868+
}
4869+
}
4870+
};
4871+
4872+
typedef ::testing::Types<
4873+
DecimalWithPrecisionAndScale<1>, DecimalWithPrecisionAndScale<5>,
4874+
DecimalWithPrecisionAndScale<10>, DecimalWithPrecisionAndScale<18>,
4875+
Decimal256WithPrecisionAndScale<1>, Decimal256WithPrecisionAndScale<5>,
4876+
Decimal256WithPrecisionAndScale<10>, Decimal256WithPrecisionAndScale<18>>
4877+
DecimalTestTypes;
4878+
4879+
TYPED_TEST_SUITE(TestIntegerAnnotateDecimalTypeParquetIO, DecimalTestTypes);
4880+
4881+
TYPED_TEST(TestIntegerAnnotateDecimalTypeParquetIO, SingleNonNullableDecimalColumn) {
4882+
std::shared_ptr<Array> values;
4883+
ASSERT_OK(NonNullArray<TypeParam>(SMALL_SIZE, &values));
4884+
ASSERT_NO_FATAL_FAILURE(this->WriteColumn(values));
4885+
ASSERT_NO_FATAL_FAILURE(this->ReadAndCheckSingleDecimalColumnFile(*values));
4886+
}
4887+
4888+
TYPED_TEST(TestIntegerAnnotateDecimalTypeParquetIO, SingleNullableDecimalColumn) {
4889+
std::shared_ptr<Array> values;
4890+
ASSERT_OK(NullableArray<TypeParam>(SMALL_SIZE, SMALL_SIZE / 2, kDefaultSeed, &values));
4891+
ASSERT_NO_FATAL_FAILURE(this->WriteColumn(values));
4892+
ASSERT_NO_FATAL_FAILURE(this->ReadAndCheckSingleDecimalColumnFile(*values));
4893+
}
4894+
47994895
} // namespace arrow
48004896
} // namespace parquet

cpp/src/parquet/arrow/reader_internal.cc

Lines changed: 25 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -638,20 +638,15 @@ struct DecimalConverter<DecimalArrayType, ByteArrayType> {
638638
/// small enough to fit in less 4 bytes or less than 8 bytes, respectively.
639639
/// This function implements the conversion from int32 and int64 arrays to decimal arrays.
640640
template <
641-
typename ParquetIntegerType,
641+
typename DecimalArrayType, typename ParquetIntegerType,
642642
typename = ::arrow::enable_if_t<std::is_same<ParquetIntegerType, Int32Type>::value ||
643643
std::is_same<ParquetIntegerType, Int64Type>::value>>
644644
static Status DecimalIntegerTransfer(RecordReader* reader, MemoryPool* pool,
645645
const std::shared_ptr<Field>& field, Datum* out) {
646646
// Decimal128 and Decimal256 are only Arrow constructs. Parquet does not
647647
// specifically distinguish between decimal byte widths.
648-
// Decimal256 isn't relevant here because the Arrow-Parquet C++ bindings never
649-
// write Decimal values as integers and if the decimal value can fit in an
650-
// integer it is wasteful to use Decimal256. Put another way, the only
651-
// way an integer column could be construed as Decimal256 is if an arrow
652-
// schema was stored as metadata in the file indicating the column was
653-
// Decimal256. The current Arrow-Parquet C++ bindings will never do this.
654-
DCHECK(field->type()->id() == ::arrow::Type::DECIMAL128);
648+
DCHECK(field->type()->id() == ::arrow::Type::DECIMAL128 ||
649+
field->type()->id() == ::arrow::Type::DECIMAL256);
655650

656651
const int64_t length = reader->values_written();
657652

@@ -674,16 +669,21 @@ static Status DecimalIntegerTransfer(RecordReader* reader, MemoryPool* pool,
674669
// sign/zero extend int32_t values, otherwise a no-op
675670
const auto value = static_cast<int64_t>(values[i]);
676671

677-
::arrow::Decimal128 decimal(value);
678-
decimal.ToBytes(out_ptr);
672+
if constexpr (std::is_same_v<DecimalArrayType, Decimal128Array>) {
673+
::arrow::Decimal128 decimal(value);
674+
decimal.ToBytes(out_ptr);
675+
} else {
676+
::arrow::Decimal256 decimal(value);
677+
decimal.ToBytes(out_ptr);
678+
}
679679
}
680680

681681
if (reader->nullable_values() && field->nullable()) {
682682
std::shared_ptr<ResizableBuffer> is_valid = reader->ReleaseIsValid();
683-
*out = std::make_shared<Decimal128Array>(field->type(), length, std::move(data),
684-
is_valid, reader->null_count());
683+
*out = std::make_shared<DecimalArrayType>(field->type(), length, std::move(data),
684+
is_valid, reader->null_count());
685685
} else {
686-
*out = std::make_shared<Decimal128Array>(field->type(), length, std::move(data));
686+
*out = std::make_shared<DecimalArrayType>(field->type(), length, std::move(data));
687687
}
688688
return Status::OK();
689689
}
@@ -776,11 +776,11 @@ Status TransferColumnData(RecordReader* reader, const std::shared_ptr<Field>& va
776776
case ::arrow::Type::DECIMAL128: {
777777
switch (descr->physical_type()) {
778778
case ::parquet::Type::INT32: {
779-
auto fn = DecimalIntegerTransfer<Int32Type>;
779+
auto fn = DecimalIntegerTransfer<Decimal128Array, Int32Type>;
780780
RETURN_NOT_OK(fn(reader, pool, value_field, &result));
781781
} break;
782782
case ::parquet::Type::INT64: {
783-
auto fn = &DecimalIntegerTransfer<Int64Type>;
783+
auto fn = &DecimalIntegerTransfer<Decimal128Array, Int64Type>;
784784
RETURN_NOT_OK(fn(reader, pool, value_field, &result));
785785
} break;
786786
case ::parquet::Type::BYTE_ARRAY: {
@@ -799,6 +799,14 @@ Status TransferColumnData(RecordReader* reader, const std::shared_ptr<Field>& va
799799
} break;
800800
case ::arrow::Type::DECIMAL256:
801801
switch (descr->physical_type()) {
802+
case ::parquet::Type::INT32: {
803+
auto fn = DecimalIntegerTransfer<Decimal256Array, Int32Type>;
804+
RETURN_NOT_OK(fn(reader, pool, value_field, &result));
805+
} break;
806+
case ::parquet::Type::INT64: {
807+
auto fn = &DecimalIntegerTransfer<Decimal256Array, Int64Type>;
808+
RETURN_NOT_OK(fn(reader, pool, value_field, &result));
809+
} break;
802810
case ::parquet::Type::BYTE_ARRAY: {
803811
auto fn = &TransferDecimal<Decimal256Array, ByteArrayType>;
804812
RETURN_NOT_OK(fn(reader, pool, value_field, &result));
@@ -809,7 +817,8 @@ Status TransferColumnData(RecordReader* reader, const std::shared_ptr<Field>& va
809817
} break;
810818
default:
811819
return Status::Invalid(
812-
"Physical type for decimal256 must be fixed length binary");
820+
"Physical type for decimal256 must be int32, int64, byte array, or fixed "
821+
"length binary");
813822
}
814823
break;
815824

cpp/src/parquet/arrow/schema.cc

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -354,11 +354,15 @@ Status FieldToNode(const std::string& name, const std::shared_ptr<Field>& field,
354354
} break;
355355
case ArrowTypeId::DECIMAL128:
356356
case ArrowTypeId::DECIMAL256: {
357-
type = ParquetType::FIXED_LEN_BYTE_ARRAY;
358357
const auto& decimal_type = static_cast<const ::arrow::DecimalType&>(*field->type());
359358
precision = decimal_type.precision();
360359
scale = decimal_type.scale();
361-
length = DecimalType::DecimalSize(precision);
360+
if (properties.integer_annotate_decimal() && 1 <= precision && precision <= 18) {
361+
type = precision <= 9 ? ParquetType ::INT32 : ParquetType ::INT64;
362+
} else {
363+
type = ParquetType::FIXED_LEN_BYTE_ARRAY;
364+
length = DecimalType::DecimalSize(precision);
365+
}
362366
PARQUET_CATCH_NOT_OK(logical_type = LogicalType::Decimal(precision, scale));
363367
} break;
364368
case ArrowTypeId::DATE32:

cpp/src/parquet/arrow/test_util.h

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -129,16 +129,14 @@ ::arrow::enable_if_fixed_size_binary<ArrowType, Status> NonNullArray(
129129
return builder.Finish(out);
130130
}
131131

132+
template <int32_t byte_width>
132133
static void random_decimals(int64_t n, uint32_t seed, int32_t precision, uint8_t* out) {
133134
auto gen = ::arrow::random::RandomArrayGenerator(seed);
134135
std::shared_ptr<Array> decimals;
135-
int32_t byte_width = 0;
136-
if (precision <= ::arrow::Decimal128Type::kMaxPrecision) {
136+
if constexpr (byte_width == 16) {
137137
decimals = gen.Decimal128(::arrow::decimal128(precision, 0), n);
138-
byte_width = ::arrow::Decimal128Type::kByteWidth;
139138
} else {
140139
decimals = gen.Decimal256(::arrow::decimal256(precision, 0), n);
141-
byte_width = ::arrow::Decimal256Type::kByteWidth;
142140
}
143141
std::memcpy(out, decimals->data()->GetValues<uint8_t>(1, 0), byte_width * n);
144142
}
@@ -158,7 +156,8 @@ NonNullArray(size_t size, std::shared_ptr<Array>* out) {
158156
constexpr int32_t seed = 0;
159157

160158
ARROW_ASSIGN_OR_RAISE(auto out_buf, ::arrow::AllocateBuffer(size * byte_width));
161-
random_decimals(size, seed, kDecimalPrecision, out_buf->mutable_data());
159+
random_decimals<::arrow::Decimal128Type::kByteWidth>(size, seed, kDecimalPrecision,
160+
out_buf->mutable_data());
162161

163162
RETURN_NOT_OK(builder.AppendValues(out_buf->data(), size));
164163
return builder.Finish(out);
@@ -179,7 +178,8 @@ NonNullArray(size_t size, std::shared_ptr<Array>* out) {
179178
constexpr int32_t seed = 0;
180179

181180
ARROW_ASSIGN_OR_RAISE(auto out_buf, ::arrow::AllocateBuffer(size * byte_width));
182-
random_decimals(size, seed, kDecimalPrecision, out_buf->mutable_data());
181+
random_decimals<::arrow::Decimal256Type::kByteWidth>(size, seed, kDecimalPrecision,
182+
out_buf->mutable_data());
183183

184184
RETURN_NOT_OK(builder.AppendValues(out_buf->data(), size));
185185
return builder.Finish(out);
@@ -341,7 +341,8 @@ NullableArray(size_t size, size_t num_nulls, uint32_t seed,
341341

342342
ARROW_ASSIGN_OR_RAISE(auto out_buf, ::arrow::AllocateBuffer(size * byte_width));
343343

344-
random_decimals(size, seed, precision, out_buf->mutable_data());
344+
random_decimals<::arrow::Decimal128Type::kByteWidth>(size, seed, precision,
345+
out_buf->mutable_data());
345346

346347
::arrow::Decimal128Builder builder(type);
347348
RETURN_NOT_OK(builder.AppendValues(out_buf->data(), size, valid_bytes.data()));
@@ -367,7 +368,8 @@ NullableArray(size_t size, size_t num_nulls, uint32_t seed,
367368

368369
ARROW_ASSIGN_OR_RAISE(auto out_buf, ::arrow::AllocateBuffer(size * byte_width));
369370

370-
random_decimals(size, seed, precision, out_buf->mutable_data());
371+
random_decimals<::arrow::Decimal256Type::kByteWidth>(size, seed, precision,
372+
out_buf->mutable_data());
371373

372374
::arrow::Decimal256Builder builder(type);
373375
RETURN_NOT_OK(builder.AppendValues(out_buf->data(), size, valid_bytes.data()));

cpp/src/parquet/column_writer.cc

Lines changed: 51 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@
2222
#include <cstring>
2323
#include <map>
2424
#include <memory>
25-
#include <string>
2625
#include <utility>
2726
#include <vector>
2827

@@ -41,6 +40,7 @@
4140
#include "arrow/util/endian.h"
4241
#include "arrow/util/logging.h"
4342
#include "arrow/util/rle_encoding.h"
43+
#include "arrow/util/type_traits.h"
4444
#include "arrow/visit_array_inline.h"
4545
#include "parquet/column_page.h"
4646
#include "parquet/encoding.h"
@@ -1676,6 +1676,47 @@ struct SerializeFunctor<Int32Type, ::arrow::Date64Type> {
16761676
}
16771677
};
16781678

1679+
template <typename ParquetType, typename ArrowType>
1680+
struct SerializeFunctor<
1681+
ParquetType, ArrowType,
1682+
::arrow::enable_if_t<::arrow::is_decimal_type<ArrowType>::value&& ::arrow::internal::
1683+
IsOneOf<ParquetType, Int32Type, Int64Type>::value>> {
1684+
using value_type = typename ParquetType::c_type;
1685+
1686+
Status Serialize(const typename ::arrow::TypeTraits<ArrowType>::ArrayType& array,
1687+
ArrowWriteContext* ctx, value_type* out) {
1688+
if (array.null_count() == 0) {
1689+
for (int64_t i = 0; i < array.length(); i++) {
1690+
out[i] = TransferValue<ArrowType::kByteWidth>(array.Value(i));
1691+
}
1692+
} else {
1693+
for (int64_t i = 0; i < array.length(); i++) {
1694+
out[i] =
1695+
array.IsValid(i) ? TransferValue<ArrowType::kByteWidth>(array.Value(i)) : 0;
1696+
}
1697+
}
1698+
1699+
return Status::OK();
1700+
}
1701+
1702+
template <int byte_width>
1703+
value_type TransferValue(const uint8_t* in) const {
1704+
static_assert(byte_width == 16 || byte_width == 32,
1705+
"only 16 and 32 byte Decimals supported");
1706+
value_type value = 0;
1707+
if constexpr (byte_width == 16) {
1708+
::arrow::Decimal128 decimal_value(in);
1709+
PARQUET_THROW_NOT_OK(decimal_value.ToInteger(&value));
1710+
} else {
1711+
::arrow::Decimal256 decimal_value(in);
1712+
// Decimal256 does not provide ToInteger, but we are sure it fits in the target
1713+
// integer type.
1714+
value = static_cast<value_type>(decimal_value.low_bits());
1715+
}
1716+
return value;
1717+
}
1718+
};
1719+
16791720
template <>
16801721
struct SerializeFunctor<Int32Type, ::arrow::Time32Type> {
16811722
Status Serialize(const ::arrow::Time32Array& array, ArrowWriteContext*, int32_t* out) {
@@ -1709,6 +1750,8 @@ Status TypedColumnWriterImpl<Int32Type>::WriteArrowDense(
17091750
WRITE_ZERO_COPY_CASE(DATE32, Date32Type, Int32Type)
17101751
WRITE_SERIALIZE_CASE(DATE64, Date64Type, Int32Type)
17111752
WRITE_SERIALIZE_CASE(TIME32, Time32Type, Int32Type)
1753+
WRITE_SERIALIZE_CASE(DECIMAL128, Decimal128Type, Int32Type)
1754+
WRITE_SERIALIZE_CASE(DECIMAL256, Decimal256Type, Int32Type)
17121755
default:
17131756
ARROW_UNSUPPORTED()
17141757
}
@@ -1879,6 +1922,8 @@ Status TypedColumnWriterImpl<Int64Type>::WriteArrowDense(
18791922
WRITE_SERIALIZE_CASE(UINT64, UInt64Type, Int64Type)
18801923
WRITE_ZERO_COPY_CASE(TIME64, Time64Type, Int64Type)
18811924
WRITE_ZERO_COPY_CASE(DURATION, DurationType, Int64Type)
1925+
WRITE_SERIALIZE_CASE(DECIMAL128, Decimal128Type, Int64Type)
1926+
WRITE_SERIALIZE_CASE(DECIMAL256, Decimal256Type, Int64Type)
18821927
default:
18831928
ARROW_UNSUPPORTED();
18841929
}
@@ -1997,7 +2042,11 @@ struct SerializeFunctor<
19972042
// Requires a custom serializer because decimal in parquet are in big-endian
19982043
// format. Thus, a temporary local buffer is required.
19992044
template <typename ParquetType, typename ArrowType>
2000-
struct SerializeFunctor<ParquetType, ArrowType, ::arrow::enable_if_decimal<ArrowType>> {
2045+
struct SerializeFunctor<
2046+
ParquetType, ArrowType,
2047+
::arrow::enable_if_t<
2048+
::arrow::is_decimal_type<ArrowType>::value &&
2049+
!::arrow::internal::IsOneOf<ParquetType, Int32Type, Int64Type>::value>> {
20012050
Status Serialize(const typename ::arrow::TypeTraits<ArrowType>::ArrayType& array,
20022051
ArrowWriteContext* ctx, FLBA* out) {
20032052
AllocateScratch(array, ctx);

0 commit comments

Comments
 (0)