Skip to content

Commit da841cc

Browse files
committed
ARROW-13135: [C++] Fix Status propagation from Parquet exception
Closes apache#10566 from pitrou/ARROW-13135-parquet-status-exception Authored-by: Antoine Pitrou <antoine@python.org> Signed-off-by: Antoine Pitrou <antoine@python.org>
1 parent 8aeec28 commit da841cc

7 files changed

Lines changed: 107 additions & 70 deletions

File tree

cpp/src/arrow/dataset/file_parquet.cc

Lines changed: 53 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,8 @@ using parquet::arrow::SchemaField;
5252
using parquet::arrow::SchemaManifest;
5353
using parquet::arrow::StatisticsAsScalars;
5454

55+
namespace {
56+
5557
/// \brief A ScanTask backed by a parquet file and a RowGroup within a parquet file.
5658
class ParquetScanTask : public ScanTask {
5759
public:
@@ -128,7 +130,7 @@ class ParquetScanTask : public ScanTask {
128130
arrow::io::CacheOptions cache_options_;
129131
};
130132

131-
static parquet::ReaderProperties MakeReaderProperties(
133+
parquet::ReaderProperties MakeReaderProperties(
132134
const ParquetFileFormat& format, ParquetFragmentScanOptions* parquet_scan_options,
133135
MemoryPool* pool = default_memory_pool()) {
134136
// Can't mutate pool after construction
@@ -144,7 +146,7 @@ static parquet::ReaderProperties MakeReaderProperties(
144146
return properties;
145147
}
146148

147-
static parquet::ArrowReaderProperties MakeArrowReaderProperties(
149+
parquet::ArrowReaderProperties MakeArrowReaderProperties(
148150
const ParquetFileFormat& format, const parquet::FileMetaData& metadata) {
149151
parquet::ArrowReaderProperties properties(/* use_threads = */ false);
150152
for (const std::string& name : format.reader_options.dict_columns) {
@@ -155,7 +157,7 @@ static parquet::ArrowReaderProperties MakeArrowReaderProperties(
155157
}
156158

157159
template <typename M>
158-
static Result<std::shared_ptr<SchemaManifest>> GetSchemaManifest(
160+
Result<std::shared_ptr<SchemaManifest>> GetSchemaManifest(
159161
const M& metadata, const parquet::ArrowReaderProperties& properties) {
160162
auto manifest = std::make_shared<SchemaManifest>();
161163
const std::shared_ptr<const ::arrow::KeyValueMetadata>& key_value_metadata = nullptr;
@@ -164,7 +166,7 @@ static Result<std::shared_ptr<SchemaManifest>> GetSchemaManifest(
164166
return manifest;
165167
}
166168

167-
static util::optional<compute::Expression> ColumnChunkStatisticsAsExpression(
169+
util::optional<compute::Expression> ColumnChunkStatisticsAsExpression(
168170
const SchemaField& schema_field, const parquet::RowGroupMetaData& metadata) {
169171
// For the remaining of this function, failure to extract/parse statistics
170172
// are ignored by returning nullptr. The goal is two fold. First
@@ -214,8 +216,8 @@ static util::optional<compute::Expression> ColumnChunkStatisticsAsExpression(
214216
return util::nullopt;
215217
}
216218

217-
static void AddColumnIndices(const SchemaField& schema_field,
218-
std::vector<int>* column_projection) {
219+
void AddColumnIndices(const SchemaField& schema_field,
220+
std::vector<int>* column_projection) {
219221
if (schema_field.is_leaf()) {
220222
column_projection->push_back(schema_field.column_index);
221223
} else {
@@ -227,8 +229,8 @@ static void AddColumnIndices(const SchemaField& schema_field,
227229
}
228230

229231
// Compute the column projection out of an optional arrow::Schema
230-
static std::vector<int> InferColumnProjection(const parquet::arrow::FileReader& reader,
231-
const ScanOptions& options) {
232+
std::vector<int> InferColumnProjection(const parquet::arrow::FileReader& reader,
233+
const ScanOptions& options) {
232234
auto manifest = reader.manifest();
233235
// Checks if the field is needed in either the projection or the filter.
234236
auto field_names = options.MaterializedFields();
@@ -253,6 +255,33 @@ static std::vector<int> InferColumnProjection(const parquet::arrow::FileReader&
253255
return columns_selection;
254256
}
255257

258+
Status WrapSourceError(const Status& status, const std::string& path) {
259+
return status.WithMessage("Could not open Parquet input source '", path,
260+
"': ", status.message());
261+
}
262+
263+
Result<bool> IsSupportedParquetFile(const ParquetFileFormat& format,
264+
const FileSource& source) {
265+
BEGIN_PARQUET_CATCH_EXCEPTIONS
266+
try {
267+
ARROW_ASSIGN_OR_RAISE(auto input, source.Open());
268+
ARROW_ASSIGN_OR_RAISE(
269+
auto parquet_scan_options,
270+
GetFragmentScanOptions<ParquetFragmentScanOptions>(
271+
kParquetTypeName, nullptr, format.default_fragment_scan_options));
272+
auto reader = parquet::ParquetFileReader::Open(
273+
std::move(input), MakeReaderProperties(format, parquet_scan_options.get()));
274+
std::shared_ptr<parquet::FileMetaData> metadata = reader->metadata();
275+
return metadata != nullptr && metadata->can_decompress();
276+
} catch (const ::parquet::ParquetInvalidOrCorruptedFileException& e) {
277+
ARROW_UNUSED(e);
278+
return false;
279+
}
280+
END_PARQUET_CATCH_EXCEPTIONS
281+
}
282+
283+
} // namespace
284+
256285
bool ParquetFileFormat::Equals(const FileFormat& other) const {
257286
if (other.type_name() != type_name()) return false;
258287

@@ -270,24 +299,11 @@ ParquetFileFormat::ParquetFileFormat(const parquet::ReaderProperties& reader_pro
270299
}
271300

272301
Result<bool> ParquetFileFormat::IsSupported(const FileSource& source) const {
273-
try {
274-
ARROW_ASSIGN_OR_RAISE(auto input, source.Open());
275-
ARROW_ASSIGN_OR_RAISE(auto parquet_scan_options,
276-
GetFragmentScanOptions<ParquetFragmentScanOptions>(
277-
kParquetTypeName, nullptr, default_fragment_scan_options));
278-
auto reader = parquet::ParquetFileReader::Open(
279-
std::move(input), MakeReaderProperties(*this, parquet_scan_options.get()));
280-
std::shared_ptr<parquet::FileMetaData> metadata = reader->metadata();
281-
return metadata != nullptr && metadata->can_decompress();
282-
} catch (const ::parquet::ParquetInvalidOrCorruptedFileException& e) {
283-
ARROW_UNUSED(e);
284-
return false;
285-
} catch (const ::parquet::ParquetException& e) {
286-
return Status::IOError("Could not open parquet input source '", source.path(),
287-
"': ", e.what());
302+
auto maybe_is_supported = IsSupportedParquetFile(*this, source);
303+
if (!maybe_is_supported.ok()) {
304+
return WrapSourceError(maybe_is_supported.status(), source.path());
288305
}
289-
290-
return true;
306+
return maybe_is_supported;
291307
}
292308

293309
Result<std::shared_ptr<Schema>> ParquetFileFormat::Inspect(
@@ -307,14 +323,18 @@ Result<std::unique_ptr<parquet::arrow::FileReader>> ParquetFileFormat::GetReader
307323
auto properties = MakeReaderProperties(*this, parquet_scan_options.get(), pool);
308324

309325
ARROW_ASSIGN_OR_RAISE(auto input, source.Open());
310-
std::unique_ptr<parquet::ParquetFileReader> reader;
311-
try {
312-
reader = parquet::ParquetFileReader::Open(std::move(input), std::move(properties));
313-
} catch (const ::parquet::ParquetException& e) {
314-
return Status::IOError("Could not open parquet input source '", source.path(),
315-
"': ", e.what());
316-
}
317326

327+
auto make_reader = [&]() -> Result<std::unique_ptr<parquet::ParquetFileReader>> {
328+
BEGIN_PARQUET_CATCH_EXCEPTIONS
329+
return parquet::ParquetFileReader::Open(std::move(input), std::move(properties));
330+
END_PARQUET_CATCH_EXCEPTIONS
331+
};
332+
333+
auto maybe_reader = std::move(make_reader)();
334+
if (!maybe_reader.ok()) {
335+
return WrapSourceError(maybe_reader.status(), source.path());
336+
}
337+
std::unique_ptr<parquet::ParquetFileReader> reader = *std::move(maybe_reader);
318338
std::shared_ptr<parquet::FileMetaData> metadata = reader->metadata();
319339
auto arrow_properties = MakeArrowReaderProperties(*this, *metadata);
320340

@@ -371,8 +391,7 @@ Future<std::shared_ptr<parquet::arrow::FileReader>> ParquetFileFormat::GetReader
371391
},
372392
[path](
373393
const Status& status) -> Result<std::shared_ptr<parquet::arrow::FileReader>> {
374-
return status.WithMessage("Could not open Parquet input source '", path,
375-
"': ", status.message());
394+
return WrapSourceError(status, path);
376395
});
377396
}
378397

cpp/src/arrow/dataset/file_parquet_test.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -176,7 +176,7 @@ class TestParquetFileFormat : public FileFormatFixtureMixin<ParquetFormatHelper>
176176
};
177177

178178
TEST_F(TestParquetFileFormat, InspectFailureWithRelevantError) {
179-
TestInspectFailureWithRelevantError(StatusCode::IOError, "parquet");
179+
TestInspectFailureWithRelevantError(StatusCode::Invalid, "Parquet");
180180
}
181181
TEST_F(TestParquetFileFormat, Inspect) { TestInspect(); }
182182

cpp/src/arrow/dataset/test_util.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -445,13 +445,15 @@ class FileFormatFixtureMixin : public ::testing::Test {
445445
"Error creating dataset. Could not read schema from '/herp/derp':"),
446446
::testing::HasSubstr("Is this a '" + format_->type_name() + "' file?")));
447447
}
448+
448449
void TestInspectFailureWithRelevantError(StatusCode code,
449-
const std::string format_name) {
450+
const std::string& format_name) {
450451
const std::vector<std::string> file_contents{"", "PAR0", "ASDFPAR1", "ARROW1"};
451452
for (const auto& contents : file_contents) {
452453
AssertInspectFailure(contents, code, format_name);
453454
}
454455
}
456+
455457
void TestInspect() {
456458
auto reader = GetRecordBatchReader(schema({field("f64", float64())}));
457459
auto source = GetFileSource(reader.get());

cpp/src/parquet/exception.h

Lines changed: 21 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -33,23 +33,29 @@
3333

3434
// Parquet exception to Arrow Status
3535

36-
#define PARQUET_CATCH_NOT_OK(s) \
37-
try { \
38-
(s); \
39-
} catch (const ::parquet::ParquetStatusException& e) { \
40-
return e.status(); \
41-
} catch (const ::parquet::ParquetException& e) { \
42-
return ::arrow::Status::IOError(e.what()); \
36+
#define BEGIN_PARQUET_CATCH_EXCEPTIONS try {
37+
#define END_PARQUET_CATCH_EXCEPTIONS \
38+
} \
39+
catch (const ::parquet::ParquetStatusException& e) { \
40+
return e.status(); \
41+
} \
42+
catch (const ::parquet::ParquetException& e) { \
43+
return ::arrow::Status::IOError(e.what()); \
4344
}
4445

45-
#define PARQUET_CATCH_AND_RETURN(s) \
46-
try { \
47-
return (s); \
48-
} catch (const ::parquet::ParquetStatusException& e) { \
49-
return e.status(); \
50-
} catch (const ::parquet::ParquetException& e) { \
51-
return ::arrow::Status::IOError(e.what()); \
52-
}
46+
// clang-format off
47+
48+
#define PARQUET_CATCH_NOT_OK(s) \
49+
BEGIN_PARQUET_CATCH_EXCEPTIONS \
50+
(s); \
51+
END_PARQUET_CATCH_EXCEPTIONS
52+
53+
// clang-format on
54+
55+
#define PARQUET_CATCH_AND_RETURN(s) \
56+
BEGIN_PARQUET_CATCH_EXCEPTIONS \
57+
return (s); \
58+
END_PARQUET_CATCH_EXCEPTIONS
5359

5460
// Arrow Status to Parquet exception
5561

@@ -149,11 +155,4 @@ void ThrowNotOk(StatusReturnBlock&& b) {
149155
PARQUET_THROW_NOT_OK(b());
150156
}
151157

152-
#define BEGIN_PARQUET_CATCH_EXCEPTIONS try {
153-
#define END_PARQUET_CATCH_EXCEPTIONS \
154-
} \
155-
catch (const ::parquet::ParquetException& e) { \
156-
return ::arrow::Status::IOError(e.what()); \
157-
}
158-
159158
} // namespace parquet

cpp/src/parquet/reader_test.cc

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -688,17 +688,17 @@ TEST(TestFileReader, TestOpenErrors) {
688688
::testing::HasSubstr("Couldn't deserialize thrift: No more data to read")));
689689

690690
EXPECT_FINISHES_AND_RAISES_WITH_MESSAGE_THAT(
691-
IOError, ::testing::HasSubstr("Parquet file size is 0 bytes"), OpenBufferAsync(""));
691+
Invalid, ::testing::HasSubstr("Parquet file size is 0 bytes"), OpenBufferAsync(""));
692692
EXPECT_FINISHES_AND_RAISES_WITH_MESSAGE_THAT(
693-
IOError, ::testing::HasSubstr("Parquet magic bytes not found"),
693+
Invalid, ::testing::HasSubstr("Parquet magic bytes not found"),
694694
OpenBufferAsync("AAAAPAR0"));
695695
EXPECT_FINISHES_AND_RAISES_WITH_MESSAGE_THAT(
696-
IOError,
696+
Invalid,
697697
::testing::HasSubstr(
698698
"Parquet file size is 5 bytes, smaller than the minimum file footer"),
699699
OpenBufferAsync("APAR1"));
700700
EXPECT_FINISHES_AND_RAISES_WITH_MESSAGE_THAT(
701-
IOError,
701+
Invalid,
702702
::testing::HasSubstr(
703703
"Parquet file size is 8 bytes, smaller than the size reported by footer's"),
704704
OpenBufferAsync("\xFF\xFF\xFF\x0FPAR1"));

python/pyarrow/tests/parquet/test_basic.py

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -278,13 +278,25 @@ def test_relative_paths(tempdir, use_legacy_dataset, filesystem):
278278
assert result.equals(table)
279279

280280

281-
@parametrize_legacy_dataset
282-
def test_read_non_existing_file(use_legacy_dataset):
281+
def test_read_non_existing_file():
283282
# ensure we have a proper error message
284283
with pytest.raises(FileNotFoundError):
285284
pq.read_table('i-am-not-existing.parquet')
286285

287286

287+
def test_file_error_python_exception():
288+
class BogusFile(io.BytesIO):
289+
def read(self, *args):
290+
raise ZeroDivisionError("zorglub")
291+
292+
def seek(self, *args):
293+
raise ZeroDivisionError("zorglub")
294+
295+
# ensure the Python exception is restored
296+
with pytest.raises(ZeroDivisionError, match="zorglub"):
297+
pq.read_table(BogusFile(b""))
298+
299+
288300
@parametrize_legacy_dataset
289301
def test_parquet_read_from_buffer(tempdir, use_legacy_dataset):
290302
# reading from a buffer from python's open()

python/pyarrow/tests/test_hdfs.py

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,15 @@
3939
# HDFS tests
4040

4141

42+
def check_libhdfs_present():
43+
if not pa.have_libhdfs():
44+
message = 'No libhdfs available on system'
45+
if os.environ.get('PYARROW_HDFS_TEST_LIBHDFS_REQUIRE'):
46+
pytest.fail(message)
47+
else:
48+
pytest.skip(message)
49+
50+
4251
def hdfs_test_client():
4352
host = os.environ.get('ARROW_HDFS_TEST_HOST', 'default')
4453
user = os.environ.get('ARROW_HDFS_TEST_USER', None)
@@ -382,12 +391,7 @@ class TestLibHdfs(HdfsTestCases, unittest.TestCase):
382391

383392
@classmethod
384393
def check_driver(cls):
385-
if not pa.have_libhdfs():
386-
message = 'No libhdfs available on system'
387-
if os.environ.get('PYARROW_HDFS_TEST_LIBHDFS_REQUIRE'):
388-
pytest.fail(message)
389-
else:
390-
pytest.skip(message)
394+
check_libhdfs_present()
391395

392396
def test_orphaned_file(self):
393397
hdfs = hdfs_test_client()
@@ -418,6 +422,7 @@ def _get_hdfs_uri(path):
418422
def test_fastparquet_read_with_hdfs():
419423
from pandas.testing import assert_frame_equal
420424

425+
check_libhdfs_present()
421426
try:
422427
import snappy # noqa
423428
except ImportError:

0 commit comments

Comments
 (0)