@@ -52,6 +52,8 @@ using parquet::arrow::SchemaField;
5252using parquet::arrow::SchemaManifest;
5353using parquet::arrow::StatisticsAsScalars;
5454
55+ namespace {
56+
/// \brief A ScanTask backed by a parquet file and a RowGroup within a parquet file.
5658class ParquetScanTask : public ScanTask {
5759 public:
@@ -128,7 +130,7 @@ class ParquetScanTask : public ScanTask {
128130 arrow::io::CacheOptions cache_options_;
129131};
130132
131- static parquet::ReaderProperties MakeReaderProperties (
133+ parquet::ReaderProperties MakeReaderProperties (
132134 const ParquetFileFormat& format, ParquetFragmentScanOptions* parquet_scan_options,
133135 MemoryPool* pool = default_memory_pool()) {
134136 // Can't mutate pool after construction
@@ -144,7 +146,7 @@ static parquet::ReaderProperties MakeReaderProperties(
144146 return properties;
145147}
146148
147- static parquet::ArrowReaderProperties MakeArrowReaderProperties (
149+ parquet::ArrowReaderProperties MakeArrowReaderProperties (
148150 const ParquetFileFormat& format, const parquet::FileMetaData& metadata) {
149151 parquet::ArrowReaderProperties properties (/* use_threads = */ false );
150152 for (const std::string& name : format.reader_options .dict_columns ) {
@@ -155,7 +157,7 @@ static parquet::ArrowReaderProperties MakeArrowReaderProperties(
155157}
156158
157159template <typename M>
158- static Result<std::shared_ptr<SchemaManifest>> GetSchemaManifest (
160+ Result<std::shared_ptr<SchemaManifest>> GetSchemaManifest (
159161 const M& metadata, const parquet::ArrowReaderProperties& properties) {
160162 auto manifest = std::make_shared<SchemaManifest>();
161163 const std::shared_ptr<const ::arrow::KeyValueMetadata>& key_value_metadata = nullptr ;
@@ -164,7 +166,7 @@ static Result<std::shared_ptr<SchemaManifest>> GetSchemaManifest(
164166 return manifest;
165167}
166168
167- static util::optional<compute::Expression> ColumnChunkStatisticsAsExpression (
169+ util::optional<compute::Expression> ColumnChunkStatisticsAsExpression (
168170 const SchemaField& schema_field, const parquet::RowGroupMetaData& metadata) {
169171 // For the remaining of this function, failure to extract/parse statistics
170172 // are ignored by returning nullptr. The goal is two fold. First
@@ -214,8 +216,8 @@ static util::optional<compute::Expression> ColumnChunkStatisticsAsExpression(
214216 return util::nullopt ;
215217}
216218
217- static void AddColumnIndices (const SchemaField& schema_field,
218- std::vector<int >* column_projection) {
219+ void AddColumnIndices (const SchemaField& schema_field,
220+ std::vector<int >* column_projection) {
219221 if (schema_field.is_leaf ()) {
220222 column_projection->push_back (schema_field.column_index );
221223 } else {
@@ -227,8 +229,8 @@ static void AddColumnIndices(const SchemaField& schema_field,
227229}
228230
229231// Compute the column projection out of an optional arrow::Schema
230- static std::vector<int > InferColumnProjection (const parquet::arrow::FileReader& reader,
231- const ScanOptions& options) {
232+ std::vector<int > InferColumnProjection (const parquet::arrow::FileReader& reader,
233+ const ScanOptions& options) {
232234 auto manifest = reader.manifest ();
233235 // Checks if the field is needed in either the projection or the filter.
234236 auto field_names = options.MaterializedFields ();
@@ -253,6 +255,33 @@ static std::vector<int> InferColumnProjection(const parquet::arrow::FileReader&
253255 return columns_selection;
254256}
255257
258+ Status WrapSourceError (const Status& status, const std::string& path) {
259+ return status.WithMessage (" Could not open Parquet input source '" , path,
260+ " ': " , status.message ());
261+ }
262+
263+ Result<bool > IsSupportedParquetFile (const ParquetFileFormat& format,
264+ const FileSource& source) {
265+ BEGIN_PARQUET_CATCH_EXCEPTIONS
266+ try {
267+ ARROW_ASSIGN_OR_RAISE (auto input, source.Open ());
268+ ARROW_ASSIGN_OR_RAISE (
269+ auto parquet_scan_options,
270+ GetFragmentScanOptions<ParquetFragmentScanOptions>(
271+ kParquetTypeName , nullptr , format.default_fragment_scan_options ));
272+ auto reader = parquet::ParquetFileReader::Open (
273+ std::move (input), MakeReaderProperties (format, parquet_scan_options.get ()));
274+ std::shared_ptr<parquet::FileMetaData> metadata = reader->metadata ();
275+ return metadata != nullptr && metadata->can_decompress ();
276+ } catch (const ::parquet::ParquetInvalidOrCorruptedFileException& e) {
277+ ARROW_UNUSED (e);
278+ return false ;
279+ }
280+ END_PARQUET_CATCH_EXCEPTIONS
281+ }
282+
283+ } // namespace
284+
256285bool ParquetFileFormat::Equals (const FileFormat& other) const {
257286 if (other.type_name () != type_name ()) return false ;
258287
@@ -270,24 +299,11 @@ ParquetFileFormat::ParquetFileFormat(const parquet::ReaderProperties& reader_pro
270299}
271300
272301Result<bool > ParquetFileFormat::IsSupported (const FileSource& source) const {
273- try {
274- ARROW_ASSIGN_OR_RAISE (auto input, source.Open ());
275- ARROW_ASSIGN_OR_RAISE (auto parquet_scan_options,
276- GetFragmentScanOptions<ParquetFragmentScanOptions>(
277- kParquetTypeName , nullptr , default_fragment_scan_options));
278- auto reader = parquet::ParquetFileReader::Open (
279- std::move (input), MakeReaderProperties (*this , parquet_scan_options.get ()));
280- std::shared_ptr<parquet::FileMetaData> metadata = reader->metadata ();
281- return metadata != nullptr && metadata->can_decompress ();
282- } catch (const ::parquet::ParquetInvalidOrCorruptedFileException& e) {
283- ARROW_UNUSED (e);
284- return false ;
285- } catch (const ::parquet::ParquetException& e) {
286- return Status::IOError (" Could not open parquet input source '" , source.path (),
287- " ': " , e.what ());
302+ auto maybe_is_supported = IsSupportedParquetFile (*this , source);
303+ if (!maybe_is_supported.ok ()) {
304+ return WrapSourceError (maybe_is_supported.status (), source.path ());
288305 }
289-
290- return true ;
306+ return maybe_is_supported;
291307}
292308
293309Result<std::shared_ptr<Schema>> ParquetFileFormat::Inspect (
@@ -307,14 +323,18 @@ Result<std::unique_ptr<parquet::arrow::FileReader>> ParquetFileFormat::GetReader
307323 auto properties = MakeReaderProperties (*this , parquet_scan_options.get (), pool);
308324
309325 ARROW_ASSIGN_OR_RAISE (auto input, source.Open ());
310- std::unique_ptr<parquet::ParquetFileReader> reader;
311- try {
312- reader = parquet::ParquetFileReader::Open (std::move (input), std::move (properties));
313- } catch (const ::parquet::ParquetException& e) {
314- return Status::IOError (" Could not open parquet input source '" , source.path (),
315- " ': " , e.what ());
316- }
317326
327+ auto make_reader = [&]() -> Result<std::unique_ptr<parquet::ParquetFileReader>> {
328+ BEGIN_PARQUET_CATCH_EXCEPTIONS
329+ return parquet::ParquetFileReader::Open (std::move (input), std::move (properties));
330+ END_PARQUET_CATCH_EXCEPTIONS
331+ };
332+
333+ auto maybe_reader = std::move (make_reader)();
334+ if (!maybe_reader.ok ()) {
335+ return WrapSourceError (maybe_reader.status (), source.path ());
336+ }
337+ std::unique_ptr<parquet::ParquetFileReader> reader = *std::move (maybe_reader);
318338 std::shared_ptr<parquet::FileMetaData> metadata = reader->metadata ();
319339 auto arrow_properties = MakeArrowReaderProperties (*this , *metadata);
320340
@@ -371,8 +391,7 @@ Future<std::shared_ptr<parquet::arrow::FileReader>> ParquetFileFormat::GetReader
371391 },
372392 [path](
373393 const Status& status) -> Result<std::shared_ptr<parquet::arrow::FileReader>> {
374- return status.WithMessage (" Could not open Parquet input source '" , path,
375- " ': " , status.message ());
394+ return WrapSourceError (status, path);
376395 });
377396}
378397
0 commit comments