forked from apache/arrow
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathreader_internal.h
More file actions
122 lines (96 loc) · 3.5 KB
/
Copy pathreader_internal.h
File metadata and controls
122 lines (96 loc) · 3.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <algorithm>
#include <cstdint>
#include <deque>
#include <functional>
#include <memory>
#include <unordered_set>
#include <utility>
#include <vector>
#include "parquet/arrow/schema.h"
#include "parquet/column_reader.h"
#include "parquet/file_reader.h"
#include "parquet/metadata.h"
#include "parquet/platform.h"
#include "parquet/schema.h"
namespace arrow {
class Array;
class ChunkedArray;
class DataType;
class Field;
class KeyValueMetadata;
class Schema;
} // namespace arrow
using arrow::Status;
namespace parquet {
class ArrowReaderProperties;
namespace arrow {
class ColumnReaderImpl;
// ----------------------------------------------------------------------
// Iteration utilities
// Hands out the page readers of a single column, one row group at a time,
// for a caller-chosen list of row groups. Keeping the row-group selection
// here means the ColumnReader does not need to know which row groups are
// being read, so a read can be limited to as little as one row group.
class FileColumnIterator {
 public:
  // \param column_index index of the column whose chunks are produced
  // \param reader file reader used to open row groups; kept as a raw pointer
  //   and never deleted by this class. Must be non-null: it is dereferenced
  //   here to fetch the schema.
  // \param row_groups row group indices to visit, consumed front to back
  explicit FileColumnIterator(int column_index, ParquetFileReader* reader,
                              std::vector<int> row_groups)
      : column_index_(column_index),
        reader_(reader),
        schema_(reader->metadata()->schema()),
        row_groups_(row_groups.begin(), row_groups.end()) {}

  virtual ~FileColumnIterator() = default;

  // Returns the page reader for this column in the next pending row group and
  // removes that row group from the queue. Returns nullptr once every
  // requested row group has been handed out.
  std::unique_ptr<::parquet::PageReader> NextChunk() {
    if (!row_groups_.empty()) {
      const int next_group = row_groups_.front();
      // Open the row group before dequeuing it, so the queue is only advanced
      // after RowGroup() has returned.
      auto group_reader = reader_->RowGroup(next_group);
      row_groups_.pop_front();
      return group_reader->GetColumnPageReader(column_index_);
    }
    return nullptr;
  }

  // Schema descriptor captured from the reader's metadata at construction.
  const SchemaDescriptor* schema() const { return schema_; }

  // Descriptor of the column this iterator reads.
  const ColumnDescriptor* descr() const { return schema_->Column(column_index_); }

  // File metadata, fetched from the underlying reader on every call.
  std::shared_ptr<FileMetaData> metadata() const { return reader_->metadata(); }

  // Index of the column this iterator reads.
  int column_index() const { return column_index_; }

 protected:
  int column_index_;
  ParquetFileReader* reader_;
  const SchemaDescriptor* schema_;
  // Row groups still to be visited; the front element is served next.
  std::deque<int> row_groups_;
};
// Callable that builds a FileColumnIterator from an int and a
// ParquetFileReader*.
// NOTE(review): the int is presumably the column index, mirroring the first
// argument of FileColumnIterator's constructor -- confirm against callers.
// NOTE(review): a raw pointer is returned and ownership is not visible here;
// presumably the caller takes ownership of the iterator -- confirm.
using FileColumnIteratorFactory =
std::function<FileColumnIterator*(int, ParquetFileReader*)>;
// Declaration only; the definition is not in this header. Reports success or
// failure through the returned Status and delivers its result through `out`.
// NOTE(review): presumably converts the values accumulated in `reader` into an
// Arrow ChunkedArray typed according to `value_field`, using `descr` for the
// Parquet column description and `pool` for allocations -- confirm against the
// definition.
Status TransferColumnData(::parquet::internal::RecordReader* reader,
const std::shared_ptr<::arrow::Field>& value_field,
const ColumnDescriptor* descr, ::arrow::MemoryPool* pool,
std::shared_ptr<::arrow::ChunkedArray>* out);
// Bundle of state passed around while reading: the file reader, a memory pool,
// the factory used to create column iterators, and an optional leaf filter.
struct ReaderContext {
  ParquetFileReader* reader;
  ::arrow::MemoryPool* pool;
  FileColumnIteratorFactory iterator_factory;
  // When false, every leaf is considered included and `included_leaves` is
  // not consulted.
  bool filter_leaves;
  // Leaf indices accepted when `filter_leaves` is true. Must be non-null in
  // that case, because IncludesLeaf() dereferences it without checking.
  std::shared_ptr<std::unordered_set<int>> included_leaves;

  // Returns true if the leaf with the given index should be read: always when
  // filtering is disabled, otherwise only if the index is in `included_leaves`.
  bool IncludesLeaf(int leaf_index) const {
    if (!filter_leaves) {
      return true;
    }
    return included_leaves->count(leaf_index) != 0;
  }
};
} // namespace arrow
} // namespace parquet