Skip to content

Commit 6beeaf4

Browse files
Deepak Majeti authored and wesm committed
PARQUET-681: Add tool to scan a parquet file
Added a ReadBatchValues() API to the Column class.
Added a parquet-scan tool.
Separated examples into benchmarks/tools.
Added clang-tidy and clang-format to benchmarks and tools.
Author: Deepak Majeti <deepak.majeti@hpe.com>
Closes apache#144 from majetideepak/parquetscan and squashes the following commits:
cc7f183 [Deepak Majeti] Removed GetRemainingInPage API
44da480 [Deepak Majeti] add scan all in public api
20829b8 [Deepak Majeti] clang-format
da62354 [Deepak Majeti] ScanAllValues API
e385f61 [Deepak Majeti] put clang-* in the root directory
9ff785c [Deepak Majeti] use c++ random
d854bde [Deepak Majeti] parquet scan tool
Change-Id: I1e5d1e42aa5a3e8dfbe6b556dd0081bb0ed7f4d8
1 parent d54f13d commit 6beeaf4

4 files changed

Lines changed: 245 additions & 0 deletions

File tree

cpp/tools/parquet/CMakeLists.txt

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
# Licensed to the Apache Software Foundation (ASF) under one
2+
# or more contributor license agreements. See the NOTICE file
3+
# distributed with this work for additional information
4+
# regarding copyright ownership. The ASF licenses this file
5+
# to you under the Apache License, Version 2.0 (the
6+
# "License"); you may not use this file except in compliance
7+
# with the License. You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing,
12+
# software distributed under the License is distributed on an
13+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
# KIND, either express or implied. See the License for the
15+
# specific language governing permissions and limitations
16+
# under the License.
17+
18+
# Libraries that every command-line tool links against in addition to
# parquet_static.
set(LINK_LIBS snappystatic thriftstatic)

# The tools are only built when PARQUET_BUILD_EXECUTABLES is enabled. Each tool
# is compiled from a single <name>.cc source file and linked against LINK_LIBS
# followed by parquet_static.
if (PARQUET_BUILD_EXECUTABLES)
  foreach(TOOL_NAME parquet-dump-schema parquet_reader parquet-scan)
    add_executable(${TOOL_NAME} ${TOOL_NAME}.cc)
    target_link_libraries(${TOOL_NAME} ${LINK_LIBS} parquet_static)
  endforeach()
endif()
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
#include <iostream>
19+
20+
#include "parquet/api/reader.h"
21+
#include "parquet/api/schema.h"
22+
23+
int main(int argc, char** argv) {
24+
std::string filename = argv[1];
25+
26+
try {
27+
std::unique_ptr<parquet::ParquetFileReader> reader =
28+
parquet::ParquetFileReader::OpenFile(filename);
29+
PrintSchema(reader->metadata()->schema_descriptor()->schema().get(), std::cout);
30+
} catch (const std::exception& e) {
31+
std::cerr << "Parquet error: " << e.what() << std::endl;
32+
return -1;
33+
}
34+
35+
return 0;
36+
}

cpp/tools/parquet/parquet-scan.cc

Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
#include <ctime>
19+
#include <iostream>
20+
#include <memory>
21+
#include <list>
22+
23+
#include "parquet/api/reader.h"
24+
25+
int main(int argc, char** argv) {
26+
if (argc > 4 || argc < 1) {
27+
std::cerr << "Usage: parquet-scan [--batch-size=] [--columns=...] <file>"
28+
<< std::endl;
29+
return -1;
30+
}
31+
32+
std::string filename;
33+
34+
// Read command-line options
35+
int batch_size = 256;
36+
const std::string COLUMNS_PREFIX = "--columns=";
37+
const std::string BATCH_SIZE_PREFIX = "--batch-size=";
38+
std::vector<int> columns;
39+
int num_columns = 0;
40+
41+
char *param, *value;
42+
for (int i = 1; i < argc; i++) {
43+
if ((param = std::strstr(argv[i], COLUMNS_PREFIX.c_str()))) {
44+
value = std::strtok(param + COLUMNS_PREFIX.length(), ",");
45+
while (value) {
46+
columns.push_back(std::atoi(value));
47+
value = std::strtok(nullptr, ",");
48+
num_columns++;
49+
}
50+
} else if ((param = std::strstr(argv[i], BATCH_SIZE_PREFIX.c_str()))) {
51+
value = std::strtok(param + BATCH_SIZE_PREFIX.length(), " ");
52+
if (value) { batch_size = std::atoi(value); }
53+
} else {
54+
filename = argv[i];
55+
}
56+
}
57+
58+
std::vector<int16_t> rep_levels(batch_size);
59+
std::vector<int16_t> def_levels(batch_size);
60+
try {
61+
double total_time;
62+
std::clock_t start_time = std::clock();
63+
std::unique_ptr<parquet::ParquetFileReader> reader =
64+
parquet::ParquetFileReader::OpenFile(filename);
65+
// columns are not specified explicitly. Add all columns
66+
if (num_columns == 0) {
67+
num_columns = reader->metadata()->num_columns();
68+
columns.resize(num_columns);
69+
for (int i = 0; i < num_columns; i++) {
70+
columns[i] = i;
71+
}
72+
}
73+
74+
int64_t total_rows[num_columns];
75+
76+
for (int r = 0; r < reader->metadata()->num_row_groups(); ++r) {
77+
auto group_reader = reader->RowGroup(r);
78+
int col = 0;
79+
for (auto i : columns) {
80+
total_rows[col] = 0;
81+
std::shared_ptr<parquet::ColumnReader> col_reader = group_reader->Column(i);
82+
size_t value_byte_size = GetTypeByteSize(col_reader->descr()->physical_type());
83+
std::vector<uint8_t> values(batch_size * value_byte_size);
84+
85+
int64_t values_read = 0;
86+
while (col_reader->HasNext()) {
87+
total_rows[col] += ScanAllValues(batch_size, def_levels.data(),
88+
rep_levels.data(), values.data(), &values_read, col_reader.get());
89+
}
90+
col++;
91+
}
92+
}
93+
94+
total_time = (std::clock() - start_time) / static_cast<double>(CLOCKS_PER_SEC);
95+
for (int ct = 1; ct < num_columns; ++ct) {
96+
if (total_rows[0] != total_rows[ct]) {
97+
std::cerr << "Parquet error: Total rows among columns do not match" << std::endl;
98+
}
99+
}
100+
std::cout << total_rows[0] << " rows scanned in " << total_time << " seconds."
101+
<< std::endl;
102+
} catch (const std::exception& e) {
103+
std::cerr << "Parquet error: " << e.what() << std::endl;
104+
return -1;
105+
}
106+
107+
return 0;
108+
}
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
#include <iostream>
19+
#include <memory>
20+
#include <list>
21+
22+
#include "parquet/api/reader.h"
23+
24+
int main(int argc, char** argv) {
25+
if (argc > 5 || argc < 2) {
26+
std::cerr << "Usage: parquet_reader [--only-metadata] [--no-memory-map] "
27+
"[--columns=...] <file>"
28+
<< std::endl;
29+
return -1;
30+
}
31+
32+
std::string filename;
33+
bool print_values = true;
34+
bool memory_map = true;
35+
36+
// Read command-line options
37+
const std::string COLUMNS_PREFIX = "--columns=";
38+
std::list<int> columns;
39+
40+
char *param, *value;
41+
for (int i = 1; i < argc; i++) {
42+
if ((param = std::strstr(argv[i], "--only-metadata"))) {
43+
print_values = false;
44+
} else if ((param = std::strstr(argv[i], "--no-memory-map"))) {
45+
memory_map = false;
46+
} else if ((param = std::strstr(argv[i], COLUMNS_PREFIX.c_str()))) {
47+
value = std::strtok(param + COLUMNS_PREFIX.length(), ",");
48+
while (value) {
49+
columns.push_back(std::atoi(value));
50+
value = std::strtok(nullptr, ",");
51+
}
52+
} else {
53+
filename = argv[i];
54+
}
55+
}
56+
57+
try {
58+
std::unique_ptr<parquet::ParquetFileReader> reader =
59+
parquet::ParquetFileReader::OpenFile(filename, memory_map);
60+
reader->DebugPrint(std::cout, columns, print_values);
61+
} catch (const std::exception& e) {
62+
std::cerr << "Parquet error: " << e.what() << std::endl;
63+
return -1;
64+
}
65+
66+
return 0;
67+
}

0 commit comments

Comments
 (0)