Skip to content

Commit 7939265

Browse files
committed
ARROW-7576: [C++][Dev] Improve fuzzing setup
Refactor IPC stream format fuzzing executable, add another fuzzing target for IPC file format. Add an executable to generate fuzzing seed corpuses, and scripts to pack them for OSS-Fuzz.
1 parent f59861c commit 7939265

13 files changed

Lines changed: 331 additions & 28 deletions

File tree

ci/scripts/cpp_build.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,6 @@ cmake -G "${CMAKE_GENERATOR:-Ninja}" \
6464
-DARROW_FILESYSTEM=${ARROW_FILESYSTEM:-ON} \
6565
-DARROW_FLIGHT=${ARROW_FLIGHT:-OFF} \
6666
-DARROW_FUZZING=${ARROW_FUZZING:-OFF} \
67-
-DARROW_FUZZING=${ARROW_FUZZING:-OFF} \
6867
-DARROW_GANDIVA_JAVA=${ARROW_GANDIVA_JAVA:-OFF} \
6968
-DARROW_GANDIVA_PC_CXX_FLAGS=${ARROW_GANDIVA_PC_CXX_FLAGS:-} \
7069
-DARROW_GANDIVA=${ARROW_GANDIVA:-OFF} \
@@ -102,6 +101,7 @@ cmake -G "${CMAKE_GENERATOR:-Ninja}" \
102101
-DBUILD_WARNING_LEVEL=${DARROW_BUILD_WARNING_LEVEL:-CHECKIN} \
103102
-Dc-ares_SOURCE=${cares_SOURCE:-AUTO} \
104103
-DCMAKE_BUILD_TYPE=${ARROW_BUILD_TYPE:-debug} \
104+
-DCMAKE_C_FLAGS=${CFLAGS:-} \
105105
-DCMAKE_CXX_FLAGS=${CXXFLAGS:-} \
106106
-DCMAKE_INSTALL_LIBDIR=${CMAKE_INSTALL_LIBDIR:-lib} \
107107
-DCMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX:-${ARROW_HOME}} \

cpp/CMakeLists.txt

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -267,7 +267,10 @@ endif(UNIX)
267267
# Set up various options
268268
#
269269

270-
if(ARROW_BUILD_BENCHMARKS OR ARROW_BUILD_TESTS OR ARROW_BUILD_INTEGRATION)
270+
if(ARROW_BUILD_BENCHMARKS
271+
OR ARROW_BUILD_TESTS
272+
OR ARROW_BUILD_INTEGRATION
273+
OR ARROW_FUZZING)
271274
set(ARROW_JSON ON)
272275
endif()
273276

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
#!/usr/bin/env bash
2+
#
3+
# Licensed to the Apache Software Foundation (ASF) under one
4+
# or more contributor license agreements. See the NOTICE file
5+
# distributed with this work for additional information
6+
# regarding copyright ownership. The ASF licenses this file
7+
# to you under the Apache License, Version 2.0 (the
8+
# "License"); you may not use this file except in compliance
9+
# with the License. You may obtain a copy of the License at
10+
#
11+
# http://www.apache.org/licenses/LICENSE-2.0
12+
#
13+
# Unless required by applicable law or agreed to in writing,
14+
# software distributed under the License is distributed on an
15+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16+
# KIND, either express or implied. See the License for the
17+
# specific language governing permissions and limitations
18+
# under the License.
19+
20+
# Generate and pack seed corpus files, for OSS-Fuzz
21+
22+
# Exactly one argument is expected: the build output directory, which holds
# the arrow-ipc-generate-fuzz-corpus executable and receives the ZIP archives.
if [ $# -ne 1 ]; then
  echo "Usage: $0 <build output dir>"
  exit 1
fi

# Abort on the first failing command and echo each command as it runs.
set -ex

# Scratch directory the generator writes seed files into; wiped before each run.
CORPUS_DIR=/tmp/corpus
# Directory two levels above this script. Every expansion is quoted so that
# paths containing spaces or glob characters are not word-split.
ARROW=$(cd "$(dirname "${BASH_SOURCE[0]}")/../.."; pwd)
OUT=$1

# Seed corpus for the IPC stream format fuzz target.
rm -rf "${CORPUS_DIR}"
"${OUT}/arrow-ipc-generate-fuzz-corpus" -stream "${CORPUS_DIR}"
"${ARROW}/build-support/fuzzing/pack_corpus.py" "${CORPUS_DIR}" "${OUT}/arrow-ipc-stream-fuzz_seed_corpus.zip"

# Seed corpus for the IPC file format fuzz target.
rm -rf "${CORPUS_DIR}"
"${OUT}/arrow-ipc-generate-fuzz-corpus" -file "${CORPUS_DIR}"
"${ARROW}/build-support/fuzzing/pack_corpus.py" "${CORPUS_DIR}" "${OUT}/arrow-ipc-file-fuzz_seed_corpus.zip"
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
#!/usr/bin/env python3
2+
#
3+
# Licensed to the Apache Software Foundation (ASF) under one
4+
# or more contributor license agreements. See the NOTICE file
5+
# distributed with this work for additional information
6+
# regarding copyright ownership. The ASF licenses this file
7+
# to you under the Apache License, Version 2.0 (the
8+
# "License"); you may not use this file except in compliance
9+
# with the License. You may obtain a copy of the License at
10+
#
11+
# http://www.apache.org/licenses/LICENSE-2.0
12+
#
13+
# Unless required by applicable law or agreed to in writing,
14+
# software distributed under the License is distributed on an
15+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
16+
# KIND, either express or implied. See the License for the
17+
# specific language governing permissions and limitations
18+
# under the License.
19+
20+
# Rename a bunch of corpus files to their SHA1 hashes, and
21+
# pack them into a ZIP archive.
22+
23+
import hashlib
24+
from pathlib import Path
25+
import sys
26+
import zipfile
27+
28+
29+
def process_dir(corpus_dir, zip_output):
    """Add every file in ``corpus_dir`` to ``zip_output``, named by SHA1.

    Parameters
    ----------
    corpus_dir : pathlib.Path
        Directory whose direct children are the corpus files.
    zip_output : zipfile.ZipFile
        Archive opened for writing; one member is written per corpus file,
        named after the hexadecimal SHA1 digest of the file's contents.

    Raises
    ------
    IOError
        If a child of ``corpus_dir`` is not a regular file.
    ValueError
        If two files have the same SHA1 digest.
    """
    seen = set()

    # iterdir() yields entries in arbitrary order; sort so that the archive
    # member order (and which file a duplicate error names) is deterministic.
    for child in sorted(corpus_dir.iterdir()):
        if not child.is_file():
            raise IOError("Not a file: {0}".format(child))
        data = child.read_bytes()
        arcname = hashlib.sha1(data).hexdigest()
        if arcname in seen:
            raise ValueError("Duplicate hash: {0} (in file {1})"
                             .format(arcname, child))
        zip_output.writestr(arcname, data)
        seen.add(arcname)
43+
44+
45+
def main(corpus_dir, zip_output_name):
    """Pack the files found in ``corpus_dir`` into a new ZIP archive.

    ``corpus_dir`` is the path of the directory holding the corpus files and
    ``zip_output_name`` is the path of the ZIP file to create.
    """
    source = Path(corpus_dir)
    with zipfile.ZipFile(zip_output_name, mode='w') as archive:
        process_dir(source, archive)
48+
49+
50+
if __name__ == "__main__":
    # Expect exactly two positional arguments after the program name.
    cli_args = sys.argv[1:]
    if len(cli_args) == 2:
        main(cli_args[0], cli_args[1])
    else:
        print("Usage: {0} <corpus dir> <output zip file>".format(sys.argv[0]))
        sys.exit(1)

cpp/cmake_modules/BuildUtils.cmake

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -729,11 +729,19 @@ function(ADD_ARROW_FUZZING REL_FUZZING_NAME)
729729
set(FUZZ_LINK_LIBS arrow_shared)
730730
endif()
731731

732+
# For OSS-Fuzz
733+
# (https://google.github.io/oss-fuzz/advanced-topics/ideal-integration/)
734+
if(DEFINED ENV{LIB_FUZZING_ENGINE})
735+
set(FUZZ_LDFLAGS $ENV{LIB_FUZZING_ENGINE})
736+
else()
737+
set(FUZZ_LDFLAGS "-fsanitize=fuzzer")
738+
endif()
739+
732740
add_executable(${FUZZING_NAME} "${REL_FUZZING_NAME}.cc")
733741
target_link_libraries(${FUZZING_NAME} ${FUZZ_LINK_LIBS})
734-
target_compile_options(${FUZZING_NAME} PRIVATE "-fsanitize=fuzzer")
742+
target_compile_options(${FUZZING_NAME} PRIVATE ${FUZZ_LDFLAGS})
735743
set_target_properties(${FUZZING_NAME}
736-
PROPERTIES LINK_FLAGS "-fsanitize=fuzzer" LABELS "fuzzing")
744+
PROPERTIES LINK_FLAGS ${FUZZ_LDFLAGS} LABELS "fuzzing")
737745
endfunction()
738746

739747
function(ARROW_INSTALL_ALL_HEADERS PATH)

cpp/cmake_modules/ThirdpartyToolchain.cmake

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1522,7 +1522,10 @@ macro(build_gtest)
15221522
add_dependencies(GTest::GMock googletest_ep)
15231523
endmacro()
15241524

1525-
if(ARROW_BUILD_TESTS OR ARROW_BUILD_BENCHMARKS OR ARROW_BUILD_INTEGRATION)
1525+
if(ARROW_BUILD_TESTS
1526+
OR ARROW_BUILD_BENCHMARKS
1527+
OR ARROW_BUILD_INTEGRATION
1528+
OR ARROW_FUZZING)
15261529
resolve_dependency(GTest)
15271530

15281531
if(NOT GTEST_VENDORED)

cpp/src/arrow/CMakeLists.txt

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -418,7 +418,10 @@ if(ARROW_BUILD_STATIC AND WIN32)
418418
target_compile_definitions(arrow_static PUBLIC ARROW_STATIC)
419419
endif()
420420

421-
if(ARROW_BUILD_TESTS OR ARROW_BUILD_BENCHMARKS OR ARROW_BUILD_INTEGRATION)
421+
if(ARROW_BUILD_TESTS
422+
OR ARROW_BUILD_BENCHMARKS
423+
OR ARROW_BUILD_INTEGRATION
424+
OR ARROW_FUZZING)
422425
# that depend on gtest
423426
add_arrow_lib(arrow_testing
424427
CMAKE_PACKAGE_NAME

cpp/src/arrow/ipc/CMakeLists.txt

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,4 +60,12 @@ if(ARROW_BUILD_UTILITIES OR ARROW_BUILD_INTEGRATION)
6060
endif()
6161

6262
add_arrow_benchmark(read_write_benchmark PREFIX "arrow-ipc")
63-
add_arrow_fuzzing(fuzzing_test PREFIX "arrow-ipc")
63+
64+
if(ARROW_FUZZING)
65+
add_executable(arrow-ipc-generate-fuzz-corpus generate_fuzz_corpus.cc)
66+
target_link_libraries(arrow-ipc-generate-fuzz-corpus ${ARROW_UTIL_LIB}
67+
${ARROW_TEST_LINK_LIBS})
68+
endif()
69+
70+
add_arrow_fuzzing(file_fuzz PREFIX "arrow-ipc")
71+
add_arrow_fuzzing(stream_fuzz PREFIX "arrow-ipc")
Lines changed: 5 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -17,28 +17,12 @@
1717

1818
#include <memory>
1919

20-
#include <arrow/buffer.h>
21-
#include <arrow/io/memory.h>
22-
#include <arrow/ipc/reader.h>
20+
#include "arrow/ipc/reader.h"
21+
#include "arrow/status.h"
22+
#include "arrow/util/macros.h"
2323

2424
extern "C" int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) {
25-
arrow::Status status;
26-
27-
auto buffer = std::make_shared<arrow::Buffer>(data, size);
28-
arrow::io::BufferReader buffer_reader(buffer);
29-
30-
std::shared_ptr<arrow::ipc::RecordBatchReader> batch_reader;
31-
status = arrow::ipc::RecordBatchStreamReader::Open(&buffer_reader, &batch_reader);
32-
if (!status.ok()) {
33-
return 0;
34-
}
35-
36-
std::shared_ptr<arrow::RecordBatch> batch;
37-
do {
38-
status = batch_reader->ReadNext(&batch);
39-
if (!status.ok()) {
40-
return 0;
41-
}
42-
} while (batch);
25+
auto status = arrow::ipc::internal::FuzzIpcFile(data, static_cast<int64_t>(size));
26+
ARROW_UNUSED(status);
4327
return 0;
4428
}
Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,125 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
// A command line executable that generates a bunch of valid IPC files
19+
// containing example record batches. Those are used as fuzzing seeds
20+
// to make fuzzing more efficient.
21+
22+
#include <cstdlib>
23+
#include <iostream>
24+
#include <memory>
25+
#include <string>
26+
#include <vector>
27+
28+
#include "arrow/io/file.h"
29+
#include "arrow/io/memory.h"
30+
#include "arrow/ipc/test_common.h"
31+
#include "arrow/ipc/writer.h"
32+
#include "arrow/record_batch.h"
33+
#include "arrow/result.h"
34+
#include "arrow/util/io_util.h"
35+
36+
namespace arrow {
37+
namespace ipc {
38+
39+
using ::arrow::internal::CreateDir;
40+
using ::arrow::internal::PlatformFilename;
41+
42+
/// Build the example record batches that are serialized as fuzzing seeds.
///
/// Each batch comes from one of the `test::Make*` helpers; the first helper
/// that fails aborts the whole operation and its Status is returned.
Result<std::vector<std::shared_ptr<RecordBatch>>> Batches() {
  std::vector<std::shared_ptr<RecordBatch>> collected;
  std::shared_ptr<RecordBatch> current;

  // Appends `current` once the factory call that filled it has succeeded.
  auto collect = [&collected, &current](const Status& made) -> Status {
    RETURN_NOT_OK(made);
    collected.push_back(current);
    return Status::OK();
  };

  RETURN_NOT_OK(collect(test::MakeNullRecordBatch(&current)));
  RETURN_NOT_OK(collect(test::MakeListRecordBatch(&current)));
  RETURN_NOT_OK(collect(test::MakeDictionary(&current)));
  RETURN_NOT_OK(collect(test::MakeTimestamps(&current)));
  RETURN_NOT_OK(collect(test::MakeFWBinary(&current)));
  RETURN_NOT_OK(collect(test::MakeStruct(&current)));
  RETURN_NOT_OK(collect(test::MakeUnion(&current)));
  RETURN_NOT_OK(collect(test::MakeFixedSizeListRecordBatch(&current)));
  return collected;
}
63+
64+
template <typename RecordBatchWriterClass>
65+
Result<std::shared_ptr<Buffer>> SerializeRecordBatch(
66+
const std::shared_ptr<RecordBatch>& batch) {
67+
ARROW_ASSIGN_OR_RAISE(auto sink, io::BufferOutputStream::Create(1024));
68+
ARROW_ASSIGN_OR_RAISE(auto writer,
69+
RecordBatchWriterClass::Open(sink.get(), batch->schema()));
70+
RETURN_NOT_OK(writer->WriteRecordBatch(*batch));
71+
RETURN_NOT_OK(writer->Close());
72+
return sink->Finish();
73+
}
74+
75+
/// Generate one seed file per example record batch inside `out_dir`.
///
/// \param is_stream_format true to serialize with RecordBatchStreamWriter,
///        false to serialize with RecordBatchFileWriter
/// \param out_dir directory to create and populate with "batch-<N>" files,
///        N starting at 1
/// \return Status::OK() on success, otherwise the first error encountered
Status DoMain(bool is_stream_format, const std::string& out_dir) {
  ARROW_ASSIGN_OR_RAISE(auto dir_fn, PlatformFilename::FromString(out_dir));
  RETURN_NOT_OK(CreateDir(dir_fn));

  // Pick the serializer matching the requested IPC format.
  using SerializeFunc =
      Result<std::shared_ptr<Buffer>> (*)(const std::shared_ptr<RecordBatch>&);
  SerializeFunc serialize = SerializeRecordBatch<RecordBatchFileWriter>;
  if (is_stream_format) {
    serialize = SerializeRecordBatch<RecordBatchStreamWriter>;
  }

  ARROW_ASSIGN_OR_RAISE(auto batches, Batches());
  for (size_t index = 0; index < batches.size(); ++index) {
    const auto& batch = batches[index];
    RETURN_NOT_OK(batch->ValidateFull());
    ARROW_ASSIGN_OR_RAISE(auto serialized, serialize(batch));

    // File names are numbered from 1.
    const std::string file_name = "batch-" + std::to_string(index + 1);
    ARROW_ASSIGN_OR_RAISE(auto batch_fn, dir_fn.Join(file_name));
    const std::string path = batch_fn.ToString();

    // Report each generated path on stderr.
    std::cerr << path << std::endl;
    ARROW_ASSIGN_OR_RAISE(auto file, io::FileOutputStream::Open(path));
    RETURN_NOT_OK(file->Write(serialized));
    RETURN_NOT_OK(file->Close());
  }
  return Status::OK();
}
97+
98+
/// Print the command-line synopsis to stderr and terminate with exit code 2.
ARROW_NORETURN void Usage() {
  static const char kUsageText[] =
      "Usage: arrow-ipc-generate-fuzz-corpus [-stream|-file] <output directory>";
  std::cerr << kUsageText << std::endl;
  std::exit(2);
}
103+
104+
int Main(int argc, char** argv) {
105+
if (argc != 3) {
106+
Usage();
107+
}
108+
auto opt = std::string(argv[1]);
109+
if (opt != "-stream" && opt != "-file") {
110+
Usage();
111+
}
112+
auto out_dir = std::string(argv[2]);
113+
114+
Status st = DoMain(opt == "-stream", out_dir);
115+
if (!st.ok()) {
116+
std::cerr << st.ToString() << std::endl;
117+
return 1;
118+
}
119+
return 0;
120+
}
121+
122+
} // namespace ipc
123+
} // namespace arrow
124+
125+
// Program entry point: forwards to arrow::ipc::Main and returns its result
// as the process exit code.
int main(int argc, char** argv) {
  const int exit_code = arrow::ipc::Main(argc, argv);
  return exit_code;
}

0 commit comments

Comments
 (0)