forked from apache/arrow
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbloom_filter.cc
More file actions
162 lines (131 loc) · 5.56 KB
/
Copy pathbloom_filter.cc
File metadata and controls
162 lines (131 loc) · 5.56 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include <cstdint>
#include <cstring>
#include "arrow/result.h"
#include "arrow/util/logging.h"
#include "parquet/bloom_filter.h"
#include "parquet/exception.h"
#include "parquet/murmur3.h"
namespace parquet {
// Namespace-scope definition of the static constexpr SALT array. Prior to
// C++17, a static constexpr data member that is ODR-used (SetMask indexes it
// with a runtime loop variable) needs a definition like this one in exactly
// one translation unit. The values themselves come from the in-class
// declaration (presumably in parquet/bloom_filter.h -- not visible here).
constexpr uint32_t BlockSplitBloomFilter::SALT[kBitsSetPerBlock];
BlockSplitBloomFilter::BlockSplitBloomFilter()
: pool_(::arrow::default_memory_pool()),
hash_strategy_(HashStrategy::MURMUR3_X64_128),
algorithm_(Algorithm::BLOCK) {}
// Allocates a zero-filled bitset and installs the MurmurHash3 hasher.
//
// The requested size is normalized before allocating:
//   1. raised to kMinimumBloomFilterBytes if smaller,
//   2. rounded up to the next power of two if it is not one already,
//   3. capped at kMaximumBloomFilterBytes.
//
// \param num_bytes requested bitset size in bytes (a hint, see above)
// \throws ParquetException if the buffer allocation fails
void BlockSplitBloomFilter::Init(uint32_t num_bytes) {
  if (num_bytes < kMinimumBloomFilterBytes) {
    num_bytes = kMinimumBloomFilterBytes;
  }

  // Do the rounding and the cap in 64 bits. NextPower2() returns int64_t; for a
  // request above 2^31 it yields 2^32, which would truncate to 0 if narrowed to
  // uint32_t before the maximum check and produce a zero-sized filter.
  int64_t rounded_bytes = num_bytes;

  // Get next power of 2 if it is not power of 2.
  if ((num_bytes & (num_bytes - 1)) != 0) {
    rounded_bytes = ::arrow::bit_util::NextPower2(rounded_bytes);
  }

  if (rounded_bytes > static_cast<int64_t>(kMaximumBloomFilterBytes)) {
    rounded_bytes = static_cast<int64_t>(kMaximumBloomFilterBytes);
  }

  // Safe to narrow: the value is now bounded by kMaximumBloomFilterBytes or by
  // a power of two that did not exceed it.
  num_bytes_ = static_cast<uint32_t>(rounded_bytes);
  PARQUET_ASSIGN_OR_THROW(data_, ::arrow::AllocateBuffer(num_bytes_, pool_));
  // Start with an empty filter: every bit cleared.
  memset(data_->mutable_data(), 0, num_bytes_);

  this->hasher_.reset(new MurmurHash3());
}
// Initializes the filter from an existing bitset by copying it.
//
// Unlike the size-only overload, the length is not adjusted: it has to lie
// within [kMinimumBloomFilterBytes, kMaximumBloomFilterBytes] and be a power
// of two, otherwise the call is rejected.
//
// \param bitset    source bytes; must not be null (checked with DCHECK)
// \param num_bytes number of bytes to copy from `bitset`
// \throws ParquetException if `num_bytes` is not acceptable or the buffer
//         allocation fails
void BlockSplitBloomFilter::Init(const uint8_t* bitset, uint32_t num_bytes) {
  DCHECK(bitset != nullptr);

  const bool below_minimum = num_bytes < kMinimumBloomFilterBytes;
  const bool above_maximum = num_bytes > kMaximumBloomFilterBytes;
  const bool not_power_of_two = (num_bytes & (num_bytes - 1)) != 0;
  if (below_minimum || above_maximum || not_power_of_two) {
    throw ParquetException("Given length of bitset is illegal");
  }

  num_bytes_ = num_bytes;
  PARQUET_ASSIGN_OR_THROW(data_, ::arrow::AllocateBuffer(num_bytes_, pool_));
  std::memcpy(data_->mutable_data(), bitset, num_bytes_);

  this->hasher_.reset(new MurmurHash3());
}
// Reconstructs a filter from a stream.
//
// Layout consumed, in order: three raw uint32_t fields read directly into
// host integers (bitset length in bytes, hash strategy, algorithm), followed
// by the bitset itself. Only MURMUR3_X64_128 / BLOCK are accepted.
//
// \param input stream positioned at the start of a serialized filter
// \return the deserialized filter
// \throws ParquetException on a failed or truncated read, an unsupported hash
//         strategy or algorithm, or a bitset length rejected by Init()
BlockSplitBloomFilter BlockSplitBloomFilter::Deserialize(ArrowInputStream* input) {
  // Reads one uint32_t header field, throwing if the stream ends early.
  const auto read_uint32 = [input]() -> uint32_t {
    uint32_t value = 0;
    PARQUET_ASSIGN_OR_THROW(int64_t bytes_read,
                            input->Read(sizeof(uint32_t), &value));
    if (bytes_read != static_cast<int64_t>(sizeof(uint32_t))) {
      throw ParquetException("Failed to deserialize from input stream");
    }
    return value;
  };

  const uint32_t len = read_uint32();

  const uint32_t hash = read_uint32();
  if (static_cast<HashStrategy>(hash) != HashStrategy::MURMUR3_X64_128) {
    throw ParquetException("Unsupported hash strategy");
  }

  const uint32_t algorithm = read_uint32();
  if (static_cast<Algorithm>(algorithm) != BloomFilter::Algorithm::BLOCK) {
    throw ParquetException("Unsupported Bloom filter algorithm");
  }

  BlockSplitBloomFilter bloom_filter;

  PARQUET_ASSIGN_OR_THROW(auto buffer, input->Read(len));
  // A stream may legitimately return fewer bytes than requested (e.g. at EOF).
  // Init() copies exactly `len` bytes, so a short buffer must be rejected here
  // to avoid reading past its end.
  if (buffer->size() != static_cast<int64_t>(len)) {
    throw ParquetException("Failed to deserialize from input stream");
  }
  // Init() validates `len` (range and power-of-two) before copying.
  bloom_filter.Init(buffer->data(), len);
  return bloom_filter;
}
// Serializes the filter to `sink`.
//
// Emits, in order, the in-memory bytes of num_bytes_, hash_strategy_ and
// algorithm_, followed by the num_bytes_ bytes of the bitset. This is the
// field order Deserialize() reads back.
// NOTE(review): Deserialize() reads each header field as a 4-byte uint32_t;
// this relies on the two enum members also being 4 bytes wide -- confirm
// against their declarations.
//
// \param sink destination stream; must not be null (checked with DCHECK)
// \throws ParquetException if any write fails
void BlockSplitBloomFilter::WriteTo(ArrowOutputStream* sink) const {
  DCHECK(sink != nullptr);

  PARQUET_THROW_NOT_OK(
      sink->Write(reinterpret_cast<const uint8_t*>(&num_bytes_), sizeof(num_bytes_)));
  PARQUET_THROW_NOT_OK(sink->Write(reinterpret_cast<const uint8_t*>(&hash_strategy_),
                                   sizeof(hash_strategy_)));
  PARQUET_THROW_NOT_OK(
      sink->Write(reinterpret_cast<const uint8_t*>(&algorithm_), sizeof(algorithm_)));
  // Serialization only reads the bitset, so use the read-only accessor rather
  // than requesting mutable access from a const method.
  PARQUET_THROW_NOT_OK(sink->Write(data_->data(), num_bytes_));
}
// Fills `block_mask` with one single-bit word per SALT entry.
//
// For every index i in [0, kBitsSetPerBlock): the 32-bit product key * SALT[i]
// (wrapping on overflow) is reduced to its top five bits, giving a bit
// position in [0, 31], and block_mask.item[i] is set to a word with only that
// bit set.
//
// \param key        value the mask is derived from
// \param block_mask output; all kBitsSetPerBlock items are overwritten
void BlockSplitBloomFilter::SetMask(uint32_t key, BlockMask& block_mask) const {
  for (int word = 0; word < kBitsSetPerBlock; ++word) {
    const uint32_t salted = key * SALT[word];
    const uint32_t bit_position = salted >> 27;
    block_mask.item[word] = UINT32_C(0x1) << bit_position;
  }
}
// Tests whether `hash` has all of its mask bits set in the filter.
//
// The upper 32 bits of `hash` select a block (masked by the number of blocks
// minus one, which relies on the block count being a power of two); the lower
// 32 bits are the key from which SetMask() derives one bit per word.
//
// \param hash 64-bit hash of the value being looked up
// \return true only if, for every word of the selected block, the
//         corresponding mask bit is set; false as soon as one is missing
bool BlockSplitBloomFilter::FindHash(uint64_t hash) const {
  const uint32_t bucket_index =
      static_cast<uint32_t>((hash >> 32) & (num_bytes_ / kBytesPerFilterBlock - 1));
  const uint32_t key = static_cast<uint32_t>(hash);
  // A lookup never modifies the bitset, so view it through a const pointer
  // obtained from the read-only accessor.
  const uint32_t* bitset32 = reinterpret_cast<const uint32_t*>(data_->data());

  // Calculate mask for bucket.
  BlockMask block_mask;
  SetMask(key, block_mask);

  for (int i = 0; i < kBitsSetPerBlock; ++i) {
    if (0 == (bitset32[kBitsSetPerBlock * bucket_index + i] & block_mask.item[i])) {
      return false;
    }
  }
  return true;
}
void BlockSplitBloomFilter::InsertHash(uint64_t hash) {
const uint32_t bucket_index =
static_cast<uint32_t>(hash >> 32) & (num_bytes_ / kBytesPerFilterBlock - 1);
uint32_t key = static_cast<uint32_t>(hash);
uint32_t* bitset32 = reinterpret_cast<uint32_t*>(data_->mutable_data());
// Calculate mask for bucket.
BlockMask block_mask;
SetMask(key, block_mask);
for (int i = 0; i < kBitsSetPerBlock; i++) {
bitset32[bucket_index * kBitsSetPerBlock + i] |= block_mask.item[i];
}
}
} // namespace parquet