Skip to content

Commit de8df0a

Browse files
jkeiser authored and lemire committed
Combined performance patch (5% overall, 15% stage 1) (simdjson#317)
* Allow -f
* Support parse -s (force sse)
* Simplify flatten_bits
  - Add directly to base instead of storing variable
  - Don't modify base_ptr after beginning of function
  - Eliminate base variable and increment base_ptr instead
* De-unroll the flatten_bits loops
* Decrease dependencies in stage 1
  - Do all finalize_structurals work before computing the quote mask; mask out the quote mask later
  - Join find_whitespace_and_structurals and finalize_structurals into single find_structurals call, to reduce variable leakage
  - Rework pseudo_pred algorithm to refer to "primitive" for clarity and some dependency reduction
  - Rename quote_mask to in_string to describe what we're trying to achieve ("mask" could mean many things)
  - Break up find_quote_mask_and_bits into find_quote_mask and invalid_string_bytes to reduce data leakage (i.e. don't expose quote bits or odd_ends at all to find_structural_bits)
  - Genericize overflow methods "follows" and "follows_odd_sequence" for descriptiveness and possible lifting into a generic simd parsing library
* Mark branches as likely/unlikely
* Reorder and unroll+interleave stage 1 loop
* Nest the cnt > 16 branch inside cnt > 8
1 parent 53b6dea commit de8df0a

File tree

13 files changed

+412
-370
lines changed

13 files changed

+412
-370
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
/jsoncheck
1515
/jsonpointer
1616
/jsonstats
17+
/integer_tests
1718
/libsimdjson.so*
1819
/minify
1920
/numberparsingcheck

benchmark/parse.cpp

Lines changed: 24 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,18 @@
3434
#include "simdjson/parsedjson.h"
3535
#include "simdjson/stage1_find_marks.h"
3636
#include "simdjson/stage2_build_tape.h"
37+
38+
// Global arguments
39+
bool find_marks_only = false;
40+
bool verbose = false;
41+
bool dump = false;
42+
bool json_output = false;
43+
bool force_one_iteration = false;
44+
bool just_data = false;
45+
bool force_sse = false;
46+
int32_t iterations = -1;
47+
int32_t warmup_iterations = -1;
48+
3749
namespace simdjson {
3850
Architecture _find_best_supported_implementation() {
3951
constexpr uint32_t haswell_flags =
@@ -43,7 +55,7 @@ Architecture _find_best_supported_implementation() {
4355
instruction_set::SSE42 | instruction_set::PCLMULQDQ;
4456
uint32_t supports = detect_supported_architectures();
4557
// Order from best to worst (within architecture)
46-
if ((haswell_flags & supports) == haswell_flags) {
58+
if ((haswell_flags & supports) == haswell_flags && !force_sse) {
4759
return Architecture::HASWELL;
4860
}
4961
if ((westmere_flags & supports) == westmere_flags) {
@@ -63,6 +75,9 @@ extern unified_functype *unified_ptr;
6375
extern stage1_functype *stage1_ptr;
6476

6577
int unified_machine_dispatch(const uint8_t *buf, size_t len, ParsedJson &pj) {
78+
if (find_marks_only) {
79+
return simdjson::SUCCESS;
80+
}
6681
Architecture best_implementation = _find_best_supported_implementation();
6782
// Selecting the best implementation
6883
switch (best_implementation) {
@@ -118,25 +133,21 @@ unified_functype *unified_ptr = &unified_machine_dispatch;
118133
} // namespace simdjson
119134

120135
int main(int argc, char *argv[]) {
121-
bool verbose = false;
122-
bool dump = false;
123-
bool json_output = false;
124-
bool force_one_iteration = false;
125-
bool just_data = false;
126-
int32_t iterations = -1;
127-
int32_t warmup_iterations = -1;
128136

129137
#ifndef _MSC_VER
130138
int c;
131139

132-
while ((c = getopt(argc, argv, "1vdtn:w:")) != -1) {
140+
while ((c = getopt(argc, argv, "1vdtn:w:fs")) != -1) {
133141
switch (c) {
134142
case 'n':
135143
iterations = atoi(optarg);
136144
break;
137145
case 'w':
138146
warmup_iterations = atoi(optarg);
139147
break;
148+
case 's':
149+
force_sse = true;
150+
break;
140151
case 't':
141152
just_data = true;
142153
break;
@@ -152,6 +163,9 @@ int main(int argc, char *argv[]) {
152163
case '1':
153164
force_one_iteration = true;
154165
break;
166+
case 'f':
167+
find_marks_only = true;
168+
break;
155169
default:
156170
abort();
157171
}
@@ -326,7 +340,7 @@ int main(int argc, char *argv[]) {
326340
isok = (simdjson::stage1_ptr((const uint8_t *)p.data(), p.size(), pj) ==
327341
simdjson::SUCCESS);
328342
isok = isok &&
329-
(simdjson::SUCCESS ==
343+
(simdjson::SUCCESS ==
330344
simdjson::unified_ptr((const uint8_t *)p.data(), p.size(), pj));
331345
auto end = std::chrono::steady_clock::now();
332346
std::chrono::duration<double> secs = end - start;

include/simdjson/common_defs.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,17 @@
1717
#define SIMDJSON_PADDING 32
1818
#endif
1919

20+
// Debug-block markers for llvm-mca analysis.
//
// BEGIN_DEBUG_BLOCK(name) / END_DEBUG_BLOCK(name) emit assembler comments that
// bracket a named region; DEBUG_BLOCK(name, block) wraps `block` in such a
// pair. Each macro already ends in ';', so call sites do not need to add one.
#if defined(__GNUC__)
// Marks a block with a name so that MCA analysis can see it.
#define BEGIN_DEBUG_BLOCK(name) __asm volatile("# LLVM-MCA-BEGIN " #name);
#define END_DEBUG_BLOCK(name) __asm volatile("# LLVM-MCA-END " #name);
#define DEBUG_BLOCK(name, block) BEGIN_DEBUG_BLOCK(name); block; END_DEBUG_BLOCK(name);
#else
// No inline-asm markers on this compiler: the begin/end markers vanish, but
// the wrapped code must still be emitted. Expanding DEBUG_BLOCK to nothing
// here would silently delete `block` from the program.
#define BEGIN_DEBUG_BLOCK(name)
#define END_DEBUG_BLOCK(name)
#define DEBUG_BLOCK(name, block) block;
#endif
30+
2031
#ifndef _MSC_VER
2132
// Implemented using Labels as Values which works in GCC and CLANG (and maybe
2233
// also in Intel's compiler), but won't work in MSVC.

scripts/checkperf.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,5 +29,5 @@ make parse
2929
make perfdiff
3030

3131
echo "Running perfdiff:"
32-
echo ./perfdiff \"$current/parse -t $perftests\" \"$reference/parse -t $perftests\"
33-
./perfdiff "$current/parse -t $perftests" "$reference/parse -t $perftests"
32+
echo ./perfdiff \"$current/parse -t $perftests $CHECKPERF_ARGS\" \"$reference/parse -t $perftests $CHECKPERF_ARGS\"
33+
./perfdiff "$current/parse -t $perftests $CHECKPERF_ARGS" "$reference/parse -t $perftests $CHECKPERF_ARGS"

src/arm64/simd_input.h

Lines changed: 27 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -40,33 +40,32 @@ using namespace simdjson::arm64;
4040

4141
template <>
4242
struct simd_input<Architecture::ARM64> {
43-
uint8x16_t chunks[4];
43+
const uint8x16_t chunks[4];
4444

45-
really_inline simd_input(const uint8_t *ptr) {
46-
this->chunks[0] = vld1q_u8(ptr + 0*16);
47-
this->chunks[1] = vld1q_u8(ptr + 1*16);
48-
this->chunks[2] = vld1q_u8(ptr + 2*16);
49-
this->chunks[3] = vld1q_u8(ptr + 3*16);
50-
}
45+
really_inline simd_input()
46+
: chunks{uint8x16_t(), uint8x16_t(), uint8x16_t(), uint8x16_t() } {}
5147

52-
really_inline simd_input(uint8x16_t chunk0, uint8x16_t chunk1, uint8x16_t chunk2, uint8x16_t chunk3) {
53-
this->chunks[0] = chunk0;
54-
this->chunks[1] = chunk1;
55-
this->chunks[2] = chunk2;
56-
this->chunks[3] = chunk3;
57-
}
48+
really_inline simd_input(const uint8x16_t chunk0, const uint8x16_t chunk1, const uint8x16_t chunk2, const uint8x16_t chunk3)
49+
: chunks{chunk0, chunk1, chunk2, chunk3 } {}
50+
51+
really_inline simd_input(const uint8_t *ptr)
52+
: chunks{
53+
vld1q_u8(ptr + 0*16),
54+
vld1q_u8(ptr + 1*16),
55+
vld1q_u8(ptr + 2*16),
56+
vld1q_u8(ptr + 3*16)
57+
} {}
5858

5959
template <typename F>
60-
really_inline void each(F const& each_chunk)
61-
{
60+
really_inline void each(F const& each_chunk) const {
6261
each_chunk(this->chunks[0]);
6362
each_chunk(this->chunks[1]);
6463
each_chunk(this->chunks[2]);
6564
each_chunk(this->chunks[3]);
6665
}
6766

6867
template <typename F>
69-
really_inline simd_input<Architecture::ARM64> map(F const& map_chunk) {
68+
really_inline simd_input<Architecture::ARM64> map(F const& map_chunk) const {
7069
return simd_input<Architecture::ARM64>(
7170
map_chunk(this->chunks[0]),
7271
map_chunk(this->chunks[1]),
@@ -76,7 +75,7 @@ struct simd_input<Architecture::ARM64> {
7675
}
7776

7877
template <typename F>
79-
really_inline simd_input<Architecture::ARM64> map(simd_input<Architecture::ARM64> b, F const& map_chunk) {
78+
really_inline simd_input<Architecture::ARM64> map(simd_input<Architecture::ARM64> b, F const& map_chunk) const {
8079
return simd_input<Architecture::ARM64>(
8180
map_chunk(this->chunks[0], b.chunks[0]),
8281
map_chunk(this->chunks[1], b.chunks[1]),
@@ -86,24 +85,31 @@ struct simd_input<Architecture::ARM64> {
8685
}
8786

8887
template <typename F>
89-
really_inline uint8x16_t reduce(F const& reduce_pair) {
88+
really_inline uint8x16_t reduce(F const& reduce_pair) const {
9089
uint8x16_t r01 = reduce_pair(this->chunks[0], this->chunks[1]);
9190
uint8x16_t r23 = reduce_pair(this->chunks[2], this->chunks[3]);
9291
return reduce_pair(r01, r23);
9392
}
9493

95-
really_inline uint64_t to_bitmask() {
94+
really_inline uint64_t to_bitmask() const {
9695
return neon_movemask_bulk(this->chunks[0], this->chunks[1], this->chunks[2], this->chunks[3]);
9796
}
9897

99-
really_inline uint64_t eq(uint8_t m) {
98+
really_inline simd_input<Architecture::ARM64> bit_or(const uint8_t m) const {
99+
const uint8x16_t mask = vmovq_n_u8(m);
100+
return this->map( [&](auto a) {
101+
return vorrq_u8(a, mask);
102+
});
103+
}
104+
105+
really_inline uint64_t eq(const uint8_t m) const {
100106
const uint8x16_t mask = vmovq_n_u8(m);
101107
return this->map( [&](auto a) {
102108
return vceqq_u8(a, mask);
103109
}).to_bitmask();
104110
}
105111

106-
really_inline uint64_t lteq(uint8_t m) {
112+
really_inline uint64_t lteq(const uint8_t m) const {
107113
const uint8x16_t mask = vmovq_n_u8(m);
108114
return this->map( [&](auto a) {
109115
return vcleq_u8(a, mask);

src/arm64/stage1_find_marks.h

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212

1313
namespace simdjson::arm64 {
1414

15-
really_inline uint64_t compute_quote_mask(uint64_t quote_bits) {
15+
really_inline uint64_t compute_quote_mask(const uint64_t quote_bits) {
1616

1717
#ifdef __ARM_FEATURE_CRYPTO // some ARM processors lack this extension
1818
return vmull_p64(-1ULL, quote_bits);
@@ -21,9 +21,9 @@ really_inline uint64_t compute_quote_mask(uint64_t quote_bits) {
2121
#endif
2222
}
2323

24-
really_inline void find_whitespace_and_structurals(
25-
simd_input<ARCHITECTURE> in, uint64_t &whitespace,
26-
uint64_t &structurals) {
24+
really_inline void find_whitespace_and_operators(
25+
const simd_input<ARCHITECTURE> in,
26+
uint64_t &whitespace, uint64_t &op) {
2727
const uint8x16_t low_nibble_mask =
2828
(uint8x16_t){16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0};
2929
const uint8x16_t high_nibble_mask =
@@ -38,9 +38,9 @@ really_inline void find_whitespace_and_structurals(
3838
return vandq_u8(shuf_lo, shuf_hi);
3939
});
4040

41-
const uint8x16_t structural_shufti_mask = vmovq_n_u8(0x7);
42-
structurals = v.map([&](auto _v) {
43-
return vtstq_u8(_v, structural_shufti_mask);
41+
const uint8x16_t operator_shufti_mask = vmovq_n_u8(0x7);
42+
op = v.map([&](auto _v) {
43+
return vtstq_u8(_v, operator_shufti_mask);
4444
}).to_bitmask();
4545

4646
const uint8x16_t whitespace_shufti_mask = vmovq_n_u8(0x18);

0 commit comments

Comments
 (0)