forked from simdjson/simdjson-java
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathUtf8Validator.java
More file actions
261 lines (216 loc) · 13.3 KB
/
Utf8Validator.java
File metadata and controls
261 lines (216 loc) · 13.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
package org.simdjson;
import jdk.incubator.vector.*;
import java.util.Arrays;
/**
 * Validates that a byte array is well-formed UTF-8 using the JDK Vector API.
 *
 * <p>The input is processed one vector-sized chunk at a time. Chunks consisting only of ASCII take a
 * fast path. For every other chunk, each byte together with its predecessor is classified through three
 * 16-entry nibble lookup tables (see {@link LookupTable}); the bitwise AND of the three lookups leaves a
 * bit set for every error class that the byte pair matches. A second step verifies that 3- and 4-byte
 * lead bytes are followed by the required number of continuation bytes.
 */
class Utf8Validator {

    private static final VectorSpecies<Byte> VECTOR_SPECIES = StructuralIndexer.BYTE_SPECIES;

    /** Per-lane thresholds used by {@link #isIncomplete(ByteVector)}. */
    private static final ByteVector INCOMPLETE_CHECK = getIncompleteCheck();

    /*
     * Rotates the int lanes by one position towards the higher index: result lane 0 takes the last source
     * lane, result lane 1 takes source lane 0, and so on. The start index is the lane COUNT minus one
     * (length()), not the lane width in bits (elementSize()): the latter only produces the same shuffle
     * by accident, when the lane count happens to divide 32.
     */
    private static final VectorShuffle<Integer> SHIFT_FOUR_BYTES_FORWARD = VectorShuffle.iota(StructuralIndexer.INT_SPECIES,
            StructuralIndexer.INT_SPECIES.length() - 1, 1, true);

    /** Keeps the low four bits of every byte lane. */
    private static final ByteVector LOW_NIBBLE_MASK = ByteVector.broadcast(VECTOR_SPECIES, 0b0000_1111);

    /** Keeps only the most significant bit of every byte lane; that bit is clear for ASCII bytes. */
    private static final ByteVector ALL_ASCII_MASK = ByteVector.broadcast(VECTOR_SPECIES, (byte) 0b1000_0000);

    /**
     * Validate the input bytes are valid UTF8
     *
     * @param inputBytes the input bytes to validate
     * @throws JsonParsingException if the input is not valid UTF8
     */
    static void validate(byte[] inputBytes) {
        long previousIncomplete = 0;
        long errors = 0;
        int previousFourUtf8Bytes = 0;
        int idx = 0;

        // loop-invariant values, computed once
        final int loopBound = VECTOR_SPECIES.loopBound(inputBytes.length);
        final int chunkSize = VECTOR_SPECIES.vectorByteSize();
        final int lastIntLane = StructuralIndexer.INT_SPECIES.length() - 1;

        for (; idx < loopBound; idx += chunkSize) {
            ByteVector utf8Vector = ByteVector.fromArray(VECTOR_SPECIES, inputBytes, idx);
            // ASCII fast path can bypass the checks that are only required for multibyte code points
            if (isAscii(utf8Vector)) {
                // an ASCII chunk cannot complete a sequence left open by the previous chunk
                errors |= previousIncomplete;
            } else {
                previousIncomplete = isIncomplete(utf8Vector);
                errors |= multibyteErrors(utf8Vector, previousFourUtf8Bytes);
            }
            // remember the last four bytes of this chunk; the next chunk needs them as its predecessors
            previousFourUtf8Bytes = utf8Vector.reinterpretAsInts().lane(lastIntLane);
        }

        // if the input file doesn't align with the vector width, pad the missing bytes with zero
        VectorMask<Byte> remainingBytes = VECTOR_SPECIES.indexInRange(idx, inputBytes.length);
        ByteVector lastVectorChunk = ByteVector.fromArray(VECTOR_SPECIES, inputBytes, idx, remainingBytes);
        if (!isAscii(lastVectorChunk)) {
            previousIncomplete = isIncomplete(lastVectorChunk);
            errors |= multibyteErrors(lastVectorChunk, previousFourUtf8Bytes);
        }
        // when the last chunk is ASCII (or empty), previousIncomplete still holds the state of the
        // last non-ASCII chunk and is reported here

        if ((errors | previousIncomplete) != 0) {
            throw new JsonParsingException("Invalid UTF8");
        }
    }

    /**
     * Runs both multibyte checks on a chunk that contains at least one non-ASCII byte.
     *
     * @param chunk                 the chunk to check
     * @param previousFourUtf8Bytes the last four bytes of the preceding chunk packed into an int (0 for the first chunk)
     * @return the lanes of the check result that are non-zero, packed into a long; 0 if no error was found
     */
    private static long multibyteErrors(ByteVector chunk, int previousFourUtf8Bytes) {
        IntVector chunkAsInts = chunk.reinterpretAsInts();
        IntVector fourBytesPrevious = fourBytesPreviousSlice(chunk, previousFourUtf8Bytes);
        ByteVector firstCheck = firstTwoByteSequenceCheck(chunkAsInts, fourBytesPrevious);
        ByteVector secondCheck = lastTwoByteSequenceCheck(chunkAsInts, fourBytesPrevious, firstCheck);
        return secondCheck.compare(VectorOperators.NE, 0).toLong();
    }

    /* Shuffles the input forward by four bytes to make space for the previous four bytes.
       The previous three bytes are required for validation, pulling in the last integer will give the previous four bytes.
       The switch to integer vectors is to allow for integer shifting instead of the more expensive shuffle / slice operations */
    private static IntVector fourBytesPreviousSlice(ByteVector vectorChunk, int previousFourUtf8Bytes) {
        return vectorChunk.reinterpretAsInts()
                .rearrange(SHIFT_FOUR_BYTES_FORWARD)
                // lane 0 now holds the wrapped-around last lane; replace it with the previous chunk's tail
                .withLane(0, previousFourUtf8Bytes);
    }

    // works similar to previousUtf8Vector.slice(VECTOR_SPECIES.length() - numOfBytesToInclude, utf8Vector) but without the performance cost.
    // Every int lane of the input is shifted left by numOfPreviousBytes bytes and the vacated low bytes are
    // filled with the high bytes of the matching lane of fourBytesPrevious.
    // The callers in this class pass 1, 2 or 3 for numOfPreviousBytes.
    private static ByteVector previousVectorSlice(IntVector utf8Vector, IntVector fourBytesPrevious, int numOfPreviousBytes) {
        return utf8Vector
                .lanewise(VectorOperators.LSHL, Byte.SIZE * numOfPreviousBytes)
                .or(fourBytesPrevious.lanewise(VectorOperators.LSHR, Byte.SIZE * (4 - numOfPreviousBytes)))
                .reinterpretAsBytes();
    }

    /**
     * Classifies every (previous byte, current byte) pair through the three nibble lookup tables.
     *
     * @param utf8Vector        the current chunk viewed as ints
     * @param fourBytesPrevious the result of {@link #fourBytesPreviousSlice(ByteVector, int)} for the same chunk
     * @return per byte, the error bits (see {@link LookupTable}) that all three lookups agree on
     */
    private static ByteVector firstTwoByteSequenceCheck(IntVector utf8Vector, IntVector fourBytesPrevious) {
        // shift the current input forward by 1 byte to include 1 byte from the previous input
        var oneBytePrevious = previousVectorSlice(utf8Vector, fourBytesPrevious, 1);

        // high nibbles of the current input (e.g. 0xC3 >> 4 = 0xC)
        // the shift is done on int lanes, so the mask also clears the bits that leaked in from the neighbouring byte
        ByteVector byte2HighNibbles = utf8Vector.lanewise(VectorOperators.LSHR, 4)
                .reinterpretAsBytes().and(LOW_NIBBLE_MASK);
        // high nibbles of the shifted input
        ByteVector byte1HighNibbles = oneBytePrevious.reinterpretAsInts().lanewise(VectorOperators.LSHR, 4)
                .reinterpretAsBytes().and(LOW_NIBBLE_MASK);
        // low nibbles of the shifted input (e.g. 0xC3 & 0xF = 0x3)
        ByteVector byte1LowNibbles = oneBytePrevious.and(LOW_NIBBLE_MASK);

        // each nibble (0-15) is used as an index into its lookup table
        ByteVector byte1HighState = byte1HighNibbles.selectFrom(LookupTable.byte1High);
        ByteVector byte1LowState = byte1LowNibbles.selectFrom(LookupTable.byte1Low);
        ByteVector byte2HighState = byte2HighNibbles.selectFrom(LookupTable.byte2High);

        // an error bit survives only if all three nibbles are consistent with that error
        return byte1HighState.and(byte1LowState).and(byte2HighState);
    }

    // All remaining checks are invalid 3-4 byte sequences, which either have too many continuations bytes or not enough
    private static ByteVector lastTwoByteSequenceCheck(IntVector utf8Vector, IntVector fourBytesPrevious, ByteVector firstCheck) {
        // the minimum 3byte lead - 1110_0000 is always greater than the max 2byte lead - 110_11111
        ByteVector twoBytesPrevious = previousVectorSlice(utf8Vector, fourBytesPrevious, 2);
        VectorMask<Byte> is3ByteLead = twoBytesPrevious.compare(VectorOperators.UNSIGNED_GT, (byte) 0b110_11111);

        // the minimum 4byte lead - 1111_0000 is always greater than the max 3byte lead - 1110_1111
        ByteVector threeBytesPrevious = previousVectorSlice(utf8Vector, fourBytesPrevious, 3);
        VectorMask<Byte> is4ByteLead = threeBytesPrevious.compare(VectorOperators.UNSIGNED_GT, (byte) 0b1110_1111);

        // the firstCheck vector contains 0x80 values on continuation byte indexes
        // the 3/4 byte lead bytes should match up with these indexes and zero them out:
        // adding 0x80 to 0x80 overflows the byte to 0, while adding it to a lane without 0x80 sets the bit
        return firstCheck.add((byte) 0x80, is3ByteLead.or(is4ByteLead));
    }

    /* Builds the thresholds for the check that a vector does not end in an incomplete state.
       A vector ends in an incomplete state if its last byte is 0xC0 or greater (any lead byte),
       or its second last byte is 0xE0 or greater (3/4 byte lead), or its third last byte is 0xF0 or greater (4 byte lead).
       All other lanes are set to 0xFF, which only the byte 0xFF itself can reach; 0xFF is never valid in UTF-8. */
    private static ByteVector getIncompleteCheck() {
        int vectorBytes = VECTOR_SPECIES.vectorByteSize();
        byte[] eofArray = new byte[vectorBytes];
        Arrays.fill(eofArray, (byte) 255);
        eofArray[vectorBytes - 3] = (byte) 0xF0;
        eofArray[vectorBytes - 2] = (byte) 0xE0;
        eofArray[vectorBytes - 1] = (byte) 0xC0;
        return ByteVector.fromArray(VECTOR_SPECIES, eofArray, 0);
    }

    // returns the lanes whose byte is (unsigned) greater than or equal to its INCOMPLETE_CHECK threshold, packed into a long
    private static long isIncomplete(ByteVector utf8Vector) {
        return utf8Vector.compare(VectorOperators.UNSIGNED_GE, INCOMPLETE_CHECK).toLong();
    }

    // ASCII will never exceed 0111_1111, so the most significant bit of every byte must be clear
    private static boolean isAscii(ByteVector utf8Vector) {
        return utf8Vector.and(ALL_ASCII_MASK).compare(VectorOperators.EQ, 0).allTrue();
    }

    /**
     * Error classes, one bit each, and the three 16-entry lookup tables indexed by the high nibble of the
     * previous byte, the low nibble of the previous byte and the high nibble of the current byte.
     */
    private static class LookupTable {
        /* Bit 0 = Too Short (lead byte not followed by a continuation byte but by a lead/ASCII byte)
           e.g. 11______ 0_______
                11______ 11______ */
        static final byte TOO_SHORT = 1;

        /* Bit 1 = Too Long (ASCII followed by continuation byte)
           e.g. 01111111 10_000000 */
        static final byte TOO_LONG = 1 << 1;

        /* Bit 2 = Overlong 3-byte
           Any 3-byte sequence that could be represented by a shorter sequence
           Which is any sequence smaller than 1110_0000 10_100000 10_000000 */
        static final byte OVERLONG_3BYTE = 1 << 2;

        /* Bit 3 = Too Large
           Any decoded codepoint greater than U+10FFFF
           e.g. 11110_100 10_010000 10_000000 10_000000 */
        static final byte TOO_LARGE = 1 << 3;

        /* Bit 4 = Surrogate
           code points in the range of U+D800 - U+DFFF (inclusive) are the surrogates for UTF-16.
           These 2048 code points that are reserved for UTF-16 are disallowed in UTF-8
           e.g. 1110_1101 10_100000 10_000000 */
        static final byte SURROGATE = 1 << 4;

        /* Bit 5 = Overlong 2-byte
           first valid two byte sequence: 110_00010 10_000000
           anything smaller is considered overlong as it would fit into a one byte sequence / ASCII */
        static final byte OVERLONG_2BYTE = 1 << 5;

        /* Bit 6 = Too Large 1000
           Similar to TOO_LARGE, but for cases where the continuation byte's high nibble is 1000
           e.g. 11110_101 10_000000 10_000000 */
        static final byte TOO_LARGE_1000 = 1 << 6;

        /* Bit 6 = Overlong 4-byte (shares bit 6 with TOO_LARGE_1000)
           Any decoded code point at or below U+FFFF / 11110_000 10_001111 10_111111 10_111111
           e.g. 11110_000 10_000000 10_000000 10_000000 */
        static final byte OVERLONG_4BYTE = 1 << 6;

        /* Bit 7 = Two Continuations
           e.g. 10_000000 10_000000 */
        static final byte TWO_CONTINUATIONS = (byte) (1 << 7);

        private final static ByteVector byte1High = getByte1HighLookup();
        private final static ByteVector byte1Low = getByte1LowLookup();
        private final static ByteVector byte2High = getByte2HighLookup();

        // errors that are possible given the high nibble of the previous byte
        private static ByteVector getByte1HighLookup() {
            byte[] byte1HighArray = new byte[]{
                    /* ASCII high nibble = 0000 -> 0111, ie 0 -> 7 index in lookup table */
                    TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG, TOO_LONG,
                    /* Continuation high nibble = 1000 -> 1011 */
                    TWO_CONTINUATIONS, TWO_CONTINUATIONS, TWO_CONTINUATIONS, TWO_CONTINUATIONS,
                    /* Two byte lead high nibble = 1100 -> 1101 */
                    TOO_SHORT | OVERLONG_2BYTE, TOO_SHORT,
                    /* Three byte lead high nibble = 1110 */
                    TOO_SHORT | OVERLONG_3BYTE | SURROGATE,
                    /* Four byte lead high nibble = 1111 */
                    TOO_SHORT | TOO_LARGE | TOO_LARGE_1000 | OVERLONG_4BYTE
            };
            return alignArrayToVector(byte1HighArray);
        }

        // copies a lookup table into the first lanes of a full-width vector
        private static ByteVector alignArrayToVector(byte[] arrayValues) {
            // pad array with zeroes to align up with vector size
            byte[] alignedArray = new byte[VECTOR_SPECIES.vectorByteSize()];
            System.arraycopy(arrayValues, 0, alignedArray, 0, arrayValues.length);
            return ByteVector.fromArray(VECTOR_SPECIES, alignedArray, 0);
        }

        // errors that are possible given the low nibble of the previous byte
        private static ByteVector getByte1LowLookup() {
            final byte CARRY = TOO_SHORT | TOO_LONG | TWO_CONTINUATIONS;
            byte[] byte1LowArray = new byte[]{
                    /* ASCII, two Byte lead and three byte lead low nibble = 0000 -> 1111,
                     * Four byte lead low nibble = 0000 -> 0111
                     * Continuation byte low nibble is inconsequential
                     * Low nibble does not affect the states TOO_SHORT, TOO_LONG, TWO_CONTINUATIONS, so they will be carried over regardless */
                    // 0000
                    CARRY | OVERLONG_2BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE,
                    // 0001
                    CARRY | OVERLONG_2BYTE,
                    // 0010 -> 0011
                    CARRY,
                    CARRY,
                    // 1111_0100 -> 1111 = TOO_LARGE range
                    CARRY | TOO_LARGE,
                    // 0101 -> 1100
                    CARRY | TOO_LARGE | TOO_LARGE_1000,
                    CARRY | TOO_LARGE | TOO_LARGE_1000,
                    CARRY | TOO_LARGE | TOO_LARGE_1000,
                    CARRY | TOO_LARGE | TOO_LARGE_1000,
                    CARRY | TOO_LARGE | TOO_LARGE_1000,
                    CARRY | TOO_LARGE | TOO_LARGE_1000,
                    CARRY | TOO_LARGE | TOO_LARGE_1000,
                    CARRY | TOO_LARGE | TOO_LARGE_1000,
                    // 1110_1101
                    CARRY | TOO_LARGE | TOO_LARGE_1000 | SURROGATE,
                    // 1110 -> 1111
                    CARRY | TOO_LARGE | TOO_LARGE_1000,
                    CARRY | TOO_LARGE | TOO_LARGE_1000
            };
            return alignArrayToVector(byte1LowArray);
        }

        // errors that are possible given the high nibble of the current byte
        private static ByteVector getByte2HighLookup() {
            byte[] byte2HighArray = new byte[]{
                    // ASCII high nibble = 0000 -> 0111, ie 0 -> 7 index in lookup table
                    TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT,
                    // Continuation high nibble - 1000 -> 1011
                    TOO_LONG | TWO_CONTINUATIONS | OVERLONG_2BYTE | OVERLONG_3BYTE | OVERLONG_4BYTE | TOO_LARGE_1000,
                    TOO_LONG | TWO_CONTINUATIONS | OVERLONG_2BYTE | OVERLONG_3BYTE | TOO_LARGE,
                    TOO_LONG | TWO_CONTINUATIONS | OVERLONG_2BYTE | SURROGATE | TOO_LARGE,
                    TOO_LONG | TWO_CONTINUATIONS | OVERLONG_2BYTE | SURROGATE | TOO_LARGE,
                    // 1100 -> 1111 = unexpected lead byte
                    TOO_SHORT, TOO_SHORT, TOO_SHORT, TOO_SHORT
            };
            return alignArrayToVector(byte2HighArray);
        }
    }
}