-
Notifications
You must be signed in to change notification settings - Fork 507
Expand file tree
/
Copy pathUtf8Utils.java
More file actions
107 lines (98 loc) · 3.1 KB
/
Utf8Utils.java
File metadata and controls
107 lines (98 loc) · 3.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.orc.impl;
import java.nio.charset.StandardCharsets;
/**
 * Static helper methods for inspecting UTF-8 encoded byte arrays.
 *
 * @since 1.6.0
 */
public final class Utf8Utils {

  /** Mask selecting the two high-order bits of a byte. */
  private static final int HIGH_BITS_MASK = 0xC0;

  /** High-order bit pattern ({@code 10xxxxxx}) of a UTF-8 continuation byte. */
  private static final int CONTINUATION_BITS = 0x80;

  /**
   * Counts the characters stored in {@code data[offset:offset+length]} by
   * counting every byte that is not a UTF-8 continuation byte. The input is
   * not validated.
   *
   * @param data the utf-8 encoded bytes
   * @param offset the index of the first byte to examine
   * @param length the number of bytes to examine
   * @return the number of character start bytes found in the range
   */
  public static int charLength(byte[] data, int offset, int length) {
    int count = 0;
    for (int idx = 0; idx < length; ++idx) {
      count += isUtfStartByte(data[offset + idx]) ? 1 : 0;
    }
    return count;
  }

  /**
   * Computes how many bytes of {@code data[offset:offset+length]} must be
   * kept so that at most {@code maxCharLength} characters are read, each of
   * them in full. The bytes are not validated as utf-8; the result is only
   * meaningful for input that already is valid utf-8.
   *
   * @param maxCharLength the maximum number of characters to keep
   * @param data the utf-8 encoded bytes
   * @param offset the index of the first byte of the string
   * @param length the number of bytes in the string
   * @return the number of bytes, counted from {@code offset}, to keep;
   *         {@code length} when the whole range fits
   */
  public static int truncateBytesTo(int maxCharLength, byte[] data, int offset, int length) {
    // A character needs at least one byte, so a range this short can never
    // hold more than maxCharLength characters.
    if (length <= maxCharLength) {
      return length;
    }
    int seen = 0;
    for (int consumed = 0; consumed < length; ++consumed) {
      seen += isUtfStartByte(data[offset + consumed]) ? 1 : 0;
      // Checked on every byte, not only on start bytes.
      if (maxCharLength < seen) {
        // The byte at this position opens the first character over the
        // limit, so everything before it is kept.
        return consumed;
      }
    }
    // The limit was never exceeded: the whole range fits.
    return length;
  }

  /**
   * Tells whether {@code b} can be the first byte of a UTF-8 character,
   * i.e. whether it is anything other than a {@code 10xxxxxx} continuation
   * byte.
   *
   * @param b the byte to test
   * @return {@code false} for a continuation byte, {@code true} otherwise
   */
  public static boolean isUtfStartByte(byte b) {
    final boolean continuation = (b & HIGH_BITS_MASK) == CONTINUATION_BITS;
    return !continuation;
  }

  /**
   * Locates the start byte of the last character that begins at or before
   * {@code until}, scanning backwards and never going below {@code from}.
   *
   * @param text the bytes of the utf-8
   * @param from the first byte location (inclusive lower bound of the scan)
   * @param until the last byte location (inclusive, where the scan begins)
   * @return the index of the start byte of the last character
   * @throws IllegalArgumentException if no start byte exists between
   *         {@code from} and {@code until}
   */
  public static int findLastCharacter(byte[] text, int from, int until) {
    for (int cursor = until; cursor >= from; cursor--) {
      if (isUtfStartByte(text[cursor])) {
        return cursor;
      }
    }
    // Only continuation bytes (or an empty range) were seen.
    throw new IllegalArgumentException(
        "Could not truncate string, beginning of a valid char not found");
  }

  /**
   * Decodes {@code len} bytes of {@code source}, beginning at {@code from},
   * as UTF-8 and returns the first code point of the decoded text.
   *
   * @param source the bytes of the string
   * @param from the offset to start at
   * @param len the number of bytes in the character
   * @return the code point
   */
  public static int getCodePoint(byte[] source, int from, int len) {
    final String decoded = new String(source, from, len, StandardCharsets.UTF_8);
    return decoded.codePointAt(0);
  }
}