forked from panda3d/panda3d
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathstringDecoder.cxx
More file actions
222 lines (199 loc) · 6.34 KB
/
stringDecoder.cxx
File metadata and controls
222 lines (199 loc) · 6.34 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
/**
* PANDA 3D SOFTWARE
* Copyright (c) Carnegie Mellon University. All rights reserved.
*
* All use of this software is subject to the terms of the revised BSD
* license. You should have received a copy of this license along
* with this source code in a file named "LICENSE."
*
* @file stringDecoder.cxx
* @author drose
* @date 2002-02-11
*/
#include "stringDecoder.h"
#include "config_dtoolutil.h"
// Stream that the decoders write their error messages to.  Starts out
// pointing at std::cerr; see set_notify_ptr() to redirect it.
std::ostream *StringDecoder::_notify_ptr{&std::cerr};
/**
 * Destructor.  There is nothing to release explicitly here.
 */
StringDecoder::
~StringDecoder() = default;
/**
 * Returns the next character in sequence.  In this base implementation each
 * input byte is returned as-is, as an unsigned value in the range 0..255.
 * Returns -1 (converted to char32_t) once the input is exhausted.
 */
char32_t StringDecoder::
get_next_character() {
  if (!test_eof()) {
    // Go through unsigned char so that bytes >= 0x80 are not sign-extended.
    const unsigned char byte = (unsigned char)_input[_p++];
    return byte;
  }
  return -1;
}
/**
 * Changes the ostream that decoding error messages are written to.
 *
 * This class sits at too low a level to use NotifyCategory (it is defined
 * before that class is), so it cannot specify a notify category of its own;
 * the output stream has to be supplied from outside instead.
 */
void StringDecoder::
set_notify_ptr(std::ostream *notify_ptr) {
  StringDecoder::_notify_ptr = notify_ptr;
}
/**
 * Returns the ostream that decoding error messages are currently written to,
 * as most recently assigned by set_notify_ptr().
 */
std::ostream *StringDecoder::
get_notify_ptr() {
  return StringDecoder::_notify_ptr;
}
/*
In UTF-8, each Unicode code point (up to 10FFFF hex) is encoded as a sequence of
one, two, three or four 8-bit bytes, depending on the value of the
character. The following table shows the format of such UTF-8 byte
sequences (where the "free bits" shown by x's in the table are
combined in the order shown, and interpreted from most significant to
least significant):
Binary format of bytes in sequence:
Number of Maximum expressible
1st byte 2nd byte 3rd byte 4th byte free bits: Unicode value:
0xxxxxxx 7 007F hex (127)
110xxxxx 10xxxxxx (5+6)=11 07FF hex (2047)
1110xxxx 10xxxxxx 10xxxxxx (4+6+6)=16 FFFF hex (65535)
11110xxx 10xxxxxx 10xxxxxx 10xxxxxx (3+6*3)=21 10FFFF hex (1114111)
The value of each individual byte indicates its UTF-8 function, as follows:
00 to 7F hex (0 to 127): first and only byte of a sequence.
80 to BF hex (128 to 191): continuing byte in a multi-byte sequence.
C2 to DF hex (194 to 223): first byte of a two-byte sequence.
E0 to EF hex (224 to 239): first byte of a three-byte sequence.
F0 to F7 hex (240 to 247): first byte of a four-byte sequence.
*/
/**
 * Decodes and returns the next character from the UTF-8 encoded input.
 *
 * Returns -1 (converted to char32_t) when the input is exhausted.  Also
 * returns -1, after writing a message to the notify stream if one is set,
 * when:
 *  - the input ends in the middle of a multi-byte sequence;
 *  - the first byte has its high bit set but is not the introductory byte of
 *    a two-, three- or four-byte sequence;
 *  - a byte that should continue the sequence is not of the form 10xxxxxx.
 *
 * In the last case the offending byte is left unconsumed, so the following
 * call begins decoding at that byte rather than having it silently folded
 * into a bogus character.
 *
 * Overlong encodings and encoded surrogate values are not rejected.
 */
char32_t StringUtf8Decoder::
get_next_character() {
  if (test_eof()) {
    // End of string reached.
    return -1;
  }

  unsigned int result = (unsigned char)_input[_p++];
  if ((result & 0x80) == 0) {
    // A 7-bit ascii value in one byte.
    return result;
  }

  // Classify the introductory byte: determine how many continuation bytes
  // must follow, and strip the length-marker bits to keep only the payload.
  int num_continuation = 0;
  if ((result & 0xe0) == 0xc0) {
    // First byte of two.
    num_continuation = 1;
    result &= 0x1f;

  } else if ((result & 0xf0) == 0xe0) {
    // First byte of three.
    num_continuation = 2;
    result &= 0x0f;

  } else if ((result & 0xf8) == 0xf0) {
    // First byte of four.
    num_continuation = 3;
    result &= 0x07;

  } else {
    // Otherwise--the high bit is set but it is not one of the introductory
    // utf-8 bytes--we have an error.
    if (_notify_ptr != nullptr) {
      (*_notify_ptr)
        << "Non utf-8 byte in string: 0x" << std::hex << result << std::dec
        << ", string is '" << _input << "'\n";
    }
    return -1;
  }

  // Each continuation byte contributes six more low-order bits.
  for (int i = 0; i < num_continuation; ++i) {
    if (test_eof()) {
      if (_notify_ptr != nullptr) {
        (*_notify_ptr)
          << "utf-8 encoded string '" << _input << "' ends abruptly.\n";
      }
      return -1;
    }

    // Peek before consuming: a byte that is not 10xxxxxx does not belong to
    // this sequence and must remain available to the next call.
    const unsigned int next = (unsigned char)_input[_p];
    if ((next & 0xc0) != 0x80) {
      if (_notify_ptr != nullptr) {
        (*_notify_ptr)
          << "Non utf-8 byte in string: 0x" << std::hex << next << std::dec
          << ", string is '" << _input << "'\n";
      }
      return -1;
    }
    ++_p;
    result = (result << 6) | (next & 0x3f);
  }

  return result;
}
/**
 * Decodes and returns the next character from the input, which is read as a
 * sequence of 16-bit units stored most-significant byte first.
 *
 * A high surrogate that is immediately followed by a low surrogate is
 * combined with it into a single value above 0xffff.  Any other unit,
 * including an unpaired surrogate, is returned unchanged.
 *
 * Returns -1 (converted to char32_t) at the end of the input.  Also returns
 * -1, after writing a message to the notify stream if one is set, when only
 * a single byte remains.
 */
char32_t StringUtf16Decoder::
get_next_character() {
  if (test_eof()) {
    return -1;
  }

  const unsigned int first_msb = (unsigned char)_input[_p++];
  if (test_eof()) {
    if (_notify_ptr != nullptr) {
      (*_notify_ptr)
        << "Unicode-encoded string has odd number of bytes.\n";
    }
    return -1;
  }
  const unsigned int first_lsb = (unsigned char)_input[_p++];
  const int unit = ((first_msb << 8) | first_lsb);

  // NOTE: a byte-swapped byte-order marker (0xfffe) gets no special
  // treatment; code that once swapped the endianness of the remainder of the
  // stream upon seeing it has been disabled, so it is returned like any
  // other unit.

  // Anything that is not a high surrogate, or that has fewer than two bytes
  // after it, cannot start a surrogate pair.
  if (unit < 0xd800 || unit >= 0xdc00 || (_p + 1) >= _input.size()) {
    return unit;
  }

  // Peek at the following unit without consuming it yet.
  const unsigned int second_msb = (unsigned char)_input[_p];
  const unsigned int second_lsb = (unsigned char)_input[_p + 1];
  const int next_unit = ((second_msb << 8) | second_lsb);

  if (next_unit < 0xdc00 || next_unit >= 0xe000) {
    // Not a low surrogate: report the unpaired high surrogate as it stands
    // and leave the following unit for the next call.
    return unit;
  }

  // A proper surrogate pair: consume the second unit and combine the two.
  _p += 2;
  return 0x10000 + ((unit - 0xd800) << 10) + (next_unit - 0xdc00);
}