forked from panda3d/panda3d
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtextEncoder.I
More file actions
503 lines (462 loc) · 13.6 KB
/
textEncoder.I
File metadata and controls
503 lines (462 loc) · 13.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
/**
* PANDA 3D SOFTWARE
* Copyright (c) Carnegie Mellon University. All rights reserved.
*
* All use of this software is subject to the terms of the revised BSD
* license. You should have received a copy of this license along
* with this source code in a file named "LICENSE."
*
* @file textEncoder.I
* @author drose
* @date 2003-03-26
*/
/**
 * Constructs an encoder that holds no text and uses the current default
 * encoding.
 */
INLINE TextEncoder::
TextEncoder() :
  // With no text stored, the encoded and wide strings are both trivially
  // up to date, so both "got" bits start out set.
  _flags(F_got_text | F_got_wtext),
  _encoding(_default_encoding)
{
}
/**
 * Copy constructor.  Duplicates the validity flags, the encoding, and both
 * the encoded and wide-character strings held by the other encoder.
 */
INLINE TextEncoder::
TextEncoder(const TextEncoder &copy) :
  _flags(copy._flags),
  _encoding(copy._encoding),
  _text(copy._text),
  _wtext(copy._wtext)
{
}
/**
 * Selects the encoding used to interpret strings passed to set_text().  The
 * default, E_iso8859, means a standard string with one-byte characters (i.e.
 * ASCII).  Other encodings make it possible to use character sets with more
 * than 256 characters.
 *
 * Only future calls to set_text() are affected; text that was set previously
 * is not changed.
 */
INLINE void TextEncoder::
set_encoding(TextEncoder::Encoding encoding) {
  // Bring both stored representations up to date while the outgoing encoding
  // is still in effect, so that neither is later converted with the new one.
  (void)get_text();
  (void)get_wtext();

  _encoding = encoding;
}
/**
 * Returns the encoding currently used to interpret strings passed to
 * set_text().  See set_encoding().
 */
INLINE TextEncoder::Encoding TextEncoder::
get_encoding() const {
  return this->_encoding;
}
/**
 * Sets the encoding that every subsequently created TextEncoder starts out
 * with.  See set_encoding().
 */
INLINE void TextEncoder::
set_default_encoding(TextEncoder::Encoding encoding) {
  TextEncoder::_default_encoding = encoding;
}
/**
 * Returns the default encoding that is assigned to newly created TextEncoder
 * objects.  See set_default_encoding().
 */
INLINE TextEncoder::Encoding TextEncoder::
get_default_encoding() {
  return TextEncoder::_default_encoding;
}
/**
 * Changes the text that is stored in the encoder.  The text should be encoded
 * according to the method indicated by set_encoding().  Subsequent calls to
 * get_text() will return this same string, while get_wtext() will return the
 * decoded version of the string.
 *
 * text_changed() is invoked whenever the stored text is replaced.  The
 * replacement is skipped only when the encoder already holds non-empty text
 * and its up-to-date encoded string equals the new one.
 */
INLINE void TextEncoder::
set_text(const std::string &text) {
  // _text is only meaningful while F_got_text is set.  After set_wtext() or
  // append_wtext() it still holds an older value, so comparing against it
  // could wrongly conclude that nothing changed and drop the new text.
  const bool cached_text_valid = (_flags & F_got_text) != 0;

  if (!cached_text_valid || !has_text() || _text != text) {
    _text = text;
    // The encoded string is now authoritative; the wide string is stale.
    _flags = (_flags | F_got_text) & ~F_got_wtext;
    text_changed();
  }
}
/**
 * Variant of set_text() that takes the encoding of the supplied string
 * explicitly.  When that encoding differs from the encoder's own, the text is
 * decoded immediately and stored as a wide-character string, so later calls
 * to get_text() return it re-encoded with whichever encoding is specified by
 * set_encoding().  When the encodings match, this is the same as the
 * one-parameter set_text().
 */
INLINE void TextEncoder::
set_text(const std::string &text, TextEncoder::Encoding encoding) {
  if (encoding != _encoding) {
    // Foreign encoding: keep the decoded form instead of the raw bytes.
    set_wtext(decode_text(text, encoding));
    return;
  }

  set_text(text);
}
/**
 * Removes the text from the TextEncoder.  Both the encoded and the wide
 * strings are emptied and marked as valid, and text_changed() is invoked.
 */
INLINE void TextEncoder::
clear_text() {
  _text.clear();
  _wtext.clear();

  // Two empty strings agree with each other, so both are up to date.
  _flags |= (F_got_text | F_got_wtext);
  text_changed();
}
/**
 * Returns true if the encoder currently holds non-empty text.  The wide
 * string is consulted when it is marked valid; otherwise the encoded string
 * is consulted.
 */
INLINE bool TextEncoder::
has_text() const {
  const bool wide_is_valid = (_flags & F_got_wtext) != 0;
  return wide_is_valid ? !_wtext.empty() : !_text.empty();
}
/**
 * Returns the current text, encoded with the encoder's current encoding.  If
 * the encoded string is not up to date it is regenerated from the wide
 * string and cached first.
 */
INLINE std::string TextEncoder::
get_text() const {
  const bool text_is_stale = (_flags & F_got_text) == 0;
  if (text_is_stale) {
    // Filling the cache does not change the logical value of the object, so
    // constness is cast away to store the result.
    TextEncoder *self = const_cast<TextEncoder *>(this);
    self->_text = encode_wtext(_wtext);
    self->_flags |= F_got_text;
  }
  return _text;
}
/**
 * Returns the current text, encoded with the indicated encoding rather than
 * the encoder's own.
 */
INLINE std::string TextEncoder::
get_text(TextEncoder::Encoding encoding) const {
  const std::wstring &wide = get_wtext();
  return encode_wtext(wide, encoding);
}
/**
 * Appends the indicated string to the end of the stored text.  Appending an
 * empty string does nothing and does not invoke text_changed().
 */
INLINE void TextEncoder::
append_text(const std::string &text) {
  if (text.empty()) {
    return;
  }

  // get_text() brings the encoded string up to date before it is extended.
  _text = get_text() + text;
  // The encoded string is now authoritative; the wide string is stale.
  _flags = (_flags | F_got_text) & ~F_got_wtext;
  text_changed();
}
/**
 * Appends a single Unicode character to the end of the stored text.
 *
 * When wchar_t is wide enough to hold 0x10FFFF the character is stored
 * directly.  Otherwise a character above 0xFFFF is stored as a surrogate pair
 * of two wchar_t elements.
 */
INLINE void TextEncoder::
append_unicode_char(char32_t character) {
  // Make sure the wide string is current before extending it in place.
  get_wtext();

#if WCHAR_MAX >= 0x10FFFF
  // wchar_t might be UTF-32.
  _wtext.push_back((wchar_t)character);
#else
  if (character <= 0xffffu) {
    _wtext.push_back((wchar_t)character);
  } else {
    // Encode as a surrogate pair.
    const uint32_t offset = (uint32_t)character - 0x10000u;
    const wchar_t pair[2] = {
      (wchar_t)((offset >> 10u) | 0xd800u),
      (wchar_t)((offset & 0x3ffu) | 0xdc00u),
    };
    _wtext.append(pair, 2);
  }
#endif

  // The wide string is now authoritative; the encoded string is stale.
  _flags = (_flags | F_got_wtext) & ~F_got_text;
  text_changed();
}
/**
 * Returns the number of characters in the stored text.  This is the length
 * of the wide-character string, i.e. the count after the text has been
 * decoded according to set_encoding().
 */
INLINE size_t TextEncoder::
get_num_chars() const {
  const std::wstring &wide = get_wtext();
  return wide.size();
}
/**
 * Returns the Unicode value of the nth character in the stored text.  This
 * may be a wide character (greater than 255), after the string has been
 * decoded according to set_encoding().  Returns 0 if index is past the end
 * of the text.
 */
INLINE int TextEncoder::
get_unicode_char(size_t index) const {
  const std::wstring &wide = get_wtext();
  if (index >= wide.length()) {
    return 0;
  }
  return wide[index];
}
/**
 * Replaces the nth character in the stored text with the given Unicode
 * value.  This may be a wide character (greater than 255).  The index refers
 * to the text after it has been decoded according to set_encoding().  An
 * index past the end of the text is ignored.
 */
INLINE void TextEncoder::
set_unicode_char(size_t index, char32_t character) {
  // Make sure the wide string is current before modifying it in place.
  get_wtext();
  if (index >= _wtext.length()) {
    return;
  }

  _wtext[index] = character;
  // The encoded string no longer matches the wide string.
  _flags &= ~F_got_text;
  text_changed();
}
/**
 * Returns the nth character of the stored text as a string encoded with the
 * encoder's current encoding.
 */
INLINE std::string TextEncoder::
get_encoded_char(size_t index) const {
  const TextEncoder::Encoding current = get_encoding();
  return get_encoded_char(index, current);
}
/**
 * Returns the nth character of the stored text as a string encoded with the
 * indicated encoding.  The character is fetched with get_unicode_char().
 */
INLINE std::string TextEncoder::
get_encoded_char(size_t index, TextEncoder::Encoding encoding) const {
  const wchar_t wide_char = (wchar_t)get_unicode_char(index);
  return encode_wtext(std::wstring(1, wide_char), encoding);
}
/**
 * Returns the stored text, converted as nearly as possible to a fully-ASCII
 * representation.  This means replacing accented letters with their
 * unaccented ASCII equivalents.
 *
 * Some characters may have no ASCII equivalent: symbols such as the
 * copyright sign, letters from other alphabets such as Greek or Cyrillic, or
 * Latin letters like thorn or eth.  In that case as much of the string as
 * possible is converted, and the remaining characters stay encoded in the
 * encoding specified by set_encoding().
 */
INLINE std::string TextEncoder::
get_text_as_ascii() const {
  const std::wstring ascii_wide = get_wtext_as_ascii();
  return encode_wtext(ascii_wide);
}
/**
 * Converts a string from one encoding to another.  The text is decoded
 * using the encoding "from", then re-encoded using the encoding "to", and
 * the result is returned.  This does not change or affect any properties of
 * a TextEncoder object.
 */
INLINE std::string TextEncoder::
reencode_text(const std::string &text, TextEncoder::Encoding from,
              TextEncoder::Encoding to) {
  const std::wstring decoded = decode_text(text, from);
  return encode_wtext(decoded, to);
}
/**
 * Returns true if the indicated character is an alphabetic letter, false
 * otherwise.  This is akin to ctype's isalpha(), extended to Unicode.  A
 * character counts as alphabetic when UnicodeLatinMap classifies it as
 * upper- or lowercase; characters with no map entry are not alphabetic.
 */
INLINE bool TextEncoder::
unicode_isalpha(char32_t character) {
  const UnicodeLatinMap::Entry *found = UnicodeLatinMap::look_up(character);
  return found != nullptr &&
         (found->_char_type == UnicodeLatinMap::CT_upper ||
          found->_char_type == UnicodeLatinMap::CT_lower);
}
/**
 * Returns true if the indicated character is a numeric digit, false
 * otherwise.  This is akin to ctype's isdigit(), extended to Unicode.
 */
INLINE bool TextEncoder::
unicode_isdigit(char32_t character) {
  const UnicodeLatinMap::Entry *found = UnicodeLatinMap::look_up(character);
  if (found != nullptr) {
    // Judge a mapped character by its ASCII equivalent.  The explicit
    // comparison with 0 is necessary to prevent a VC++ warning.
    return (isdigit(found->_ascii_equiv) != 0);
  }

  // The digits aren't actually listed in the map.
  return (character >= '0' && character <= '9');
}
/**
 * Returns true if the indicated character is a punctuation mark, false
 * otherwise.  This is akin to ctype's ispunct(), extended to Unicode.
 */
INLINE bool TextEncoder::
unicode_ispunct(char32_t character) {
  const UnicodeLatinMap::Entry *found = UnicodeLatinMap::look_up(character);
  if (found != nullptr) {
    return found->_char_type == UnicodeLatinMap::CT_punct;
  }

  // Some punctuation marks aren't listed in the map; fall back on ctype for
  // characters below 128.
  return (character < 128 && ispunct(character));
}
/**
 * Returns true if the indicated character is an uppercase letter, false
 * otherwise.  This is akin to ctype's isupper(), extended to Unicode.
 * Characters with no UnicodeLatinMap entry are never uppercase.
 */
INLINE bool TextEncoder::
unicode_isupper(char32_t character) {
  const UnicodeLatinMap::Entry *found = UnicodeLatinMap::look_up(character);
  return found != nullptr &&
         found->_char_type == UnicodeLatinMap::CT_upper;
}
/**
 * Returns true if the indicated character is a whitespace character, false
 * otherwise.  This is akin to ctype's isspace(), extended to Unicode.  Only
 * the space, tab, and newline characters are recognized.
 */
INLINE bool TextEncoder::
unicode_isspace(char32_t character) {
  return character == ' ' || character == '\t' || character == '\n';
}
/**
 * Returns true if the indicated character is a lowercase letter, false
 * otherwise.  This is akin to ctype's islower(), extended to Unicode.
 * Characters with no UnicodeLatinMap entry are never lowercase.
 */
INLINE bool TextEncoder::
unicode_islower(char32_t character) {
  const UnicodeLatinMap::Entry *found = UnicodeLatinMap::look_up(character);
  return found != nullptr &&
         found->_char_type == UnicodeLatinMap::CT_lower;
}
/**
 * Returns the uppercase equivalent of the given Unicode character.  This is
 * akin to ctype's toupper(), extended to Unicode.  A character with no
 * UnicodeLatinMap entry is returned unchanged.
 */
INLINE int TextEncoder::
unicode_toupper(char32_t character) {
  const UnicodeLatinMap::Entry *found = UnicodeLatinMap::look_up(character);
  if (found != nullptr) {
    return found->_toupper_character;
  }
  return character;
}
/**
 * Returns the lowercase equivalent of the given Unicode character.  This is
 * akin to ctype's tolower(), extended to Unicode.  A character with no
 * UnicodeLatinMap entry is returned unchanged.
 */
INLINE int TextEncoder::
unicode_tolower(char32_t character) {
  const UnicodeLatinMap::Entry *found = UnicodeLatinMap::look_up(character);
  if (found != nullptr) {
    return found->_tolower_character;
  }
  return character;
}
/**
 * Converts the string to uppercase, assuming the string is encoded in the
 * default encoding.
 */
INLINE std::string TextEncoder::
upper(const std::string &source) {
  const TextEncoder::Encoding encoding = get_default_encoding();
  return upper(source, encoding);
}
/**
 * Converts the string to uppercase, assuming the string is encoded in the
 * indicated encoding.  The result is returned in that same encoding.
 */
INLINE std::string TextEncoder::
upper(const std::string &source, TextEncoder::Encoding encoding) {
  // A temporary encoder does the decode / convert / re-encode round trip.
  TextEncoder converter;
  converter.set_encoding(encoding);
  converter.set_text(source);
  converter.make_upper();
  return converter.get_text();
}
/**
 * Converts the string to lowercase, assuming the string is encoded in the
 * default encoding.
 */
INLINE std::string TextEncoder::
lower(const std::string &source) {
  const TextEncoder::Encoding encoding = get_default_encoding();
  return lower(source, encoding);
}
/**
 * Converts the string to lowercase, assuming the string is encoded in the
 * indicated encoding.  The result is returned in that same encoding.
 */
INLINE std::string TextEncoder::
lower(const std::string &source, TextEncoder::Encoding encoding) {
  // A temporary encoder does the decode / convert / re-encode round trip.
  TextEncoder converter;
  converter.set_encoding(encoding);
  converter.set_text(source);
  converter.make_lower();
  return converter.get_text();
}
/**
 * Changes the text that is stored in the encoder.  Subsequent calls to
 * get_wtext() will return this same string, while get_text() will return the
 * encoded version of the string.
 *
 * text_changed() is invoked whenever the stored text is replaced.  The
 * replacement is skipped only when the encoder already holds non-empty text
 * and its up-to-date wide string equals the new one.
 */
INLINE void TextEncoder::
set_wtext(const std::wstring &wtext) {
  // _wtext is only meaningful while F_got_wtext is set.  After set_text() or
  // append_text() it still holds an older value, so comparing against it
  // could wrongly conclude that nothing changed and drop the new text.
  const bool cached_wtext_valid = (_flags & F_got_wtext) != 0;

  if (!cached_wtext_valid || !has_text() || _wtext != wtext) {
    _wtext = wtext;
    // The wide string is now authoritative; the encoded string is stale.
    _flags = (_flags | F_got_wtext) & ~F_got_text;
    text_changed();
  }
}
/**
 * Returns the text associated with the TextEncoder, as a wide-character
 * string.  If the wide string is not up to date it is regenerated from the
 * encoded string and cached first.
 */
INLINE const std::wstring &TextEncoder::
get_wtext() const {
  const bool wtext_is_stale = (_flags & F_got_wtext) == 0;
  if (wtext_is_stale) {
    // Filling the cache does not change the logical value of the object, so
    // constness is cast away to store the result.
    TextEncoder *self = const_cast<TextEncoder *>(this);
    self->_wtext = decode_text(_text);
    self->_flags |= F_got_wtext;
  }
  return _wtext;
}
/**
 * Appends the indicated string to the end of the stored wide-character text.
 * Appending an empty string does nothing and does not invoke text_changed().
 */
INLINE void TextEncoder::
append_wtext(const std::wstring &wtext) {
  if (wtext.empty()) {
    return;
  }

  // get_wtext() brings the wide string up to date before it is extended.
  _wtext = get_wtext() + wtext;
  // The wide string is now authoritative; the encoded string is stale.
  _flags = (_flags | F_got_wtext) & ~F_got_text;
  text_changed();
}
/**
 * Encodes a wide-character string into a single-char string, using the
 * encoder's current encoding.
 */
INLINE std::string TextEncoder::
encode_wtext(const std::wstring &wtext) const {
  const TextEncoder::Encoding current = _encoding;
  return encode_wtext(wtext, current);
}
/**
 * Decodes the given encoded string into a wide-character string, using the
 * encoder's current encoding.
 */
INLINE std::wstring TextEncoder::
decode_text(const std::string &text) const {
  const TextEncoder::Encoding current = _encoding;
  return decode_text(text, current);
}
/**
 * Writes a wide-character string to a narrow output stream.  The string is
 * encoded by a default-constructed TextEncoder, which uses the current
 * default encoding.
 */
INLINE std::ostream &
operator << (std::ostream &out, const std::wstring &str) {
  TextEncoder converter;
  converter.set_wtext(str);
  return out << converter.get_text();
}