-
Notifications
You must be signed in to change notification settings - Fork 60
Expand file tree
/
Copy pathEncoding.java
More file actions
1900 lines (1726 loc) · 72.8 KB
/
Copy pathEncoding.java
File metadata and controls
1900 lines (1726 loc) · 72.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
package org.python.core.stringlib;
import com.google.common.base.CharMatcher;
import com.google.common.base.Joiner;
import com.google.common.base.Predicate;
import com.google.common.collect.Lists;
import com.ibm.icu.lang.UCharacter;
import org.python.core.BufferProtocol;
import org.python.core.Py;
import org.python.core.PyBUF;
import org.python.core.PyBuffer;
import org.python.core.PyBytes;
import org.python.core.PyComplex;
import org.python.core.PyException;
import org.python.core.PyInteger;
import org.python.core.PyList;
import org.python.core.PyLong;
import org.python.core.PyObject;
import org.python.core.PySystemState;
import org.python.core.PyTuple;
import org.python.core.PyUnicode;
import org.python.core.codecs;
import org.python.modules.sys.SysModule;
import java.math.BigInteger;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* Helper methods for unicode encoding shared between bytes-like object and str
*/
public class Encoding {
/** Lower-case hexadecimal digits, indexed by nibble value (0-15); shared by the escape encoders. */
private static final char[] hexdigit = "0123456789abcdef".toCharArray();
/**
 * Produce an escaped, optionally quoted, rendering of a UTF-16 string.
 * <p>
 * Backslashes (and the chosen quote character, when quoting) are backslash-escaped. Valid
 * surrogate pairs are emitted verbatim when {@code UCharacter.isPrintable} accepts the code
 * point and as <code>\UXXXXXXXX</code> otherwise. Other chars at or above 256 that are not
 * printable become <code>\\uXXXX</code>; tab, newline and carriage return become
 * <code>\t</code>, <code>\n</code>, <code>\r</code>; remaining chars below space, and DEL,
 * become <code>\xNN</code>. Everything else is copied unchanged.
 *
 * @param str text to escape
 * @param use_quotes whether to wrap the result in quotes; a double quote is used only when
 *            the text contains a single quote and no double quote
 * @return the escaped text
 */
public static String encode_UnicodeEscape(String str, boolean use_quotes) {
    final int length = str.length();
    StringBuilder out = new StringBuilder(length);

    char quote = 0;
    if (use_quotes) {
        boolean preferDouble = str.indexOf('\'') >= 0 && str.indexOf('"') == -1;
        quote = preferDouble ? '"' : '\'';
        out.append(quote);
    }

    int pos = 0;
    while (pos < length) {
        char ch = str.charAt(pos++);

        // Backslash and the active quote character get a backslash prefix
        if (ch == '\\' || (use_quotes && ch == quote)) {
            out.append('\\').append(ch);
            continue;
        }

        // A high surrogate followed by a low surrogate forms one code point
        if (pos < length && Character.isHighSurrogate(ch)) {
            char next = str.charAt(pos);
            if (Character.isLowSurrogate(next)) {
                pos++;
                int ucs = Character.toCodePoint(ch, next);
                if (UCharacter.isPrintable(ucs)) {
                    out.appendCodePoint(ucs);
                } else {
                    out.append("\\U");
                    for (int shift = 28; shift >= 0; shift -= 4) {
                        out.append(hexdigit[(ucs >> shift) & 0xf]);
                    }
                }
                continue;
            }
            // Otherwise the surrogate is isolated and is handled like any other char below
        }

        if (ch >= 256 && !UCharacter.isPrintable(ch)) {
            // Non-printable 16-bit character: four hex digits
            out.append("\\u");
            for (int shift = 12; shift >= 0; shift -= 4) {
                out.append(hexdigit[(ch >> shift) & 0xf]);
            }
            continue;
        }

        switch (ch) {
            case '\t':
                out.append("\\t");
                break;
            case '\n':
                out.append("\\n");
                break;
            case '\r':
                out.append("\\r");
                break;
            default:
                if (ch < ' ' || ch == 127) {
                    // Remaining control characters: two hex digits
                    out.append("\\x");
                    out.append(hexdigit[(ch >> 4) & 0xf]);
                    out.append(hexdigit[ch & 0xf]);
                } else {
                    out.append(ch);
                }
                break;
        }
    }

    if (use_quotes) {
        out.append(quote);
    }
    return out.toString();
}
/**
 * Decode backslash escapes in the range {@code [start, end)} of {@code str}.
 * <p>
 * Characters other than backslash are copied unchanged. Recognised escapes are the
 * single-character ones (<code>\\ \' \" \b \f \t \n \r \v \a</code>), backslash-newline (which
 * produces nothing), up to three octal digits, <code>\xXX</code>, and, only when
 * {@code unicode} is true, <code>\\uXXXX</code>, <code>\UXXXXXXXX</code> and
 * <code>\N{name}</code>. When {@code unicode} is false those three are copied through as a
 * backslash plus the letter. An unrecognised escape is copied through with its backslash.
 * <p>
 * Malformed escapes are reported through
 * {@code codecs.insertReplacementAndGetResume}, whose return value is used as the index at
 * which decoding resumes.
 *
 * @param str text containing escapes
 * @param start index of the first character to decode
 * @param end index one past the last character to decode
 * @param errors error-handling policy name handed to the codec error machinery
 * @param unicode whether the <code>\\u</code>, <code>\U</code> and <code>\N</code> escapes are
 *            interpreted
 * @return the decoded text
 */
public static String decode_UnicodeEscape(String str, int start, int end, String errors,
        boolean unicode) {
    StringBuilder v = new StringBuilder(end - start);
    for (int s = start; s < end; ) {
        char ch = str.charAt(s);
        /* Non-escape characters are interpreted as Unicode ordinals */
        if (ch != '\\') {
            v.append(ch);
            s++;
            continue;
        }
        int loopStart = s;
        /* \ - Escapes */
        s++;
        if (s == end) {
            s = codecs.insertReplacementAndGetResume(v, errors, "unicodeescape", //
                    str, loopStart, s + 1, "\\ at end of string");
            continue;
        }
        ch = str.charAt(s++);
        switch (ch) {
            /* Backslash-newline is a line continuation: emit nothing */
            case '\n':
                break;
            case '\\':
                v.append('\\');
                break;
            case '\'':
                v.append('\'');
                break;
            case '\"':
                v.append('\"');
                break;
            case 'b':
                v.append('\b');
                break;
            case 'f':
                v.append('\014');
                break; /* FF */
            case 't':
                v.append('\t');
                break;
            case 'n':
                v.append('\n');
                break;
            case 'r':
                v.append('\r');
                break;
            case 'v':
                v.append('\013');
                break; /* VT */
            case 'a':
                v.append('\007');
                break; /* BEL, not classic C */
            /* \OOO (octal) escapes: the digit just read plus at most two more */
            case '0':
            case '1':
            case '2':
            case '3':
            case '4':
            case '5':
            case '6':
            case '7':
                int x = Character.digit(ch, 8);
                for (int j = 0; j < 2 && s < end; j++, s++) {
                    ch = str.charAt(s);
                    if (ch < '0' || ch > '7') {
                        break;
                    }
                    x = (x << 3) + Character.digit(ch, 8);
                }
                v.append((char) x);
                break;
            case 'x':
                s = hexescape(v, errors, 2, s, str, end, "truncated \\xXX");
                break;
            case 'u':
                if (!unicode) {
                    v.append('\\');
                    v.append('u');
                    break;
                }
                s = hexescape(v, errors, 4, s, str, end, "truncated \\uXXXX");
                break;
            case 'U':
                if (!unicode) {
                    v.append('\\');
                    v.append('U');
                    break;
                }
                s = hexescape(v, errors, 8, s, str, end, "truncated \\UXXXXXXXX");
                break;
            case 'N':
                if (!unicode) {
                    v.append('\\');
                    v.append('N');
                    break;
                }
                /*
                 * Named character: expect "{name}". Guard the look-ahead so that a "\N" that
                 * is the last thing in the range does not read at or beyond end.
                 */
                if (s < end && str.charAt(s) == '{') {
                    int startName = s + 1;
                    int endBrace = str.indexOf('}', startName);
                    // A closing brace only counts if it lies inside the range being decoded
                    if (endBrace != -1 && endBrace < end) {
                        int value = UCharacter.getCharFromName(str.substring(startName, endBrace));
                        if (storeUnicodeCharacter(value, v)) {
                            s = endBrace + 1;
                        } else {
                            s = codecs.insertReplacementAndGetResume( //
                                    v, errors, "unicodeescape", //
                                    str, loopStart, endBrace + 1, "illegal Unicode character");
                        }
                    } else {
                        // Unterminated name: the faulty span runs to the end of the range
                        // (previously -1 was passed here as the span end).
                        s = codecs.insertReplacementAndGetResume(v, errors, "unicodeescape", //
                                str, loopStart, end, "malformed \\N character escape");
                    }
                } else {
                    // No opening brace; never report a span that extends past end
                    s = codecs.insertReplacementAndGetResume(v, errors, "unicodeescape", //
                            str, loopStart, Math.min(s + 1, end),
                            "malformed \\N character escape");
                }
                break;
            default:
                // Unknown escape: keep the backslash and the character that followed it
                v.append('\\');
                v.append(str.charAt(s - 1));
                break;
        }
    }
    return v.toString();
}
/**
 * Decode a fixed-width hexadecimal escape and append the resulting code point.
 * <p>
 * Only the ASCII digits <code>0-9</code>, <code>a-f</code> and <code>A-F</code> are accepted.
 * (<code>Character.digit</code> alone would also accept non-ASCII digits such as the
 * full-width forms, which previously passed validation and were then converted with
 * arithmetic that is only valid for ASCII.)
 *
 * @param partialDecode output being built; receives the decoded character or the replacement
 * @param errors error-handling policy name handed to the codec error machinery
 * @param digits number of hex digits required
 * @param hexDigitStart index in {@code str} of the first hex digit (two past the backslash)
 * @param str text being decoded
 * @param size index one past the last usable character of {@code str}
 * @param errorMessage message to report when the escape is truncated or has a bad digit
 * @return index at which the caller should resume decoding
 */
private static int hexescape(StringBuilder partialDecode, String errors, int digits,
        int hexDigitStart, String str, int size, String errorMessage) {
    int i = 0;
    int x = 0;
    for (; i < digits; ++i) {
        int index = hexDigitStart + i;
        if (index >= size) {
            // Ran out of input before reading all the digits
            return codecs.insertReplacementAndGetResume(partialDecode, errors, "unicodeescape",
                    str, hexDigitStart - 2, size, errorMessage);
        }
        char c = str.charAt(index);
        int d = (c < 128) ? Character.digit(c, 16) : -1;
        if (d == -1) {
            // Existing behaviour: resume one character before the index the handler returns
            return codecs.insertReplacementAndGetResume(partialDecode, errors, "unicodeescape",
                    str, hexDigitStart - 2, index + 1, errorMessage) - 1;
        }
        x = (x << 4) | d;
    }
    if (storeUnicodeCharacter(x, partialDecode)) {
        return hexDigitStart + i;
    } else {
        return codecs.insertReplacementAndGetResume(partialDecode, errors, "unicodeescape",
                str, hexDigitStart - 2, hexDigitStart + i + 1, "illegal Unicode character");
    }
}
/**
 * Append a code point to the output if it lies in {@code 0..SysModule.MAXUNICODE}.
 * The value is an {@code int} because it may be outside the range of a {@code char}.
 *
 * @param value candidate code point
 * @param partialDecode output to append to
 * @return true if the code point was appended, false if it was out of range
 */
private static boolean storeUnicodeCharacter(int value, StringBuilder partialDecode) {
    boolean inRange = value >= 0 && value <= SysModule.MAXUNICODE;
    if (inRange) {
        partialDecode.appendCodePoint(value);
    }
    return inRange;
}
/**
 * Return {@code s} with occurrences of {@code oldPiece} replaced by {@code newPiece}, up to a
 * maximum of {@code count} occurrences, or all of them when {@code count} is negative.
 * <p>
 * An empty {@code oldPiece} matches between characters and at both ends, so a non-empty
 * {@code newPiece} is interleaved with the characters of {@code s}. For an empty {@code s},
 * {@code newPiece} is returned only when {@code oldPiece} is empty and {@code count} is
 * negative; otherwise {@code s} itself is returned.
 *
 * @param s string in which to replace
 * @param oldPiece text to replace where found (matched literally)
 * @param newPiece replacement text
 * @param count maximum number of replacements to make, or negative meaning all of them
 * @return the string after replacements
 */
public static final String _replace(String s, String oldPiece, String newPiece, int count) {
    final int len = s.length();
    final int oldLen = oldPiece.length();
    final int newLen = newPiece.length();

    if (len == 0) {
        return (count < 0 && oldLen == 0) ? newPiece : s;
    }

    if (oldLen == 0 && newLen != 0 && count != 0) {
        // Empty target, non-empty replacement: one copy before the text, then one after each
        // character until the allowed number of insertions is used up.
        StringBuilder buffer = new StringBuilder(newPiece);
        int pos = 0;
        while (pos < len && (count < 0 || pos < count - 1)) {
            buffer.append(s.charAt(pos)).append(newPiece);
            pos++;
        }
        return buffer.append(s.substring(pos)).toString();
    }

    // General case: split on the literal target and re-join with the replacement
    int limit = count;
    if (limit < 0) {
        limit = (oldLen == 0) ? len + 1 : len;
    }
    String[] parts = Pattern.compile(oldPiece, Pattern.LITERAL).split(s, limit + 1);
    return Joiner.on(newPiece).join(parts);
}
/**
 * Test whether a sequence is non-empty and consists entirely of lower-case characters, as
 * judged by {@link Character#isLowerCase(int)}.
 * <p>
 * The text is examined by code point, so a lower-case letter encoded as a surrogate pair
 * counts as lower case. An isolated surrogate does not.
 *
 * @param s text to examine
 * @return true if {@code s} is non-empty and every code point is lower case
 */
public static final boolean isLowercase(CharSequence s) {
    final int n = s.length();
    if (n == 0) {
        return false;
    }
    for (int i = 0; i < n; ) {
        int cp = Character.codePointAt(s, i);
        if (!Character.isLowerCase(cp)) {
            return false;
        }
        i += Character.charCount(cp);
    }
    return true;
}
/**
 * Test whether a sequence is non-empty and consists entirely of upper-case characters, as
 * judged by {@link Character#isUpperCase(int)}.
 * <p>
 * The text is examined by code point, so an upper-case letter encoded as a surrogate pair
 * counts as upper case. An isolated surrogate does not.
 *
 * @param s text to examine
 * @return true if {@code s} is non-empty and every code point is upper case
 */
public static final boolean isUppercase(CharSequence s) {
    final int n = s.length();
    if (n == 0) {
        return false;
    }
    for (int i = 0; i < n; ) {
        int cp = Character.codePointAt(s, i);
        if (!Character.isUpperCase(cp)) {
            return false;
        }
        i += Character.charCount(cp);
    }
    return true;
}
/**
 * Test whether a sequence is non-empty and consists entirely of letters, as judged by
 * {@link Character#isLetter(int)}.
 * <p>
 * The text is examined by code point, so a letter encoded as a surrogate pair counts as a
 * letter. An isolated surrogate does not.
 *
 * @param s text to examine
 * @return true if {@code s} is non-empty and every code point is a letter
 */
public static final boolean isAlpha(CharSequence s) {
    final int n = s.length();
    if (n == 0) {
        return false;
    }
    for (int i = 0; i < n; ) {
        int cp = Character.codePointAt(s, i);
        if (!Character.isLetter(cp)) {
            return false;
        }
        i += Character.charCount(cp);
    }
    return true;
}
/**
 * Test whether a sequence is non-empty and consists entirely of letters and digits, as judged
 * by {@link Character#isLetterOrDigit(int)}.
 * <p>
 * The text is examined by code point, so letters and digits encoded as surrogate pairs are
 * recognised. An isolated surrogate is not.
 *
 * @param s text to examine
 * @return true if {@code s} is non-empty and every code point is a letter or digit
 */
public static final boolean isAlnum(CharSequence s) {
    final int n = s.length();
    if (n == 0) {
        return false;
    }
    for (int i = 0; i < n; ) {
        int cp = Character.codePointAt(s, i);
        if (!Character.isLetterOrDigit(cp)) {
            return false;
        }
        i += Character.charCount(cp);
    }
    return true;
}
/**
 * Test whether a sequence is non-empty and consists entirely of characters whose general
 * category is {@link Character#DECIMAL_DIGIT_NUMBER}.
 * <p>
 * The text is examined by code point with a plain loop: no matcher object is allocated and no
 * character is boxed, and decimal digits encoded as surrogate pairs are recognised.
 *
 * @param s text to examine
 * @return true if {@code s} is non-empty and every code point is a decimal digit
 */
public static final boolean isDecimal(CharSequence s) {
    final int n = s.length();
    if (n == 0) {
        return false;
    }
    for (int i = 0; i < n; ) {
        int cp = Character.codePointAt(s, i);
        if (Character.getType(cp) != Character.DECIMAL_DIGIT_NUMBER) {
            return false;
        }
        i += Character.charCount(cp);
    }
    return true;
}
/**
 * Test whether a sequence is non-empty and every char in it is matched by
 * {@code CharMatcher.DIGIT}.
 *
 * @param s text to examine
 * @return true if {@code s} is non-empty and consists only of matching chars
 */
public static final boolean isDigit(CharSequence s) {
    if (s.length() == 0) {
        return false;
    }
    return CharMatcher.DIGIT.matchesAllOf(s);
}
/**
 * Test whether a sequence is non-empty and consists entirely of characters whose general
 * category is one of {@link Character#DECIMAL_DIGIT_NUMBER}, {@link Character#LETTER_NUMBER}
 * or {@link Character#OTHER_NUMBER}.
 * <p>
 * The text is examined by code point with a plain loop: no matcher object is allocated and no
 * character is boxed, and numeric characters encoded as surrogate pairs are recognised.
 *
 * @param s text to examine
 * @return true if {@code s} is non-empty and every code point is in a number category
 */
public static final boolean isNumeric(CharSequence s) {
    final int n = s.length();
    if (n == 0) {
        return false;
    }
    for (int i = 0; i < n; ) {
        int cp = Character.codePointAt(s, i);
        int type = Character.getType(cp);
        if (type != Character.DECIMAL_DIGIT_NUMBER && type != Character.LETTER_NUMBER
                && type != Character.OTHER_NUMBER) {
            return false;
        }
        i += Character.charCount(cp);
    }
    return true;
}
/**
 * Test whether a sequence is title-cased: each run of cased characters begins with an upper-
 * or title-case character and continues only with lower-case characters, and there is at
 * least one cased character. Uncased characters separate runs.
 * <p>
 * A single-character sequence is title-cased exactly when that character is upper or title
 * case. The test works on individual {@code char}s.
 *
 * @param s text to examine
 * @return true if {@code s} is title-cased
 */
public static final boolean isTitle(CharSequence s) {
    final int length = s.length();

    // One character: no run structure to check
    if (length == 1) {
        char only = s.charAt(0);
        return Character.isTitleCase(only) || Character.isUpperCase(only);
    }

    boolean sawCased = false;
    boolean insideRun = false;
    for (int pos = 0; pos < length; pos++) {
        char c = s.charAt(pos);
        boolean startsRun = Character.isUpperCase(c) || Character.isTitleCase(c);
        if (!startsRun && !Character.isLowerCase(c)) {
            // Uncased character ends any current run
            insideRun = false;
            continue;
        }
        // An upper/title char must not be inside a run; a lower-case char must be
        if (startsRun == insideRun) {
            return false;
        }
        insideRun = true;
        sawCased = true;
    }
    return sawCased;
}
/**
 * Test whether a sequence is non-empty and every char in it is matched by
 * {@code CharMatcher.WHITESPACE}.
 *
 * @param s text to examine
 * @return true if {@code s} is non-empty and consists only of matching chars
 */
public static final boolean isSpace(CharSequence s) {
    if (s.length() == 0) {
        return false;
    }
    return CharMatcher.WHITESPACE.matchesAllOf(s);
}
/**
 * Format a character sequence according to a format specification object.
 * <p>
 * The specification is parsed with {@code InternalFormat.fromText}, a {@link TextFormatter}
 * is obtained from {@link #prepareFormatter(InternalFormat.Spec)}, and the formatted, padded
 * result is returned via {@code getPyResult()}.
 *
 * @param s text to format
 * @param formatSpec format specification to parse
 * @param bytes value passed to {@code TextFormatter.setBytes}
 * @return the formatted result
 * @throws PyException if the specification's type code is not a string format type (raised
 *             via {@code InternalFormat.Formatter.unknownFormat})
 */
public static PyObject format(CharSequence s, PyObject formatSpec, boolean bytes) {
    InternalFormat.Spec spec = InternalFormat.fromText(formatSpec, "__format__");
    TextFormatter formatter = prepareFormatter(spec);
    if (formatter != null) {
        formatter.setBytes(bytes);
        formatter.format(s);
        return formatter.pad().getPyResult();
    }
    // prepareFormatter did not recognise the type code
    throw InternalFormat.Formatter.unknownFormat(spec.type, "string");
}
/**
 * Build a {@link TextFormatter} from a parsed format specification, for use by both
 * {@link PyBytes} and {@link PyUnicode}.
 * <p>
 * Only the type codes {@code InternalFormat.Spec.NONE} and {@code 's'} are accepted. For
 * those, the specification is rejected if it requests grouping, a sign, the alternate form or
 * {@code '='} alignment; otherwise missing parts are filled in from
 * {@code InternalFormat.Spec.STRING}.
 *
 * @param spec a parsed format specification
 * @return a formatter ready to use, or null if the type is not a string format type
 * @throws PyException if the specification contains a part not allowed for strings
 */
@SuppressWarnings("fallthrough")
public static final TextFormatter prepareFormatter(InternalFormat.Spec spec) throws PyException {
    switch (spec.type) {
        case InternalFormat.Spec.NONE:
        case 's':
            // Reject the parts of a specification that make no sense for text
            if (spec.grouping) {
                throw InternalFormat.Formatter.notAllowed("Grouping", "string", spec.type);
            }
            if (InternalFormat.Spec.specified(spec.sign)) {
                throw InternalFormat.Formatter.signNotAllowed("string", '\0');
            }
            if (spec.alternate) {
                throw InternalFormat.Formatter.alternateFormNotAllowed("string");
            }
            if (spec.align == '=') {
                throw InternalFormat.Formatter.alignmentNotAllowed('=', "string");
            }
            // Fill any gaps with the usual string defaults and build the formatter
            return new TextFormatter(spec.withDefaults(InternalFormat.Spec.STRING));

        default:
            // Not a string type code
            return null;
    }
}
/**
 * Interpret slice-style start and end arguments against a sequence of length {@code len}.
 * <p>
 * A {@code null} or {@code Py.None} start means 0 and a {@code null} or {@code Py.None} end
 * means {@code len}. Negative values are taken relative to the end by adding {@code len}.
 * The result has four elements:
 * <ul>
 * <li>[0] start, clamped to lie in {@code 0..[1]};</li>
 * <li>[1] end, clamped to lie in {@code 0..len};</li>
 * <li>[2] start after end-relative adjustment but before clamping;</li>
 * <li>[3] end after end-relative adjustment but before clamping.</li>
 * </ul>
 *
 * @param s the sequence being indexed (not consulted here)
 * @param startObj start index, {@code null} or {@code Py.None}
 * @param endObj end index, {@code null} or {@code Py.None}
 * @param len length against which to interpret the indices
 * @return the four-element array described above
 */
public static final int[] translateIndices(CharSequence s, PyObject startObj, PyObject endObj, int len) {
    final int n = len;

    // Start: adjust for "from the end", clamp later once the end is known
    int rawStart = 0;
    if (startObj != null && startObj != Py.None) {
        rawStart = startObj.asIndex(null);
        if (rawStart < 0) {
            rawStart = n + rawStart;
        }
    }

    // End: adjusted value and a version safe for use as a substring bound
    int rawEnd;
    int safeEnd;
    if (endObj == null || endObj == Py.None) {
        rawEnd = n;
        safeEnd = n;
    } else {
        int given = endObj.asIndex(null);
        if (given < 0) {
            rawEnd = given + n;
            safeEnd = (rawEnd < 0) ? 0 : rawEnd;
        } else {
            rawEnd = given;
            safeEnd = (given > n) ? n : given;
        }
    }

    // Start must not be negative nor exceed the (safe) end
    int safeStart;
    if (rawStart < 0) {
        safeStart = 0;
    } else if (rawStart > safeEnd) {
        safeStart = safeEnd;
    } else {
        safeStart = rawStart;
    }

    return new int[] {safeStart, safeEnd, rawStart, rawEnd};
}
/**
 * Extract a slice from a character sequence.
 * <p>
 * With a step of 1 the result is {@code s.subSequence(start, stop)}, where a {@code stop}
 * before {@code start} is treated as equal to {@code start}. For any other step, exactly
 * {@code sliceLength} characters are taken beginning at {@code start} and advancing by
 * {@code step} each time; {@code stop} is not consulted.
 *
 * @param s source sequence
 * @param start index of the first character
 * @param stop index one past the last character (used only when {@code step} is 1)
 * @param step distance between selected characters
 * @param sliceLength number of characters to select when {@code step} is not 1
 * @return the selected characters
 */
public static final CharSequence getslice(CharSequence s, int start, int stop, int step, int sliceLength) {
    if (step == 1) {
        // Contiguous: an inverted range collapses to empty at start
        int upper = (stop < start) ? start : stop;
        return s.subSequence(start, upper);
    }

    char[] picked = new char[sliceLength];
    int source = start;
    for (int dest = 0; dest < picked.length; dest++) {
        picked[dest] = s.charAt(source);
        source += step;
    }
    return String.valueOf(picked);
}
/**
 * Coerce an object to a {@code String}, accepting either a {@link PyUnicode} (yielding its
 * internal string as returned by {@link PyUnicode#getString()}) or any object offering the
 * buffer interface.
 *
 * @param obj to coerce to a String
 * @return coerced value
 * @throws PyException (TypeError) if the object is neither of those
 */
public static String asUTF16StringOrError(PyObject obj) {
    // PyUnicode accepted here. Care required in the client if obj is not basic plane.
    String text = asUTF16StringOrNull(obj);
    if (text == null) {
        throw Py.TypeError(String.format("must be bytes or a tuple of bytes, not '%s'", obj.getType().fastGetName()));
    }
    return text;
}
/**
 * Coerce an object to a {@code String} if possible. A {@link PyUnicode} yields its internal
 * string ({@link PyUnicode#getString()}); an object implementing {@link BufferProtocol}
 * yields the {@code toString()} of a read-only buffer obtained from it, which is released
 * again before returning.
 *
 * @param obj to coerce to a String
 * @return coerced value or <code>null</code> if it can't be
 */
private static String asUTF16StringOrNull(PyObject obj) {
    if (obj instanceof PyUnicode) {
        return ((PyUnicode) obj).getString();
    }
    if (!(obj instanceof BufferProtocol)) {
        return null;
    }
    // Hold the buffer only for as long as it takes to copy its content
    try (PyBuffer view = ((BufferProtocol) obj).getBuffer(PyBUF.FULL_RO)) {
        return view.toString();
    }
}
/**
 * Coerce an object offering the buffer interface to a {@code String}, explicitly refusing a
 * {@link PyUnicode}.
 *
 * @param obj to coerce to a String
 * @return coerced value or <code>null</code> if it can't be (including <code>unicode</code>)
 */
public static String asStringOrNull(PyObject obj) {
    if (obj instanceof PyUnicode) {
        return null;
    }
    return asUTF16StringOrNull(obj);
}
/**
 * Coerce an object to a {@code String}, accepting an object offering the buffer interface or
 * an integer (interpreted as a single byte), but not a {@link PyUnicode}. Equivalent to
 * {@link #asStringOrError(PyObject, boolean)} with integers allowed.
 *
 * @param obj to coerce to a String
 * @return coerced value
 * @throws PyException if the coercion fails (including <code>unicode</code>)
 */
public static String asStringOrError(PyObject obj) throws PyException {
    final boolean integersAllowed = true;
    return asStringOrError(obj, integersAllowed);
}
/**
 * Coerce an object to a {@code String}, accepting an object offering the buffer interface
 * and, optionally, an integer interpreted as a single byte, but not a {@link PyUnicode}.
 * <p>
 * The integer's range is checked on the full {@link BigInteger} value. Narrowing to
 * {@code int} first would let large values whose low 32 bits happen to fall in 0..255 (for
 * example 2<sup>32</sup>+65) slip through as valid bytes.
 *
 * @param obj to coerce to a String
 * @param allowInt whether a {@link PyLong} in the range 0..255 is accepted as one byte
 * @return coerced value
 * @throws PyException (ValueError) if an integer is given but is outside 0..255, or
 *             (TypeError) if the object cannot be coerced (including <code>unicode</code>)
 */
public static String asStringOrError(PyObject obj, boolean allowInt) throws PyException {
    if (allowInt && obj instanceof PyLong) {
        BigInteger value = ((PyLong) obj).getValue();
        // Non-negative and representable in 8 bits means 0 <= value <= 255
        if (value.signum() < 0 || value.bitLength() > 8) {
            throw Py.ValueError("byte must be in range(0, 256)");
        }
        return String.valueOf((char) value.intValue());
    }
    String ret = (obj instanceof PyUnicode) ? null : asUTF16StringOrNull(obj);
    if (ret != null) {
        return ret;
    }
    throw Py.TypeError("expected str, bytearray or other buffer compatible object");
}
/**
 * Coerce an optional argument to a {@code String}. An absent argument ({@code null}) or
 * {@code Py.None} yields {@code null}; an object offering the buffer interface yields its
 * content; anything else, including a {@link PyUnicode}, is an error.
 *
 * @param obj to coerce to a String or null
 * @param name of method (not used in the error message)
 * @return coerced value or null
 * @throws PyException (TypeError) if the coercion fails (including <code>unicode</code>)
 */
public static String asStringNullOrError(PyObject obj, String name) throws PyException {
    boolean absent = (obj == null || obj == Py.None);
    if (absent) {
        return null;
    }
    String text = null;
    if (!(obj instanceof PyUnicode)) {
        text = asUTF16StringOrNull(obj);
    }
    if (text == null) {
        throw Py.TypeError(String.format("a bytes-like object is required, not '%s'",
                obj.getType().fastGetName()));
    }
    return text;
}
/**
 * Split a sequence from the right, dispatching on the separator. With a {@code null}
 * separator the split is on runs of whitespace; with an empty separator a
 * <code>ValueError</code> is raised; otherwise the split is at occurrences of {@code sep}.
 * The work is done by the two {@code rsplitfields} methods.
 *
 * @param s sequence to split
 * @param sep string to use as separator (or <code>null</code> if to split on whitespace)
 * @param maxsplit maximum number of splits to make (there may be <code>maxsplit+1</code>
 *            parts)
 * @return the parts, as returned by the {@code rsplitfields} method used
 */
public static final List<CharSequence> _rsplit(CharSequence s, String sep, int maxsplit) {
    if (sep == null) {
        // No separator given: whitespace is the criterion
        return rsplitfields(s, maxsplit);
    }
    if (sep.length() == 0) {
        throw Py.ValueError("empty separator");
    }
    return rsplitfields(s, sep, maxsplit);
}
/**
 * Split a sequence on runs of whitespace, working from the right, and return the parts in
 * their original left-to-right order. At most {@code maxsplit} parts are delimited by
 * whitespace on their left; once that many have been taken, everything remaining to the left
 * (trailing whitespace excluded) becomes the first part. Parts are produced with
 * {@code s.subSequence}.
 *
 * @param s sequence to split
 * @param maxsplit limit on the number of splits (if >=0); negative means no limit
 * @return list of split sections
 */
public static List<CharSequence> rsplitfields(CharSequence s, int maxsplit) {
    final int length = s.length();
    // With no limit requested, length is a bound that can never be reached
    final int limit = (maxsplit < 0) ? length : maxsplit;

    // Parts are collected right-to-left and reversed on return
    List<CharSequence> pieces = new ArrayList<>();

    // right is the rightmost character not yet consumed into a piece
    int right = length - 1;
    for (int made = 0;; made++) {
        // Skip whitespace to find the last character of the next piece
        while (right >= 0 && isWhitespace(s.charAt(right))) {
            right--;
        }
        if (right < 0) {
            // Nothing but whitespace remained
            break;
        }

        // left ends on the whitespace before the piece, or -1 for the start of the text
        int left = -1;
        if (made < limit) {
            left = right;
            while (left >= 0 && !isWhitespace(s.charAt(left))) {
                left--;
            }
        }

        pieces.add(s.subSequence(left + 1, right + 1));
        right = left;
    }
    return Lists.reverse(pieces);
}
/**
 * Split a sequence at occurrences of a separator, working from the right, and return the
 * parts in their original left-to-right order. If there are more than {@code maxsplit}
 * occurrences of {@code sep}, the first element of the list is the unsplit left end of the
 * sequence. Parts are produced with {@code s.subSequence}.
 * <p>
 * When {@code maxsplit} is 0 the whole sequence is returned as the only part, without
 * checking the separator.
 *
 * @param s sequence to split
 * @param sep at occurrences of which the sequence should be split
 * @param maxsplit limit on the number of splits (if >=0); negative means no limit
 * @return list of split sections
 * @throws PyException (ValueError) if {@code sep} is empty and {@code maxsplit} is not 0
 */
public static final List<CharSequence> rsplitfields(CharSequence s, String sep, int maxsplit) {
    /*
     * Result built here (in reverse) is a list of split parts. If there are to be n splits,
     * there will be n+1 elements.
     */
    List<CharSequence> list = new ArrayList<>();
    int length = s.length();
    int sepLength = sep.length();
    if (maxsplit < 0) {
        // Make all possible splits: there can't be more than:
        maxsplit = length + 1;
    }
    if (maxsplit == 0) {
        // Degenerate case
        list.add(s);
    } else if (sepLength == 0) {
        // Empty separator is not allowed
        throw Py.ValueError("empty separator");
    } else {
        // Materialise the text once for searching, rather than on every iteration
        final String text = s.toString();
        // Index of first character of the last piece already on the list
        int end = length;
        // Add at most maxsplit pieces
        for (int splits = 0; splits < maxsplit; splits++) {
            // Find the next occurrence of sep (working leftwards)
            int index = text.lastIndexOf(sep, end - sepLength);
            if (index < 0) {
                // No more occurrences of sep: we're done
                break;
            }
            // Make a piece from where we found sep up to end
            list.add(s.subSequence(index + sepLength, end));
            // New end (of next piece) is where we found sep
            end = index;
        }
        // Last piece is the rest of the string (even if end==0)
        list.add(s.subSequence(0, end));
    }
    return Lists.reverse(list);
}
/**
 * Return the last index at which a substring occurs within a slice of a sequence, or -1 if
 * it does not. The slice bounds may be <code>None</code> or end-relative (negative) and are
 * interpreted by {@link #translateIndices(CharSequence, PyObject, PyObject, int)}.
 * <p>
 * An empty substring is reported at the clamped end of the slice, unless the slice (before
 * clamping) is reversed, ends before 0, or starts beyond {@code len}.
 *
 * @param s sequence to search
 * @param sub substring to find
 * @param startObj start of slice
 * @param endObj end of slice
 * @param len length against which the slice is interpreted
 * @return index of <code>sub</code> in the slice or -1 if not found
 */
public static final int _rfind(CharSequence s, String sub, PyObject startObj, PyObject endObj, int len) {
    // Interpret the slice indices as concrete values
    int[] indices = translateIndices(s, startObj, endObj, len);
    int subLen = sub.length();

    if (subLen == 0) {
        // The empty string is found wherever the unclamped slice is meaningful
        int rawStart = indices[2];
        int rawEnd = indices[3];
        boolean sliceUsable = rawEnd >= 0 && rawEnd >= rawStart && rawStart <= len;
        return sliceUsable ? indices[1] : -1;
    }

    // Last match that finishes by the slice end; it must also begin within the slice
    int found = s.toString().lastIndexOf(sub, indices[1] - subLen);
    return (found >= indices[0]) ? found : -1;
}
/**
 * Return a title-cased copy of a sequence: a character that follows a cased character is
 * converted to lower case, and any other character is converted to title case.
 * <p>
 * The conversion works on individual {@code char}s, so it is only suitable for text in the
 * basic multilingual plane.
 *
 * @param s text to convert
 * @return the title-cased text
 */
public static final String title(CharSequence s) {
    final int n = s.length();
    StringBuilder out = new StringBuilder(n);
    boolean afterCased = false;
    for (int i = 0; i < n; i++) {
        char c = s.charAt(i);
        out.append(afterCased ? Character.toLowerCase(c) : Character.toTitleCase(c));
        // Whether the next character continues a word depends on the original character
        afterCased = Character.isLowerCase(c) || Character.isUpperCase(c)
                || Character.isTitleCase(c);
    }
    return out.toString();
}
/**
 * Return a copy of a sequence with upper-case characters converted to lower case and
 * lower-case characters converted to upper case. Characters that are neither (digits,
 * punctuation, spaces and so on) are copied unchanged; previously they were left out of the
 * result array and so appeared as NUL characters.
 * <p>
 * The conversion works on individual {@code char}s.
 *
 * @param s text to convert
 * @return the case-swapped text
 */
public static final String swapcase(CharSequence s) {
    final int n = s.length();
    char[] chars = new char[n];
    for (int i = 0; i < n; i++) {
        char c = s.charAt(i);
        if (Character.isUpperCase(c)) {
            chars[i] = Character.toLowerCase(c);
        } else if (Character.isLowerCase(c)) {
            chars[i] = Character.toUpperCase(c);
        } else {
            // Uncased character: keep as is
            chars[i] = c;
        }
    }
    return new String(chars);
}
/**
 * Return a copy of a sequence with whitespace removed from its right-hand end. What counts
 * as whitespace is decided by {@link #_stripRight(CharSequence)}.
 *
 * @param s text to strip
 * @return a new String, stripped of trailing whitespace (empty if it was all whitespace)
 */
public static final String _rstrip(CharSequence s) {
    // Index of the rightmost character to keep, or negative if there is none
    int last = _stripRight(s);
    return (last < 0) ? "" : s.subSequence(0, last + 1).toString();
}
/**
 * Return a copy of a sequence with characters removed from its right-hand end for as long as
 * they occur in {@code stripChars}. If {@code stripChars == null}, whitespace is stripped
 * instead, as by {@link #_rstrip(CharSequence)}.
 *
 * @param s text to strip
 * @param stripChars characters to strip or null
 * @return a new String, stripped of the specified characters
 */
public static final String _rstrip(CharSequence s, String stripChars) {
    if (stripChars == null) {
        // No set given: whitespace is the criterion
        return _rstrip(s);
    }
    // last is -1 when every character matched, making the result empty
    int last = _stripRight(s, stripChars);
    return s.subSequence(0, last + 1).toString();
}
/**
 * Find the rightmost character of a sequence that is not whitespace, as judged by
 * {@code isWhitespace}.
 *
 * @param s string to search.
 * @return index of rightmost non-whitespace character or -1 if they all are.
 */
private static final int _stripRight(CharSequence s) {
    int pos = s.length() - 1;
    while (pos >= 0 && isWhitespace(s.charAt(pos))) {
        pos--;
    }
    // Either a non-whitespace character was found or pos has run off the start (-1)
    return pos;
}
/**
 * Find the rightmost character of a sequence that does not occur in a given set of
 * characters.
 *
 * @param s string to search.
 * @param stripChars specifies set of characters to strip
 * @return index of rightmost character not in <code>stripChars</code> or -1 if they all are.
 */
private static final int _stripRight(CharSequence s, String stripChars) {
    int pos = s.length() - 1;
    while (pos >= 0 && stripChars.indexOf(s.charAt(pos)) >= 0) {
        pos--;
    }
    // Either a character outside the set was found or pos has run off the start (-1)
    return pos;
}
/**
 * Return a sequence with whitespace removed from both ends. The right-hand limit is found by
 * {@link #_stripRight(CharSequence)} and the left-hand limit by {@code _stripLeft}; the
 * result is the {@code subSequence} between them.
 *
 * @param s text to strip
 * @return the stripped text (the empty string if it was all whitespace)
 */
public static final CharSequence _strip(CharSequence s) {
    int right = _stripRight(s);
    if (right < 0) {
        // Nothing but whitespace
        return "";
    }
    // The character at right is known not to be whitespace, which bounds the left search
    return s.subSequence(_stripLeft(s, right), right + 1);
}
/**
* Implementation of Python <code>str.strip()</code> common to exposed and Java API. Any
* byte/character matching one of those in <code>stripChars</code> will be discarded from either
* end of this <code>str</code>. If <code>stripChars == null</code>, whitespace will be
* stripped.
* <p>
* Implementation note: although a <code>str</code> contains only bytes, this method is also