jython3/src/org/python/core/stringlib/Encoding.java at master · jython/jython3

History

1900 lines (1726 loc) · 72.8 KB

Raw

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

263

264

265

266

267

268

269

270

271

272

273

274

275

276

277

278

279

280

281

282

283

284

285

286

287

288

289

290

291

292

293

294

295

296

297

298

299

300

301

302

303

304

305

306

307

308

309

310

311

312

313

314

315

316

317

318

319

320

321

322

323

324

325

326

327

328

329

330

331

332

333

334

335

336

337

338

339

340

341

342

343

344

345

346

347

348

349

350

351

352

353

354

355

356

357

358

359

360

361

362

363

364

365

366

367

368

369

370

371

372

373

374

375

376

377

378

379

380

381

382

383

384

385

386

387

388

389

390

391

392

393

394

395

396

397

398

399

400

401

402

403

404

405

406

407

408

409

410

411

412

413

414

415

416

417

418

419

420

421

422

423

424

425

426

427

428

429

430

431

432

433

434

435

436

437

438

439

440

441

442

443

444

445

446

447

448

449

450

451

452

453

454

455

456

457

458

459

460

461

462

463

464

465

466

467

468

469

470

471

472

473

474

475

476

477

478

479

480

481

482

483

484

485

486

487

488

489

490

491

492

493

494

495

496

497

498

499

500

501

502

503

504

505

506

507

508

509

510

511

512

513

514

515

516

517

518

519

520

521

522

523

524

525

526

527

528

529

530

531

532

533

534

535

536

537

538

539

540

541

542

543

544

545

546

547

548

549

550

551

552

553

554

555

556

557

558

559

560

561

562

563

564

565

566

567

568

569

570

571

572

573

574

575

576

577

578

579

580

581

582

583

584

585

586

587

588

589

590

591

592

593

594

595

596

597

598

599

600

601

602

603

604

605

606

607

608

609

610

611

612

613

614

615

616

617

618

619

620

621

622

623

624

625

626

627

628

629

630

631

632

633

634

635

636

637

638

639

640

641

642

643

644

645

646

647

648

649

650

651

652

653

654

655

656

657

658

659

660

661

662

663

664

665

666

667

668

669

670

671

672

673

674

675

676

677

678

679

680

681

682

683

684

685

686

687

688

689

690

691

692

693

694

695

696

697

698

699

700

701

702

703

704

705

706

707

708

709

710

711

712

713

714

715

716

717

718

719

720

721

722

723

724

725

726

727

728

729

730

731

732

733

734

735

736

737

738

739

740

741

742

743

744

745

746

747

748

749

750

751

752

753

754

755

756

757

758

759

760

761

762

763

764

765

766

767

768

769

770

771

772

773

774

775

776

777

778

779

780

781

782

783

784

785

786

787

788

789

790

791

792

793

794

795

796

797

798

799

800

801

802

803

804

805

806

807

808

809

810

811

812

813

814

815

816

817

818

819

820

821

822

823

824

825

826

827

828

829

830

831

832

833

834

835

836

837

838

839

840

841

842

843

844

845

846

847

848

849

850

851

852

853

854

855

856

857

858

859

860

861

862

863

864

865

866

867

868

869

870

871

872

873

874

875

876

877

878

879

880

881

882

883

884

885

886

887

888

889

890

891

892

893

894

895

896

897

898

899

900

901

902

903

904

905

906

907

908

909

910

911

912

913

914

915

916

917

918

919

920

921

922

923

924

925

926

927

928

929

930

931

932

933

934

935

936

937

938

939

940

941

942

943

944

945

946

947

948

949

950

951

952

953

954

955

956

957

958

959

960

961

962

963

964

965

966

967

968

969

970

971

972

973

974

975

976

977

978

979

980

981

982

983

984

985

986

987

988

989

990

991

992

993

994

995

996

997

998

999

1000

package org.python.core.stringlib;

import com.google.common.base.CharMatcher;

import com.google.common.base.Joiner;

import com.google.common.base.Predicate;

import com.google.common.collect.Lists;

import com.ibm.icu.lang.UCharacter;

import org.python.core.BufferProtocol;

import org.python.core.Py;

import org.python.core.PyBUF;

import org.python.core.PyBuffer;

import org.python.core.PyBytes;

import org.python.core.PyComplex;

import org.python.core.PyException;

import org.python.core.PyInteger;

import org.python.core.PyList;

import org.python.core.PyLong;

import org.python.core.PyObject;

import org.python.core.PySystemState;

import org.python.core.PyTuple;

import org.python.core.PyUnicode;

import org.python.core.codecs;

import org.python.modules.sys.SysModule;

import java.math.BigInteger;

import java.util.ArrayList;

import java.util.List;

import java.util.regex.Matcher;

import java.util.regex.Pattern;

/**
 * Helper methods for unicode encoding shared between bytes-like objects and str.
 */

public class Encoding {

/* Lower-case hexadecimal digits, indexed by nibble value (0..15) when writing escapes. */
private static char[] hexdigit = "0123456789abcdef".toCharArray();

public static String encode_UnicodeEscape(String str, boolean use_quotes) {

int size = str.length();

StringBuilder v = new StringBuilder(str.length());

char quote = 0;

if (use_quotes) {

quote = str.indexOf('\'') >= 0 && str.indexOf('"') == -1 ? '"' : '\'';

v.append(quote);

}

for (int i = 0; size-- > 0; ) {

char ch = str.charAt(i++);

/* Escape quotes */

if ((use_quotes && ch == quote) || ch == '\\') {

v.append('\\');

v.append(ch);

continue;

}

/* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */

else if (size > 0 && Character.isHighSurrogate(ch)) {

char ch2 = str.charAt(i++);

size--;

if (Character.isLowSurrogate(ch2)) {

int ucs = Character.toCodePoint(ch, ch2);//((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;

if (UCharacter.isPrintable(ucs)) {

v.appendCodePoint(ucs);

} else {

v.append('\\');

v.append('U');

v.append(hexdigit[(ucs >> 28) & 0xf]);

v.append(hexdigit[(ucs >> 24) & 0xf]);

v.append(hexdigit[(ucs >> 20) & 0xf]);

v.append(hexdigit[(ucs >> 16) & 0xf]);

v.append(hexdigit[(ucs >> 12) & 0xf]);

v.append(hexdigit[(ucs >> 8) & 0xf]);

v.append(hexdigit[(ucs >> 4) & 0xf]);

v.append(hexdigit[ucs & 0xf]);

}

continue;

}

/* Fall through: isolated surrogates are copied as-is */

i--;

size++;

}

/* Map 16-bit characters to '\\uxxxx' */

if (ch >= 256 && !UCharacter.isPrintable(ch)) {

v.append('\\');

v.append('u');

v.append(hexdigit[(ch >> 12) & 0xf]);

v.append(hexdigit[(ch >> 8) & 0xf]);

v.append(hexdigit[(ch >> 4) & 0xf]);

v.append(hexdigit[ch & 15]);

}

/* Map special whitespace to '\t', \n', '\r' */

else if (ch == '\t') {

v.append("\\t");

} else if (ch == '\n') {

v.append("\\n");

} else if (ch == '\r') {

v.append("\\r");

} else if (ch < ' ' || ch == 127) {

/* Map non-printable US ASCII to '\xNN' */

v.append('\\');

v.append('x');

v.append(hexdigit[(ch >> 4) & 0xf]);

v.append(hexdigit[ch & 0xf]);

} else {/* Copy everything else as-is */

v.append(ch);

}

if (use_quotes) {

v.append(quote);

}

return v.toString();

}

public static String decode_UnicodeEscape(String str, int start, int end, String errors,

boolean unicode) {

StringBuilder v = new StringBuilder(end - start);

for (int s = start; s < end; ) {

char ch = str.charAt(s);

/* Non-escape characters are interpreted as Unicode ordinals */

if (ch != '\\') {

v.append(ch);

s++;

continue;

}

int loopStart = s;

/* \ - Escapes */

s++;

if (s == end) {

s = codecs.insertReplacementAndGetResume(v, errors, "unicodeescape", //

str, loopStart, s + 1, "\\ at end of string");

continue;

}

ch = str.charAt(s++);

switch (ch) {

/* \x escapes */

case '\n':

break;

case '\\':

v.append('\\');

break;

case '\'':

v.append('\'');

break;

case '\"':

v.append('\"');

break;

case 'b':

v.append('\b');

break;

case 'f':

v.append('\014');

break; /* FF */

case 't':

v.append('\t');

break;

case 'n':

v.append('\n');

break;

case 'r':

v.append('\r');

break;

case 'v':

v.append('\013');

break; /* VT */

case 'a':

v.append('\007');

break; /* BEL, not classic C */

/* \OOO (octal) escapes */

case '0':

case '1':

case '2':

case '3':

case '4':

case '5':

case '6':

case '7':

int x = Character.digit(ch, 8);

for (int j = 0; j < 2 && s < end; j++, s++) {

ch = str.charAt(s);

if (ch < '0' || ch > '7') {

break;

}

x = (x << 3) + Character.digit(ch, 8);

}

v.append((char) x);

break;

case 'x':

s = hexescape(v, errors, 2, s, str, end, "truncated \\xXX");

break;

case 'u':

if (!unicode) {

v.append('\\');

v.append('u');

break;

}

s = hexescape(v, errors, 4, s, str, end, "truncated \\uXXXX");

break;

case 'U':

if (!unicode) {

v.append('\\');

v.append('U');

break;

}

s = hexescape(v, errors, 8, s, str, end, "truncated \\UXXXXXXXX");

break;

case 'N':

if (!unicode) {

v.append('\\');

v.append('N');

break;

}

* Ok, we need to deal with Unicode Character Names now, make sure we've

* imported the hash table data...

if (str.charAt(s) == '{') {

int startName = s + 1;

int endBrace = startName;

* look for either the closing brace, or we exceed the maximum length of the

* unicode character names

endBrace = str.indexOf('}', startName);

if (endBrace != -1) {

int value = UCharacter.getCharFromName(str.substring(startName, endBrace));

if (storeUnicodeCharacter(value, v)) {

s = endBrace + 1;

} else {

s = codecs.insertReplacementAndGetResume( //

v, errors, "unicodeescape", //

str, loopStart, endBrace + 1, "illegal Unicode character");

}

} else {

s = codecs.insertReplacementAndGetResume(v, errors, "unicodeescape", //

str, loopStart, endBrace, "malformed \\N character escape");

}

break;

} else {

s = codecs.insertReplacementAndGetResume(v, errors, "unicodeescape", //

str, loopStart, s + 1, "malformed \\N character escape");

}

break;

default:

v.append('\\');

v.append(str.charAt(s - 1));

break;

}

return v.toString();

}

private static int hexescape(StringBuilder partialDecode, String errors, int digits,

int hexDigitStart, String str, int size, String errorMessage) {

int i = 0;

int x = 0;

for (; i < digits; ++i) {

int index = hexDigitStart + i;

if (index >= size) {

return codecs.insertReplacementAndGetResume(partialDecode, errors, "unicodeescape",

str, hexDigitStart - 2, size, errorMessage);

}

char c = str.charAt(index);

int d = Character.digit(c, 16);

if (d == -1) {

return codecs.insertReplacementAndGetResume(partialDecode, errors, "unicodeescape",

str, hexDigitStart - 2, index + 1, errorMessage) - 1;

}

x = (x << 4) & ~0xF;

if (c >= '0' && c <= '9') {

x += c - '0';

} else if (c >= 'a' && c <= 'f') {

x += 10 + c - 'a';

} else {

x += 10 + c - 'A';

}

if (storeUnicodeCharacter(x, partialDecode)) {

return hexDigitStart + i;

} else {

return codecs.insertReplacementAndGetResume(partialDecode, errors, "unicodeescape",

str, hexDigitStart - 2, hexDigitStart + i + 1, "illegal Unicode character");

}

/*
 * Append the code point to the buffer when it lies in 0..SysModule.MAXUNICODE and report
 * whether it was appended. The value is an int because it may be a UCS-4 character.
 */
private static boolean storeUnicodeCharacter(int value, StringBuilder partialDecode) {
    if (value < 0 || value > SysModule.MAXUNICODE) {
        return false;
    }
    partialDecode.appendCodePoint(value);
    return true;
}

/**

* Helper common to the Python and Java API for <code>str.replace</code>, returning a new string

* equal to this string with ocurrences of <code>oldPiece</code> replaced by

* <code>newPiece</code>, up to a maximum of <code>count</code> occurrences, or all of them.

* This method also supports {@link PyUnicode#str_replace(PyObject, PyObject, int)}, in

* which context it returns a <code>PyUnicode</code>

* @param oldPiece to replace where found.

* @param newPiece replacement text.

* @param count maximum number of replacements to make, or -1 meaning all of them.

* @return PyBytes (or PyUnicode if this string is one), this string after replacements.

public static final String _replace(String s, String oldPiece, String newPiece, int count) {

int len = s.length();

int oldLen = oldPiece.length();

int newLen = newPiece.length();

if (len == 0) {

if (count < 0 && oldLen == 0) {

return newPiece;

}

return s;

} else if (oldLen == 0 && newLen != 0 && count != 0) {

* old="" and new != "", interleave new piece with each char in original, taking into

* account count

int i = 0;

StringBuilder buffer = new StringBuilder(newPiece);

for (; i < len && (count < 0 || i < count - 1); i++) {

buffer.append(s.charAt(i)).append(newPiece);

}

buffer.append(s.substring(i));

return buffer.toString();

} else {

if (count < 0) {

count = (oldLen == 0) ? len + 1 : len;

}

return Joiner.on(newPiece).join(Pattern.compile(oldPiece, Pattern.LITERAL).split(s, count + 1));

}

/* True when s is non-empty and every char matches CharMatcher.JAVA_LOWER_CASE. */
public static final boolean isLowercase(CharSequence s) {
    if (s.length() == 0) {
        return false;
    }
    return CharMatcher.JAVA_LOWER_CASE.matchesAllOf(s);
}

/* True when s is non-empty and every char matches CharMatcher.JAVA_UPPER_CASE. */
public static final boolean isUppercase(CharSequence s) {
    if (s.length() == 0) {
        return false;
    }
    return CharMatcher.JAVA_UPPER_CASE.matchesAllOf(s);
}

/* True when s is non-empty and every char matches CharMatcher.JAVA_LETTER. */
public static final boolean isAlpha(CharSequence s) {
    if (s.length() == 0) {
        return false;
    }
    return CharMatcher.JAVA_LETTER.matchesAllOf(s);
}

/* True when s is non-empty and every char matches CharMatcher.JAVA_LETTER_OR_DIGIT. */
public static final boolean isAlnum(CharSequence s) {
    if (s.length() == 0) {
        return false;
    }
    return CharMatcher.JAVA_LETTER_OR_DIGIT.matchesAllOf(s);
}

/* True when s is non-empty and every char has general category DECIMAL_DIGIT_NUMBER. */
public static final boolean isDecimal(CharSequence s) {
    if (s.length() == 0) {
        return false;
    }
    Predicate<Character> decimalDigit = new Predicate<Character>() {

        @Override
        public boolean apply(Character ch) {
            return Character.DECIMAL_DIGIT_NUMBER == Character.getType(ch);
        }
    };
    return CharMatcher.forPredicate(decimalDigit).matchesAllOf(s);
}

/* True when s is non-empty and every char matches CharMatcher.DIGIT. */
public static final boolean isDigit(CharSequence s) {
    if (s.length() == 0) {
        return false;
    }
    return CharMatcher.DIGIT.matchesAllOf(s);
}

/*
 * True when s is non-empty and every char has general category DECIMAL_DIGIT_NUMBER,
 * LETTER_NUMBER or OTHER_NUMBER.
 */
public static final boolean isNumeric(CharSequence s) {
    if (s.length() == 0) {
        return false;
    }
    Predicate<Character> numeric = new Predicate<Character>() {

        @Override
        public boolean apply(Character ch) {
            switch (Character.getType(ch)) {
                case Character.DECIMAL_DIGIT_NUMBER:
                case Character.LETTER_NUMBER:
                case Character.OTHER_NUMBER:
                    return true;
                default:
                    return false;
            }
        }
    };
    return CharMatcher.forPredicate(numeric).matchesAllOf(s);
}

public static final boolean isTitle(CharSequence s) {

int n = s.length();

/* Shortcut for single character strings */

if (n == 1) {

return Character.isTitleCase(s.charAt(0))

|| Character.isUpperCase(s.charAt(0));

}

boolean cased = false;

boolean previous_is_cased = false;

for (int i = 0; i < n; i++) {

char ch = s.charAt(i);

if (Character.isUpperCase(ch) || Character.isTitleCase(ch)) {

if (previous_is_cased) {

return false;

}

previous_is_cased = true;

cased = true;

} else if (Character.isLowerCase(ch)) {

if (!previous_is_cased) {

return false;

}

previous_is_cased = true;

cased = true;

} else {

previous_is_cased = false;

}

return cased;

}

/* True when s is non-empty and every char matches CharMatcher.WHITESPACE. */
public static final boolean isSpace(CharSequence s) {
    if (s.length() == 0) {
        return false;
    }
    return CharMatcher.WHITESPACE.matchesAllOf(s);
}

/*
 * Format s according to formatSpec: parse the specification, obtain a TextFormatter for it,
 * then format, pad and return the formatter's Python result. An unrecognised type code
 * raises the "unknown format" error for "string".
 */
public static PyObject format(CharSequence s, PyObject formatSpec, boolean bytes) {
    // Parse the specification and look up the matching formatter
    final InternalFormat.Spec parsed = InternalFormat.fromText(formatSpec, "__format__");
    final TextFormatter formatter = prepareFormatter(parsed);

    if (formatter != null) {
        formatter.setBytes(bytes);
        // Convert as per specification, then pad to width
        formatter.format(s);
        return formatter.pad().getPyResult();
    }

    // prepareFormatter returns null when the type code is not a string format type
    throw InternalFormat.Formatter.unknownFormat(parsed.type, "string");
}

/**

* Common code for {@link PyBytes} and {@link PyUnicode} to prepare a {@link TextFormatter}

* from a parsed specification. The object returned has format method

* {@link TextFormatter#format(String)} that treats its argument as UTF-16 encoded unicode (not

* just <code>char</code>s). That method will format its argument ( <code>str</code> or

* <code>unicode</code>) according to the PEP 3101 formatting specification supplied here. This

* would be used during <code>text.__format__(".5s")</code> or

* <code>"{:.5s}".format(text)</code> where <code>text</code> is this Python string.

* @param spec a parsed PEP-3101 format specification.

* @return a formatter ready to use, or null if the type is not a string format type.

* @throws PyException(ValueError) if the specification is faulty.

@SuppressWarnings("fallthrough")

public static final TextFormatter prepareFormatter(InternalFormat.Spec spec) throws PyException {

// Slight differences between format types

switch (spec.type) {

case InternalFormat.Spec.NONE:

case 's':

// Check for disallowed parts of the specification

if (spec.grouping) {

throw InternalFormat.Formatter.notAllowed("Grouping", "string", spec.type);

} else if (InternalFormat.Spec.specified(spec.sign)) {

throw InternalFormat.Formatter.signNotAllowed("string", '\0');

} else if (spec.alternate) {

throw InternalFormat.Formatter.alternateFormNotAllowed("string");

} else if (spec.align == '=') {

throw InternalFormat.Formatter.alignmentNotAllowed('=', "string");

}

// spec may be incomplete. The defaults are those commonly used for string formats.

spec = spec.withDefaults(InternalFormat.Spec.STRING);

// Get a formatter for the specification

return new TextFormatter(spec);

default:

// The type code was not recognised

return null;

}

public static final int[] translateIndices(CharSequence s, PyObject startObj, PyObject endObj, int len) {

int start, end;

int n = len;

int[] result = new int[4];

// Decode the start using slice semantics

if (startObj == null || startObj == Py.None) {

start = 0;

// result[2] = 0 already

} else {

// Convert to int but limit to Integer.MIN_VALUE <= start <= Integer.MAX_VALUE

start = startObj.asIndex(null);

if (start < 0) {

// Negative value means "from the end"

start = n + start;

}

result[2] = start;

}

// Decode the end using slice semantics

if (endObj == null || endObj == Py.None) {

result[1] = result[3] = end = n;

} else {

// Convert to int but limit to Integer.MIN_VALUE <= end <= Integer.MAX_VALUE

end = endObj.asIndex(null);

if (end < 0) {

// Negative value means "from the end"

result[3] = end = end + n;

// Ensure end is safe for String.substring(start,end).

if (end < 0) {

end = 0;

// result[1] = 0 already

} else {

result[1] = end;

}

} else {

result[3] = end;

// Ensure end is safe for String.substring(start,end).

if (end > n) {

result[1] = end = n;

} else {

result[1] = end;

}

// Ensure start is safe for String.substring(start,end).

if (start < 0) {

start = 0;

// result[0] = 0 already

} else if (start > end) {

result[0] = start = end;

} else {

result[0] = start;

}

return result;

}

/*
 * Extract a slice of s. With step 1 the result is the sub-sequence from start to stop (an
 * empty one at start if stop is below start). Otherwise sliceLength chars are gathered,
 * beginning at start and advancing by step each time; stop is not consulted in that case.
 */
public static final CharSequence getslice(CharSequence s, int start, int stop, int step, int sliceLength) {
    // A forward slice whose stop precedes its start is empty
    final int upper = (step > 0 && stop < start) ? start : stop;
    if (step == 1) {
        return s.subSequence(start, upper);
    }

    final char[] picked = new char[sliceLength];
    for (int k = 0; k < sliceLength; k++) {
        picked[k] = s.charAt(start + k * step);
    }
    return new String(picked);
}

/**

* Return a String equivalent to the argument according to the calling conventions of the

* certain methods of <code>str</code>. Those methods accept as a byte string anything bearing

* the buffer interface, or accept a <code>unicode</code> argument which they interpret from its

* UTF-16 encoded form (the internal representation returned by {@link PyUnicode#getString()}).

* @param obj to coerce to a String

* @return coerced value

* @throws PyException if the coercion fails

public static String asUTF16StringOrError(PyObject obj) {

// PyUnicode accepted here. Care required in the client if obj is not basic plane.

String ret = asUTF16StringOrNull(obj);

if (ret != null) {

return ret;

} else {

throw Py.TypeError(String.format("must be bytes or a tuple of bytes, not '%s'", obj.getType().fastGetName()));

}

/**

* Return a String equivalent to the argument. This is a helper function to those methods that

* accept any byte array type (any object that supports a one-dimensional byte buffer), or

* accept a <code>unicode</code> argument which they interpret from its UTF-16 encoded form (the

* internal representation returned by {@link PyUnicode#getString()}).

* @param obj to coerce to a String

* @return coerced value or <code>null</code> if it can't be

private static String asUTF16StringOrNull(PyObject obj) {

if (obj instanceof PyUnicode) {

return ((PyUnicode)obj).getString();

} else if (obj instanceof BufferProtocol) {

// Other object with buffer API: briefly access the buffer

try (PyBuffer buf = ((BufferProtocol)obj).getBuffer(PyBUF.FULL_RO)) {

return buf.toString();

}

return null;

}

/**

* Return a String equivalent to the argument. This is a helper function to those methods that

* accept any byte array type (any object that supports a one-dimensional byte buffer), but

* not a <code>unicode</code>.

* @param obj to coerce to a String

* @return coerced value or <code>null</code> if it can't be (including <code>unicode</code>)

public static String asStringOrNull(PyObject obj) {

return (obj instanceof PyUnicode) ? null : asUTF16StringOrNull(obj);

}

/**

* Return a String equivalent to the argument. This is a helper function to those methods that

* accept any byte array type (any object that supports a one-dimensional byte buffer), but

* not a <code>unicode</code>.

* Added support for integer, as it can be interpreted as a byte

* @param obj to coerce to a String

* @return coerced value

* @throws PyException if the coercion fails (including <code>unicode</code>)

public static String asStringOrError(PyObject obj) throws PyException {

return asStringOrError(obj, true);

}

/**
 * Return a String equivalent to the argument, which must bear the buffer interface and not
 * be a <code>unicode</code>. When <code>allowInt</code> is true a <code>PyLong</code> is also
 * accepted and converted to the one-character string with that value.
 *
 * @param obj to coerce to a String
 * @param allowInt whether an integer in <code>range(0, 256)</code> is acceptable
 * @return coerced value
 * @throws PyException <code>ValueError</code> if an integer is out of range,
 *             <code>TypeError</code> if the coercion fails (including <code>unicode</code>)
 */
public static String asStringOrError(PyObject obj, boolean allowInt) throws PyException {
    if (allowInt && obj instanceof PyLong) {
        // Range-check the full-precision value: narrowing to int first would wrap large
        // integers (e.g. 2**32) into the valid range.
        BigInteger val = ((PyLong) obj).getValue();
        if (val.signum() < 0 || val.bitLength() > 8) {
            throw Py.ValueError("byte must be in range(0, 256)");
        }
        return String.valueOf((char) val.intValue());
    }
    String ret = (obj instanceof PyUnicode) ? null : asUTF16StringOrNull(obj);
    if (ret != null) {
        return ret;
    }
    throw Py.TypeError("expected str, bytearray or other buffer compatible object");
}

/**

* Return a String equivalent to the argument according to the calling conventions of methods

* that accept as a byte string anything bearing the buffer interface, or accept

* <code>PyNone</code>, but not a <code>unicode</code>. (Or the argument may be omitted,

* showing up here as null.) These include the <code>strip</code> and <code>split</code> methods

* of <code>str</code>, where a null indicates that the criterion is whitespace, and

* <code>str.translate</code>.

* @param obj to coerce to a String or null

* @param name of method

* @return coerced value or null

* @throws PyException if the coercion fails (including <code>unicode</code>)

public static String asStringNullOrError(PyObject obj, String name) throws PyException {

if (obj == null || obj == Py.None) {

return null;

}

String ret = (obj instanceof PyUnicode) ? null : asUTF16StringOrNull(obj);

if (ret != null) {

return ret;

}

// A nameless method is the client

throw Py.TypeError(String.format("a bytes-like object is required, not '%s'",

obj.getType().fastGetName()));

}

/**

* Implementation of Python <code>str.rsplit()</code> common to exposed and Java API returning a

* {@link PyList} of <code>PyBytes</code>s. The <code>str</code> will be split at each

* occurrence of <code>sep</code>, working from the right. If <code>sep == null</code>,

* whitespace will be used as the criterion. If <code>sep</code> has zero length, a Python

* <code>ValueError</code> is raised. If <code>maxsplit</code> >=0 and there are more

* feasible splits than <code>maxsplit</code> the first element of the list contains the what is

* left over after the last split.

*

* Implementation note: although a str contains only bytes, this method is also called by

* {@link PyUnicode#unicode_rsplit(PyObject, int)} .

* @param sep string to use as separator (or <code>null</code> if to split on whitespace)

* @param maxsplit maximum number of splits to make (there may be <code>maxsplit+1</code>

* parts).

* @return list(str) result

public static final List<CharSequence> _rsplit(CharSequence s, String sep, int maxsplit) {

if (sep == null) {

// Split on runs of whitespace

return rsplitfields(s, maxsplit);

} else if (sep.length() == 0) {

throw Py.ValueError("empty separator");

} else {

// Split on specified (non-empty) string

return rsplitfields(s, sep, maxsplit);

}

/**

* Helper function for <code>.rsplit</code>, in <code>str</code> and <code>unicode</code>,

* splitting on white space and returning a list of the separated parts. If there are more than

* <code>maxsplit</code> feasible the first element of the list is the remainder of the original

* (this) string. The split sections will be {@link PyUnicode} if this object is a

* <code>PyUnicode</code>.

* @param maxsplit limit on the number of splits (if >=0)

* @return <code>PyList</code> of split sections

public static List<CharSequence> rsplitfields(CharSequence s, int maxsplit) {

* Result built here (in reverse) is a list of split parts, exactly as required for

* s.rsplit(None, maxsplit). If there are to be n splits, there will be n+1 elements.

List<CharSequence> list = new ArrayList<>();

int length = s.length(), end = length - 1, splits = 0, index;

if (maxsplit < 0) {

// Make all possible splits: there can't be more than:

maxsplit = length;

}

// end is always the rightmost character not consumed into a piece on the list

while (end >= 0) {

// Find the next occurrence of non-whitespace (working leftwards)

while (end >= 0) {

if (!isWhitespace(s.charAt(end))) {

// Break leaving end pointing at non-whitespace

break;

}

--end;

}

if (end < 0) {

// Only found whitespace so there is no next segment

break;

} else if (splits >= maxsplit) {

// The next segment is the last and contains all characters back to the beginning

index = -1;

} else {

// The next segment runs back to the next next whitespace or beginning

for (index = end; index >= 0; --index) {

if (isWhitespace(s.charAt(index))) {

// Break leaving index pointing at whitespace

break;

}

// Make a piece from index+1 start up to end+1

list.add(s.subSequence(index + 1, end + 1));

splits++;

// Start next segment search at that point

end = index;

}

return Lists.reverse(list);

}

/**

* Helper function for <code>.rsplit</code>, in <code>str</code> and <code>unicode</code>,

* returning a list of the separated parts, in the reverse order of their occurrence in

* this string. If there are more than <code>maxsplit</code> occurrences of <code>sep</code> the

* first element of the list is the left end of the original (this) string. The split sections

* will be {@link PyUnicode} if this object is a <code>PyUnicode</code>.

* @param sep at occurrences of which this string should be split

* @param maxsplit limit on the number of splits (if >=0)

* @return <code>PyList</code> of split sections

public static final List<CharSequence> rsplitfields(CharSequence s, String sep, int maxsplit) {

* Result built here (in reverse) is a list of split parts, exactly as required for

* s.rsplit(sep, maxsplit). If there are to be n splits, there will be n+1 elements.

List<CharSequence> list = new ArrayList<>();

int length = s.length();

int sepLength = sep.length();

if (maxsplit < 0) {

// Make all possible splits: there can't be more than:

maxsplit = length + 1;

}

if (maxsplit == 0) {

// Degenerate case

list.add(s);

} else if (sepLength == 0) {

// Empty separator is not allowed

throw Py.ValueError("empty separator");

} else {

// Index of first character of the last piece already on the list

int end = length;

// Add at most maxsplit pieces

for (int splits = 0; splits < maxsplit; splits++) {

// Find the next occurrence of sep (working leftwards)

int index = s.toString().lastIndexOf(sep, end - sepLength);

if (index < 0) {

// No more occurrences of sep: we're done

break;

} else {

// Make a piece from where we found sep up to end

list.add(s.subSequence(index + sepLength, end));

// New end (of next piece) is where we found sep

end = index;

}

// Last piece is the rest of the string (even if end==0)

list.add(s.subSequence(0, end));

}

return Lists.reverse(list);

}

/**

* Helper common to the Python and Java API returning the last index of the substring or -1 for

* not found. It accepts slice-like arguments, which may be <code>None</code> or end-relative

* (negative). This method also supports

* {@link PyUnicode#str_frind(PyObject, PyObject, PyObject)}.

* @param sub substring to find.

* @param startObj start of slice.

* @param endObj end of slice.

* @return index of <code>sub</code> in this object or -1 if not found.

public static final int _rfind(CharSequence s, String sub, PyObject startObj, PyObject endObj, int len) {

// Interpret the slice indices as concrete values

int[] indices = translateIndices(s, startObj, endObj, len);

int subLen = sub.length();

if (subLen == 0) {

// Special case: an empty string may be found anywhere, ...

int start = indices[2], end = indices[3];

if (end < 0 || end < start || start > len) {

// ... except ln a reverse slice or beyond the end of the string,

return -1;

} else {

// ... and will be reported at the end of the overlap.

return indices[1];

}

} else {

// General case: search for first match then check against slice.

int start = indices[0], end = indices[1];

int found = s.toString().lastIndexOf(sub, end - subLen);

if (found >= start) {

return found;

} else {

return -1;

}

// only for BMP

public static final String title(CharSequence s) {

char[] chars = new char[s.length()];

int n = chars.length;

boolean previous_is_cased = false;

for (int i = 0; i < n; i++) {

char ch = s.charAt(i);

if (previous_is_cased) {

chars[i] = Character.toLowerCase(ch);

} else {

chars[i] = Character.toTitleCase(ch);

}

if (Character.isLowerCase(ch) || Character.isUpperCase(ch) || Character.isTitleCase(ch)) {

previous_is_cased = true;

} else {

previous_is_cased = false;

}

return new String(chars);

}

public static final String swapcase(CharSequence s) {

char[] chars = new char[s.length()];

int n = chars.length;

for (int i = 0; i < n; i++) {

char c = s.charAt(i);

if (Character.isUpperCase(c)) {

chars[i] = Character.toLowerCase(c);

} else if (Character.isLowerCase(c)) {

chars[i] = Character.toUpperCase(c);

}

return new String(chars);

}

/**

* Implementation of Python <code>str.rstrip()</code> common to exposed and Java API, when

* stripping whitespace. Any whitespace byte/character will be discarded from the right end of

* this <code>str</code>.

*

* Implementation note: although a <code>str</code> contains only bytes, this method is also

* called by {@link PyUnicode#str_rstrip(PyObject)} when this is a basic-plane string.

* @return a new String, stripped of the whitespace characters/bytes

public static final String _rstrip(CharSequence s) {

// Rightmost non-whitespace

int right = _stripRight(s);

if (right < 0) {

// They're all whitespace

return "";

} else {

// Substring up to and including this rightmost non-whitespace

return s.subSequence(0, right + 1).toString();

}

/**

* Implementation of Python <code>str.rstrip()</code> common to exposed and Java API. Any

* byte/character matching one of those in <code>stripChars</code> will be discarded from the

* right end of this <code>str</code>. If <code>stripChars == null</code>, whitespace will be

* stripped.

*

* Implementation note: although a <code>str</code> contains only bytes, this method is also

* called by {@link PyUnicode#str_strip(PyObject)} when both arguments are basic-plane

* strings.

* @param stripChars characters to strip or null

* @return a new String, stripped of the specified characters/bytes

public static final String _rstrip(CharSequence s, String stripChars) {

if (stripChars == null) {

// Divert to the whitespace version

return _rstrip(s);

} else {

// Rightmost non-matching character

int right = _stripRight(s, stripChars);

// Substring up to and including this rightmost non-matching character (or "")

return s.subSequence(0, right + 1).toString();

}

/**

* Helper for <code>strip</code>, <code>rstrip</code> implementation, when stripping whitespace.

* @param s string to search.

* @return index of rightmost non-whitespace character or -1 if they all are.

private static final int _stripRight(CharSequence s) {

for (int right = s.length(); --right >= 0;) {

if (!isWhitespace(s.charAt(right))) {

return right;

}

return -1;

}

/**

* Helper for <code>strip</code>, <code>rstrip</code> implementation, when stripping specified

* characters.

* @param s string to search.

* @param stripChars specifies set of characters to strip

* @return index of rightmost character not in <code>stripChars</code> or -1 if they all are.

private static final int _stripRight(CharSequence s, String stripChars) {

for (int right = s.length(); --right >= 0;) {

if (stripChars.indexOf(s.charAt(right)) < 0) {

return right;

}

return -1;

}

/**

* Implementation of Python <code>str.strip()</code> common to exposed and Java API, when

* stripping whitespace. Any whitespace byte/character will be discarded from either end of this

* <code>str</code>.

*

* Implementation note: although a <code>str</code> contains only bytes, this method is also

* called by {@link PyUnicode#str_strip(PyObject)} when this is a basic-plane string.

* @return a new String, stripped of the whitespace characters/bytes

public static final CharSequence _strip(CharSequence s) {

// Rightmost non-whitespace

int right = _stripRight(s);

if (right < 0) {

// They're all whitespace

return "";

} else {

// Leftmost non-whitespace character: right known not to be a whitespace

int left = _stripLeft(s, right);

return s.subSequence(left, right + 1);

}

/**

* Implementation of Python <code>str.strip()</code> common to exposed and Java API. Any

* byte/character matching one of those in <code>stripChars</code> will be discarded from either

* end of this <code>str</code>. If <code>stripChars == null</code>, whitespace will be

* stripped.

*

* Implementation note: although a <code>str</code> contains only bytes, this method is also

View remainder of file in raw view

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

FilesExpand file tree

Encoding.java

Latest commit

History

Encoding.java

File metadata and controls