orc/java/core/src/java/org/apache/orc/impl/ReaderImpl.java at main · apache/orc

History

1163 lines (1054 loc) · 40.4 KB

Raw

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

263

264

265

266

267

268

269

270

271

272

273

274

275

276

277

278

279

280

281

282

283

284

285

286

287

288

289

290

291

292

293

294

295

296

297

298

299

300

301

302

303

304

305

306

307

308

309

310

311

312

313

314

315

316

317

318

319

320

321

322

323

324

325

326

327

328

329

330

331

332

333

334

335

336

337

338

339

340

341

342

343

344

345

346

347

348

349

350

351

352

353

354

355

356

357

358

359

360

361

362

363

364

365

366

367

368

369

370

371

372

373

374

375

376

377

378

379

380

381

382

383

384

385

386

387

388

389

390

391

392

393

394

395

396

397

398

399

400

401

402

403

404

405

406

407

408

409

410

411

412

413

414

415

416

417

418

419

420

421

422

423

424

425

426

427

428

429

430

431

432

433

434

435

436

437

438

439

440

441

442

443

444

445

446

447

448

449

450

451

452

453

454

455

456

457

458

459

460

461

462

463

464

465

466

467

468

469

470

471

472

473

474

475

476

477

478

479

480

481

482

483

484

485

486

487

488

489

490

491

492

493

494

495

496

497

498

499

500

501

502

503

504

505

506

507

508

509

510

511

512

513

514

515

516

517

518

519

520

521

522

523

524

525

526

527

528

529

530

531

532

533

534

535

536

537

538

539

540

541

542

543

544

545

546

547

548

549

550

551

552

553

554

555

556

557

558

559

560

561

562

563

564

565

566

567

568

569

570

571

572

573

574

575

576

577

578

579

580

581

582

583

584

585

586

587

588

589

590

591

592

593

594

595

596

597

598

599

600

601

602

603

604

605

606

607

608

609

610

611

612

613

614

615

616

617

618

619

620

621

622

623

624

625

626

627

628

629

630

631

632

633

634

635

636

637

638

639

640

641

642

643

644

645

646

647

648

649

650

651

652

653

654

655

656

657

658

659

660

661

662

663

664

665

666

667

668

669

670

671

672

673

674

675

676

677

678

679

680

681

682

683

684

685

686

687

688

689

690

691

692

693

694

695

696

697

698

699

700

701

702

703

704

705

706

707

708

709

710

711

712

713

714

715

716

717

718

719

720

721

722

723

724

725

726

727

728

729

730

731

732

733

734

735

736

737

738

739

740

741

742

743

744

745

746

747

748

749

750

751

752

753

754

755

756

757

758

759

760

761

762

763

764

765

766

767

768

769

770

771

772

773

774

775

776

777

778

779

780

781

782

783

784

785

786

787

788

789

790

791

792

793

794

795

796

797

798

799

800

801

802

803

804

805

806

807

808

809

810

811

812

813

814

815

816

817

818

819

820

821

822

823

824

825

826

827

828

829

830

831

832

833

834

835

836

837

838

839

840

841

842

843

844

845

846

847

848

849

850

851

852

853

854

855

856

857

858

859

860

861

862

863

864

865

866

867

868

869

870

871

872

873

874

875

876

877

878

879

880

881

882

883

884

885

886

887

888

889

890

891

892

893

894

895

896

897

898

899

900

901

902

903

904

905

906

907

908

909

910

911

912

913

914

915

916

917

918

919

920

921

922

923

924

925

926

927

928

929

930

931

932

933

934

935

936

937

938

939

940

941

942

943

944

945

946

947

948

949

950

951

952

953

954

955

956

957

958

959

960

961

962

963

964

965

966

967

968

969

970

971

972

973

974

975

976

977

978

979

980

981

982

983

984

985

986

987

988

989

990

991

992

993

994

995

996

997

998

999

1000

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.orc.impl;

import com.google.protobuf.CodedInputStream;

import com.google.protobuf.InvalidProtocolBufferException;

import com.google.protobuf.TextFormat;

import org.apache.hadoop.conf.Configuration;

import org.apache.hadoop.fs.FSDataInputStream;

import org.apache.hadoop.fs.FileStatus;

import org.apache.hadoop.fs.FileSystem;

import org.apache.hadoop.fs.Path;

import org.apache.hadoop.hive.ql.util.JavaDataModel;

import org.apache.hadoop.io.Text;

import org.apache.orc.ColumnStatistics;

import org.apache.orc.CompressionCodec;

import org.apache.orc.CompressionKind;

import org.apache.orc.DataMaskDescription;

import org.apache.orc.EncryptionAlgorithm;

import org.apache.orc.EncryptionKey;

import org.apache.orc.EncryptionVariant;

import org.apache.orc.FileFormatException;

import org.apache.orc.FileMetadata;

import org.apache.orc.OrcConf;

import org.apache.orc.OrcFile;

import org.apache.orc.OrcProto;

import org.apache.orc.OrcUtils;

import org.apache.orc.Reader;

import org.apache.orc.RecordReader;

import org.apache.orc.StripeInformation;

import org.apache.orc.StripeStatistics;

import org.apache.orc.TypeDescription;

import org.apache.orc.impl.reader.ReaderEncryption;

import org.apache.orc.impl.reader.ReaderEncryptionVariant;

import org.slf4j.Logger;

import org.slf4j.LoggerFactory;

import java.io.IOException;

import java.nio.ByteBuffer;

import java.security.Key;

import java.util.ArrayList;

import java.util.Arrays;

import java.util.Collections;

import java.util.List;

import java.util.Objects;

import java.util.function.Supplier;

/**
 * @since 1.1.0
 */

public class ReaderImpl implements Reader {

private static final Logger LOG = LoggerFactory.getLogger(ReaderImpl.class);

private static final OrcFile.Version[] ORC_FILE_VERSION_VALUES = OrcFile.Version.values();

private static final OrcFile.WriterVersion[] ORC_FILE_WRITER_VERSION_VALUES

= OrcFile.WriterVersion.values();

private static final int DIRECTORY_SIZE_GUESS = 16 * 1024;

public static final int DEFAULT_COMPRESSION_BLOCK_SIZE = 256 * 1024;

private final long maxLength;

protected final Path path;

protected final OrcFile.ReaderOptions options;

protected final org.apache.orc.CompressionKind compressionKind;

protected FSDataInputStream file;

protected int bufferSize;

// the unencrypted stripe statistics or null if they haven't been read yet

protected List<OrcProto.StripeStatistics> stripeStatistics;

private final int metadataSize;

protected final List<OrcProto.Type> types;

private final TypeDescription schema;

private final List<OrcProto.UserMetadataItem> userMetadata;

private final List<OrcProto.ColumnStatistics> fileStats;

private final List<StripeInformation> stripes;

protected final int rowIndexStride;

private final long contentLength, numberOfRows;

private final ReaderEncryption encryption;

private long deserializedSize = -1;

protected final Configuration conf;

protected final boolean useUTCTimestamp;

private final List<Integer> versionList;

private final OrcFile.WriterVersion writerVersion;

private final String softwareVersion;

protected final OrcTail tail;

public static class StripeInformationImpl

implements StripeInformation {

private final long stripeId;

private final long originalStripeId;

private final byte[][] encryptedKeys;

private final OrcProto.StripeInformation stripe;

public StripeInformationImpl(OrcProto.StripeInformation stripe,

long stripeId,

long previousOriginalStripeId,

byte[][] previousKeys) {

this.stripe = stripe;

this.stripeId = stripeId;

if (stripe.hasEncryptStripeId()) {

originalStripeId = stripe.getEncryptStripeId();

} else {

originalStripeId = previousOriginalStripeId + 1;

}

if (stripe.getEncryptedLocalKeysCount() != 0) {

encryptedKeys = new byte[stripe.getEncryptedLocalKeysCount()][];

for(int v=0; v < encryptedKeys.length; ++v) {

encryptedKeys[v] = stripe.getEncryptedLocalKeys(v).toByteArray();

}

} else {

encryptedKeys = previousKeys;

}

@Override

public boolean equals(Object o) {

if (this == o) {

return true;

}

if (o == null || getClass() != o.getClass()) {

return false;

}

StripeInformationImpl that = (StripeInformationImpl) o;

return stripeId == that.stripeId &&

originalStripeId == that.originalStripeId &&

Arrays.deepEquals(encryptedKeys, that.encryptedKeys) &&

stripe.equals(that.stripe);

}

@Override

public int hashCode() {

int result = Objects.hash(stripeId, originalStripeId, stripe);

result = 31 * result + Arrays.hashCode(encryptedKeys);

return result;

}

@Override

public long getOffset() {

return stripe.getOffset();

}

@Override

public long getLength() {

return stripe.getDataLength() + getIndexLength() + getFooterLength();

}

@Override

public long getDataLength() {

return stripe.getDataLength();

}

@Override

public long getFooterLength() {

return stripe.getFooterLength();

}

@Override

public long getIndexLength() {

return stripe.getIndexLength();

}

@Override

public long getNumberOfRows() {

return stripe.getNumberOfRows();

}

@Override

public long getStripeId() {

return stripeId;

}

@Override

public boolean hasEncryptionStripeId() {

return stripe.hasEncryptStripeId();

}

@Override

public long getEncryptionStripeId() {

return originalStripeId;

}

@Override

public byte[][] getEncryptedLocalKeys() {

return encryptedKeys;

}

@Override

public String toString() {

return "offset: " + getOffset() + " data: " +

getDataLength() + " rows: " + getNumberOfRows() + " tail: " +

getFooterLength() + " index: " + getIndexLength() +

(!hasEncryptionStripeId() || stripeId == originalStripeId - 1

? "" : " encryption id: " + originalStripeId);

}

/**
 * Returns the row count recorded in the {@code numberOfRows} field, which the
 * constructor fills from the file metadata or the footer.
 */
@Override
public long getNumberOfRows() {
  return numberOfRows;
}

/**
 * Lists the names of all user metadata items.
 *
 * @return a new, mutable list with one name per user metadata item; empty
 *         when this reader holds no user metadata
 */
@Override
public List<String> getMetadataKeys() {
  if (userMetadata == null) {
    // the constructor leaves userMetadata null when the reader is built
    // from a FileMetadata object
    return new ArrayList<>();
  }
  List<String> result = new ArrayList<>(userMetadata.size());
  for (OrcProto.UserMetadataItem item : userMetadata) {
    result.add(item.getName());
  }
  return result;
}

@Override

public ByteBuffer getMetadataValue(String key) {

for(OrcProto.UserMetadataItem item: userMetadata) {

if (item.hasName() && item.getName().equals(key)) {

return item.getValue().asReadOnlyByteBuffer();

}

throw new IllegalArgumentException("Can't find user metadata " + key);

}

@Override

public boolean hasMetadataValue(String key) {

for(OrcProto.UserMetadataItem item: userMetadata) {

if (item.hasName() && item.getName().equals(key)) {

return true;

}

return false;

}

/**
 * Returns the compression kind stored in the {@code compressionKind} field.
 */
@Override
public org.apache.orc.CompressionKind getCompressionKind() {
  return compressionKind;
}

/**
 * Returns the compression buffer size held in the {@code bufferSize} field.
 */
@Override
public int getCompressionSize() {
  return bufferSize;
}

/**
 * Returns the stripe list held by this reader (the internal list itself,
 * not a copy).
 */
@Override
public List<StripeInformation> getStripes() {
  return stripes;
}

/**
 * Returns the content length stored in the {@code contentLength} field.
 */
@Override
public long getContentLength() {
  return contentLength;
}

/**
 * Returns the protobuf type list for the schema. The list is rebuilt from
 * {@code schema} via {@code OrcUtils.getOrcTypes} on every call.
 */
@Override
public List<OrcProto.Type> getTypes() {
  return OrcUtils.getOrcTypes(schema);
}

public static OrcFile.Version getFileVersion(List<Integer> versionList) {

if (versionList == null || versionList.isEmpty()) {

return OrcFile.Version.V_0_11;

}

for (OrcFile.Version version: ORC_FILE_VERSION_VALUES) {

if (version.getMajor() == versionList.get(0) &&

version.getMinor() == versionList.get(1)) {

return version;

}

return OrcFile.Version.FUTURE;

}

/**
 * Returns the file version derived from this reader's {@code versionList}.
 */
@Override
public OrcFile.Version getFileVersion() {
  return getFileVersion(versionList);
}

/**
 * Returns the writer version stored in the {@code writerVersion} field.
 */
@Override
public OrcFile.WriterVersion getWriterVersion() {
  return writerVersion;
}

/**
 * Returns the software version string; this is null when the reader was
 * constructed from a FileMetadata object.
 */
@Override
public String getSoftwareVersion() {
  return softwareVersion;
}

/**
 * Returns the protobuf file tail held by {@code tail}.
 * NOTE(review): {@code tail} is null when the reader was constructed from a
 * FileMetadata object, so this would throw a NullPointerException there.
 */
@Override
public OrcProto.FileTail getFileTail() {
  return tail.getFileTail();
}

/**
 * Returns the encryption keys reported by this reader's
 * {@code ReaderEncryption}.
 */
@Override
public EncryptionKey[] getColumnEncryptionKeys() {
  return encryption.getKeys();
}

/**
 * Returns the data masks reported by this reader's {@code ReaderEncryption}.
 */
@Override
public DataMaskDescription[] getDataMasks() {
  return encryption.getMasks();
}

/**
 * Returns the encryption variants reported by this reader's
 * {@code ReaderEncryption}.
 */
@Override
public ReaderEncryptionVariant[] getEncryptionVariants() {
  return encryption.getVariants();
}

@Override

public List<StripeStatistics> getVariantStripeStatistics(EncryptionVariant variant)

throws IOException {

if (variant == null) {

if (stripeStatistics == null) {

try (CompressionCodec codec = OrcCodecPool.getCodec(compressionKind)) {

InStream.StreamOptions options = new InStream.StreamOptions();

if (codec != null) {

options.withCodec(codec).withBufferSize(bufferSize);

}

// deserialize the unencrypted stripe statistics

stripeStatistics = deserializeStripeStats(tail.getTailBuffer(),

tail.getMetadataOffset(), tail.getMetadataSize(), options);

}

return convertFromProto(stripeStatistics);

} else {

try (CompressionCodec codec = OrcCodecPool.getCodec(compressionKind)) {

InStream.StreamOptions compression = new InStream.StreamOptions();

if (codec != null) {

compression.withCodec(codec).withBufferSize(bufferSize);

}

return ((ReaderEncryptionVariant) variant).getStripeStatistics(null,

compression, this);

}

/**
 * Internal access to our view of the encryption.
 * @return the encryption information for this reader.
 */
public ReaderEncryption getEncryption() {
  return encryption;
}

/**
 * Returns the row index stride stored in the {@code rowIndexStride} field.
 */
@Override
public int getRowIndexStride() {
  return rowIndexStride;
}

@Override

public ColumnStatistics[] getStatistics() {

ColumnStatistics[] result = deserializeStats(schema, fileStats);

if (encryption.getKeys().length > 0) {

try (CompressionCodec codec = OrcCodecPool.getCodec(compressionKind)) {

InStream.StreamOptions compression = InStream.options();

if (codec != null) {

compression.withCodec(codec).withBufferSize(bufferSize);

}

for (int c = schema.getId(); c <= schema.getMaximumId(); ++c) {

ReaderEncryptionVariant variant = encryption.getVariant(c);

if (variant != null) {

try {

int base = variant.getRoot().getId();

ColumnStatistics[] overrides = decryptFileStats(variant,

compression, tail.getFooter());

for(int sub=0; sub < overrides.length; ++sub) {

result[base + sub] = overrides[sub];

}

} catch (IOException e) {

throw new RuntimeException("Can't decrypt file stats for " + path +

" with " + variant.getKeyDescription());

}

return result;

}

private ColumnStatistics[] decryptFileStats(ReaderEncryptionVariant encryption,

InStream.StreamOptions compression,

OrcProto.Footer footer

) throws IOException {

Key key = encryption.getFileFooterKey();

if (key == null) {

return null;

} else {

OrcProto.EncryptionVariant protoVariant =

footer.getEncryption().getVariants(encryption.getVariantId());

byte[] bytes = protoVariant.getFileStatistics().toByteArray();

BufferChunk buffer = new BufferChunk(ByteBuffer.wrap(bytes), 0);

EncryptionAlgorithm algorithm = encryption.getKeyDescription().getAlgorithm();

byte[] iv = new byte[algorithm.getIvLength()];

CryptoUtils.modifyIvForStream(encryption.getRoot().getId(),

OrcProto.Stream.Kind.FILE_STATISTICS, footer.getStripesCount() + 1)

.accept(iv);

InStream.StreamOptions options = new InStream.StreamOptions(compression)

.withEncryption(algorithm, key, iv);

InStream in = InStream.create("encrypted file stats", buffer,

0, bytes.length, options);

OrcProto.FileStatistics decrypted = OrcProto.FileStatistics.parseFrom(in);

ColumnStatistics[] result = new ColumnStatistics[decrypted.getColumnCount()];

TypeDescription root = encryption.getRoot();

for(int i= 0; i < result.length; ++i){

result[i] = ColumnStatisticsImpl.deserialize(root.findSubtype(root.getId() + i),

decrypted.getColumn(i), writerUsedProlepticGregorian(),

getConvertToProlepticGregorian());

}

return result;

}

/**
 * Convert protobuf column statistics into {@link ColumnStatistics} objects.
 *
 * @param schema the schema used to look up each column's type by index, or
 *        null to deserialize without type information
 * @param fileStats the serialized statistics, one per column
 * @return an array with one entry per element of {@code fileStats}
 */
public ColumnStatistics[] deserializeStats(
    TypeDescription schema,
    List<OrcProto.ColumnStatistics> fileStats) {
  final int columnCount = fileStats.size();
  ColumnStatistics[] stats = new ColumnStatistics[columnCount];
  for (int column = 0; column < columnCount; column++) {
    TypeDescription columnType = null;
    if (schema != null) {
      columnType = schema.findSubtype(column);
    }
    stats[column] = ColumnStatisticsImpl.deserialize(
        columnType,
        fileStats.get(column),
        writerUsedProlepticGregorian(),
        getConvertToProlepticGregorian());
  }
  return stats;
}

/**
 * Returns the file schema stored in the {@code schema} field.
 */
@Override
public TypeDescription getSchema() {
  return schema;
}

/**

* Ensure this is an ORC file to prevent users from trying to read text

* files or RC files as ORC files.

* @param in the file being read

* @param path the filename for error messages

* @param psLen the postscript length

* @param buffer the tail of the file

protected static void ensureOrcFooter(FSDataInputStream in,

Path path,

int psLen,

ByteBuffer buffer) throws IOException {

int magicLength = OrcFile.MAGIC.length();

int fullLength = magicLength + 1;

if (psLen < fullLength || buffer.remaining() < fullLength) {

throw new FileFormatException("Malformed ORC file " + path +

". Invalid postscript length " + psLen);

}

int offset = buffer.arrayOffset() + buffer.position() + buffer.limit() - fullLength;

byte[] array = buffer.array();

// now look for the magic string at the end of the postscript.

if (!Text.decode(array, offset, magicLength).equals(OrcFile.MAGIC)) {

// If it isn't there, this may be the 0.11.0 version of ORC.

// Read the first 3 bytes of the file to check for the header

byte[] header = new byte[magicLength];

in.readFully(0, header, 0, magicLength);

// if it isn't there, this isn't an ORC file

if (!Text.decode(header, 0 , magicLength).equals(OrcFile.MAGIC)) {

throw new FileFormatException("Malformed ORC file " + path +

". Invalid postscript.");

}

/**

* Ensure this is an ORC file to prevent users from trying to read text

* files or RC files as ORC files.

* @param psLen the postscript length

* @param buffer the tail of the file

* @deprecated Use {@link ReaderImpl#ensureOrcFooter(FSDataInputStream, Path, int, ByteBuffer)} instead.

@Deprecated

protected static void ensureOrcFooter(ByteBuffer buffer, int psLen) throws IOException {

int magicLength = OrcFile.MAGIC.length();

int fullLength = magicLength + 1;

if (psLen < fullLength || buffer.remaining() < fullLength) {

throw new FileFormatException("Malformed ORC file. Invalid postscript length " + psLen);

}

int offset = buffer.arrayOffset() + buffer.position() + buffer.limit() - fullLength;

byte[] array = buffer.array();

// now look for the magic string at the end of the postscript.

if (!Text.decode(array, offset, magicLength).equals(OrcFile.MAGIC)) {

// if it isn't there, this may be 0.11.0 version of the ORC file.

// Read the first 3 bytes from the buffer to check for the header

if (!Text.decode(buffer.array(), 0, magicLength).equals(OrcFile.MAGIC)) {

throw new FileFormatException("Malformed ORC file. Invalid postscript length " + psLen);

}

/**

* Build a version string out of an array.

* @param version the version number as a list

* @return the human readable form of the version string

private static String versionString(List<Integer> version) {

StringBuilder buffer = new StringBuilder();

for(int i=0; i < version.size(); ++i) {

if (i != 0) {

buffer.append('.');

}

buffer.append(version.get(i));

}

return buffer.toString();

}

/**

* Check to see if this ORC file is from a future version and if so,

* warn the user that we may not be able to read all of the column encodings.

* @param path the data source path for error messages

* @param postscript the parsed postscript

protected static void checkOrcVersion(Path path,

OrcProto.PostScript postscript

) throws IOException {

List<Integer> version = postscript.getVersionList();

if (getFileVersion(version) == OrcFile.Version.FUTURE) {

throw new IOException(path + " was written by a future ORC version " +

versionString(version) + ". This file is not readable by this version of ORC.\n"+

"Postscript: " + TextFormat.shortDebugString(postscript));

}

/**
 * Constructor that lets the user specify additional options.
 *
 * <p>If the options carry a FileMetadata object, every field is filled from
 * it and {@code tail} and {@code userMetadata} stay null. Otherwise the
 * OrcTail from the options is used (after a version check), or the tail is
 * read from the file and stored back into the options.
 *
 * @param path pathname for file
 * @param options options for reading
 * @throws IOException if the file tail can't be read or fails the version check
 */
public ReaderImpl(Path path, OrcFile.ReaderOptions options) throws IOException {
  this.path = path;
  this.options = options;
  this.conf = options.getConfiguration();
  this.maxLength = options.getMaxLength();
  this.useUTCTimestamp = options.getUseUTCTimestamp();
  FileMetadata fileMetadata = options.getFileMetadata();
  if (fileMetadata != null) {
    // legacy path: everything comes from the caller supplied metadata
    this.compressionKind = fileMetadata.getCompressionKind();
    this.bufferSize = fileMetadata.getCompressionBufferSize();
    this.metadataSize = fileMetadata.getMetadataSize();
    this.stripeStatistics = fileMetadata.getStripeStats();
    this.versionList = fileMetadata.getVersionList();
    OrcFile.WriterImplementation writer =
        OrcFile.WriterImplementation.from(fileMetadata.getWriterImplementation());
    this.writerVersion =
        OrcFile.WriterVersion.from(writer, fileMetadata.getWriterVersionNum());
    // local list; the types field is assigned at the end of the constructor
    List<OrcProto.Type> types = fileMetadata.getTypes();
    OrcUtils.isValidTypeTree(types, 0);
    this.schema = OrcUtils.convertTypeFromProtobuf(types, 0);
    this.rowIndexStride = fileMetadata.getRowIndexStride();
    this.contentLength = fileMetadata.getContentLength();
    this.numberOfRows = fileMetadata.getNumberOfRows();
    this.fileStats = fileMetadata.getFileStats();
    this.stripes = fileMetadata.getStripes();
    this.tail = null;
    this.userMetadata = null; // not cached and not needed here
    // FileMetadata is obsolete and doesn't support encryption
    this.encryption = new ReaderEncryption();
    this.softwareVersion = null;
  } else {
    OrcTail orcTail = options.getOrcTail();
    if (orcTail == null) {
      // no tail supplied: read it from the file and remember it in the options
      tail = extractFileTail(getFileSystem(), path, options.getMaxLength());
      options.orcTail(tail);
    } else {
      // a supplied tail hasn't been version checked yet
      checkOrcVersion(path, orcTail.getPostScript());
      tail = orcTail;
    }
    this.compressionKind = tail.getCompressionKind();
    this.bufferSize = tail.getCompressionBufferSize();
    this.metadataSize = tail.getMetadataSize();
    this.versionList = tail.getPostScript().getVersionList();
    this.schema = tail.getSchema();
    this.rowIndexStride = tail.getFooter().getRowIndexStride();
    this.contentLength = tail.getFooter().getContentLength();
    this.numberOfRows = tail.getFooter().getNumberOfRows();
    this.userMetadata = tail.getFooter().getMetadataList();
    this.fileStats = tail.getFooter().getStatisticsList();
    this.writerVersion = tail.getWriterVersion();
    this.stripes = tail.getStripes();
    // stripe statistics are deserialized on demand
    this.stripeStatistics = null;
    OrcProto.Footer footer = tail.getFooter();
    this.encryption = new ReaderEncryption(footer, schema,
        tail.getStripeStatisticsOffset(), tail.getTailBuffer(), stripes,
        options.getKeyProvider(), conf);
    this.softwareVersion = OrcUtils.getSoftwareVersion(footer.getWriter(),
        footer.getSoftwareVersion());
  }
  this.types = OrcUtils.getOrcTypes(schema);
}

/**
 * Get the file system for this reader. If the options don't carry one yet,
 * it is resolved from the path and stored back into the options.
 *
 * @return the file system to read {@code path} from
 * @throws IOException if the file system can't be resolved from the path
 */
protected FileSystem getFileSystem() throws IOException {
  FileSystem configured = options.getFilesystem();
  if (configured != null) {
    return configured;
  }
  FileSystem resolved = path.getFileSystem(options.getConfiguration());
  options.filesystem(resolved);
  return resolved;
}

/**
 * Wrap {@link #getFileSystem()} in a supplier that resolves the file system
 * when asked for it.
 *
 * @return a supplier that rethrows an {@link IOException} from the lookup
 *         as a {@link RuntimeException}
 */
protected Supplier<FileSystem> getFileSystemSupplier() {
  return new Supplier<FileSystem>() {
    @Override
    public FileSystem get() {
      try {
        return getFileSystem();
      } catch (IOException e) {
        throw new RuntimeException("Can't create filesystem", e);
      }
    }
  };
}

/**

* Get the WriterVersion based on the ORC file postscript.

* @param writerVersion the integer writer version

* @return the version of the software that produced the file

public static OrcFile.WriterVersion getWriterVersion(int writerVersion) {

for(OrcFile.WriterVersion version: ORC_FILE_WRITER_VERSION_VALUES) {

if (version.getId() == writerVersion) {

return version;

}

return OrcFile.WriterVersion.FUTURE;

}

/**
 * Parse the metadata section out of a buffer.
 *
 * <p>Note that this moves the position and limit of {@code bb} to bracket
 * the metadata bytes.
 *
 * @param bb the buffer holding the metadata
 * @param metadataAbsPos the position of the metadata within {@code bb}
 * @param metadataSize the length of the metadata in bytes
 * @param options the stream options used to read the section
 * @return the parsed metadata
 * @throws IOException if the section can't be read or parsed
 */
public static OrcProto.Metadata extractMetadata(ByteBuffer bb, int metadataAbsPos,
    int metadataSize, InStream.StreamOptions options) throws IOException {
  bb.position(metadataAbsPos);
  bb.limit(metadataAbsPos + metadataSize);
  BufferChunk chunk = new BufferChunk(bb, 0);
  InStream metadataStream =
      InStream.create("metadata", chunk, 0, metadataSize, options);
  return OrcProto.Metadata.parseFrom(
      InStream.createCodedInputStream(metadataStream));
}

/**
 * Parse the postscript and validate its version and compression kind.
 *
 * @param buffer the chunk(s) holding the tail of the file
 * @param path the file name for error messages
 * @param psLen the length of the postscript in bytes
 * @param psOffset the offset of the postscript
 * @return the parsed postscript
 * @throws IOException if parsing fails or the file version is unsupported
 * @throws IllegalArgumentException if the compression kind is not one of
 *         the kinds listed below
 */
private static OrcProto.PostScript extractPostScript(BufferChunk buffer,
                                                     Path path,
                                                     int psLen,
                                                     long psOffset
                                                     ) throws IOException {
  InStream rawPostScript = InStream.create("ps", buffer, psOffset, psLen);
  CodedInputStream coded = InStream.createCodedInputStream(rawPostScript);
  OrcProto.PostScript postScript = OrcProto.PostScript.parseFrom(coded);
  checkOrcVersion(path, postScript);

  // Only the listed compression kinds are accepted.
  switch (postScript.getCompression()) {
    case NONE:
    case ZLIB:
    case SNAPPY:
    case LZO:
    case LZ4:
    case ZSTD:
    case BROTLI:
      return postScript;
    default:
      throw new IllegalArgumentException("Unknown compression");
  }
}

/**

* Build a virtual OrcTail for empty files.

* @return a new OrcTail

OrcTail buildEmptyTail() throws IOException {

OrcProto.PostScript.Builder postscript = OrcProto.PostScript.newBuilder();

OrcFile.Version version = OrcFile.Version.CURRENT;

postscript.setMagic(OrcFile.MAGIC)

.setCompression(OrcProto.CompressionKind.NONE)

.setFooterLength(0)

.addVersion(version.getMajor())

.addVersion(version.getMinor())

.setMetadataLength(0)

.setWriterVersion(OrcFile.CURRENT_WRITER.getId());

// Use a struct with no fields

OrcProto.Type.Builder struct = OrcProto.Type.newBuilder();

struct.setKind(OrcProto.Type.Kind.STRUCT);

OrcProto.Footer.Builder footer = OrcProto.Footer.newBuilder();

footer.setHeaderLength(0)

.setContentLength(0)

.addTypes(struct)

.setNumberOfRows(0)

.setRowIndexStride(0);

OrcProto.FileTail.Builder result = OrcProto.FileTail.newBuilder();

result.setFooter(footer);

result.setPostscript(postscript);

result.setFileLength(0);

result.setPostscriptLength(0);

return new OrcTail(result.build(), new BufferChunk(0, 0), -1, this);

}

private static void read(FSDataInputStream file,

BufferChunk chunks) throws IOException {

while (chunks != null) {

if (!chunks.hasData()) {

int len = chunks.getLength();

ByteBuffer bb = ByteBuffer.allocate(len);

file.readFully(chunks.getOffset(), bb.array(), bb.arrayOffset(), len);

chunks.setChunk(bb);

}

chunks = (BufferChunk) chunks.next;

}

/**
 * Parse the file tail from a buffer, passing -1 for both the file length and
 * the modification time.
 *
 * @param buffer the buffer holding the tail of the file
 * @return the parsed tail
 * @throws IOException if the tail can't be parsed
 * @deprecated Use {@link ReaderImpl#extractFileTail(FileSystem, Path, long)} instead.
 * This is for backward compatibility.
 */
@Deprecated
public static OrcTail extractFileTail(ByteBuffer buffer)
    throws IOException {
  return extractFileTail(buffer, -1,-1);
}

/**

* Read compression block size from the postscript if it is set; otherwise,

* use the same 256k default the C++ implementation uses.

public static int getCompressionBlockSize(OrcProto.PostScript postScript) {

if (postScript.hasCompressionBlockSize()) {

return (int) postScript.getCompressionBlockSize();

} else {

return DEFAULT_COMPRESSION_BLOCK_SIZE;

}

/**

* @deprecated Use {@link ReaderImpl#extractFileTail(FileSystem, Path, long)} instead.

* This is for backward compatibility.

@Deprecated

public static OrcTail extractFileTail(ByteBuffer buffer, long fileLen, long modificationTime)

throws IOException {

OrcProto.PostScript ps;

long readSize = buffer.limit();

OrcProto.FileTail.Builder fileTailBuilder = OrcProto.FileTail.newBuilder();

fileTailBuilder.setFileLength(fileLen != -1 ? fileLen : readSize);

int psLen = buffer.get((int) (readSize - 1)) & 0xff;

int psOffset = (int) (readSize - 1 - psLen);

ensureOrcFooter(buffer, psLen);

byte[] psBuffer = new byte[psLen];

System.arraycopy(buffer.array(), psOffset, psBuffer, 0, psLen);

ps = OrcProto.PostScript.parseFrom(psBuffer);

int footerSize = (int) ps.getFooterLength();

CompressionKind compressionKind =

CompressionKind.valueOf(ps.getCompression().name());

fileTailBuilder.setPostscriptLength(psLen).setPostscript(ps);

InStream.StreamOptions compression = new InStream.StreamOptions();

try (CompressionCodec codec = OrcCodecPool.getCodec(compressionKind)){

if (codec != null) {

compression.withCodec(codec)

.withBufferSize(getCompressionBlockSize(ps));

}

OrcProto.Footer footer =

OrcProto.Footer.parseFrom(

InStream.createCodedInputStream(

InStream.create("footer", new BufferChunk(buffer, 0),

psOffset - footerSize, footerSize, compression)));

fileTailBuilder.setPostscriptLength(psLen).setFooter(footer);

}

// clear does not clear the contents but sets position to 0 and limit = capacity

buffer.clear();

return new OrcTail(fileTailBuilder.build(),

new BufferChunk(buffer.slice(), 0), modificationTime);

}

/**
 * Read and parse the tail (postscript and footer) of an ORC file.
 *
 * <p>The file is opened and kept in the {@code file} field. A guess-sized
 * block is read from the end of the file first; if the postscript says the
 * tail is larger, the missing bytes are read into a second chunk that is
 * linked in front of the first. If anything fails, {@code close()} is called
 * and the failure is rethrown as an {@link IOException}.
 *
 * @param fs the file system to read from
 * @param path the file to read
 * @param maxFileLength the length to treat as the end of the file, or
 *        {@code Long.MAX_VALUE} to ask the file system for the real length
 * @return the parsed tail; a virtual empty tail when the length is 0
 * @throws IOException if the file is not a valid ORC file or can't be read
 */
protected OrcTail extractFileTail(FileSystem fs, Path path,
    long maxFileLength) throws IOException {
  BufferChunk buffer;
  OrcProto.PostScript ps;
  OrcProto.FileTail.Builder fileTailBuilder = OrcProto.FileTail.newBuilder();
  long modificationTime;
  file = fs.open(path);
  try {
    // figure out the size of the file using the option or filesystem
    long size;
    if (maxFileLength == Long.MAX_VALUE) {
      FileStatus fileStatus = fs.getFileStatus(path);
      size = fileStatus.getLen();
      modificationTime = fileStatus.getModificationTime();
    } else {
      // caller supplied length: no file status lookup, so no modification time
      size = maxFileLength;
      modificationTime = -1;
    }
    if (size == 0) {
      // Hive often creates empty files (including ORC) and has an
      // optimization to create a 0 byte file as an empty ORC file.
      return buildEmptyTail();
    } else if (size <= OrcFile.MAGIC.length()) {
      // Anything smaller than MAGIC header cannot be valid (valid ORC files
      // are actually around 40 bytes, this is more conservative)
      throw new FileFormatException("Not a valid ORC file " + path
          + " (maxFileLength= " + maxFileLength + ")");
    }
    fileTailBuilder.setFileLength(size);

    //read last bytes into buffer to get PostScript
    int readSize = (int) Math.min(size, DIRECTORY_SIZE_GUESS);
    buffer = new BufferChunk(size - readSize, readSize);
    read(file, buffer);

    //read the PostScript
    //get length of PostScript
    ByteBuffer bb = buffer.getData();
    // the last byte read is the postscript length (as an unsigned byte)
    int psLen = bb.get(readSize - 1) & 0xff;
    ensureOrcFooter(file, path, psLen, bb);
    long psOffset = size - 1 - psLen;
    ps = extractPostScript(buffer, path, psLen, psOffset);
    CompressionKind compressionKind =
        CompressionKind.valueOf(ps.getCompression().name());
    fileTailBuilder.setPostscriptLength(psLen).setPostscript(ps);

    // NOTE(review): these lengths are narrowed from the postscript's values
    // to int and summed without an overflow check -- confirm that is safe
    // for untrusted files.
    int footerSize = (int) ps.getFooterLength();
    int metadataSize = (int) ps.getMetadataLength();
    int stripeStatSize = (int) ps.getStripeStatisticsLength();

    //check if extra bytes need to be read
    int tailSize = 1 + psLen + footerSize + metadataSize + stripeStatSize;
    int extra = Math.max(0, tailSize - readSize);
    if (extra > 0) {
      //more bytes need to be read, seek back to the right place and read extra bytes
      BufferChunk orig = buffer;
      // the new chunk becomes the head of the list, in front of the first read
      buffer = new BufferChunk(size - tailSize, extra);
      buffer.next = orig;
      orig.prev = buffer;
      read(file, buffer);
    }

    InStream.StreamOptions compression = new InStream.StreamOptions();
    try (CompressionCodec codec = OrcCodecPool.getCodec(compressionKind)) {
      if (codec != null) {
        compression.withCodec(codec)
            .withBufferSize(getCompressionBlockSize(ps));
      }
      // the footer sits directly in front of the postscript
      OrcProto.Footer footer =
          OrcProto.Footer.parseFrom(
              InStream.createCodedInputStream(
                  InStream.create("footer", buffer, psOffset - footerSize,
                      footerSize, compression)));
      fileTailBuilder.setFooter(footer);
    }
  } catch (Throwable thr) {
    try {
      close();
    } catch (IOException except) {
      LOG.info("Ignoring secondary exception in close of " + path, except);
    }
    // IOExceptions pass through unchanged; anything else is wrapped
    throw thr instanceof IOException ? (IOException) thr :
        new IOException("Problem reading file footer " + path, thr);
  }

  return new OrcTail(fileTailBuilder.build(), buffer, modificationTime, this);
}

/**
 * Returns the serialized tail held by this reader's {@code tail} object.
 *
 * @return the buffer produced by {@code tail.getSerializedTail()}
 */
@Override
public ByteBuffer getSerializedFileFooter() {
  final ByteBuffer serializedTail = tail.getSerializedTail();
  return serializedTail;
}

/**
 * Reports whether the file was written using the proleptic Gregorian calendar.
 *
 * @return {@code true} when the footer's recorded calendar is
 *         {@code PROLEPTIC_GREGORIAN}; when the footer records no calendar,
 *         the value of {@code OrcConf.PROLEPTIC_GREGORIAN_DEFAULT} read from
 *         this reader's configuration
 */
@Override
public boolean writerUsedProlepticGregorian() {
  final OrcProto.Footer fileFooter = tail.getFooter();
  // No calendar recorded in the footer: defer to the configured default.
  if (!fileFooter.hasCalendar()) {
    return OrcConf.PROLEPTIC_GREGORIAN_DEFAULT.getBoolean(conf);
  }
  return fileFooter.getCalendar() == OrcProto.CalendarKind.PROLEPTIC_GREGORIAN;
}

/**
 * Returns the convert-to-proleptic-Gregorian setting carried by this
 * reader's {@code options}.
 *
 * @return the value reported by {@code options.getConvertToProlepticGregorian()}
 */
@Override
public boolean getConvertToProlepticGregorian() {
  final boolean convert = options.getConvertToProlepticGregorian();
  return convert;
}

/**
 * Creates a new {@code Options} instance from this reader's configuration.
 *
 * @return a freshly constructed {@code Options} built with {@code conf}
 */
@Override
public Options options() {
  final Options fresh = new Options(conf);
  return fresh;
}

/**
 * Opens a record reader using the options returned by {@link #options()}.
 *
 * @return the reader produced by {@code rows(Options)} for those options
 * @throws IOException if {@code rows(Options)} fails
 */
@Override
public RecordReader rows() throws IOException {
  final Options defaults = options();
  return rows(defaults);
}

/**
 * Opens a record reader over this file with the given options.
 *
 * @param options the read options handed to the new {@code RecordReaderImpl}
 * @return a new {@code RecordReaderImpl} bound to this reader and {@code options}
 * @throws IOException if constructing the record reader fails
 */
@Override
public RecordReader rows(Options options) throws IOException {
  // Build the message (which stringifies the path and the options) only when
  // debug output is actually enabled, so the common case skips that work.
  if (LOG.isDebugEnabled()) {
    LOG.debug("Reading ORC rows from " + path + " with " + options);
  }
  return new RecordReaderImpl(this, options);
}

/**
 * Returns the raw data size over all columns listed in {@code fileStats}.
 *
 * The result is cached in {@code deserializedSize}; {@code -1} marks it as
 * not yet computed, so the computation runs only while that marker is set.
 *
 * @return the cached or newly computed raw data size
 */
@Override
public long getRawDataSize() {
  // Already computed on an earlier call: reuse the cached value.
  if (deserializedSize != -1) {
    return deserializedSize;
  }

  // Select every column index that has an entry in the file statistics.
  final int columnCount = fileStats.size();
  List<Integer> allColumns = new ArrayList<>();
  for (int col = 0; col < columnCount; col++) {
    allColumns.add(col);
  }

  deserializedSize = getRawDataSizeFromColIndices(allColumns);
  return deserializedSize;
}

@Override

public long getRawDataSizeFromColIndices(List<Integer> colIndices) {

boolean[] include = new boolean[schema.getMaximumId() + 1];

for(Integer rootId: colIndices) {

TypeDescription root = schema.findSubtype(rootId);

for(int c = root.getId(); c <= root.getMaximumId(); ++c) {

include[c] = true;

}

return getRawDataSizeFromColIndices(include, schema, fileStats);

}

public static long getRawDataSizeFromColIndices(

List<Integer> colIndices,

List<OrcProto.Type> types,

List<OrcProto.ColumnStatistics> stats)

throws FileFormatException {

TypeDescription schema = OrcUtils.convertTypeFromProtobuf(types, 0);

boolean[] include = new boolean[schema.getMaximumId() + 1];

for(Integer rootId: colIndices) {

TypeDescription root = schema.findSubtype(rootId);

for(int c = root.getId(); c <= root.getMaximumId(); ++c) {

include[c] = true;

}

return getRawDataSizeFromColIndices(include, schema, stats);

}

static long getRawDataSizeFromColIndices(boolean[] include,

TypeDescription schema,

List<OrcProto.ColumnStatistics> stats) {

long result = 0;

for (int c = schema.getId(); c <= schema.getMaximumId(); ++c) {

if (include[c]) {

result += getRawDataSizeOfColumn(schema.findSubtype(c), stats);

}

return result;

}

private static long getRawDataSizeOfColumn(TypeDescription column,

List<OrcProto.ColumnStatistics> stats) {

OrcProto.ColumnStatistics colStat = stats.get(column.getId());

long numVals = colStat.getNumberOfValues();

switch (column.getCategory()) {

case BINARY:

// old orc format doesn't support binary statistics. checking for binary

// statistics is not required as protocol buffers takes care of it.

return colStat.getBinaryStatistics().getSum();

case STRING:

case CHAR:

case VARCHAR:

// old orc format doesn't support sum for string statistics. checking for

// existence is not required as protocol buffers takes care of it.

// ORC strings are deserialized to java strings. so use java data model's

// string size

numVals = numVals == 0 ? 1 : numVals;

int avgStrLen = (int) (colStat.getStringStatistics().getSum() / numVals);

return numVals * JavaDataModel.get().lengthForStringOfLength(avgStrLen);

case TIMESTAMP:

case TIMESTAMP_INSTANT:

return numVals * JavaDataModel.get().lengthOfTimestamp();

case DATE:

return numVals * JavaDataModel.get().lengthOfDate();

case DECIMAL:

return numVals * JavaDataModel.get().lengthOfDecimal();

case DOUBLE:

case LONG:

return numVals * JavaDataModel.get().primitive2();

case FLOAT:

case INT:

case SHORT:

case BOOLEAN:

case BYTE:

case STRUCT:

case UNION:

View remainder of file in raw view

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

FilesExpand file tree

ReaderImpl.java

Latest commit

History

ReaderImpl.java

File metadata and controls