|
24 | 24 | import glob as py_glob |
25 | 25 | import io |
26 | 26 | import os |
| 27 | +import os.path |
27 | 28 | import sys |
28 | 29 | import tempfile |
29 | 30 |
|
|
35 | 36 | except ImportError: |
36 | 37 | S3_ENABLED = False |
37 | 38 |
|
| 39 | +try: |
| 40 | + import fsspec |
| 41 | + |
| 42 | + FSSPEC_ENABLED = True |
| 43 | +except ImportError: |
| 44 | + FSSPEC_ENABLED = False |
| 45 | + |
38 | 46 | if sys.version_info < (3, 0): |
39 | 47 | # In Python 2 FileExistsError is not defined and the |
40 | 48 | # error manifests it as OSError. |
@@ -69,6 +77,8 @@ def get_filesystem(filename): |
69 | 77 | if index >= 0: |
70 | 78 | prefix = filename[:index] |
71 | 79 | fs = _REGISTERED_FILESYSTEMS.get(prefix, None) |
| 80 | + if fs is None: |
| 81 | + fs = _get_fsspec_filesystem(filename) |
72 | 82 | if fs is None: |
73 | 83 | raise ValueError("No recognized filesystem for prefix %s" % prefix) |
74 | 84 | return fs |
@@ -401,6 +411,242 @@ def stat(self, filename): |
401 | 411 | raise |
402 | 412 |
|
403 | 413 |
|
class FSSpecFileSystem(object):
    """Provides filesystem access via fsspec.

    The current gfile interface doesn't map perfectly to the fsspec interface
    leading to some notable inefficiencies.

    * Reads and writes to files cause the file to be reopened each time which
      can cause a performance hit when accessing local file systems.
    * walk doesn't use the native fsspec walk function so performance may be
      slower.

    See https://github.com/tensorflow/tensorboard/issues/5286 for more info on
    limitations.
    """

    # Separates the protocol from the path, e.g. "s3://bucket/key".
    SEPARATOR = "://"
    # Separates chained fsspec filesystems, e.g. "simplecache::s3://bucket/key".
    CHAIN_SEPARATOR = "::"

    def _validate_path(self, path):
        """Rejects chained URLs that carry a path outside the last segment.

        Args:
          path: string, a possibly chained fsspec URL.

        Raises:
          errors.InvalidArgumentError: if any segment before the last one
            contains a "://" separator.
        """
        parts = path.split(self.CHAIN_SEPARATOR)
        for part in parts[:-1]:
            if self.SEPARATOR in part:
                raise errors.InvalidArgumentError(
                    None,
                    None,
                    "fsspec URL must only have paths in the last chained filesystem, got {}".format(
                        path
                    ),
                )

    def _translate_errors(func):
        """Decorator converting `FileNotFoundError` into `errors.NotFoundError`.

        Used only inside this class body to wrap the public methods.
        """
        # Imported locally so the decorator does not depend on a module-level
        # import being present.
        import functools

        # Keep the wrapped method's name and docstring intact.
        @functools.wraps(func)
        def func_wrapper(self, *args, **kwargs):
            try:
                return func(self, *args, **kwargs)
            except FileNotFoundError as e:
                raise errors.NotFoundError(None, None, str(e))

        return func_wrapper

    def _fs_path(self, filename):
        """Resolves a URL into an `(fsspec filesystem, path)` pair.

        Args:
          filename: string or UTF-8 encoded bytes, a possibly chained URL.

        Returns:
          The tuple returned by `fsspec.core.url_to_fs`.
        """
        if isinstance(filename, bytes):
            filename = filename.decode("utf-8")
        self._validate_path(filename)

        fs, path = fsspec.core.url_to_fs(filename)
        return fs, path

    @_translate_errors
    def exists(self, filename):
        """Determines whether a path exists or not."""
        fs, path = self._fs_path(filename)
        return fs.exists(path)

    def _join(self, sep, paths):
        """Joins the paths with the given separator.

        A component that starts with `sep` discards everything collected
        before it. A separator is inserted only when the previous component is
        non-empty and does not already end with `sep`.
        """
        result = []
        for part in paths:
            if part.startswith(sep):
                result = []
            if result and result[-1] and not result[-1].endswith(sep):
                result.append(sep)
            result.append(part)
        return "".join(result)

    @_translate_errors
    def join(self, path, *paths):
        """Join paths with a slash.

        The chain prefix and protocol of `path` are preserved; only the path
        portion of the last chained segment is joined with `paths`, using the
        separator of the filesystem class registered for that protocol.
        """
        self._validate_path(path)

        before, sep, last_path = path.rpartition(self.CHAIN_SEPARATOR)
        chain_prefix = before + sep
        protocol, path = fsspec.core.split_protocol(last_path)
        fs = fsspec.get_filesystem_class(protocol)
        if protocol:
            chain_prefix += protocol + self.SEPARATOR
        return chain_prefix + self._join(fs.sep, ((path,) + paths))

    @_translate_errors
    def read(self, filename, binary_mode=False, size=None, continue_from=None):
        """Reads contents of a file to a string.

        Args:
          filename: string, a path
          binary_mode: bool, read as binary if True, otherwise text
          size: int, number of bytes or characters to read, otherwise
              read all the contents of the file (from the continuation
              marker, if present).
          continue_from: An opaque value returned from a prior invocation of
              `read(...)` marking the last read position, so that reading
              may continue from there.  Otherwise read from the beginning.

        Returns:
          A tuple of `(data, continuation_token)` where `data` provides either
          bytes read from the file (if `binary_mode == true`) or the decoded
          string representation thereof (otherwise), and `continuation_token`
          is an opaque value that can be passed to the next invocation of
          `read(...)` in order to continue from the last read position.

        Raises:
          errors.NotFoundError: if the file does not exist.
          errors.InvalidArgumentError: if `continue_from` is given but the
            opened file is not seekable.
        """
        fs, path = self._fs_path(filename)

        mode = "rb" if binary_mode else "r"
        encoding = None if binary_mode else "utf8"
        # Ask the filesystem that was just resolved instead of going back
        # through the module-level registry for a second lookup.
        if not fs.exists(path):
            raise errors.NotFoundError(
                None, None, "Not Found: " + compat.as_text(filename)
            )
        with fs.open(path, mode, encoding=encoding) as f:
            if continue_from is not None:
                if not f.seekable():
                    raise errors.InvalidArgumentError(
                        None,
                        None,
                        "{} is not seekable".format(filename),
                    )
                offset = continue_from.get("opaque_offset", None)
                if offset is not None:
                    f.seek(offset)

            data = f.read(size)
            # The new offset may not be `offset + len(data)`, due to decoding
            # and newline translation.
            # So, just measure it in whatever terms the underlying stream uses.
            continuation_token = (
                {"opaque_offset": f.tell()} if f.seekable() else {}
            )
            return (data, continuation_token)

    @_translate_errors
    def write(self, filename, file_content, binary_mode=False):
        """Writes string file contents to a file.

        Args:
          filename: string, a path
          file_content: string, the contents
          binary_mode: bool, write as binary if True, otherwise text
        """
        self._write(filename, file_content, "wb" if binary_mode else "w")

    @_translate_errors
    def append(self, filename, file_content, binary_mode=False):
        """Append string file contents to a file.

        Args:
          filename: string, a path
          file_content: string, the contents to append
          binary_mode: bool, write as binary if True, otherwise text
        """
        self._write(filename, file_content, "ab" if binary_mode else "a")

    def _write(self, filename, file_content, mode):
        """Opens `filename` with `mode` and writes `file_content` to it.

        Content is coerced to bytes for binary modes and to text otherwise.
        """
        fs, path = self._fs_path(filename)
        encoding = None if "b" in mode else "utf8"
        with fs.open(path, mode, encoding=encoding) as f:
            compatify = compat.as_bytes if "b" in mode else compat.as_text
            f.write(compatify(file_content))

    def _get_chain_protocol_prefix(self, filename):
        """Returns everything up to and including the last protocol separator.

        For "simplecache::s3://bucket/key" this is "simplecache::s3://"; for a
        plain path without protocol or chain it is the empty string.
        """
        chain_prefix, chain_sep, last_path = filename.rpartition(
            self.CHAIN_SEPARATOR
        )
        protocol, sep, _ = last_path.rpartition(self.SEPARATOR)
        return chain_prefix + chain_sep + protocol + sep

    @_translate_errors
    def glob(self, filename):
        """Returns a list of files that match the given pattern(s)."""
        # Decode here as well: the text form is needed below to rebuild the
        # chain/protocol prefix.
        if isinstance(filename, bytes):
            filename = filename.decode("utf-8")

        fs, path = self._fs_path(filename)
        files = fs.glob(path)

        # check if applying the original chaining is required.
        if (
            self.SEPARATOR not in filename
            and self.CHAIN_SEPARATOR not in filename
        ):
            return files

        prefix = self._get_chain_protocol_prefix(filename)

        # Results that already carry a protocol or chain are returned as-is;
        # the others get the original prefix re-attached.
        return [
            file
            if (self.SEPARATOR in file or self.CHAIN_SEPARATOR in file)
            else prefix + file
            for file in files
        ]

    @_translate_errors
    def isdir(self, dirname):
        """Returns whether the path is a directory or not."""
        fs, path = self._fs_path(dirname)
        return fs.isdir(path)

    @_translate_errors
    def listdir(self, dirname):
        """Returns a list of entries contained within a directory."""
        fs, path = self._fs_path(dirname)
        files = fs.listdir(path, detail=False)
        # Reduce each listed path to its final component.
        files = [os.path.basename(fname) for fname in files]
        return files

    @_translate_errors
    def makedirs(self, dirname):
        """Creates a directory and all parent/intermediate directories."""
        fs, path = self._fs_path(dirname)
        return fs.makedirs(path, exist_ok=True)

    @_translate_errors
    def stat(self, filename):
        """Returns file statistics for a given path."""
        fs, path = self._fs_path(filename)
        return StatData(fs.size(path))
| 629 | + |
| 630 | + |
# Shared wrapper instance handed out for every fsspec-backed path.
_FSSPEC_FILESYSTEM = FSSpecFileSystem()


def _get_fsspec_filesystem(filename):
    """Returns the fsspec filesystem wrapper if fsspec knows the protocol.

    The protocol is taken from the first chained segment of `filename`, i.e.
    the text before the first "::" and then before the first "://".

    Args:
      filename: string, a possibly chained URL.

    Returns:
      The shared `FSSpecFileSystem` instance when fsspec is available and
      recognizes the protocol, otherwise `None`.
    """
    if not FSSPEC_ENABLED:
        return None

    segment = filename.partition(FSSpecFileSystem.CHAIN_SEPARATOR)[0]
    protocol = segment.partition(FSSpecFileSystem.SEPARATOR)[0]
    try:
        fs_class = fsspec.get_filesystem_class(protocol)
    except ValueError:
        # fsspec signals an unknown protocol by raising; report "not handled"
        # so the caller can produce its own unrecognized-prefix error.
        return None
    return _FSSPEC_FILESYSTEM if fs_class else None
| 648 | + |
| 649 | + |
404 | 650 | register_filesystem("", LocalFileSystem()) |
405 | 651 | if S3_ENABLED: |
406 | 652 | register_filesystem("s3", S3FileSystem()) |
@@ -514,6 +760,7 @@ def write(self, file_content): |
514 | 760 | # write the first chunk to truncate file if it already exists |
515 | 761 | self.fs.write(self.filename, file_content, self.binary_mode) |
516 | 762 | self.write_started = True |
| 763 | + |
517 | 764 | else: |
518 | 765 | # append the later chunks |
519 | 766 | self.fs.append(self.filename, file_content, self.binary_mode) |
|
0 commit comments