
Commit ee45f6e

gfile: add support for fsspec filesystems
1 parent fcfbc1a

8 files changed: +931 -1 lines changed

.github/workflows/ci.yml

Lines changed: 3 additions & 1 deletion
@@ -110,7 +110,9 @@ jobs:
       - name: 'Bazel: run manual tests'
         run: |
           bazel test //tensorboard/compat/tensorflow_stub:gfile_s3_test &&
-          bazel test //tensorboard/summary/writer:event_file_writer_s3_test
+          bazel test //tensorboard/summary/writer:event_file_writer_s3_test &&
+          bazel test //tensorboard/compat/tensorflow_stub:gfile_fsspec_test &&
+          bazel test //tensorboard/summary/writer:event_file_writer_fsspec_test

   build-data-server-pip:
     runs-on: ${{ matrix.platform }}

tensorboard/BUILD

Lines changed: 7 additions & 0 deletions
@@ -488,6 +488,13 @@ py_library(name = "expect_requests_installed")
 # optional dependency.
 py_library(name = "expect_pandas_installed")

+# This is a dummy rule used as a fsspec dependency in open-source.
+# We expect fsspec to already be installed on the system, e.g. via
+# `pip install fsspec`.
+# NOTE: Unlike other parallel dependencies in this file, fsspec is an
+# optional dependency.
+py_library(name = "expect_fsspec_installed")
+
 py_library(
     name = "data_compat",
     srcs = ["data_compat.py"],
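
Not part of the commit: since fsspec is an optional dependency (installed separately via `pip install fsspec`), callers can probe for it before handing fsspec URLs to the gfile stub. A minimal sketch using the FSSPEC_ENABLED flag that gfile.py introduces below; the memory:// path is illustrative only.

from tensorboard.compat.tensorflow_stub.io import gfile

logdir = "memory://logs"  # hypothetical fsspec-backed location
if getattr(gfile, "FSSPEC_ENABLED", False):
    # Resolved through the fsspec fallback added in this commit.
    gfile.makedirs(logdir)
else:
    print("fsspec not installed; run `pip install fsspec` to enable fsspec paths")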

tensorboard/compat/tensorflow_stub/BUILD

Lines changed: 14 additions & 0 deletions
@@ -16,6 +16,7 @@ py_library(
     srcs_version = "PY3",
     deps = [
         "//tensorboard:expect_absl_flags_installed",
+        "//tensorboard:expect_fsspec_installed",
         "//tensorboard:expect_numpy_installed",
         "//tensorboard/compat/proto:protos_all_py_pb2",
     ],
@@ -59,3 +60,16 @@ py_test(
         "//tensorboard:test",
     ],
 )
+
+py_test(
+    name = "gfile_fsspec_test",
+    size = "small",
+    srcs = ["io/gfile_fsspec_test.py"],
+    srcs_version = "PY3",
+    tags = ["support_notf"],
+    deps = [
+        ":tensorflow_stub",
+        "//tensorboard:expect_fsspec_installed",
+        "//tensorboard:test",
+    ],
+)

tensorboard/compat/tensorflow_stub/io/gfile.py

Lines changed: 247 additions & 0 deletions
@@ -24,6 +24,7 @@
 import glob as py_glob
 import io
 import os
+import os.path
 import sys
 import tempfile

@@ -35,6 +36,13 @@
 except ImportError:
     S3_ENABLED = False

+try:
+    import fsspec
+
+    FSSPEC_ENABLED = True
+except ImportError:
+    FSSPEC_ENABLED = False
+
 if sys.version_info < (3, 0):
     # In Python 2 FileExistsError is not defined and the
     # error manifests it as OSError.
@@ -69,6 +77,8 @@ def get_filesystem(filename):
     if index >= 0:
         prefix = filename[:index]
     fs = _REGISTERED_FILESYSTEMS.get(prefix, None)
+    if fs is None:
+        fs = _get_fsspec_filesystem(filename)
     if fs is None:
         raise ValueError("No recognized filesystem for prefix %s" % prefix)
     return fs
@@ -401,6 +411,242 @@ def stat(self, filename):
                 raise


+class FSSpecFileSystem(object):
+    """Provides filesystem access via fsspec.
+
+    The current gfile interface doesn't map perfectly to the fsspec interface
+    leading to some notable inefficiencies.
+
+    * Reads and writes to files cause the file to be reopened each time which
+      can cause a performance hit when accessing local file systems.
+    * walk doesn't use the native fsspec walk function so performance may be
+      slower.
+
+    See https://github.com/tensorflow/tensorboard/issues/5286 for more info on
+    limitations.
+    """
+
+    SEPARATOR = "://"
+    CHAIN_SEPARATOR = "::"
+
+    def _validate_path(self, path):
+        parts = path.split(self.CHAIN_SEPARATOR)
+        for part in parts[:-1]:
+            if self.SEPARATOR in part:
+                raise errors.InvalidArgumentError(
+                    None,
+                    None,
+                    "fsspec URL must only have paths in the last chained filesystem, got {}".format(
+                        path
+                    ),
+                )
+
+    def _translate_errors(func):
+        def func_wrapper(self, *args, **kwargs):
+            try:
+                return func(self, *args, **kwargs)
+            except FileNotFoundError as e:
+                raise errors.NotFoundError(None, None, str(e))
+
+        return func_wrapper
+
+    def _fs_path(self, filename):
+        if isinstance(filename, bytes):
+            filename = filename.decode("utf-8")
+        self._validate_path(filename)
+
+        fs, path = fsspec.core.url_to_fs(filename)
+        return fs, path
+
+    @_translate_errors
+    def exists(self, filename):
+        """Determines whether a path exists or not."""
+        fs, path = self._fs_path(filename)
+        return fs.exists(path)
+
+    def _join(self, sep, paths):
+        """
+        _join joins the paths with the given separator.
+        """
+        result = []
+        for part in paths:
+            if part.startswith(sep):
+                result = []
+            if result and result[-1] and not result[-1].endswith(sep):
+                result.append(sep)
+            result.append(part)
+        return "".join(result)
+
+    @_translate_errors
+    def join(self, path, *paths):
+        """Join paths with a slash."""
+        self._validate_path(path)
+
+        before, sep, last_path = path.rpartition(self.CHAIN_SEPARATOR)
+        chain_prefix = before + sep
+        protocol, path = fsspec.core.split_protocol(last_path)
+        fs = fsspec.get_filesystem_class(protocol)
+        if protocol:
+            chain_prefix += protocol + self.SEPARATOR
+        return chain_prefix + self._join(fs.sep, ((path,) + paths))
+
+    @_translate_errors
+    def read(self, filename, binary_mode=False, size=None, continue_from=None):
+        """Reads contents of a file to a string.
+
+        Args:
+            filename: string, a path
+            binary_mode: bool, read as binary if True, otherwise text
+            size: int, number of bytes or characters to read, otherwise
+                read all the contents of the file (from the continuation
+                marker, if present).
+            continue_from: An opaque value returned from a prior invocation of
+                `read(...)` marking the last read position, so that reading
+                may continue from there. Otherwise read from the beginning.
+
+        Returns:
+            A tuple of `(data, continuation_token)` where `data' provides either
+            bytes read from the file (if `binary_mode == true`) or the decoded
+            string representation thereof (otherwise), and `continuation_token`
+            is an opaque value that can be passed to the next invocation of
+            `read(...) ' in order to continue from the last read position.
+        """
+        fs, path = self._fs_path(filename)
+
+        mode = "rb" if binary_mode else "r"
+        encoding = None if binary_mode else "utf8"
+        if not exists(filename):
+            raise errors.NotFoundError(
+                None, None, "Not Found: " + compat.as_text(filename)
+            )
+        with fs.open(path, mode, encoding=encoding) as f:
+            if continue_from is not None:
+                if not f.seekable():
+                    raise errors.InvalidArgumentError(
+                        None,
+                        None,
+                        "{} is not seekable".format(filename),
+                    )
+                offset = continue_from.get("opaque_offset", None)
+                if offset is not None:
+                    f.seek(offset)
+
+            data = f.read(size)
+            # The new offset may not be `offset + len(data)`, due to decoding
+            # and newline translation.
+            # So, just measure it in whatever terms the underlying stream uses.
+            continuation_token = (
+                {"opaque_offset": f.tell()} if f.seekable() else {}
+            )
+            return (data, continuation_token)
+
+    @_translate_errors
+    def write(self, filename, file_content, binary_mode=False):
+        """Writes string file contents to a file.
+
+        Args:
+            filename: string, a path
+            file_content: string, the contents
+            binary_mode: bool, write as binary if True, otherwise text
+        """
+        self._write(filename, file_content, "wb" if binary_mode else "w")
+
+    @_translate_errors
+    def append(self, filename, file_content, binary_mode=False):
+        """Append string file contents to a file.
+
+        Args:
+            filename: string, a path
+            file_content: string, the contents to append
+            binary_mode: bool, write as binary if True, otherwise text
+        """
+        self._write(filename, file_content, "ab" if binary_mode else "a")
+
+    def _write(self, filename, file_content, mode):
+        fs, path = self._fs_path(filename)
+        encoding = None if "b" in mode else "utf8"
+        with fs.open(path, mode, encoding=encoding) as f:
+            compatify = compat.as_bytes if "b" in mode else compat.as_text
+            f.write(compatify(file_content))
+
+    def _get_chain_protocol_prefix(self, filename):
+        chain_prefix, chain_sep, last_path = filename.rpartition(
+            self.CHAIN_SEPARATOR
+        )
+        protocol, sep, _ = last_path.rpartition(self.SEPARATOR)
+        return chain_prefix + chain_sep + protocol + sep
+
+    @_translate_errors
+    def glob(self, filename):
+        """Returns a list of files that match the given pattern(s)."""
+        if isinstance(filename, bytes):
+            filename = filename.decode("utf-8")
+
+        fs, path = self._fs_path(filename)
+        files = fs.glob(path)
+
+        # check if applying the original chaining is required.
+        if (
+            self.SEPARATOR not in filename
+            and self.CHAIN_SEPARATOR not in filename
+        ):
+            return files
+
+        prefix = self._get_chain_protocol_prefix(filename)
+
+        return [
+            file
+            if (self.SEPARATOR in file or self.CHAIN_SEPARATOR in file)
+            else prefix + file
+            for file in files
+        ]
+
+    @_translate_errors
+    def isdir(self, dirname):
+        """Returns whether the path is a directory or not."""
+        fs, path = self._fs_path(dirname)
+        return fs.isdir(path)
+
+    @_translate_errors
+    def listdir(self, dirname):
+        """Returns a list of entries contained within a directory."""
+        fs, path = self._fs_path(dirname)
+        files = fs.listdir(path, detail=False)
+        files = [os.path.basename(fname) for fname in files]
+        return files
+
+    @_translate_errors
+    def makedirs(self, dirname):
+        """Creates a directory and all parent/intermediate directories."""
+        fs, path = self._fs_path(dirname)
+        return fs.makedirs(path, exist_ok=True)
+
+    @_translate_errors
+    def stat(self, filename):
+        """Returns file statistics for a given path."""
+        fs, path = self._fs_path(filename)
+        return StatData(fs.size(path))
+
+
+_FSSPEC_FILESYSTEM = FSSpecFileSystem()
+
+
+def _get_fsspec_filesystem(filename):
+    """
+    _get_fsspec_filesystem checks if the provided protocol is known to fsspec
+    and if so returns the filesystem wrapper for it.
+    """
+    if not FSSPEC_ENABLED:
+        return None
+
+    segment = filename.partition(FSSpecFileSystem.CHAIN_SEPARATOR)[0]
+    protocol = segment.partition(FSSpecFileSystem.SEPARATOR)[0]
+    if fsspec.get_filesystem_class(protocol):
+        return _FSSPEC_FILESYSTEM
+    else:
+        return None
+
+
 register_filesystem("", LocalFileSystem())
 if S3_ENABLED:
     register_filesystem("s3", S3FileSystem())
@@ -514,6 +760,7 @@ def write(self, file_content):
             # write the first chunk to truncate file if it already exists
             self.fs.write(self.filename, file_content, self.binary_mode)
             self.write_started = True
+
         else:
             # append the later chunks
             self.fs.append(self.filename, file_content, self.binary_mode)
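
Not part of the commit: a minimal end-to-end sketch of the new code path, assuming fsspec is installed and using its built-in in-memory "memory://" filesystem; the path and printed values below are illustrative only.

from tensorboard.compat.tensorflow_stub.io import gfile

path = "memory://logs/run1/events.txt"

# "memory" is not in _REGISTERED_FILESYSTEMS, so get_filesystem() falls back
# to _get_fsspec_filesystem() and returns the shared FSSpecFileSystem wrapper.
fs = gfile.get_filesystem(path)

fs.makedirs("memory://logs/run1")
fs.write(path, "hello fsspec")
print(gfile.exists(path))  # expected: True

# read() returns (data, continuation_token); passing the token back in resumes
# from the previous position, as described in the read() docstring above.
data, token = fs.read(path, size=5)
rest, _ = fs.read(path, continue_from=token)
print(data, rest)  # expected: "hello" and " fsspec"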
