Skip to content

Commit bb69967

Browse files
nealrichardsonfsaintjacques
authored andcommitted
ARROW-6439: [R] Implement S3 file-system interface in R
This patch does the foundational work to conditionally build the S3 bindings if the C++ library was built with ARROW_S3=ON. It also adds bindings for `FileSystemFromUri` and enough wiring up in the datasets code so that `open_dataset("s3://ursa-labs-taxi-data", partitioning = c("year", "month"))` works. There's lots of other S3FileSystem methods that probably should be implemented and aren't here. Also, calling `S3FileSystem$create()` segfaults. But `FileSystemFromUri` works fine. Closes apache#6901 from nealrichardson/r-s3 Lead-authored-by: Neal Richardson <neal.p.richardson@gmail.com> Co-authored-by: François Saint-Jacques <fsaintjacques@gmail.com> Signed-off-by: Neal Richardson <neal.p.richardson@gmail.com>
1 parent a74d32c commit bb69967

19 files changed

Lines changed: 262 additions & 69 deletions

.github/workflows/r.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,9 @@ jobs:
156156
with:
157157
fetch-depth: 0
158158
- name: Make R tests verbose
159+
# If you get a segfault/mysterious test Execution halted,
160+
# make this `true` to see where it dies.
161+
if: false
159162
shell: cmd
160163
run: |
161164
cd r/tests

ci/scripts/cpp_test.sh

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,10 @@ export ARROW_TEST_DATA=${arrow_dir}/testing/data
2828
export PARQUET_TEST_DATA=${source_dir}/submodules/parquet-testing/data
2929
export LD_LIBRARY_PATH=${ARROW_HOME}/${CMAKE_INSTALL_LIBDIR:-lib}:${LD_LIBRARY_PATH}
3030

31+
# By default, aws-sdk tries to contact a non-existing local ip host
32+
# to retrieve metadata. Disable this so that S3FileSystem tests run faster.
33+
export AWS_EC2_METADATA_DISABLED=TRUE
34+
3135
case "$(uname)" in
3236
Linux)
3337
n_jobs=$(nproc)

ci/scripts/r_test.sh

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,9 @@ export _R_CHECK_TESTS_NLINES_=0
4242
export _R_CHECK_CRAN_INCOMING_REMOTE_=FALSE
4343
export _R_CHECK_LIMIT_CORES_=FALSE
4444
export VERSION=$(grep ^Version DESCRIPTION | sed s/Version:\ //)
45+
# By default, aws-sdk tries to contact a non-existing local ip host
46+
# to retrieve metadata. Disable this so that S3FileSystem tests run faster.
47+
export AWS_EC2_METADATA_DISABLED=TRUE
4548

4649
# Make sure we aren't writing to the home dir (CRAN _hates_ this but there is no official check)
4750
BEFORE=$(ls -alh ~/)

r/NAMESPACE

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,7 @@ export(RecordBatchFileReader)
119119
export(RecordBatchFileWriter)
120120
export(RecordBatchStreamReader)
121121
export(RecordBatchStreamWriter)
122+
export(S3FileSystem)
122123
export(ScalarExpression)
123124
export(Scanner)
124125
export(ScannerBuilder)

r/R/arrowExports.R

Lines changed: 16 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

r/R/dataset.R

Lines changed: 11 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -268,11 +268,9 @@ DatasetFactory <- R6Class("DatasetFactory", inherit = ArrowObject,
268268
)
269269
)
270270
DatasetFactory$create <- function(x,
271-
filesystem = c("auto", "local"),
271+
filesystem = NULL,
272272
format = c("parquet", "arrow", "ipc", "feather"),
273273
partitioning = NULL,
274-
allow_not_found = FALSE,
275-
recursive = TRUE,
276274
...) {
277275
if (is_list_of(x, "DatasetFactory")) {
278276
return(shared_ptr(DatasetFactory, dataset___UnionDatasetFactory__Make(x)))
@@ -282,21 +280,15 @@ DatasetFactory$create <- function(x,
282280
}
283281

284282
if (!inherits(filesystem, "FileSystem")) {
285-
filesystem <- match.arg(filesystem)
286-
if (filesystem == "auto") {
287-
# When there are other FileSystems supported, detect e.g. S3 from x
288-
filesystem <- "local"
283+
if (grepl("://", x)) {
284+
fs_from_uri <- FileSystem$from_uri(x)
285+
filesystem <- fs_from_uri$fs
286+
x <- fs_from_uri$path
287+
} else {
288+
filesystem <- LocalFileSystem$create()
289289
}
290-
filesystem <- list(
291-
local = LocalFileSystem
292-
# We'll register other file systems here
293-
)[[filesystem]]$create(...)
294290
}
295-
selector <- FileSelector$create(
296-
x,
297-
allow_not_found = allow_not_found,
298-
recursive = recursive
299-
)
291+
selector <- FileSelector$create(x, allow_not_found = FALSE, recursive = TRUE)
300292

301293
if (is.character(format)) {
302294
format <- FileFormat$create(match.arg(format))
@@ -331,8 +323,8 @@ DatasetFactory$create <- function(x,
331323
#' a list of `DatasetFactory` objects whose datasets should be
332324
#' grouped. If this argument is specified it will be used to construct a
333325
#' `UnionDatasetFactory` and other arguments will be ignored.
334-
#' @param filesystem A string identifier for the filesystem corresponding to
335-
#' `x`. Currently only "local" is supported.
326+
#' @param filesystem A [FileSystem] object; if omitted, the `FileSystem` will
327+
#' be detected from `x`
336328
#' @param format A string identifier of the format of the files in `x`.
337329
#' Currently "parquet" and "ipc"/"arrow"/"feather" (aliases for each other)
338330
#' are supported. For Feather, only version 2 files are supported.
@@ -348,11 +340,7 @@ DatasetFactory$create <- function(x,
348340
#' by [hive_partition()] which parses explicit or autodetected fields from
349341
#' Hive-style path segments
350342
#' * `NULL` for no partitioning
351-
#' @param allow_not_found logical: is `x` allowed to not exist? Default
352-
#' `FALSE`. See [FileSelector].
353-
#' @param recursive logical: should files be discovered in subdirectories of
354-
#' `x`? Default `TRUE`.
355-
#' @param ... Additional arguments passed to the [FileSystem] `$create()` method
343+
#' @param ... Additional arguments, currently ignored
356344
#' @return A `DatasetFactory` object. Pass this to [open_dataset()],
357345
#' in a list potentially with other `DatasetFactory` objects, to create
358346
#' a `Dataset`.

r/R/filesystem.R

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -166,6 +166,18 @@ FileSelector$create <- function(base_dir, allow_not_found = FALSE, recursive = F
166166
#' @export
167167
FileSystem <- R6Class("FileSystem", inherit = ArrowObject,
168168
public = list(
169+
..dispatch = function() {
170+
type_name <- self$type_name
171+
if (type_name == "local") {
172+
shared_ptr(LocalFileSystem, self$pointer())
173+
} else if (type_name == "s3") {
174+
shared_ptr(S3FileSystem, self$pointer())
175+
} else if (type_name == "subtree") {
176+
shared_ptr(SubTreeFileSystem, self$pointer())
177+
} else {
178+
self
179+
}
180+
},
169181
GetFileInfo = function(x) {
170182
if (inherits(x, "FileSelector")) {
171183
map(
@@ -224,8 +236,16 @@ FileSystem <- R6Class("FileSystem", inherit = ArrowObject,
224236
OpenAppendStream = function(path) {
225237
shared_ptr(OutputStream, fs___FileSystem__OpenAppendStream(self, clean_path_rel(path)))
226238
}
239+
),
240+
active = list(
241+
type_name = function() fs___FileSystem__type_name(self)
227242
)
228243
)
244+
FileSystem$from_uri <- function(uri) {
245+
out <- fs___FileSystemFromUri(uri)
246+
out$fs <- shared_ptr(FileSystem, out$fs)$..dispatch()
247+
out
248+
}
229249

230250
#' @usage NULL
231251
#' @format NULL
@@ -236,6 +256,19 @@ LocalFileSystem$create <- function() {
236256
shared_ptr(LocalFileSystem, fs___LocalFileSystem__create())
237257
}
238258

259+
#' @usage NULL
260+
#' @format NULL
261+
#' @rdname FileSystem
262+
#' @export
263+
S3FileSystem <- R6Class("S3FileSystem", inherit = FileSystem)
264+
S3FileSystem$create <- function() {
265+
fs___EnsureS3Initialized()
266+
shared_ptr(S3FileSystem, fs___S3FileSystem__create())
267+
}
268+
269+
arrow_with_s3 <- function() {
270+
.Call(`_s3_available`)
271+
}
239272

240273
#' @usage NULL
241274
#' @format NULL

r/configure

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -159,6 +159,12 @@ echo "#include $PKG_TEST_HEADER" | ${TEST_CMD} >/dev/null 2>&1
159159
if [ $? -eq 0 ] || [ "$UNAME" = "Darwin" ]; then
160160
# Always build with arrow on macOS
161161
PKG_CFLAGS="$PKG_CFLAGS -DARROW_R_WITH_ARROW"
162+
# Check for features
163+
LIB_DIR=`echo $PKG_LIBS | sed -e 's/ -l.*//' | sed -e 's/^-L//'`
164+
grep 'set(ARROW_S3 "ON")' $LIB_DIR/cmake/arrow/ArrowOptions.cmake >/dev/null 2>&1
165+
if [ $? -eq 0 ]; then
166+
PKG_CFLAGS="$PKG_CFLAGS -DARROW_R_WITH_S3"
167+
fi
162168
echo "PKG_CFLAGS=$PKG_CFLAGS"
163169
echo "PKG_LIBS=$PKG_LIBS"
164170
else

r/data-raw/codegen.R

Lines changed: 34 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -48,16 +48,19 @@ if (packageVersion("decor") < '0.0.0.9001') {
4848
stop("more recent version of `decor` needed, please install with `remotes::install_github('romainfrancois/decor')`")
4949
}
5050

51-
decorations <- cpp_decorations() %>%
52-
filter(decoration == "arrow::export") %>%
53-
# the three lines below can be expressed with rap()
54-
# more concisely
55-
# rap( ~ decor:::parse_cpp_function(context))
56-
mutate(functions = map(context, decor:::parse_cpp_function)) %>%
57-
{ vec_cbind(., vec_rbind(!!!pull(., functions))) } %>%
58-
select(-functions)
59-
60-
message(glue("*** > {n} functions decorated with [[arrow::export]]", n = nrow(decorations)))
51+
get_exported_functions <- function(decorations, export_tag) {
52+
out <- decorations %>%
53+
filter(decoration %in% paste0(export_tag, "::export")) %>%
54+
# the three lines below can be expressed with rap()
55+
# more concisely
56+
# rap( ~ decor:::parse_cpp_function(context))
57+
mutate(functions = map(context, decor:::parse_cpp_function)) %>%
58+
{ vec_cbind(., vec_rbind(!!!pull(., functions))) } %>%
59+
select(-functions) %>%
60+
mutate(decoration = sub("::export", "", decoration))
61+
message(glue("*** > {n} functions decorated with [[{tags}::export]]", n = nrow(out), tags = paste0(export_tag, collapse = "|")))
62+
out
63+
}
6164

6265
glue_collapse_data <- function(data, ..., sep = ", ", last = "") {
6366
res <- glue_collapse(glue_data(data, ...), sep = sep, last = last)
@@ -73,12 +76,16 @@ wrap_call <- function(name, return_type, args) {
7376
glue::glue("\treturn Rcpp::wrap({call});")
7477
}
7578
}
76-
cpp_functions_definitions <- decorations %>%
77-
select(name, return_type, args, file, line) %>%
78-
pmap_chr(function(name, return_type, args, file, line){
79+
80+
all_decorations <- cpp_decorations()
81+
arrow_exports <- get_exported_functions(all_decorations, c("arrow", "s3"))
82+
83+
cpp_functions_definitions <- arrow_exports %>%
84+
select(name, return_type, args, file, line, decoration) %>%
85+
pmap_chr(function(name, return_type, args, file, line, decoration){
7986
glue::glue('
8087
// {basename(file)}
81-
#if defined(ARROW_R_WITH_ARROW)
88+
#if defined(ARROW_R_WITH_{toupper(decoration)})
8289
{return_type} {name}({real_params});
8390
RcppExport SEXP _arrow_{name}({sexp_params}){{
8491
BEGIN_RCPP
@@ -101,7 +108,7 @@ cpp_functions_definitions <- decorations %>%
101108
}) %>%
102109
glue_collapse(sep = "\n")
103110

104-
cpp_functions_registration <- decorations %>%
111+
cpp_functions_registration <- arrow_exports %>%
105112
select(name, return_type, args) %>%
106113
pmap_chr(function(name, return_type, args){
107114
glue('\t\t{{ "_arrow_{name}", (DL_FUNC) &_arrow_{name}, {nrow(args)}}}, ')
@@ -127,8 +134,19 @@ return Rf_ScalarLogical(
127134
);
128135
}}
129136
137+
extern "C" SEXP _s3_available() {{
138+
return Rf_ScalarLogical(
139+
#if defined(ARROW_R_WITH_S3)
140+
TRUE
141+
#else
142+
FALSE
143+
#endif
144+
);
145+
}}
146+
130147
static const R_CallMethodDef CallEntries[] = {{
131148
\t\t{{ "_arrow_available", (DL_FUNC)& _arrow_available, 0 }},
149+
\t\t{{ "_s3_available", (DL_FUNC)& _s3_available, 0 }},
132150
{cpp_functions_registration}
133151
\t\t{{NULL, NULL, 0}}
134152
}};
@@ -142,7 +160,7 @@ RcppExport void R_init_arrow(DllInfo* dll){{
142160

143161
message("*** > generated file `src/arrowExports.cpp`")
144162

145-
r_functions <- decorations %>%
163+
r_functions <- arrow_exports %>%
146164
select(name, return_type, args) %>%
147165
pmap_chr(function(name, return_type, args) {
148166
params <- if (nrow(args)) {

r/man/FileSystem.Rd

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)