Skip to content

Commit cb29537

Browse files
ARROW-11787: [R] Implement write csv
Closes apache#10141 from thisisnic/arrow-11787-write_csv Lead-authored-by: Nic Crane <thisisnic@gmail.com> Co-authored-by: Neal Richardson <neal.p.richardson@gmail.com> Signed-off-by: Neal Richardson <neal.p.richardson@gmail.com>
1 parent fc10964 commit cb29537

11 files changed

Lines changed: 293 additions & 2 deletions

File tree

cpp/src/arrow/csv/type_fwd.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ class TableReader;
2222
struct ConvertOptions;
2323
struct ReadOptions;
2424
struct ParseOptions;
25+
struct WriteOptions;
2526

2627
} // namespace csv
2728
} // namespace arrow

r/NAMESPACE

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,7 @@ export(CsvFragmentScanOptions)
122122
export(CsvParseOptions)
123123
export(CsvReadOptions)
124124
export(CsvTableReader)
125+
export(CsvWriteOptions)
125126
export(Dataset)
126127
export(DatasetFactory)
127128
export(DateUnit)
@@ -278,6 +279,7 @@ export(unify_schemas)
278279
export(utf8)
279280
export(value_counts)
280281
export(write_arrow)
282+
export(write_csv_arrow)
281283
export(write_dataset)
282284
export(write_feather)
283285
export(write_ipc_stream)

r/R/arrowExports.R

Lines changed: 12 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

r/R/csv.R

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -381,6 +381,11 @@ CsvTableReader$create <- function(file,
381381
#' `TimestampParser$create()` takes an optional `format` string argument.
382382
#' See [`strptime()`][base::strptime()] for example syntax.
383383
#' The default is to use an ISO-8601 format parser.
384+
#'
385+
#' The `CsvWriteOptions$create()` factory method takes the following arguments:
386+
#' - `include_header` Whether to write an initial header line with column names
387+
#' - `batch_size` Maximum number of rows processed at a time. Default is 1024.
388+
#'
384389
#' @section Active bindings:
385390
#'
386391
#' - `column_names`: from `CsvReadOptions`
@@ -408,6 +413,19 @@ CsvReadOptions$create <- function(use_threads = option_use_threads(),
408413
)
409414
}
410415

416+
#' @rdname CsvReadOptions
#' @export
CsvWriteOptions <- R6Class("CsvWriteOptions", inherit = ArrowObject)

# Factory for `CsvWriteOptions`.
#
# `include_header`: a single, non-missing logical; whether to write an initial
#   header line with column names.
# `batch_size`: a single positive whole number that fits in an R integer;
#   maximum number of rows processed at a time. It is coerced with
#   `as.integer()` before being passed on.
#
# Returns whatever `csv___WriteOptions__initialize()` returns for the
# validated option list.
CsvWriteOptions$create <- function(include_header = TRUE, batch_size = 1024L) {
  # Validate in R so that a malformed value is reported as an assertion
  # failure here instead of surfacing during conversion in the C++ layer.
  assert_that(
    is.logical(include_header),
    length(include_header) == 1L,
    !is.na(include_header)
  )
  assert_that(
    is_integerish(batch_size, n = 1, finite = TRUE),
    batch_size > 0,
    # Anything larger would become NA in the as.integer() call below.
    batch_size <= .Machine$integer.max
  )
  csv___WriteOptions__initialize(
    list(
      include_header = include_header,
      batch_size = as.integer(batch_size)
    )
  )
}
428+
411429
readr_to_csv_read_options <- function(skip, col_names, col_types) {
412430
if (isTRUE(col_names)) {
413431
# C++ default to parse is 0-length string array
@@ -585,3 +603,49 @@ readr_to_csv_convert_options <- function(na,
585603
include_columns = include_columns
586604
)
587605
}
606+
607+
#' Write CSV file to disk
#'
#' @param x `data.frame`, [RecordBatch], or [Table]
#' @param sink A string file path, URI, or [OutputStream], or path in a file
#' system (`SubTreeFileSystem`)
#' @param include_header Whether to write an initial header line with column names
#' @param batch_size Maximum number of rows processed at a time. Default is 1024.
#'
#' @return The input `x`, invisibly. Note that if `sink` is an [OutputStream],
#' the stream will be left open.
#' @export
#' @examples
#' \donttest{
#' tf <- tempfile()
#' on.exit(unlink(tf))
#' write_csv_arrow(mtcars, tf)
#' }
#' @include arrow-package.R
write_csv_arrow <- function(x,
                            sink,
                            include_header = TRUE,
                            batch_size = 1024L) {
  # Build (and thereby validate) the options before touching `x` or `sink`.
  write_options <- CsvWriteOptions$create(
    include_header = include_header,
    batch_size = batch_size
  )

  # Keep the caller's original object: `x` may be converted to a Table below,
  # but the unconverted input is what gets returned.
  x_out <- x
  if (is.data.frame(x)) {
    x <- Table$create(x)
  }

  assert_is(x, "ArrowTabular")

  # Only a stream opened here is closed here; an OutputStream supplied by the
  # caller is left open. `add = TRUE` keeps this handler from replacing any
  # other exit handler registered in this function.
  if (!inherits(sink, "OutputStream")) {
    sink <- make_output_stream(sink)
    on.exit(sink$close(), add = TRUE)
  }

  # Dispatch to the binding that matches the concrete tabular class.
  if (inherits(x, "RecordBatch")) {
    csv___WriteCSV__RecordBatch(x, write_options, sink)
  } else if (inherits(x, "Table")) {
    csv___WriteCSV__Table(x, write_options, sink)
  }

  invisible(x_out)
}

r/_pkgdown.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,7 @@ reference:
9898
- write_ipc_stream
9999
- write_to_raw
100100
- write_parquet
101+
- write_csv_arrow
101102
- title: C++ reader/writer interface
102103
contents:
103104
- ParquetFileReader
@@ -109,6 +110,7 @@ reference:
109110
- RecordBatchReader
110111
- RecordBatchWriter
111112
- CsvReadOptions
113+
- CsvWriteOptions
112114
- title: Arrow data containers
113115
contents:
114116
- array

r/man/CsvWriteOptions.Rd

Lines changed: 22 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

r/man/write_csv_arrow.Rd

Lines changed: 32 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

r/src/arrowExports.cpp

Lines changed: 54 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

r/src/arrow_types.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -179,6 +179,7 @@ R6_CLASS_NAME(arrow::csv::ReadOptions, "CsvReadOptions");
179179
R6_CLASS_NAME(arrow::csv::ParseOptions, "CsvParseOptions");
180180
R6_CLASS_NAME(arrow::csv::ConvertOptions, "CsvConvertOptions");
181181
R6_CLASS_NAME(arrow::csv::TableReader, "CsvTableReader");
182+
R6_CLASS_NAME(arrow::csv::WriteOptions, "CsvWriteOptions");
182183

183184
#if defined(ARROW_R_WITH_PARQUET)
184185
R6_CLASS_NAME(parquet::ArrowReaderProperties, "ParquetArrowReaderProperties");

r/src/csv.cpp

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,21 @@
2020
#if defined(ARROW_R_WITH_ARROW)
2121

2222
#include <arrow/csv/reader.h>
23+
#include <arrow/csv/writer.h>
24+
#include <arrow/memory_pool.h>
25+
2326
#include <arrow/util/value_parsing.h>
2427

28+
// [[arrow::export]]
29+
std::shared_ptr<arrow::csv::WriteOptions> csv___WriteOptions__initialize(
30+
cpp11::list options) {
31+
auto res =
32+
std::make_shared<arrow::csv::WriteOptions>(arrow::csv::WriteOptions::Defaults());
33+
res->include_header = cpp11::as_cpp<bool>(options["include_header"]);
34+
res->batch_size = cpp11::as_cpp<int>(options["batch_size"]);
35+
return res;
36+
}
37+
2538
// [[arrow::export]]
2639
std::shared_ptr<arrow::csv::ReadOptions> csv___ReadOptions__initialize(
2740
cpp11::list options) {
@@ -174,4 +187,21 @@ std::shared_ptr<arrow::TimestampParser> TimestampParser__MakeISO8601() {
174187
return arrow::TimestampParser::MakeISO8601();
175188
}
176189

190+
// [[arrow::export]]
191+
void csv___WriteCSV__Table(const std::shared_ptr<arrow::Table>& table,
192+
const std::shared_ptr<arrow::csv::WriteOptions>& write_options,
193+
const std::shared_ptr<arrow::io::OutputStream>& stream) {
194+
StopIfNotOk(
195+
arrow::csv::WriteCSV(*table, *write_options, gc_memory_pool(), stream.get()));
196+
}
197+
198+
// [[arrow::export]]
199+
void csv___WriteCSV__RecordBatch(
200+
const std::shared_ptr<arrow::RecordBatch>& record_batch,
201+
const std::shared_ptr<arrow::csv::WriteOptions>& write_options,
202+
const std::shared_ptr<arrow::io::OutputStream>& stream) {
203+
StopIfNotOk(arrow::csv::WriteCSV(*record_batch, *write_options, gc_memory_pool(),
204+
stream.get()));
205+
}
206+
177207
#endif

0 commit comments

Comments
 (0)