Skip to content

Commit 9442e1c

Browse files
authored
ARROW-15016: [R] show_exec_plan for an arrow_dplyr_query (apache#13541)
This PR adds `show_exec_plan()`, which will allow users to inspect the ExecPlan, in a similar way to dplyr's `show_query()`. ```r mtcars %>% arrow_table() %>% filter(mpg > 20) %>% mutate(x = gear/carb) %>% group_by(cyl) %>% show_exec_plan() #> ExecPlan with 3 nodes: #> 2:ProjectNode{projection=[mpg, cyl, disp, hp, drat, wt, qsec, vs, am, gear, carb, "x": divide(cast(gear, {to_type=double, allow_int_overflow=false, allow_time_truncate=false, allow_time_overflow=false, allow_decimal_truncate=false, allow_float_truncate=false, allow_invalid_utf8=false}), cast(carb, {to_type=double, allow_int_overflow=false, allow_time_truncate=false, allow_time_overflow=false, allow_decimal_truncate=false, allow_float_truncate=false, allow_invalid_utf8=false}))]} #> 1:FilterNode{filter=(mpg > 20)} #> 0:TableSourceNode{} ``` Some design considerations are discussed in the [design doc](https://docs.google.com/document/d/1Ep8aV4jDsNCCy9uv1bjWY_JF17nzHQogv0EnGJvraQI/edit?usp=sharing). Summary of the approach: * I opted for the `show_exec_plan()` name as I believe it aligns well with the purpose of this PR: to expose the ExecPlan (in its raw state). * We could later extend or beautify the output (e.g. use {cli} and / or `show_query()` / `explain()` methods). Authored-by: Dragoș Moldovan-Grünfeld <dragos.mold@gmail.com> Signed-off-by: Dewey Dunnington <dewey@fishandwhistle.net>
1 parent 9ad2255 commit 9442e1c

11 files changed

Lines changed: 381 additions & 1 deletion

r/NAMESPACE

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -352,6 +352,7 @@ export(s3_bucket)
352352
export(schema)
353353
export(set_cpu_count)
354354
export(set_io_thread_count)
355+
export(show_exec_plan)
355356
export(starts_with)
356357
export(string)
357358
export(struct)

r/R/arrow-package.R

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,8 @@
4141
"group_vars", "group_by_drop_default", "ungroup", "mutate", "transmute",
4242
"arrange", "rename", "pull", "relocate", "compute", "collapse",
4343
"distinct", "left_join", "right_join", "inner_join", "full_join",
44-
"semi_join", "anti_join", "count", "tally", "rename_with", "union", "union_all", "glimpse"
44+
"semi_join", "anti_join", "count", "tally", "rename_with", "union",
45+
"union_all", "glimpse", "show_query", "explain"
4546
)
4647
)
4748
for (cl in c("Dataset", "ArrowTabular", "RecordBatchReader", "arrow_dplyr_query")) {

r/R/arrowExports.R

Lines changed: 4 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

r/R/dplyr.R

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -219,6 +219,53 @@ tail.arrow_dplyr_query <- function(x, n = 6L, ...) {
219219
x
220220
}
221221

222+
#' Show the details of an Arrow Execution Plan
223+
#'
224+
#' This is a function which gives more details about the logical query plan
225+
#' that will be executed when evaluating an `arrow_dplyr_query` object.
226+
#' It calls the C++ `ExecPlan` object's print method.
227+
#' Functionally, it is similar to `dplyr::explain()`. This function is used as
228+
#' the `dplyr::explain()` and `dplyr::show_query()` methods.
229+
#'
230+
#' @param x an `arrow_dplyr_query` to print the `ExecPlan` for.
231+
#'
232+
#' @return `x`, invisibly.
233+
#' @export
234+
#'
235+
#' @examplesIf arrow_with_dataset() && requireNamespace("dplyr", quietly = TRUE)
236+
#' library(dplyr)
237+
#' mtcars %>%
238+
#' arrow_table() %>%
239+
#' filter(mpg > 20) %>%
240+
#' mutate(x = gear/carb) %>%
241+
#' show_exec_plan()
242+
show_exec_plan <- function(x) {
243+
adq <- as_adq(x)
244+
plan <- ExecPlan$create()
245+
# do not show the plan if we have a nested query (as this will force the
246+
# evaluation of the inner query/queries)
247+
# TODO see if we can remove after ARROW-16628
248+
if (is_collapsed(x) && has_head_tail(x$.data)) {
249+
warn("The `ExecPlan` cannot be printed for a nested query.")
250+
return(invisible(x))
251+
}
252+
final_node <- plan$Build(adq)
253+
cat(plan$BuildAndShow(final_node))
254+
invisible(x)
255+
}
256+
257+
show_query.arrow_dplyr_query <- function(x, ...) {
258+
show_exec_plan(x)
259+
}
260+
261+
show_query.Dataset <- show_query.ArrowTabular <- show_query.RecordBatchReader <- show_query.arrow_dplyr_query
262+
263+
explain.arrow_dplyr_query <- function(x, ...) {
264+
show_exec_plan(x)
265+
}
266+
267+
explain.Dataset <- explain.ArrowTabular <- explain.RecordBatchReader <- explain.arrow_dplyr_query
268+
222269
ensure_group_vars <- function(x) {
223270
if (inherits(x, "arrow_dplyr_query")) {
224271
# Before pulling data from Arrow, make sure all group vars are in the projection

r/R/query-engine.R

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -193,6 +193,8 @@ ExecPlan <- R6Class("ExecPlan",
193193
node
194194
},
195195
Run = function(node, as_table = FALSE) {
196+
# a section of this code is used by `BuildAndShow()` too - the 2 need to be in sync
197+
# Start of chunk used in `BuildAndShow()`
196198
assert_is(node, "ExecNode")
197199

198200
# Sorting and head/tail (if sorted) are handled in the SinkNode,
@@ -210,6 +212,8 @@ ExecPlan <- R6Class("ExecPlan",
210212
sorting$orders <- as.integer(sorting$orders)
211213
}
212214

215+
# End of chunk used in `BuildAndShow()`
216+
213217
# If we are going to return a Table anyway, we do this in one step and
214218
# entirely in one C++ call to ensure that we can execute user-defined
215219
# functions from the worker threads spawned by the ExecPlan. If not, we
@@ -273,6 +277,39 @@ ExecPlan <- R6Class("ExecPlan",
273277
...
274278
)
275279
},
280+
# SinkNodes (involved in arrange and/or head/tail operations) are created in
281+
# ExecPlan_run and are not captured by the regular print method. We take a
282+
# similar approach to expose them before calling the print method.
283+
BuildAndShow = function(node) {
284+
# a section of this code is copied from `Run()` - the 2 need to be in sync
285+
# Start of chunk copied from `Run()`
286+
287+
assert_is(node, "ExecNode")
288+
289+
# Sorting and head/tail (if sorted) are handled in the SinkNode,
290+
# created in ExecPlan_run
291+
sorting <- node$extras$sort %||% list()
292+
select_k <- node$extras$head %||% -1L
293+
has_sorting <- length(sorting) > 0
294+
if (has_sorting) {
295+
if (!is.null(node$extras$tail)) {
296+
# Reverse the sort order and take the top K, then after we'll reverse
297+
# the resulting rows so that it is ordered as expected
298+
sorting$orders <- !sorting$orders
299+
select_k <- node$extras$tail
300+
}
301+
sorting$orders <- as.integer(sorting$orders)
302+
}
303+
304+
# End of chunk copied from `Run()`
305+
306+
ExecPlan_BuildAndShow(
307+
self,
308+
node,
309+
sorting,
310+
select_k
311+
)
312+
},
276313
Stop = function() ExecPlan_StopProducing(self)
277314
)
278315
)

r/_pkgdown.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -220,6 +220,7 @@ reference:
220220
- value_counts
221221
- list_compute_functions
222222
- register_scalar_function
223+
- show_exec_plan
223224
- title: Connections to other systems
224225
contents:
225226
- to_arrow

r/man/show_exec_plan.Rd

Lines changed: 31 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

r/src/arrowExports.cpp

Lines changed: 12 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

r/src/compute-exec.cpp

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,10 @@ std::pair<std::shared_ptr<compute::ExecPlan>, std::shared_ptr<arrow::RecordBatch
6060
ExecPlan_prepare(const std::shared_ptr<compute::ExecPlan>& plan,
6161
const std::shared_ptr<compute::ExecNode>& final_node,
6262
cpp11::list sort_options, cpp11::strings metadata, int64_t head = -1) {
63+
// a section of this code is copied and used in ExecPlan_BuildAndShow - the 2 need
64+
// to be in sync
65+
// Start of chunk used in ExecPlan_BuildAndShow
66+
6367
// For now, don't require R to construct SinkNodes.
6468
// Instead, just pass the node we should collect as an argument.
6569
arrow::AsyncGenerator<arrow::util::optional<compute::ExecBatch>> sink_gen;
@@ -88,6 +92,8 @@ ExecPlan_prepare(const std::shared_ptr<compute::ExecPlan>& plan,
8892
compute::SinkNodeOptions{&sink_gen});
8993
}
9094

95+
// End of chunk used in ExecPlan_BuildAndShow
96+
9197
StopIfNotOk(plan->Validate());
9298

9399
// If the generator is destroyed before being completely drained, inform plan
@@ -155,6 +161,46 @@ std::shared_ptr<arrow::Schema> ExecNode_output_schema(
155161
return node->output_schema();
156162
}
157163

164+
// [[arrow::export]]
165+
std::string ExecPlan_BuildAndShow(const std::shared_ptr<compute::ExecPlan>& plan,
166+
const std::shared_ptr<compute::ExecNode>& final_node,
167+
cpp11::list sort_options, int64_t head = -1) {
168+
// a section of this code is copied from ExecPlan_prepare - the 2 need to be in sync
169+
// Start of chunk copied from ExecPlan_prepare
170+
171+
// For now, don't require R to construct SinkNodes.
172+
// Instead, just pass the node we should collect as an argument.
173+
arrow::AsyncGenerator<arrow::util::optional<compute::ExecBatch>> sink_gen;
174+
175+
// Sorting uses a different sink node; there is no general sort yet
176+
if (sort_options.size() > 0) {
177+
if (head >= 0) {
178+
// Use the SelectK node to take only what we need
179+
MakeExecNodeOrStop(
180+
"select_k_sink", plan.get(), {final_node.get()},
181+
compute::SelectKSinkNodeOptions{
182+
arrow::compute::SelectKOptions(
183+
head, std::dynamic_pointer_cast<compute::SortOptions>(
184+
make_compute_options("sort_indices", sort_options))
185+
->sort_keys),
186+
&sink_gen});
187+
} else {
188+
MakeExecNodeOrStop("order_by_sink", plan.get(), {final_node.get()},
189+
compute::OrderBySinkNodeOptions{
190+
*std::dynamic_pointer_cast<compute::SortOptions>(
191+
make_compute_options("sort_indices", sort_options)),
192+
&sink_gen});
193+
}
194+
} else {
195+
MakeExecNodeOrStop("sink", plan.get(), {final_node.get()},
196+
compute::SinkNodeOptions{&sink_gen});
197+
}
198+
199+
// End of chunk copied from ExecPlan_prepare
200+
201+
return plan->ToString();
202+
}
203+
158204
#if defined(ARROW_R_WITH_DATASET)
159205

160206
#include <arrow/dataset/file_base.h>

r/tests/testthat/test-dataset-dplyr.R

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -340,3 +340,80 @@ test_that("dplyr method not implemented messages", {
340340
fixed = TRUE
341341
)
342342
})
343+
344+
test_that("show_exec_plan(), show_query() and explain() with datasets", {
345+
# show_query() and explain() are wrappers around show_exec_plan() and are not
346+
# tested separately
347+
348+
ds <- open_dataset(dataset_dir, partitioning = schema(part = uint8()))
349+
350+
# minimal test
351+
expect_output(
352+
ds %>%
353+
show_exec_plan(),
354+
regexp = paste0(
355+
"ExecPlan with .* nodes:.*", # boiler plate for ExecPlan
356+
"ProjectNode.*", # output columns
357+
"SourceNode" # entry point
358+
)
359+
)
360+
361+
# filter and select
362+
expect_output(
363+
ds %>%
364+
select(string = chr, integer = int, part) %>%
365+
filter(integer > 6L & part == 1) %>%
366+
show_exec_plan(),
367+
regexp = paste0(
368+
"ExecPlan with .* nodes:.*", # boiler plate for ExecPlan
369+
"ProjectNode.*", # output columns
370+
"FilterNode.*", # filter node
371+
"int > 6.*cast.*", # filtering expressions + auto-casting of part
372+
"SourceNode" # entry point
373+
)
374+
)
375+
376+
# group_by and summarise
377+
expect_output(
378+
ds %>%
379+
group_by(part) %>%
380+
summarise(avg = mean(int)) %>%
381+
show_exec_plan(),
382+
regexp = paste0(
383+
"ExecPlan with .* nodes:.*", # boiler plate for ExecPlan
384+
"ProjectNode.*", # output columns
385+
"GroupByNode.*", # group by node
386+
"keys=.*part.*", # key for aggregations
387+
"aggregates=.*hash_mean.*", # aggregations
388+
"ProjectNode.*", # input columns
389+
"SourceNode" # entry point
390+
)
391+
)
392+
393+
# arrange and head
394+
expect_output(
395+
ds %>%
396+
filter(lgl) %>%
397+
arrange(chr) %>%
398+
show_exec_plan(),
399+
regexp = paste0(
400+
"ExecPlan with .* nodes:.*", # boiler plate for ExecPlan
401+
"OrderBySinkNode.*chr.*ASC.*", # arrange goes via the OrderBy sink node
402+
"ProjectNode.*", # output columns
403+
"FilterNode.*", # filter node
404+
"filter=lgl.*", # filtering expression
405+
"SourceNode" # entry point
406+
)
407+
)
408+
409+
# printing the ExecPlan for a nested query would currently force the
410+
# evaluation of the inner one(s), which we want to avoid => no output
411+
expect_warning(
412+
ds %>%
413+
filter(lgl) %>%
414+
arrange(chr) %>%
415+
head() %>%
416+
show_exec_plan(),
417+
"The `ExecPlan` cannot be printed for a nested query."
418+
)
419+
})

0 commit comments

Comments
 (0)