Skip to content

Commit c273ea7

Browse files
ARROW-13995: [R] Bindings for join node
This is based on apache#11150. Among the issues observed:

* Dictionary columns aren't allowed even in the left data, though you can first `Project` to remove them
* Duplicate column names aren't allowed at all, even though there is a provision for deduping with a prefix

Closes apache#11155 from nealrichardson/join-dplyr

Lead-authored-by: Neal Richardson <neal.p.richardson@gmail.com>
Co-authored-by: Jonathan Keane <jkeane@gmail.com>
Signed-off-by: Jonathan Keane <jkeane@gmail.com>
1 parent 5d41f1d commit c273ea7

13 files changed

Lines changed: 445 additions & 8 deletions

File tree

cpp/src/arrow/compute/exec/options.h

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -155,9 +155,9 @@ class ARROW_EXPORT HashJoinNodeOptions : public ExecNodeOptions {
155155
output_all(true),
156156
output_prefix_for_left(std::move(output_prefix_for_left)),
157157
output_prefix_for_right(std::move(output_prefix_for_right)) {
158-
key_cmp.resize(left_keys.size());
159-
for (size_t i = 0; i < left_keys.size(); ++i) {
160-
key_cmp[i] = JoinKeyCmp::EQ;
158+
this->key_cmp.resize(this->left_keys.size());
159+
for (size_t i = 0; i < this->left_keys.size(); ++i) {
160+
this->key_cmp[i] = JoinKeyCmp::EQ;
161161
}
162162
}
163163
HashJoinNodeOptions(
@@ -174,9 +174,9 @@ class ARROW_EXPORT HashJoinNodeOptions : public ExecNodeOptions {
174174
right_output(std::move(right_output)),
175175
output_prefix_for_left(std::move(output_prefix_for_left)),
176176
output_prefix_for_right(std::move(output_prefix_for_right)) {
177-
key_cmp.resize(left_keys.size());
178-
for (size_t i = 0; i < left_keys.size(); ++i) {
179-
key_cmp[i] = JoinKeyCmp::EQ;
177+
this->key_cmp.resize(this->left_keys.size());
178+
for (size_t i = 0; i < this->left_keys.size(); ++i) {
179+
this->key_cmp[i] = JoinKeyCmp::EQ;
180180
}
181181
}
182182
HashJoinNodeOptions(

r/DESCRIPTION

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,7 @@ Collate:
9292
'expression.R'
9393
'dplyr-functions.R'
9494
'dplyr-group-by.R'
95+
'dplyr-join.R'
9596
'dplyr-mutate.R'
9697
'dplyr-select.R'
9798
'dplyr-summarize.R'

r/NAMESPACE

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,7 @@ export(HivePartitioning)
142142
export(HivePartitioningFactory)
143143
export(InMemoryDataset)
144144
export(IpcFileFormat)
145+
export(JoinType)
145146
export(JsonParseOptions)
146147
export(JsonReadOptions)
147148
export(JsonTableReader)

r/R/arrow-package.R

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,8 @@
3636
"select", "filter", "collect", "summarise", "group_by", "groups",
3737
"group_vars", "group_by_drop_default", "ungroup", "mutate", "transmute",
3838
"arrange", "rename", "pull", "relocate", "compute", "collapse",
39-
"distinct"
39+
"distinct", "left_join", "right_join", "inner_join", "full_join",
40+
"semi_join", "anti_join"
4041
)
4142
)
4243
for (cl in c("Dataset", "ArrowTabular", "arrow_dplyr_query")) {

r/R/arrowExports.R

Lines changed: 8 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

r/R/dplyr-collect.R

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,14 @@ implicit_schema <- function(.data) {
8989

9090
if (is.null(.data$aggregations)) {
9191
new_fields <- map(.data$selected_columns, ~ .$type(old_schm))
92+
if (!is.null(.data$join) && !(.data$join$type %in% JoinType[1:4])) {
93+
# Add cols from right side, except for semi/anti joins
94+
right_cols <- .data$join$right_data$selected_columns
95+
new_fields <- c(new_fields, map(
96+
right_cols[setdiff(names(right_cols), .data$join$by)],
97+
~ .$type(.data$join$right_data$.data$schema)
98+
))
99+
}
92100
} else {
93101
new_fields <- map(summarize_projection(.data), ~ .$type(old_schm))
94102
# * Put group_by_vars first (this can't be done by summarize,

r/R/dplyr-join.R

Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
# Licensed to the Apache Software Foundation (ASF) under one
2+
# or more contributor license agreements. See the NOTICE file
3+
# distributed with this work for additional information
4+
# regarding copyright ownership. The ASF licenses this file
5+
# to you under the Apache License, Version 2.0 (the
6+
# "License"); you may not use this file except in compliance
7+
# with the License. You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing,
12+
# software distributed under the License is distributed on an
13+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
# KIND, either express or implied. See the License for the
15+
# specific language governing permissions and limitations
16+
# under the License.
17+
18+
19+
# The following S3 methods are registered on load if dplyr is present
20+
21+
# Shared implementation behind all of the *_join() methods in this file.
#
# Both inputs are passed through as_adq(), the join keys are resolved with
# handle_join_by(), and the join specification (type, right-hand data, keys)
# is stored on the left-hand query under `$join` before the query is handed
# to collapse.arrow_dplyr_query().
#
# `join_type` must be the name of a JoinType entry (e.g. "LEFT_OUTER").
#
# Not yet supported (accepted but ignored):
# * `copy`: possibly nothing to do here
# * `suffix`: Arrow uses prefixes rather than suffixes
# * `keep`: whether the join keys from both `x` and `y` are kept in the output
# * `na_matches`
do_join <- function(
    x, y,
    by = NULL,
    copy = FALSE,
    suffix = c(".x", ".y"),
    ...,
    keep = FALSE,
    na_matches,
    join_type) {
  left <- as_adq(x)
  right <- as_adq(y)
  keys <- handle_join_by(by, left, right)

  join_spec <- list(
    type = JoinType[[join_type]],
    right_data = right,
    by = keys
  )
  left$join <- join_spec

  collapse.arrow_dplyr_query(left)
}
45+
46+
# left_join() method: forwards every argument to do_join() with the
# "LEFT_OUTER" join type. The same function serves ArrowTabular and Dataset.
left_join.arrow_dplyr_query <- function(
    x, y,
    by = NULL,
    copy = FALSE,
    suffix = c(".x", ".y"),
    ...,
    keep = FALSE) {
  do_join(
    x, y,
    by = by,
    copy = copy,
    suffix = suffix,
    ...,
    keep = keep,
    join_type = "LEFT_OUTER"
  )
}
left_join.ArrowTabular <- left_join.arrow_dplyr_query
left_join.Dataset <- left_join.arrow_dplyr_query
56+
57+
# right_join() method: forwards every argument to do_join() with the
# "RIGHT_OUTER" join type. The same function serves ArrowTabular and Dataset.
right_join.arrow_dplyr_query <- function(
    x, y,
    by = NULL,
    copy = FALSE,
    suffix = c(".x", ".y"),
    ...,
    keep = FALSE) {
  do_join(
    x, y,
    by = by,
    copy = copy,
    suffix = suffix,
    ...,
    keep = keep,
    join_type = "RIGHT_OUTER"
  )
}
right_join.ArrowTabular <- right_join.arrow_dplyr_query
right_join.Dataset <- right_join.arrow_dplyr_query
67+
68+
# inner_join() method: forwards every argument to do_join() with the
# "INNER" join type. The same function serves ArrowTabular and Dataset.
inner_join.arrow_dplyr_query <- function(
    x, y,
    by = NULL,
    copy = FALSE,
    suffix = c(".x", ".y"),
    ...,
    keep = FALSE) {
  do_join(
    x, y,
    by = by,
    copy = copy,
    suffix = suffix,
    ...,
    keep = keep,
    join_type = "INNER"
  )
}
inner_join.ArrowTabular <- inner_join.arrow_dplyr_query
inner_join.Dataset <- inner_join.arrow_dplyr_query
78+
79+
# full_join() method: forwards every argument to do_join() with the
# "FULL_OUTER" join type. The same function serves ArrowTabular and Dataset.
full_join.arrow_dplyr_query <- function(
    x, y,
    by = NULL,
    copy = FALSE,
    suffix = c(".x", ".y"),
    ...,
    keep = FALSE) {
  do_join(
    x, y,
    by = by,
    copy = copy,
    suffix = suffix,
    ...,
    keep = keep,
    join_type = "FULL_OUTER"
  )
}
full_join.ArrowTabular <- full_join.arrow_dplyr_query
full_join.Dataset <- full_join.arrow_dplyr_query
89+
90+
# semi_join() method: forwards every argument to do_join() with the
# "LEFT_SEMI" join type. `suffix` and `keep` are accepted only so the
# signature matches the other join methods; they are passed straight through.
# The same function serves ArrowTabular and Dataset.
semi_join.arrow_dplyr_query <- function(
    x, y,
    by = NULL,
    copy = FALSE,
    suffix = c(".x", ".y"),
    ...,
    keep = FALSE) {
  do_join(
    x, y,
    by = by,
    copy = copy,
    suffix = suffix,
    ...,
    keep = keep,
    join_type = "LEFT_SEMI"
  )
}
semi_join.ArrowTabular <- semi_join.arrow_dplyr_query
semi_join.Dataset <- semi_join.arrow_dplyr_query
100+
101+
# anti_join() method: forwards every argument to do_join() with the
# "LEFT_ANTI" join type. `suffix` and `keep` are accepted only so the
# signature matches the other join methods; they are passed straight through.
# The same function serves ArrowTabular and Dataset.
anti_join.arrow_dplyr_query <- function(
    x, y,
    by = NULL,
    copy = FALSE,
    suffix = c(".x", ".y"),
    ...,
    keep = FALSE) {
  do_join(
    x, y,
    by = by,
    copy = copy,
    suffix = suffix,
    ...,
    keep = keep,
    join_type = "LEFT_ANTI"
  )
}
anti_join.ArrowTabular <- anti_join.arrow_dplyr_query
anti_join.Dataset <- anti_join.arrow_dplyr_query
111+
112+
# Resolve the `by` argument of a join into a named character vector whose
# names are key columns of `x` and whose values are the matching key columns
# of `y`.
#
# * `by = NULL`: join on every column name that `x` and `y` have in common.
# * unnamed character vector: the same column names are used on both sides.
# * named character vector: `c(x_col = "y_col")`. Elements without a name
#   (e.g. `c("a", b = "c")`) use their value on both sides, which is how
#   dplyr treats a partially named `by`.
#
# Errors if `by` is neither NULL nor a character vector, or if it refers to a
# column that is not in `names(x)` (left keys) or `names(y)` (right keys).
handle_join_by <- function(by, x, y) {
  if (is.null(by)) {
    # as.character() keeps the result a character vector even when one side
    # has no names at all
    common <- as.character(intersect(names(x), names(y)))
    names(common) <- common
    return(common)
  }
  stopifnot(is.character(by))

  left_keys <- names(by)
  if (is.null(left_keys)) {
    left_keys <- as.character(by)
  } else {
    # Partially named `by`: a blank name means "same column name in x"
    unnamed <- !nzchar(left_keys)
    left_keys[unnamed] <- by[unnamed]
  }
  names(by) <- left_keys

  missing_left <- setdiff(left_keys, names(x))
  if (length(missing_left) > 0) {
    stop(
      "Join columns must be present in `x`; not found: ",
      paste(missing_left, collapse = ", "),
      call. = FALSE
    )
  }
  missing_right <- setdiff(by, names(y))
  if (length(missing_right) > 0) {
    stop(
      "Join columns must be present in `y`; not found: ",
      paste(missing_right, collapse = ", "),
      call. = FALSE
    )
  }
  by
}

r/R/enums.R

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -163,3 +163,16 @@ RoundMode <- enum("RoundMode",
163163
HALF_TO_EVEN = 8L,
164164
HALF_TO_ODD = 9L
165165
)
166+
167+
#' @export
#' @rdname enums
JoinType <- enum(
  "JoinType",
  # Semi and anti joins come first: implicit_schema() picks them out
  # positionally with JoinType[1:4], so keep these four at the front.
  LEFT_SEMI = 0L, RIGHT_SEMI = 1L,
  LEFT_ANTI = 2L, RIGHT_ANTI = 3L,
  # Joins that also emit columns from the right-hand side
  INNER = 4L,
  LEFT_OUTER = 5L, RIGHT_OUTER = 6L, FULL_OUTER = 7L
)

r/R/query-engine.R

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -143,7 +143,15 @@ ExecPlan <- R6Class("ExecPlan",
143143
)
144144
}
145145
}
146-
} else {
146+
} else if (!is.null(.data$join)) {
147+
node <- node$Join(
148+
type = .data$join$type,
149+
right_node = self$Build(.data$join$right_data),
150+
by = .data$join$by,
151+
left_output = names(.data),
152+
right_output = setdiff(names(.data$join$right_data), .data$join$by)
153+
)
154+
} else if (length(node$schema)) {
147155
# If any columns are derived, reordered, or renamed we need to Project
148156
# If there are aggregations, the projection was already handled above
149157
# We have to project at least once to eliminate some junk columns
@@ -206,6 +214,22 @@ ExecNode <- R6Class("ExecNode",
206214
self$preserve_sort(
207215
ExecNode_Aggregate(self, options, target_names, out_field_names, key_names)
208216
)
217+
},
218+
Join = function(type, right_node, by, left_output, right_output) {
  # `by` is a named character vector: its names are the key columns on
  # this (left) node and its values are the key columns on `right_node`.
  joined <- ExecNode_Join(
    self,
    type,
    right_node,
    left_keys = names(by),
    right_keys = by,
    left_output = left_output,
    right_output = right_output
  )
  self$preserve_sort(joined)
}
231+
),
232+
active = list(
233+
schema = function() ExecNode_output_schema(self)
210234
)
211235
)

r/man/enums.Rd

Lines changed: 5 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)