4

Let's say I have a data.frame df:

# fake dataset: 13 observations, built column by column
# (all numeric columns are kept as doubles)
id <- as.numeric(1:13)
loc <- rep(c(1, 2, 3), times = c(2, 3, 8))
date <- c(2021, 2022, 2021, 2021, 2022, 2021, 2021, 2022, rep(2023, 5))
hab <- c("w", "l", rep("w", 3), "l", "l", rep("w", 6))
spec <- rep(c("frog", "beaver", "kingfisher"), times = c(5, 3, 5))

# column names are taken from the vector names above
df <- data.frame(id, loc, date, hab, spec)

I want to send rows of df down different logic branches. In each branch I might reassign values of existing columns or create new columns. Easy enough to do in dplyr with mutate(), if_else() and case_when(). But this becomes cumbersome as I end up repeating the same condition for each column assignment I want to do and dplyr has to check the same condition for each column assignment.

My sense is that the solution looks something like this:

# conditional branch for loc == 1 & spec == "frog"
# The condition is evaluated once by filter(); every column change for this
# branch then happens in a single mutate() on the matching rows only.
df_tmp <- df %>%
  filter(loc == 1 & spec == "frog") %>%
  # `date` (lower case) overwrites the existing column; `x` is a new column
  mutate(date = 2025,
         x = 1)

# remerge with original df
# Joining on "id" alone makes left_join() keep both copies of every other
# shared column, suffixed .x (from df) and .y (from df_tmp).
df <- df %>%
  left_join(df_tmp, by=join_by("id"))

# id loc.x date.x hab.x     spec.x loc.y date.y hab.y spec.y  x
# 1   1     1   2021     w       frog     1   2025     w   frog  1
# 2   2     1   2022     l       frog     1   2025     l   frog  1
# 3   3     2   2021     w       frog    NA     NA  <NA>   <NA> NA
# 4   4     2   2021     w       frog    NA     NA  <NA>   <NA> NA
# 5   5     2   2022     w       frog    NA     NA  <NA>   <NA> NA
# 6   6     3   2021     l     beaver    NA     NA  <NA>   <NA> NA
# 7   7     3   2021     l     beaver    NA     NA  <NA>   <NA> NA
# 8   8     3   2022     w     beaver    NA     NA  <NA>   <NA> NA
# 9   9     3   2023     w kingfisher    NA     NA  <NA>   <NA> NA
# 10 10     3   2023     w kingfisher    NA     NA  <NA>   <NA> NA
# 11 11     3   2023     w kingfisher    NA     NA  <NA>   <NA> NA
# 12 12     3   2023     w kingfisher    NA     NA  <NA>   <NA> NA
# 13 13     3   2023     w kingfisher    NA     NA  <NA>   <NA> NA

However, I want a data.frame without duplicated columns, in which unaffected rows keep their original values and get NA only in the newly created columns. It should look like this:

# id loc date hab       spec  x
# 1   1   1 2025   w       frog  1
# 2   2   1 2025   l       frog  1
# 3   3   2 2021   w       frog NA
# 4   4   2 2021   w       frog NA
# 5   5   2 2022   w       frog NA
# 6   6   3 2021   l     beaver NA
# 7   7   3 2021   l     beaver NA
# 8   8   3 2022   w     beaver NA
# 9   9   3 2023   w kingfisher NA
# 10 10   3 2023   w kingfisher NA
# 11 11   3 2023   w kingfisher NA
# 12 12   3 2023   w kingfisher NA
# 13 13   3 2023   w kingfisher NA
2
  • stackoverflow.com/q/29836047/1412059 Commented Oct 23 at 5:55
  • 1
    @JonSpring dplyr::rows_update() would work if it didn't require both data.frames to have the same columns. Commented Oct 23 at 16:05

2 Answers 2

1

If your conditions are built around variables that are suitable for grouping (e.g. they are discrete / categorical, or can be binned first), then group_map() is quite convenient; the returned list can be combined back into a single frame with bind_rows(). Just keep in mind that only a single group is accessible from the applied function, so aggregates over full columns are not possible during that stage.

library(dplyr)

df |>
  group_by(loc, spec) |>
  group_map(
    function(grp_dat, grp_keys) {
      # A branch can be selected by several group keys at once ...
      is_loc1_frog <- grp_keys$loc == 1 && grp_keys$spec == "frog"
      if (is_loc1_frog) {
        grp_dat <- mutate(grp_dat, date = 2025, x = 1)
      }

      # ... or by a single key
      is_later_loc <- grp_keys$loc > 1
      if (is_later_loc) {
        grp_dat <- mutate(grp_dat, loc = loc + 100)
      }

      is_upper_spec <- grp_keys$spec %in% c("beaver", "kingfisher")
      if (is_upper_spec) {
        grp_dat <- mutate(grp_dat, spec = toupper(spec))
      }

      # the (possibly modified) group is the element returned for this group
      grp_dat
    },
    .keep = TRUE
  ) |>
  bind_rows()

#> # A tibble: 13 × 6
#>       id   loc  date hab   spec           x
#>    <dbl> <dbl> <dbl> <chr> <chr>      <dbl>
#>  1     1     1  2025 w     frog           1
#>  2     2     1  2025 l     frog           1
#>  3     3   102  2021 w     frog          NA
#>  4     4   102  2021 w     frog          NA
#>  5     5   102  2022 w     frog          NA
#>  6     6   103  2021 l     BEAVER        NA
#>  7     7   103  2021 l     BEAVER        NA
#>  8     8   103  2022 w     BEAVER        NA
#>  9     9   103  2023 w     KINGFISHER    NA
#> 10    10   103  2023 w     KINGFISHER    NA
#> 11    11   103  2023 w     KINGFISHER    NA
#> 12    12   103  2023 w     KINGFISHER    NA
#> 13    13   103  2023 w     KINGFISHER    NA

group_map() passes two arguments to the function: the subset of rows defined by the current group, and a 1-row tibble with the current group keys.

To illustrate:

df |>
  group_by(loc, spec) |>
  group_map(
    function(grp_dat, grp_keys) {
      # show the 1-row keys tibble first, then the rows of that group
      print(grp_keys)
      print(grp_dat)
      # horizontal rule separating one group's output from the next
      cli::cat_rule()
    },
    .keep = TRUE
  ) |>
  invisible()

#> # A tibble: 1 × 2
#>     loc spec 
#>   <dbl> <chr>
#> 1     1 frog 

#> # A tibble: 2 × 5
#>      id   loc  date hab   spec 
#>   <dbl> <dbl> <dbl> <chr> <chr>
#> 1     1     1  2021 w     frog 
#> 2     2     1  2022 l     frog 
#> ────────────────────────────────────────────────────────────────────────────

#> # A tibble: 1 × 2
#>     loc spec 
#>   <dbl> <chr>
#> 1     2 frog 

#> # A tibble: 3 × 5
#>      id   loc  date hab   spec 
#>   <dbl> <dbl> <dbl> <chr> <chr>
#> 1     3     2  2021 w     frog 
#> 2     4     2  2021 w     frog 
#> 3     5     2  2022 w     frog 
#> ────────────────────────────────────────────────────────────────────────────
#  ...

Sign up to request clarification or add additional context in comments.

1 Comment

I like this approach. As a variation, one could define the branch first with a case_when and then group_by branch, the main benefit being then the group_by step would not need to anticipate what variables will be needed as inputs for the branching.
0

Solved this with a custom function. There probably exists a more performant solution but this has worked for my needs.


#' Replace rows of `df1` with their updated versions from `df2`
#'
#' Stacks `df1` and `df2`, counts how many stacked rows share each `id`, and
#' keeps (a) rows whose `id` occurs exactly once and (b) the `df2` row of each
#' `id` that occurs exactly twice. Columns present in only one input are kept
#' and filled with `NA` for rows coming from the other input.
#'
#' Rows whose `id` occurs more than twice in the stacked data are dropped
#' after a warning; an `id` that occurs twice within `df1` alone is dropped as
#' well, so both inputs are expected to have unique `id` values.
#'
#' @param df1 Data frame holding the original rows.
#' @param df2 Data frame holding the updated rows (may contain extra columns).
#' @param id Unquoted name of the key column present in both inputs.
#' @return The combined data frame; kept rows from `df1` precede rows from
#'   `df2`. The helper columns `df_id` and `id_count` are removed.
row_updater <- function(df1, df2, id) {
  # Text of the column as typed by the caller, so the warning names the real
  # key column instead of the literal word "id".
  id_label <- deparse(substitute(id))

  df_result_tmp <- df1 |>
    # append dfs; `df_id` is "1" for rows from df1 and "2" for rows from df2
    dplyr::bind_rows(df2, .id = "df_id") |>
    # count number of rows per id
    dplyr::group_by({{ id }}) |>
    # dplyr::n() is namespaced so this works without library(dplyr)
    dplyr::mutate(id_count = dplyr::n()) |>
    dplyr::ungroup()

  # any() is FALSE on zero rows, whereas max() would warn and return -Inf
  if (any(df_result_tmp[["id_count"]] > 2)) {
    warning(paste0(
      "Attempted to update more than 1 row per ", id_label,
      ". Check input datasets for duplicated rows."
    ))
  }

  df_result_tmp |>
    # filter to unaltered rows from df1 and rows from df2
    dplyr::filter(id_count == 1 | (id_count == 2 & df_id == "2")) |>
    dplyr::select(-c(df_id, id_count))
}

1 Comment

No need to write df_result <- and return(df_result); you can safely remove those. Note that your namespace handling is fine for the {dplyr} functions but missing for the {magrittr} forward pipe operator. An easy quick fix here: use |> (the native base R pipe) instead.

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.