Skip to content

Split a DataFrame into multiple DataFrames

Source code

Description

Similar to $group_by(). Group by the given columns and return the groups as separate DataFrames. This is useful in combination with functions like lapply() or purrr::map().

Usage

<DataFrame>$partition_by(
  ...,
  maintain_order = TRUE,
  include_key = TRUE,
  as_nested_list = FALSE
)

Arguments

... Column names (character strings) to group by. Passed to pl$col().
maintain_order If TRUE (default), ensure that the order of the groups is consistent with the input data. This is slower than partitioning without maintaining order (maintain_order = FALSE).
include_key If TRUE, include the columns used to partition the DataFrame in the output.
as_nested_list This affects the format of the output. If FALSE (default), the output is a flat list of DataFrames. If TRUE and at least one of the maintain_order or include_key arguments is TRUE, then each element of the output has two children: key and data. See the examples for more details.

Value

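A list of DataFrames or, if as_nested_list = TRUE, a list whose elements each have a key (a named list of the key values) and a data (the DataFrame) field. See the examples for details.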

See Also

  • $group_by()

Examples

library("polars")

df = pl$DataFrame(
  a = c("a", "b", "a", "b", "c"),
  b = c(1, 2, 1, 3, 3),
  c = c(5, 4, 3, 2, 1)
)
df
#> shape: (5, 3)
#> ┌─────┬─────┬─────┐
#> │ a   ┆ b   ┆ c   │
#> │ --- ┆ --- ┆ --- │
#> │ str ┆ f64 ┆ f64 │
#> ╞═════╪═════╪═════╡
#> │ a   ┆ 1.0 ┆ 5.0 │
#> │ b   ┆ 2.0 ┆ 4.0 │
#> │ a   ┆ 1.0 ┆ 3.0 │
#> │ b   ┆ 3.0 ┆ 2.0 │
#> │ c   ┆ 3.0 ┆ 1.0 │
#> └─────┴─────┴─────┘
# Pass a single column name to partition by that column.
df$partition_by("a")
#> [[1]]
#> shape: (2, 3)
#> ┌─────┬─────┬─────┐
#> │ a   ┆ b   ┆ c   │
#> │ --- ┆ --- ┆ --- │
#> │ str ┆ f64 ┆ f64 │
#> ╞═════╪═════╪═════╡
#> │ a   ┆ 1.0 ┆ 5.0 │
#> │ a   ┆ 1.0 ┆ 3.0 │
#> └─────┴─────┴─────┘
#> 
#> [[2]]
#> shape: (2, 3)
#> ┌─────┬─────┬─────┐
#> │ a   ┆ b   ┆ c   │
#> │ --- ┆ --- ┆ --- │
#> │ str ┆ f64 ┆ f64 │
#> ╞═════╪═════╪═════╡
#> │ b   ┆ 2.0 ┆ 4.0 │
#> │ b   ┆ 3.0 ┆ 2.0 │
#> └─────┴─────┴─────┘
#> 
#> [[3]]
#> shape: (1, 3)
#> ┌─────┬─────┬─────┐
#> │ a   ┆ b   ┆ c   │
#> │ --- ┆ --- ┆ --- │
#> │ str ┆ f64 ┆ f64 │
#> ╞═════╪═════╪═════╡
#> │ c   ┆ 3.0 ┆ 1.0 │
#> └─────┴─────┴─────┘
# Partition by multiple columns.
df$partition_by("a", "b")
#> [[1]]
#> shape: (2, 3)
#> ┌─────┬─────┬─────┐
#> │ a   ┆ b   ┆ c   │
#> │ --- ┆ --- ┆ --- │
#> │ str ┆ f64 ┆ f64 │
#> ╞═════╪═════╪═════╡
#> │ a   ┆ 1.0 ┆ 5.0 │
#> │ a   ┆ 1.0 ┆ 3.0 │
#> └─────┴─────┴─────┘
#> 
#> [[2]]
#> shape: (1, 3)
#> ┌─────┬─────┬─────┐
#> │ a   ┆ b   ┆ c   │
#> │ --- ┆ --- ┆ --- │
#> │ str ┆ f64 ┆ f64 │
#> ╞═════╪═════╪═════╡
#> │ b   ┆ 2.0 ┆ 4.0 │
#> └─────┴─────┴─────┘
#> 
#> [[3]]
#> shape: (1, 3)
#> ┌─────┬─────┬─────┐
#> │ a   ┆ b   ┆ c   │
#> │ --- ┆ --- ┆ --- │
#> │ str ┆ f64 ┆ f64 │
#> ╞═════╪═════╪═════╡
#> │ b   ┆ 3.0 ┆ 2.0 │
#> └─────┴─────┴─────┘
#> 
#> [[4]]
#> shape: (1, 3)
#> ┌─────┬─────┬─────┐
#> │ a   ┆ b   ┆ c   │
#> │ --- ┆ --- ┆ --- │
#> │ str ┆ f64 ┆ f64 │
#> ╞═════╪═════╪═════╡
#> │ c   ┆ 3.0 ┆ 1.0 │
#> └─────┴─────┴─────┘
# Partition by column data type
df$partition_by(pl$String)
#> [[1]]
#> shape: (2, 3)
#> ┌─────┬─────┬─────┐
#> │ a   ┆ b   ┆ c   │
#> │ --- ┆ --- ┆ --- │
#> │ str ┆ f64 ┆ f64 │
#> ╞═════╪═════╪═════╡
#> │ a   ┆ 1.0 ┆ 5.0 │
#> │ a   ┆ 1.0 ┆ 3.0 │
#> └─────┴─────┴─────┘
#> 
#> [[2]]
#> shape: (2, 3)
#> ┌─────┬─────┬─────┐
#> │ a   ┆ b   ┆ c   │
#> │ --- ┆ --- ┆ --- │
#> │ str ┆ f64 ┆ f64 │
#> ╞═════╪═════╪═════╡
#> │ b   ┆ 2.0 ┆ 4.0 │
#> │ b   ┆ 3.0 ┆ 2.0 │
#> └─────┴─────┴─────┘
#> 
#> [[3]]
#> shape: (1, 3)
#> ┌─────┬─────┬─────┐
#> │ a   ┆ b   ┆ c   │
#> │ --- ┆ --- ┆ --- │
#> │ str ┆ f64 ┆ f64 │
#> ╞═════╪═════╪═════╡
#> │ c   ┆ 3.0 ┆ 1.0 │
#> └─────┴─────┴─────┘
# If `as_nested_list = TRUE`, the output is a list whose elements have a `key` and a `data` field.
# The `key` is a named list of the key values, and the `data` is the DataFrame.
df$partition_by("a", "b", as_nested_list = TRUE)
#> [[1]]
#> [[1]]$key
#> [[1]]$key$a
#> [1] "a"
#> 
#> [[1]]$key$b
#> [1] 1
#> 
#> 
#> [[1]]$data
#> shape: (2, 3)
#> ┌─────┬─────┬─────┐
#> │ a   ┆ b   ┆ c   │
#> │ --- ┆ --- ┆ --- │
#> │ str ┆ f64 ┆ f64 │
#> ╞═════╪═════╪═════╡
#> │ a   ┆ 1.0 ┆ 5.0 │
#> │ a   ┆ 1.0 ┆ 3.0 │
#> └─────┴─────┴─────┘
#> 
#> 
#> [[2]]
#> [[2]]$key
#> [[2]]$key$a
#> [1] "b"
#> 
#> [[2]]$key$b
#> [1] 2
#> 
#> 
#> [[2]]$data
#> shape: (1, 3)
#> ┌─────┬─────┬─────┐
#> │ a   ┆ b   ┆ c   │
#> │ --- ┆ --- ┆ --- │
#> │ str ┆ f64 ┆ f64 │
#> ╞═════╪═════╪═════╡
#> │ b   ┆ 2.0 ┆ 4.0 │
#> └─────┴─────┴─────┘
#> 
#> 
#> [[3]]
#> [[3]]$key
#> [[3]]$key$a
#> [1] "b"
#> 
#> [[3]]$key$b
#> [1] 3
#> 
#> 
#> [[3]]$data
#> shape: (1, 3)
#> ┌─────┬─────┬─────┐
#> │ a   ┆ b   ┆ c   │
#> │ --- ┆ --- ┆ --- │
#> │ str ┆ f64 ┆ f64 │
#> ╞═════╪═════╪═════╡
#> │ b   ┆ 3.0 ┆ 2.0 │
#> └─────┴─────┴─────┘
#> 
#> 
#> [[4]]
#> [[4]]$key
#> [[4]]$key$a
#> [1] "c"
#> 
#> [[4]]$key$b
#> [1] 3
#> 
#> 
#> [[4]]$data
#> shape: (1, 3)
#> ┌─────┬─────┬─────┐
#> │ a   ┆ b   ┆ c   │
#> │ --- ┆ --- ┆ --- │
#> │ str ┆ f64 ┆ f64 │
#> ╞═════╪═════╪═════╡
#> │ c   ┆ 3.0 ┆ 1.0 │
#> └─────┴─────┴─────┘
# `as_nested_list = TRUE` should be used with `maintain_order = TRUE` or `include_key = TRUE`.
tryCatch(
  df$partition_by("a", "b", maintain_order = FALSE, include_key = FALSE, as_nested_list = TRUE),
  warning = function(w) w
)
#> <simpleWarning in df$partition_by("a", "b", maintain_order = FALSE, include_key = FALSE,     as_nested_list = TRUE): cannot use `$partition_by` with `maintain_order = FALSE, include_key = FALSE, as_nested_list = TRUE`. Fall back to a flat list.>
# Example of using with lapply(), and printing the key and the data summary.
# Because `maintain_order = FALSE`, the order of the groups may differ from the input order.
df$partition_by("a", "b", maintain_order = FALSE, as_nested_list = TRUE) |>
  lapply(\(x) {
    sprintf("\nThe key value of `a` is %s and the key value of `b` is %s\n", x$key$a, x$key$b) |>
      cat()
    x$data$drop(names(x$key))$describe() |>
      print()
    invisible(NULL)
  }) |>
  invisible()
#> 
#> The key value of `a` is b and the key value of `b` is 3
#> shape: (9, 2)
#> ┌────────────┬──────┐
#> │ statistic  ┆ c    │
#> │ ---        ┆ ---  │
#> │ str        ┆ f64  │
#> ╞════════════╪══════╡
#> │ count      ┆ 1.0  │
#> │ null_count ┆ 0.0  │
#> │ mean       ┆ 2.0  │
#> │ std        ┆ null │
#> │ min        ┆ 2.0  │
#> │ 25%        ┆ 2.0  │
#> │ 50%        ┆ 2.0  │
#> │ 75%        ┆ 2.0  │
#> │ max        ┆ 2.0  │
#> └────────────┴──────┘
#> 
#> The key value of `a` is c and the key value of `b` is 3
#> shape: (9, 2)
#> ┌────────────┬──────┐
#> │ statistic  ┆ c    │
#> │ ---        ┆ ---  │
#> │ str        ┆ f64  │
#> ╞════════════╪══════╡
#> │ count      ┆ 1.0  │
#> │ null_count ┆ 0.0  │
#> │ mean       ┆ 1.0  │
#> │ std        ┆ null │
#> │ min        ┆ 1.0  │
#> │ 25%        ┆ 1.0  │
#> │ 50%        ┆ 1.0  │
#> │ 75%        ┆ 1.0  │
#> │ max        ┆ 1.0  │
#> └────────────┴──────┘
#> 
#> The key value of `a` is b and the key value of `b` is 2
#> shape: (9, 2)
#> ┌────────────┬──────┐
#> │ statistic  ┆ c    │
#> │ ---        ┆ ---  │
#> │ str        ┆ f64  │
#> ╞════════════╪══════╡
#> │ count      ┆ 1.0  │
#> │ null_count ┆ 0.0  │
#> │ mean       ┆ 4.0  │
#> │ std        ┆ null │
#> │ min        ┆ 4.0  │
#> │ 25%        ┆ 4.0  │
#> │ 50%        ┆ 4.0  │
#> │ 75%        ┆ 4.0  │
#> │ max        ┆ 4.0  │
#> └────────────┴──────┘
#> 
#> The key value of `a` is a and the key value of `b` is 1
#> shape: (9, 2)
#> ┌────────────┬──────────┐
#> │ statistic  ┆ c        │
#> │ ---        ┆ ---      │
#> │ str        ┆ f64      │
#> ╞════════════╪══════════╡
#> │ count      ┆ 2.0      │
#> │ null_count ┆ 0.0      │
#> │ mean       ┆ 4.0      │
#> │ std        ┆ 1.414214 │
#> │ min        ┆ 3.0      │
#> │ 25%        ┆ 3.0      │
#> │ 50%        ┆ 5.0      │
#> │ 75%        ┆ 5.0      │
#> │ max        ┆ 5.0      │
#> └────────────┴──────────┘