Split a DataFrame into multiple DataFrames
Description
Similar to $group_by(). Group by the given columns and
return the groups as separate DataFrames. It is useful to use this in
combination with functions like lapply() or purrr::map().
Usage
<DataFrame>$partition_by(
...,
maintain_order = TRUE,
include_key = TRUE,
as_nested_list = FALSE
)
Arguments
…
|
Column names (character vectors) to group by. Passed to pl$col().
|
maintain_order
|
If TRUE, ensure that the order of the groups is consistent
with the input data. This is slower than partitioning without
maintaining the order (maintain_order = FALSE).
|
include_key
|
If TRUE, include the columns used to partition the
DataFrame in the output.
|
as_nested_list
|
This affects the format of the output. If FALSE (default),
the output is a flat list of DataFrames. If TRUE and at least one of
the maintain_order or include_key arguments is
TRUE, then each element of the output has two children:
key and data. If both maintain_order and include_key are FALSE,
a warning is emitted and the output falls back to a flat list.
See the examples for more details.
|
Value
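A list of DataFrames or, if as_nested_list = TRUE, a list whose elements
each contain a key (a named list of the key values) and a data (the
DataFrame) element. See the examples for details.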
See Also
- <DataFrame>$group_by()
Examples
library("polars")
df = pl$DataFrame(
a = c("a", "b", "a", "b", "c"),
b = c(1, 2, 1, 3, 3),
c = c(5, 4, 3, 2, 1)
)
df
#> shape: (5, 3)
#> ┌─────┬─────┬─────┐
#> │ a ┆ b ┆ c │
#> │ --- ┆ --- ┆ --- │
#> │ str ┆ f64 ┆ f64 │
#> ╞═════╪═════╪═════╡
#> │ a ┆ 1.0 ┆ 5.0 │
#> │ b ┆ 2.0 ┆ 4.0 │
#> │ a ┆ 1.0 ┆ 3.0 │
#> │ b ┆ 3.0 ┆ 2.0 │
#> │ c ┆ 3.0 ┆ 1.0 │
#> └─────┴─────┴─────┘
# Pass a single column name to partition by that column.
df$partition_by("a")
#> [[1]]
#> shape: (2, 3)
#> ┌─────┬─────┬─────┐
#> │ a ┆ b ┆ c │
#> │ --- ┆ --- ┆ --- │
#> │ str ┆ f64 ┆ f64 │
#> ╞═════╪═════╪═════╡
#> │ a ┆ 1.0 ┆ 5.0 │
#> │ a ┆ 1.0 ┆ 3.0 │
#> └─────┴─────┴─────┘
#>
#> [[2]]
#> shape: (2, 3)
#> ┌─────┬─────┬─────┐
#> │ a ┆ b ┆ c │
#> │ --- ┆ --- ┆ --- │
#> │ str ┆ f64 ┆ f64 │
#> ╞═════╪═════╪═════╡
#> │ b ┆ 2.0 ┆ 4.0 │
#> │ b ┆ 3.0 ┆ 2.0 │
#> └─────┴─────┴─────┘
#>
#> [[3]]
#> shape: (1, 3)
#> ┌─────┬─────┬─────┐
#> │ a ┆ b ┆ c │
#> │ --- ┆ --- ┆ --- │
#> │ str ┆ f64 ┆ f64 │
#> ╞═════╪═════╪═════╡
#> │ c ┆ 3.0 ┆ 1.0 │
#> └─────┴─────┴─────┘
# Partition by multiple columns.
df$partition_by("a", "b")
#> [[1]]
#> shape: (2, 3)
#> ┌─────┬─────┬─────┐
#> │ a ┆ b ┆ c │
#> │ --- ┆ --- ┆ --- │
#> │ str ┆ f64 ┆ f64 │
#> ╞═════╪═════╪═════╡
#> │ a ┆ 1.0 ┆ 5.0 │
#> │ a ┆ 1.0 ┆ 3.0 │
#> └─────┴─────┴─────┘
#>
#> [[2]]
#> shape: (1, 3)
#> ┌─────┬─────┬─────┐
#> │ a ┆ b ┆ c │
#> │ --- ┆ --- ┆ --- │
#> │ str ┆ f64 ┆ f64 │
#> ╞═════╪═════╪═════╡
#> │ b ┆ 2.0 ┆ 4.0 │
#> └─────┴─────┴─────┘
#>
#> [[3]]
#> shape: (1, 3)
#> ┌─────┬─────┬─────┐
#> │ a ┆ b ┆ c │
#> │ --- ┆ --- ┆ --- │
#> │ str ┆ f64 ┆ f64 │
#> ╞═════╪═════╪═════╡
#> │ b ┆ 3.0 ┆ 2.0 │
#> └─────┴─────┴─────┘
#>
#> [[4]]
#> shape: (1, 3)
#> ┌─────┬─────┬─────┐
#> │ a ┆ b ┆ c │
#> │ --- ┆ --- ┆ --- │
#> │ str ┆ f64 ┆ f64 │
#> ╞═════╪═════╪═════╡
#> │ c ┆ 3.0 ┆ 1.0 │
#> └─────┴─────┴─────┘
#> [[1]]
#> shape: (2, 3)
#> ┌─────┬─────┬─────┐
#> │ a ┆ b ┆ c │
#> │ --- ┆ --- ┆ --- │
#> │ str ┆ f64 ┆ f64 │
#> ╞═════╪═════╪═════╡
#> │ a ┆ 1.0 ┆ 5.0 │
#> │ a ┆ 1.0 ┆ 3.0 │
#> └─────┴─────┴─────┘
#>
#> [[2]]
#> shape: (2, 3)
#> ┌─────┬─────┬─────┐
#> │ a ┆ b ┆ c │
#> │ --- ┆ --- ┆ --- │
#> │ str ┆ f64 ┆ f64 │
#> ╞═════╪═════╪═════╡
#> │ b ┆ 2.0 ┆ 4.0 │
#> │ b ┆ 3.0 ┆ 2.0 │
#> └─────┴─────┴─────┘
#>
#> [[3]]
#> shape: (1, 3)
#> ┌─────┬─────┬─────┐
#> │ a ┆ b ┆ c │
#> │ --- ┆ --- ┆ --- │
#> │ str ┆ f64 ┆ f64 │
#> ╞═════╪═════╪═════╡
#> │ c ┆ 3.0 ┆ 1.0 │
#> └─────┴─────┴─────┘
# If `as_nested_list = TRUE`, the output is a list whose elements have a `key` and a `data` field.
# The `key` is a named list of the key values, and the `data` is the DataFrame.
df$partition_by("a", "b", as_nested_list = TRUE)
#> [[1]]
#> [[1]]$key
#> [[1]]$key$a
#> [1] "a"
#>
#> [[1]]$key$b
#> [1] 1
#>
#>
#> [[1]]$data
#> shape: (2, 3)
#> ┌─────┬─────┬─────┐
#> │ a ┆ b ┆ c │
#> │ --- ┆ --- ┆ --- │
#> │ str ┆ f64 ┆ f64 │
#> ╞═════╪═════╪═════╡
#> │ a ┆ 1.0 ┆ 5.0 │
#> │ a ┆ 1.0 ┆ 3.0 │
#> └─────┴─────┴─────┘
#>
#>
#> [[2]]
#> [[2]]$key
#> [[2]]$key$a
#> [1] "b"
#>
#> [[2]]$key$b
#> [1] 2
#>
#>
#> [[2]]$data
#> shape: (1, 3)
#> ┌─────┬─────┬─────┐
#> │ a ┆ b ┆ c │
#> │ --- ┆ --- ┆ --- │
#> │ str ┆ f64 ┆ f64 │
#> ╞═════╪═════╪═════╡
#> │ b ┆ 2.0 ┆ 4.0 │
#> └─────┴─────┴─────┘
#>
#>
#> [[3]]
#> [[3]]$key
#> [[3]]$key$a
#> [1] "b"
#>
#> [[3]]$key$b
#> [1] 3
#>
#>
#> [[3]]$data
#> shape: (1, 3)
#> ┌─────┬─────┬─────┐
#> │ a ┆ b ┆ c │
#> │ --- ┆ --- ┆ --- │
#> │ str ┆ f64 ┆ f64 │
#> ╞═════╪═════╪═════╡
#> │ b ┆ 3.0 ┆ 2.0 │
#> └─────┴─────┴─────┘
#>
#>
#> [[4]]
#> [[4]]$key
#> [[4]]$key$a
#> [1] "c"
#>
#> [[4]]$key$b
#> [1] 3
#>
#>
#> [[4]]$data
#> shape: (1, 3)
#> ┌─────┬─────┬─────┐
#> │ a ┆ b ┆ c │
#> │ --- ┆ --- ┆ --- │
#> │ str ┆ f64 ┆ f64 │
#> ╞═════╪═════╪═════╡
#> │ c ┆ 3.0 ┆ 1.0 │
#> └─────┴─────┴─────┘
# `as_nested_list = TRUE` should be used with `maintain_order = TRUE` or `include_key = TRUE`.
tryCatch(
df$partition_by("a", "b", maintain_order = FALSE, include_key = FALSE, as_nested_list = TRUE),
warning = function(w) w
)
#> <simpleWarning in df$partition_by("a", "b", maintain_order = FALSE, include_key = FALSE, as_nested_list = TRUE): cannot use `$partition_by` with `maintain_order = FALSE, include_key = FALSE, as_nested_list = TRUE`. Fall back to a flat list.>
# Example of using with lapply(), and printing the key and the data summary
df$partition_by("a", "b", maintain_order = FALSE, as_nested_list = TRUE) |>
lapply(\(x) {
sprintf("\nThe key value of `a` is %s and the key value of `b` is %s\n", x$key$a, x$key$b) |>
cat()
x$data$drop(names(x$key))$describe() |>
print()
invisible(NULL)
}) |>
invisible()
#>
#> The key value of `a` is b and the key value of `b` is 3
#> shape: (9, 2)
#> ┌────────────┬──────┐
#> │ statistic ┆ c │
#> │ --- ┆ --- │
#> │ str ┆ f64 │
#> ╞════════════╪══════╡
#> │ count ┆ 1.0 │
#> │ null_count ┆ 0.0 │
#> │ mean ┆ 2.0 │
#> │ std ┆ null │
#> │ min ┆ 2.0 │
#> │ 25% ┆ 2.0 │
#> │ 50% ┆ 2.0 │
#> │ 75% ┆ 2.0 │
#> │ max ┆ 2.0 │
#> └────────────┴──────┘
#>
#> The key value of `a` is a and the key value of `b` is 1
#> shape: (9, 2)
#> ┌────────────┬──────────┐
#> │ statistic ┆ c │
#> │ --- ┆ --- │
#> │ str ┆ f64 │
#> ╞════════════╪══════════╡
#> │ count ┆ 2.0 │
#> │ null_count ┆ 0.0 │
#> │ mean ┆ 4.0 │
#> │ std ┆ 1.414214 │
#> │ min ┆ 3.0 │
#> │ 25% ┆ 3.0 │
#> │ 50% ┆ 5.0 │
#> │ 75% ┆ 5.0 │
#> │ max ┆ 5.0 │
#> └────────────┴──────────┘
#>
#> The key value of `a` is c and the key value of `b` is 3
#> shape: (9, 2)
#> ┌────────────┬──────┐
#> │ statistic ┆ c │
#> │ --- ┆ --- │
#> │ str ┆ f64 │
#> ╞════════════╪══════╡
#> │ count ┆ 1.0 │
#> │ null_count ┆ 0.0 │
#> │ mean ┆ 1.0 │
#> │ std ┆ null │
#> │ min ┆ 1.0 │
#> │ 25% ┆ 1.0 │
#> │ 50% ┆ 1.0 │
#> │ 75% ┆ 1.0 │
#> │ max ┆ 1.0 │
#> └────────────┴──────┘
#>
#> The key value of `a` is b and the key value of `b` is 2
#> shape: (9, 2)
#> ┌────────────┬──────┐
#> │ statistic ┆ c │
#> │ --- ┆ --- │
#> │ str ┆ f64 │
#> ╞════════════╪══════╡
#> │ count ┆ 1.0 │
#> │ null_count ┆ 0.0 │
#> │ mean ┆ 4.0 │
#> │ std ┆ null │
#> │ min ┆ 4.0 │
#> │ 25% ┆ 4.0 │
#> │ 50% ┆ 4.0 │
#> │ 75% ┆ 4.0 │
#> │ max ┆ 4.0 │
#> └────────────┴──────┘