Pivot a frame from long to wide format
Description
Only available in eager mode. See "Examples" section below for how to do a "lazy pivot" if you know the unique column values in advance.
Usage
<DataFrame>$pivot(
on,
...,
index = NULL,
values = NULL,
aggregate_function = NULL,
maintain_order = TRUE,
sort_columns = FALSE,
separator = "_"
)
Arguments
on
|
The column(s) whose values will be used as the new columns of the output DataFrame. |
…
|
These dots are for future extensions and must be empty. |
index
|
The column(s) that remain from the input to the output. The output
DataFrame will have one row for each unique combination of the
index’s values. If NULL, all remaining columns
not specified in on and values will be used.
At least one of index and values must be
specified.
|
values
|
The existing column(s) of values which will be moved under the new
columns from index. If an aggregation is specified, these
are the values on which the aggregation will be computed. If
NULL, all remaining columns not specified in
on and index will be used. At least one of
index and values must be specified.
|
aggregate_function
|
Choose from:
|
maintain_order
|
Ensure the values of index are sorted by discovery order.
|
sort_columns
|
Sort the transposed columns by name. Default is by order of discovery. |
separator
|
Used as separator/delimiter in generated column names in case of multiple values columns. |
Value
A polars DataFrame
Examples
library("polars")
# Suppose we have a dataframe of test scores achieved by some students,
# where each row represents a distinct test.
df <- pl$DataFrame(
name = c("Cady", "Cady", "Karen", "Karen"),
subject = c("maths", "physics", "maths", "physics"),
test_1 = c(98, 99, 61, 58),
test_2 = c(100, 100, 60, 60)
)
df
#> shape: (4, 4)
#> ┌───────┬─────────┬────────┬────────┐
#> │ name ┆ subject ┆ test_1 ┆ test_2 │
#> │ --- ┆ --- ┆ --- ┆ --- │
#> │ str ┆ str ┆ f64 ┆ f64 │
#> ╞═══════╪═════════╪════════╪════════╡
#> │ Cady ┆ maths ┆ 98.0 ┆ 100.0 │
#> │ Cady ┆ physics ┆ 99.0 ┆ 100.0 │
#> │ Karen ┆ maths ┆ 61.0 ┆ 60.0 │
#> │ Karen ┆ physics ┆ 58.0 ┆ 60.0 │
#> └───────┴─────────┴────────┴────────┘
# Using pivot(), we can reshape so we have one row per student, with
# different subjects as columns, and their `test_1` scores as values:
df$pivot("subject", index = "name", values = "test_1")
#> shape: (2, 3)
#> ┌───────┬───────┬─────────┐
#> │ name ┆ maths ┆ physics │
#> │ --- ┆ --- ┆ --- │
#> │ str ┆ f64 ┆ f64 │
#> ╞═══════╪═══════╪═════════╡
#> │ Cady ┆ 98.0 ┆ 99.0 │
#> │ Karen ┆ 61.0 ┆ 58.0 │
#> └───────┴───────┴─────────┘
# You can use selectors too - here we include all test scores
# in the pivoted table:
df$pivot("subject", values = cs$starts_with("test"))
#> shape: (2, 5)
#> ┌───────┬──────────────┬────────────────┬──────────────┬────────────────┐
#> │ name ┆ test_1_maths ┆ test_1_physics ┆ test_2_maths ┆ test_2_physics │
#> │ --- ┆ --- ┆ --- ┆ --- ┆ --- │
#> │ str ┆ f64 ┆ f64 ┆ f64 ┆ f64 │
#> ╞═══════╪══════════════╪════════════════╪══════════════╪════════════════╡
#> │ Cady ┆ 98.0 ┆ 99.0 ┆ 100.0 ┆ 100.0 │
#> │ Karen ┆ 61.0 ┆ 58.0 ┆ 60.0 ┆ 60.0 │
#> └───────┴──────────────┴────────────────┴──────────────┴────────────────┘
# If you end up with multiple values per cell, you can specify how to
# aggregate them with `aggregate_function`:
df <- pl$DataFrame(
ix = c(1, 1, 2, 2, 1, 2),
col = c("a", "a", "a", "a", "b", "b"),
foo = c(0, 1, 2, 2, 7, 1),
bar = c(0, 2, 0, 0, 9, 4)
)
df
#> shape: (6, 4)
#> ┌─────┬─────┬─────┬─────┐
#> │ ix ┆ col ┆ foo ┆ bar │
#> │ --- ┆ --- ┆ --- ┆ --- │
#> │ f64 ┆ str ┆ f64 ┆ f64 │
#> ╞═════╪═════╪═════╪═════╡
#> │ 1.0 ┆ a ┆ 0.0 ┆ 0.0 │
#> │ 1.0 ┆ a ┆ 1.0 ┆ 2.0 │
#> │ 2.0 ┆ a ┆ 2.0 ┆ 0.0 │
#> │ 2.0 ┆ a ┆ 2.0 ┆ 0.0 │
#> │ 1.0 ┆ b ┆ 7.0 ┆ 9.0 │
#> │ 2.0 ┆ b ┆ 1.0 ┆ 4.0 │
#> └─────┴─────┴─────┴─────┘
#> shape: (2, 5)
#> ┌─────┬───────┬───────┬───────┬───────┐
#> │ ix ┆ foo_a ┆ foo_b ┆ bar_a ┆ bar_b │
#> │ --- ┆ --- ┆ --- ┆ --- ┆ --- │
#> │ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 │
#> ╞═════╪═══════╪═══════╪═══════╪═══════╡
#> │ 1.0 ┆ 1.0 ┆ 7.0 ┆ 2.0 ┆ 9.0 │
#> │ 2.0 ┆ 4.0 ┆ 1.0 ┆ 0.0 ┆ 4.0 │
#> └─────┴───────┴───────┴───────┴───────┘
# You can also pass a custom aggregation function using `pl$element()`:
df <- pl$DataFrame(
col1 = c("a", "a", "a", "b", "b", "b"),
col2 = c("x", "x", "x", "x", "y", "y"),
col3 = c(6, 7, 3, 2, 5, 7),
)
df$pivot(
"col2",
index = "col1",
values = "col3",
aggregate_function = pl$element()$tanh()$mean(),
)
#> shape: (2, 3)
#> ┌──────┬──────────┬──────────┐
#> │ col1 ┆ x ┆ y │
#> │ --- ┆ --- ┆ --- │
#> │ str ┆ f64 ┆ f64 │
#> ╞══════╪══════════╪══════════╡
#> │ a ┆ 0.998347 ┆ null │
#> │ b ┆ 0.964028 ┆ 0.999954 │
#> └──────┴──────────┴──────────┘
# Note that pivot is only available in eager mode. If you know the unique
# column values in advance, you can use `$group_by()` on a LazyFrame to get
# the same result as above in lazy mode:
index <- pl$col("col1")
on <- pl$col("col2")
values <- pl$col("col3")
unique_column_values <- c("x", "y")
aggregate_function <- \(col) col$tanh()$mean()
funs <- lapply(unique_column_values, \(value) {
aggregate_function(values$filter(on == value))$alias(value)
})
df$lazy()$group_by(index)$agg(!!!funs)$collect()
#> shape: (2, 3)
#> ┌──────┬──────────┬──────────┐
#> │ col1 ┆ x ┆ y │
#> │ --- ┆ --- ┆ --- │
#> │ str ┆ f64 ┆ f64 │
#> ╞══════╪══════════╪══════════╡
#> │ a ┆ 0.998347 ┆ null │
#> │ b ┆ 0.964028 ┆ 0.999954 │
#> └──────┴──────────┴──────────┘