Skip to content

Compute expressions over the given groups

Source code

Description

This expression is similar to performing a group by aggregation and joining the result back into the original DataFrame. The outcome is similar to how window functions work in PostgreSQL.

Usage

<Expr>$over(..., order_by = NULL, mapping_strategy = "group_to_rows")

Arguments

Column(s) to group by. Accepts expression input. Characters are parsed as column names.
order_by Order the window functions/aggregations with the partitioned groups by the result of the expression passed to order_by. Can be an Expr. Strings are parsed as column names.
mapping_strategy One of the following:
  • “group_to_rows” (default): if the aggregation results in multiple values, assign them back to their position in the DataFrame. This can only be done if the group yields the same elements before aggregation as after.
  • “join”: join the groups as List\ to the row positions. Note that this can be memory intensive.
  • “explode”: don’t do any mapping, but simply flatten the group. This only makes sense if the input data is sorted.

Value

Expr

Examples

library("polars")

# Pass the name of a column to compute the expression over that column.
df = pl$DataFrame(
  a = c("a", "a", "b", "b", "b"),
  b = c(1, 2, 3, 5, 3),
  c = c(5, 4, 2, 1, 3)
)

df$with_columns(
  pl$col("c")$max()$over("a")$name$suffix("_max")
)
#> shape: (5, 4)
#> ┌─────┬─────┬─────┬───────┐
#> │ a   ┆ b   ┆ c   ┆ c_max │
#> │ --- ┆ --- ┆ --- ┆ ---   │
#> │ str ┆ f64 ┆ f64 ┆ f64   │
#> ╞═════╪═════╪═════╪═══════╡
#> │ a   ┆ 1.0 ┆ 5.0 ┆ 5.0   │
#> │ a   ┆ 2.0 ┆ 4.0 ┆ 5.0   │
#> │ b   ┆ 3.0 ┆ 2.0 ┆ 3.0   │
#> │ b   ┆ 5.0 ┆ 1.0 ┆ 3.0   │
#> │ b   ┆ 3.0 ┆ 3.0 ┆ 3.0   │
#> └─────┴─────┴─────┴───────┘
# Expression input is supported.
df$with_columns(
  pl$col("c")$max()$over(pl$col("b") %/% 2)$name$suffix("_max")
)
#> shape: (5, 4)
#> ┌─────┬─────┬─────┬───────┐
#> │ a   ┆ b   ┆ c   ┆ c_max │
#> │ --- ┆ --- ┆ --- ┆ ---   │
#> │ str ┆ f64 ┆ f64 ┆ f64   │
#> ╞═════╪═════╪═════╪═══════╡
#> │ a   ┆ 1.0 ┆ 5.0 ┆ 5.0   │
#> │ a   ┆ 2.0 ┆ 4.0 ┆ 4.0   │
#> │ b   ┆ 3.0 ┆ 2.0 ┆ 4.0   │
#> │ b   ┆ 5.0 ┆ 1.0 ┆ 1.0   │
#> │ b   ┆ 3.0 ┆ 3.0 ┆ 4.0   │
#> └─────┴─────┴─────┴───────┘
# Group by multiple columns by passing a character vector of column names
# or list of expressions.
df$with_columns(
  pl$col("c")$min()$over(c("a", "b"))$name$suffix("_min")
)
#> shape: (5, 4)
#> ┌─────┬─────┬─────┬───────┐
#> │ a   ┆ b   ┆ c   ┆ c_min │
#> │ --- ┆ --- ┆ --- ┆ ---   │
#> │ str ┆ f64 ┆ f64 ┆ f64   │
#> ╞═════╪═════╪═════╪═══════╡
#> │ a   ┆ 1.0 ┆ 5.0 ┆ 5.0   │
#> │ a   ┆ 2.0 ┆ 4.0 ┆ 4.0   │
#> │ b   ┆ 3.0 ┆ 2.0 ┆ 2.0   │
#> │ b   ┆ 5.0 ┆ 1.0 ┆ 1.0   │
#> │ b   ┆ 3.0 ┆ 3.0 ┆ 2.0   │
#> └─────┴─────┴─────┴───────┘
df$with_columns(
  pl$col("c")$min()$over(list(pl$col("a"), pl$col("b")))$name$suffix("_min")
)
#> shape: (5, 4)
#> ┌─────┬─────┬─────┬───────┐
#> │ a   ┆ b   ┆ c   ┆ c_min │
#> │ --- ┆ --- ┆ --- ┆ ---   │
#> │ str ┆ f64 ┆ f64 ┆ f64   │
#> ╞═════╪═════╪═════╪═══════╡
#> │ a   ┆ 1.0 ┆ 5.0 ┆ 5.0   │
#> │ a   ┆ 2.0 ┆ 4.0 ┆ 4.0   │
#> │ b   ┆ 3.0 ┆ 2.0 ┆ 2.0   │
#> │ b   ┆ 5.0 ┆ 1.0 ┆ 1.0   │
#> │ b   ┆ 3.0 ┆ 3.0 ┆ 2.0   │
#> └─────┴─────┴─────┴───────┘
# Or use positional arguments to group by multiple columns in the same way.
df$with_columns(
  pl$col("c")$min()$over("a", pl$col("b") %% 2)$name$suffix("_min")
)
#> shape: (5, 4)
#> ┌─────┬─────┬─────┬───────┐
#> │ a   ┆ b   ┆ c   ┆ c_min │
#> │ --- ┆ --- ┆ --- ┆ ---   │
#> │ str ┆ f64 ┆ f64 ┆ f64   │
#> ╞═════╪═════╪═════╪═══════╡
#> │ a   ┆ 1.0 ┆ 5.0 ┆ 5.0   │
#> │ a   ┆ 2.0 ┆ 4.0 ┆ 4.0   │
#> │ b   ┆ 3.0 ┆ 2.0 ┆ 1.0   │
#> │ b   ┆ 5.0 ┆ 1.0 ┆ 1.0   │
#> │ b   ┆ 3.0 ┆ 3.0 ┆ 1.0   │
#> └─────┴─────┴─────┴───────┘
# Alternative mapping strategy: join values in a list output
df$with_columns(
  top_2 = pl$col("c")$top_k(2)$over("a", mapping_strategy = "join")
)
#> shape: (5, 4)
#> ┌─────┬─────┬─────┬────────────┐
#> │ a   ┆ b   ┆ c   ┆ top_2      │
#> │ --- ┆ --- ┆ --- ┆ ---        │
#> │ str ┆ f64 ┆ f64 ┆ list[f64]  │
#> ╞═════╪═════╪═════╪════════════╡
#> │ a   ┆ 1.0 ┆ 5.0 ┆ [5.0, 4.0] │
#> │ a   ┆ 2.0 ┆ 4.0 ┆ [5.0, 4.0] │
#> │ b   ┆ 3.0 ┆ 2.0 ┆ [2.0, 3.0] │
#> │ b   ┆ 5.0 ┆ 1.0 ┆ [2.0, 3.0] │
#> │ b   ┆ 3.0 ┆ 3.0 ┆ [2.0, 3.0] │
#> └─────┴─────┴─────┴────────────┘
# order_by specifies how values are sorted within a group, which is
# essential when the operation depends on the order of values
df = pl$DataFrame(
  g = c(1, 1, 1, 1, 2, 2, 2, 2),
  t = c(1, 2, 3, 4, 4, 1, 2, 3),
  x = c(10, 20, 30, 40, 10, 20, 30, 40)
)

# without order_by, the first and second values in the second group would
# be inverted, which would be wrong
df$with_columns(
  x_lag = pl$col("x")$shift(1)$over("g", order_by = "t")
)
#> shape: (8, 4)
#> ┌─────┬─────┬──────┬───────┐
#> │ g   ┆ t   ┆ x    ┆ x_lag │
#> │ --- ┆ --- ┆ ---  ┆ ---   │
#> │ f64 ┆ f64 ┆ f64  ┆ f64   │
#> ╞═════╪═════╪══════╪═══════╡
#> │ 1.0 ┆ 1.0 ┆ 10.0 ┆ null  │
#> │ 1.0 ┆ 2.0 ┆ 20.0 ┆ 10.0  │
#> │ 1.0 ┆ 3.0 ┆ 30.0 ┆ 20.0  │
#> │ 1.0 ┆ 4.0 ┆ 40.0 ┆ 30.0  │
#> │ 2.0 ┆ 4.0 ┆ 10.0 ┆ 40.0  │
#> │ 2.0 ┆ 1.0 ┆ 20.0 ┆ null  │
#> │ 2.0 ┆ 2.0 ┆ 30.0 ┆ 20.0  │
#> │ 2.0 ┆ 3.0 ┆ 40.0 ┆ 30.0  │
#> └─────┴─────┴──────┴───────┘