Aggregate over a DynamicGroupBy

Description

Aggregate a DataFrame over a time or integer window created with $group_by_dynamic().

Usage

<DynamicGroupBy>$agg(...)

Arguments

`…`	Exprs to aggregate over. Those can also be passed wrapped in a list, e.g `$agg(list(e1,e2,e3))`.

Value

An aggregated DataFrame

Examples

library("polars")

df = pl$DataFrame(
  time = pl$datetime_range(
    start = strptime("2021-12-16 00:00:00", format = "%Y-%m-%d %H:%M:%S", tz = "UTC"),
    end = strptime("2021-12-16 03:00:00", format = "%Y-%m-%d %H:%M:%S", tz = "UTC"),
    interval = "30m"
  ),
  n = 0:6
)

# get the sum in the following hour relative to the "time" column
df$group_by_dynamic("time", every = "1h")$agg(
  vals = pl$col("n"),
  sum = pl$col("n")$sum()
)

#> shape: (4, 3)
#> ┌─────────────────────────┬───────────┬─────┐
#> │ time                    ┆ vals      ┆ sum │
#> │ ---                     ┆ ---       ┆ --- │
#> │ datetime[ms, UTC]       ┆ list[i32] ┆ i32 │
#> ╞═════════════════════════╪═══════════╪═════╡
#> │ 2021-12-16 00:00:00 UTC ┆ [0, 1]    ┆ 1   │
#> │ 2021-12-16 01:00:00 UTC ┆ [2, 3]    ┆ 5   │
#> │ 2021-12-16 02:00:00 UTC ┆ [4, 5]    ┆ 9   │
#> │ 2021-12-16 03:00:00 UTC ┆ [6]       ┆ 6   │
#> └─────────────────────────┴───────────┴─────┘

# using "include_boundaries = TRUE" is helpful to see the period considered
df$group_by_dynamic("time", every = "1h", include_boundaries = TRUE)$agg(
  vals = pl$col("n")
)

#> shape: (4, 4)
#> ┌─────────────────────────┬─────────────────────────┬─────────────────────────┬───────────┐
#> │ _lower_boundary         ┆ _upper_boundary         ┆ time                    ┆ vals      │
#> │ ---                     ┆ ---                     ┆ ---                     ┆ ---       │
#> │ datetime[ms, UTC]       ┆ datetime[ms, UTC]       ┆ datetime[ms, UTC]       ┆ list[i32] │
#> ╞═════════════════════════╪═════════════════════════╪═════════════════════════╪═══════════╡
#> │ 2021-12-16 00:00:00 UTC ┆ 2021-12-16 01:00:00 UTC ┆ 2021-12-16 00:00:00 UTC ┆ [0, 1]    │
#> │ 2021-12-16 01:00:00 UTC ┆ 2021-12-16 02:00:00 UTC ┆ 2021-12-16 01:00:00 UTC ┆ [2, 3]    │
#> │ 2021-12-16 02:00:00 UTC ┆ 2021-12-16 03:00:00 UTC ┆ 2021-12-16 02:00:00 UTC ┆ [4, 5]    │
#> │ 2021-12-16 03:00:00 UTC ┆ 2021-12-16 04:00:00 UTC ┆ 2021-12-16 03:00:00 UTC ┆ [6]       │
#> └─────────────────────────┴─────────────────────────┴─────────────────────────┴───────────┘

# in the example above, the values didn't include the one *exactly* 1h after
# the start because "closed = 'left'" by default.
# Changing it to "right" includes values that are exactly 1h after. Note that
# the value at 00:00:00 now becomes included in the interval [23:00:00 - 00:00:00],
# even if this interval wasn't there originally
df$group_by_dynamic("time", every = "1h", closed = "right")$agg(
  vals = pl$col("n")
)

#> shape: (4, 2)
#> ┌─────────────────────────┬───────────┐
#> │ time                    ┆ vals      │
#> │ ---                     ┆ ---       │
#> │ datetime[ms, UTC]       ┆ list[i32] │
#> ╞═════════════════════════╪═══════════╡
#> │ 2021-12-15 23:00:00 UTC ┆ [0]       │
#> │ 2021-12-16 00:00:00 UTC ┆ [1, 2]    │
#> │ 2021-12-16 01:00:00 UTC ┆ [3, 4]    │
#> │ 2021-12-16 02:00:00 UTC ┆ [5, 6]    │
#> └─────────────────────────┴───────────┘

# To keep both boundaries, we use "closed = 'both'". Some values now belong to
# several groups:
df$group_by_dynamic("time", every = "1h", closed = "both")$agg(
  vals = pl$col("n")
)

#> shape: (5, 2)
#> ┌─────────────────────────┬───────────┐
#> │ time                    ┆ vals      │
#> │ ---                     ┆ ---       │
#> │ datetime[ms, UTC]       ┆ list[i32] │
#> ╞═════════════════════════╪═══════════╡
#> │ 2021-12-15 23:00:00 UTC ┆ [0]       │
#> │ 2021-12-16 00:00:00 UTC ┆ [0, 1, 2] │
#> │ 2021-12-16 01:00:00 UTC ┆ [2, 3, 4] │
#> │ 2021-12-16 02:00:00 UTC ┆ [4, 5, 6] │
#> │ 2021-12-16 03:00:00 UTC ┆ [6]       │
#> └─────────────────────────┴───────────┘

# Dynamic group bys can also be combined with grouping on normal keys
df = df$with_columns(
  groups = as_polars_series(c("a", "a", "a", "b", "b", "a", "a"))
)
df

#> shape: (7, 3)
#> ┌─────────────────────────┬─────┬────────┐
#> │ time                    ┆ n   ┆ groups │
#> │ ---                     ┆ --- ┆ ---    │
#> │ datetime[ms, UTC]       ┆ i32 ┆ str    │
#> ╞═════════════════════════╪═════╪════════╡
#> │ 2021-12-16 00:00:00 UTC ┆ 0   ┆ a      │
#> │ 2021-12-16 00:30:00 UTC ┆ 1   ┆ a      │
#> │ 2021-12-16 01:00:00 UTC ┆ 2   ┆ a      │
#> │ 2021-12-16 01:30:00 UTC ┆ 3   ┆ b      │
#> │ 2021-12-16 02:00:00 UTC ┆ 4   ┆ b      │
#> │ 2021-12-16 02:30:00 UTC ┆ 5   ┆ a      │
#> │ 2021-12-16 03:00:00 UTC ┆ 6   ┆ a      │
#> └─────────────────────────┴─────┴────────┘

df$group_by_dynamic(
  "time",
  every = "1h",
  closed = "both",
  group_by = "groups",
  include_boundaries = TRUE
)$agg(pl$col("n"))

#> shape: (7, 5)
#> ┌────────┬─────────────────────────┬─────────────────────────┬─────────────────────────┬───────────┐
#> │ groups ┆ _lower_boundary         ┆ _upper_boundary         ┆ time                    ┆ n         │
#> │ ---    ┆ ---                     ┆ ---                     ┆ ---                     ┆ ---       │
#> │ str    ┆ datetime[ms, UTC]       ┆ datetime[ms, UTC]       ┆ datetime[ms, UTC]       ┆ list[i32] │
#> ╞════════╪═════════════════════════╪═════════════════════════╪═════════════════════════╪═══════════╡
#> │ a      ┆ 2021-12-15 23:00:00 UTC ┆ 2021-12-16 00:00:00 UTC ┆ 2021-12-15 23:00:00 UTC ┆ [0]       │
#> │ a      ┆ 2021-12-16 00:00:00 UTC ┆ 2021-12-16 01:00:00 UTC ┆ 2021-12-16 00:00:00 UTC ┆ [0, 1, 2] │
#> │ a      ┆ 2021-12-16 01:00:00 UTC ┆ 2021-12-16 02:00:00 UTC ┆ 2021-12-16 01:00:00 UTC ┆ [2]       │
#> │ a      ┆ 2021-12-16 02:00:00 UTC ┆ 2021-12-16 03:00:00 UTC ┆ 2021-12-16 02:00:00 UTC ┆ [5, 6]    │
#> │ a      ┆ 2021-12-16 03:00:00 UTC ┆ 2021-12-16 04:00:00 UTC ┆ 2021-12-16 03:00:00 UTC ┆ [6]       │
#> │ b      ┆ 2021-12-16 01:00:00 UTC ┆ 2021-12-16 02:00:00 UTC ┆ 2021-12-16 01:00:00 UTC ┆ [3, 4]    │
#> │ b      ┆ 2021-12-16 02:00:00 UTC ┆ 2021-12-16 03:00:00 UTC ┆ 2021-12-16 02:00:00 UTC ┆ [4]       │
#> └────────┴─────────────────────────┴─────────────────────────┴─────────────────────────┴───────────┘

# We can also create a dynamic group by based on an index column
df = pl$LazyFrame(
  idx = 0:5,
  A = c("A", "A", "B", "B", "B", "C")
)$with_columns(pl$col("idx")$set_sorted())
df

#> polars LazyFrame
#>  $explain(): Show the optimized query plan.
#> 
#> Naive plan:
#>  WITH_COLUMNS:
#>  [col("idx").map()] 
#>   DF ["idx", "A"]; PROJECT */2 COLUMNS; SELECTION: None

df$group_by_dynamic(
  "idx",
  every = "2i",
  period = "3i",
  include_boundaries = TRUE,
  closed = "right"
)$agg(A_agg_list = pl$col("A"))

#> polars LazyFrame
#>  $explain(): Show the optimized query plan.
#> 
#> Naive plan:
#> AGGREGATE
#>  [col("A").alias("A_agg_list")] BY [] FROM
#>    WITH_COLUMNS:
#>    [col("idx").map()] 
#>     DF ["idx", "A"]; PROJECT */2 COLUMNS; SELECTION: None