Note: To use this notebook, you must first install the Rust excvr jupyter kernel. Also, note that clone() is used fairly often in the examples. This is because we tend to create one dataset for multiple examples. When this dataset is used, the rust ownership system will move that dataframe, which will make it unavailable to later examples. By cloneing, we can keep using it over and over.

:dep polars = { version = "0.23.2", features = ["lazy", "csv-file", "strings", "temporal", "dtype-duration", "dtype-categorical", "concat_str", "list", "list_eval", "rank", "lazy_regex"]}
:dep color-eyre = {version = "0.6.2"}
:dep rand = {version = "0.8.5"}
:dep reqwest = { version = "0.11.11", features = ["blocking"]}

use color_eyre::{Result};
use polars::prelude::*;

Expressions

fn(Series) -> Series

  • Lazily evaluated
    • Can be optimized
    • Gives the library writer context and informed decisions can be made
  • Embarrassingly parallel
  • Context dependent
    • selection/projection -> Series = COLUMN, LITERAL, or VALUE
    • aggregation -> Series = GROUPS
let df = df! [
    "A"        => [1, 2, 3, 4, 5],
    "fruits"   => ["banana", "banana", "apple", "apple", "banana"],
    "B"        => [5, 4, 3, 2, 1],
    "cars"     => ["beetle", "audi", "beetle", "beetle", "beetle"],
    "optional" => [Some(28), Some(300), None, Some(2), Some(-30)],
]?;
df
shape: (5, 5)
┌─────┬────────┬─────┬────────┬──────────┐
│ A   ┆ fruits ┆ B   ┆ cars   ┆ optional │
│ --- ┆ ---    ┆ --- ┆ ---    ┆ ---      │
│ i32 ┆ str    ┆ i32 ┆ str    ┆ i32      │
╞═════╪════════╪═════╪════════╪══════════╡
│ 1   ┆ banana ┆ 5   ┆ beetle ┆ 28       │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 2   ┆ banana ┆ 4   ┆ audi   ┆ 300      │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 3   ┆ apple  ┆ 3   ┆ beetle ┆ null     │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 4   ┆ apple  ┆ 2   ┆ beetle ┆ 2        │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 5   ┆ banana ┆ 1   ┆ beetle ┆ -30      │
└─────┴────────┴─────┴────────┴──────────┘

Selection context

// We can select by name
// We'll be re-using the dataframe a bunch, so we'll clone copies as we go.
df.clone().lazy().select([
    col("A"),
    col("B"),
    lit("B"),  // we must tell polars we mean the literal "B"
    col("fruits"),
]).collect()?
shape: (5, 4)
┌─────┬─────┬─────────┬────────┐
│ A   ┆ B   ┆ literal ┆ fruits │
│ --- ┆ --- ┆ ---     ┆ ---    │
│ i32 ┆ i32 ┆ str     ┆ str    │
╞═════╪═════╪═════════╪════════╡
│ 1   ┆ 5   ┆ B       ┆ banana │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
│ 2   ┆ 4   ┆ B       ┆ banana │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
│ 3   ┆ 3   ┆ B       ┆ apple  │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
│ 4   ┆ 2   ┆ B       ┆ apple  │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
│ 5   ┆ 1   ┆ B       ┆ banana │
└─────┴─────┴─────────┴────────┘
// you can select columns with a regex if it starts with '^' and ends with '$'

df.clone().lazy().select([
    col("^A|B$").sum()
]).collect()?
shape: (1, 2)
┌─────┬─────┐
│ A   ┆ B   │
│ --- ┆ --- │
│ i32 ┆ i32 │
╞═════╪═════╡
│ 15  ┆ 15  │
└─────┴─────┘
// you can select multiple columns by name

df.clone().lazy().select([
    cols(["A", "B"]).sum()
]).collect()?
shape: (1, 2)
┌─────┬─────┐
│ A   ┆ B   │
│ --- ┆ --- │
│ i32 ┆ i32 │
╞═════╪═════╡
│ 15  ┆ 15  │
└─────┴─────┘
// We select everything in normal order
// Then we select everything in reversed order

df.clone().lazy().select([
    all(),
    all().reverse().suffix("_reverse")
]).collect()?
shape: (5, 10)
┌─────┬────────┬─────┬────────┬─────┬────────────────┬───────────┬──────────────┬──────┐
│ A   ┆ fruits ┆ B   ┆ cars   ┆ ... ┆ fruits_reverse ┆ B_reverse ┆ cars_reverse ┆ opti │
│ --- ┆ ---    ┆ --- ┆ ---    ┆     ┆ ---            ┆ ---       ┆ ---          ┆ onal │
│ i32 ┆ str    ┆ i32 ┆ str    ┆     ┆ str            ┆ i32       ┆ str          ┆ _rev │
│     ┆        ┆     ┆        ┆     ┆                ┆           ┆              ┆ erse │
│     ┆        ┆     ┆        ┆     ┆                ┆           ┆              ┆ ---  │
│     ┆        ┆     ┆        ┆     ┆                ┆           ┆              ┆ i32  │
╞═════╪════════╪═════╪════════╪═════╪════════════════╪═══════════╪══════════════╪══════╡
│ 1   ┆ banana ┆ 5   ┆ beetle ┆ ... ┆ banana         ┆ 1         ┆ beetle       ┆ -30  │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
│ 2   ┆ banana ┆ 4   ┆ audi   ┆ ... ┆ apple          ┆ 2         ┆ beetle       ┆ 2    │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
│ 3   ┆ apple  ┆ 3   ┆ beetle ┆ ... ┆ apple          ┆ 3         ┆ beetle       ┆ null │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
│ 4   ┆ apple  ┆ 2   ┆ beetle ┆ ... ┆ banana         ┆ 4         ┆ audi         ┆ 300  │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
│ 5   ┆ banana ┆ 1   ┆ beetle ┆ ... ┆ banana         ┆ 5         ┆ beetle       ┆ 28   │
└─────┴────────┴─────┴────────┴─────┴────────────────┴───────────┴──────────────┴──────┘
// all expressions run in parallel
// single valued `Series` are broadcasted to the shape of the `DataFrame`

df.clone().lazy().select([
    all(),
    all().sum().suffix("_sum")
]).collect()?
shape: (5, 10)
┌─────┬────────┬─────┬────────┬─────┬────────────┬───────┬──────────┬──────────────┐
│ A   ┆ fruits ┆ B   ┆ cars   ┆ ... ┆ fruits_sum ┆ B_sum ┆ cars_sum ┆ optional_sum │
│ --- ┆ ---    ┆ --- ┆ ---    ┆     ┆ ---        ┆ ---   ┆ ---      ┆ ---          │
│ i32 ┆ str    ┆ i32 ┆ str    ┆     ┆ str        ┆ i32   ┆ str      ┆ i32          │
╞═════╪════════╪═════╪════════╪═════╪════════════╪═══════╪══════════╪══════════════╡
│ 1   ┆ banana ┆ 5   ┆ beetle ┆ ... ┆ null       ┆ 15    ┆ null     ┆ 300          │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 2   ┆ banana ┆ 4   ┆ audi   ┆ ... ┆ null       ┆ 15    ┆ null     ┆ 300          │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 3   ┆ apple  ┆ 3   ┆ beetle ┆ ... ┆ null       ┆ 15    ┆ null     ┆ 300          │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 4   ┆ apple  ┆ 2   ┆ beetle ┆ ... ┆ null       ┆ 15    ┆ null     ┆ 300          │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 5   ┆ banana ┆ 1   ┆ beetle ┆ ... ┆ null       ┆ 15    ┆ null     ┆ 300          │
└─────┴────────┴─────┴────────┴─────┴────────────┴───────┴──────────┴──────────────┘
// there are `str` and `dt` namespaces for specialized functions

let predicate = col("fruits").str().contains("^b.*");

df.clone().lazy().select([
    predicate
]).collect()?
shape: (5, 1)
┌────────┐
│ fruits │
│ ---    │
│ bool   │
╞════════╡
│ true   │
├╌╌╌╌╌╌╌╌┤
│ true   │
├╌╌╌╌╌╌╌╌┤
│ false  │
├╌╌╌╌╌╌╌╌┤
│ false  │
├╌╌╌╌╌╌╌╌┤
│ true   │
└────────┘
// use the predicate to filter
let predicate = col("fruits").str().contains("^b.*");
df.clone().lazy().filter(predicate).collect()?
shape: (3, 5)
┌─────┬────────┬─────┬────────┬──────────┐
│ A   ┆ fruits ┆ B   ┆ cars   ┆ optional │
│ --- ┆ ---    ┆ --- ┆ ---    ┆ ---      │
│ i32 ┆ str    ┆ i32 ┆ str    ┆ i32      │
╞═════╪════════╪═════╪════════╪══════════╡
│ 1   ┆ banana ┆ 5   ┆ beetle ┆ 28       │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 2   ┆ banana ┆ 4   ┆ audi   ┆ 300      │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 5   ┆ banana ┆ 1   ┆ beetle ┆ -30      │
└─────┴────────┴─────┴────────┴──────────┘
// predicate expressions can be used to filter

df.clone().lazy().select([
    col("A").filter(col("fruits").str().contains("^b.*")).sum(),
    (col("B").filter(col("cars").str().contains("^b.*")).sum() * col("B").sum()).alias("some_compute()"),
]).collect()?
shape: (1, 2)
┌─────┬────────────────┐
│ A   ┆ some_compute() │
│ --- ┆ ---            │
│ i32 ┆ i32            │
╞═════╪════════════════╡
│ 8   ┆ 165            │
└─────┴────────────────┘
// We can do arithmetic on columns and (literal) values
// can be evaluated to 1 without programmer knowing

let some_var = 1;

df.clone().lazy().select([
    ((col("A") / lit(124.0) * col("B")) / sum("B") * lit(some_var)).alias("computed")
]).collect()?
shape: (5, 1)
┌──────────┐
│ computed │
│ ---      │
│ f64      │
╞══════════╡
│ 0.002688 │
├╌╌╌╌╌╌╌╌╌╌┤
│ 0.004301 │
├╌╌╌╌╌╌╌╌╌╌┤
│ 0.004839 │
├╌╌╌╌╌╌╌╌╌╌┤
│ 0.004301 │
├╌╌╌╌╌╌╌╌╌╌┤
│ 0.002688 │
└──────────┘
// We can combine columns by a predicate
// This doesn't work.  It seems like the condition always evaluates to true
df.clone().lazy().select([
    col("fruits"),
    col("B"),
    when(col("fruits") == lit("banana")).then(col("B")).otherwise(-1).alias("b when not banana")
]).collect()?
shape: (5, 3)
┌────────┬─────┬───────────────────┐
│ fruits ┆ B   ┆ b when not banana │
│ ---    ┆ --- ┆ ---               │
│ str    ┆ i32 ┆ i32               │
╞════════╪═════╪═══════════════════╡
│ banana ┆ 5   ┆ -1                │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ banana ┆ 4   ┆ -1                │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ apple  ┆ 3   ┆ -1                │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ apple  ┆ 2   ┆ -1                │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ banana ┆ 1   ┆ -1                │
└────────┴─────┴───────────────────┘
// We can combine columns by a fold operation on column level

df.clone().lazy().select([
    col("A"),
    col("B"),
    fold_exprs(lit(0), |a, b| Ok(&a + &b), [
        col("A"),
        lit("B"),
        col("B").pow(lit(2)),
        col("A") / lit(2.0)
    ]).alias("fold")
]).collect()?
shape: (5, 3)
┌─────┬─────┬───────────┐
│ A   ┆ B   ┆ fold      │
│ --- ┆ --- ┆ ---       │
│ i32 ┆ i32 ┆ str       │
╞═════╪═════╪═══════════╡
│ 1   ┆ 5   ┆ 1B25.00.5 │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 2   ┆ 4   ┆ 2B16.01.0 │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 3   ┆ 3   ┆ 3B9.01.5  │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 4   ┆ 2   ┆ 4B4.02.0  │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 5   ┆ 1   ┆ 5B1.02.5  │
└─────┴─────┴───────────┘
// even combine all
use std::convert::TryInto;
let height: i32 = df.height().try_into()?;
df.clone().lazy().select([
    range(0i32, height).alias("idx"),
    col("A"),
    col("A").shift(1).alias("A_shifted"),
    concat_str([all()], "").alias("str_concat_1"),  // prefer this
    fold_exprs(col("A"), |a, b| Ok(a + b), [all().exclude(["A"])]).alias("str_concat_2"), // over this (accidentally O(n^2))
]).collect()?
shape: (5, 5)
┌─────┬─────┬───────────┬───────────────────┬───────────────────┐
│ idx ┆ A   ┆ A_shifted ┆ str_concat_1      ┆ str_concat_2      │
│ --- ┆ --- ┆ ---       ┆ ---               ┆ ---               │
│ i32 ┆ i32 ┆ i32       ┆ str               ┆ str               │
╞═════╪═════╪═══════════╪═══════════════════╪═══════════════════╡
│ 0   ┆ 1   ┆ null      ┆ 1banana5beetle28  ┆ 1banana5beetle28  │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 1   ┆ 2   ┆ 1         ┆ 2banana4audi300   ┆ 2banana4audi300   │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 2   ┆ 3   ┆ 2         ┆ null              ┆ null              │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 3   ┆ 4   ┆ 3         ┆ 4apple2beetle2    ┆ 4apple2beetle2    │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 4   ┆ 5   ┆ 4         ┆ 5banana1beetle-30 ┆ 5banana1beetle-30 │
└─────┴─────┴───────────┴───────────────────┴───────────────────┘

Aggregation context

  • expressions are applied over groups instead of columns
// we can still combine many expressions

df.clone().lazy().sort("cars", SortOptions::default()).groupby(["fruits"])
    .agg([
        col("B").sum().alias("B_sum"),
        sum("B").alias("B_sum2"),  // syntactic sugar for the first
        col("fruits").first().alias("fruits_first"),
        col("A").count().alias("count"),
        col("cars").reverse()
    ]).collect()?
shape: (2, 6)
┌────────┬───────┬────────┬──────────────┬───────┬──────────────────────────────┐
│ fruits ┆ B_sum ┆ B_sum2 ┆ fruits_first ┆ count ┆ cars                         │
│ ---    ┆ ---   ┆ ---    ┆ ---          ┆ ---   ┆ ---                          │
│ str    ┆ i32   ┆ i32    ┆ str          ┆ u32   ┆ list[str]                    │
╞════════╪═══════╪════════╪══════════════╪═══════╪══════════════════════════════╡
│ apple  ┆ 5     ┆ 5      ┆ apple        ┆ 2     ┆ ["beetle", "beetle"]         │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ banana ┆ 10    ┆ 10     ┆ banana       ┆ 3     ┆ ["beetle", "beetle", "audi"] │
└────────┴───────┴────────┴──────────────┴───────┴──────────────────────────────┘
// We can explode the list column "cars"

df.clone().lazy()
    .sort("cars", SortOptions { descending: false, nulls_last: false })
    .groupby(["fruits"])
    .agg([
        col("B").sum().alias("B_sum"),
        sum("B").alias("B_sum2"),  // syntactic sugar for the first
        col("fruits").first().alias("fruits_first"),
        col("A").count().alias("count"),
        col("cars").reverse()
    ])
    .explode(["cars"])
    .collect()?
shape: (5, 6)
┌────────┬───────┬────────┬──────────────┬───────┬────────┐
│ fruits ┆ B_sum ┆ B_sum2 ┆ fruits_first ┆ count ┆ cars   │
│ ---    ┆ ---   ┆ ---    ┆ ---          ┆ ---   ┆ ---    │
│ str    ┆ i32   ┆ i32    ┆ str          ┆ u32   ┆ str    │
╞════════╪═══════╪════════╪══════════════╪═══════╪════════╡
│ banana ┆ 10    ┆ 10     ┆ banana       ┆ 3     ┆ beetle │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
│ banana ┆ 10    ┆ 10     ┆ banana       ┆ 3     ┆ beetle │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
│ banana ┆ 10    ┆ 10     ┆ banana       ┆ 3     ┆ audi   │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
│ apple  ┆ 5     ┆ 5      ┆ apple        ┆ 2     ┆ beetle │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
│ apple  ┆ 5     ┆ 5      ┆ apple        ┆ 2     ┆ beetle │
└────────┴───────┴────────┴──────────────┴───────┴────────┘
df.clone().lazy()
    .groupby(["fruits"])
    .agg([
        col("B").sum().alias("B_sum"),
        col("fruits").first().alias("fruits_first"),
        count(),
        col("B").shift(1).alias("B_shifted")
    ])
    .explode(["B_shifted"])
    .collect()?
shape: (5, 5)
┌────────┬───────┬──────────────┬───────┬───────────┐
│ fruits ┆ B_sum ┆ fruits_first ┆ count ┆ B_shifted │
│ ---    ┆ ---   ┆ ---          ┆ ---   ┆ ---       │
│ str    ┆ i32   ┆ str          ┆ u32   ┆ i32       │
╞════════╪═══════╪══════════════╪═══════╪═══════════╡
│ apple  ┆ 5     ┆ apple        ┆ 2     ┆ null      │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ apple  ┆ 5     ┆ apple        ┆ 2     ┆ 3         │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ banana ┆ 10    ┆ banana       ┆ 3     ┆ null      │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ banana ┆ 10    ┆ banana       ┆ 3     ┆ 5         │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ banana ┆ 10    ┆ banana       ┆ 3     ┆ 4         │
└────────┴───────┴──────────────┴───────┴───────────┘
// we can also get the list of the groups

df.clone().lazy()
    .groupby(["fruits"])
    .agg([
        col("B").shift(1).alias("shift_B"),
        col("B").reverse().alias("rev_B"),
    ])
    .collect()?
shape: (2, 3)
┌────────┬──────────────┬───────────┐
│ fruits ┆ shift_B      ┆ rev_B     │
│ ---    ┆ ---          ┆ ---       │
│ str    ┆ list[i32]    ┆ list[i32] │
╞════════╪══════════════╪═══════════╡
│ banana ┆ [null, 5, 4] ┆ [1, 4, 5] │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ apple  ┆ [null, 3]    ┆ [2, 3]    │
└────────┴──────────────┴───────────┘
// we can do predicates in the groupby as well

df.clone().lazy()
    .groupby(["fruits"])
    .agg([
        col("B").filter(col("B").gt(lit(1))).list().keep_name(),
    ])
    .collect()?
shape: (2, 2)
┌────────┬───────────┐
│ fruits ┆ B         │
│ ---    ┆ ---       │
│ str    ┆ list[i32] │
╞════════╪═══════════╡
│ apple  ┆ [3, 2]    │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ banana ┆ [5, 4]    │
└────────┴───────────┘
// and sum only by the values where the predicates are true

df.clone().lazy()
    .groupby(["fruits"])
    .agg([
        col("B").filter(col("B").gt(lit(1))).mean(),
    ])
    .collect()?
shape: (2, 2)
┌────────┬─────┐
│ fruits ┆ B   │
│ ---    ┆ --- │
│ str    ┆ f64 │
╞════════╪═════╡
│ apple  ┆ 2.5 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
│ banana ┆ 4.5 │
└────────┴─────┘
// Another example

df.clone().lazy()
    .groupby(["fruits"])
    .agg([
        col("B").shift_and_fill(1, 0).alias("shifted"),
        col("B").shift_and_fill(1, 0).sum().alias("shifted_sum"),
    ])
    .collect()?
shape: (2, 3)
┌────────┬───────────┬─────────────┐
│ fruits ┆ shifted   ┆ shifted_sum │
│ ---    ┆ ---       ┆ ---         │
│ str    ┆ list[i32] ┆ i32         │
╞════════╪═══════════╪═════════════╡
│ apple  ┆ [0, 3]    ┆ 3           │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ banana ┆ [0, 5, 4] ┆ 9           │
└────────┴───────────┴─────────────┘

Window functions!

  • Expression with superpowers.
  • Aggregation in selection context

#![allow(unused)]
fn main() {
col("foo").aggregation_expression(..).over("column_used_to_group")
}
// groupby 2 different columns

df.clone().lazy()
    .select([
        col("fruits"),
        col("cars"),
        col("B"),
        col("B").sum().over(["fruits"]).alias("B_sum_by_fruits"),
        col("B").sum().over(["cars"]).alias("B_sum_by_cars"),
    ])
    .collect()?
shape: (5, 5)
┌────────┬────────┬─────┬─────────────────┬───────────────┐
│ fruits ┆ cars   ┆ B   ┆ B_sum_by_fruits ┆ B_sum_by_cars │
│ ---    ┆ ---    ┆ --- ┆ ---             ┆ ---           │
│ str    ┆ str    ┆ i32 ┆ i32             ┆ i32           │
╞════════╪════════╪═════╪═════════════════╪═══════════════╡
│ banana ┆ beetle ┆ 5   ┆ 10              ┆ 11            │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ banana ┆ audi   ┆ 4   ┆ 10              ┆ 4             │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ apple  ┆ beetle ┆ 3   ┆ 5               ┆ 11            │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ apple  ┆ beetle ┆ 2   ┆ 5               ┆ 11            │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ banana ┆ beetle ┆ 1   ┆ 10              ┆ 11            │
└────────┴────────┴─────┴─────────────────┴───────────────┘
// reverse B by groups and show the results in original DF

df.clone().lazy()
    .select([
        col("fruits"),
        col("B"),
        col("B").reverse().over(["fruits"]).alias("B_reversed_by_fruits")
    ])
    .collect()?
shape: (5, 3)
┌────────┬─────┬──────────────────────┐
│ fruits ┆ B   ┆ B_reversed_by_fruits │
│ ---    ┆ --- ┆ ---                  │
│ str    ┆ i32 ┆ i32                  │
╞════════╪═════╪══════════════════════╡
│ banana ┆ 5   ┆ 1                    │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ banana ┆ 4   ┆ 4                    │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ apple  ┆ 3   ┆ 2                    │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ apple  ┆ 2   ┆ 3                    │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ banana ┆ 1   ┆ 5                    │
└────────┴─────┴──────────────────────┘
// Lag a column within "fruits"

df.clone().lazy()
    .select([
        col("fruits"),
        col("B"),
        col("B").shift(1).over(["fruits"]).alias("lag_B_by_fruits")
    ])
    .collect()?
shape: (5, 3)
┌────────┬─────┬─────────────────┐
│ fruits ┆ B   ┆ lag_B_by_fruits │
│ ---    ┆ --- ┆ ---             │
│ str    ┆ i32 ┆ i32             │
╞════════╪═════╪═════════════════╡
│ banana ┆ 5   ┆ null            │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ banana ┆ 4   ┆ 5               │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ apple  ┆ 3   ┆ null            │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ apple  ┆ 2   ┆ 3               │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ banana ┆ 1   ┆ 4               │
└────────┴─────┴─────────────────┘