Note: To use this notebook, you must first install the Rust excvr jupyter kernel. Also, note that clone() is used fairly often in the examples. This is because we tend to create one dataset for multiple examples. When this dataset is used, the rust ownership system will move that dataframe, which will make it unavailable to later examples. By cloneing, we can keep using it over and over.


#![allow(unused)]
fn main() {
:dep polars = { version = "0.23.2", features = ["lazy", "csv-file", "strings", "temporal", "dtype-duration", "dtype-categorical", "concat_str", "list", "list_eval", "rank", "lazy_regex"]}
:dep color-eyre = {version = "0.6.2"}
:dep rand = {version = "0.8.5"}
:dep reqwest = { version = "0.11.11", features = ["blocking"]}

use color_eyre::{Result};
use polars::prelude::*;
}

Expressions

fn(Series) -> Series

  • Lazily evaluated
    • Can be optimized
    • Gives the library writer context and informed decisions can be made
  • Embarrassingly parallel
  • Context dependent
    • selection/projection -> Series = COLUMN, LITERAL, or VALUE
    • aggregation -> Series = GROUPS

#![allow(unused)]
fn main() {
let df = df! [
    "A"        => [1, 2, 3, 4, 5],
    "fruits"   => ["banana", "banana", "apple", "apple", "banana"],
    "B"        => [5, 4, 3, 2, 1],
    "cars"     => ["beetle", "audi", "beetle", "beetle", "beetle"],
    "optional" => [Some(28), Some(300), None, Some(2), Some(-30)],
]?;
df
}
shape: (5, 5)
┌─────┬────────┬─────┬────────┬──────────┐
│ A   ┆ fruits ┆ B   ┆ cars   ┆ optional │
│ --- ┆ ---    ┆ --- ┆ ---    ┆ ---      │
│ i32 ┆ str    ┆ i32 ┆ str    ┆ i32      │
╞═════╪════════╪═════╪════════╪══════════╡
│ 1   ┆ banana ┆ 5   ┆ beetle ┆ 28       │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 2   ┆ banana ┆ 4   ┆ audi   ┆ 300      │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 3   ┆ apple  ┆ 3   ┆ beetle ┆ null     │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 4   ┆ apple  ┆ 2   ┆ beetle ┆ 2        │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 5   ┆ banana ┆ 1   ┆ beetle ┆ -30      │
└─────┴────────┴─────┴────────┴──────────┘

Selection context


#![allow(unused)]
fn main() {
// We can select by name
// We'll be re-using the dataframe a bunch, so we'll clone copies as we go.
df.clone().lazy().select([
    col("A"),
    col("B"),
    lit("B"),  // we must tell polars we mean the literal "B"
    col("fruits"),
]).collect()?
}
shape: (5, 4)
┌─────┬─────┬─────────┬────────┐
│ A   ┆ B   ┆ literal ┆ fruits │
│ --- ┆ --- ┆ ---     ┆ ---    │
│ i32 ┆ i32 ┆ str     ┆ str    │
╞═════╪═════╪═════════╪════════╡
│ 1   ┆ 5   ┆ B       ┆ banana │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
│ 2   ┆ 4   ┆ B       ┆ banana │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
│ 3   ┆ 3   ┆ B       ┆ apple  │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
│ 4   ┆ 2   ┆ B       ┆ apple  │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
│ 5   ┆ 1   ┆ B       ┆ banana │
└─────┴─────┴─────────┴────────┘

#![allow(unused)]
fn main() {
// you can select columns with a regex if it starts with '^' and ends with '$'

df.clone().lazy().select([
    col("^A|B$").sum()
]).collect()?
}
shape: (1, 2)
┌─────┬─────┐
│ A   ┆ B   │
│ --- ┆ --- │
│ i32 ┆ i32 │
╞═════╪═════╡
│ 15  ┆ 15  │
└─────┴─────┘

#![allow(unused)]
fn main() {
// you can select multiple columns by name

df.clone().lazy().select([
    cols(["A", "B"]).sum()
]).collect()?
}
shape: (1, 2)
┌─────┬─────┐
│ A   ┆ B   │
│ --- ┆ --- │
│ i32 ┆ i32 │
╞═════╪═════╡
│ 15  ┆ 15  │
└─────┴─────┘

#![allow(unused)]
fn main() {
// We select everything in normal order
// Then we select everything in reversed order

df.clone().lazy().select([
    all(),
    all().reverse().suffix("_reverse")
]).collect()?
}
shape: (5, 10)
┌─────┬────────┬─────┬────────┬─────┬────────────────┬───────────┬──────────────┬──────┐
│ A   ┆ fruits ┆ B   ┆ cars   ┆ ... ┆ fruits_reverse ┆ B_reverse ┆ cars_reverse ┆ opti │
│ --- ┆ ---    ┆ --- ┆ ---    ┆     ┆ ---            ┆ ---       ┆ ---          ┆ onal │
│ i32 ┆ str    ┆ i32 ┆ str    ┆     ┆ str            ┆ i32       ┆ str          ┆ _rev │
│     ┆        ┆     ┆        ┆     ┆                ┆           ┆              ┆ erse │
│     ┆        ┆     ┆        ┆     ┆                ┆           ┆              ┆ ---  │
│     ┆        ┆     ┆        ┆     ┆                ┆           ┆              ┆ i32  │
╞═════╪════════╪═════╪════════╪═════╪════════════════╪═══════════╪══════════════╪══════╡
│ 1   ┆ banana ┆ 5   ┆ beetle ┆ ... ┆ banana         ┆ 1         ┆ beetle       ┆ -30  │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
│ 2   ┆ banana ┆ 4   ┆ audi   ┆ ... ┆ apple          ┆ 2         ┆ beetle       ┆ 2    │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
│ 3   ┆ apple  ┆ 3   ┆ beetle ┆ ... ┆ apple          ┆ 3         ┆ beetle       ┆ null │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
│ 4   ┆ apple  ┆ 2   ┆ beetle ┆ ... ┆ banana         ┆ 4         ┆ audi         ┆ 300  │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
│ 5   ┆ banana ┆ 1   ┆ beetle ┆ ... ┆ banana         ┆ 5         ┆ beetle       ┆ 28   │
└─────┴────────┴─────┴────────┴─────┴────────────────┴───────────┴──────────────┴──────┘

#![allow(unused)]
fn main() {
// all expressions run in parallel
// single valued `Series` are broadcasted to the shape of the `DataFrame`

df.clone().lazy().select([
    all(),
    all().sum().suffix("_sum")
]).collect()?
}
shape: (5, 10)
┌─────┬────────┬─────┬────────┬─────┬────────────┬───────┬──────────┬──────────────┐
│ A   ┆ fruits ┆ B   ┆ cars   ┆ ... ┆ fruits_sum ┆ B_sum ┆ cars_sum ┆ optional_sum │
│ --- ┆ ---    ┆ --- ┆ ---    ┆     ┆ ---        ┆ ---   ┆ ---      ┆ ---          │
│ i32 ┆ str    ┆ i32 ┆ str    ┆     ┆ str        ┆ i32   ┆ str      ┆ i32          │
╞═════╪════════╪═════╪════════╪═════╪════════════╪═══════╪══════════╪══════════════╡
│ 1   ┆ banana ┆ 5   ┆ beetle ┆ ... ┆ null       ┆ 15    ┆ null     ┆ 300          │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 2   ┆ banana ┆ 4   ┆ audi   ┆ ... ┆ null       ┆ 15    ┆ null     ┆ 300          │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 3   ┆ apple  ┆ 3   ┆ beetle ┆ ... ┆ null       ┆ 15    ┆ null     ┆ 300          │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 4   ┆ apple  ┆ 2   ┆ beetle ┆ ... ┆ null       ┆ 15    ┆ null     ┆ 300          │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 5   ┆ banana ┆ 1   ┆ beetle ┆ ... ┆ null       ┆ 15    ┆ null     ┆ 300          │
└─────┴────────┴─────┴────────┴─────┴────────────┴───────┴──────────┴──────────────┘

#![allow(unused)]
fn main() {
// there are `str` and `dt` namespaces for specialized functions

let predicate = col("fruits").str().contains("^b.*");

df.clone().lazy().select([
    predicate
]).collect()?
}
shape: (5, 1)
┌────────┐
│ fruits │
│ ---    │
│ bool   │
╞════════╡
│ true   │
├╌╌╌╌╌╌╌╌┤
│ true   │
├╌╌╌╌╌╌╌╌┤
│ false  │
├╌╌╌╌╌╌╌╌┤
│ false  │
├╌╌╌╌╌╌╌╌┤
│ true   │
└────────┘

#![allow(unused)]
fn main() {
// use the predicate to filter
let predicate = col("fruits").str().contains("^b.*");
df.clone().lazy().filter(predicate).collect()?
}
shape: (3, 5)
┌─────┬────────┬─────┬────────┬──────────┐
│ A   ┆ fruits ┆ B   ┆ cars   ┆ optional │
│ --- ┆ ---    ┆ --- ┆ ---    ┆ ---      │
│ i32 ┆ str    ┆ i32 ┆ str    ┆ i32      │
╞═════╪════════╪═════╪════════╪══════════╡
│ 1   ┆ banana ┆ 5   ┆ beetle ┆ 28       │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 2   ┆ banana ┆ 4   ┆ audi   ┆ 300      │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 5   ┆ banana ┆ 1   ┆ beetle ┆ -30      │
└─────┴────────┴─────┴────────┴──────────┘

#![allow(unused)]
fn main() {
// predicate expressions can be used to filter

df.clone().lazy().select([
    col("A").filter(col("fruits").str().contains("^b.*")).sum(),
    (col("B").filter(col("cars").str().contains("^b.*")).sum() * col("B").sum()).alias("some_compute()"),
]).collect()?
}
shape: (1, 2)
┌─────┬────────────────┐
│ A   ┆ some_compute() │
│ --- ┆ ---            │
│ i32 ┆ i32            │
╞═════╪════════════════╡
│ 8   ┆ 165            │
└─────┴────────────────┘

#![allow(unused)]
fn main() {
// We can do arithmetic on columns and (literal) values
// can be evaluated to 1 without programmer knowing

let some_var = 1;

df.clone().lazy().select([
    ((col("A") / lit(124.0) * col("B")) / sum("B") * lit(some_var)).alias("computed")
]).collect()?
}
shape: (5, 1)
┌──────────┐
│ computed │
│ ---      │
│ f64      │
╞══════════╡
│ 0.002688 │
├╌╌╌╌╌╌╌╌╌╌┤
│ 0.004301 │
├╌╌╌╌╌╌╌╌╌╌┤
│ 0.004839 │
├╌╌╌╌╌╌╌╌╌╌┤
│ 0.004301 │
├╌╌╌╌╌╌╌╌╌╌┤
│ 0.002688 │
└──────────┘

#![allow(unused)]
fn main() {
// We can combine columns by a predicate
// This doesn't work.  It seems like the condition always evaluates to true
df.clone().lazy().select([
    col("fruits"),
    col("B"),
    when(col("fruits") == lit("banana")).then(col("B")).otherwise(-1).alias("b when not banana")
]).collect()?
}
shape: (5, 3)
┌────────┬─────┬───────────────────┐
│ fruits ┆ B   ┆ b when not banana │
│ ---    ┆ --- ┆ ---               │
│ str    ┆ i32 ┆ i32               │
╞════════╪═════╪═══════════════════╡
│ banana ┆ 5   ┆ -1                │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ banana ┆ 4   ┆ -1                │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ apple  ┆ 3   ┆ -1                │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ apple  ┆ 2   ┆ -1                │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ banana ┆ 1   ┆ -1                │
└────────┴─────┴───────────────────┘

#![allow(unused)]
fn main() {
// We can combine columns by a fold operation on column level

df.clone().lazy().select([
    col("A"),
    col("B"),
    fold_exprs(lit(0), |a, b| Ok(&a + &b), [
        col("A"),
        lit("B"),
        col("B").pow(lit(2)),
        col("A") / lit(2.0)
    ]).alias("fold")
]).collect()?
}
shape: (5, 3)
┌─────┬─────┬───────────┐
│ A   ┆ B   ┆ fold      │
│ --- ┆ --- ┆ ---       │
│ i32 ┆ i32 ┆ str       │
╞═════╪═════╪═══════════╡
│ 1   ┆ 5   ┆ 1B25.00.5 │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 2   ┆ 4   ┆ 2B16.01.0 │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 3   ┆ 3   ┆ 3B9.01.5  │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 4   ┆ 2   ┆ 4B4.02.0  │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 5   ┆ 1   ┆ 5B1.02.5  │
└─────┴─────┴───────────┘

#![allow(unused)]
fn main() {
// even combine all
use std::convert::TryInto;
let height: i32 = df.height().try_into()?;
df.clone().lazy().select([
    range(0i32, height).alias("idx"),
    col("A"),
    col("A").shift(1).alias("A_shifted"),
    concat_str([all()], "").alias("str_concat_1"),  // prefer this
    fold_exprs(col("A"), |a, b| Ok(a + b), [all().exclude(["A"])]).alias("str_concat_2"), // over this (accidentally O(n^2))
]).collect()?
}
shape: (5, 5)
┌─────┬─────┬───────────┬───────────────────┬───────────────────┐
│ idx ┆ A   ┆ A_shifted ┆ str_concat_1      ┆ str_concat_2      │
│ --- ┆ --- ┆ ---       ┆ ---               ┆ ---               │
│ i32 ┆ i32 ┆ i32       ┆ str               ┆ str               │
╞═════╪═════╪═══════════╪═══════════════════╪═══════════════════╡
│ 0   ┆ 1   ┆ null      ┆ 1banana5beetle28  ┆ 1banana5beetle28  │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 1   ┆ 2   ┆ 1         ┆ 2banana4audi300   ┆ 2banana4audi300   │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 2   ┆ 3   ┆ 2         ┆ null              ┆ null              │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 3   ┆ 4   ┆ 3         ┆ 4apple2beetle2    ┆ 4apple2beetle2    │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 4   ┆ 5   ┆ 4         ┆ 5banana1beetle-30 ┆ 5banana1beetle-30 │
└─────┴─────┴───────────┴───────────────────┴───────────────────┘

Aggregation context

  • expressions are applied over groups instead of columns

#![allow(unused)]
fn main() {
// we can still combine many expressions

df.clone().lazy().sort("cars", SortOptions::default()).groupby(["fruits"])
    .agg([
        col("B").sum().alias("B_sum"),
        sum("B").alias("B_sum2"),  // syntactic sugar for the first
        col("fruits").first().alias("fruits_first"),
        col("A").count().alias("count"),
        col("cars").reverse()
    ]).collect()?
}
shape: (2, 6)
┌────────┬───────┬────────┬──────────────┬───────┬──────────────────────────────┐
│ fruits ┆ B_sum ┆ B_sum2 ┆ fruits_first ┆ count ┆ cars                         │
│ ---    ┆ ---   ┆ ---    ┆ ---          ┆ ---   ┆ ---                          │
│ str    ┆ i32   ┆ i32    ┆ str          ┆ u32   ┆ list[str]                    │
╞════════╪═══════╪════════╪══════════════╪═══════╪══════════════════════════════╡
│ apple  ┆ 5     ┆ 5      ┆ apple        ┆ 2     ┆ ["beetle", "beetle"]         │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ banana ┆ 10    ┆ 10     ┆ banana       ┆ 3     ┆ ["beetle", "beetle", "audi"] │
└────────┴───────┴────────┴──────────────┴───────┴──────────────────────────────┘

#![allow(unused)]
fn main() {
// We can explode the list column "cars"

df.clone().lazy()
    .sort("cars", SortOptions { descending: false, nulls_last: false })
    .groupby(["fruits"])
    .agg([
        col("B").sum().alias("B_sum"),
        sum("B").alias("B_sum2"),  // syntactic sugar for the first
        col("fruits").first().alias("fruits_first"),
        col("A").count().alias("count"),
        col("cars").reverse()
    ])
    .explode(["cars"])
    .collect()?
}
shape: (5, 6)
┌────────┬───────┬────────┬──────────────┬───────┬────────┐
│ fruits ┆ B_sum ┆ B_sum2 ┆ fruits_first ┆ count ┆ cars   │
│ ---    ┆ ---   ┆ ---    ┆ ---          ┆ ---   ┆ ---    │
│ str    ┆ i32   ┆ i32    ┆ str          ┆ u32   ┆ str    │
╞════════╪═══════╪════════╪══════════════╪═══════╪════════╡
│ banana ┆ 10    ┆ 10     ┆ banana       ┆ 3     ┆ beetle │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
│ banana ┆ 10    ┆ 10     ┆ banana       ┆ 3     ┆ beetle │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
│ banana ┆ 10    ┆ 10     ┆ banana       ┆ 3     ┆ audi   │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
│ apple  ┆ 5     ┆ 5      ┆ apple        ┆ 2     ┆ beetle │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
│ apple  ┆ 5     ┆ 5      ┆ apple        ┆ 2     ┆ beetle │
└────────┴───────┴────────┴──────────────┴───────┴────────┘

#![allow(unused)]
fn main() {
df.clone().lazy()
    .groupby(["fruits"])
    .agg([
        col("B").sum().alias("B_sum"),
        col("fruits").first().alias("fruits_first"),
        count(),
        col("B").shift(1).alias("B_shifted")
    ])
    .explode(["B_shifted"])
    .collect()?
}
shape: (5, 5)
┌────────┬───────┬──────────────┬───────┬───────────┐
│ fruits ┆ B_sum ┆ fruits_first ┆ count ┆ B_shifted │
│ ---    ┆ ---   ┆ ---          ┆ ---   ┆ ---       │
│ str    ┆ i32   ┆ str          ┆ u32   ┆ i32       │
╞════════╪═══════╪══════════════╪═══════╪═══════════╡
│ apple  ┆ 5     ┆ apple        ┆ 2     ┆ null      │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ apple  ┆ 5     ┆ apple        ┆ 2     ┆ 3         │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ banana ┆ 10    ┆ banana       ┆ 3     ┆ null      │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ banana ┆ 10    ┆ banana       ┆ 3     ┆ 5         │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ banana ┆ 10    ┆ banana       ┆ 3     ┆ 4         │
└────────┴───────┴──────────────┴───────┴───────────┘

#![allow(unused)]
fn main() {
// we can also get the list of the groups

df.clone().lazy()
    .groupby(["fruits"])
    .agg([
        col("B").shift(1).alias("shift_B"),
        col("B").reverse().alias("rev_B"),
    ])
    .collect()?
}
shape: (2, 3)
┌────────┬──────────────┬───────────┐
│ fruits ┆ shift_B      ┆ rev_B     │
│ ---    ┆ ---          ┆ ---       │
│ str    ┆ list[i32]    ┆ list[i32] │
╞════════╪══════════════╪═══════════╡
│ banana ┆ [null, 5, 4] ┆ [1, 4, 5] │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ apple  ┆ [null, 3]    ┆ [2, 3]    │
└────────┴──────────────┴───────────┘

#![allow(unused)]
fn main() {
// we can do predicates in the groupby as well

df.clone().lazy()
    .groupby(["fruits"])
    .agg([
        col("B").filter(col("B").gt(lit(1))).list().keep_name(),
    ])
    .collect()?
}
shape: (2, 2)
┌────────┬───────────┐
│ fruits ┆ B         │
│ ---    ┆ ---       │
│ str    ┆ list[i32] │
╞════════╪═══════════╡
│ apple  ┆ [3, 2]    │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ banana ┆ [5, 4]    │
└────────┴───────────┘

#![allow(unused)]
fn main() {
// and sum only by the values where the predicates are true

df.clone().lazy()
    .groupby(["fruits"])
    .agg([
        col("B").filter(col("B").gt(lit(1))).mean(),
    ])
    .collect()?
}
shape: (2, 2)
┌────────┬─────┐
│ fruits ┆ B   │
│ ---    ┆ --- │
│ str    ┆ f64 │
╞════════╪═════╡
│ apple  ┆ 2.5 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
│ banana ┆ 4.5 │
└────────┴─────┘

#![allow(unused)]
fn main() {
// Another example

df.clone().lazy()
    .groupby(["fruits"])
    .agg([
        col("B").shift_and_fill(1, 0).alias("shifted"),
        col("B").shift_and_fill(1, 0).sum().alias("shifted_sum"),
    ])
    .collect()?
}
shape: (2, 3)
┌────────┬───────────┬─────────────┐
│ fruits ┆ shifted   ┆ shifted_sum │
│ ---    ┆ ---       ┆ ---         │
│ str    ┆ list[i32] ┆ i32         │
╞════════╪═══════════╪═════════════╡
│ apple  ┆ [0, 3]    ┆ 3           │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ banana ┆ [0, 5, 4] ┆ 9           │
└────────┴───────────┴─────────────┘

Window functions!

  • Expression with superpowers.
  • Aggregation in selection context

#![allow(unused)]
fn main() {
col("foo").aggregation_expression(..).over("column_used_to_group")
}

#![allow(unused)]
fn main() {
// groupby 2 different columns

df.clone().lazy()
    .select([
        col("fruits"),
        col("cars"),
        col("B"),
        col("B").sum().over(["fruits"]).alias("B_sum_by_fruits"),
        col("B").sum().over(["cars"]).alias("B_sum_by_cars"),
    ])
    .collect()?
}
shape: (5, 5)
┌────────┬────────┬─────┬─────────────────┬───────────────┐
│ fruits ┆ cars   ┆ B   ┆ B_sum_by_fruits ┆ B_sum_by_cars │
│ ---    ┆ ---    ┆ --- ┆ ---             ┆ ---           │
│ str    ┆ str    ┆ i32 ┆ i32             ┆ i32           │
╞════════╪════════╪═════╪═════════════════╪═══════════════╡
│ banana ┆ beetle ┆ 5   ┆ 10              ┆ 11            │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ banana ┆ audi   ┆ 4   ┆ 10              ┆ 4             │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ apple  ┆ beetle ┆ 3   ┆ 5               ┆ 11            │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ apple  ┆ beetle ┆ 2   ┆ 5               ┆ 11            │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ banana ┆ beetle ┆ 1   ┆ 10              ┆ 11            │
└────────┴────────┴─────┴─────────────────┴───────────────┘

#![allow(unused)]
fn main() {
// reverse B by groups and show the results in original DF

df.clone().lazy()
    .select([
        col("fruits"),
        col("B"),
        col("B").reverse().over(["fruits"]).alias("B_reversed_by_fruits")
    ])
    .collect()?
}
shape: (5, 3)
┌────────┬─────┬──────────────────────┐
│ fruits ┆ B   ┆ B_reversed_by_fruits │
│ ---    ┆ --- ┆ ---                  │
│ str    ┆ i32 ┆ i32                  │
╞════════╪═════╪══════════════════════╡
│ banana ┆ 5   ┆ 1                    │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ banana ┆ 4   ┆ 4                    │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ apple  ┆ 3   ┆ 2                    │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ apple  ┆ 2   ┆ 3                    │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ banana ┆ 1   ┆ 5                    │
└────────┴─────┴──────────────────────┘

#![allow(unused)]
fn main() {
// Lag a column within "fruits"

df.clone().lazy()
    .select([
        col("fruits"),
        col("B"),
        col("B").shift(1).over(["fruits"]).alias("lag_B_by_fruits")
    ])
    .collect()?
}
shape: (5, 3)
┌────────┬─────┬─────────────────┐
│ fruits ┆ B   ┆ lag_B_by_fruits │
│ ---    ┆ --- ┆ ---             │
│ str    ┆ i32 ┆ i32             │
╞════════╪═════╪═════════════════╡
│ banana ┆ 5   ┆ null            │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ banana ┆ 4   ┆ 5               │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ apple  ┆ 3   ┆ null            │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ apple  ┆ 2   ┆ 3               │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ banana ┆ 1   ┆ 4               │
└────────┴─────┴─────────────────┘

#![allow(unused)]

fn main() {
}