Note: To use this notebook, you must first install the Rust
excvr
jupyter kernel. Also, note thatclone()
is used fairly often in the examples. This is because we tend to create one dataset for multiple examples. When this dataset is used, the rust ownership system willmove
that dataframe, which will make it unavailable to later examples. Byclone
ing, we can keep using it over and over.
:dep polars = { version = "0.23.2", features = ["lazy", "csv-file", "strings", "temporal", "dtype-duration", "dtype-categorical", "concat_str", "list", "list_eval", "rank", "lazy_regex"]}
:dep color-eyre = {version = "0.6.2"}
:dep rand = {version = "0.8.5"}
:dep reqwest = { version = "0.11.11", features = ["blocking"]}
use color_eyre::{Result};
use polars::prelude::*;
Expressions
fn(Series) -> Series
- Lazily evaluated
- Can be optimized
- Gives the library writer context and informed decisions can be made
- Embarrassingly parallel
- Context dependent
- selection/projection ->
Series
= COLUMN, LITERAL, or VALUE - aggregation ->
Series
= GROUPS
- selection/projection ->
let df = df! [
"A" => [1, 2, 3, 4, 5],
"fruits" => ["banana", "banana", "apple", "apple", "banana"],
"B" => [5, 4, 3, 2, 1],
"cars" => ["beetle", "audi", "beetle", "beetle", "beetle"],
"optional" => [Some(28), Some(300), None, Some(2), Some(-30)],
]?;
df
shape: (5, 5)
┌─────┬────────┬─────┬────────┬──────────┐
│ A ┆ fruits ┆ B ┆ cars ┆ optional │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ i32 ┆ str ┆ i32 ┆ str ┆ i32 │
╞═════╪════════╪═════╪════════╪══════════╡
│ 1 ┆ banana ┆ 5 ┆ beetle ┆ 28 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 2 ┆ banana ┆ 4 ┆ audi ┆ 300 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 3 ┆ apple ┆ 3 ┆ beetle ┆ null │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 4 ┆ apple ┆ 2 ┆ beetle ┆ 2 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 5 ┆ banana ┆ 1 ┆ beetle ┆ -30 │
└─────┴────────┴─────┴────────┴──────────┘
Selection context
// We can select by name
// We'll be re-using the dataframe a bunch, so we'll clone copies as we go.
df.clone().lazy().select([
col("A"),
col("B"),
lit("B"), // we must tell polars we mean the literal "B"
col("fruits"),
]).collect()?
shape: (5, 4)
┌─────┬─────┬─────────┬────────┐
│ A ┆ B ┆ literal ┆ fruits │
│ --- ┆ --- ┆ --- ┆ --- │
│ i32 ┆ i32 ┆ str ┆ str │
╞═════╪═════╪═════════╪════════╡
│ 1 ┆ 5 ┆ B ┆ banana │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
│ 2 ┆ 4 ┆ B ┆ banana │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
│ 3 ┆ 3 ┆ B ┆ apple │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
│ 4 ┆ 2 ┆ B ┆ apple │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
│ 5 ┆ 1 ┆ B ┆ banana │
└─────┴─────┴─────────┴────────┘
// you can select columns with a regex if it starts with '^' and ends with '$'
df.clone().lazy().select([
col("^A|B$").sum()
]).collect()?
shape: (1, 2)
┌─────┬─────┐
│ A ┆ B │
│ --- ┆ --- │
│ i32 ┆ i32 │
╞═════╪═════╡
│ 15 ┆ 15 │
└─────┴─────┘
// you can select multiple columns by name
df.clone().lazy().select([
cols(["A", "B"]).sum()
]).collect()?
shape: (1, 2)
┌─────┬─────┐
│ A ┆ B │
│ --- ┆ --- │
│ i32 ┆ i32 │
╞═════╪═════╡
│ 15 ┆ 15 │
└─────┴─────┘
// We select everything in normal order
// Then we select everything in reversed order
df.clone().lazy().select([
all(),
all().reverse().suffix("_reverse")
]).collect()?
shape: (5, 10)
┌─────┬────────┬─────┬────────┬─────┬────────────────┬───────────┬──────────────┬──────┐
│ A ┆ fruits ┆ B ┆ cars ┆ ... ┆ fruits_reverse ┆ B_reverse ┆ cars_reverse ┆ opti │
│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ onal │
│ i32 ┆ str ┆ i32 ┆ str ┆ ┆ str ┆ i32 ┆ str ┆ _rev │
│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ erse │
│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ --- │
│ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ i32 │
╞═════╪════════╪═════╪════════╪═════╪════════════════╪═══════════╪══════════════╪══════╡
│ 1 ┆ banana ┆ 5 ┆ beetle ┆ ... ┆ banana ┆ 1 ┆ beetle ┆ -30 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
│ 2 ┆ banana ┆ 4 ┆ audi ┆ ... ┆ apple ┆ 2 ┆ beetle ┆ 2 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
│ 3 ┆ apple ┆ 3 ┆ beetle ┆ ... ┆ apple ┆ 3 ┆ beetle ┆ null │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
│ 4 ┆ apple ┆ 2 ┆ beetle ┆ ... ┆ banana ┆ 4 ┆ audi ┆ 300 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
│ 5 ┆ banana ┆ 1 ┆ beetle ┆ ... ┆ banana ┆ 5 ┆ beetle ┆ 28 │
└─────┴────────┴─────┴────────┴─────┴────────────────┴───────────┴──────────────┴──────┘
// all expressions run in parallel
// single valued `Series` are broadcasted to the shape of the `DataFrame`
df.clone().lazy().select([
all(),
all().sum().suffix("_sum")
]).collect()?
shape: (5, 10)
┌─────┬────────┬─────┬────────┬─────┬────────────┬───────┬──────────┬──────────────┐
│ A ┆ fruits ┆ B ┆ cars ┆ ... ┆ fruits_sum ┆ B_sum ┆ cars_sum ┆ optional_sum │
│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │
│ i32 ┆ str ┆ i32 ┆ str ┆ ┆ str ┆ i32 ┆ str ┆ i32 │
╞═════╪════════╪═════╪════════╪═════╪════════════╪═══════╪══════════╪══════════════╡
│ 1 ┆ banana ┆ 5 ┆ beetle ┆ ... ┆ null ┆ 15 ┆ null ┆ 300 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 2 ┆ banana ┆ 4 ┆ audi ┆ ... ┆ null ┆ 15 ┆ null ┆ 300 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 3 ┆ apple ┆ 3 ┆ beetle ┆ ... ┆ null ┆ 15 ┆ null ┆ 300 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 4 ┆ apple ┆ 2 ┆ beetle ┆ ... ┆ null ┆ 15 ┆ null ┆ 300 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 5 ┆ banana ┆ 1 ┆ beetle ┆ ... ┆ null ┆ 15 ┆ null ┆ 300 │
└─────┴────────┴─────┴────────┴─────┴────────────┴───────┴──────────┴──────────────┘
// there are `str` and `dt` namespaces for specialized functions
let predicate = col("fruits").str().contains("^b.*");
df.clone().lazy().select([
predicate
]).collect()?
shape: (5, 1)
┌────────┐
│ fruits │
│ --- │
│ bool │
╞════════╡
│ true │
├╌╌╌╌╌╌╌╌┤
│ true │
├╌╌╌╌╌╌╌╌┤
│ false │
├╌╌╌╌╌╌╌╌┤
│ false │
├╌╌╌╌╌╌╌╌┤
│ true │
└────────┘
// use the predicate to filter
let predicate = col("fruits").str().contains("^b.*");
df.clone().lazy().filter(predicate).collect()?
shape: (3, 5)
┌─────┬────────┬─────┬────────┬──────────┐
│ A ┆ fruits ┆ B ┆ cars ┆ optional │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ i32 ┆ str ┆ i32 ┆ str ┆ i32 │
╞═════╪════════╪═════╪════════╪══════════╡
│ 1 ┆ banana ┆ 5 ┆ beetle ┆ 28 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 2 ┆ banana ┆ 4 ┆ audi ┆ 300 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 5 ┆ banana ┆ 1 ┆ beetle ┆ -30 │
└─────┴────────┴─────┴────────┴──────────┘
// predicate expressions can be used to filter
df.clone().lazy().select([
col("A").filter(col("fruits").str().contains("^b.*")).sum(),
(col("B").filter(col("cars").str().contains("^b.*")).sum() * col("B").sum()).alias("some_compute()"),
]).collect()?
shape: (1, 2)
┌─────┬────────────────┐
│ A ┆ some_compute() │
│ --- ┆ --- │
│ i32 ┆ i32 │
╞═════╪════════════════╡
│ 8 ┆ 165 │
└─────┴────────────────┘
// We can do arithmetic on columns and (literal) values
// can be evaluated to 1 without programmer knowing
let some_var = 1;
df.clone().lazy().select([
((col("A") / lit(124.0) * col("B")) / sum("B") * lit(some_var)).alias("computed")
]).collect()?
shape: (5, 1)
┌──────────┐
│ computed │
│ --- │
│ f64 │
╞══════════╡
│ 0.002688 │
├╌╌╌╌╌╌╌╌╌╌┤
│ 0.004301 │
├╌╌╌╌╌╌╌╌╌╌┤
│ 0.004839 │
├╌╌╌╌╌╌╌╌╌╌┤
│ 0.004301 │
├╌╌╌╌╌╌╌╌╌╌┤
│ 0.002688 │
└──────────┘
// We can combine columns by a predicate
// This doesn't work. It seems like the condition always evaluates to true
df.clone().lazy().select([
col("fruits"),
col("B"),
when(col("fruits") == lit("banana")).then(col("B")).otherwise(-1).alias("b when not banana")
]).collect()?
shape: (5, 3)
┌────────┬─────┬───────────────────┐
│ fruits ┆ B ┆ b when not banana │
│ --- ┆ --- ┆ --- │
│ str ┆ i32 ┆ i32 │
╞════════╪═════╪═══════════════════╡
│ banana ┆ 5 ┆ -1 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ banana ┆ 4 ┆ -1 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ apple ┆ 3 ┆ -1 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ apple ┆ 2 ┆ -1 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ banana ┆ 1 ┆ -1 │
└────────┴─────┴───────────────────┘
// We can combine columns by a fold operation on column level
df.clone().lazy().select([
col("A"),
col("B"),
fold_exprs(lit(0), |a, b| Ok(&a + &b), [
col("A"),
lit("B"),
col("B").pow(lit(2)),
col("A") / lit(2.0)
]).alias("fold")
]).collect()?
shape: (5, 3)
┌─────┬─────┬───────────┐
│ A ┆ B ┆ fold │
│ --- ┆ --- ┆ --- │
│ i32 ┆ i32 ┆ str │
╞═════╪═════╪═══════════╡
│ 1 ┆ 5 ┆ 1B25.00.5 │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 2 ┆ 4 ┆ 2B16.01.0 │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 3 ┆ 3 ┆ 3B9.01.5 │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 4 ┆ 2 ┆ 4B4.02.0 │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 5 ┆ 1 ┆ 5B1.02.5 │
└─────┴─────┴───────────┘
// even combine all
use std::convert::TryInto;
let height: i32 = df.height().try_into()?;
df.clone().lazy().select([
range(0i32, height).alias("idx"),
col("A"),
col("A").shift(1).alias("A_shifted"),
concat_str([all()], "").alias("str_concat_1"), // prefer this
fold_exprs(col("A"), |a, b| Ok(a + b), [all().exclude(["A"])]).alias("str_concat_2"), // over this (accidentally O(n^2))
]).collect()?
shape: (5, 5)
┌─────┬─────┬───────────┬───────────────────┬───────────────────┐
│ idx ┆ A ┆ A_shifted ┆ str_concat_1 ┆ str_concat_2 │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ i32 ┆ i32 ┆ i32 ┆ str ┆ str │
╞═════╪═════╪═══════════╪═══════════════════╪═══════════════════╡
│ 0 ┆ 1 ┆ null ┆ 1banana5beetle28 ┆ 1banana5beetle28 │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 1 ┆ 2 ┆ 1 ┆ 2banana4audi300 ┆ 2banana4audi300 │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 2 ┆ 3 ┆ 2 ┆ null ┆ null │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 3 ┆ 4 ┆ 3 ┆ 4apple2beetle2 ┆ 4apple2beetle2 │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 4 ┆ 5 ┆ 4 ┆ 5banana1beetle-30 ┆ 5banana1beetle-30 │
└─────┴─────┴───────────┴───────────────────┴───────────────────┘
Aggregation context
- expressions are applied over groups instead of columns
// we can still combine many expressions
df.clone().lazy().sort("cars", SortOptions::default()).groupby(["fruits"])
.agg([
col("B").sum().alias("B_sum"),
sum("B").alias("B_sum2"), // syntactic sugar for the first
col("fruits").first().alias("fruits_first"),
col("A").count().alias("count"),
col("cars").reverse()
]).collect()?
shape: (2, 6)
┌────────┬───────┬────────┬──────────────┬───────┬──────────────────────────────┐
│ fruits ┆ B_sum ┆ B_sum2 ┆ fruits_first ┆ count ┆ cars │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ i32 ┆ i32 ┆ str ┆ u32 ┆ list[str] │
╞════════╪═══════╪════════╪══════════════╪═══════╪══════════════════════════════╡
│ apple ┆ 5 ┆ 5 ┆ apple ┆ 2 ┆ ["beetle", "beetle"] │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ banana ┆ 10 ┆ 10 ┆ banana ┆ 3 ┆ ["beetle", "beetle", "audi"] │
└────────┴───────┴────────┴──────────────┴───────┴──────────────────────────────┘
// We can explode the list column "cars"
df.clone().lazy()
.sort("cars", SortOptions { descending: false, nulls_last: false })
.groupby(["fruits"])
.agg([
col("B").sum().alias("B_sum"),
sum("B").alias("B_sum2"), // syntactic sugar for the first
col("fruits").first().alias("fruits_first"),
col("A").count().alias("count"),
col("cars").reverse()
])
.explode(["cars"])
.collect()?
shape: (5, 6)
┌────────┬───────┬────────┬──────────────┬───────┬────────┐
│ fruits ┆ B_sum ┆ B_sum2 ┆ fruits_first ┆ count ┆ cars │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ i32 ┆ i32 ┆ str ┆ u32 ┆ str │
╞════════╪═══════╪════════╪══════════════╪═══════╪════════╡
│ banana ┆ 10 ┆ 10 ┆ banana ┆ 3 ┆ beetle │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
│ banana ┆ 10 ┆ 10 ┆ banana ┆ 3 ┆ beetle │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
│ banana ┆ 10 ┆ 10 ┆ banana ┆ 3 ┆ audi │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
│ apple ┆ 5 ┆ 5 ┆ apple ┆ 2 ┆ beetle │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┤
│ apple ┆ 5 ┆ 5 ┆ apple ┆ 2 ┆ beetle │
└────────┴───────┴────────┴──────────────┴───────┴────────┘
df.clone().lazy()
.groupby(["fruits"])
.agg([
col("B").sum().alias("B_sum"),
col("fruits").first().alias("fruits_first"),
count(),
col("B").shift(1).alias("B_shifted")
])
.explode(["B_shifted"])
.collect()?
shape: (5, 5)
┌────────┬───────┬──────────────┬───────┬───────────┐
│ fruits ┆ B_sum ┆ fruits_first ┆ count ┆ B_shifted │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ i32 ┆ str ┆ u32 ┆ i32 │
╞════════╪═══════╪══════════════╪═══════╪═══════════╡
│ apple ┆ 5 ┆ apple ┆ 2 ┆ null │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ apple ┆ 5 ┆ apple ┆ 2 ┆ 3 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ banana ┆ 10 ┆ banana ┆ 3 ┆ null │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ banana ┆ 10 ┆ banana ┆ 3 ┆ 5 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ banana ┆ 10 ┆ banana ┆ 3 ┆ 4 │
└────────┴───────┴──────────────┴───────┴───────────┘
// we can also get the list of the groups
df.clone().lazy()
.groupby(["fruits"])
.agg([
col("B").shift(1).alias("shift_B"),
col("B").reverse().alias("rev_B"),
])
.collect()?
shape: (2, 3)
┌────────┬──────────────┬───────────┐
│ fruits ┆ shift_B ┆ rev_B │
│ --- ┆ --- ┆ --- │
│ str ┆ list[i32] ┆ list[i32] │
╞════════╪══════════════╪═══════════╡
│ banana ┆ [null, 5, 4] ┆ [1, 4, 5] │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ apple ┆ [null, 3] ┆ [2, 3] │
└────────┴──────────────┴───────────┘
// we can do predicates in the groupby as well
df.clone().lazy()
.groupby(["fruits"])
.agg([
col("B").filter(col("B").gt(lit(1))).list().keep_name(),
])
.collect()?
shape: (2, 2)
┌────────┬───────────┐
│ fruits ┆ B │
│ --- ┆ --- │
│ str ┆ list[i32] │
╞════════╪═══════════╡
│ apple ┆ [3, 2] │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ banana ┆ [5, 4] │
└────────┴───────────┘
// and sum only by the values where the predicates are true
df.clone().lazy()
.groupby(["fruits"])
.agg([
col("B").filter(col("B").gt(lit(1))).mean(),
])
.collect()?
shape: (2, 2)
┌────────┬─────┐
│ fruits ┆ B │
│ --- ┆ --- │
│ str ┆ f64 │
╞════════╪═════╡
│ apple ┆ 2.5 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
│ banana ┆ 4.5 │
└────────┴─────┘
// Another example
df.clone().lazy()
.groupby(["fruits"])
.agg([
col("B").shift_and_fill(1, 0).alias("shifted"),
col("B").shift_and_fill(1, 0).sum().alias("shifted_sum"),
])
.collect()?
shape: (2, 3)
┌────────┬───────────┬─────────────┐
│ fruits ┆ shifted ┆ shifted_sum │
│ --- ┆ --- ┆ --- │
│ str ┆ list[i32] ┆ i32 │
╞════════╪═══════════╪═════════════╡
│ apple ┆ [0, 3] ┆ 3 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ banana ┆ [0, 5, 4] ┆ 9 │
└────────┴───────────┴─────────────┘
Window functions!
- Expression with superpowers.
- Aggregation in selection context
#![allow(unused)] fn main() { col("foo").aggregation_expression(..).over("column_used_to_group") }
// groupby 2 different columns
df.clone().lazy()
.select([
col("fruits"),
col("cars"),
col("B"),
col("B").sum().over(["fruits"]).alias("B_sum_by_fruits"),
col("B").sum().over(["cars"]).alias("B_sum_by_cars"),
])
.collect()?
shape: (5, 5)
┌────────┬────────┬─────┬─────────────────┬───────────────┐
│ fruits ┆ cars ┆ B ┆ B_sum_by_fruits ┆ B_sum_by_cars │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ str ┆ i32 ┆ i32 ┆ i32 │
╞════════╪════════╪═════╪═════════════════╪═══════════════╡
│ banana ┆ beetle ┆ 5 ┆ 10 ┆ 11 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ banana ┆ audi ┆ 4 ┆ 10 ┆ 4 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ apple ┆ beetle ┆ 3 ┆ 5 ┆ 11 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ apple ┆ beetle ┆ 2 ┆ 5 ┆ 11 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ banana ┆ beetle ┆ 1 ┆ 10 ┆ 11 │
└────────┴────────┴─────┴─────────────────┴───────────────┘
// reverse B by groups and show the results in original DF
df.clone().lazy()
.select([
col("fruits"),
col("B"),
col("B").reverse().over(["fruits"]).alias("B_reversed_by_fruits")
])
.collect()?
shape: (5, 3)
┌────────┬─────┬──────────────────────┐
│ fruits ┆ B ┆ B_reversed_by_fruits │
│ --- ┆ --- ┆ --- │
│ str ┆ i32 ┆ i32 │
╞════════╪═════╪══════════════════════╡
│ banana ┆ 5 ┆ 1 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ banana ┆ 4 ┆ 4 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ apple ┆ 3 ┆ 2 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ apple ┆ 2 ┆ 3 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ banana ┆ 1 ┆ 5 │
└────────┴─────┴──────────────────────┘
// Lag a column within "fruits"
df.clone().lazy()
.select([
col("fruits"),
col("B"),
col("B").shift(1).over(["fruits"]).alias("lag_B_by_fruits")
])
.collect()?
shape: (5, 3)
┌────────┬─────┬─────────────────┐
│ fruits ┆ B ┆ lag_B_by_fruits │
│ --- ┆ --- ┆ --- │
│ str ┆ i32 ┆ i32 │
╞════════╪═════╪═════════════════╡
│ banana ┆ 5 ┆ null │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ banana ┆ 4 ┆ 5 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ apple ┆ 3 ┆ null │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ apple ┆ 2 ┆ 3 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ banana ┆ 1 ┆ 4 │
└────────┴─────┴─────────────────┘