import polars as pl

Expressions

fn(Series) -> Series

  • Lazily evaluated
    • Can be optimized
    • Gives the library writer context and informed decision can be made
  • Embarrassingly parallel
  • Context dependent
    • selection / projection -> Series = COLUMN, LITERAL or VALUE
    • aggregation -> Series = GROUPS
df = pl.DataFrame(
    {
        "A": [1, 2, 3, 4, 5],
        "fruits": ["banana", "banana", "apple", "apple", "banana"],
        "B": [5, 4, 3, 2, 1],
        "cars": ["beetle", "audi", "beetle", "beetle", "beetle"],
        "optional": [28, 300, None, 2, -30],
    }
)
df
A fruits B cars optional
i64 str i64 str i64
1 "banana" 5 "beetle" 28
2 "banana" 4 "audi" 300
3 "apple" 3 "beetle" null
4 "apple" 2 "beetle" 2
5 "banana" 1 "beetle" -30

Selection context

# We can select by name

(df.select([
    pl.col("A"),
    "B",      # the col part is inferred
    pl.lit("B"),  # we must tell polars we mean the literal "B"
    pl.col("fruits"),
]))
A B literal fruits
i64 i64 str str
1 5 "B" "banana"
2 4 "B" "banana"
3 3 "B" "apple"
4 2 "B" "apple"
5 1 "B" "banana"
# you can select columns with a regex if it starts with '^' and ends with '$'

(df.select([
    pl.col("^A|B$").sum()
]))
A B
i64 i64
15 15
# you can select multiple columns by name

(df.select([
    pl.col(["A", "B"]).sum()
]))
A B
i64 i64
15 15
# We select everything in normal order
# Then we select everything in reversed order

(df.select([
    pl.all(),
    pl.all().reverse().suffix("_reverse")
]))
A fruits B cars optional A_reverse fruits_reverse B_reverse cars_reverse optional_reverse
i64 str i64 str i64 i64 str i64 str i64
1 "banana" 5 "beetle" 28 5 "banana" 1 "beetle" -30
2 "banana" 4 "audi" 300 4 "apple" 2 "beetle" 2
3 "apple" 3 "beetle" null 3 "apple" 3 "beetle" null
4 "apple" 2 "beetle" 2 2 "banana" 4 "audi" 300
5 "banana" 1 "beetle" -30 1 "banana" 5 "beetle" 28
# all expressions run in parallel
# single valued `Series` are broadcasted to the shape of the `DataFrame`

(df.select([
    pl.all(),
    pl.all().sum().suffix("_sum")
]))
A fruits B cars optional A_sum fruits_sum B_sum cars_sum optional_sum
i64 str i64 str i64 i64 str i64 str i64
1 "banana" 5 "beetle" 28 15 null 15 null 300
2 "banana" 4 "audi" 300 15 null 15 null 300
3 "apple" 3 "beetle" null 15 null 15 null 300
4 "apple" 2 "beetle" 2 15 null 15 null 300
5 "banana" 1 "beetle" -30 15 null 15 null 300
# there are `str` and `dt` namespaces for specialized functions

predicate = pl.col("fruits").str.contains("^b.*")

(df.select([
    predicate
]))
fruits
bool
true
true
false
false
true
# use the predicate to filter

df.filter(predicate)
A fruits B cars optional
i64 str i64 str i64
1 "banana" 5 "beetle" 28
2 "banana" 4 "audi" 300
5 "banana" 1 "beetle" -30
# predicate expressions can be used to filter

(df.select([
    pl.col("A").filter(pl.col("fruits").str.contains("^b.*")).sum(),
    (pl.col("B").filter(pl.col("cars").str.contains("^b.*")).sum() * pl.col("B").sum()).alias("some_compute()"),
]))
A some_compute()
i64 i64
8 165
# We can do arithmetic on columns and (literal) values
# can be evaluated to 1 without programmer knowing

some_var = 1

(df.select([
    ((pl.col("A") / 124.0 * pl.col("B")) / pl.sum("B") * some_var).alias("computed")
]))
computed
f64
0.0
0.0
0.0
0.0
0.0
# We can combine columns by a predicate

(df.select([
    "fruits",
    "B",
    pl.when(pl.col("fruits") == "banana").then(pl.col("B")).otherwise(-1).alias("b")
]))
fruits B b
str i64 i64
"banana" 5 5
"banana" 4 4
"apple" 3 -1
"apple" 2 -1
"banana" 1 1
# We can combine columns by a fold operation on column level

(df.select([
    "A",
    "B",
    pl.fold(0, lambda a, b: a + b, [pl.col("A"), "B", pl.col("B")**2, pl.col("A") / 2.0]).alias("fold")
]))
A B fold
i64 i64 f64
1 5 31
2 4 23
3 3 16
4 2 12
5 1 9
# even combine all

(df.select([
    pl.arange(0, df.height).alias("idx"),
    "A",
    pl.col("A").shift().alias("A_shifted"),
    pl.concat_str(pl.all(), "-").alias("str_concat_1"),  # prefer this
    pl.fold(pl.col("A"), lambda a, b: a + "-" + b, pl.all().exclude("A")).alias("str_concat_2"),  # over this (accidentally O(n^2))
]))
idx A A_shifted str_concat_1 str_concat_2
i64 i64 i64 str str
0 1 null "1-banana-5-beetle-28" "1-banana-5-beetle-28"
1 2 1 "2-banana-4-audi-300" "2-banana-4-audi-300"
2 3 2 null null
3 4 3 "4-apple-2-beetle-2" "4-apple-2-beetle-2"
4 5 4 "5-banana-1-beetle--30" "5-banana-1-beetle--30"

Aggregation context

  • expressions are applied over groups instead of columns
# we can still combine many expressions

(df.sort("cars").groupby("fruits")
    .agg([
        pl.col("B").sum().alias("B_sum"),
        pl.sum("B").alias("B_sum2"),  # syntactic sugar for the first
        pl.first("fruits").alias("fruits_first"),
        pl.count("A").alias("count"),
        pl.col("cars").reverse()
    ]))
fruits B_sum B_sum2 fruits_first count cars
str i64 i64 str u32 list
"banana" 10 10 "banana" 3 [beetle, beetle, audi]
"apple" 5 5 "apple" 2 [beetle, beetle]
# We can explode the list column "cars"

(df.sort("cars").groupby("fruits")
    .agg([
        pl.col("B").sum().alias("B_sum"),
        pl.sum("B").alias("B_sum2"),  # syntactic sugar for the first
        pl.first("fruits").alias("fruits_first"),
        pl.count("A").alias("count"),
        pl.col("cars").reverse()
    ])).explode("cars")
fruits B_sum B_sum2 fruits_first count cars
str i64 i64 str u32 str
"apple" 5 5 "apple" 2 "beetle"
"apple" 5 5 "apple" 2 "beetle"
"banana" 10 10 "banana" 3 "beetle"
"banana" 10 10 "banana" 3 "beetle"
"banana" 10 10 "banana" 3 "audi"
(df.groupby("fruits")
    .agg([
        pl.col("B").sum().alias("B_sum"),
        pl.sum("B").alias("B_sum2"),  # syntactic sugar for the first
        pl.first("fruits").alias("fruits_first"),
        pl.count(),
        pl.col("B").shift().alias("B_shifted")
    ])
 .explode("B_shifted")
)
fruits B_sum B_sum2 fruits_first count B_shifted
str i64 i64 str u32 i64
"apple" 5 5 "apple" 2 null
"apple" 5 5 "apple" 2 3
"banana" 10 10 "banana" 3 null
"banana" 10 10 "banana" 3 5
"banana" 10 10 "banana" 3 4
# We can explode the list column "cars"

(df.sort("cars").groupby("fruits")
    .agg([
        pl.col("B").sum(),
        pl.sum("B").alias("B_sum2"),  # syntactic sugar for the first
        pl.first("fruits").alias("fruits_first"),
        pl.count("A").alias("count"),
        pl.col("cars").reverse()
    ])).explode("cars")
fruits B_sum B_sum2 fruits_first count cars
str i64 i64 str u32 str
"apple" 5 5 "apple" 2 "beetle"
"apple" 5 5 "apple" 2 "beetle"
"banana" 10 10 "banana" 3 "beetle"
"banana" 10 10 "banana" 3 "beetle"
"banana" 10 10 "banana" 3 "audi"
# we can also get the list of the groups

(df.groupby("fruits")
    .agg([
         pl.col("B").shift().alias("shift_B"),
         pl.col("B").reverse().alias("rev_B"),
    ]))
fruits shift_B rev_B
str list list
"apple" [null, 3] [2, 3]
"banana" [null, 5, 4] [1, 4, 5]
# we can do predicates in the groupby as well

(df.groupby("fruits")
    .agg([
        pl.col("B").filter(pl.col("B") > 1).list().keep_name(),
    ]))
fruits B
str list
"banana" [5, 4]
"apple" [3, 2]
# and sum only by the values where the predicates are true

(df.groupby("fruits")
    .agg([
        pl.col("B").filter(pl.col("B") > 1).mean(),
    ]))
fruits B_mean
str f64
"banana" 4.5
"apple" 2.5
# Another example

(df.groupby("fruits")
    .agg([
        pl.col("B").shift_and_fill(1, fill_value=0).alias("shifted"),
        pl.col("B").shift_and_fill(1, fill_value=0).sum().alias("shifted_sum"),
    ]))
fruits shifted shifted_sum
str list i64
"apple" [0, 3] 3
"banana" [0, 5, 4] 9

Window functions!

  • Expression with superpowers.
  • Aggregation in selection context
pl.col("foo").aggregation_expression(..).over("column_used_to_group")
# groupby 2 different columns

(df.select([
    "fruits",
    "cars",
    "B",
    pl.col("B").sum().over("fruits").alias("B_sum_by_fruits"),
    pl.col("B").sum().over("cars").alias("B_sum_by_cars"),
]))
fruits cars B B_sum_by_fruits B_sum_by_cars
str str i64 i64 i64
"apple" "beetle" 3 5 11
"apple" "beetle" 2 5 11
"banana" "beetle" 5 10 11
"banana" "audi" 4 10 4
"banana" "beetle" 1 10 11
# reverse B by groups and show the results in original DF

(df.select([
    "fruits",
    "B",
    pl.col("B").reverse().over("fruits").alias("B_reversed_by_fruits")
]))
fruits B B_reversed_by_fruits
str i64 i64
"apple" 3 2
"apple" 2 3
"banana" 5 1
"banana" 4 4
"banana" 1 5
# Lag a column within "fruits"

(df.select([
    "fruits",
    "B",
    pl.col("B").shift().over("fruits").alias("lag_B_by_fruits")
]))

fruits B lag_B_by_fruits
str i64 i64
"apple" 3 null
"apple" 2 3
"banana" 5 null
"banana" 4 5
"banana" 1 4