import polars as pl
fn(Series) -> Series
- Lazily evaluated
- Can be optimized
- Gives the library writer context and informed decision can be made
- Embarrassingly parallel
- Context dependent
- selection / projection ->
Series
= COLUMN, LITERAL or VALUE
- aggregation ->
Series
= GROUPS
df = pl.DataFrame(
{
"A": [1, 2, 3, 4, 5],
"fruits": ["banana", "banana", "apple", "apple", "banana"],
"B": [5, 4, 3, 2, 1],
"cars": ["beetle", "audi", "beetle", "beetle", "beetle"],
"optional": [28, 300, None, 2, -30],
}
)
df
A
|
fruits
|
B
|
cars
|
optional
|
i64
|
str
|
i64
|
str
|
i64
|
1
|
"banana"
|
5
|
"beetle"
|
28
|
2
|
"banana"
|
4
|
"audi"
|
300
|
3
|
"apple"
|
3
|
"beetle"
|
null
|
4
|
"apple"
|
2
|
"beetle"
|
2
|
5
|
"banana"
|
1
|
"beetle"
|
-30
|
# We can select by name
(df.select([
pl.col("A"),
"B", # the col part is inferred
pl.lit("B"), # we must tell polars we mean the literal "B"
pl.col("fruits"),
]))
A
|
B
|
literal
|
fruits
|
i64
|
i64
|
str
|
str
|
1
|
5
|
"B"
|
"banana"
|
2
|
4
|
"B"
|
"banana"
|
3
|
3
|
"B"
|
"apple"
|
4
|
2
|
"B"
|
"apple"
|
5
|
1
|
"B"
|
"banana"
|
# you can select columns with a regex if it starts with '^' and ends with '$'
(df.select([
pl.col("^A|B$").sum()
]))
# you can select multiple columns by name
(df.select([
pl.col(["A", "B"]).sum()
]))
# We select everything in normal order
# Then we select everything in reversed order
(df.select([
pl.all(),
pl.all().reverse().suffix("_reverse")
]))
A
|
fruits
|
B
|
cars
|
optional
|
A_reverse
|
fruits_reverse
|
B_reverse
|
cars_reverse
|
optional_reverse
|
i64
|
str
|
i64
|
str
|
i64
|
i64
|
str
|
i64
|
str
|
i64
|
1
|
"banana"
|
5
|
"beetle"
|
28
|
5
|
"banana"
|
1
|
"beetle"
|
-30
|
2
|
"banana"
|
4
|
"audi"
|
300
|
4
|
"apple"
|
2
|
"beetle"
|
2
|
3
|
"apple"
|
3
|
"beetle"
|
null
|
3
|
"apple"
|
3
|
"beetle"
|
null
|
4
|
"apple"
|
2
|
"beetle"
|
2
|
2
|
"banana"
|
4
|
"audi"
|
300
|
5
|
"banana"
|
1
|
"beetle"
|
-30
|
1
|
"banana"
|
5
|
"beetle"
|
28
|
# all expressions run in parallel
# single valued `Series` are broadcasted to the shape of the `DataFrame`
(df.select([
pl.all(),
pl.all().sum().suffix("_sum")
]))
A
|
fruits
|
B
|
cars
|
optional
|
A_sum
|
fruits_sum
|
B_sum
|
cars_sum
|
optional_sum
|
i64
|
str
|
i64
|
str
|
i64
|
i64
|
str
|
i64
|
str
|
i64
|
1
|
"banana"
|
5
|
"beetle"
|
28
|
15
|
null
|
15
|
null
|
300
|
2
|
"banana"
|
4
|
"audi"
|
300
|
15
|
null
|
15
|
null
|
300
|
3
|
"apple"
|
3
|
"beetle"
|
null
|
15
|
null
|
15
|
null
|
300
|
4
|
"apple"
|
2
|
"beetle"
|
2
|
15
|
null
|
15
|
null
|
300
|
5
|
"banana"
|
1
|
"beetle"
|
-30
|
15
|
null
|
15
|
null
|
300
|
# there are `str` and `dt` namespaces for specialized functions
predicate = pl.col("fruits").str.contains("^b.*")
(df.select([
predicate
]))
fruits
|
bool
|
true
|
true
|
false
|
false
|
true
|
# use the predicate to filter
df.filter(predicate)
A
|
fruits
|
B
|
cars
|
optional
|
i64
|
str
|
i64
|
str
|
i64
|
1
|
"banana"
|
5
|
"beetle"
|
28
|
2
|
"banana"
|
4
|
"audi"
|
300
|
5
|
"banana"
|
1
|
"beetle"
|
-30
|
# predicate expressions can be used to filter
(df.select([
pl.col("A").filter(pl.col("fruits").str.contains("^b.*")).sum(),
(pl.col("B").filter(pl.col("cars").str.contains("^b.*")).sum() * pl.col("B").sum()).alias("some_compute()"),
]))
A
|
some_compute()
|
i64
|
i64
|
8
|
165
|
# We can do arithmetic on columns and (literal) values
# can be evaluated to 1 without programmer knowing
some_var = 1
(df.select([
((pl.col("A") / 124.0 * pl.col("B")) / pl.sum("B") * some_var).alias("computed")
]))
computed
|
f64
|
0.0
|
0.0
|
0.0
|
0.0
|
0.0
|
# We can combine columns by a predicate
(df.select([
"fruits",
"B",
pl.when(pl.col("fruits") == "banana").then(pl.col("B")).otherwise(-1).alias("b")
]))
fruits
|
B
|
b
|
str
|
i64
|
i64
|
"banana"
|
5
|
5
|
"banana"
|
4
|
4
|
"apple"
|
3
|
-1
|
"apple"
|
2
|
-1
|
"banana"
|
1
|
1
|
# We can combine columns by a fold operation on column level
(df.select([
"A",
"B",
pl.fold(0, lambda a, b: a + b, [pl.col("A"), "B", pl.col("B")**2, pl.col("A") / 2.0]).alias("fold")
]))
A
|
B
|
fold
|
i64
|
i64
|
f64
|
1
|
5
|
31
|
2
|
4
|
23
|
3
|
3
|
16
|
4
|
2
|
12
|
5
|
1
|
9
|
# even combine all
(df.select([
pl.arange(0, df.height).alias("idx"),
"A",
pl.col("A").shift().alias("A_shifted"),
pl.concat_str(pl.all(), "-").alias("str_concat_1"), # prefer this
pl.fold(pl.col("A"), lambda a, b: a + "-" + b, pl.all().exclude("A")).alias("str_concat_2"), # over this (accidentally O(n^2))
]))
idx
|
A
|
A_shifted
|
str_concat_1
|
str_concat_2
|
i64
|
i64
|
i64
|
str
|
str
|
0
|
1
|
null
|
"1-banana-5-beetle-28"
|
"1-banana-5-beetle-28"
|
1
|
2
|
1
|
"2-banana-4-audi-300"
|
"2-banana-4-audi-300"
|
2
|
3
|
2
|
null
|
null
|
3
|
4
|
3
|
"4-apple-2-beetle-2"
|
"4-apple-2-beetle-2"
|
4
|
5
|
4
|
"5-banana-1-beetle--30"
|
"5-banana-1-beetle--30"
|
- expressions are applied over groups instead of columns
# we can still combine many expressions
(df.sort("cars").groupby("fruits")
.agg([
pl.col("B").sum().alias("B_sum"),
pl.sum("B").alias("B_sum2"), # syntactic sugar for the first
pl.first("fruits").alias("fruits_first"),
pl.count("A").alias("count"),
pl.col("cars").reverse()
]))
fruits
|
B_sum
|
B_sum2
|
fruits_first
|
count
|
cars
|
str
|
i64
|
i64
|
str
|
u32
|
list
|
"banana"
|
10
|
10
|
"banana"
|
3
|
[beetle, beetle, audi]
|
"apple"
|
5
|
5
|
"apple"
|
2
|
[beetle, beetle]
|
# We can explode the list column "cars"
(df.sort("cars").groupby("fruits")
.agg([
pl.col("B").sum().alias("B_sum"),
pl.sum("B").alias("B_sum2"), # syntactic sugar for the first
pl.first("fruits").alias("fruits_first"),
pl.count("A").alias("count"),
pl.col("cars").reverse()
])).explode("cars")
fruits
|
B_sum
|
B_sum2
|
fruits_first
|
count
|
cars
|
str
|
i64
|
i64
|
str
|
u32
|
str
|
"apple"
|
5
|
5
|
"apple"
|
2
|
"beetle"
|
"apple"
|
5
|
5
|
"apple"
|
2
|
"beetle"
|
"banana"
|
10
|
10
|
"banana"
|
3
|
"beetle"
|
"banana"
|
10
|
10
|
"banana"
|
3
|
"beetle"
|
"banana"
|
10
|
10
|
"banana"
|
3
|
"audi"
|
(df.groupby("fruits")
.agg([
pl.col("B").sum().alias("B_sum"),
pl.sum("B").alias("B_sum2"), # syntactic sugar for the first
pl.first("fruits").alias("fruits_first"),
pl.count(),
pl.col("B").shift().alias("B_shifted")
])
.explode("B_shifted")
)
fruits
|
B_sum
|
B_sum2
|
fruits_first
|
count
|
B_shifted
|
str
|
i64
|
i64
|
str
|
u32
|
i64
|
"apple"
|
5
|
5
|
"apple"
|
2
|
null
|
"apple"
|
5
|
5
|
"apple"
|
2
|
3
|
"banana"
|
10
|
10
|
"banana"
|
3
|
null
|
"banana"
|
10
|
10
|
"banana"
|
3
|
5
|
"banana"
|
10
|
10
|
"banana"
|
3
|
4
|
# We can explode the list column "cars"
(df.sort("cars").groupby("fruits")
.agg([
pl.col("B").sum(),
pl.sum("B").alias("B_sum2"), # syntactic sugar for the first
pl.first("fruits").alias("fruits_first"),
pl.count("A").alias("count"),
pl.col("cars").reverse()
])).explode("cars")
fruits
|
B_sum
|
B_sum2
|
fruits_first
|
count
|
cars
|
str
|
i64
|
i64
|
str
|
u32
|
str
|
"apple"
|
5
|
5
|
"apple"
|
2
|
"beetle"
|
"apple"
|
5
|
5
|
"apple"
|
2
|
"beetle"
|
"banana"
|
10
|
10
|
"banana"
|
3
|
"beetle"
|
"banana"
|
10
|
10
|
"banana"
|
3
|
"beetle"
|
"banana"
|
10
|
10
|
"banana"
|
3
|
"audi"
|
# we can also get the list of the groups
(df.groupby("fruits")
.agg([
pl.col("B").shift().alias("shift_B"),
pl.col("B").reverse().alias("rev_B"),
]))
fruits
|
shift_B
|
rev_B
|
str
|
list
|
list
|
"apple"
|
[null, 3]
|
[2, 3]
|
"banana"
|
[null, 5, 4]
|
[1, 4, 5]
|
# we can do predicates in the groupby as well
(df.groupby("fruits")
.agg([
pl.col("B").filter(pl.col("B") > 1).list().keep_name(),
]))
fruits
|
B
|
str
|
list
|
"banana"
|
[5, 4]
|
"apple"
|
[3, 2]
|
# and sum only by the values where the predicates are true
(df.groupby("fruits")
.agg([
pl.col("B").filter(pl.col("B") > 1).mean(),
]))
fruits
|
B_mean
|
str
|
f64
|
"banana"
|
4.5
|
"apple"
|
2.5
|
# Another example
(df.groupby("fruits")
.agg([
pl.col("B").shift_and_fill(1, fill_value=0).alias("shifted"),
pl.col("B").shift_and_fill(1, fill_value=0).sum().alias("shifted_sum"),
]))
fruits
|
shifted
|
shifted_sum
|
str
|
list
|
i64
|
"apple"
|
[0, 3]
|
3
|
"banana"
|
[0, 5, 4]
|
9
|
- Expression with superpowers.
- Aggregation in selection context
pl.col("foo").aggregation_expression(..).over("column_used_to_group")
# groupby 2 different columns
(df.select([
"fruits",
"cars",
"B",
pl.col("B").sum().over("fruits").alias("B_sum_by_fruits"),
pl.col("B").sum().over("cars").alias("B_sum_by_cars"),
]))
fruits
|
cars
|
B
|
B_sum_by_fruits
|
B_sum_by_cars
|
str
|
str
|
i64
|
i64
|
i64
|
"apple"
|
"beetle"
|
3
|
5
|
11
|
"apple"
|
"beetle"
|
2
|
5
|
11
|
"banana"
|
"beetle"
|
5
|
10
|
11
|
"banana"
|
"audi"
|
4
|
10
|
4
|
"banana"
|
"beetle"
|
1
|
10
|
11
|
# reverse B by groups and show the results in original DF
(df.select([
"fruits",
"B",
pl.col("B").reverse().over("fruits").alias("B_reversed_by_fruits")
]))
fruits
|
B
|
B_reversed_by_fruits
|
str
|
i64
|
i64
|
"apple"
|
3
|
2
|
"apple"
|
2
|
3
|
"banana"
|
5
|
1
|
"banana"
|
4
|
4
|
"banana"
|
1
|
5
|
# Lag a column within "fruits"
(df.select([
"fruits",
"B",
pl.col("B").shift().over("fruits").alias("lag_B_by_fruits")
]))
fruits
|
B
|
lag_B_by_fruits
|
str
|
i64
|
i64
|
"apple"
|
3
|
null
|
"apple"
|
2
|
3
|
"banana"
|
5
|
null
|
"banana"
|
4
|
5
|
"banana"
|
1
|
4
|