Map a custom/user-defined function (UDF) to each element of a column
Description
The UDF is applied to each element of a column. See Details for more information on specificities related to the context.
Usage
<Expr>$map_elements(
f,
return_type = NULL,
strict_return_type = TRUE,
allow_fail_eval = FALSE,
in_background = FALSE
)
Arguments
f
|
Function to map |
return_type
|
DataType of the output Series. If NULL , the dtype will be
pl$Unknown .
|
strict_return_type
|
If TRUE (default), error if not correct datatype returned
from R. If FALSE , the output will be converted to a polars
null value.
|
allow_fail_eval
|
If FALSE (default), raise an error if the function fails.
If TRUE , the result will be converted to a polars null
value.
|
in_background
|
Whether to run the function in a background R process, default is
FALSE . Combined with setting
e.g. options(polars.rpool_cap = 4) , this can speed up some
slow R functions as they can run in parallel R sessions. The
communication speed between processes is quite slower than between
threads. This will likely only give a speed-up in a "low IO - high CPU"
usecase. A single map will not be paralleled, only in case of multiple
$map_elements() in the query can
these run in parallel.
|
Details
Note that, in a GroupBy context, the column will have been pre-aggregated and so each element will itself be a Series. Therefore, depending on the context, requirements for function differ:
-
in
$select()
or$with_columns()
(selection context), the function must operate on R values of length 1. Polars will convert each element into an R value and pass it to the function. The output of the user function will be converted back into a polars type (the return type must match, see argumentreturn_type
). Using$map_elements()
in this context should be avoided as alapply()
has half the overhead. -
in
$agg()
(GroupBy context), the function must take aSeries
and return aSeries
or an R object convertible toSeries
, e.g. a vector. In this context, it is much faster if there are the number of groups is much lower than the number of rows, as the iteration is only across the groups. The R user function could e.g. convert theSeries
to a vector with$to_r()
and perform some vectorized operations.
Note that it is preferred to express your function in polars syntax, which will almost always be significantly faster and more memory efficient because:
- the native expression engine runs in Rust; functions run in R.
- use of R functions forces the DataFrame to be materialized in memory.
- Polars-native expressions can be parallelized (R functions cannot).
- Polars-native expressions can be logically optimized (R functions cannot).
Wherever possible you should strongly prefer the native expression API
to achieve the best performance and avoid using
$map_elements()
.
Value
Expr
Examples
library("polars")
# apply over groups: here, the input must be a Series
# prepare two expressions, one to compute the sum of each variable, one to
# get the first two values of each variable and store them in a list
e_sum = pl$all()$map_elements(\(s) sum(s$to_r()))$name$suffix("_sum")
e_head = pl$all()$map_elements(\(s) head(s$to_r(), 2))$name$suffix("_head")
as_polars_df(iris)$group_by("Species")$agg(e_sum, e_head)
#> shape: (3, 9)
#> ┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
#> │ Species ┆ Sepal.Len ┆ Sepal.Wid ┆ Petal.Len ┆ … ┆ Sepal.Len ┆ Sepal.Wid ┆ Petal.Len ┆ Petal.Wi │
#> │ --- ┆ gth_sum ┆ th_sum ┆ gth_sum ┆ ┆ gth_head ┆ th_head ┆ gth_head ┆ dth_head │
#> │ cat ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │
#> │ ┆ f64 ┆ f64 ┆ f64 ┆ ┆ list[f64] ┆ list[f64] ┆ list[f64] ┆ list[f64 │
#> │ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ┆ ] │
#> ╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
#> │ setosa ┆ 250.3 ┆ 171.4 ┆ 73.1 ┆ … ┆ [5.1, ┆ [3.5, ┆ [1.4, ┆ [0.2, │
#> │ ┆ ┆ ┆ ┆ ┆ 4.9] ┆ 3.0] ┆ 1.4] ┆ 0.2] │
#> │ versicolo ┆ 296.8 ┆ 138.5 ┆ 213.0 ┆ … ┆ [7.0, ┆ [3.2, ┆ [4.7, ┆ [1.4, │
#> │ r ┆ ┆ ┆ ┆ ┆ 6.4] ┆ 3.2] ┆ 4.5] ┆ 1.5] │
#> │ virginica ┆ 329.4 ┆ 148.7 ┆ 277.6 ┆ … ┆ [6.3, ┆ [3.3, ┆ [6.0, ┆ [2.5, │
#> │ ┆ ┆ ┆ ┆ ┆ 5.8] ┆ 2.7] ┆ 5.1] ┆ 1.9] │
#> └───────────┴───────────┴───────────┴───────────┴───┴───────────┴───────────┴───────────┴──────────┘
# apply a function on each value (should be avoided): here the input is an R
# value of length 1
# select only Float64 columns
my_selection = pl$col(pl$dtypes$Float64)
# prepare two expressions, the first one only adds 10 to each element, the
# second returns the letter whose index matches the element
e_add10 = my_selection$map_elements(\(x) {
x + 10
})$name$suffix("_sum")
e_letter = my_selection$map_elements(\(x) {
letters[ceiling(x)]
}, return_type = pl$dtypes$String)$name$suffix("_letter")
as_polars_df(iris)$select(e_add10, e_letter)
#> shape: (150, 8)
#> ┌────────────┬────────────┬────────────┬───────────┬───────────┬───────────┬───────────┬───────────┐
#> │ Sepal.Leng ┆ Sepal.Widt ┆ Petal.Leng ┆ Petal.Wid ┆ Sepal.Len ┆ Sepal.Wid ┆ Petal.Len ┆ Petal.Wid │
#> │ th_sum ┆ h_sum ┆ th_sum ┆ th_sum ┆ gth_lette ┆ th_letter ┆ gth_lette ┆ th_letter │
#> │ --- ┆ --- ┆ --- ┆ --- ┆ r ┆ --- ┆ r ┆ --- │
#> │ f64 ┆ f64 ┆ f64 ┆ f64 ┆ --- ┆ str ┆ --- ┆ str │
#> │ ┆ ┆ ┆ ┆ str ┆ ┆ str ┆ │
#> ╞════════════╪════════════╪════════════╪═══════════╪═══════════╪═══════════╪═══════════╪═══════════╡
#> │ 15.1 ┆ 13.5 ┆ 11.4 ┆ 10.2 ┆ f ┆ d ┆ b ┆ a │
#> │ 14.9 ┆ 13.0 ┆ 11.4 ┆ 10.2 ┆ e ┆ c ┆ b ┆ a │
#> │ 14.7 ┆ 13.2 ┆ 11.3 ┆ 10.2 ┆ e ┆ d ┆ b ┆ a │
#> │ 14.6 ┆ 13.1 ┆ 11.5 ┆ 10.2 ┆ e ┆ d ┆ b ┆ a │
#> │ 15.0 ┆ 13.6 ┆ 11.4 ┆ 10.2 ┆ e ┆ d ┆ b ┆ a │
#> │ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │
#> │ 16.7 ┆ 13.0 ┆ 15.2 ┆ 12.3 ┆ g ┆ c ┆ f ┆ c │
#> │ 16.3 ┆ 12.5 ┆ 15.0 ┆ 11.9 ┆ g ┆ c ┆ e ┆ b │
#> │ 16.5 ┆ 13.0 ┆ 15.2 ┆ 12.0 ┆ g ┆ c ┆ f ┆ b │
#> │ 16.2 ┆ 13.4 ┆ 15.4 ┆ 12.3 ┆ g ┆ d ┆ f ┆ c │
#> │ 15.9 ┆ 13.0 ┆ 15.1 ┆ 11.8 ┆ f ┆ c ┆ f ┆ b │
#> └────────────┴────────────┴────────────┴───────────┴───────────┴───────────┴───────────┴───────────┘
# Small benchmark --------------------------------
# Using `$map_elements()` is much slower than a more polars-native approach.
# First we multiply each element of a Series of 1M elements by 2.
n = 1000000L
set.seed(1)
df = pl$DataFrame(list(
a = 1:n,
b = sample(letters, n, replace = TRUE)
))
system.time({
df$with_columns(
bob = pl$col("a")$map_elements(\(x) {
x * 2L
})
)
})
#> user system elapsed
#> 2.358 0.001 2.538
# Comparing this to the standard polars syntax:
system.time({
df$with_columns(
bob = pl$col("a") * 2L
)
})
#> user system elapsed
#> 0.004 0.000 0.009
# Running in parallel --------------------------------
# here, we use Sys.sleep() to imitate some CPU expensive computation.
# use apply over each Species-group in each column equal to 12 sequential
# runs ~1.2 sec.
system.time({
as_polars_lf(iris)$group_by("Species")$agg(
pl$all()$map_elements(\(s) {
Sys.sleep(.1)
s$sum()
})
)$collect()
})
#> user system elapsed
#> 0.031 0.000 1.234
# first run in parallel: there is some overhead to start up extra R processes
# drop any previous processes, just to show start-up overhead here
options(polars.rpool_cap = 0)
# set back to 4, the default
options(polars.rpool_cap = 4)
polars_options()$rpool_cap
#> [1] 4
system.time({
as_polars_lf(iris)$group_by("Species")$agg(
pl$all()$map_elements(\(s) {
Sys.sleep(.1)
s$sum()
}, in_background = TRUE)
)$collect()
})
#> user system elapsed
#> 0.011 0.000 0.992
# second run in parallel: this reuses R processes in "polars global_rpool".
polars_options()$rpool_cap
#> [1] 4
system.time({
as_polars_lf(iris)$group_by("Species")$agg(
pl$all()$map_elements(\(s) {
Sys.sleep(.1)
s$sum()
}, in_background = TRUE)
)$collect()
})
#> user system elapsed
#> 0.011 0.000 0.322