Split the string by a substring
Description
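Split each string by a substring (or by a regular expression when
literal = FALSE) and return the resulting pieces as a list of strings.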
Usage
<Expr>$str$split(by, ..., inclusive = FALSE, literal = TRUE, strict = TRUE)
Arguments
by
Substring (or regex if literal = FALSE) to split by. Can be
an Expr.

...
These dots are for future extensions and must be empty.

inclusive
If TRUE, include the split character/string in the results.
Default is FALSE.

literal
If TRUE (default), treat by as a literal
string, not as a regular expression.

strict
If TRUE (default), raise an error if the underlying pattern
is not a valid regex. If FALSE, an invalid pattern produces a null
value instead of an error.
Value
A polars expression
Examples
library("polars")
df <- pl$DataFrame(s = c("foo bar", "foo-bar", "foo bar baz"))
df$select(pl$col("s")$str$split(by = " "))
#> shape: (3, 1)
#> ┌───────────────────────┐
#> │ s │
#> │ --- │
#> │ list[str] │
#> ╞═══════════════════════╡
#> │ ["foo", "bar"] │
#> │ ["foo-bar"] │
#> │ ["foo", "bar", "baz"] │
#> └───────────────────────┘
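df <- pl$DataFrame(
s = c("foo^bar", "foo_bar", "foo*bar*baz"),
by = c("_", "_", "*"),
)
df
#> shape: (3, 2)
#> ┌─────────────┬─────┐
#> │ s ┆ by │
#> │ --- ┆ --- │
#> │ str ┆ str │
#> ╞═════════════╪═════╡
#> │ foo^bar ┆ _ │
#> │ foo_bar ┆ _ │
#> │ foo*bar*baz ┆ * │
#> └─────────────┴─────┘
df$select(pl$col("s")$str$split(by = pl$col("by"))$alias("split"))
#> shape: (3, 1)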
#> ┌───────────────────────┐
#> │ split │
#> │ --- │
#> │ list[str] │
#> ╞═══════════════════════╡
#> │ ["foo^bar"] │
#> │ ["foo", "bar"] │
#> │ ["foo", "bar", "baz"] │
#> └───────────────────────┘
df <- pl$DataFrame(s = c("foo1bar", "foo99bar", "foo1bar2baz"))
df
#> shape: (3, 1)
#> ┌─────────────┐
#> │ s │
#> │ --- │
#> │ str │
#> ╞═════════════╡
#> │ foo1bar │
#> │ foo99bar │
#> │ foo1bar2baz │
#> └─────────────┘
df$with_columns(
pl$col("s")$str$split(by = "\\d+", literal = FALSE)$alias("split_regex"),
pl$col("s")$str$split(by = "\\d+", literal = FALSE, inclusive = TRUE)$alias(
"split_regex_inclusive"
),
)
#> shape: (3, 3)
#> ┌─────────────┬───────────────────────┬─────────────────────────┐
#> │ s ┆ split_regex ┆ split_regex_inclusive │
#> │ --- ┆ --- ┆ --- │
#> │ str ┆ list[str] ┆ list[str] │
#> ╞═════════════╪═══════════════════════╪═════════════════════════╡
#> │ foo1bar ┆ ["foo", "bar"] ┆ ["foo1", "bar"] │
#> │ foo99bar ┆ ["foo", "bar"] ┆ ["foo99", "bar"] │
#> │ foo1bar2baz ┆ ["foo", "bar", "baz"] ┆ ["foo1", "bar2", "baz"] │
#> └─────────────┴───────────────────────┴─────────────────────────┘
df <- pl$DataFrame(
s = c("foo1bar", "foo bar", "foo-bar baz"),
by = c("\\d", "\\s", "-"),
)
df$with_columns(
pl$col("s")$str$split(by = pl$col("by"), literal = FALSE)$alias("split_regex"),
pl$col("s")$str$split(by = pl$col("by"), literal = FALSE, inclusive = TRUE)$alias(
"split_regex_inclusive"
),
)
#> shape: (3, 4)
#> ┌─────────────┬─────┬────────────────────┬───────────────────────┐
#> │ s ┆ by ┆ split_regex ┆ split_regex_inclusive │
#> │ --- ┆ --- ┆ --- ┆ --- │
#> │ str ┆ str ┆ list[str] ┆ list[str] │
#> ╞═════════════╪═════╪════════════════════╪═══════════════════════╡
#> │ foo1bar ┆ \d ┆ ["foo", "bar"] ┆ ["foo1", "bar"] │
#> │ foo bar ┆ \s ┆ ["foo", "bar"] ┆ ["foo ", "bar"] │
#> │ foo-bar baz ┆ - ┆ ["foo", "bar baz"] ┆ ["foo-", "bar baz"] │
#> └─────────────┴─────┴────────────────────┴───────────────────────┘