Split the string by a substring

Description

Usage

<Expr>$str$split(by, ..., inclusive = FALSE, literal = TRUE, strict = TRUE)

Arguments

by Substring (or regex if literal = FALSE) to split by. Can be an Expr.

... These dots are for future extensions and must be empty.

inclusive If TRUE, include the split character/string in the results, attached to the end of the piece that precedes it. Defaults to FALSE.

literal If TRUE (default), treat by as a literal string, not as a regular expression.

strict If TRUE (default), raise an error if the underlying pattern is not a valid regex. If FALSE, an invalid pattern produces a null value instead of an error.

Value

A polars expression

Examples

library("polars")

df <- pl$DataFrame(s = c("foo bar", "foo-bar", "foo bar baz"))
df$select(pl$col("s")$str$split(by = " "))

#> shape: (3, 1)
#> ┌───────────────────────┐
#> │ s                     │
#> │ ---                   │
#> │ list[str]             │
#> ╞═══════════════════════╡
#> │ ["foo", "bar"]        │
#> │ ["foo-bar"]           │
#> │ ["foo", "bar", "baz"] │
#> └───────────────────────┘

df <- pl$DataFrame(
  s = c("foo^bar", "foo_bar", "foo*bar*baz"),
  by = c("_", "_", "*")
)
df

#> shape: (3, 2)
#> ┌─────────────┬─────┐
#> │ s           ┆ by  │
#> │ ---         ┆ --- │
#> │ str         ┆ str │
#> ╞═════════════╪═════╡
#> │ foo^bar     ┆ _   │
#> │ foo_bar     ┆ _   │
#> │ foo*bar*baz ┆ *   │
#> └─────────────┴─────┘

df$select(split = pl$col("s")$str$split(by = pl$col("by")))

#> shape: (3, 1)
#> ┌───────────────────────┐
#> │ split                 │
#> │ ---                   │
#> │ list[str]             │
#> ╞═══════════════════════╡
#> │ ["foo^bar"]           │
#> │ ["foo", "bar"]        │
#> │ ["foo", "bar", "baz"] │
#> └───────────────────────┘

df <- pl$DataFrame(s = c("foo1bar", "foo99bar", "foo1bar2baz"))
df

#> shape: (3, 1)
#> ┌─────────────┐
#> │ s           │
#> │ ---         │
#> │ str         │
#> ╞═════════════╡
#> │ foo1bar     │
#> │ foo99bar    │
#> │ foo1bar2baz │
#> └─────────────┘

df$with_columns(
  pl$col("s")$str$split(by = "\\d+", literal = FALSE)$alias("split_regex"),
  pl$col("s")$str$split(by = "\\d+", literal = FALSE, inclusive = TRUE)$alias(
    "split_regex_inclusive"
  ),
)

#> shape: (3, 3)
#> ┌─────────────┬───────────────────────┬─────────────────────────┐
#> │ s           ┆ split_regex           ┆ split_regex_inclusive   │
#> │ ---         ┆ ---                   ┆ ---                     │
#> │ str         ┆ list[str]             ┆ list[str]               │
#> ╞═════════════╪═══════════════════════╪═════════════════════════╡
#> │ foo1bar     ┆ ["foo", "bar"]        ┆ ["foo1", "bar"]         │
#> │ foo99bar    ┆ ["foo", "bar"]        ┆ ["foo99", "bar"]        │
#> │ foo1bar2baz ┆ ["foo", "bar", "baz"] ┆ ["foo1", "bar2", "baz"] │
#> └─────────────┴───────────────────────┴─────────────────────────┘

df <- pl$DataFrame(
  s = c("foo1bar", "foo bar", "foo-bar baz"),
  by = c("\\d", "\\s", "-"),
)
df$with_columns(
  pl$col("s")$str$split(by = pl$col("by"), literal = FALSE)$alias("split_regex"),
  pl$col("s")$str$split(by = pl$col("by"), literal = FALSE, inclusive = TRUE)$alias(
    "split_regex_inclusive"
  ),
)

#> shape: (3, 4)
#> ┌─────────────┬─────┬────────────────────┬───────────────────────┐
#> │ s           ┆ by  ┆ split_regex        ┆ split_regex_inclusive │
#> │ ---         ┆ --- ┆ ---                ┆ ---                   │
#> │ str         ┆ str ┆ list[str]          ┆ list[str]             │
#> ╞═════════════╪═════╪════════════════════╪═══════════════════════╡
#> │ foo1bar     ┆ \d  ┆ ["foo", "bar"]     ┆ ["foo1", "bar"]       │
#> │ foo bar     ┆ \s  ┆ ["foo", "bar"]     ┆ ["foo ", "bar"]       │
#> │ foo-bar baz ┆ -   ┆ ["foo", "bar baz"] ┆ ["foo-", "bar baz"]   │
#> └─────────────┴─────┴────────────────────┴───────────────────────┘