Partitioning scheme to write files
Description
Partitioning schemes are used to write multiple files with sink_* and write_* methods.
- pl$PartitionBy(): Configuration for writing to multiple output files. Supports partitioning by key expressions, file size limits, or both.
The following functions are deprecated and will be removed in a future release:
- pl$PartitionByKey(): Use pl$PartitionBy(key = …) instead.
- pl$PartitionMaxSize(): Use pl$PartitionBy(max_rows_per_file = …) instead.
- pl$PartitionParted(): Use pl$PartitionBy(key = …) with pre-sorted data instead.
Usage
pl__PartitionBy(
base_path,
...,
key = NULL,
include_key = NULL,
max_rows_per_file = NULL,
approximate_bytes_per_file = NULL
)
pl__PartitionByKey(
base_path,
...,
by,
include_key = TRUE,
per_partition_sort_by = NULL
)
pl__PartitionMaxSize(base_path, ..., max_size, per_partition_sort_by = NULL)
pl__PartitionParted(
base_path,
...,
by,
include_key = TRUE,
per_partition_sort_by = NULL
)
Arguments
Examples
library("polars")
# Partitioning by columns
temp_dir_1 <- withr::local_tempdir()
as_polars_lf(mtcars)$sink_parquet(
pl$PartitionBy(
temp_dir_1,
key = c("cyl", "am"),
include_key = FALSE
),
mkdir = TRUE
)
list.files(temp_dir_1, recursive = TRUE)
#> [1] "cyl=4.0/am=0.0/00000000.parquet" "cyl=4.0/am=1.0/00000000.parquet"
#> [3] "cyl=6.0/am=0.0/00000000.parquet" "cyl=6.0/am=1.0/00000000.parquet"
#> [5] "cyl=8.0/am=0.0/00000000.parquet" "cyl=8.0/am=1.0/00000000.parquet"
# Partitioning by max row size
temp_dir_2 <- withr::local_tempdir()
as_polars_lf(mtcars)$sink_csv(
pl$PartitionBy(
temp_dir_2,
max_rows_per_file = 10
),
mkdir = TRUE
)
files <- list.files(temp_dir_2, full.names = TRUE)
files
#> [1] "/tmp/Rtmpa9K513/filec9d828713abf/00000000.csv"
#> [2] "/tmp/Rtmpa9K513/filec9d828713abf/00000001.csv"
#> [3] "/tmp/Rtmpa9K513/filec9d828713abf/00000002.csv"
#> [4] "/tmp/Rtmpa9K513/filec9d828713abf/00000003.csv"
lapply(files, \(x) nrow(read.csv(x)))
#> [[1]]
#> [1] 10
#>
#> [[2]]
#> [1] 10
#>
#> [[3]]
#> [1] 10
#>
#> [[4]]
#> [1] 2
# Partitioning by both key and size
temp_dir_3 <- withr::local_tempdir()
as_polars_lf(mtcars)$sink_parquet(
pl$PartitionBy(
temp_dir_3,
key = "cyl",
max_rows_per_file = 5,
approximate_bytes_per_file = 1000000
),
mkdir = TRUE
)
list.files(temp_dir_3, recursive = TRUE)
#> [1] "cyl=4.0/00000000.parquet" "cyl=4.0/00000001.parquet"
#> [3] "cyl=4.0/00000002.parquet" "cyl=6.0/00000000.parquet"
#> [5] "cyl=6.0/00000001.parquet" "cyl=8.0/00000000.parquet"
#> [7] "cyl=8.0/00000001.parquet" "cyl=8.0/00000002.parquet"