> const df: pl.DataFrame = pl.DataFrame({
... "foo": [1, 2, 3],
... "bar": [6, 7, 8],
... "ham": ['a', 'b', 'c']
... });
// df: pl.DataFrame<{
// foo: pl.Series<Float64, "foo">;
// bar: pl.Series<Float64, "bar">;
// ham: pl.Series<Utf8, "ham">;
// }>
> df.schema
// {
// foo: Float64;
// bar: Float64;
// ham: Utf8;
// }
Optional
destOrOptions: anyOptional
options: anysince 0.4.0 use writeCSV
Write the DataFrame to disk in Avro format.
Optional
options: WriteAvroOptionsOptional
options: WriteAvroOptionsWrite DataFrame to comma-separated values file (csv).
If no options are specified, it will return a new string containing the contents
> const df = pl.DataFrame({
... "foo": [1, 2, 3],
... "bar": [6, 7, 8],
... "ham": ['a', 'b', 'c']
... });
> df.writeCSV();
foo,bar,ham
1,6,a
2,7,b
3,8,c
// using a file path
> df.head(1).writeCSV("./foo.csv")
// foo.csv
foo,bar,ham
1,6,a
// using a write stream
> const writeStream = new Stream.Writable({
... write(chunk, encoding, callback) {
...   console.log("writeStream: %O", chunk.toString());
... callback(null);
... }
... });
> df.head(1).writeCSV(writeStream, {includeHeader: false});
writeStream: '1,6,a'
Optional
options: WriteCsvOptionsWrite to Arrow IPC feather file, either to a file path or to a write stream.
Optional
options: WriteIPCOptionsOptional
options: WriteIPCOptionsWrite to Arrow IPC stream file, either to a file path or to a write stream.
Optional
options: WriteIPCOptionsOptional
options: WriteIPCOptionsWrite Dataframe to JSON string, file, or write stream
Optional
options: { json | lines
> const df = pl.DataFrame({
... foo: [1,2,3],
... bar: ['a','b','c']
... })
> df.writeJSON({format:"json"})
`[ {"foo":1.0,"bar":"a"}, {"foo":2.0,"bar":"b"}, {"foo":3.0,"bar":"c"}]`
> df.writeJSON({format:"lines"})
`{"foo":1.0,"bar":"a"}
{"foo":2.0,"bar":"b"}
{"foo":3.0,"bar":"c"}`
// writing to a file
> df.writeJSON("/path/to/file.json", {format:'lines'})
Optional
options: { Write the DataFrame to disk in parquet format.
Optional
options: WriteParquetOptionsOptional
options: WriteParquetOptionsOptional
destination: anyOptional
options: anysince 0.4.0 use writeIPC
Optional
destination: anyOptional
options: anysince 0.4.0 use writeParquet
Sample from this DataFrame by setting either n
or frac
.
Optional
opts: { Optional
seed?: number | bigintOptional
withOptional
opts: { Optional
seed?: number | bigintOptional
withOptional
n: numberOptional
frac: numberOptional
withReplacement: booleanOptional
seed: number | bigintSummary statistics for a DataFrame.
Only summarizes numeric datatypes at the moment and returns nulls for non numeric datatypes.
Example
> const df = pl.DataFrame({
... 'a': [1.0, 2.8, 3.0],
... 'b': [4, 5, 6],
... "c": [True, False, True]
... });
> df.describe()
shape: (5, 4)
╭──────────┬───────┬─────┬──────╮
│ describe ┆ a ┆ b ┆ c │
│ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ f64 ┆ f64 ┆ f64 │
╞══════════╪═══════╪═════╪══════╡
│ "mean" ┆ 2.267 ┆ 5 ┆ null │
├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌┤
│ "std" ┆ 1.102 ┆ 1 ┆ null │
├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌┤
│ "min" ┆ 1 ┆ 4 ┆ 0.0 │
├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌┤
│ "max" ┆ 3 ┆ 6 ┆ 1 │
├╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌┤
│ "median" ┆ 2.8 ┆ 5 ┆ null │
╰──────────┴───────┴─────┴──────╯
Remove column from DataFrame and return as new.
> const df = pl.DataFrame({
... "foo": [1, 2, 3],
... "bar": [6.0, 7.0, 8.0],
... "ham": ['a', 'b', 'c'],
... "apple": ['a', 'b', 'c']
... });
// df: pl.DataFrame<{
// foo: pl.Series<Float64, "foo">;
// bar: pl.Series<Float64, "bar">;
// ham: pl.Series<Utf8, "ham">;
// apple: pl.Series<Utf8, "apple">;
// }>
> const df2 = df.drop(['ham', 'apple']);
// df2: pl.DataFrame<{
// foo: pl.Series<Float64, "foo">;
// bar: pl.Series<Float64, "bar">;
// }>
> console.log(df2.toString());
shape: (3, 2)
╭─────┬─────╮
│ foo ┆ bar │
│ --- ┆ --- │
│ i64 ┆ f64 │
╞═════╪═════╡
│ 1 ┆ 6 │
├╌╌╌╌╌┼╌╌╌╌╌┤
│ 2 ┆ 7 │
├╌╌╌╌╌┼╌╌╌╌╌┤
│ 3 ┆ 8 │
╰─────┴─────╯
Return a new DataFrame where the null values are dropped.
This method drops a row if any single value of that row is null.
> const df = pl.DataFrame({
... "foo": [1, 2, 3],
... "bar": [6, null, 8],
... "ham": ['a', 'b', 'c']
... });
> console.log(df.dropNulls().toString());
shape: (2, 3)
┌─────┬─────┬─────┐
│ foo ┆ bar ┆ ham │
│ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ str │
╞═════╪═════╪═════╡
│ 1 ┆ 6 ┆ "a" │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
│ 3 ┆ 8 ┆ "c" │
└─────┴─────┴─────┘
Rest
...columns: (keyof T)[]Explode DataFrame
to long format by exploding a column with Lists.
> const df = pl.DataFrame({
... "letters": ["c", "c", "a", "c", "a", "b"],
... "nrs": [[1, 2], [1, 3], [4, 3], [5, 5, 5], [6], [2, 1, 2]]
... });
> console.log(df.toString());
shape: (6, 2)
╭─────────┬────────────╮
│ letters ┆ nrs │
│ --- ┆ --- │
│ str ┆ list [i64] │
╞═════════╪════════════╡
│ "c" ┆ [1, 2] │
├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ "c" ┆ [1, 3] │
├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ "a" ┆ [4, 3] │
├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ "c" ┆ [5, 5, 5] │
├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ "a" ┆ [6] │
├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ "b" ┆ [2, 1, 2] │
╰─────────┴────────────╯
> df.explode("nrs")
shape: (13, 2)
╭─────────┬─────╮
│ letters ┆ nrs │
│ --- ┆ --- │
│ str ┆ i64 │
╞═════════╪═════╡
│ "c" ┆ 1 │
├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
│ "c" ┆ 2 │
├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
│ "c" ┆ 1 │
├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
│ "c" ┆ 3 │
├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
│ ... ┆ ... │
├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
│ "c" ┆ 5 │
├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
│ "a" ┆ 6 │
├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
│ "b" ┆ 2 │
├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
│ "b" ┆ 1 │
├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
│ "b" ┆ 2 │
╰─────────┴─────╯
Rest
...columns: ExprOrString[]Extend the memory backed by this DataFrame
with the values from other
.
Different from vstack
which adds the chunks from other
to the chunks of this DataFrame
extend
appends the data from other
to the underlying memory locations and thus may cause a reallocation.
If this does not cause a reallocation, the resulting data structure will not have any extra chunks and thus will yield faster queries.
Prefer extend
over vstack
when you want to do a query after a single append. For instance during
online operations where you add n
rows and rerun a query.
Prefer vstack
over extend
when you want to append many times before doing a query. For instance
when you read in multiple files and want to store them in a single DataFrame
.
In the latter case, finish the sequence of vstack
operations with a rechunk
.
Filter the rows in the DataFrame based on a predicate expression.
Expression that evaluates to a boolean Series.
> const df = pl.DataFrame({
... "foo": [1, 2, 3],
... "bar": [6, 7, 8],
... "ham": ['a', 'b', 'c']
... });
// Filter on one condition
> df.filter(pl.col("foo").lt(3))
shape: (2, 3)
┌─────┬─────┬─────┐
│ foo ┆ bar ┆ ham │
│ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ str │
╞═════╪═════╪═════╡
│ 1 ┆ 6 ┆ a │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
│ 2 ┆ 7 ┆ b │
└─────┴─────┴─────┘
// Filter on multiple conditions
> df.filter(
... pl.col("foo").lt(3)
... .and(pl.col("ham").eq(pl.lit("a")))
... )
shape: (1, 3)
┌─────┬─────┬─────┐
│ foo ┆ bar ┆ ham │
│ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ str │
╞═════╪═════╪═════╡
│ 1 ┆ 6 ┆ a │
└─────┴─────┴─────┘
Find the index of a column by name.
Name of the column to find.
Apply a horizontal reduction on a DataFrame.
This can be used to effectively determine aggregations on a row level, and can be applied to any DataType that can be supercasted (casted to a similar parent type).
An example of the supercast rules when applying an arithmetic operation on two DataTypes are for instance:
Series
> // A horizontal sum operation
> let df = pl.DataFrame({
... "a": [2, 1, 3],
... "b": [1, 2, 3],
... "c": [1.0, 2.0, 3.0]
... });
> df.fold((s1, s2) => s1.plus(s2))
Series: 'a' [f64]
[
4
5
9
]
> // A horizontal minimum operation
> df = pl.DataFrame({
... "a": [2, 1, 3],
... "b": [1, 2, 3],
... "c": [1.0, 2.0, 3.0]
... });
> df.fold((s1, s2) => s1.zipWith(s1.lt(s2), s2))
Series: 'a' [f64]
[
1
1
3
]
> // A horizontal string concatenation
> df = pl.DataFrame({
... "a": ["foo", "bar", 2],
... "b": [1, 2, 3],
... "c": [1.0, 2.0, 3.0]
... })
> df.fold((s1, s2) => s1.plus(s2))
Series: '' [str]
[
"foo11"
"bar22"
"233"
]
Get a single column as Series by name.
> const df = pl.DataFrame({
... foo: [1, 2, 3],
... bar: [6, null, 8],
... ham: ["a", "b", "c"],
... });
// df: pl.DataFrame<{
// foo: pl.Series<Float64, "foo">;
// bar: pl.Series<Float64, "bar">;
// ham: pl.Series<Utf8, "ham">;
// }>
> const column = df.getColumn("foo");
// column: pl.Series<Float64, "foo">
> const df = pl.DataFrame({
... foo: [1, 2, 3],
... bar: [6, null, 8],
... ham: ["a", "b", "c"],
... });
// df: pl.DataFrame<{
// foo: pl.Series<Float64, "foo">;
// bar: pl.Series<Float64, "bar">;
// ham: pl.Series<Utf8, "ham">;
// }>
> const columns = df.getColumns();
// columns: (pl.Series<Float64, "foo"> | pl.Series<Float64, "bar"> | pl.Series<Utf8, "ham">)[]
Groups based on a time value (or index value of type Int32, Int64). Time windows are calculated and rows are assigned to windows. Different from a normal groupby is that a row can be member of multiple groups. The time/index window could be seen as a rolling window, with a window size determined by dates/times/values instead of slots in the DataFrame.
A window is defined by:
The every
, period
and offset
arguments are created with
the following string language:
Or combine them: "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
In case of a groupByDynamic on an integer column, the windows are defined by:
Optional
by?: ColumnsOrExprOptional
check_Optional
closed?: Optional
includeOptional
offset?: stringOptional
period?: stringCreate rolling groups based on a time column (or index value of type Int32, Int64).
Different from a rolling groupby the windows are now determined by the individual values and are not of constant intervals. For constant intervals use groupByDynamic
The period
and offset
arguments are created with
the following string language:
Or combine them: "3d12h4m25s" # 3 days, 12 hours, 4 minutes, and 25 seconds
In case of a groupByRolling on an integer column, the windows are defined by:
Optional
by?: ColumnsOrExprOptional
check_Optional
closed?: Optional
offset?: string
>dates = [
... "2020-01-01 13:45:48",
... "2020-01-01 16:42:13",
... "2020-01-01 16:45:09",
... "2020-01-02 18:12:48",
... "2020-01-03 19:45:32",
... "2020-01-08 23:16:43",
... ]
>df = pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]}).withColumn(
... pl.col("dt").str.strptime(pl.Datetime)
... )
>out = df.groupbyRolling({indexColumn:"dt", period:"2d"}).agg(
... [
... pl.sum("a").alias("sum_a"),
... pl.min("a").alias("min_a"),
... pl.max("a").alias("max_a"),
... ]
... )
>assert(out["sum_a"].toArray() === [3, 10, 15, 24, 11, 1])
>assert(out["max_a"].toArray() === [3, 7, 7, 9, 9, 1])
>assert(out["min_a"].toArray() === [3, 3, 3, 3, 2, 1])
>out
shape: (6, 4)
┌─────────────────────┬───────┬───────┬───────┐
│ dt                  ┆ sum_a ┆ max_a ┆ min_a │
│ --- ┆ --- ┆ --- ┆ --- │
│ datetime[ms] ┆ i64 ┆ i64 ┆ i64 │
╞═════════════════════╪═══════╪═══════╪═══════╡
│ 2020-01-01 13:45:48 ┆ 3 ┆ 3 ┆ 3 │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 2020-01-01 16:42:13 ┆ 10 ┆ 7 ┆ 3 │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 2020-01-01 16:45:09 ┆ 15 ┆ 7 ┆ 3 │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 2020-01-02 18:12:48 ┆ 24 ┆ 9 ┆ 3 │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 2020-01-03 19:45:32 ┆ 11 ┆ 9 ┆ 2 │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 2020-01-08 23:16:43 ┆ 1 ┆ 1 ┆ 1 │
└─────────────────────┴───────┴───────┴───────┘
Hash and combine the rows in this DataFrame. (Hash value is UInt64)
Optional
k0: numberseed parameter
Optional
k1: numberseed parameter
Optional
k2: numberseed parameter
Optional
k3: numberseed parameter
Optional
k0?: numberOptional
k1?: numberOptional
k2?: numberOptional
k3?: numberGet first N rows as DataFrame.
Optional
length: numberLength of the head.
> const df = pl.DataFrame({
... "foo": [1, 2, 3, 4, 5],
... "bar": [6, 7, 8, 9, 10],
... "ham": ['a', 'b', 'c', 'd','e']
... });
> df.head(3)
shape: (3, 3)
╭─────┬─────┬─────╮
│ foo ┆ bar ┆ ham │
│ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ str │
╞═════╪═════╪═════╡
│ 1 ┆ 6 ┆ "a" │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
│ 2 ┆ 7 ┆ "b" │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
│ 3 ┆ 8 ┆ "c" │
╰─────┴─────┴─────╯
Return a new DataFrame grown horizontally by stacking multiple Series to it.
> const df = pl.DataFrame({
... "foo": [1, 2, 3],
... "bar": [6, 7, 8],
... "ham": ['a', 'b', 'c']
... });
// df: pl.DataFrame<{
// foo: pl.Series<Float64, "foo">;
// bar: pl.Series<Float64, "bar">;
// ham: pl.Series<Utf8, "ham">;
// }>
> const x = pl.Series("apple", [10, 20, 30])
// x: pl.Series<Float64, "apple">
> df.hstack([x])
// pl.DataFrame<{
// foo: pl.Series<Float64, "foo">;
// bar: pl.Series<Float64, "bar">;
// ham: pl.Series<Utf8, "ham">;
// apple: pl.Series<Float64, "apple">;
// }>
shape: (3, 4)
╭─────┬─────┬─────┬───────╮
│ foo ┆ bar ┆ ham ┆ apple │
│ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ str ┆ i64 │
╞═════╪═════╪═════╪═══════╡
│ 1 ┆ 6 ┆ "a" ┆ 10 │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 2 ┆ 7 ┆ "b" ┆ 20 │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 3 ┆ 8 ┆ "c" ┆ 30 │
╰─────┴─────┴─────┴───────╯
SQL like joins.
> const df = pl.DataFrame({
... "foo": [1, 2, 3],
... "bar": [6.0, 7.0, 8.0],
... "ham": ['a', 'b', 'c']
... });
> const otherDF = pl.DataFrame({
... "apple": ['x', 'y', 'z'],
... "ham": ['a', 'b', 'd']
... });
> df.join(otherDF, {on: 'ham'})
shape: (2, 4)
╭─────┬─────┬─────┬───────╮
│ foo ┆ bar ┆ ham ┆ apple │
│ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ f64 ┆ str ┆ str │
╞═════╪═════╪═════╪═══════╡
│ 1 ┆ 6 ┆ "a" ┆ "x" │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 2 ┆ 7 ┆ "b" ┆ "y" │
╰─────┴─────┴─────┴───────╯
Perform an asof join. This is similar to a left-join except that we match on nearest key rather than equal keys.
Both DataFrames must be sorted by the asofJoin key.
For each row in the left DataFrame:
A "backward" search selects the last row in the right DataFrame whose 'on' key is less than or equal to the left's key.
A "forward" search selects the first row in the right DataFrame whose 'on' key is greater than or equal to the left's key.
A "nearest" search selects the last row in the right DataFrame whose value is nearest to the left's key. String keys are not currently supported for a nearest search.
The default is "backward".
DataFrame to join with.
Optional
allowAllow the physical plan to optionally evaluate the computation of both DataFrames up to the join in parallel.
Optional
by?: string | string[]Optional
byjoin on these columns before doing asof join
Optional
byjoin on these columns before doing asof join
Optional
forceForce the physical plan to evaluate the computation of both DataFrames up to the join in parallel.
Optional
leftJoin column of the left DataFrame.
Optional
on?: stringJoin column of both DataFrames. If set, leftOn
and rightOn
should be undefined.
Optional
rightJoin column of the right DataFrame.
Optional
strategy?: "backward" | "forward" | "nearest"One of 'forward', 'backward', 'nearest'
Optional
suffix?: stringSuffix to append to columns with a duplicate name.
Optional
tolerance?: string | numberNumeric tolerance. By setting this the join will only be done if the near keys are within this distance. If an asof join is done on columns of dtype "Date", "Datetime" you use the following string language:
Or combine them:
> const gdp = pl.DataFrame({
... date: [
... new Date('2016-01-01'),
... new Date('2017-01-01'),
... new Date('2018-01-01'),
... new Date('2019-01-01'),
... ], // note record date: Jan 1st (sorted!)
... gdp: [4164, 4411, 4566, 4696],
... })
> const population = pl.DataFrame({
... date: [
... new Date('2016-05-12'),
... new Date('2017-05-12'),
... new Date('2018-05-12'),
... new Date('2019-05-12'),
... ], // note record date: May 12th (sorted!)
... "population": [82.19, 82.66, 83.12, 83.52],
... })
> population.joinAsof(
... gdp,
... {leftOn:"date", rightOn:"date", strategy:"backward"}
... )
shape: (4, 3)
┌─────────────────────┬────────────┬──────┐
│ date ┆ population ┆ gdp │
│ --- ┆ --- ┆ --- │
│ datetime[μs] ┆ f64 ┆ i64 │
╞═════════════════════╪════════════╪══════╡
│ 2016-05-12 00:00:00 ┆ 82.19 ┆ 4164 │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
│ 2017-05-12 00:00:00 ┆ 82.66 ┆ 4411 │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
│ 2018-05-12 00:00:00 ┆ 83.12 ┆ 4566 │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┤
│ 2019-05-12 00:00:00 ┆ 83.52 ┆ 4696 │
└─────────────────────┴────────────┴──────┘
Aggregate the columns of this DataFrame to their mean value.
Optional
nullStrategy: "ignore" | "propagate"Create a spreadsheet-style pivot table as a DataFrame.
The existing column(s) of values which will be moved under the new columns from index. If an
aggregation is specified, these are the values on which the aggregation will be computed.
If None, all remaining columns not specified on on
and index
will be used.
At least one of index
and values
must be specified.
Optional
aggregateAny of: - "sum" - "max" - "min" - "mean" - "median" - "first" - "last" - "count" Defaults to "first"
The column(s) that remain from the input to the output. The output DataFrame will have one row
for each unique combination of the index
's values.
If None, all remaining columns not specified on on
and values
will be used. At least one
of index
and values
must be specified.
Optional
maintainSort the grouped keys so that the output order is predictable.
The column(s) whose values will be used as the new columns of the output DataFrame.
Optional
separator?: stringUsed as separator/delimiter in generated column names.
Optional
sortSort the transposed columns by name. Default is by order of discovery.
> const df = pl.DataFrame(
... {
... "foo": ["one", "one", "one", "two", "two", "two"],
... "bar": ["A", "B", "C", "A", "B", "C"],
... "baz": [1, 2, 3, 4, 5, 6],
... }
... );
> df.pivot("baz", {index:"foo", on:"bar"});
shape: (2, 4)
┌─────┬─────┬─────┬─────┐
│ foo ┆ A ┆ B ┆ C │
│ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ f64 ┆ f64 ┆ f64 │
╞═════╪═════╪═════╪═════╡
│ one ┆ 1 ┆ 2 ┆ 3 │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
│ two ┆ 4 ┆ 5 ┆ 6 │
└─────┴─────┴─────┴─────┘
Rename column names.
Key value pairs that map from old name to new name.
> const df = pl.DataFrame({
... "foo": [1, 2, 3],
... "bar": [6, 7, 8],
... "ham": ['a', 'b', 'c']
... });
// df: pl.DataFrame<{
// foo: pl.Series<Float64, "foo">;
// bar: pl.Series<Float64, "bar">;
// ham: pl.Series<Utf8, "ham">;
// }>
> df.rename({"foo": "apple"});
╭───────┬─────┬─────╮
│ apple ┆ bar ┆ ham │
│ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ str │
╞═══════╪═════╪═════╡
│ 1 ┆ 6 ┆ "a" │
├╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
│ 2 ┆ 7 ┆ "b" │
├╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
│ 3 ┆ 8 ┆ "c" │
╰───────┴─────┴─────╯
Replace a column at an index location.
typescript cannot encode type mutation, so the type of the DataFrame will be incorrect. cast the type of dataframe manually.
> const df: pl.DataFrame = pl.DataFrame({
... "foo": [1, 2, 3],
... "bar": [6, 7, 8],
... "ham": ['a', 'b', 'c']
... });
// df: pl.DataFrame<{
// foo: pl.Series<Float64, "foo">;
// bar: pl.Series<Float64, "bar">;
// ham: pl.Series<Utf8, "ham">;
// }>
> const x = pl.Series("apple", [10, 20, 30]);
// x: pl.Series<Float64, "apple">
> df.replaceAtIdx(0, x);
// df: pl.DataFrame<{
// foo: pl.Series<Float64, "foo">; <- notice how the type is still the same!
// bar: pl.Series<Float64, "bar">;
// ham: pl.Series<Utf8, "ham">;
// }>
shape: (3, 3)
╭───────┬─────┬─────╮
│ apple ┆ bar ┆ ham │
│ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ str │
╞═══════╪═════╪═════╡
│ 10 ┆ 6 ┆ "a" │
├╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
│ 20 ┆ 7 ┆ "b" │
├╌╌╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
│ 30 ┆ 8 ┆ "c" │
╰───────┴─────┴─────╯
Select columns from this DataFrame.
Rest
...columns: U[]Column or columns to select.
> const df = pl.DataFrame({
... "foo": [1, 2, 3],
... "bar": [6, 7, 8],
... "ham": ['a', 'b', 'c']
... });
// df: pl.DataFrame<{
// foo: pl.Series<Float64, "foo">;
// bar: pl.Series<Float64, "bar">;
// ham: pl.Series<Utf8, "ham">;
// }>
> df.select('foo');
// pl.DataFrame<{
// foo: pl.Series<Float64, "foo">;
// }>
shape: (3, 1)
┌─────┐
│ foo │
│ --- │
│ i64 │
╞═════╡
│ 1 │
├╌╌╌╌╌┤
│ 2 │
├╌╌╌╌╌┤
│ 3 │
└─────┘
Rest
...columns: ExprOrString[]Serializes object to desired format via serde
Shift the values by a given period and fill the parts that will be empty due to this operation
with Nones
.
Number of places to shift (may be negative).
> const df = pl.DataFrame({
... "foo": [1, 2, 3],
... "bar": [6, 7, 8],
... "ham": ['a', 'b', 'c']
... });
> df.shift(1);
shape: (3, 3)
┌──────┬──────┬──────┐
│ foo ┆ bar ┆ ham │
│ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ str │
╞══════╪══════╪══════╡
│ null ┆ null ┆ null │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
│ 1 ┆ 6 ┆ "a" │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
│ 2 ┆ 7 ┆ "b" │
└──────┴──────┴──────┘
> df.shift(-1)
shape: (3, 3)
┌──────┬──────┬──────┐
│ foo ┆ bar ┆ ham │
│ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ str │
╞══════╪══════╪══════╡
│ 2 ┆ 7 ┆ "b" │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
│ 3 ┆ 8 ┆ "c" │
├╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌┤
│ null ┆ null ┆ null │
└──────┴──────┴──────┘
Shift the values by a given period and fill the parts that will be empty due to this operation
with the result of the fill_value
expression.
> const df = pl.DataFrame({
... "foo": [1, 2, 3],
... "bar": [6, 7, 8],
... "ham": ['a', 'b', 'c']
... });
> df.shiftAndFill({n:1, fill_value:0});
shape: (3, 3)
┌─────┬─────┬─────┐
│ foo ┆ bar ┆ ham │
│ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ str │
╞═════╪═════╪═════╡
│ 0 ┆ 0 ┆ "0" │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
│ 1 ┆ 6 ┆ "a" │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
│ 2 ┆ 7 ┆ "b" │
└─────┴─────┴─────┘
Shrink memory usage of this DataFrame to fit the exact capacity needed to hold the data.
Slice this DataFrame over the rows direction.
Length of the slice
Offset index.
> const df = pl.DataFrame({
... "foo": [1, 2, 3],
... "bar": [6.0, 7.0, 8.0],
... "ham": ['a', 'b', 'c']
... });
> df.slice(1, 2); // Alternatively `df.slice({offset:1, length:2})`
shape: (2, 3)
┌─────┬─────┬─────┐
│ foo ┆ bar ┆ ham │
│ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ str │
╞═════╪═════╪═════╡
│ 2 ┆ 7 ┆ "b" │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
│ 3 ┆ 8 ┆ "c" │
└─────┴─────┴─────┘
Sort the DataFrame by column.
Column(s) to sort by. Accepts expression input, including selectors. Strings are parsed as column names.
Optional
descending: booleanSort in descending order. When sorting by multiple columns, can be specified per column by passing a sequence of booleans.
Optional
nullsLast: booleanPlace null values last; can specify a single boolean applying to all columns or a sequence of booleans for per-column control.
Optional
maintainOrder: booleanWhether the order should be maintained if elements are equal.
Optional
maintainOptional
nullsOptional
reverse?: booleanOptional
descending?: booleanOptional
maintainOptional
nullsAggregate the columns of this DataFrame to their mean value.
Optional
nullStrategy: "ignore" | "propagate"Optional
length: number> const df = pl.DataFrame({
... "letters": ["c", "c", "a", "c", "a", "b"],
... "nrs": [1, 2, 3, 4, 5, 6]
... });
> console.log(df.toString());
shape: (6, 2)
╭─────────┬─────╮
│ letters ┆ nrs │
│ --- ┆ --- │
│ str ┆ i64 │
╞═════════╪═════╡
│ "c" ┆ 1 │
├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
│ "c" ┆ 2 │
├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
│ "a" ┆ 3 │
├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
│ "c" ┆ 4 │
├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
│ "a" ┆ 5 │
├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
│ "b" ┆ 6 │
╰─────────┴─────╯
> df.groupby("letters")
... .tail(2)
... .sort("letters")
shape: (5, 2)
╭─────────┬─────╮
│ letters ┆ nrs │
│ --- ┆ --- │
│ str ┆ i64 │
╞═════════╪═════╡
│ "a" ┆ 3 │
├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
│ "a" ┆ 5 │
├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
│ "b" ┆ 6 │
├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
│ "c" ┆ 2 │
├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌┤
│ "c" ┆ 4 │
╰─────────┴─────╯
Transpose a DataFrame over the diagonal.
Optional
options: { Optional
columnOptional generator/iterator that yields column names. Will be used to replace the columns in the DataFrame.
Optional
headerIf includeHeader
is set, this determines the name of the column that will be inserted
Optional
includeIf set, the column names will be added as first column.
> const df = pl.DataFrame({"a": [1, 2, 3], "b": [1, 2, 3]});
> df.transpose({includeHeader:true})
shape: (2, 4)
┌────────┬──────────┬──────────┬──────────┐
│ column ┆ column_0 ┆ column_1 ┆ column_2 │
│ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ i64 ┆ i64 ┆ i64 │
╞════════╪══════════╪══════════╪══════════╡
│ a ┆ 1 ┆ 2 ┆ 3 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ b ┆ 1 ┆ 2 ┆ 3 │
└────────┴──────────┴──────────┴──────────┘
// replace the auto generated column names with a list
> df.transpose({includeHeader:false, columnNames:["a", "b", "c"]})
shape: (2, 3)
┌─────┬─────┬─────┐
│ a ┆ b ┆ c │
│ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ i64 │
╞═════╪═════╪═════╡
│ 1 ┆ 2 ┆ 3 │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
│ 1 ┆ 2 ┆ 3 │
└─────┴─────┴─────┘
// Include the header as a separate column
> df.transpose({
... includeHeader:true,
... headerName:"foo",
... columnNames:["a", "b", "c"]
... })
shape: (2, 4)
┌─────┬─────┬─────┬─────┐
│ foo ┆ a ┆ b ┆ c │
│ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ i64 ┆ i64 ┆ i64 │
╞═════╪═════╪═════╪═════╡
│ a ┆ 1 ┆ 2 ┆ 3 │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
│ b ┆ 1 ┆ 2 ┆ 3 │
└─────┴─────┴─────┴─────┘
// Replace the auto generated column with column names from a generator function
> function *namesGenerator() {
... const baseName = "my_column_";
... let count = 0;
...   let name = `${baseName}${count}`;
... count++;
... yield name;
... }
> df.transpose({includeHeader:false, columnNames:namesGenerator})
shape: (2, 3)
┌─────────────┬─────────────┬─────────────┐
│ my_column_0 ┆ my_column_1 ┆ my_column_2 │
│ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ i64 │
╞═════════════╪═════════════╪═════════════╡
│ 1 ┆ 2 ┆ 3 │
├╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 1 ┆ 2 ┆ 3 │
└─────────────┴─────────────┴─────────────┘
Drop duplicate rows from this DataFrame.
Note that this fails if there is a column of type List
in the DataFrame.
Optional
maintainOrder: booleanOptional
subset: ColumnSelectionsubset to drop duplicates for
Optional
keep: "first" | "last""first" | "last"
Optional
keep?: "first" | "last"Optional
maintainOptional
subset?: ColumnSelectionDecompose a struct into its fields. The fields will be inserted in to the DataFrame
on the
location of the struct
type.
Names of the struct columns that will be decomposed by its fields
> const df = pl.DataFrame({
... "int": [1, 2],
... "str": ["a", "b"],
... "bool": [true, null],
... "list": [[1, 2], [3]],
... })
... .toStruct("my_struct")
... .toFrame();
> df
shape: (2, 1)
┌─────────────────────────────┐
│ my_struct │
│ --- │
│ struct[4]{'int',...,'list'} │
╞═════════════════════════════╡
│ {1,"a",true,[1, 2]} │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ {2,"b",null,[3]} │
└─────────────────────────────┘
> df.unnest("my_struct")
shape: (2, 4)
┌─────┬─────┬──────┬────────────┐
│ int ┆ str ┆ bool ┆ list │
│ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ str ┆ bool ┆ list [i64] │
╞═════╪═════╪══════╪════════════╡
│ 1 ┆ a ┆ true ┆ [1, 2] │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌┤
│ 2 ┆ b ┆ null ┆ [3] │
└─────┴─────┴──────┴────────────┘
Unpivot a DataFrame from wide to long format.
Columns to use as identifier variables.
Values to use as value variables.
> const df1 = pl.DataFrame({
... 'id': [1],
... 'asset_key_1': ['123'],
... 'asset_key_2': ['456'],
... 'asset_key_3': ['abc'],
... });
> df1.unpivot('id', ['asset_key_1', 'asset_key_2', 'asset_key_3']);
shape: (3, 3)
┌─────┬─────────────┬───────┐
│ id ┆ variable ┆ value │
│ --- ┆ --- ┆ --- │
│ f64 ┆ str ┆ str │
╞═════╪═════════════╪═══════╡
│ 1 ┆ asset_key_1 ┆ 123 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 1 ┆ asset_key_2 ┆ 456 │
├╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 1 ┆ asset_key_3 ┆ abc │
└─────┴─────────────┴───────┘
Upsample a DataFrame at a regular frequency.
The every
and offset
arguments are created with the following string language:
Or combine them:
By "calendar day", we mean the corresponding time on the next day (which may not be 24 hours, due to daylight savings). Similarly for "calendar week", "calendar month", "calendar quarter", and "calendar year".
Time column will be used to determine a date range. Note that this column has to be sorted for the output to make sense.
Interval will start 'every' duration.
Optional
by: string | string[]First group by these columns and then upsample for every group.
Optional
maintainOrder: booleanKeep the ordering predictable. This is slower.
DataFrame
Result will be sorted by timeColumn
(but note that if by
columns are passed, it will only be sorted within each by
group).
Upsample a DataFrame by a certain interval.
const df = pl.DataFrame({ "date": [ new Date(2024, 1, 1), new Date(2024, 3, 1), new Date(2024, 4, 1), new Date(2024, 5, 1), ], "groups": ["A", "B", "A", "B"], "values": [0, 1, 2, 3], }) .withColumn(pl.col("date").cast(pl.Date).alias("date")) .sort("date");
df.upsample({timeColumn: "date", every: "1mo", by: "groups", maintainOrder: true}) .select(pl.col("*").forwardFill()); shape: (7, 3) ┌────────────┬────────┬────────┐ │ date ┆ groups ┆ values │ │ --- ┆ --- ┆ --- │ │ date ┆ str ┆ f64 │ ╞════════════╪════════╪════════╡ │ 2024-02-01 ┆ A ┆ 0.0 │ │ 2024-03-01 ┆ A ┆ 0.0 │ │ 2024-04-01 ┆ A ┆ 0.0 │ │ 2024-05-01 ┆ A ┆ 2.0 │ │ 2024-04-01 ┆ B ┆ 1.0 │ │ 2024-05-01 ┆ B ┆ 1.0 │ │ 2024-06-01 ┆ B ┆ 3.0 │ └────────────┴────────┴────────┘
Optional
by?: string | string[]Optional
maintainGrow this DataFrame vertically by stacking a DataFrame to it.
> const df1 = pl.DataFrame({
... "foo": [1, 2],
... "bar": [6, 7],
... "ham": ['a', 'b']
... });
> const df2 = pl.DataFrame({
... "foo": [3, 4],
... "bar": [8 , 9],
... "ham": ['c', 'd']
... });
> df1.vstack(df2);
shape: (4, 3)
╭─────┬─────┬─────╮
│ foo ┆ bar ┆ ham │
│ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ str │
╞═════╪═════╪═════╡
│ 1 ┆ 6 ┆ "a" │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
│ 2 ┆ 7 ┆ "b" │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
│ 3 ┆ 8 ┆ "c" │
├╌╌╌╌╌┼╌╌╌╌╌┼╌╌╌╌╌┤
│ 4 ┆ 9 ┆ "d" │
╰─────┴─────┴─────╯
Return a new DataFrame with the column added or replaced.
Series, where the name of the Series refers to the column in the DataFrame.
Return a new DataFrame with the column renamed.
A DataFrame is a two-dimensional data structure that represents data as a table with rows and columns.
Param: data
Object, Array, or Series Two-dimensional data in various forms. object must contain Arrays. Array may contain Series or other Arrays.
Param: columns
Array of str, default undefined Column labels to use for resulting DataFrame. If specified, overrides any labels already present in the data. Must match data dimensions.
Param: orient
'col' | 'row' default undefined Whether to interpret two-dimensional data as columns or as rows. If None, the orientation is inferred by matching the columns and data dimensions. If this does not yield conclusive results, column orientation is used.
Example
Constructing a DataFrame from an object :
Notice that the dtype is automatically inferred as a polars Int64:
In order to specify dtypes for your columns, initialize the DataFrame with a list of Series instead:
Constructing a DataFrame from a list of lists, row orientation inferred: