Expressions

df.select(pl.col("*"))

let out = df.clone().lazy().select([col("*")]).collect()?;
println!("{}", out);

shape: (8, 4)
┌─────┬──────────┬─────────────────────┬───────┐
│ a   ┆ b        ┆ c                   ┆ d     │
│ --- ┆ ---      ┆ ---                 ┆ ---   │
│ i64 ┆ f64      ┆ datetime[μs]        ┆ f64   │
╞═════╪══════════╪═════════════════════╪═══════╡
│ 0   ┆ 0.267969 ┆ 2022-12-01 00:00:00 ┆ 1.0   │
│ 1   ┆ 0.900438 ┆ 2022-12-02 00:00:00 ┆ 2.0   │
│ 2   ┆ 0.405714 ┆ 2022-12-03 00:00:00 ┆ NaN   │
│ 3   ┆ 0.706133 ┆ 2022-12-04 00:00:00 ┆ NaN   │
│ 4   ┆ 0.394016 ┆ 2022-12-05 00:00:00 ┆ 0.0   │
│ 5   ┆ 0.500425 ┆ 2022-12-06 00:00:00 ┆ -5.0  │
│ 6   ┆ 0.493089 ┆ 2022-12-07 00:00:00 ┆ -42.0 │
│ 7   ┆ 0.187685 ┆ 2022-12-08 00:00:00 ┆ null  │
└─────┴──────────┴─────────────────────┴───────┘

You can also specify the specific columns that you want to return. There are two ways to do this. The first option is to create a list of column names, as seen below.

Python Rust

df.select(pl.col(["a", "b"]))

let out = df.clone().lazy().select([col("a"), col("b")]).collect()?;
println!("{}", out);

shape: (8, 2)
┌─────┬──────────┐
│ a   ┆ b        │
│ --- ┆ ---      │
│ i64 ┆ f64      │
╞═════╪══════════╡
│ 0   ┆ 0.267969 │
│ 1   ┆ 0.900438 │
│ 2   ┆ 0.405714 │
│ 3   ┆ 0.706133 │
│ 4   ┆ 0.394016 │
│ 5   ┆ 0.500425 │
│ 6   ┆ 0.493089 │
│ 7   ┆ 0.187685 │
└─────┴──────────┘

The second option is to specify each column within a list in the select statement. This option is shown below.

Python Rust

df.select([pl.col("a"), pl.col("b")]).limit(3)

let out = df
    .clone()
    .lazy()
    .select([col("a"), col("b")])
    .limit(3)
    .collect()?;
println!("{}", out);

shape: (3, 2)
┌─────┬──────────┐
│ a   ┆ b        │
│ --- ┆ ---      │
│ i64 ┆ f64      │
╞═════╪══════════╡
│ 0   ┆ 0.267969 │
│ 1   ┆ 0.900438 │
│ 2   ┆ 0.405714 │
└─────┴──────────┘

If you want to exclude an entire column from your view, you can simply use exclude in your select statement.

Python Rust

df.select([pl.exclude("a")])

let out = df
    .clone()
    .lazy()
    .select([col("*").exclude(["a"])])
    .collect()?;
println!("{}", out);

shape: (8, 3)
┌──────────┬─────────────────────┬───────┐
│ b        ┆ c                   ┆ d     │
│ ---      ┆ ---                 ┆ ---   │
│ f64      ┆ datetime[μs]        ┆ f64   │
╞══════════╪═════════════════════╪═══════╡
│ 0.267969 ┆ 2022-12-01 00:00:00 ┆ 1.0   │
│ 0.900438 ┆ 2022-12-02 00:00:00 ┆ 2.0   │
│ 0.405714 ┆ 2022-12-03 00:00:00 ┆ NaN   │
│ 0.706133 ┆ 2022-12-04 00:00:00 ┆ NaN   │
│ 0.394016 ┆ 2022-12-05 00:00:00 ┆ 0.0   │
│ 0.500425 ┆ 2022-12-06 00:00:00 ┆ -5.0  │
│ 0.493089 ┆ 2022-12-07 00:00:00 ┆ -42.0 │
│ 0.187685 ┆ 2022-12-08 00:00:00 ┆ null  │
└──────────┴─────────────────────┴───────┘

Filter

The filter option allows us to create a subset of the DataFrame. We use the same DataFrame as earlier and we filter between two specified dates.

Python Rust

df.filter(
    pl.col("c").is_between(datetime(2022, 12, 2), datetime(2022, 12, 8)),
)

let start_date = NaiveDate::from_ymd_opt(2022, 12, 2)
    .unwrap()
    .and_hms_opt(0, 0, 0)
    .unwrap();
let end_date = NaiveDate::from_ymd_opt(2022, 12, 8)
    .unwrap()
    .and_hms_opt(0, 0, 0)
    .unwrap();
let out = df
    .clone()
    .lazy()
    .filter(
        col("c")
            .gt_eq(lit(start_date))
            .and(col("c").lt_eq(lit(end_date))),
    )
    .collect()?;
println!("{}", out);

shape: (7, 4)
┌─────┬──────────┬─────────────────────┬───────┐
│ a   ┆ b        ┆ c                   ┆ d     │
│ --- ┆ ---      ┆ ---                 ┆ ---   │
│ i64 ┆ f64      ┆ datetime[μs]        ┆ f64   │
╞═════╪══════════╪═════════════════════╪═══════╡
│ 1   ┆ 0.900438 ┆ 2022-12-02 00:00:00 ┆ 2.0   │
│ 2   ┆ 0.405714 ┆ 2022-12-03 00:00:00 ┆ NaN   │
│ 3   ┆ 0.706133 ┆ 2022-12-04 00:00:00 ┆ NaN   │
│ 4   ┆ 0.394016 ┆ 2022-12-05 00:00:00 ┆ 0.0   │
│ 5   ┆ 0.500425 ┆ 2022-12-06 00:00:00 ┆ -5.0  │
│ 6   ┆ 0.493089 ┆ 2022-12-07 00:00:00 ┆ -42.0 │
│ 7   ┆ 0.187685 ┆ 2022-12-08 00:00:00 ┆ null  │
└─────┴──────────┴─────────────────────┴───────┘

With filter you can also create more complex filters that include multiple columns.

Python Rust

df.filter((pl.col("a") <= 3) & (pl.col("d").is_not_nan()))

let out = df
    .clone()
    .lazy()
    .filter(col("a").lt_eq(3).and(col("d").is_not_null()))
    .collect()?;
println!("{}", out);

shape: (2, 4)
┌─────┬──────────┬─────────────────────┬─────┐
│ a   ┆ b        ┆ c                   ┆ d   │
│ --- ┆ ---      ┆ ---                 ┆ --- │
│ i64 ┆ f64      ┆ datetime[μs]        ┆ f64 │
╞═════╪══════════╪═════════════════════╪═════╡
│ 0   ┆ 0.267969 ┆ 2022-12-01 00:00:00 ┆ 1.0 │
│ 1   ┆ 0.900438 ┆ 2022-12-02 00:00:00 ┆ 2.0 │
└─────┴──────────┴─────────────────────┴─────┘

With_columns

with_columns allows you to create new columns for your analyses. We create two new columns e and b+42. First we sum all values from column b and store the results in column e. After that we add 42 to the values of b. Creating a new column b+42 to store these results.

Python Rust

with_columns

df.with_columns([pl.col("b").sum().alias("e"), (pl.col("b") + 42).alias("b+42")])

with_columns

let out = df
    .clone()
    .lazy()
    .with_columns([
        col("b").sum().alias("e"),
        (col("b") + lit(42)).alias("b+42"),
    ])
    .collect()?;
println!("{}", out);

shape: (8, 6)
┌─────┬──────────┬─────────────────────┬───────┬─────────┬───────────┐
│ a   ┆ b        ┆ c                   ┆ d     ┆ e       ┆ b+42      │
│ --- ┆ ---      ┆ ---                 ┆ ---   ┆ ---     ┆ ---       │
│ i64 ┆ f64      ┆ datetime[μs]        ┆ f64   ┆ f64     ┆ f64       │
╞═════╪══════════╪═════════════════════╪═══════╪═════════╪═══════════╡
│ 0   ┆ 0.267969 ┆ 2022-12-01 00:00:00 ┆ 1.0   ┆ 3.85547 ┆ 42.267969 │
│ 1   ┆ 0.900438 ┆ 2022-12-02 00:00:00 ┆ 2.0   ┆ 3.85547 ┆ 42.900438 │
│ 2   ┆ 0.405714 ┆ 2022-12-03 00:00:00 ┆ NaN   ┆ 3.85547 ┆ 42.405714 │
│ 3   ┆ 0.706133 ┆ 2022-12-04 00:00:00 ┆ NaN   ┆ 3.85547 ┆ 42.706133 │
│ 4   ┆ 0.394016 ┆ 2022-12-05 00:00:00 ┆ 0.0   ┆ 3.85547 ┆ 42.394016 │
│ 5   ┆ 0.500425 ┆ 2022-12-06 00:00:00 ┆ -5.0  ┆ 3.85547 ┆ 42.500425 │
│ 6   ┆ 0.493089 ┆ 2022-12-07 00:00:00 ┆ -42.0 ┆ 3.85547 ┆ 42.493089 │
│ 7   ┆ 0.187685 ┆ 2022-12-08 00:00:00 ┆ null  ┆ 3.85547 ┆ 42.187685 │
└─────┴──────────┴─────────────────────┴───────┴─────────┴───────────┘

Group by

We will create a new DataFrame for the Group by functionality. This new DataFrame will include several 'groups' that we want to group by.

Python Rust

DataFrame

df2 = pl.DataFrame(
    {
        "x": np.arange(0, 8),
        "y": ["A", "A", "A", "B", "B", "C", "X", "X"],
    }
)

DataFrame

let df2: DataFrame = df!("x" => 0..8,
    "y"=> &["A", "A", "A", "B", "B", "C", "X", "X"],
)
.expect("should not fail");
println!("{}", df2);

shape: (8, 2)
┌─────┬─────┐
│ x   ┆ y   │
│ --- ┆ --- │
│ i64 ┆ str │
╞═════╪═════╡
│ 0   ┆ A   │
│ 1   ┆ A   │
│ 2   ┆ A   │
│ 3   ┆ B   │
│ 4   ┆ B   │
│ 5   ┆ C   │
│ 6   ┆ X   │
│ 7   ┆ X   │
└─────┴─────┘

Python Rust

df2.group_by("y", maintain_order=True).count()

let out = df2
    .clone()
    .lazy()
    .group_by(["y"])
    .agg([count()])
    .collect()?;
println!("{}", out);

shape: (4, 2)
┌─────┬───────┐
│ y   ┆ count │
│ --- ┆ ---   │
│ str ┆ u32   │
╞═════╪═══════╡
│ A   ┆ 3     │
│ B   ┆ 2     │
│ C   ┆ 1     │
│ X   ┆ 2     │
└─────┴───────┘

Python Rust

df2.group_by("y", maintain_order=True).agg(
    [
        pl.col("*").count().alias("count"),
        pl.col("*").sum().alias("sum"),
    ]
)

let out = df2
    .clone()
    .lazy()
    .group_by(["y"])
    .agg([col("*").count().alias("count"), col("*").sum().alias("sum")])
    .collect()?;
println!("{}", out);

shape: (4, 3)
┌─────┬───────┬─────┐
│ y   ┆ count ┆ sum │
│ --- ┆ ---   ┆ --- │
│ str ┆ u32   ┆ i64 │
╞═════╪═══════╪═════╡
│ A   ┆ 3     ┆ 3   │
│ B   ┆ 2     ┆ 7   │
│ C   ┆ 1     ┆ 5   │
│ X   ┆ 2     ┆ 13  │
└─────┴───────┴─────┘

Combining operations

Below are some examples on how to combine operations to create the DataFrame you require.

Python Rust

df_x = df.with_columns((pl.col("a") * pl.col("b")).alias("a * b")).select(
    [pl.all().exclude(["c", "d"])]
)

print(df_x)

let out = df
    .clone()
    .lazy()
    .with_columns([(col("a") * col("b")).alias("a * b")])
    .select([col("*").exclude(["c", "d"])])
    .collect()?;
println!("{}", out);

shape: (8, 3)
┌─────┬──────────┬──────────┐
│ a   ┆ b        ┆ a * b    │
│ --- ┆ ---      ┆ ---      │
│ i64 ┆ f64      ┆ f64      │
╞═════╪══════════╪══════════╡
│ 0   ┆ 0.267969 ┆ 0.0      │
│ 1   ┆ 0.900438 ┆ 0.900438 │
│ 2   ┆ 0.405714 ┆ 0.811429 │
│ 3   ┆ 0.706133 ┆ 2.118399 │
│ 4   ┆ 0.394016 ┆ 1.576064 │
│ 5   ┆ 0.500425 ┆ 2.502127 │
│ 6   ┆ 0.493089 ┆ 2.958534 │
│ 7   ┆ 0.187685 ┆ 1.313797 │
└─────┴──────────┴──────────┘

Python Rust

df_y = df.with_columns([(pl.col("a") * pl.col("b")).alias("a * b")]).select(
    [pl.all().exclude("d")]
)

print(df_y)