Casting
Casting converts the underlying DataType of a column to a new one. Polars uses Arrow to manage the data in memory and relies on the compute kernels in the Rust implementation to do the conversion. Casting is available with the cast() method.
The cast method includes a strict parameter that determines how Polars behaves when it encounters a value that can't be converted from the source DataType to the target DataType. By default, strict=True, which means that Polars will throw an error to notify the user of the failed conversion and provide details on the values that couldn't be cast. On the other hand, if strict=False, any values that can't be converted to the target DataType will be quietly converted to null.
Numerics
Let's take a look at the following DataFrame which contains both integers and floating point numbers.
# Sample frame: two integer columns (one holding large values) and two
# float columns (one with fractional parts).
df = pl.DataFrame(
    {
        "integers": [1, 2, 3, 4, 5],
        "big_integers": [1, 10000002, 3, 10000004, 10000005],
        "floats": [4.0, 5.0, 6.0, 7.0, 8.0],
        "floats_with_decimal": [4.532, 5.5, 6.5, 7.5, 8.5],
    }
)
print(df)
// Sample frame: two integer columns (one holding large values) and two
// float columns (one with fractional parts).
let df = df!(
    "integers" => &[1, 2, 3, 4, 5],
    "big_integers" => &[1, 10000002, 3, 10000004, 10000005],
    "floats" => &[4.0, 5.0, 6.0, 7.0, 8.0],
    "floats_with_decimal" => &[4.532, 5.5, 6.5, 7.5, 8.5],
)?;
println!("{}", &df);
shape: (5, 4)
┌──────────┬──────────────┬────────┬─────────────────────┐
│ integers ┆ big_integers ┆ floats ┆ floats_with_decimal │
│ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ f64 ┆ f64 │
╞══════════╪══════════════╪════════╪═════════════════════╡
│ 1 ┆ 1 ┆ 4.0 ┆ 4.532 │
│ 2 ┆ 10000002 ┆ 5.0 ┆ 5.5 │
│ 3 ┆ 3 ┆ 6.0 ┆ 6.5 │
│ 4 ┆ 10000004 ┆ 7.0 ┆ 7.5 │
│ 5 ┆ 10000005 ┆ 8.0 ┆ 8.5 │
└──────────┴──────────────┴────────┴─────────────────────┘
To perform casting operations between floats and integers, or vice versa, we can invoke the cast() function.
# Cast the integer column to Float32 and both float columns to Int32,
# giving each result a descriptive name.
out = df.select(
    pl.col("integers").cast(pl.Float32).alias("integers_as_floats"),
    pl.col("floats").cast(pl.Int32).alias("floats_as_integers"),
    pl.col("floats_with_decimal")
    .cast(pl.Int32)
    .alias("floats_with_decimal_as_integers"),
)
print(out)
// Cast the integer column to Float32 and both float columns to Int32,
// giving each result a descriptive name.
let out = df
    .clone()
    .lazy()
    .select([
        col("integers").cast(DataType::Float32).alias("integers_as_floats"),
        col("floats").cast(DataType::Int32).alias("floats_as_integers"),
        col("floats_with_decimal")
            .cast(DataType::Int32)
            .alias("floats_with_decimal_as_integers"),
    ])
    .collect()?;
println!("{}", &out);
shape: (5, 3)
┌────────────────────┬────────────────────┬─────────────────────────────────┐
│ integers_as_floats ┆ floats_as_integers ┆ floats_with_decimal_as_integers │
│ --- ┆ --- ┆ --- │
│ f32 ┆ i32 ┆ i32 │
╞════════════════════╪════════════════════╪═════════════════════════════════╡
│ 1.0 ┆ 4 ┆ 4 │
│ 2.0 ┆ 5 ┆ 5 │
│ 3.0 ┆ 6 ┆ 6 │
│ 4.0 ┆ 7 ┆ 7 │
│ 5.0 ┆ 8 ┆ 8 │
└────────────────────┴────────────────────┴─────────────────────────────────┘
Note that floating point values are truncated when casting to an integer: the fractional part is dropped, which for the positive values shown here is the same as rounding down.
Downcast
Reducing the memory footprint is also achievable by modifying the number of bits allocated to an element. As an illustration, the code below demonstrates how casting from Int64 to Int16 and from Float64 to Float32 can be used to lower memory usage.
# Downcast to narrower types to save memory: the i64 column goes to Int16
# and the f64 column goes to Float32.
out = df.select(
    pl.col("integers").cast(pl.Int16).alias("integers_smallfootprint"),
    pl.col("floats").cast(pl.Float32).alias("floats_smallfootprint"),
)
print(out)
// Downcast to narrower types to save memory: the i64 column goes to Int16
// and the f64 column goes to Float32.
let out = df
    .clone()
    .lazy()
    .select([
        col("integers").cast(DataType::Int16).alias("integers_smallfootprint"),
        col("floats").cast(DataType::Float32).alias("floats_smallfootprint"),
    ])
    .collect();
// Print either the resulting frame or the error, without propagating it.
match out {
    Err(e) => println!("{:?}", e),
    Ok(frame) => println!("{}", &frame),
};
shape: (5, 2)
┌─────────────────────────┬───────────────────────┐
│ integers_smallfootprint ┆ floats_smallfootprint │
│ --- ┆ --- │
│ i16 ┆ f32 │
╞═════════════════════════╪═══════════════════════╡
│ 1 ┆ 4.0 │
│ 2 ┆ 5.0 │
│ 3 ┆ 6.0 │
│ 4 ┆ 7.0 │
│ 5 ┆ 8.0 │
└─────────────────────────┴───────────────────────┘
Overflow
When performing downcasting, it is crucial to ensure that the chosen number of bits (such as 64, 32, or 16) is sufficient to accommodate the largest and smallest numbers in the column. For example, using a 32-bit signed integer (Int32) allows handling integers within the range of -2147483648 to +2147483647, while using Int8 covers integers between -128 and 127. Attempting to cast to a DataType that is too small will result in a ComputeError thrown by Polars, as the operation is not supported.
# Int8 cannot hold the large values in "big_integers", so the default
# (strict) cast raises. The error is printed rather than propagated so the
# example can show its message.
try:
    out = df.select(pl.col("big_integers").cast(pl.Int8))
    print(out)
except Exception as e:
    print(e)
// `strict_cast` fails when a value does not fit the target type; Int8
// cannot hold the large values in "big_integers".
let out = df
    .clone()
    .lazy()
    .select([col("big_integers").strict_cast(DataType::Int8)])
    .collect();
// Print either the resulting frame or the error, without propagating it.
match out {
    Err(e) => println!("{:?}", e),
    Ok(frame) => println!("{}", &frame),
};
strict conversion from `i64` to `i8` failed for column: big_integers, value(s) [10000002, 10000004, 10000005]; if you were trying to cast Utf8 to temporal dtypes, consider using `strptime`
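You can set the strict parameter to False; overflowing values are then converted to null instead of raising an error. In the Rust API the same behaviour is obtained by calling cast (non-strict) instead of strict_cast.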
# Non-strict cast: values that do not fit in Int8 become null instead of
# raising an error.
out = df.select(
    pl.col("big_integers").cast(pl.Int8, strict=False),
)
print(out)
// Plain `cast` (as opposed to `strict_cast`): values that do not fit in
// Int8 come back as null instead of producing an error.
let out = df
    .clone()
    .lazy()
    .select([col("big_integers").cast(DataType::Int8)])
    .collect();
// Print either the resulting frame or the error, without propagating it.
match out {
    Err(e) => println!("{:?}", e),
    Ok(frame) => println!("{}", &frame),
};
shape: (5, 1)
┌──────────────┐
│ big_integers │
│ --- │
│ i8 │
╞══════════════╡
│ 1 │
│ null │
│ 3 │
│ null │
│ null │
└──────────────┘
Strings
Strings can be cast to numerical data types and vice versa:
# Two numeric columns plus a column of floats stored as text.
df = pl.DataFrame(
    {
        "integers": [1, 2, 3, 4, 5],
        "float": [4.0, 5.03, 6.0, 7.0, 8.0],
        "floats_as_string": ["4.0", "5.0", "6.0", "7.0", "8.0"],
    }
)

# Numbers -> strings (Utf8), and numeric strings -> Float64.
out = df.select(
    pl.col("integers").cast(pl.Utf8),
    pl.col("float").cast(pl.Utf8),
    pl.col("floats_as_string").cast(pl.Float64),
)
print(out)
// Two numeric columns plus a column of floats stored as text.
let df = df!(
    "integers" => &[1, 2, 3, 4, 5],
    "float" => &[4.0, 5.03, 6.0, 7.0, 8.0],
    "floats_as_string" => &["4.0", "5.0", "6.0", "7.0", "8.0"],
)?;

// Numbers -> strings (Utf8), and numeric strings -> Float64.
let out = df
    .clone()
    .lazy()
    .select([
        col("integers").cast(DataType::Utf8),
        col("float").cast(DataType::Utf8),
        col("floats_as_string").cast(DataType::Float64),
    ])
    .collect()?;
println!("{}", &out);
shape: (5, 3)
┌──────────┬───────┬──────────────────┐
│ integers ┆ float ┆ floats_as_string │
│ --- ┆ --- ┆ --- │
│ str ┆ str ┆ f64 │
╞══════════╪═══════╪══════════════════╡
│ 1 ┆ 4.0 ┆ 4.0 │
│ 2 ┆ 5.03 ┆ 5.0 │
│ 3 ┆ 6.0 ┆ 6.0 │
│ 4 ┆ 7.0 ┆ 7.0 │
│ 5 ┆ 8.0 ┆ 8.0 │
└──────────┴───────┴──────────────────┘
If the column contains a non-numerical value, Polars will throw a ComputeError detailing the conversion error. Setting strict=False will convert the non-float value to null.
# One entry is not a valid number, so the default (strict) cast to Float64
# raises. The error is printed rather than propagated so the example can
# show its message.
df = pl.DataFrame({"strings_not_float": ["4.0", "not_a_number", "6.0", "7.0", "8.0"]})
try:
    out = df.select(pl.col("strings_not_float").cast(pl.Float64))
    print(out)
except Exception as e:
    print(e)
// One entry is not a valid number.
let df = df!("strings_not_float" => ["4.0", "not_a_number", "6.0", "7.0", "8.0"])?;

// Use `strict_cast` so the unparsable value produces the conversion error
// shown below; plain `cast` is the non-strict variant and would yield a
// null for that row instead of failing.
let out = df
    .clone()
    .lazy()
    .select([col("strings_not_float").strict_cast(DataType::Float64)])
    .collect();
// Print either the resulting frame or the error, without propagating it.
match out {
    Ok(out) => println!("{}", &out),
    Err(e) => println!("{:?}", e),
};
strict conversion from `str` to `f64` failed for column: strings_not_float, value(s) ["not_a_number"]; if you were trying to cast Utf8 to temporal dtypes, consider using `strptime`
Booleans
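Booleans can be expressed as either 1 (True) or 0 (False). It's possible to perform casting operations between a numerical DataType and a boolean, and vice versa. When casting a number to a boolean, 0 becomes false and any other value (including negative numbers) becomes true, as the example below shows. However, keep in mind that casting from a string (Utf8) to a boolean is not permitted.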
# Integer and float columns that include zero, plus a boolean column.
df = pl.DataFrame(
    {
        "integers": [-1, 0, 2, 3, 4],
        "floats": [0.0, 1.0, 2.0, 3.0, 4.0],
        "bools": [True, False, True, False, True],
    }
)

# Cast both numeric columns to Boolean.
out = df.select(
    pl.col("integers").cast(pl.Boolean),
    pl.col("floats").cast(pl.Boolean),
)
print(out)
// Integer and float columns that include zero, plus a boolean column.
let df = df!(
    "integers" => &[-1, 0, 2, 3, 4],
    "floats" => &[0.0, 1.0, 2.0, 3.0, 4.0],
    "bools" => &[true, false, true, false, true],
)?;

// Cast both numeric columns to Boolean.
let out = df
    .clone()
    .lazy()
    .select([
        col("integers").cast(DataType::Boolean),
        col("floats").cast(DataType::Boolean),
    ])
    .collect()?;
println!("{}", &out);
shape: (5, 2)
┌──────────┬────────┐
│ integers ┆ floats │
│ --- ┆ --- │
│ bool ┆ bool │
╞══════════╪════════╡
│ true ┆ false │
│ false ┆ true │
│ true ┆ true │
│ true ┆ true │
│ true ┆ true │
└──────────┴────────┘
Dates
Temporal data types such as Date or Datetime are represented as the number of days (Date) and microseconds (Datetime) since epoch. Therefore, casting between the numerical types and the temporal data types is allowed.
from datetime import date, datetime

# The same five consecutive days, once as a Date column and once as a
# Datetime column.
df = pl.DataFrame(
    {
        "date": pl.date_range(date(2022, 1, 1), date(2022, 1, 5), eager=True),
        "datetime": pl.date_range(
            datetime(2022, 1, 1), datetime(2022, 1, 5), eager=True
        ),
    }
)

# Casting to Int64 exposes the underlying numeric representation.
out = df.select(
    pl.col("date").cast(pl.Int64),
    pl.col("datetime").cast(pl.Int64),
)
print(out)
use chrono::prelude::*;
use polars::time::*;

// Midnight on the first and last day of the range.
let start = NaiveDate::from_ymd_opt(2022, 1, 1).unwrap().and_hms_opt(0, 0, 0).unwrap();
let end = NaiveDate::from_ymd_opt(2022, 1, 5).unwrap().and_hms_opt(0, 0, 0).unwrap();

// The same five consecutive days, once cast to Date and once left as the
// datetime range produced by `date_range`.
let df = df!(
    "date" => date_range(
        "date",
        start,
        end,
        Duration::parse("1d"),
        ClosedWindow::Both,
        TimeUnit::Milliseconds,
        None,
    )?
    .cast(&DataType::Date)?,
    "datetime" => date_range(
        "datetime",
        start,
        end,
        Duration::parse("1d"),
        ClosedWindow::Both,
        TimeUnit::Milliseconds,
        None,
    )?,
)?;

// Casting to Int64 exposes the underlying numeric representation.
let out = df
    .clone()
    .lazy()
    .select([
        col("date").cast(DataType::Int64),
        col("datetime").cast(DataType::Int64),
    ])
    .collect()?;
println!("{}", &out);
shape: (5, 2)
┌───────┬──────────────────┐
│ date ┆ datetime │
│ --- ┆ --- │
│ i64 ┆ i64 │
╞═══════╪══════════════════╡
│ 18993 ┆ 1640995200000000 │
│ 18994 ┆ 1641081600000000 │
│ 18995 ┆ 1641168000000000 │
│ 18996 ┆ 1641254400000000 │
│ 18997 ┆ 1641340800000000 │
└───────┴──────────────────┘
To perform casting operations between strings and Dates/Datetimes, strftime and strptime are used. Polars adopts the chrono format syntax for formatting. It's worth noting that strptime features additional options that support timezone functionality. Refer to the API documentation for further information.
# A Date column and a column holding the same days as ISO-formatted text.
df = pl.DataFrame(
    {
        "date": pl.date_range(date(2022, 1, 1), date(2022, 1, 5), eager=True),
        "string": [
            "2022-01-01",
            "2022-01-02",
            "2022-01-03",
            "2022-01-04",
            "2022-01-05",
        ],
    }
)

# strftime formats the dates as strings; strptime parses the strings into
# Datetime values using the given format.
out = df.select(
    pl.col("date").dt.strftime("%Y-%m-%d"),
    pl.col("string").str.strptime(pl.Datetime, "%Y-%m-%d"),
)
print(out)
// A temporal column built with `date_range` and a column holding the same
// days as ISO-formatted text.
let df = df!(
    "date" => date_range(
        "date",
        NaiveDate::from_ymd_opt(2022, 1, 1).unwrap().and_hms_opt(0, 0, 0).unwrap(),
        NaiveDate::from_ymd_opt(2022, 1, 5).unwrap().and_hms_opt(0, 0, 0).unwrap(),
        Duration::parse("1d"),
        ClosedWindow::Both,
        TimeUnit::Milliseconds,
        None,
    )?,
    "string" => &[
        "2022-01-01",
        "2022-01-02",
        "2022-01-03",
        "2022-01-04",
        "2022-01-05",
    ],
)?;

// strftime formats the temporal column as strings; strptime parses the
// strings into microsecond-precision Datetime values with default options.
let out = df
    .clone()
    .lazy()
    .select([
        col("date").dt().strftime("%Y-%m-%d"),
        col("string").str().strptime(
            DataType::Datetime(TimeUnit::Microseconds, None),
            StrptimeOptions::default(),
        ),
    ])
    .collect()?;
println!("{}", &out);
shape: (5, 2)
┌────────────┬─────────────────────┐
│ date ┆ string │
│ --- ┆ --- │
│ str ┆ datetime[μs] │
╞════════════╪═════════════════════╡
│ 2022-01-01 ┆ 2022-01-01 00:00:00 │
│ 2022-01-02 ┆ 2022-01-02 00:00:00 │
│ 2022-01-03 ┆ 2022-01-03 00:00:00 │
│ 2022-01-04 ┆ 2022-01-04 00:00:00 │
│ 2022-01-05 ┆ 2022-01-05 00:00:00 │
└────────────┴─────────────────────┘