import ibis
import ibis.selectors as s
= True
ibis.options.interactive repr.interactive.max_rows = 3 ibis.options.
Overview
Ibis 6.0.0 adds the Oracle backend, revamped UDF support, and many new features. This release also includes a number of refactors, bug fixes, and performance improvements. You can view the full changelog in the release notes.
If you’re new to Ibis, see how to install and the getting started tutorial.
To follow along with this blog, ensure you’re on 'ibis-framework>=6,<7'
. First, we’ll setup Ibis and fetch some sample data to use.
Now, fetch the penguins dataset.
= ibis.examples.penguins.fetch()
t t
┏━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━┓ ┃ species ┃ island ┃ bill_length_mm ┃ bill_depth_mm ┃ flipper_length_mm ┃ body_mass_g ┃ sex ┃ year ┃ ┡━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━┩ │ string │ string │ float64 │ float64 │ int64 │ int64 │ string │ int64 │ ├─────────┼───────────┼────────────────┼───────────────┼───────────────────┼─────────────┼────────┼───────┤ │ Adelie │ Torgersen │ 39.1 │ 18.7 │ 181 │ 3750 │ male │ 2007 │ │ Adelie │ Torgersen │ 39.5 │ 17.4 │ 186 │ 3800 │ female │ 2007 │ │ Adelie │ Torgersen │ 40.3 │ 18.0 │ 195 │ 3250 │ female │ 2007 │ │ … │ … │ … │ … │ … │ … │ … │ … │ └─────────┴───────────┴────────────────┴───────────────┴───────────────────┴─────────────┴────────┴───────┘
Breaking changes
Join duplicate column names
Previously when joining tables with duplicate column names, _x
and _y
suffixes would be appended by default to the left and right tables respectively. You could override this with the suffix
argument, which is now removed in favor of lname
and rname
arguments. The default is changed to no suffix for the left table and _right
for the right table.
"island").select(s.startswith("species")) t.join(t,
┏━━━━━━━━━┳━━━━━━━━━━━━━━━┓ ┃ species ┃ species_right ┃ ┡━━━━━━━━━╇━━━━━━━━━━━━━━━┩ │ string │ string │ ├─────────┼───────────────┤ │ Adelie │ Adelie │ │ Adelie │ Adelie │ │ Adelie │ Adelie │ │ … │ … │ └─────────┴───────────────┘
To replicate the previous behavior:
"island", lname="{name}_x", rname="{name}_y").select(
t.join(t, "species")
s.startswith( )
┏━━━━━━━━━━━┳━━━━━━━━━━━┓ ┃ species_x ┃ species_y ┃ ┡━━━━━━━━━━━╇━━━━━━━━━━━┩ │ string │ string │ ├───────────┼───────────┤ │ Adelie │ Adelie │ │ Adelie │ Adelie │ │ Adelie │ Adelie │ │ … │ … │ └───────────┴───────────┘
.count()
column names no longer named count
automatically
Columns created with the .count()
aggregation are no longer automatically named count
. This is to follow convention with other aggregations and reduce the likelihood of name collisions.
"species").agg(ibis._.species.count()) t.group_by(
┏━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┓ ┃ species ┃ Count(species) ┃ ┡━━━━━━━━━━━╇━━━━━━━━━━━━━━━━┩ │ string │ int64 │ ├───────────┼────────────────┤ │ Adelie │ 152 │ │ Chinstrap │ 68 │ │ Gentoo │ 124 │ └───────────┴────────────────┘
To reproduce the old behavior, you can rename the column to count
with:
"species").agg(count=ibis._.species.count()) t.group_by(
Backends
Oracle
The Oracle backend was added! See the Voltron Data blog for more details.
connect(f"oracle://user:password@host") ibis.
DuckDB
There were various DuckDB improvements, but one notable new feature is the ability to attach to a SQLite database through DuckDB. This allows you to run OLAP queries via DuckDB significantly faster on source data from SQLite.
First we’ll create a DuckDB connection and show it has no tables:
= ibis.connect("duckdb://")
duckdb_con duckdb_con.list_tables()
[]
Then create a SQLite database with a table:
= ibis.connect("sqlite://penguins.db")
sqlite_con "penguins", t.to_pandas(), overwrite=True) sqlite_con.create_table(
┏━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━┓ ┃ species ┃ island ┃ bill_length_mm ┃ bill_depth_mm ┃ flipper_length_mm ┃ body_mass_g ┃ sex ┃ year ┃ ┡━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━┩ │ string │ string │ float64 │ float64 │ float64 │ float64 │ string │ int64 │ ├─────────┼───────────┼────────────────┼───────────────┼───────────────────┼─────────────┼────────┼───────┤ │ Adelie │ Torgersen │ 39.1 │ 18.7 │ 181.0 │ 3750.0 │ male │ 2007 │ │ Adelie │ Torgersen │ 39.5 │ 17.4 │ 186.0 │ 3800.0 │ female │ 2007 │ │ Adelie │ Torgersen │ 40.3 │ 18.0 │ 195.0 │ 3250.0 │ female │ 2007 │ │ … │ … │ … │ … │ … │ … │ … │ … │ └─────────┴───────────┴────────────────┴───────────────┴───────────────────┴─────────────┴────────┴───────┘
And attach it:
"./penguins.db")
duckdb_con.attach_sqlite( duckdb_con.list_tables()
['penguins']
MotherDuck support!
MotherDuck launched recently and is now supported in Ibis!
Simply connect with the DuckDB backend using md:
or motherduck:
as the database.
connect("duckdb://md:") ibis.
Polars
The Polars backend received many improvements from community members @alexander-beedie and @mesejo, with plenty of operations now supported.
Some additions in this version include:
any
andall
reductionsargmin
andargmax
identical_to
corr
- support for
.sql()
Give it a try by setting your backend to Polars with ibis.set_backend("polars")
.
Functionality
UDFs
User-defined functions (UDFs) have been revamped with a new syntax and new backends added. To get started, import the decorator:
from ibis import udf
Define a UDF:
@udf.scalar.python
def num_vowels(s: str, include_y: bool = False) -> int:
return sum(map(s.lower().count, "aeiou" + ("y" * include_y)))
And call it:
1].species.execute()[0]) num_vowels(t[:
4
=num_vowels(t.species)).agg(
t.group_by(num_vowels=t.island.count()
num_vowels_island_count )
┏━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┓ ┃ num_vowels ┃ num_vowels_island_count ┃ ┡━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━┩ │ int64 │ int64 │ ├────────────┼─────────────────────────┤ │ 2 │ 68 │ │ 4 │ 152 │ │ 3 │ 124 │ └────────────┴─────────────────────────┘
filter(num_vowels(t.species) < 4) t.
┏━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━┓ ┃ species ┃ island ┃ bill_length_mm ┃ bill_depth_mm ┃ flipper_length_mm ┃ body_mass_g ┃ sex ┃ year ┃ ┡━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━┩ │ string │ string │ float64 │ float64 │ int64 │ int64 │ string │ int64 │ ├─────────┼────────┼────────────────┼───────────────┼───────────────────┼─────────────┼────────┼───────┤ │ Gentoo │ Biscoe │ 46.1 │ 13.2 │ 211 │ 4500 │ female │ 2007 │ │ Gentoo │ Biscoe │ 50.0 │ 16.3 │ 230 │ 5700 │ male │ 2007 │ │ Gentoo │ Biscoe │ 48.7 │ 14.1 │ 210 │ 4450 │ female │ 2007 │ │ … │ … │ … │ … │ … │ … │ … │ … │ └─────────┴────────┴────────────────┴───────────────┴───────────────────┴─────────────┴────────┴───────┘
to_torch
API
A new to_torch
output method was added. Combined with UDFs, this brings powerful ML capabilities into Ibis. See a complete example in the Ibis + DuckDB + PyTorch blog.
import torch
=10) torch.set_printoptions(threshold
t.select(s.numeric()).to_torch()
{'bill_length_mm': tensor([39.1000, 39.5000, 40.3000, ..., 49.6000, 50.8000, 50.2000],
dtype=torch.float64),
'bill_depth_mm': tensor([18.7000, 17.4000, 18.0000, ..., 18.2000, 19.0000, 18.7000],
dtype=torch.float64),
'flipper_length_mm': tensor([181, 186, 195, ..., 193, 210, 198]),
'body_mass_g': tensor([3750, 3800, 3250, ..., 3775, 4100, 3775]),
'year': tensor([2007, 2007, 2007, ..., 2009, 2009, 2009])}
Array zip support
A new zip operation was added on array data types, allowing you to zip together multiple arrays.
= ibis.memtable(
arrays "numbers": [[3, 2], [], None], "strings": [["a", "c"], None, ["e"]]}
{
) arrays
┏━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓ ┃ numbers ┃ strings ┃ ┡━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩ │ array<int64> │ array<string> │ ├──────────────┼───────────────┤ │ [3, 2] │ ['a', 'c'] │ │ [] │ NULL │ │ NULL │ ['e'] │ └──────────────┴───────────────┘
zip(arrays.strings) arrays.numbers.
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ ┃ ArrayZip() ┃ ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩ │ array<struct<f1: int64, f2: string>> │ ├──────────────────────────────────────┤ │ [{...}, {...}] │ │ NULL │ │ NULL │ └──────────────────────────────────────┘
zip(arrays.strings).unnest() arrays.numbers.
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ ┃ ArrayZip() ┃ ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩ │ struct<f1: int64, f2: string> │ ├───────────────────────────────┤ │ {'f1': 3, 'f2': 'a'} │ │ {'f1': 2, 'f2': 'c'} │ └───────────────────────────────┘
Try cast support
A new try_cast()
operation was added that allows you to cast a column to a type, but return null if the cast fails.
"a").try_cast("int") ibis.literal(
None
0).try_cast("float") ibis.literal(
__dataframe__
support
Ibis now supports the dataframe interchange protocol, allowing Ibis expressions to be used in any framework that supports it. Adoption of the protocol is still in its early stages, but we expect this to enable Ibis to be used in many new places going forward.
t.__dataframe__()
<ibis.expr.types.dataframe_interchange.IbisDataFrame at 0x7ff54d1ad450>
Streamlit experimental connection interface
A new experimental connection interface was added for Streamlit. See how-to write a Streamlit app with Ibis.
SQL dialect parameter
In SQL methods, you can now pass the dialect
parameter to specify the SQL dialect used. This leverages sqlglot
under the hood.
= """
bigquery_sql SELECT
t0.`species`,
COUNT(t0.`species`) AS `count`,
CAST(COUNT(DISTINCT t0.`island`) AS FLOAT64) AS `islands`
FROM penguins AS t0
GROUP BY
1
"""
="bigquery") duckdb_con.sql(bigquery_sql, dialect
┏━━━━━━━━━━━┳━━━━━━━┳━━━━━━━━━┓ ┃ species ┃ count ┃ islands ┃ ┡━━━━━━━━━━━╇━━━━━━━╇━━━━━━━━━┩ │ string │ int64 │ float64 │ ├───────────┼───────┼─────────┤ │ Chinstrap │ 68 │ 1.0 │ │ Adelie │ 152 │ 3.0 │ │ Gentoo │ 124 │ 1.0 │ └───────────┴───────┴─────────┘
Delta Lake read/write support for some backends
Delta Lake tables are supported through the deltalake
package with read_delta()
implemented for DuckDB, Polars, and DataFusion.
"penguins.delta", mode="overwrite") t.to_delta(
= ibis.read_delta("penguins.delta")
t t
┏━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━┓ ┃ species ┃ island ┃ bill_length_mm ┃ bill_depth_mm ┃ flipper_length_mm ┃ body_mass_g ┃ sex ┃ year ┃ ┡━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━┩ │ string │ string │ float64 │ float64 │ int64 │ int64 │ string │ int64 │ ├─────────┼───────────┼────────────────┼───────────────┼───────────────────┼─────────────┼────────┼───────┤ │ Adelie │ Torgersen │ 39.1 │ 18.7 │ 181 │ 3750 │ male │ 2007 │ │ Adelie │ Torgersen │ 39.5 │ 17.4 │ 186 │ 3800 │ female │ 2007 │ │ Adelie │ Torgersen │ 40.3 │ 18.0 │ 195 │ 3250 │ female │ 2007 │ │ … │ … │ … │ … │ … │ … │ … │ … │ └─────────┴───────────┴────────────────┴───────────────┴───────────────────┴─────────────┴────────┴───────┘
Selectors
Some minor selectors improvements were added including the ability to use abstract type names and lists of strings.
"string")) t.select(s.of_type(
┏━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━┓ ┃ species ┃ island ┃ sex ┃ ┡━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━┩ │ string │ string │ string │ ├─────────┼───────────┼────────┤ │ Adelie │ Torgersen │ male │ │ Adelie │ Torgersen │ female │ │ Adelie │ Torgersen │ female │ │ … │ … │ … │ └─────────┴───────────┴────────┘
"species", "island"], ibis._.count())) t.agg(s.across([
┏━━━━━━━━━┳━━━━━━━━┓ ┃ species ┃ island ┃ ┡━━━━━━━━━╇━━━━━━━━┩ │ int64 │ int64 │ ├─────────┼────────┤ │ 344 │ 344 │ └─────────┴────────┘
Refactors
Several internal refactors that shouldn’t affect normal usage were made. See the release notes for more details.
Wrapping up
Ibis v6.0.0 brings exciting new features that enable future support for ML and streaming workloads.
As always, try Ibis by installing and getting started.
If you run into any issues or find support is lacking for your backend, open an issue or discussion and let us know!