Ibis: an overview

Cody Peterson

what

Ibis is a Python library for:

  • exploratory data analysis (EDA)
  • analytics
  • data engineering
  • machine learning
  • building your own library (e.g. Google BigFrames)

development to production with the same API

Getting started…

import ibis

ibis.options.interactive = True
t = ibis.read_parquet("penguins.parquet")
t.head(3)
┏━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━┓
┃ species  island     bill_length_mm  bill_depth_mm  flipper_length_mm  body_mass_g  sex     year  ┃
┡━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━┩
│ stringstringfloat64float64int64int64stringint64 │
├─────────┼───────────┼────────────────┼───────────────┼───────────────────┼─────────────┼────────┼───────┤
│ Adelie Torgersen39.118.71813750male  2007 │
│ Adelie Torgersen39.517.41863800female2007 │
│ Adelie Torgersen40.318.01953250female2007 │
└─────────┴───────────┴────────────────┴───────────────┴───────────────────┴─────────────┴────────┴───────┘

One API, 20+ backends

con = ibis.connect("duckdb://")
t = con.read_parquet("penguins.parquet")
t.head(3)
┏━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━┓
┃ species  island     bill_length_mm  bill_depth_mm  flipper_length_mm  body_mass_g  sex     year  ┃
┡━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━┩
│ stringstringfloat64float64int64int64stringint64 │
├─────────┼───────────┼────────────────┼───────────────┼───────────────────┼─────────────┼────────┼───────┤
│ Adelie Torgersen39.118.71813750male  2007 │
│ Adelie Torgersen39.517.41863800female2007 │
│ Adelie Torgersen40.318.01953250female2007 │
└─────────┴───────────┴────────────────┴───────────────┴───────────────────┴─────────────┴────────┴───────┘
t.group_by("species", "island").agg(count=t.count()).order_by("count")
┏━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━┓
┃ species    island     count ┃
┡━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━┩
│ stringstringint64 │
├───────────┼───────────┼───────┤
│ Adelie   Biscoe   44 │
│ Adelie   Torgersen52 │
│ Adelie   Dream    56 │
│ ChinstrapDream    68 │
│ Gentoo   Biscoe   124 │
└───────────┴───────────┴───────┘
con = ibis.connect("polars://")
t = con.read_parquet("penguins.parquet")
t.head(3)
┏━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━┓
┃ species  island     bill_length_mm  bill_depth_mm  flipper_length_mm  body_mass_g  sex     year  ┃
┡━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━┩
│ stringstringfloat64float64int64int64stringint64 │
├─────────┼───────────┼────────────────┼───────────────┼───────────────────┼─────────────┼────────┼───────┤
│ Adelie Torgersen39.118.71813750male  2007 │
│ Adelie Torgersen39.517.41863800female2007 │
│ Adelie Torgersen40.318.01953250female2007 │
└─────────┴───────────┴────────────────┴───────────────┴───────────────────┴─────────────┴────────┴───────┘
t.group_by("species", "island").agg(count=t.count()).order_by("count")
┏━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━┓
┃ species    island     count ┃
┡━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━┩
│ stringstringint64 │
├───────────┼───────────┼───────┤
│ Adelie   Biscoe   44 │
│ Adelie   Torgersen52 │
│ Adelie   Dream    56 │
│ ChinstrapDream    68 │
│ Gentoo   Biscoe   124 │
└───────────┴───────────┴───────┘
con = ibis.connect("datafusion://")
t = con.read_parquet("penguins.parquet")
t.head(3)
┏━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━┓
┃ species  island     bill_length_mm  bill_depth_mm  flipper_length_mm  body_mass_g  sex     year  ┃
┡━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━┩
│ stringstringfloat64float64int64int64stringint64 │
├─────────┼───────────┼────────────────┼───────────────┼───────────────────┼─────────────┼────────┼───────┤
│ Adelie Torgersen39.118.71813750male  2007 │
│ Adelie Torgersen39.517.41863800female2007 │
│ Adelie Torgersen40.318.01953250female2007 │
└─────────┴───────────┴────────────────┴───────────────┴───────────────────┴─────────────┴────────┴───────┘
t.group_by("species", "island").agg(count=t.count()).order_by("count")
┏━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━┓
┃ species    island     count ┃
┡━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━┩
│ stringstringint64 │
├───────────┼───────────┼───────┤
│ Adelie   Biscoe   44 │
│ Adelie   Torgersen52 │
│ Adelie   Dream    56 │
│ ChinstrapDream    68 │
│ Gentoo   Biscoe   124 │
└───────────┴───────────┴───────┘
con = ibis.connect("pyspark://")
t = con.read_parquet("penguins.parquet")
t.head(3)
┏━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━┓
┃ species  island     bill_length_mm  bill_depth_mm  flipper_length_mm  body_mass_g  sex     year  ┃
┡━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━┩
│ stringstringfloat64float64int64int64stringint64 │
├─────────┼───────────┼────────────────┼───────────────┼───────────────────┼─────────────┼────────┼───────┤
│ Adelie Torgersen39.118.71813750male  2007 │
│ Adelie Torgersen39.517.41863800female2007 │
│ Adelie Torgersen40.318.01953250female2007 │
└─────────┴───────────┴────────────────┴───────────────┴───────────────────┴─────────────┴────────┴───────┘
t.group_by("species", "island").agg(count=t.count()).order_by("count")
┏━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━┓
┃ species    island     count ┃
┡━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━┩
│ stringstringint64 │
├───────────┼───────────┼───────┤
│ Adelie   Biscoe   44 │
│ Adelie   Torgersen52 │
│ Adelie   Dream    56 │
│ ChinstrapDream    68 │
│ Gentoo   Biscoe   124 │
└───────────┴───────────┴───────┘

how it works

Ibis compiles down to SQL or dataframe code:

SQL + Python: better together

For SQL backends, inspect the SQL generated by Ibis:

g = t.group_by("species", "island").agg(count=t.count()).order_by(ibis.desc("count"))
ibis.to_sql(g)
SELECT
  *
FROM (
  SELECT
    "t0"."species",
    "t0"."island",
    COUNT(*) AS "count"
  FROM "penguins" AS "t0"
  GROUP BY
    1,
    2
) AS "t1"
ORDER BY
  "t1"."count" DESC

SQL + Python: better together

For SQL backends, mix SQL and Python:

sql = """
SELECT
  species,
  island,
  count(*) as count
FROM penguins
GROUP BY 1, 2
"""
t.sql(sql)
┏━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━┓
┃ species    island     count ┃
┡━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━┩
│ stringstringint64 │
├───────────┼───────────┼───────┤
│ Adelie   Dream    56 │
│ Gentoo   Biscoe   124 │
│ ChinstrapDream    68 │
│ Adelie   Torgersen52 │
│ Adelie   Biscoe   44 │
└───────────┴───────────┴───────┘

SQL + Python: better together

For SQL backends, mix SQL and Python:

sql = """
SELECT
  species,
  island,
  count(*) as count
FROM penguins
GROUP BY 1, 2
"""
t.sql(sql).order_by(ibis.desc("count"))
┏━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━┓
┃ species    island     count ┃
┡━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━┩
│ stringstringint64 │
├───────────┼───────────┼───────┤
│ Gentoo   Biscoe   124 │
│ ChinstrapDream    68 │
│ Adelie   Dream    56 │
│ Adelie   Torgersen52 │
│ Adelie   Biscoe   44 │
└───────────┴───────────┴───────┘

demo

ibis-analytics

Analyzing 10M+ rows from 4+ data sources.

why

dataframe lore

Dataframes first appeared in the S programming language, then evolved into the R programming language.

Then pandas perfected the dataframe in Python…or did it?

Since, dozens of Python dataframes libraries have come and gone…

The pandas API remains the de facto standard for dataframes in Python (alongside PySpark), but it doesn’t scale.

This leads to data scientists frequently “throwing their work over the wall” to data engineers and ML engineers.

But what if there were a new standard?

Ibis origins

…in 2015, I started the Ibis project…to create a pandas-friendly deferred expression system for static analysis and compilation [of] these types of [query planned, multicore execution] operations. Since an efficient multithreaded in-memory engine for pandas was not available when I started Ibis, I instead focused on building compilers for SQL engines (Impala, PostgreSQL, SQLite), similar to the R dplyr package. Phillip Cloud from the pandas core team has been actively working on Ibis with me for quite a long time.

two world problem

SQL:

Python:

two world problem

SQL:

  • databases & tables

Python:

  • files & dataframes

two world problem

SQL:

  • databases & tables
  • analytics

Python:

  • files & dataframes
  • data science

two world problem

SQL:

  • databases & tables
  • analytics
  • metrics

Python:

  • files & dataframes
  • data science
  • statistics

two world problem

SQL:

  • databases & tables
  • analytics
  • metrics
  • dashboards

Python:

  • files & dataframes
  • data science
  • statistics
  • notebooks

two world problem

Python:

  • files & dataframes
  • data science
  • statistics
  • notebooks

SQL:

  • databases & tables
  • analytics
  • metrics
  • dashboards

Ibis bridges the gap.

dataframe history

  • pandas (2008): dataframes in Python
  • Spark (2009): distributed dataframes with PySpark
  • Dask (2014): distributed pandas dataframes with Python
  • dplyr (2014): dataframes in R with SQL-like syntax
  • Ibis (2015): dataframes in Python with SQL-like syntax
  • cuDF (2017): pandas on GPUs
  • Modin (2018): pandas on Ray/Dask
  • Koalas (2019): pandas on Spark
  • Polars (2020): multicore dataframes in Python via Rust
  • Ibis (2022): Ibis invested in heavily by Voltron Data
  • Snowpark Python (2022): PySpark-like dataframes on Snowflake
  • BigQuery DataFrames (2023): pandas on Google BigQuery (via Ibis!)

dataframe history (aside)

We see three approaches:

pandas clones:

  • Modin
  • pandas on Spark
    • formerly known as Koalas
  • cuDF
  • Dask (sort of)
  • BigQuery DataFrames

PySpark clones:

  • Snowpark Python (sort of)
  • DuckDB Spark API
  • SQLGlot Spark API

something else:

  • Ibis
  • Polars

database history

  • they got faster

Ibis brings the best of databases to dataframes.

DuckDB

import ibis

con = ibis.duckdb.connect()
penguins = con.table("penguins")
penguins.group_by(["species", "island"]).agg(penguins.count().name("count"))

An embeddable, zero-dependency, C++ SQL database engine.

DataFusion

import ibis

con = ibis.datafusion.connect()
penguins = con.table("penguins")
penguins.group_by(["species", "island"]).agg(penguins.count().name("count"))

A Rust SQL query engine.

ClickHouse

import ibis

con = ibis.clickhouse.connect()
penguins = con.table("penguins")
penguins.group_by(["species", "island"]).agg(penguins.count().name("count"))

A C++ column-oriented database management system.

Polars

import ibis

con = ibis.polars.connect()
penguins = con.table("penguins")
penguins.group_by(["species", "island"]).agg(penguins.count().name("count"))

A Rust DataFrame library.

BigQuery

import ibis

con = ibis.bigquery.connect()
penguins = con.table("penguins")
penguins.group_by(["species", "island"]).agg(penguins.count().name("count"))

A serverless, highly scalable, and cost-effective cloud data warehouse.

Snowflake

import ibis

con = ibis.snowflake.connect()
penguins = con.table("penguins")
penguins.group_by(["species", "island"]).agg(penguins.count().name("count"))

A cloud data platform.

Oracle

import ibis

con = ibis.oracle.connect()
penguins = con.table("penguins")
penguins.group_by(["species", "island"]).agg(penguins.count().name("count"))

A relational database management system.

Spark

import ibis

con = ibis.pyspark.connect(session)
penguins = con.table("penguins")
penguins.group_by(["species", "island"]).agg(penguins.count().name("count"))

A unified analytics engine for large-scale data processing.

Trino

import ibis

con = ibis.trino.connect()
penguins = con.table("penguins")
penguins.group_by(["species", "island"]).agg(penguins.count().name("count"))

A distributed SQL query engine.

and more!

  • SQLite
  • PostgreSQL
  • MySQL
  • MSSQL
  • Druid
  • pandas
  • Impala
  • Dask

New backends are easy to add!*

*usually

how

try it out now

Install:

pip install 'ibis-framework[duckdb,examples]'

Then run:

import ibis

ibis.options.interactive = True

t = ibis.examples.penguins.fetch()
t
┏━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━┓
┃ species  island     bill_length_mm  bill_depth_mm  flipper_length_mm  body_mass_g  sex     year  ┃
┡━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━┩
│ stringstringfloat64float64int64int64stringint64 │
├─────────┼───────────┼────────────────┼───────────────┼───────────────────┼─────────────┼────────┼───────┤
│ Adelie Torgersen39.118.71813750male  2007 │
│ Adelie Torgersen39.517.41863800female2007 │
│ Adelie Torgersen40.318.01953250female2007 │
│ Adelie TorgersenNULLNULLNULLNULLNULL2007 │
│ Adelie Torgersen36.719.31933450female2007 │
│ Adelie Torgersen39.320.61903650male  2007 │
│ Adelie Torgersen38.917.81813625female2007 │
│ Adelie Torgersen39.219.61954675male  2007 │
│ Adelie Torgersen34.118.11933475NULL2007 │
│ Adelie Torgersen42.020.21904250NULL2007 │
│  │
└─────────┴───────────┴────────────────┴───────────────┴───────────────────┴─────────────┴────────┴───────┘

questions?

the end