Analyze IMDB data using Ibis

Let’s use the Ibis examples module and the DuckDB backend to find some movies to watch.

Adapted from Phillip in the Cloud’s livestream using the same data.

Imports

For this example, we’ll just use Ibis.

from ibis.interactive import *
1
This import imports ibis.examples as ex.

Fetch the example data

We can use the ibis.examples module to fetch the IMDB data. Ibis automatically caches the data on disk so subsequent runs don’t require fetching from cloud storage on each call to fetch.

name_basics = ex.imdb_name_basics.fetch()
name_basics
┏━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
┃ nconst     primaryName      birthYear  deathYear  primaryProfession                    knownForTitles                          ┃
┡━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
│ stringstringint64int64stringstring                                  │
├───────────┼─────────────────┼───────────┼───────────┼─────────────────────────────────────┼─────────────────────────────────────────┤
│ nm0000001Fred Astaire   18991987soundtrack,actor,miscellaneous     tt0053137,tt0072308,tt0045537,tt0050419 │
│ nm0000002Lauren Bacall  19242014actress,soundtrack                 tt0037382,tt0117057,tt0075213,tt0038355 │
│ nm0000003Brigitte Bardot1934NULLactress,soundtrack,music_departmenttt0057345,tt0054452,tt0049189,tt0056404 │
│ nm0000004John Belushi   19491982actor,soundtrack,writer            tt0072562,tt0078723,tt0077975,tt0080455 │
│ nm0000005Ingmar Bergman 19182007writer,director,actor              tt0083922,tt0069467,tt0050976,tt0050986 │
│ nm0000006Ingrid Bergman 19151982actress,soundtrack,producer        tt0038109,tt0036855,tt0034583,tt0038787 │
│ nm0000007Humphrey Bogart18991957actor,soundtrack,producer          tt0037382,tt0034583,tt0042593,tt0043265 │
│ nm0000008Marlon Brando  19242004actor,soundtrack,director          tt0068646,tt0070849,tt0078788,tt0047296 │
│ nm0000009Richard Burton 19251984actor,soundtrack,producer          tt0057877,tt0059749,tt0061184,tt0087803 │
│ nm0000010James Cagney   18991986actor,soundtrack,director          tt0042041,tt0035575,tt0029870,tt0031867 │
│                                        │
└───────────┴─────────────────┴───────────┴───────────┴─────────────────────────────────────┴─────────────────────────────────────────┘

To ensure column names are Pythonic, we can rename as snake_case.

name_basics.rename("snake_case")
┏━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
┃ nconst     primary_name     birth_year  death_year  primary_profession                   known_for_titles                        ┃
┡━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
│ stringstringint64int64stringstring                                  │
├───────────┼─────────────────┼────────────┼────────────┼─────────────────────────────────────┼─────────────────────────────────────────┤
│ nm0000001Fred Astaire   18991987soundtrack,actor,miscellaneous     tt0053137,tt0072308,tt0045537,tt0050419 │
│ nm0000002Lauren Bacall  19242014actress,soundtrack                 tt0037382,tt0117057,tt0075213,tt0038355 │
│ nm0000003Brigitte Bardot1934NULLactress,soundtrack,music_departmenttt0057345,tt0054452,tt0049189,tt0056404 │
│ nm0000004John Belushi   19491982actor,soundtrack,writer            tt0072562,tt0078723,tt0077975,tt0080455 │
│ nm0000005Ingmar Bergman 19182007writer,director,actor              tt0083922,tt0069467,tt0050976,tt0050986 │
│ nm0000006Ingrid Bergman 19151982actress,soundtrack,producer        tt0038109,tt0036855,tt0034583,tt0038787 │
│ nm0000007Humphrey Bogart18991957actor,soundtrack,producer          tt0037382,tt0034583,tt0042593,tt0043265 │
│ nm0000008Marlon Brando  19242004actor,soundtrack,director          tt0068646,tt0070849,tt0078788,tt0047296 │
│ nm0000009Richard Burton 19251984actor,soundtrack,producer          tt0057877,tt0059749,tt0061184,tt0087803 │
│ nm0000010James Cagney   18991986actor,soundtrack,director          tt0042041,tt0035575,tt0029870,tt0031867 │
│                                        │
└───────────┴─────────────────┴────────────┴────────────┴─────────────────────────────────────┴─────────────────────────────────────────┘

Let’s grab all of the relevant IMDB tables and rename columns.

name_basics = ex.imdb_name_basics.fetch().rename("snake_case")
title_akas = ex.imdb_title_akas.fetch().rename("snake_case")
title_basics = ex.imdb_title_basics.fetch().rename("snake_case")
title_crew = ex.imdb_title_crew.fetch().rename("snake_case")
title_episode = ex.imdb_title_episode.fetch().rename("snake_case")
title_principals = ex.imdb_title_principals.fetch().rename("snake_case")
title_ratings = ex.imdb_title_ratings.fetch().rename("snake_case")

Preview the data

We’ll print out the first few rows of each table to get an idea of what is contained in each.

name_basics.head()
┏━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
┃ nconst     primary_name     birth_year  death_year  primary_profession                   known_for_titles                        ┃
┡━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
│ stringstringint64int64stringstring                                  │
├───────────┼─────────────────┼────────────┼────────────┼─────────────────────────────────────┼─────────────────────────────────────────┤
│ nm0000001Fred Astaire   18991987soundtrack,actor,miscellaneous     tt0053137,tt0072308,tt0045537,tt0050419 │
│ nm0000002Lauren Bacall  19242014actress,soundtrack                 tt0037382,tt0117057,tt0075213,tt0038355 │
│ nm0000003Brigitte Bardot1934NULLactress,soundtrack,music_departmenttt0057345,tt0054452,tt0049189,tt0056404 │
│ nm0000004John Belushi   19491982actor,soundtrack,writer            tt0072562,tt0078723,tt0077975,tt0080455 │
│ nm0000005Ingmar Bergman 19182007writer,director,actor              tt0083922,tt0069467,tt0050976,tt0050986 │
└───────────┴─────────────────┴────────────┴────────────┴─────────────────────────────────────┴─────────────────────────────────────────┘
title_akas.head()
┏━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┓
┃ title_id   ordering  title                      region  language  types        attributes     is_original_title ┃
┡━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━┩
│ stringint64stringstringstringstringstringint64             │
├───────────┼──────────┼───────────────────────────┼────────┼──────────┼─────────────┼───────────────┼───────────────────┤
│ tt00000011Карменсіта               UA    NULLimdbDisplayNULL0 │
│ tt00000012Carmencita               DE    NULLNULLliteral title0 │
│ tt00000013Carmencita - spanyol táncHU    NULLimdbDisplayNULL0 │
│ tt00000014Καρμενσίτα               GR    NULLimdbDisplayNULL0 │
│ tt00000015Карменсита               RU    NULLimdbDisplayNULL0 │
└───────────┴──────────┴───────────────────────────┴────────┴──────────┴─────────────┴───────────────┴───────────────────┘
title_basics.head()
┏━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┓
┃ tconst     title_type  primary_title           original_title          is_adult  start_year  end_year  runtime_minutes  genres                   ┃
┡━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━┩
│ stringstringstringstringint64int64stringint64string                   │
├───────────┼────────────┼────────────────────────┼────────────────────────┼──────────┼────────────┼──────────┼─────────────────┼──────────────────────────┤
│ tt0000001short     Carmencita            Carmencita            01894NULL1Documentary,Short        │
│ tt0000002short     Le clown et ses chiensLe clown et ses chiens01892NULL5Animation,Short          │
│ tt0000003short     Pauvre Pierrot        Pauvre Pierrot        01892NULL4Animation,Comedy,Romance │
│ tt0000004short     Un bon bock           Un bon bock           01892NULL12Animation,Short          │
│ tt0000005short     Blacksmith Scene      Blacksmith Scene      01893NULL1Comedy,Short             │
└───────────┴────────────┴────────────────────────┴────────────────────────┴──────────┴────────────┴──────────┴─────────────────┴──────────────────────────┘
title_crew.head()
┏━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━┓
┃ tconst     directors  writers ┃
┡━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━┩
│ stringstringstring  │
├───────────┼───────────┼─────────┤
│ tt0000001nm0005690NULL    │
│ tt0000002nm0721526NULL    │
│ tt0000003nm0721526NULL    │
│ tt0000004nm0721526NULL    │
│ tt0000005nm0005690NULL    │
└───────────┴───────────┴─────────┘
title_episode.head()
┏━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┓
┃ tconst     parent_tconst  season_number  episode_number ┃
┡━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━┩
│ stringstringint64int64          │
├───────────┼───────────────┼───────────────┼────────────────┤
│ tt0041951tt0041038    19 │
│ tt0042816tt0989125    117 │
│ tt0042889tt0989125    NULLNULL │
│ tt0043426tt0040051    342 │
│ tt0043631tt0989125    216 │
└───────────┴───────────────┴───────────────┴────────────────┘
title_principals.head()
┏━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┓
┃ tconst     ordering  nconst     category         job                      characters ┃
┡━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━┩
│ stringint64stringstringstringstring     │
├───────────┼──────────┼───────────┼─────────────────┼─────────────────────────┼────────────┤
│ tt00000011nm1588970self           NULL["Self"]   │
│ tt00000012nm0005690director       NULLNULL       │
│ tt00000013nm0374658cinematographerdirector of photographyNULL       │
│ tt00000021nm0721526director       NULLNULL       │
│ tt00000022nm1335271composer       NULLNULL       │
└───────────┴──────────┴───────────┴─────────────────┴─────────────────────────┴────────────┘
title_ratings.head()
┏━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┓
┃ tconst     average_rating  num_votes ┃
┡━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━┩
│ stringfloat64int64     │
├───────────┼────────────────┼───────────┤
│ tt00000015.71990 │
│ tt00000025.8265 │
│ tt00000036.51869 │
│ tt00000045.5177 │
│ tt00000056.22655 │
└───────────┴────────────────┴───────────┘

Basic data exploration

Let’s check how many records are in each table. It’s just Python, so we can construct a dictionary and iterate through it in a for loop.

tables = {
    "name_basics": name_basics,
    "title_akas": title_akas,
    "title_basics": title_basics,
    "title_crew": title_crew,
    "title_episode": title_episode,
    "title_principals": title_principals,
    "title_ratings": title_ratings,
}
max_name_len = max(map(len, tables.keys())) + 1
print("Length of tables:")
for t in tables:
    print(f"\t{t.ljust(max_name_len)}: {tables[t].count().to_pandas():,}")
Length of tables:
    name_basics      : 12,820,800
    title_akas       : 37,135,244
    title_basics     : 10,133,372
    title_crew       : 10,133,372
    title_episode    : 7,717,216
    title_principals : 58,023,068
    title_ratings    : 1,344,974

Clean data

Looking at the data, the nconst and tconst columns seem to be unique identifiers. Let’s confirm and adjust them accordingly.

name_basics.head()
┏━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
┃ nconst     primary_name     birth_year  death_year  primary_profession                   known_for_titles                        ┃
┡━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
│ stringstringint64int64stringstring                                  │
├───────────┼─────────────────┼────────────┼────────────┼─────────────────────────────────────┼─────────────────────────────────────────┤
│ nm0000001Fred Astaire   18991987soundtrack,actor,miscellaneous     tt0053137,tt0072308,tt0045537,tt0050419 │
│ nm0000002Lauren Bacall  19242014actress,soundtrack                 tt0037382,tt0117057,tt0075213,tt0038355 │
│ nm0000003Brigitte Bardot1934NULLactress,soundtrack,music_departmenttt0057345,tt0054452,tt0049189,tt0056404 │
│ nm0000004John Belushi   19491982actor,soundtrack,writer            tt0072562,tt0078723,tt0077975,tt0080455 │
│ nm0000005Ingmar Bergman 19182007writer,director,actor              tt0083922,tt0069467,tt0050976,tt0050986 │
└───────────┴─────────────────┴────────────┴────────────┴─────────────────────────────────────┴─────────────────────────────────────────┘

Check the number of unique nconst values.

name_basics.nconst.nunique()

12820800

Confirm it’s equal to the number of rows.

name_basics.nconst.nunique() == name_basics.count()

True

Mutate the table to convert nconst to an integer.

t = name_basics.mutate(nconst=_.nconst.replace("nm", "").cast("int"))
t.head()
┏━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
┃ nconst  primary_name     birth_year  death_year  primary_profession                   known_for_titles                        ┃
┡━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
│ int64stringint64int64stringstring                                  │
├────────┼─────────────────┼────────────┼────────────┼─────────────────────────────────────┼─────────────────────────────────────────┤
│      1Fred Astaire   18991987soundtrack,actor,miscellaneous     tt0053137,tt0072308,tt0045537,tt0050419 │
│      2Lauren Bacall  19242014actress,soundtrack                 tt0037382,tt0117057,tt0075213,tt0038355 │
│      3Brigitte Bardot1934NULLactress,soundtrack,music_departmenttt0057345,tt0054452,tt0049189,tt0056404 │
│      4John Belushi   19491982actor,soundtrack,writer            tt0072562,tt0078723,tt0077975,tt0080455 │
│      5Ingmar Bergman 19182007writer,director,actor              tt0083922,tt0069467,tt0050976,tt0050986 │
└────────┴─────────────────┴────────────┴────────────┴─────────────────────────────────────┴─────────────────────────────────────────┘

Let’s also turn primary_profession into an array of strings instead of a single comma-separated string.

t = t.mutate(primary_profession=_.primary_profession.split(","))
t
┏━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
┃ nconst  primary_name     birth_year  death_year  primary_profession                 known_for_titles                        ┃
┡━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
│ int64stringint64int64array<string>string                                  │
├────────┼─────────────────┼────────────┼────────────┼───────────────────────────────────┼─────────────────────────────────────────┤
│      1Fred Astaire   18991987['soundtrack', 'actor', ... +1]tt0053137,tt0072308,tt0045537,tt0050419 │
│      2Lauren Bacall  19242014['actress', 'soundtrack']tt0037382,tt0117057,tt0075213,tt0038355 │
│      3Brigitte Bardot1934NULL['actress', 'soundtrack', ... +1]tt0057345,tt0054452,tt0049189,tt0056404 │
│      4John Belushi   19491982['actor', 'soundtrack', ... +1]tt0072562,tt0078723,tt0077975,tt0080455 │
│      5Ingmar Bergman 19182007['writer', 'director', ... +1]tt0083922,tt0069467,tt0050976,tt0050986 │
│      6Ingrid Bergman 19151982['actress', 'soundtrack', ... +1]tt0038109,tt0036855,tt0034583,tt0038787 │
│      7Humphrey Bogart18991957['actor', 'soundtrack', ... +1]tt0037382,tt0034583,tt0042593,tt0043265 │
│      8Marlon Brando  19242004['actor', 'soundtrack', ... +1]tt0068646,tt0070849,tt0078788,tt0047296 │
│      9Richard Burton 19251984['actor', 'soundtrack', ... +1]tt0057877,tt0059749,tt0061184,tt0087803 │
│     10James Cagney   18991986['actor', 'soundtrack', ... +1]tt0042041,tt0035575,tt0029870,tt0031867 │
│                                             │
└────────┴─────────────────┴────────────┴────────────┴───────────────────────────────────┴─────────────────────────────────────────┘

And, combining the two concepts, convert known_for_titles into an array of integers corresponding to tconst identifiers.

t = t.mutate(
    known_for_titles=_.known_for_titles.split(",").map(
        lambda tconst: tconst.replace("tt", "").cast("int")
    )
)
t
┏━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┓
┃ nconst  primary_name     birth_year  death_year  primary_profession                 known_for_titles        ┃
┡━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━┩
│ int64stringint64int64array<string>array<int64>            │
├────────┼─────────────────┼────────────┼────────────┼───────────────────────────────────┼─────────────────────────┤
│      1Fred Astaire   18991987['soundtrack', 'actor', ... +1][53137, 72308, ... +2]  │
│      2Lauren Bacall  19242014['actress', 'soundtrack'][37382, 117057, ... +2] │
│      3Brigitte Bardot1934NULL['actress', 'soundtrack', ... +1][57345, 54452, ... +2]  │
│      4John Belushi   19491982['actor', 'soundtrack', ... +1][72562, 78723, ... +2]  │
│      5Ingmar Bergman 19182007['writer', 'director', ... +1][83922, 69467, ... +2]  │
│      6Ingrid Bergman 19151982['actress', 'soundtrack', ... +1][38109, 36855, ... +2]  │
│      7Humphrey Bogart18991957['actor', 'soundtrack', ... +1][37382, 34583, ... +2]  │
│      8Marlon Brando  19242004['actor', 'soundtrack', ... +1][68646, 70849, ... +2]  │
│      9Richard Burton 19251984['actor', 'soundtrack', ... +1][57877, 59749, ... +2]  │
│     10James Cagney   18991986['actor', 'soundtrack', ... +1][42041, 35575, ... +2]  │
│                             │
└────────┴─────────────────┴────────────┴────────────┴───────────────────────────────────┴─────────────────────────┘

DRY-ing up the code

We can define functions to convert nconst and tconst to integers.

def nconst_to_int(nconst):
    return nconst.replace("nm", "").cast("int")


def tconst_to_int(tconst):
    return tconst.replace("tt", "").cast("int")

Then combine the previous data cleansing in a single mutate call.

name_basics = name_basics.mutate(
    nconst=nconst_to_int(_.nconst),
    primary_profession=_.primary_profession.split(","),
    known_for_titles=_.known_for_titles.split(",").map(tconst_to_int),
)
name_basics
┏━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┓
┃ nconst  primary_name     birth_year  death_year  primary_profession                 known_for_titles        ┃
┡━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━┩
│ int64stringint64int64array<string>array<int64>            │
├────────┼─────────────────┼────────────┼────────────┼───────────────────────────────────┼─────────────────────────┤
│      1Fred Astaire   18991987['soundtrack', 'actor', ... +1][53137, 72308, ... +2]  │
│      2Lauren Bacall  19242014['actress', 'soundtrack'][37382, 117057, ... +2] │
│      3Brigitte Bardot1934NULL['actress', 'soundtrack', ... +1][57345, 54452, ... +2]  │
│      4John Belushi   19491982['actor', 'soundtrack', ... +1][72562, 78723, ... +2]  │
│      5Ingmar Bergman 19182007['writer', 'director', ... +1][83922, 69467, ... +2]  │
│      6Ingrid Bergman 19151982['actress', 'soundtrack', ... +1][38109, 36855, ... +2]  │
│      7Humphrey Bogart18991957['actor', 'soundtrack', ... +1][37382, 34583, ... +2]  │
│      8Marlon Brando  19242004['actor', 'soundtrack', ... +1][68646, 70849, ... +2]  │
│      9Richard Burton 19251984['actor', 'soundtrack', ... +1][57877, 59749, ... +2]  │
│     10James Cagney   18991986['actor', 'soundtrack', ... +1][42041, 35575, ... +2]  │
│                             │
└────────┴─────────────────┴────────────┴────────────┴───────────────────────────────────┴─────────────────────────┘

We can use ibis.to_sql to see the SQL this generates.

ibis.to_sql(name_basics)
SELECT
  CAST(REPLACE("t0"."nconst", 'nm', '') AS BIGINT) AS "nconst",
  "t0"."primaryName" AS "primary_name",
  "t0"."birthYear" AS "birth_year",
  "t0"."deathYear" AS "death_year",
  STR_SPLIT("t0"."primaryProfession", ',') AS "primary_profession",
  LIST_APPLY(
    STR_SPLIT("t0"."knownForTitles", ','),
    __ibis_param_tconst__ -> CAST(REPLACE(__ibis_param_tconst__, 'tt', '') AS BIGINT)
  ) AS "known_for_titles"
FROM "imdb_name_basics" AS "t0"

Clean the rest of the tables. We’ll convert nconst and tconst columns consistently to allow for easy joining.

title_akas = title_akas.mutate(title_id=tconst_to_int(_.title_id)).rename(
    tconst="title_id"
)
title_basics = title_basics.mutate(tconst=tconst_to_int(_.tconst))
title_crew = title_crew.mutate(
    tconst=tconst_to_int(_.tconst),
    directors=_.directors.split(",").map(nconst_to_int),
    writers=_.writers.split(",").map(nconst_to_int),
)
title_episode = title_episode.mutate(
    tconst=tconst_to_int(_.tconst), parent_tconst=tconst_to_int(_.parent_tconst)
)
title_principals = title_principals.mutate(
    tconst=tconst_to_int(_.tconst), nconst=nconst_to_int(_.nconst)
)
title_ratings = title_ratings.mutate(tconst=tconst_to_int(_.tconst))

Finding good (and bad) movies to watch

Join the IMDB rankings with information about the movies.

joined = title_basics.join(title_ratings, "tconst")
joined
┏━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┓
┃ tconst  title_type  primary_title                                original_title                               is_adult  start_year  end_year  runtime_minutes  genres                    average_rating  num_votes ┃
┡━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━┩
│ int64stringstringstringint64int64stringint64stringfloat64int64     │
├────────┼────────────┼─────────────────────────────────────────────┼─────────────────────────────────────────────┼──────────┼────────────┼──────────┼─────────────────┼──────────────────────────┼────────────────┼───────────┤
│      1short     Carmencita                                 Carmencita                                 01894NULL1Documentary,Short       5.71990 │
│      2short     Le clown et ses chiens                     Le clown et ses chiens                     01892NULL5Animation,Short         5.8265 │
│      3short     Pauvre Pierrot                             Pauvre Pierrot                             01892NULL4Animation,Comedy,Romance6.51869 │
│      4short     Un bon bock                                Un bon bock                                01892NULL12Animation,Short         5.5177 │
│      5short     Blacksmith Scene                           Blacksmith Scene                           01893NULL1Comedy,Short            6.22655 │
│      6short     Chinese Opium Den                          Chinese Opium Den                          01894NULL1Short                   5.0182 │
│      7short     Corbett and Courtney Before the KinetographCorbett and Courtney Before the Kinetograph01894NULL1Short,Sport             5.4831 │
│      8short     Edison Kinetoscopic Record of a Sneeze     Edison Kinetoscopic Record of a Sneeze     01894NULL1Documentary,Short       5.42132 │
│      9movie     Miss Jerry                                 Miss Jerry                                 01894NULL45Romance                 5.3206 │
│     10short     Leaving the Factory                        La sortie de l'usine Lumière à Lyon        01895NULL1Documentary,Short       6.97268 │
│       │
└────────┴────────────┴─────────────────────────────────────────────┴─────────────────────────────────────────────┴──────────┴────────────┴──────────┴─────────────────┴──────────────────────────┴────────────────┴───────────┘
joined.title_type.value_counts().order_by(_.title_type_count.desc())
┏━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┓
┃ title_type    title_type_count ┃
┡━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━┩
│ stringint64            │
├──────────────┼──────────────────┤
│ tvEpisode   663341 │
│ movie       296311 │
│ short       149980 │
│ tvSeries    89154 │
│ tvMovie     51119 │
│ video       50728 │
│ tvMiniSeries15698 │
│ videoGame   15074 │
│ tvSpecial   11391 │
│ tvShort     2178 │
└──────────────┴──────────────────┘

Filter down to movies.

joined = joined.filter(_.title_type == "movie")
joined
┏━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┓
┃ tconst  title_type  primary_title                   original_title                  is_adult  start_year  end_year  runtime_minutes  genres                      average_rating  num_votes ┃
┡━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━┩
│ int64stringstringstringint64int64stringint64stringfloat64int64     │
├────────┼────────────┼────────────────────────────────┼────────────────────────────────┼──────────┼────────────┼──────────┼─────────────────┼────────────────────────────┼────────────────┼───────────┤
│      9movie     Miss Jerry                    Miss Jerry                    01894NULL45Romance                   5.3206 │
│    147movie     The Corbett-Fitzsimmons Fight The Corbett-Fitzsimmons Fight 01897NULL100Documentary,News,Sport    5.3480 │
│    502movie     Bohemios                      Bohemios                      01905NULL100NULL4.115 │
│    574movie     The Story of the Kelly Gang   The Story of the Kelly Gang   01906NULL70Action,Adventure,Biography6.0845 │
│    591movie     The Prodigal Son              L'enfant prodigue             01907NULL90Drama                     4.420 │
│    615movie     Robbery Under Arms            Robbery Under Arms            01907NULLNULLDrama                     4.325 │
│    679movie     The Fairylogue and Radio-PlaysThe Fairylogue and Radio-Plays01908NULL120Adventure,Fantasy         5.168 │
│    862movie     Faldgruben                    Faldgruben                    01909NULLNULLNULL4.417 │
│    886movie     Hamlet, Prince of Denmark     Hamlet                        01910NULLNULLDrama                     4.740 │
│    941movie     Locura de amor                Locura de amor                01909NULL45Drama                     4.527 │
│       │
└────────┴────────────┴────────────────────────────────┴────────────────────────────────┴──────────┴────────────┴──────────┴─────────────────┴────────────────────────────┴────────────────┴───────────┘

Reorder the columns and drop some.

joined = joined.select(
    "tconst",
    "primary_title",
    "average_rating",
    "num_votes",
    "genres",
    "runtime_minutes",
)
joined
┏━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┓
┃ tconst  primary_title                   average_rating  num_votes  genres                      runtime_minutes ┃
┡━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━┩
│ int64stringfloat64int64stringint64           │
├────────┼────────────────────────────────┼────────────────┼───────────┼────────────────────────────┼─────────────────┤
│      9Miss Jerry                    5.3206Romance                   45 │
│    147The Corbett-Fitzsimmons Fight 5.3480Documentary,News,Sport    100 │
│    502Bohemios                      4.115NULL100 │
│    574The Story of the Kelly Gang   6.0845Action,Adventure,Biography70 │
│    591The Prodigal Son              4.420Drama                     90 │
│    615Robbery Under Arms            4.325Drama                     NULL │
│    679The Fairylogue and Radio-Plays5.168Adventure,Fantasy         120 │
│    862Faldgruben                    4.417NULLNULL │
│    886Hamlet, Prince of Denmark     4.740Drama                     NULL │
│    941Locura de amor                4.527Drama                     45 │
│       │
└────────┴────────────────────────────────┴────────────────┴───────────┴────────────────────────────┴─────────────────┘

Sort by the average rating.

joined = joined.order_by([_.average_rating.desc(), _.num_votes.desc()])
joined
┏━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┓
┃ tconst    primary_title                                   average_rating  num_votes  genres                            runtime_minutes ┃
┡━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━┩
│ int64stringfloat64int64stringint64           │
├──────────┼────────────────────────────────────────────────┼────────────────┼───────────┼──────────────────────────────────┼─────────────────┤
│ 28315145Nano Narayanappa                              10.0266Comedy                          NULL │
│ 26345008Go On, Be Brave                               10.084Documentary,Sport               111 │
│  4971262Road King                                     10.067Drama                           100 │
│ 28626183Aaromalinte Adyathe Pranayam                  10.047Family                          NULL │
│ 21101694In Pursuit of Flavor                          10.040Documentary                     60 │
│ 21065098Prince Oak Oakleyski Starring Supremacy       10.030NULL60 │
│ 10925174Petersburg's tram                             10.027Documentary,History             74 │
│ 10663998Widow of the Revolution: The Anna Larina Story10.025Documentary                     NULL │
│ 12454598Rutas del más allá: El documental             10.024Biography,Documentary,Reality-TV70 │
│ 28334813Purple Poppins                                10.023Crime,Drama                     98 │
│         │
└──────────┴────────────────────────────────────────────────┴────────────────┴───────────┴──────────────────────────────────┴─────────────────┘

A lot of 10/10 movies I haven’t heard of … let’s filter to movies with at least N votes.

N = 50000
joined = joined.filter(_.num_votes > N)
joined
┏━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┓
┃ tconst   primary_title                                  average_rating  num_votes  genres                   runtime_minutes ┃
┡━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━┩
│ int64stringfloat64int64stringint64           │
├─────────┼───────────────────────────────────────────────┼────────────────┼───────────┼─────────────────────────┼─────────────────┤
│  111161The Shawshank Redemption                     9.32793065Drama                  142 │
│   68646The Godfather                                9.21945537Crime,Drama            175 │
│  468569The Dark Knight                              9.02773504Action,Crime,Drama     152 │
│  167260The Lord of the Rings: The Return of the King9.01912955Action,Adventure,Drama 201 │
│  108052Schindler's List                             9.01404536Biography,Drama,History195 │
│   71562The Godfather Part II                        9.01321642Crime,Drama            202 │
│   5008312 Angry Men                                 9.0829776Crime,Drama            96 │
│  110912Pulp Fiction                                 8.92142607Crime,Drama            154 │
│ 1375666Inception                                    8.82462299Action,Adventure,Sci-Fi148 │
│  137523Fight Club                                   8.82227179Drama                  139 │
│        │
└─────────┴───────────────────────────────────────────────┴────────────────┴───────────┴─────────────────────────┴─────────────────┘

What if you’re in the mood for a bad movie?

joined = joined.order_by([_.average_rating.asc(), _.num_votes.desc()])
joined
┏━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┓
┃ tconst    primary_title                        average_rating  num_votes  genres                    runtime_minutes ┃
┡━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━┩
│ int64stringfloat64int64stringint64           │
├──────────┼─────────────────────────────────────┼────────────────┼───────────┼──────────────────────────┼─────────────────┤
│  5988370Reis                               1.073916Biography,Drama         108 │
│  7886848Sadak 2                            1.296417Action,Drama            133 │
│  1702443Justin Bieber: Never Say Never     1.776373Documentary,Music       105 │
│ 10888594Radhe                              1.9179168Action,Crime,Thriller   109 │
│  1213644Disaster Movie                     1.993564Comedy,Sci-Fi           87 │
│  7668842Enes Batur: Imagination or Reality?2.055797Comedy                  110 │
│   362165Son of the Mask                    2.258304Comedy,Family,Fantasy   94 │
│   799949Epic Movie                         2.4108203Adventure,Comedy,Fantasy86 │
│   185183Battlefield Earth                  2.582019Action,Adventure,Sci-Fi 118 │
│  1098327Dragonball Evolution               2.577753Action,Adventure,Fantasy85 │
│         │
└──────────┴─────────────────────────────────────┴────────────────┴───────────┴──────────────────────────┴─────────────────┘

And specifically a bad comedy?

joined = joined.filter(_.genres.contains("Comedy"))
joined
┏━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┓
┃ tconst    primary_title                        average_rating  num_votes  genres                    runtime_minutes ┃
┡━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━┩
│ int64stringfloat64int64stringint64           │
├──────────┼─────────────────────────────────────┼────────────────┼───────────┼──────────────────────────┼─────────────────┤
│  1213644Disaster Movie                     1.993564Comedy,Sci-Fi           87 │
│  7668842Enes Batur: Imagination or Reality?2.055797Comedy                  110 │
│   362165Son of the Mask                    2.258304Comedy,Family,Fantasy   94 │
│   799949Epic Movie                         2.4108203Adventure,Comedy,Fantasy86 │
│ 10350922Laxmii                             2.558222Action,Comedy,Horror    141 │
│   299930Gigli                              2.650152Comedy,Crime,Romance    121 │
│  1073498Meet the Spartans                  2.8110428Comedy,Fantasy          87 │
│   466342Date Movie                         2.861307Comedy,Romance          83 │
│  5697572Cats                               2.854206Comedy,Drama,Family     110 │
│   810913Jack and Jill                      3.388301Comedy                  91 │
│         │
└──────────┴─────────────────────────────────────┴────────────────┴───────────┴──────────────────────────┴─────────────────┘

Perfect!

Next Steps

We only used two of the IMDB tables. What else can we do with the rest of the data? Play around and let us know!

Back to top