Analyze IMDB data using Ibis

Let’s use the Ibis examples module and the DuckDB backend to find some movies to watch.

Adapted from Phillip in the Cloud’s livestream using the same data.

Imports

For this example, we’ll just use Ibis.

from ibis.interactive import *
1
This import imports ibis.examples as ex.

Fetch the example data

We can use the ibis.examples module to fetch the IMDB data. Ibis automatically caches the data on disk so subsequent runs don’t require fetching from cloud storage on each call to fetch.

name_basics = ex.imdb_name_basics.fetch()
name_basics
┏━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
┃ nconst     primaryName      birthYear  deathYear  primaryProfession                   knownForTitles                          ┃
┡━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
│ stringstringint64int64stringstring                                  │
├───────────┼─────────────────┼───────────┼───────────┼────────────────────────────────────┼─────────────────────────────────────────┤
│ nm0000001Fred Astaire   18991987actor,miscellaneous,producer      tt0072308,tt0050419,tt0053137,tt0027125 │
│ nm0000002Lauren Bacall  19242014actress,soundtrack,archive_footagett0037382,tt0075213,tt0117057,tt0038355 │
│ nm0000003Brigitte Bardot1934NULLactress,music_department,producer tt0057345,tt0049189,tt0056404,tt0054452 │
│ nm0000004John Belushi   19491982actor,writer,music_department     tt0072562,tt0077975,tt0080455,tt0078723 │
│ nm0000005Ingmar Bergman 19182007writer,director,actor             tt0050986,tt0083922,tt0050976,tt0069467 │
│ nm0000006Ingrid Bergman 19151982actress,producer,soundtrack       tt0034583,tt0036855,tt0038109,tt0038787 │
│ nm0000007Humphrey Bogart18991957actor,producer,miscellaneous      tt0034583,tt0042593,tt0043265,tt0037382 │
│ nm0000008Marlon Brando  19242004actor,director,writer             tt0078788,tt0068646,tt0047296,tt0070849 │
│ nm0000009Richard Burton 19251984actor,producer,director           tt0061184,tt0087803,tt0059749,tt0057877 │
│ nm0000010James Cagney   18991986actor,director,producer           tt0029870,tt0031867,tt0042041,tt0035575 │
│                                        │
└───────────┴─────────────────┴───────────┴───────────┴────────────────────────────────────┴─────────────────────────────────────────┘

To ensure column names are Pythonic, we can rename as snake_case.

name_basics.rename("snake_case")
┏━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
┃ nconst     primary_name     birth_year  death_year  primary_profession                  known_for_titles                        ┃
┡━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
│ stringstringint64int64stringstring                                  │
├───────────┼─────────────────┼────────────┼────────────┼────────────────────────────────────┼─────────────────────────────────────────┤
│ nm0000001Fred Astaire   18991987actor,miscellaneous,producer      tt0072308,tt0050419,tt0053137,tt0027125 │
│ nm0000002Lauren Bacall  19242014actress,soundtrack,archive_footagett0037382,tt0075213,tt0117057,tt0038355 │
│ nm0000003Brigitte Bardot1934NULLactress,music_department,producer tt0057345,tt0049189,tt0056404,tt0054452 │
│ nm0000004John Belushi   19491982actor,writer,music_department     tt0072562,tt0077975,tt0080455,tt0078723 │
│ nm0000005Ingmar Bergman 19182007writer,director,actor             tt0050986,tt0083922,tt0050976,tt0069467 │
│ nm0000006Ingrid Bergman 19151982actress,producer,soundtrack       tt0034583,tt0036855,tt0038109,tt0038787 │
│ nm0000007Humphrey Bogart18991957actor,producer,miscellaneous      tt0034583,tt0042593,tt0043265,tt0037382 │
│ nm0000008Marlon Brando  19242004actor,director,writer             tt0078788,tt0068646,tt0047296,tt0070849 │
│ nm0000009Richard Burton 19251984actor,producer,director           tt0061184,tt0087803,tt0059749,tt0057877 │
│ nm0000010James Cagney   18991986actor,director,producer           tt0029870,tt0031867,tt0042041,tt0035575 │
│                                        │
└───────────┴─────────────────┴────────────┴────────────┴────────────────────────────────────┴─────────────────────────────────────────┘

Let’s grab all of the relevant IMDB tables and rename columns.

name_basics = ex.imdb_name_basics.fetch().rename("snake_case")
title_akas = ex.imdb_title_akas.fetch().rename("snake_case")
title_basics = ex.imdb_title_basics.fetch().rename("snake_case")
title_crew = ex.imdb_title_crew.fetch().rename("snake_case")
title_episode = ex.imdb_title_episode.fetch().rename("snake_case")
title_principals = ex.imdb_title_principals.fetch().rename("snake_case")
title_ratings = ex.imdb_title_ratings.fetch().rename("snake_case")

Preview the data

We’ll print out the first few rows of each table to get an idea of what is contained in each.

name_basics.head()
┏━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
┃ nconst     primary_name     birth_year  death_year  primary_profession                  known_for_titles                        ┃
┡━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
│ stringstringint64int64stringstring                                  │
├───────────┼─────────────────┼────────────┼────────────┼────────────────────────────────────┼─────────────────────────────────────────┤
│ nm0000001Fred Astaire   18991987actor,miscellaneous,producer      tt0072308,tt0050419,tt0053137,tt0027125 │
│ nm0000002Lauren Bacall  19242014actress,soundtrack,archive_footagett0037382,tt0075213,tt0117057,tt0038355 │
│ nm0000003Brigitte Bardot1934NULLactress,music_department,producer tt0057345,tt0049189,tt0056404,tt0054452 │
│ nm0000004John Belushi   19491982actor,writer,music_department     tt0072562,tt0077975,tt0080455,tt0078723 │
│ nm0000005Ingmar Bergman 19182007writer,director,actor             tt0050986,tt0083922,tt0050976,tt0069467 │
└───────────┴─────────────────┴────────────┴────────────┴────────────────────────────────────┴─────────────────────────────────────────┘
title_akas.head()
┏━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┓
┃ title_id   ordering  title                      region  language  types        attributes     is_original_title ┃
┡━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━┩
│ stringint64stringstringstringstringstringint64             │
├───────────┼──────────┼───────────────────────────┼────────┼──────────┼─────────────┼───────────────┼───────────────────┤
│ tt00000011Carmencita               NULLNULLoriginal   NULL1 │
│ tt00000012Carmencita               DE    NULLNULLliteral title0 │
│ tt00000013Carmencita               US    NULLimdbDisplayNULL0 │
│ tt00000014Carmencita - spanyol táncHU    NULLimdbDisplayNULL0 │
│ tt00000015Καρμενσίτα               GR    NULLimdbDisplayNULL0 │
└───────────┴──────────┴───────────────────────────┴────────┴──────────┴─────────────┴───────────────┴───────────────────┘
title_basics.head()
┏━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┓
┃ tconst     title_type  primary_title           original_title          is_adult  start_year  end_year  runtime_minutes  genres                   ┃
┡━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━┩
│ stringstringstringstringint64int64stringint64string                   │
├───────────┼────────────┼────────────────────────┼────────────────────────┼──────────┼────────────┼──────────┼─────────────────┼──────────────────────────┤
│ tt0000001short     Carmencita            Carmencita            01894NULL1Documentary,Short        │
│ tt0000002short     Le clown et ses chiensLe clown et ses chiens01892NULL5Animation,Short          │
│ tt0000003short     Pauvre Pierrot        Pauvre Pierrot        01892NULL4Animation,Comedy,Romance │
│ tt0000004short     Un bon bock           Un bon bock           01892NULL12Animation,Short          │
│ tt0000005short     Blacksmith Scene      Blacksmith Scene      01893NULL1Comedy,Short             │
└───────────┴────────────┴────────────────────────┴────────────────────────┴──────────┴────────────┴──────────┴─────────────────┴──────────────────────────┘
title_crew.head()
┏━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━┓
┃ tconst     directors  writers ┃
┡━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━┩
│ stringstringstring  │
├───────────┼───────────┼─────────┤
│ tt0000001nm0005690NULL    │
│ tt0000002nm0721526NULL    │
│ tt0000003nm0721526NULL    │
│ tt0000004nm0721526NULL    │
│ tt0000005nm0005690NULL    │
└───────────┴───────────┴─────────┘
title_episode.head()
┏━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┓
┃ tconst     parent_tconst  season_number  episode_number ┃
┡━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━┩
│ stringstringint64int64          │
├───────────┼───────────────┼───────────────┼────────────────┤
│ tt0041951tt0041038    19 │
│ tt0042816tt0989125    117 │
│ tt0042889tt0989125    NULLNULL │
│ tt0043426tt0040051    342 │
│ tt0043631tt0989125    216 │
└───────────┴───────────────┴───────────────┴────────────────┘
title_principals.head()
┏━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┓
┃ tconst     ordering  nconst     category         job                      characters ┃
┡━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━┩
│ stringint64stringstringstringstring     │
├───────────┼──────────┼───────────┼─────────────────┼─────────────────────────┼────────────┤
│ tt00000011nm1588970self           NULL["Self"]   │
│ tt00000012nm0005690director       NULLNULL       │
│ tt00000013nm0005690producer       producer               NULL       │
│ tt00000014nm0374658cinematographerdirector of photographyNULL       │
│ tt00000021nm0721526director       NULLNULL       │
└───────────┴──────────┴───────────┴─────────────────┴─────────────────────────┴────────────┘
title_ratings.head()
┏━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┓
┃ tconst     average_rating  num_votes ┃
┡━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━┩
│ stringfloat64int64     │
├───────────┼────────────────┼───────────┤
│ tt00000015.72036 │
│ tt00000025.7272 │
│ tt00000036.51985 │
│ tt00000045.4178 │
│ tt00000056.22745 │
└───────────┴────────────────┴───────────┘

Basic data exploration

Let’s check how many records are in each table. It’s just Python, so we can construct a dictionary and iterate through it in a for loop.

tables = {
    "name_basics": name_basics,
    "title_akas": title_akas,
    "title_basics": title_basics,
    "title_crew": title_crew,
    "title_episode": title_episode,
    "title_principals": title_principals,
    "title_ratings": title_ratings,
}
max_name_len = max(map(len, tables.keys())) + 1
print("Length of tables:")
for t in tables:
    print(f"\t{t.ljust(max_name_len)}: {tables[t].count().to_pandas():,}")
Length of tables:
    name_basics      : 13,359,981
    title_akas       : 40,591,770
    title_basics     : 10,814,709
    title_crew       : 9,995,917
    title_episode    : 8,124,470
    title_principals : 84,654,684
    title_ratings    : 1,416,620

Clean data

Looking at the data, the nconst and tconst columns seem to be unique identifiers. Let’s confirm and adjust them accordingly.

name_basics.head()
┏━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
┃ nconst     primary_name     birth_year  death_year  primary_profession                  known_for_titles                        ┃
┡━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
│ stringstringint64int64stringstring                                  │
├───────────┼─────────────────┼────────────┼────────────┼────────────────────────────────────┼─────────────────────────────────────────┤
│ nm0000001Fred Astaire   18991987actor,miscellaneous,producer      tt0072308,tt0050419,tt0053137,tt0027125 │
│ nm0000002Lauren Bacall  19242014actress,soundtrack,archive_footagett0037382,tt0075213,tt0117057,tt0038355 │
│ nm0000003Brigitte Bardot1934NULLactress,music_department,producer tt0057345,tt0049189,tt0056404,tt0054452 │
│ nm0000004John Belushi   19491982actor,writer,music_department     tt0072562,tt0077975,tt0080455,tt0078723 │
│ nm0000005Ingmar Bergman 19182007writer,director,actor             tt0050986,tt0083922,tt0050976,tt0069467 │
└───────────┴─────────────────┴────────────┴────────────┴────────────────────────────────────┴─────────────────────────────────────────┘

Check the number of unique nconst values.

name_basics.nconst.nunique()

13359981

Confirm it’s equal to the number of rows.

name_basics.nconst.nunique() == name_basics.count()

True

Mutate the table to convert nconst to an integer.

t = name_basics.mutate(nconst=_.nconst.replace("nm", "").cast("int"))
t.head()
┏━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
┃ nconst  primary_name     birth_year  death_year  primary_profession                  known_for_titles                        ┃
┡━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
│ int64stringint64int64stringstring                                  │
├────────┼─────────────────┼────────────┼────────────┼────────────────────────────────────┼─────────────────────────────────────────┤
│      1Fred Astaire   18991987actor,miscellaneous,producer      tt0072308,tt0050419,tt0053137,tt0027125 │
│      2Lauren Bacall  19242014actress,soundtrack,archive_footagett0037382,tt0075213,tt0117057,tt0038355 │
│      3Brigitte Bardot1934NULLactress,music_department,producer tt0057345,tt0049189,tt0056404,tt0054452 │
│      4John Belushi   19491982actor,writer,music_department     tt0072562,tt0077975,tt0080455,tt0078723 │
│      5Ingmar Bergman 19182007writer,director,actor             tt0050986,tt0083922,tt0050976,tt0069467 │
└────────┴─────────────────┴────────────┴────────────┴────────────────────────────────────┴─────────────────────────────────────────┘

Let’s also turn primary_profession into an array of strings instead of a single comma-separated string.

t = t.mutate(primary_profession=_.primary_profession.split(","))
t
┏━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
┃ nconst  primary_name     birth_year  death_year  primary_profession                       known_for_titles                        ┃
┡━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
│ int64stringint64int64array<string>string                                  │
├────────┼─────────────────┼────────────┼────────────┼─────────────────────────────────────────┼─────────────────────────────────────────┤
│      1Fred Astaire   18991987['actor', 'miscellaneous', ... +1]tt0072308,tt0050419,tt0053137,tt0027125 │
│      2Lauren Bacall  19242014['actress', 'soundtrack', ... +1]tt0037382,tt0075213,tt0117057,tt0038355 │
│      3Brigitte Bardot1934NULL['actress', 'music_department', ... +1]tt0057345,tt0049189,tt0056404,tt0054452 │
│      4John Belushi   19491982['actor', 'writer', ... +1]tt0072562,tt0077975,tt0080455,tt0078723 │
│      5Ingmar Bergman 19182007['writer', 'director', ... +1]tt0050986,tt0083922,tt0050976,tt0069467 │
│      6Ingrid Bergman 19151982['actress', 'producer', ... +1]tt0034583,tt0036855,tt0038109,tt0038787 │
│      7Humphrey Bogart18991957['actor', 'producer', ... +1]tt0034583,tt0042593,tt0043265,tt0037382 │
│      8Marlon Brando  19242004['actor', 'director', ... +1]tt0078788,tt0068646,tt0047296,tt0070849 │
│      9Richard Burton 19251984['actor', 'producer', ... +1]tt0061184,tt0087803,tt0059749,tt0057877 │
│     10James Cagney   18991986['actor', 'director', ... +1]tt0029870,tt0031867,tt0042041,tt0035575 │
│                                             │
└────────┴─────────────────┴────────────┴────────────┴─────────────────────────────────────────┴─────────────────────────────────────────┘

And, combining the two concepts, convert known_for_titles into an array of integers corresponding to tconst identifiers.

t = t.mutate(
    known_for_titles=_.known_for_titles.split(",").map(
        lambda tconst: tconst.replace("tt", "").cast("int")
    )
)
t
┏━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┓
┃ nconst  primary_name     birth_year  death_year  primary_profession                       known_for_titles       ┃
┡━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━┩
│ int64stringint64int64array<string>array<int64>           │
├────────┼─────────────────┼────────────┼────────────┼─────────────────────────────────────────┼────────────────────────┤
│      1Fred Astaire   18991987['actor', 'miscellaneous', ... +1][72308, 50419, ... +2] │
│      2Lauren Bacall  19242014['actress', 'soundtrack', ... +1][37382, 75213, ... +2] │
│      3Brigitte Bardot1934NULL['actress', 'music_department', ... +1][57345, 49189, ... +2] │
│      4John Belushi   19491982['actor', 'writer', ... +1][72562, 77975, ... +2] │
│      5Ingmar Bergman 19182007['writer', 'director', ... +1][50986, 83922, ... +2] │
│      6Ingrid Bergman 19151982['actress', 'producer', ... +1][34583, 36855, ... +2] │
│      7Humphrey Bogart18991957['actor', 'producer', ... +1][34583, 42593, ... +2] │
│      8Marlon Brando  19242004['actor', 'director', ... +1][78788, 68646, ... +2] │
│      9Richard Burton 19251984['actor', 'producer', ... +1][61184, 87803, ... +2] │
│     10James Cagney   18991986['actor', 'director', ... +1][29870, 31867, ... +2] │
│                            │
└────────┴─────────────────┴────────────┴────────────┴─────────────────────────────────────────┴────────────────────────┘

DRY-ing up the code

We can define functions to convert nconst and tconst to integers.

def nconst_to_int(nconst):
    return nconst.replace("nm", "").cast("int")


def tconst_to_int(tconst):
    return tconst.replace("tt", "").cast("int")

Then combine the previous data cleansing in a single mutate call.

name_basics = name_basics.mutate(
    nconst=nconst_to_int(_.nconst),
    primary_profession=_.primary_profession.split(","),
    known_for_titles=_.known_for_titles.split(",").map(tconst_to_int),
)
name_basics
┏━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┓
┃ nconst  primary_name     birth_year  death_year  primary_profession                       known_for_titles       ┃
┡━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━┩
│ int64stringint64int64array<string>array<int64>           │
├────────┼─────────────────┼────────────┼────────────┼─────────────────────────────────────────┼────────────────────────┤
│      1Fred Astaire   18991987['actor', 'miscellaneous', ... +1][72308, 50419, ... +2] │
│      2Lauren Bacall  19242014['actress', 'soundtrack', ... +1][37382, 75213, ... +2] │
│      3Brigitte Bardot1934NULL['actress', 'music_department', ... +1][57345, 49189, ... +2] │
│      4John Belushi   19491982['actor', 'writer', ... +1][72562, 77975, ... +2] │
│      5Ingmar Bergman 19182007['writer', 'director', ... +1][50986, 83922, ... +2] │
│      6Ingrid Bergman 19151982['actress', 'producer', ... +1][34583, 36855, ... +2] │
│      7Humphrey Bogart18991957['actor', 'producer', ... +1][34583, 42593, ... +2] │
│      8Marlon Brando  19242004['actor', 'director', ... +1][78788, 68646, ... +2] │
│      9Richard Burton 19251984['actor', 'producer', ... +1][61184, 87803, ... +2] │
│     10James Cagney   18991986['actor', 'director', ... +1][29870, 31867, ... +2] │
│                            │
└────────┴─────────────────┴────────────┴────────────┴─────────────────────────────────────────┴────────────────────────┘

We can use ibis.to_sql to see the SQL this generates.

ibis.to_sql(name_basics)
SELECT
  CAST(REPLACE("t0"."nconst", 'nm', '') AS BIGINT) AS "nconst",
  "t0"."primaryName" AS "primary_name",
  "t0"."birthYear" AS "birth_year",
  "t0"."deathYear" AS "death_year",
  STR_SPLIT("t0"."primaryProfession", ',') AS "primary_profession",
  LIST_APPLY(
    STR_SPLIT("t0"."knownForTitles", ','),
    __ibis_param_tconst__ -> CAST(REPLACE(__ibis_param_tconst__, 'tt', '') AS BIGINT)
  ) AS "known_for_titles"
FROM "imdb_name_basics" AS "t0"

Clean the rest of the tables. We’ll convert nconst and tconst columns consistently to allow for easy joining.

title_akas = title_akas.mutate(title_id=tconst_to_int(_.title_id)).rename(
    tconst="title_id"
)
title_basics = title_basics.mutate(tconst=tconst_to_int(_.tconst))
title_crew = title_crew.mutate(
    tconst=tconst_to_int(_.tconst),
    directors=_.directors.split(",").map(nconst_to_int),
    writers=_.writers.split(",").map(nconst_to_int),
)
title_episode = title_episode.mutate(
    tconst=tconst_to_int(_.tconst), parent_tconst=tconst_to_int(_.parent_tconst)
)
title_principals = title_principals.mutate(
    tconst=tconst_to_int(_.tconst), nconst=nconst_to_int(_.nconst)
)
title_ratings = title_ratings.mutate(tconst=tconst_to_int(_.tconst))

Finding good (and bad) movies to watch

Join the IMDB rankings with information about the movies.

joined = title_basics.join(title_ratings, "tconst")
joined
┏━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┓
┃ tconst  title_type  primary_title                                original_title                               is_adult  start_year  end_year  runtime_minutes  genres                    average_rating  num_votes ┃
┡━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━┩
│ int64stringstringstringint64int64stringint64stringfloat64int64     │
├────────┼────────────┼─────────────────────────────────────────────┼─────────────────────────────────────────────┼──────────┼────────────┼──────────┼─────────────────┼──────────────────────────┼────────────────┼───────────┤
│      1short     Carmencita                                 Carmencita                                 01894NULL1Documentary,Short       5.72036 │
│      2short     Le clown et ses chiens                     Le clown et ses chiens                     01892NULL5Animation,Short         5.7272 │
│      3short     Pauvre Pierrot                             Pauvre Pierrot                             01892NULL4Animation,Comedy,Romance6.51985 │
│      5short     Blacksmith Scene                           Blacksmith Scene                           01893NULL1Comedy,Short            6.22745 │
│      6short     Chinese Opium Den                          Chinese Opium Den                          01894NULL1Short                   5.0183 │
│      7short     Corbett and Courtney Before the KinetographCorbett and Courtney Before the Kinetograph01894NULL1Short,Sport             5.4852 │
│      8short     Edison Kinetoscopic Record of a Sneeze     Edison Kinetoscopic Record of a Sneeze     01894NULL1Documentary,Short       5.42181 │
│      9movie     Miss Jerry                                 Miss Jerry                                 01894NULL45Romance                 5.3209 │
│     10short     Leaving the Factory                        La sortie de l'usine Lumière à Lyon        01895NULL1Documentary,Short       6.87506 │
│     11short     Akrobatisches Potpourri                    Akrobatisches Potpourri                    01895NULL1Documentary,Short       5.2385 │
│       │
└────────┴────────────┴─────────────────────────────────────────────┴─────────────────────────────────────────────┴──────────┴────────────┴──────────┴─────────────────┴──────────────────────────┴────────────────┴───────────┘
joined.title_type.value_counts().order_by(_.title_type_count.desc())
┏━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┓
┃ title_type    title_type_count ┃
┡━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━┩
│ stringint64            │
├──────────────┼──────────────────┤
│ tvEpisode   706019 │
│ movie       306726 │
│ short       156909 │
│ tvSeries    94209 │
│ video       52785 │
│ tvMovie     52535 │
│ tvMiniSeries16859 │
│ videoGame   16077 │
│ tvSpecial   12212 │
│ tvShort     2289 │
└──────────────┴──────────────────┘

Filter down to movies.

joined = joined.filter(_.title_type == "movie")
joined
┏━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┓
┃ tconst  title_type  primary_title              original_title             is_adult  start_year  end_year  runtime_minutes  genres           average_rating  num_votes ┃
┡━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━┩
│ int64stringstringstringint64int64stringint64stringfloat64int64     │
├────────┼────────────┼───────────────────────────┼───────────────────────────┼──────────┼────────────┼──────────┼─────────────────┼─────────────────┼────────────────┼───────────┤
│ 251661movie     One Day in August        Dekapentavgoustos        02001NULL106Drama          6.61193 │
│ 251666movie     Dhanya Te Santaji DhanajiDhanya Te Santaji Dhanaji01968NULLNULLNULL6.08 │
│ 251667movie     The Dinner Party         The Dinner Party         01995NULLNULLDrama          8.118 │
│ 251671movie     Domoy!                   Domoy!                   01983NULL94Drama          5.914 │
│ 251677movie     Cuban Blood              Dreaming of Julia        02003NULL109Comedy,Drama   5.8577 │
│ 251679movie     The Age of Violence      La edad de la violencia  01964NULL90Crime,Drama    5.945 │
│ 251681movie     The Empath               The Empath               02002NULLNULLNULL6.420 │
│ 251683movie     En la Hacienda de la FlorEn la Hacienda de la Flor01948NULLNULLDrama          6.512 │
│ 251690movie     Ethirum Puthirum         Ethirum Puthirum         02001NULLNULLAction         5.331 │
│ 251696movie     Fast Sofa                Fast Sofa                02001NULL109Adventure,Drama4.6959 │
│       │
└────────┴────────────┴───────────────────────────┴───────────────────────────┴──────────┴────────────┴──────────┴─────────────────┴─────────────────┴────────────────┴───────────┘

Reorder the columns and drop some.

joined = joined.select(
    "tconst",
    "primary_title",
    "average_rating",
    "num_votes",
    "genres",
    "runtime_minutes",
)
joined
┏━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┓
┃ tconst  primary_title                 average_rating  num_votes  genres                   runtime_minutes ┃
┡━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━┩
│ int64stringfloat64int64stringint64           │
├────────┼──────────────────────────────┼────────────────┼───────────┼─────────────────────────┼─────────────────┤
│   2101Cleopatra                   5.1619Drama,History          100 │
│   2130Dante's Inferno             7.03408Adventure,Drama,Fantasy71 │
│   2153The Great Circus Catastrophe6.081Drama                  45 │
│   2199From the Manger to the Cross5.8635Biography,Drama        71 │
│   2305Life of Villa               7.629Documentary,War        NULL │
│   2395Le nègre blanc              4.814NULLNULL │
│   2405Oliver Twist                5.041Drama                  NULL │
│   2406Oliver Twist                4.626Drama                  NULL │
│   2423Passion                     6.6996Biography,Drama,Romance113 │
│   2445Quo Vadis?                  6.1438Drama,History          120 │
│       │
└────────┴──────────────────────────────┴────────────────┴───────────┴─────────────────────────┴─────────────────┘

Sort by the average rating.

joined = joined.order_by([_.average_rating.desc(), _.num_votes.desc()])
joined
┏━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┓
┃ tconst    primary_title                               average_rating  num_votes  genres                         runtime_minutes ┃
┡━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━┩
│ int64stringfloat64int64stringint64           │
├──────────┼────────────────────────────────────────────┼────────────────┼───────────┼───────────────────────────────┼─────────────────┤
│ 30177820A Maiden's Tale                           10.0181Drama,Romance                59 │
│ 30209484The Dark Web                              10.0135Crime,Thriller               110 │
│ 21139706Prince Oak Oakleyski and News Interviewers10.059Documentary,News,Reality-TV  60 │
│ 21065098Prince Oak Oakleyski Starring Supremacy   10.031NULL60 │
│ 10925174Petersburg's tram                         10.028Documentary,History          74 │
│  9052412Magnificus                                10.028Biography,Documentary,History70 │
│ 30981554Non Solo Barzellette                      10.023Comedy                       NULL │
│  9303810La Nueve, Heroes en el olvido             10.022Documentary                  NULL │
│ 21236128We Are Art Through the Eyes of Annalaura  10.016Documentary                  69 │
│ 13458442Vegan on a Jet Plane: Bali                10.014Documentary                  50 │
│         │
└──────────┴────────────────────────────────────────────┴────────────────┴───────────┴───────────────────────────────┴─────────────────┘

A lot of 10/10 movies I haven’t heard of … let’s filter to movies with at least N votes.

N = 50000
joined = joined.filter(_.num_votes > N)
joined
┏━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┓
┃ tconst    primary_title                                      average_rating  num_votes  genres                   runtime_minutes ┃
┡━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━┩
│ int64stringfloat64int64stringint64           │
├──────────┼───────────────────────────────────────────────────┼────────────────┼───────────┼─────────────────────────┼─────────────────┤
│   111161The Shawshank Redemption                         9.32873242Drama                  142 │
│    68646The Godfather                                    9.22001286Crime,Drama            175 │
│   468569The Dark Knight                                  9.02855626Action,Crime,Drama     152 │
│   167260The Lord of the Rings: The Return of the King    9.01968792Action,Adventure,Drama 201 │
│   108052Schindler's List                                 9.01443212Biography,Drama,History195 │
│    71562The Godfather Part II                            9.01357703Crime,Drama            202 │
│    5008312 Angry Men                                     9.0859397Crime,Drama            96 │
│ 2384920412th Fail                                        9.0105676Biography,Drama        147 │
│   110912Pulp Fiction                                     8.92207397Crime,Drama            154 │
│   120737The Lord of the Rings: The Fellowship of the Ring8.91996832Action,Adventure,Drama 178 │
│         │
└──────────┴───────────────────────────────────────────────────┴────────────────┴───────────┴─────────────────────────┴─────────────────┘

What if you’re in the mood for a bad movie?

joined = joined.order_by([_.average_rating.asc(), _.num_votes.desc()])
joined
┏━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┓
┃ tconst    primary_title                        average_rating  num_votes  genres                    runtime_minutes ┃
┡━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━┩
│ int64stringfloat64int64stringint64           │
├──────────┼─────────────────────────────────────┼────────────────┼───────────┼──────────────────────────┼─────────────────┤
│  5988370Reis                               1.074052Biography,Drama         108 │
│  7886848Sadak 2                            1.296671Action,Drama            133 │
│  1702443Justin Bieber: Never Say Never     1.776428Documentary,Music       105 │
│ 10888594Radhe                              1.9179825Action,Crime,Thriller   109 │
│  1213644Disaster Movie                     1.994399Comedy,Sci-Fi           87 │
│  7668842Enes Batur: Imagination or Reality?2.055981Comedy                  110 │
│   362165Son of the Mask                    2.359171Comedy,Family,Fantasy   94 │
│   799949Epic Movie                         2.4109367Adventure,Comedy,Fantasy86 │
│   185183Battlefield Earth                  2.582933Action,Adventure,Sci-Fi 117 │
│  1098327Dragonball Evolution               2.579144Action,Adventure,Fantasy85 │
│         │
└──────────┴─────────────────────────────────────┴────────────────┴───────────┴──────────────────────────┴─────────────────┘

And specifically a bad comedy?

joined = joined.filter(_.genres.contains("Comedy"))
joined
┏━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┓
┃ tconst    primary_title                        average_rating  num_votes  genres                    runtime_minutes ┃
┡━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━┩
│ int64stringfloat64int64stringint64           │
├──────────┼─────────────────────────────────────┼────────────────┼───────────┼──────────────────────────┼─────────────────┤
│  1213644Disaster Movie                     1.994399Comedy,Sci-Fi           87 │
│  7668842Enes Batur: Imagination or Reality?2.055981Comedy                  110 │
│   362165Son of the Mask                    2.359171Comedy,Family,Fantasy   94 │
│   799949Epic Movie                         2.4109367Adventure,Comedy,Fantasy86 │
│ 10350922Laxmii                             2.558686Action,Comedy,Horror    141 │
│   299930Gigli                              2.650648Comedy,Crime,Romance    121 │
│  1073498Meet the Spartans                  2.8111394Comedy,Fantasy          87 │
│   466342Date Movie                         2.862115Comedy,Romance          83 │
│  5697572Cats                               2.855718Comedy,Drama,Family     110 │
│   810913Jack and Jill                      3.390079Comedy                  91 │
│         │
└──────────┴─────────────────────────────────────┴────────────────┴───────────┴──────────────────────────┴─────────────────┘

Perfect!

Next Steps

We only used two of the IMDB tables. What else can we do with the rest of the data? Play around and let us know!

Back to top