Analyze IMDB data using Ibis

Let’s use the Ibis examples module and the DuckDB backend to find some movies to watch.

Adapted from Phillip in the Cloud’s livestream using the same data.

Imports

For this example, we’ll just use Ibis.

from ibis.interactive import *
1
This import imports ibis.examples as ex.

Fetch the example data

We can use the ibis.examples module to fetch the IMDB data. Ibis automatically caches the data on disk so subsequent runs don’t require fetching from cloud storage on each call to fetch.

name_basics = ex.imdb_name_basics.fetch()
name_basics
┏━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
┃ nconst     primaryName      birthYear  deathYear  primaryProfession                   knownForTitles                          ┃
┡━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
│ stringstringint64int64stringstring                                  │
├───────────┼─────────────────┼───────────┼───────────┼────────────────────────────────────┼─────────────────────────────────────────┤
│ nm0000001Fred Astaire   18991987actor,miscellaneous,producer      tt0072308,tt0050419,tt0053137,tt0027125 │
│ nm0000002Lauren Bacall  19242014actress,soundtrack,archive_footagett0037382,tt0075213,tt0117057,tt0038355 │
│ nm0000003Brigitte Bardot1934NULLactress,music_department,producer tt0057345,tt0049189,tt0056404,tt0054452 │
│ nm0000004John Belushi   19491982actor,writer,music_department     tt0072562,tt0077975,tt0080455,tt0078723 │
│ nm0000005Ingmar Bergman 19182007writer,director,actor             tt0050986,tt0083922,tt0050976,tt0069467 │
│ nm0000006Ingrid Bergman 19151982actress,producer,soundtrack       tt0034583,tt0036855,tt0038109,tt0038787 │
│ nm0000007Humphrey Bogart18991957actor,producer,miscellaneous      tt0034583,tt0042593,tt0043265,tt0037382 │
│ nm0000008Marlon Brando  19242004actor,director,writer             tt0078788,tt0068646,tt0047296,tt0070849 │
│ nm0000009Richard Burton 19251984actor,producer,director           tt0061184,tt0087803,tt0059749,tt0057877 │
│ nm0000010James Cagney   18991986actor,director,producer           tt0029870,tt0031867,tt0042041,tt0035575 │
│                                        │
└───────────┴─────────────────┴───────────┴───────────┴────────────────────────────────────┴─────────────────────────────────────────┘

To ensure column names are Pythonic, we can rename as snake_case.

name_basics.rename("snake_case")
┏━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
┃ nconst     primary_name     birth_year  death_year  primary_profession                  known_for_titles                        ┃
┡━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
│ stringstringint64int64stringstring                                  │
├───────────┼─────────────────┼────────────┼────────────┼────────────────────────────────────┼─────────────────────────────────────────┤
│ nm0000001Fred Astaire   18991987actor,miscellaneous,producer      tt0072308,tt0050419,tt0053137,tt0027125 │
│ nm0000002Lauren Bacall  19242014actress,soundtrack,archive_footagett0037382,tt0075213,tt0117057,tt0038355 │
│ nm0000003Brigitte Bardot1934NULLactress,music_department,producer tt0057345,tt0049189,tt0056404,tt0054452 │
│ nm0000004John Belushi   19491982actor,writer,music_department     tt0072562,tt0077975,tt0080455,tt0078723 │
│ nm0000005Ingmar Bergman 19182007writer,director,actor             tt0050986,tt0083922,tt0050976,tt0069467 │
│ nm0000006Ingrid Bergman 19151982actress,producer,soundtrack       tt0034583,tt0036855,tt0038109,tt0038787 │
│ nm0000007Humphrey Bogart18991957actor,producer,miscellaneous      tt0034583,tt0042593,tt0043265,tt0037382 │
│ nm0000008Marlon Brando  19242004actor,director,writer             tt0078788,tt0068646,tt0047296,tt0070849 │
│ nm0000009Richard Burton 19251984actor,producer,director           tt0061184,tt0087803,tt0059749,tt0057877 │
│ nm0000010James Cagney   18991986actor,director,producer           tt0029870,tt0031867,tt0042041,tt0035575 │
│                                        │
└───────────┴─────────────────┴────────────┴────────────┴────────────────────────────────────┴─────────────────────────────────────────┘

Let’s grab all of the relevant IMDB tables and rename columns.

name_basics = ex.imdb_name_basics.fetch().rename("snake_case")
title_akas = ex.imdb_title_akas.fetch().rename("snake_case")
title_basics = ex.imdb_title_basics.fetch().rename("snake_case")
title_crew = ex.imdb_title_crew.fetch().rename("snake_case")
title_episode = ex.imdb_title_episode.fetch().rename("snake_case")
title_principals = ex.imdb_title_principals.fetch().rename("snake_case")
title_ratings = ex.imdb_title_ratings.fetch().rename("snake_case")

Preview the data

We’ll print out the first few rows of each table to get an idea of what is contained in each.

name_basics.head()
┏━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
┃ nconst     primary_name     birth_year  death_year  primary_profession                  known_for_titles                        ┃
┡━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
│ stringstringint64int64stringstring                                  │
├───────────┼─────────────────┼────────────┼────────────┼────────────────────────────────────┼─────────────────────────────────────────┤
│ nm0000001Fred Astaire   18991987actor,miscellaneous,producer      tt0072308,tt0050419,tt0053137,tt0027125 │
│ nm0000002Lauren Bacall  19242014actress,soundtrack,archive_footagett0037382,tt0075213,tt0117057,tt0038355 │
│ nm0000003Brigitte Bardot1934NULLactress,music_department,producer tt0057345,tt0049189,tt0056404,tt0054452 │
│ nm0000004John Belushi   19491982actor,writer,music_department     tt0072562,tt0077975,tt0080455,tt0078723 │
│ nm0000005Ingmar Bergman 19182007writer,director,actor             tt0050986,tt0083922,tt0050976,tt0069467 │
└───────────┴─────────────────┴────────────┴────────────┴────────────────────────────────────┴─────────────────────────────────────────┘
title_akas.head()
┏━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┓
┃ title_id   ordering  title                      region  language  types        attributes     is_original_title ┃
┡━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━┩
│ stringint64stringstringstringstringstringint64             │
├───────────┼──────────┼───────────────────────────┼────────┼──────────┼─────────────┼───────────────┼───────────────────┤
│ tt00000011Carmencita               NULLNULLoriginal   NULL1 │
│ tt00000012Carmencita               DE    NULLNULLliteral title0 │
│ tt00000013Carmencita               US    NULLimdbDisplayNULL0 │
│ tt00000014Carmencita - spanyol táncHU    NULLimdbDisplayNULL0 │
│ tt00000015Καρμενσίτα               GR    NULLimdbDisplayNULL0 │
└───────────┴──────────┴───────────────────────────┴────────┴──────────┴─────────────┴───────────────┴───────────────────┘
title_basics.head()
┏━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┓
┃ tconst     title_type  primary_title           original_title          is_adult  start_year  end_year  runtime_minutes  genres                   ┃
┡━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━┩
│ stringstringstringstringint64int64stringint64string                   │
├───────────┼────────────┼────────────────────────┼────────────────────────┼──────────┼────────────┼──────────┼─────────────────┼──────────────────────────┤
│ tt0000001short     Carmencita            Carmencita            01894NULL1Documentary,Short        │
│ tt0000002short     Le clown et ses chiensLe clown et ses chiens01892NULL5Animation,Short          │
│ tt0000003short     Pauvre Pierrot        Pauvre Pierrot        01892NULL4Animation,Comedy,Romance │
│ tt0000004short     Un bon bock           Un bon bock           01892NULL12Animation,Short          │
│ tt0000005short     Blacksmith Scene      Blacksmith Scene      01893NULL1Comedy,Short             │
└───────────┴────────────┴────────────────────────┴────────────────────────┴──────────┴────────────┴──────────┴─────────────────┴──────────────────────────┘
title_crew.head()
┏━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━┓
┃ tconst     directors  writers ┃
┡━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━┩
│ stringstringstring  │
├───────────┼───────────┼─────────┤
│ tt0000001nm0005690NULL    │
│ tt0000002nm0721526NULL    │
│ tt0000003nm0721526NULL    │
│ tt0000004nm0721526NULL    │
│ tt0000005nm0005690NULL    │
└───────────┴───────────┴─────────┘
title_episode.head()
┏━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┓
┃ tconst     parent_tconst  season_number  episode_number ┃
┡━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━┩
│ stringstringint64int64          │
├───────────┼───────────────┼───────────────┼────────────────┤
│ tt0041951tt0041038    19 │
│ tt0042816tt0989125    117 │
│ tt0042889tt0989125    NULLNULL │
│ tt0043426tt0040051    342 │
│ tt0043631tt0989125    216 │
└───────────┴───────────────┴───────────────┴────────────────┘
title_principals.head()
┏━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┓
┃ tconst     ordering  nconst     category         job                      characters ┃
┡━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━┩
│ stringint64stringstringstringstring     │
├───────────┼──────────┼───────────┼─────────────────┼─────────────────────────┼────────────┤
│ tt00000011nm1588970self           NULL["Self"]   │
│ tt00000012nm0005690director       NULLNULL       │
│ tt00000013nm0005690producer       producer               NULL       │
│ tt00000014nm0374658cinematographerdirector of photographyNULL       │
│ tt00000021nm0721526director       NULLNULL       │
└───────────┴──────────┴───────────┴─────────────────┴─────────────────────────┴────────────┘
title_ratings.head()
┏━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┓
┃ tconst     average_rating  num_votes ┃
┡━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━┩
│ stringfloat64int64     │
├───────────┼────────────────┼───────────┤
│ tt00000015.72036 │
│ tt00000025.7272 │
│ tt00000036.51985 │
│ tt00000045.4178 │
│ tt00000056.22745 │
└───────────┴────────────────┴───────────┘

Basic data exploration

Let’s check how many records are in each table. It’s just Python, so we can construct a dictionary and iterate through it in a for loop.

tables = {
    "name_basics": name_basics,
    "title_akas": title_akas,
    "title_basics": title_basics,
    "title_crew": title_crew,
    "title_episode": title_episode,
    "title_principals": title_principals,
    "title_ratings": title_ratings,
}
max_name_len = max(map(len, tables.keys())) + 1
print("Length of tables:")
for t in tables:
    print(f"\t{t.ljust(max_name_len)}: {tables[t].count().to_pandas():,}")
Length of tables:
    name_basics      : 13,359,981
    title_akas       : 40,591,770
    title_basics     : 10,814,709
    title_crew       : 9,995,917
    title_episode    : 8,124,470
    title_principals : 84,654,684
    title_ratings    : 1,416,620

Clean data

Looking at the data, the nconst and tconst columns seem to be unique identifiers. Let’s confirm and adjust them accordingly.

name_basics.head()
┏━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
┃ nconst     primary_name     birth_year  death_year  primary_profession                  known_for_titles                        ┃
┡━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
│ stringstringint64int64stringstring                                  │
├───────────┼─────────────────┼────────────┼────────────┼────────────────────────────────────┼─────────────────────────────────────────┤
│ nm0000001Fred Astaire   18991987actor,miscellaneous,producer      tt0072308,tt0050419,tt0053137,tt0027125 │
│ nm0000002Lauren Bacall  19242014actress,soundtrack,archive_footagett0037382,tt0075213,tt0117057,tt0038355 │
│ nm0000003Brigitte Bardot1934NULLactress,music_department,producer tt0057345,tt0049189,tt0056404,tt0054452 │
│ nm0000004John Belushi   19491982actor,writer,music_department     tt0072562,tt0077975,tt0080455,tt0078723 │
│ nm0000005Ingmar Bergman 19182007writer,director,actor             tt0050986,tt0083922,tt0050976,tt0069467 │
└───────────┴─────────────────┴────────────┴────────────┴────────────────────────────────────┴─────────────────────────────────────────┘

Check the number of unique nconst values.

name_basics.nconst.nunique()

┌──────────┐
│ 13359981 │
└──────────┘

Confirm it’s equal to the number of rows.

name_basics.nconst.nunique() == name_basics.count()

┌──────┐
│ True │
└──────┘

Mutate the table to convert nconst to an integer.

t = name_basics.mutate(nconst=_.nconst.replace("nm", "").cast("int"))
t.head()
┏━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
┃ nconst  primary_name     birth_year  death_year  primary_profession                  known_for_titles                        ┃
┡━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
│ int64stringint64int64stringstring                                  │
├────────┼─────────────────┼────────────┼────────────┼────────────────────────────────────┼─────────────────────────────────────────┤
│      1Fred Astaire   18991987actor,miscellaneous,producer      tt0072308,tt0050419,tt0053137,tt0027125 │
│      2Lauren Bacall  19242014actress,soundtrack,archive_footagett0037382,tt0075213,tt0117057,tt0038355 │
│      3Brigitte Bardot1934NULLactress,music_department,producer tt0057345,tt0049189,tt0056404,tt0054452 │
│      4John Belushi   19491982actor,writer,music_department     tt0072562,tt0077975,tt0080455,tt0078723 │
│      5Ingmar Bergman 19182007writer,director,actor             tt0050986,tt0083922,tt0050976,tt0069467 │
└────────┴─────────────────┴────────────┴────────────┴────────────────────────────────────┴─────────────────────────────────────────┘

Let’s also turn primary_profession into an array of strings instead of a single comma-separated string.

t = t.mutate(primary_profession=_.primary_profession.split(","))
t
┏━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
┃ nconst  primary_name     birth_year  death_year  primary_profession                       known_for_titles                        ┃
┡━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩
│ int64stringint64int64array<string>string                                  │
├────────┼─────────────────┼────────────┼────────────┼─────────────────────────────────────────┼─────────────────────────────────────────┤
│      1Fred Astaire   18991987['actor', 'miscellaneous', ... +1]tt0072308,tt0050419,tt0053137,tt0027125 │
│      2Lauren Bacall  19242014['actress', 'soundtrack', ... +1]tt0037382,tt0075213,tt0117057,tt0038355 │
│      3Brigitte Bardot1934NULL['actress', 'music_department', ... +1]tt0057345,tt0049189,tt0056404,tt0054452 │
│      4John Belushi   19491982['actor', 'writer', ... +1]tt0072562,tt0077975,tt0080455,tt0078723 │
│      5Ingmar Bergman 19182007['writer', 'director', ... +1]tt0050986,tt0083922,tt0050976,tt0069467 │
│      6Ingrid Bergman 19151982['actress', 'producer', ... +1]tt0034583,tt0036855,tt0038109,tt0038787 │
│      7Humphrey Bogart18991957['actor', 'producer', ... +1]tt0034583,tt0042593,tt0043265,tt0037382 │
│      8Marlon Brando  19242004['actor', 'director', ... +1]tt0078788,tt0068646,tt0047296,tt0070849 │
│      9Richard Burton 19251984['actor', 'producer', ... +1]tt0061184,tt0087803,tt0059749,tt0057877 │
│     10James Cagney   18991986['actor', 'director', ... +1]tt0029870,tt0031867,tt0042041,tt0035575 │
│                                             │
└────────┴─────────────────┴────────────┴────────────┴─────────────────────────────────────────┴─────────────────────────────────────────┘

And, combining the two concepts, convert known_for_titles into an array of integers corresponding to tconst identifiers.

t = t.mutate(
    known_for_titles=_.known_for_titles.split(",").map(
        lambda tconst: tconst.replace("tt", "").cast("int")
    )
)
t
┏━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┓
┃ nconst  primary_name     birth_year  death_year  primary_profession                       known_for_titles       ┃
┡━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━┩
│ int64stringint64int64array<string>array<int64>           │
├────────┼─────────────────┼────────────┼────────────┼─────────────────────────────────────────┼────────────────────────┤
│      1Fred Astaire   18991987['actor', 'miscellaneous', ... +1][72308, 50419, ... +2] │
│      2Lauren Bacall  19242014['actress', 'soundtrack', ... +1][37382, 75213, ... +2] │
│      3Brigitte Bardot1934NULL['actress', 'music_department', ... +1][57345, 49189, ... +2] │
│      4John Belushi   19491982['actor', 'writer', ... +1][72562, 77975, ... +2] │
│      5Ingmar Bergman 19182007['writer', 'director', ... +1][50986, 83922, ... +2] │
│      6Ingrid Bergman 19151982['actress', 'producer', ... +1][34583, 36855, ... +2] │
│      7Humphrey Bogart18991957['actor', 'producer', ... +1][34583, 42593, ... +2] │
│      8Marlon Brando  19242004['actor', 'director', ... +1][78788, 68646, ... +2] │
│      9Richard Burton 19251984['actor', 'producer', ... +1][61184, 87803, ... +2] │
│     10James Cagney   18991986['actor', 'director', ... +1][29870, 31867, ... +2] │
│                            │
└────────┴─────────────────┴────────────┴────────────┴─────────────────────────────────────────┴────────────────────────┘

DRY-ing up the code

We can define functions to convert nconst and tconst to integers.

def nconst_to_int(nconst):
    return nconst.replace("nm", "").cast("int")


def tconst_to_int(tconst):
    return tconst.replace("tt", "").cast("int")

Then combine the previous data cleansing in a single mutate call.

name_basics = name_basics.mutate(
    nconst=nconst_to_int(_.nconst),
    primary_profession=_.primary_profession.split(","),
    known_for_titles=_.known_for_titles.split(",").map(tconst_to_int),
)
name_basics
┏━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┓
┃ nconst  primary_name     birth_year  death_year  primary_profession                       known_for_titles       ┃
┡━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━┩
│ int64stringint64int64array<string>array<int64>           │
├────────┼─────────────────┼────────────┼────────────┼─────────────────────────────────────────┼────────────────────────┤
│      1Fred Astaire   18991987['actor', 'miscellaneous', ... +1][72308, 50419, ... +2] │
│      2Lauren Bacall  19242014['actress', 'soundtrack', ... +1][37382, 75213, ... +2] │
│      3Brigitte Bardot1934NULL['actress', 'music_department', ... +1][57345, 49189, ... +2] │
│      4John Belushi   19491982['actor', 'writer', ... +1][72562, 77975, ... +2] │
│      5Ingmar Bergman 19182007['writer', 'director', ... +1][50986, 83922, ... +2] │
│      6Ingrid Bergman 19151982['actress', 'producer', ... +1][34583, 36855, ... +2] │
│      7Humphrey Bogart18991957['actor', 'producer', ... +1][34583, 42593, ... +2] │
│      8Marlon Brando  19242004['actor', 'director', ... +1][78788, 68646, ... +2] │
│      9Richard Burton 19251984['actor', 'producer', ... +1][61184, 87803, ... +2] │
│     10James Cagney   18991986['actor', 'director', ... +1][29870, 31867, ... +2] │
│                            │
└────────┴─────────────────┴────────────┴────────────┴─────────────────────────────────────────┴────────────────────────┘

We can use ibis.to_sql to see the SQL this generates.

ibis.to_sql(name_basics)
SELECT
  CAST(REPLACE("t0"."nconst", 'nm', '') AS BIGINT) AS "nconst",
  "t0"."primaryName" AS "primary_name",
  "t0"."birthYear" AS "birth_year",
  "t0"."deathYear" AS "death_year",
  STR_SPLIT("t0"."primaryProfession", ',') AS "primary_profession",
  LIST_APPLY(
    STR_SPLIT("t0"."knownForTitles", ','),
    __ibis_param_tconst__ -> CAST(REPLACE(__ibis_param_tconst__, 'tt', '') AS BIGINT)
  ) AS "known_for_titles"
FROM "imdb_name_basics" AS "t0"

Clean the rest of the tables. We’ll convert nconst and tconst columns consistently to allow for easy joining.

title_akas = title_akas.mutate(title_id=tconst_to_int(_.title_id)).rename(
    tconst="title_id"
)
title_basics = title_basics.mutate(tconst=tconst_to_int(_.tconst))
title_crew = title_crew.mutate(
    tconst=tconst_to_int(_.tconst),
    directors=_.directors.split(",").map(nconst_to_int),
    writers=_.writers.split(",").map(nconst_to_int),
)
title_episode = title_episode.mutate(
    tconst=tconst_to_int(_.tconst), parent_tconst=tconst_to_int(_.parent_tconst)
)
title_principals = title_principals.mutate(
    tconst=tconst_to_int(_.tconst), nconst=nconst_to_int(_.nconst)
)
title_ratings = title_ratings.mutate(tconst=tconst_to_int(_.tconst))

Finding good (and bad) movies to watch

Join the IMDB rankings with information about the movies.

joined = title_basics.join(title_ratings, "tconst")
joined
┏━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┓
┃ tconst  title_type  primary_title                       original_title                      is_adult  start_year  end_year  runtime_minutes  genres               average_rating  num_votes ┃
┡━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━┩
│ int64stringstringstringint64int64stringint64stringfloat64int64     │
├────────┼────────────┼────────────────────────────────────┼────────────────────────────────────┼──────────┼────────────┼──────────┼─────────────────┼─────────────────────┼────────────────┼───────────┤
│ 124058movie     The Girl and the Viscount         Pigen og greven                   01966NULL96Comedy,Family      6.4314 │
│ 124060movie     Plecarea Vlasinilor               Plecarea Vlasinilor               01982NULL108Drama              7.634 │
│ 124061movie     Pod Jezevci skalou                Pod Jezevci skalou                01978NULL74Family             6.3151 │
│ 124062movie     Those Damned Kids                 De pokkers unger                  01947NULL93Crime,Drama,Family 7.184 │
│ 124064tvEpisode Der Teufel hat den Schnaps gemachtDer Teufel hat den Schnaps gemacht01981NULL81Crime,Drama,Mystery6.652 │
│ 124066tvEpisode Die Spur des 13. Apostels         Die Spur des 13. Apostels         01983NULL64Crime,Drama,Mystery6.938 │
│ 124067tvEpisode Es ist nicht immer Sonnenschein   Es ist nicht immer Sonnenschein   01983NULL82Crime,Drama,Mystery5.741 │
│ 124068tvEpisode Holzwege                          Holzwege                          01978NULL75Crime,Drama,Mystery6.844 │
│ 124069tvEpisode Im Tal                            Im Tal                            01982NULL76Crime,Drama,Mystery5.940 │
│ 124070tvEpisode Minuten zu spät                   Minuten zu spät                   01972NULL65Crime,Drama,Mystery7.141 │
│       │
└────────┴────────────┴────────────────────────────────────┴────────────────────────────────────┴──────────┴────────────┴──────────┴─────────────────┴─────────────────────┴────────────────┴───────────┘
joined.title_type.value_counts().order_by(_.title_type_count.desc())
┏━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┓
┃ title_type    title_type_count ┃
┡━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━┩
│ stringint64            │
├──────────────┼──────────────────┤
│ tvEpisode   706019 │
│ movie       306726 │
│ short       156909 │
│ tvSeries    94209 │
│ video       52785 │
│ tvMovie     52535 │
│ tvMiniSeries16859 │
│ videoGame   16077 │
│ tvSpecial   12212 │
│ tvShort     2289 │
└──────────────┴──────────────────┘

Filter down to movies.

joined = joined.filter(_.title_type == "movie")
joined
┏━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┓
┃ tconst  title_type  primary_title                 original_title                        is_adult  start_year  end_year  runtime_minutes  genres                   average_rating  num_votes ┃
┡━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━┩
│ int64stringstringstringint64int64stringint64stringfloat64int64     │
├────────┼────────────┼──────────────────────────────┼──────────────────────────────────────┼──────────┼────────────┼──────────┼─────────────────┼─────────────────────────┼────────────────┼───────────┤
│   2101movie     Cleopatra                   Cleopatra                           01912NULL100Drama,History          5.1619 │
│   2130movie     Dante's Inferno             L'Inferno                           01911NULL71Adventure,Drama,Fantasy7.03408 │
│   2153movie     The Great Circus CatastropheDødsspring til hest fra cirkuskuplen01912NULL45Drama                  6.081 │
│   2199movie     From the Manger to the CrossFrom the Manger to the Cross        01912NULL71Biography,Drama        5.8635 │
│   2220movie     Grannie                     Grannie                             01912NULLNULLNULL3.846 │
│   2305movie     Life of Villa               Life of Villa                       01912NULLNULLDocumentary,War        7.629 │
│   2375movie     La mort du duc d'Enghien    La mort du duc d'Enghien            01912NULLNULLNULL5.712 │
│   2395movie     Le nègre blanc              Le nègre blanc                      01912NULLNULLNULL4.814 │
│   2405movie     Oliver Twist                Oliver Twist                        01912NULLNULLDrama                  5.041 │
│   2406movie     Oliver Twist                Oliver Twist                        01912NULLNULLDrama                  4.626 │
│       │
└────────┴────────────┴──────────────────────────────┴──────────────────────────────────────┴──────────┴────────────┴──────────┴─────────────────┴─────────────────────────┴────────────────┴───────────┘

Reorder the columns and drop some.

joined = joined.select(
    "tconst",
    "primary_title",
    "average_rating",
    "num_votes",
    "genres",
    "runtime_minutes",
)
joined
┏━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┓
┃ tconst  primary_title                 average_rating  num_votes  genres                   runtime_minutes ┃
┡━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━┩
│ int64stringfloat64int64stringint64           │
├────────┼──────────────────────────────┼────────────────┼───────────┼─────────────────────────┼─────────────────┤
│   2101Cleopatra                   5.1619Drama,History          100 │
│   2130Dante's Inferno             7.03408Adventure,Drama,Fantasy71 │
│   2153The Great Circus Catastrophe6.081Drama                  45 │
│   2199From the Manger to the Cross5.8635Biography,Drama        71 │
│   2305Life of Villa               7.629Documentary,War        NULL │
│   2395Le nègre blanc              4.814NULLNULL │
│   2405Oliver Twist                5.041Drama                  NULL │
│   2406Oliver Twist                4.626Drama                  NULL │
│   2423Passion                     6.6996Biography,Drama,Romance113 │
│   2445Quo Vadis?                  6.1438Drama,History          120 │
│       │
└────────┴──────────────────────────────┴────────────────┴───────────┴─────────────────────────┴─────────────────┘

Sort by the average rating.

joined = joined.order_by([_.average_rating.desc(), _.num_votes.desc()])
joined
┏━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┓
┃ tconst    primary_title                               average_rating  num_votes  genres                         runtime_minutes ┃
┡━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━┩
│ int64stringfloat64int64stringint64           │
├──────────┼────────────────────────────────────────────┼────────────────┼───────────┼───────────────────────────────┼─────────────────┤
│ 30177820A Maiden's Tale                           10.0181Drama,Romance                59 │
│ 30209484The Dark Web                              10.0135Crime,Thriller               110 │
│ 21139706Prince Oak Oakleyski and News Interviewers10.059Documentary,News,Reality-TV  60 │
│ 21065098Prince Oak Oakleyski Starring Supremacy   10.031NULL60 │
│ 10925174Petersburg's tram                         10.028Documentary,History          74 │
│  9052412Magnificus                                10.028Biography,Documentary,History70 │
│ 30981554Non Solo Barzellette                      10.023Comedy                       NULL │
│  9303810La Nueve, Heroes en el olvido             10.022Documentary                  NULL │
│ 21236128We Are Art Through the Eyes of Annalaura  10.016Documentary                  69 │
│ 13458442Vegan on a Jet Plane: Bali                10.014Documentary                  50 │
│         │
└──────────┴────────────────────────────────────────────┴────────────────┴───────────┴───────────────────────────────┴─────────────────┘

A lot of 10/10 movies I haven’t heard of … let’s filter to movies with at least N votes.

N = 50000
joined = joined.filter(_.num_votes > N)
joined
┏━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┓
┃ tconst    primary_title                                      average_rating  num_votes  genres                   runtime_minutes ┃
┡━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━┩
│ int64stringfloat64int64stringint64           │
├──────────┼───────────────────────────────────────────────────┼────────────────┼───────────┼─────────────────────────┼─────────────────┤
│   111161The Shawshank Redemption                         9.32873242Drama                  142 │
│    68646The Godfather                                    9.22001286Crime,Drama            175 │
│   468569The Dark Knight                                  9.02855626Action,Crime,Drama     152 │
│   167260The Lord of the Rings: The Return of the King    9.01968792Action,Adventure,Drama 201 │
│   108052Schindler's List                                 9.01443212Biography,Drama,History195 │
│    71562The Godfather Part II                            9.01357703Crime,Drama            202 │
│    5008312 Angry Men                                     9.0859397Crime,Drama            96 │
│ 2384920412th Fail                                        9.0105676Biography,Drama        147 │
│   110912Pulp Fiction                                     8.92207397Crime,Drama            154 │
│   120737The Lord of the Rings: The Fellowship of the Ring8.91996832Action,Adventure,Drama 178 │
│         │
└──────────┴───────────────────────────────────────────────────┴────────────────┴───────────┴─────────────────────────┴─────────────────┘

What if you’re in the mood for a bad movie?

joined = joined.order_by([_.average_rating.asc(), _.num_votes.desc()])
joined
┏━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┓
┃ tconst    primary_title                        average_rating  num_votes  genres                    runtime_minutes ┃
┡━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━┩
│ int64stringfloat64int64stringint64           │
├──────────┼─────────────────────────────────────┼────────────────┼───────────┼──────────────────────────┼─────────────────┤
│  5988370Reis                               1.074052Biography,Drama         108 │
│  7886848Sadak 2                            1.296671Action,Drama            133 │
│  1702443Justin Bieber: Never Say Never     1.776428Documentary,Music       105 │
│ 10888594Radhe                              1.9179825Action,Crime,Thriller   109 │
│  1213644Disaster Movie                     1.994399Comedy,Sci-Fi           87 │
│  7668842Enes Batur: Imagination or Reality?2.055981Comedy                  110 │
│   362165Son of the Mask                    2.359171Comedy,Family,Fantasy   94 │
│   799949Epic Movie                         2.4109367Adventure,Comedy,Fantasy86 │
│   185183Battlefield Earth                  2.582933Action,Adventure,Sci-Fi 117 │
│  1098327Dragonball Evolution               2.579144Action,Adventure,Fantasy85 │
│         │
└──────────┴─────────────────────────────────────┴────────────────┴───────────┴──────────────────────────┴─────────────────┘

And specifically a bad comedy?

joined = joined.filter(_.genres.contains("Comedy"))
joined
┏━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┓
┃ tconst    primary_title                        average_rating  num_votes  genres                    runtime_minutes ┃
┡━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━┩
│ int64stringfloat64int64stringint64           │
├──────────┼─────────────────────────────────────┼────────────────┼───────────┼──────────────────────────┼─────────────────┤
│  1213644Disaster Movie                     1.994399Comedy,Sci-Fi           87 │
│  7668842Enes Batur: Imagination or Reality?2.055981Comedy                  110 │
│   362165Son of the Mask                    2.359171Comedy,Family,Fantasy   94 │
│   799949Epic Movie                         2.4109367Adventure,Comedy,Fantasy86 │
│ 10350922Laxmii                             2.558686Action,Comedy,Horror    141 │
│   299930Gigli                              2.650648Comedy,Crime,Romance    121 │
│  1073498Meet the Spartans                  2.8111394Comedy,Fantasy          87 │
│   466342Date Movie                         2.862115Comedy,Romance          83 │
│  5697572Cats                               2.855718Comedy,Drama,Family     110 │
│   810913Jack and Jill                      3.390079Comedy                  91 │
│         │
└──────────┴─────────────────────────────────────┴────────────────┴───────────┴──────────────────────────┴─────────────────┘

Perfect!

Next Steps

We only used two of the IMDB tables. What else can we do with the rest of the data? Play around and let us know!

Back to top