Better PyPI stats with Python

clickhouse
shiny
Author

Cody Peterson

Published

September 3, 2024

Ibis + ClickHouse + Shiny for Python = better PyPI stats.

Overview

PyPI Stats is a great resource for Python package download statistics from PyPI. However, it only contains 180 days of data and lacks more detailed analysis we might be interested in. In this post, we’ll build a dynamic Python application for better PyPI stats using ClickHouse as our data platform, Ibis as our Python data interface, and Shiny for Python as our dashboarding tool.

What about ClickPy?

ClickPy is an existing open source and reproducible project built on the same data with ClickHouse. The primary difference is that ClickPy uses SQL and JavaScript whereas this project is in Python. We also focus on different visualizations and metrics.

Prerequisites

Install the required dependencies:

pip install 'ibis-framework[clickhouse]' plotly

Then run imports and setup:

import ibis
import plotly.express as px
import clickhouse_connect

px.defaults.template = "plotly_dark"
ibis.options.interactive = True

Connecting to ClickHouse

You can connect to the public ClickHouse playground’s PyPI database:

host = "clickpy-clickhouse.clickhouse.com"
port = 443
user = "play"
database = "pypi"

con = ibis.clickhouse.connect(
    host=host,
    port=port,
    user=user,
    database=database,
)
con.list_tables()
['countries',
 'countries_dict',
 'last_updated_dict',
 'projects',
 'pypi',
 'pypi_downloads',
 'pypi_downloads_by_version',
 'pypi_downloads_by_version_mv',
 'pypi_downloads_max_min',
 'pypi_downloads_max_min_mv',
 'pypi_downloads_mv',
 'pypi_downloads_per_day',
 'pypi_downloads_per_day_by_version',
 'pypi_downloads_per_day_by_version_by_country',
 'pypi_downloads_per_day_by_version_by_country_mv',
 'pypi_downloads_per_day_by_version_by_file_type',
 'pypi_downloads_per_day_by_version_by_file_type_mv',
 'pypi_downloads_per_day_by_version_by_installer_by_type',
 'pypi_downloads_per_day_by_version_by_installer_by_type_by_country',
 'pypi_downloads_per_day_by_version_by_installer_by_type_by_country_mv',
 'pypi_downloads_per_day_by_version_by_installer_by_type_mv',
 'pypi_downloads_per_day_by_version_by_python',
 'pypi_downloads_per_day_by_version_by_python_by_country',
 'pypi_downloads_per_day_by_version_by_python_by_country_mv',
 'pypi_downloads_per_day_by_version_by_python_mv',
 'pypi_downloads_per_day_by_version_by_system',
 'pypi_downloads_per_day_by_version_by_system_by_country',
 'pypi_downloads_per_day_by_version_by_system_by_country_mv',
 'pypi_downloads_per_day_by_version_by_system_mv',
 'pypi_downloads_per_day_by_version_mv',
 'pypi_downloads_per_day_mv',
 'pypi_downloads_per_month',
 'pypi_downloads_per_month_mv']

Top packages by downloads

Let’s start by looking at the most downloaded packages:

overall_t = con.table("pypi_downloads")

top_k = 10_000
overall_t = (
    overall_t.order_by(ibis.desc("count"))
    .limit(top_k)
    .mutate(rank=1 + ibis.row_number().over(order_by=ibis.desc("count")))
    .rename({"downloads": "count"})
    .relocate("rank")
    .order_by("rank")
)
overall_t
┏━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┓
┃ rank   project          downloads   ┃
┡━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━┩
│ int64!string!int64      │
├───────┼─────────────────┼─────────────┤
│     1boto3          25779423674 │
│     2urllib3        16339094961 │
│     3botocore       15210036424 │
│     4requests       13514466781 │
│     5setuptools     12656259406 │
│     6idna           11843633399 │
│     7certifi        11823436764 │
│     8s3transfer     11059293057 │
│     9six            10982223268 │
│    10python-dateutil10756832919 │
│      │
└───────┴─────────────────┴─────────────┘

Analyzing downloads for a package

Let’s choose a package to analyze:

project = "clickhouse-connect"

And see where it ranks in the top downloads:

overall_t.filter(overall_t["project"] == project)
┏━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┓
┃ rank   project             downloads ┃
┡━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━┩
│ int64!string!int64    │
├───────┼────────────────────┼───────────┤
│  2303clickhouse-connect23148161 │
└───────┴────────────────────┴───────────┘

Let’s look at downloads per day by various categories for this package:

downloads_t = con.table(
    "pypi_downloads_per_day_by_version_by_installer_by_type_by_country"
).filter(ibis._["project"] == project)
downloads_t
┏━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━┓
┃ project             version  date        installer     type         country_code  count  ┃
┡━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━┩
│ !string!string!date!string!string!string!int64 │
├────────────────────┼─────────┼────────────┼──────────────┼─────────────┼──────────────┼────────┤
│ clickhouse-connect0.2.10 2024-08-29bandersnatchbdist_wheelUS          18 │
│ clickhouse-connect0.2.10 2024-08-29bandersnatchsdist      US          1 │
│ clickhouse-connect0.2.8  2024-08-29bandersnatchbdist_wheelUS          9 │
│ clickhouse-connect0.2.9  2024-08-29bandersnatchbdist_wheelUS          16 │
│ clickhouse-connect0.2.9  2024-08-29bandersnatchsdist      US          1 │
│ clickhouse-connect0.3.0  2024-08-29bandersnatchbdist_wheelUS          19 │
│ clickhouse-connect0.3.0  2024-08-29bandersnatchsdist      US          1 │
│ clickhouse-connect0.3.1  2024-08-29~bdist_wheelUS          26 │
│ clickhouse-connect0.3.1  2024-08-29~sdist      US          1 │
│ clickhouse-connect0.3.1  2024-08-29Browser     bdist_wheelUS          1 │
│  │
└────────────────────┴─────────┴────────────┴──────────────┴─────────────┴──────────────┴────────┘

We might be interested in the day-of-week seasonality of downloads:

def day_of_week_bar(t):
    t = t.mutate(day_of_week=t["date"].day_of_week.full_name())
    t = t.group_by("day_of_week").agg(downloads=ibis._["count"].sum())
    c = px.bar(
        t,
        x="day_of_week",
        y="downloads",
        category_orders={
            "day_of_week": [
                "Sunday",
                "Monday",
                "Tuesday",
                "Wednesday",
                "Thursday",
                "Friday",
                "Saturday",
            ]
        },
    )
    return c


day_of_week_bar(downloads_t)

Or the rolling 28-day downloads metric:

def rolling_downloads(t, days=28):
    t = t.mutate(
        timestamp=t["date"].cast("timestamp"),
    )
    t = t.group_by("timestamp").agg(downloads=ibis._["count"].sum())
    t = t.select(
        "timestamp",
        rolling_downloads=ibis._["downloads"]
        .sum()
        .over(
            ibis.window(
                order_by="timestamp",
                preceding=days,
                following=0,
            )
        ),
    ).order_by("timestamp")

    c = px.line(
        t,
        x="timestamp",
        y="rolling_downloads",
    )

    return c


rolling_downloads(downloads_t)

Or rolling 28-days downloads by version with a few options for how to group versions:

def rolling_downloads_by_version(t, days=28, version_style="major.minor"):
    t = t.mutate(
        timestamp=t["date"].cast("timestamp"),
    )

    match version_style:
        case "major":
            t = t.mutate(version=t["version"].split(".")[0])
        case "major.minor":
            t = t.mutate(
                version=t["version"].split(".")[0] + "." + t["version"].split(".")[1]
            )
        case _:
            pass

    t = t.group_by("timestamp", "version").agg(downloads=ibis._["count"].sum())

    t = t.select(
        "timestamp",
        "version",
        rolling_downloads=ibis._["downloads"]
        .sum()
        .over(
            ibis.window(
                order_by="timestamp",
                group_by="version",
                preceding=28,
                following=0,
            )
        ),
    ).order_by("timestamp")

    c = px.line(
        t,
        x="timestamp",
        y="rolling_downloads",
        color="version",
        category_orders={
            "version": reversed(
                sorted(
                    t.distinct(on="version")["version"].to_pyarrow().to_pylist(),
                    key=lambda x: tuple(int(y) for y in x.split(".") if y.isdigit()),
                )
            )
        },
    )
    return c


rolling_downloads_by_version(downloads_t)

Or a bar chart of downloads grouped by a category:

def group_bar(t, group_by="installer", log_y=True):
    t = t.mutate(timestamp=t["date"].cast("timestamp"))
    t = t.group_by(group_by).agg(downloads=ibis._["count"].sum())
    t = t.order_by(ibis.desc("downloads"))

    c = px.bar(
        t,
        x=group_by,
        y="downloads",
        log_y=log_y,
    )
    return c


group_bar(downloads_t)

Since we’re just writing Python, we’ve already organized code into functions for reuse. We can rerun our above analytics on a different package by changing the project variable and adjusting our table accordingly. We’ll demonstrate this with a few more packages below.

Notice you could also pass in Ibis tables from different backends, not just ClickHouse, to these functions!

package = "pyarrow"

t = con.table(
    "pypi_downloads_per_day_by_version_by_installer_by_type_by_country"
).filter(ibis._["project"] == package)
day_of_week_bar(t)
rolling_downloads(t)
rolling_downloads_by_version(t, version_style="major")
group_bar(t, group_by="installer")
package = "chdb"

t = con.table(
    "pypi_downloads_per_day_by_version_by_installer_by_type_by_country"
).filter(ibis._["project"] == package)
day_of_week_bar(t)
rolling_downloads(t)
rolling_downloads_by_version(t, version_style="major.minor")
group_bar(t, group_by="installer")
package = "ibis-framework"

t = con.table(
    "pypi_downloads_per_day_by_version_by_installer_by_type_by_country"
).filter(ibis._["project"] == package)
day_of_week_bar(t)
rolling_downloads(t)