import ibis
import plotly.express as px
import clickhouse_connect
px.defaults.template = "plotly_dark"
ibis.options.interactive = TrueIbis + ClickHouse + Shiny for Python = better PyPI stats.
Overview
PyPI Stats is a great resource for Python package download statistics from PyPI. However, it only contains 180 days of data and lacks more detailed analysis we might be interested in. In this post, we’ll build a dynamic Python application for better PyPI stats using ClickHouse as our data platform, Ibis as our Python data interface, and Shiny for Python as our dashboarding tool.
ClickPy is an existing open source and reproducible project built on the same data with ClickHouse. The primary difference is that ClickPy uses SQL and JavaScript whereas this project is in Python. We also focus on different visualizations and metrics.
Prerequisites
Install the required dependencies:
pip install 'ibis-framework[clickhouse]' plotlyThen run imports and setup:
Connecting to ClickHouse
You can connect to the public ClickHouse playground’s PyPI database:
host = "clickpy-clickhouse.clickhouse.com"
port = 443
user = "play"
database = "pypi"
con = ibis.clickhouse.connect(
host=host,
port=port,
user=user,
database=database,
)
con.list_tables()['countries',
'countries_dict',
'last_updated_dict',
'projects',
'pypi',
'pypi_downloads',
'pypi_downloads_by_version',
'pypi_downloads_by_version_mv',
'pypi_downloads_max_min',
'pypi_downloads_max_min_mv',
'pypi_downloads_mv',
'pypi_downloads_per_day',
'pypi_downloads_per_day_by_version',
'pypi_downloads_per_day_by_version_by_country',
'pypi_downloads_per_day_by_version_by_country_mv',
'pypi_downloads_per_day_by_version_by_file_type',
'pypi_downloads_per_day_by_version_by_file_type_mv',
'pypi_downloads_per_day_by_version_by_installer_by_type',
'pypi_downloads_per_day_by_version_by_installer_by_type_by_country',
'pypi_downloads_per_day_by_version_by_installer_by_type_by_country_mv',
'pypi_downloads_per_day_by_version_by_installer_by_type_mv',
'pypi_downloads_per_day_by_version_by_python',
'pypi_downloads_per_day_by_version_by_python_by_country',
'pypi_downloads_per_day_by_version_by_python_by_country_mv',
'pypi_downloads_per_day_by_version_by_python_mv',
'pypi_downloads_per_day_by_version_by_system',
'pypi_downloads_per_day_by_version_by_system_by_country',
'pypi_downloads_per_day_by_version_by_system_by_country_mv',
'pypi_downloads_per_day_by_version_by_system_mv',
'pypi_downloads_per_day_by_version_mv',
'pypi_downloads_per_day_mv',
'pypi_downloads_per_month',
'pypi_downloads_per_month_mv']
Top packages by downloads
Let’s start by looking at the most downloaded packages:
overall_t = con.table("pypi_downloads")
top_k = 10_000
overall_t = (
overall_t.order_by(ibis.desc("count"))
.limit(top_k)
.mutate(rank=1 + ibis.row_number().over(order_by=ibis.desc("count")))
.rename({"downloads": "count"})
.relocate("rank")
.order_by("rank")
)
overall_t┏━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┓ ┃ rank ┃ project ┃ downloads ┃ ┡━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━┩ │ int64 │ !string │ !int64 │ ├───────┼─────────────────┼─────────────┤ │ 1 │ boto3 │ 25779423674 │ │ 2 │ urllib3 │ 16339094961 │ │ 3 │ botocore │ 15210036424 │ │ 4 │ requests │ 13514466781 │ │ 5 │ setuptools │ 12656259406 │ │ 6 │ idna │ 11843633399 │ │ 7 │ certifi │ 11823436764 │ │ 8 │ s3transfer │ 11059293057 │ │ 9 │ six │ 10982223268 │ │ 10 │ python-dateutil │ 10756832919 │ │ … │ … │ … │ └───────┴─────────────────┴─────────────┘
Analyzing downloads for a package
Let’s choose a package to analyze:
project = "clickhouse-connect"And see where it ranks in the top downloads:
overall_t.filter(overall_t["project"] == project)┏━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┓ ┃ rank ┃ project ┃ downloads ┃ ┡━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━┩ │ int64 │ !string │ !int64 │ ├───────┼────────────────────┼───────────┤ │ 2303 │ clickhouse-connect │ 23148161 │ └───────┴────────────────────┴───────────┘
Let’s look at downloads per day by various categories for this package:
downloads_t = con.table(
"pypi_downloads_per_day_by_version_by_installer_by_type_by_country"
).filter(ibis._["project"] == project)
downloads_t┏━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━┓ ┃ project ┃ version ┃ date ┃ installer ┃ type ┃ country_code ┃ count ┃ ┡━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━┩ │ !string │ !string │ !date │ !string │ !string │ !string │ !int64 │ ├────────────────────┼─────────┼────────────┼──────────────┼─────────────┼──────────────┼────────┤ │ clickhouse-connect │ 0.2.10 │ 2024-08-29 │ bandersnatch │ bdist_wheel │ US │ 18 │ │ clickhouse-connect │ 0.2.10 │ 2024-08-29 │ bandersnatch │ sdist │ US │ 1 │ │ clickhouse-connect │ 0.2.8 │ 2024-08-29 │ bandersnatch │ bdist_wheel │ US │ 9 │ │ clickhouse-connect │ 0.2.9 │ 2024-08-29 │ bandersnatch │ bdist_wheel │ US │ 16 │ │ clickhouse-connect │ 0.2.9 │ 2024-08-29 │ bandersnatch │ sdist │ US │ 1 │ │ clickhouse-connect │ 0.3.0 │ 2024-08-29 │ bandersnatch │ bdist_wheel │ US │ 19 │ │ clickhouse-connect │ 0.3.0 │ 2024-08-29 │ bandersnatch │ sdist │ US │ 1 │ │ clickhouse-connect │ 0.3.1 │ 2024-08-29 │ ~ │ bdist_wheel │ US │ 26 │ │ clickhouse-connect │ 0.3.1 │ 2024-08-29 │ ~ │ sdist │ US │ 1 │ │ clickhouse-connect │ 0.3.1 │ 2024-08-29 │ Browser │ bdist_wheel │ US │ 1 │ │ … │ … │ … │ … │ … │ … │ … │ └────────────────────┴─────────┴────────────┴──────────────┴─────────────┴──────────────┴────────┘
We might be interested in the day-of-week seasonality of downloads:
def day_of_week_bar(t):
t = t.mutate(day_of_week=t["date"].day_of_week.full_name())
t = t.group_by("day_of_week").agg(downloads=ibis._["count"].sum())
c = px.bar(
t,
x="day_of_week",
y="downloads",
category_orders={
"day_of_week": [
"Sunday",
"Monday",
"Tuesday",
"Wednesday",
"Thursday",
"Friday",
"Saturday",
]
},
)
return c
day_of_week_bar(downloads_t)Or the rolling 28-day downloads metric:
def rolling_downloads(t, days=28):
t = t.mutate(
timestamp=t["date"].cast("timestamp"),
)
t = t.group_by("timestamp").agg(downloads=ibis._["count"].sum())
t = t.select(
"timestamp",
rolling_downloads=ibis._["downloads"]
.sum()
.over(
ibis.window(
order_by="timestamp",
preceding=days,
following=0,
)
),
).order_by("timestamp")
c = px.line(
t,
x="timestamp",
y="rolling_downloads",
)
return c
rolling_downloads(downloads_t)Or rolling 28-days downloads by version with a few options for how to group versions:
def rolling_downloads_by_version(t, days=28, version_style="major.minor"):
t = t.mutate(
timestamp=t["date"].cast("timestamp"),
)
match version_style:
case "major":
t = t.mutate(version=t["version"].split(".")[0])
case "major.minor":
t = t.mutate(
version=t["version"].split(".")[0] + "." + t["version"].split(".")[1]
)
case _:
pass
t = t.group_by("timestamp", "version").agg(downloads=ibis._["count"].sum())
t = t.select(
"timestamp",
"version",
rolling_downloads=ibis._["downloads"]
.sum()
.over(
ibis.window(
order_by="timestamp",
group_by="version",
preceding=28,
following=0,
)
),
).order_by("timestamp")
c = px.line(
t,
x="timestamp",
y="rolling_downloads",
color="version",
category_orders={
"version": reversed(
sorted(
t.distinct(on="version")["version"].to_pyarrow().to_pylist(),
key=lambda x: tuple(int(y) for y in x.split(".") if y.isdigit()),
)
)
},
)
return c
rolling_downloads_by_version(downloads_t)Or a bar chart of downloads grouped by a category:
def group_bar(t, group_by="installer", log_y=True):
t = t.mutate(timestamp=t["date"].cast("timestamp"))
t = t.group_by(group_by).agg(downloads=ibis._["count"].sum())
t = t.order_by(ibis.desc("downloads"))
c = px.bar(
t,
x=group_by,
y="downloads",
log_y=log_y,
)
return c
group_bar(downloads_t)Since we’re just writing Python, we’ve already organized code into functions for reuse. We can rerun our above analytics on a different package by changing the project variable and adjusting our table accordingly. We’ll demonstrate this with a few more packages below.
Notice you could also pass in Ibis tables from different backends, not just ClickHouse, to these functions!
package = "pyarrow"
t = con.table(
"pypi_downloads_per_day_by_version_by_installer_by_type_by_country"
).filter(ibis._["project"] == package)day_of_week_bar(t)rolling_downloads(t)rolling_downloads_by_version(t, version_style="major")group_bar(t, group_by="installer")package = "chdb"
t = con.table(
"pypi_downloads_per_day_by_version_by_installer_by_type_by_country"
).filter(ibis._["project"] == package)day_of_week_bar(t)rolling_downloads(t)rolling_downloads_by_version(t, version_style="major.minor")group_bar(t, group_by="installer")package = "ibis-framework"
t = con.table(
"pypi_downloads_per_day_by_version_by_installer_by_type_by_country"
).filter(ibis._["project"] == package)day_of_week_bar(t)rolling_downloads(t)rolling_downloads_by_version(t, version_style="major")group_bar(t, group_by="installer")Shiny for Python application
We can create an interactive Shiny with Python application using the code above to serve as a dashboard for better PyPI stats:
See the GitHub repository for the most up-to-date code.
Reproducing and contributing
The code is available on GitHub. Feel free to open an issue or pull request if you have any suggested improvements.