import ibis
import plotly.express as px
import clickhouse_connect
= "plotly_dark"
px.defaults.template = True ibis.options.interactive
Ibis + ClickHouse + Shiny for Python = better PyPI stats.
Overview
PyPI Stats is a great resource for Python package download statistics from PyPI. However, it only contains 180 days of data and lacks more detailed analysis we might be interested in. In this post, we’ll build a dynamic Python application for better PyPI stats using ClickHouse as our data platform, Ibis as our Python data interface, and Shiny for Python as our dashboarding tool.
ClickPy is an existing open source and reproducible project built on the same data with ClickHouse. The primary difference is that ClickPy uses SQL and JavaScript whereas this project is in Python. We also focus on different visualizations and metrics.
Prerequisites
Install the required dependencies:
pip install 'ibis-framework[clickhouse]' plotly
Then run imports and setup:
Connecting to ClickHouse
You can connect to the public ClickHouse playground’s PyPI database:
= "clickpy-clickhouse.clickhouse.com"
host = 443
port = "play"
user = "pypi"
database
= ibis.clickhouse.connect(
con =host,
host=port,
port=user,
user=database,
database
) con.list_tables()
['countries',
'countries_dict',
'last_updated_dict',
'projects',
'pypi',
'pypi_downloads',
'pypi_downloads_by_version',
'pypi_downloads_by_version_mv',
'pypi_downloads_max_min',
'pypi_downloads_max_min_mv',
'pypi_downloads_mv',
'pypi_downloads_per_day',
'pypi_downloads_per_day_by_version',
'pypi_downloads_per_day_by_version_by_country',
'pypi_downloads_per_day_by_version_by_country_mv',
'pypi_downloads_per_day_by_version_by_file_type',
'pypi_downloads_per_day_by_version_by_file_type_mv',
'pypi_downloads_per_day_by_version_by_installer_by_type',
'pypi_downloads_per_day_by_version_by_installer_by_type_by_country',
'pypi_downloads_per_day_by_version_by_installer_by_type_by_country_mv',
'pypi_downloads_per_day_by_version_by_installer_by_type_mv',
'pypi_downloads_per_day_by_version_by_python',
'pypi_downloads_per_day_by_version_by_python_by_country',
'pypi_downloads_per_day_by_version_by_python_by_country_mv',
'pypi_downloads_per_day_by_version_by_python_mv',
'pypi_downloads_per_day_by_version_by_system',
'pypi_downloads_per_day_by_version_by_system_by_country',
'pypi_downloads_per_day_by_version_by_system_by_country_mv',
'pypi_downloads_per_day_by_version_by_system_mv',
'pypi_downloads_per_day_by_version_mv',
'pypi_downloads_per_day_mv',
'pypi_downloads_per_month',
'pypi_downloads_per_month_mv']
Top packages by downloads
Let’s start by looking at the most downloaded packages:
= con.table("pypi_downloads")
overall_t
= 10_000
top_k = (
overall_t "count"))
overall_t.order_by(ibis.desc(
.limit(top_k)=1 + ibis.row_number().over(order_by=ibis.desc("count")))
.mutate(rank"downloads": "count"})
.rename({"rank")
.relocate("rank")
.order_by(
) overall_t
┏━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┓ ┃ rank ┃ project ┃ downloads ┃ ┡━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━┩ │ int64 │ !string │ !int64 │ ├───────┼─────────────────┼─────────────┤ │ 1 │ boto3 │ 25779423674 │ │ 2 │ urllib3 │ 16339094961 │ │ 3 │ botocore │ 15210036424 │ │ 4 │ requests │ 13514466781 │ │ 5 │ setuptools │ 12656259406 │ │ 6 │ idna │ 11843633399 │ │ 7 │ certifi │ 11823436764 │ │ 8 │ s3transfer │ 11059293057 │ │ 9 │ six │ 10982223268 │ │ 10 │ python-dateutil │ 10756832919 │ │ … │ … │ … │ └───────┴─────────────────┴─────────────┘
Analyzing downloads for a package
Let’s choose a package to analyze:
= "clickhouse-connect" project
And see where it ranks in the top downloads:
filter(overall_t["project"] == project) overall_t.
┏━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━┓ ┃ rank ┃ project ┃ downloads ┃ ┡━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━┩ │ int64 │ !string │ !int64 │ ├───────┼────────────────────┼───────────┤ │ 2303 │ clickhouse-connect │ 23148161 │ └───────┴────────────────────┴───────────┘
Let’s look at downloads per day by various categories for this package:
= con.table(
downloads_t "pypi_downloads_per_day_by_version_by_installer_by_type_by_country"
filter(ibis._["project"] == project)
). downloads_t
┏━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━┓ ┃ project ┃ version ┃ date ┃ installer ┃ type ┃ country_code ┃ count ┃ ┡━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━┩ │ !string │ !string │ !date │ !string │ !string │ !string │ !int64 │ ├────────────────────┼─────────┼────────────┼──────────────┼─────────────┼──────────────┼────────┤ │ clickhouse-connect │ 0.2.10 │ 2024-08-29 │ bandersnatch │ bdist_wheel │ US │ 18 │ │ clickhouse-connect │ 0.2.10 │ 2024-08-29 │ bandersnatch │ sdist │ US │ 1 │ │ clickhouse-connect │ 0.2.8 │ 2024-08-29 │ bandersnatch │ bdist_wheel │ US │ 9 │ │ clickhouse-connect │ 0.2.9 │ 2024-08-29 │ bandersnatch │ bdist_wheel │ US │ 16 │ │ clickhouse-connect │ 0.2.9 │ 2024-08-29 │ bandersnatch │ sdist │ US │ 1 │ │ clickhouse-connect │ 0.3.0 │ 2024-08-29 │ bandersnatch │ bdist_wheel │ US │ 19 │ │ clickhouse-connect │ 0.3.0 │ 2024-08-29 │ bandersnatch │ sdist │ US │ 1 │ │ clickhouse-connect │ 0.3.1 │ 2024-08-29 │ ~ │ bdist_wheel │ US │ 26 │ │ clickhouse-connect │ 0.3.1 │ 2024-08-29 │ ~ │ sdist │ US │ 1 │ │ clickhouse-connect │ 0.3.1 │ 2024-08-29 │ Browser │ bdist_wheel │ US │ 1 │ │ … │ … │ … │ … │ … │ … │ … │ └────────────────────┴─────────┴────────────┴──────────────┴─────────────┴──────────────┴────────┘
We might be interested in the day-of-week seasonality of downloads:
def day_of_week_bar(t):
= t.mutate(day_of_week=t["date"].day_of_week.full_name())
t = t.group_by("day_of_week").agg(downloads=ibis._["count"].sum())
t = px.bar(
c
t,="day_of_week",
x="downloads",
y={
category_orders"day_of_week": [
"Sunday",
"Monday",
"Tuesday",
"Wednesday",
"Thursday",
"Friday",
"Saturday",
]
},
)return c
day_of_week_bar(downloads_t)
Or the rolling 28-day downloads metric:
def rolling_downloads(t, days=28):
= t.mutate(
t =t["date"].cast("timestamp"),
timestamp
)= t.group_by("timestamp").agg(downloads=ibis._["count"].sum())
t = t.select(
t "timestamp",
=ibis._["downloads"]
rolling_downloadssum()
.
.over(
ibis.window(="timestamp",
order_by=days,
preceding=0,
following
)
),"timestamp")
).order_by(
= px.line(
c
t,="timestamp",
x="rolling_downloads",
y
)
return c
rolling_downloads(downloads_t)
Or rolling 28-days downloads by version with a few options for how to group versions:
def rolling_downloads_by_version(t, days=28, version_style="major.minor"):
= t.mutate(
t =t["date"].cast("timestamp"),
timestamp
)
match version_style:
case "major":
= t.mutate(version=t["version"].split(".")[0])
t case "major.minor":
= t.mutate(
t =t["version"].split(".")[0] + "." + t["version"].split(".")[1]
version
)case _:
pass
= t.group_by("timestamp", "version").agg(downloads=ibis._["count"].sum())
t
= t.select(
t "timestamp",
"version",
=ibis._["downloads"]
rolling_downloadssum()
.
.over(
ibis.window(="timestamp",
order_by="version",
group_by=28,
preceding=0,
following
)
),"timestamp")
).order_by(
= px.line(
c
t,="timestamp",
x="rolling_downloads",
y="version",
color={
category_orders"version": reversed(
sorted(
="version")["version"].to_pyarrow().to_pylist(),
t.distinct(on=lambda x: tuple(int(y) for y in x.split(".") if y.isdigit()),
key
)
)
},
)return c
rolling_downloads_by_version(downloads_t)
Or a bar chart of downloads grouped by a category:
def group_bar(t, group_by="installer", log_y=True):
= t.mutate(timestamp=t["date"].cast("timestamp"))
t = t.group_by(group_by).agg(downloads=ibis._["count"].sum())
t = t.order_by(ibis.desc("downloads"))
t
= px.bar(
c
t,=group_by,
x="downloads",
y=log_y,
log_y
)return c
group_bar(downloads_t)
Since we’re just writing Python, we’ve already organized code into functions for reuse. We can rerun our above analytics on a different package by changing the project
variable and adjusting our table accordingly. We’ll demonstrate this with a few more packages below.
Notice you could also pass in Ibis tables from different backends, not just ClickHouse, to these functions!
= "pyarrow"
package
= con.table(
t "pypi_downloads_per_day_by_version_by_installer_by_type_by_country"
filter(ibis._["project"] == package) ).
day_of_week_bar(t)
rolling_downloads(t)
="major") rolling_downloads_by_version(t, version_style
="installer") group_bar(t, group_by
= "chdb"
package
= con.table(
t "pypi_downloads_per_day_by_version_by_installer_by_type_by_country"
filter(ibis._["project"] == package) ).
day_of_week_bar(t)
rolling_downloads(t)
="major.minor") rolling_downloads_by_version(t, version_style
="installer") group_bar(t, group_by
= "ibis-framework"
package
= con.table(
t "pypi_downloads_per_day_by_version_by_installer_by_type_by_country"
filter(ibis._["project"] == package) ).
day_of_week_bar(t)
rolling_downloads(t)
="major") rolling_downloads_by_version(t, version_style
="installer") group_bar(t, group_by
Shiny for Python application
We can create an interactive Shiny with Python application using the code above to serve as a dashboard for better PyPI stats:
See the GitHub repository for the most up-to-date code.
Reproducing and contributing
The code is available on GitHub. Feel free to open an issue or pull request if you have any suggested improvements.