2024-09-17
order_by
or orderby
or sort
or sort_by
or sortby
?group_by
or groupby
or partition
or partition_by
or partitionby
?🙏please only choose one🙏
when in doubt, copy dplyr
rpad
and lpad
also trim
?log(b, x)
?log(x, b)
?How many Star Wars characters have 'Darth' in their name?
(It was bad)
You should choose the engine that suits your problem.
You should choose the interface that suits your problem.1
pip install 'ibis-framework[datafusion]'
conda install -c conda-forge ibis-datafusion
pandas
API?pandas is a good tool, provided:
X
?Nope. You should use Ibis with X
.
import ibis
con = ibis.datafusion.connect()
ibis.set_backend(con)
ratings = ibis.examples.imdb_title_ratings.fetch()
basics = ibis.examples.imdb_title_basics.fetch()
basics.filter(basics.titleType == "movie", basics.isAdult == 0).join(
ratings, "tconst"
).filter(ratings.numVotes > 100_000).order_by(ratings.averageRating.desc()).select(
"primaryTitle", "averageRating"
).limit(10)
import glob
import os
import pandas as pd
def main():
df = pd.read_parquet(
min(glob.glob("/home/gil/databog/parquet/pypi/*.parquet"), key=os.path.getsize),
columns=["path", "uploaded_on", "project_name"],
)
df = df[
df.path.str.contains(
r"\.(?:asm|c|cc|cpp|cxx|h|hpp|rs|[Ff][0-9]{0,2}(?:or)?|go)$"
)
& ~df.path.str.contains(r"(?:(?:^|/)test(?:|s|ing)|/site-packages/)")
]
return (
df.assign(
month=df.uploaded_on.dt.to_period("M").dt.to_timestamp(),
ext=df.path.str.extract(r"\.([a-z0-9]+)$", 0)
.iloc[:, 0]
.str.replace(r"cxx|cpp|cc|c|hpp|h", "C/C++", regex=True)
.str.replace("^f.*$", "Fortran", regex=True)
.str.replace("rs", "Rust")
.str.replace("go", "Go")
.str.replace("asm", "Assembly"),
)
.groupby(["month", "ext"])
.project_name.nunique()
.rename("project_count")
.reset_index()
.sort_values(["month", "project_count"], ascending=False)
)
import glob
import os.path
import ibis
from ibis import _
con = ibis.datafusion.connect()
expr = (
con.read_parquet(
min(glob.glob("/home/gil/databog/parquet/pypi/*.parquet"), key=os.path.getsize)
)
.filter(
[
_.path.re_search(
r"\.(asm|c|cc|cpp|cxx|h|hpp|rs|[Ff][0-9]{0,2}(?:or)?|go)$"
),
~_.path.re_search(r"(^|/)test(|s|ing)"),
~_.path.contains("/site-packages/"),
]
)
.group_by(
month=_.uploaded_on.truncate("M"),
ext=_.path.re_extract(r"\.([a-z0-9]+)$", 1)
.re_replace(r"cxx|cpp|cc|c|hpp|h", "C/C++")
.re_replace("^f.*$", "Fortran")
.replace("rs", "Rust")
.replace("go", "Go")
.replace("asm", "Assembly")
.nullif(""),
)
.aggregate(project_count=_.project_name.nunique())
.drop_null("ext")
.order_by([_.month.desc(), _.project_count.desc()])
)
import ibis
from ibis import _
con = ibis.datafusion.connect()
expr = (
con.read_parquet("/home/gil/databog/parquet/pypi/*.parquet")
.filter(
[
_.path.re_search(
r"\.(asm|c|cc|cpp|cxx|h|hpp|rs|[Ff][0-9]{0,2}(?:or)?|go)$"
),
~_.path.re_search(r"(^|/)test(|s|ing)"),
~_.path.contains("/site-packages/"),
]
)
.group_by(
month=_.uploaded_on.truncate("M"),
ext=_.path.re_extract(r"\.([a-z0-9]+)$", 1)
.re_replace(r"cxx|cpp|cc|c|hpp|h", "C/C++")
.re_replace("^f.*$", "Fortran")
.replace("rs", "Rust")
.replace("go", "Go")
.replace("asm", "Assembly")
.nullif(""),
)
.aggregate(project_count=_.project_name.nunique())
.drop_null("ext")
.order_by([_.month.desc(), _.project_count.desc()])
)
PyPI dataset: Instructions for downloading the (large) dataset from Seth M. Larson