2024-09-17
order_by or orderby or sort or sort_by or sortby?group_by or groupby or partition or partition_by or partitionby?🙏please only choose one🙏
when in doubt, copy dplyr
rpad and lpad also trim?log(b, x)?log(x, b)?How many Star Wars characters have 'Darth' in their name?




(It was bad)
You should choose the engine that suits your problem.
You should choose the interface that suits your problem.1
pip install 'ibis-framework[datafusion]'
conda install -c conda-forge ibis-datafusion
pandas API?pandas is a good tool, provided:
X?Nope. You should use Ibis with X.
import ibis
con = ibis.datafusion.connect()
ibis.set_backend(con)
ratings = ibis.examples.imdb_title_ratings.fetch()
basics = ibis.examples.imdb_title_basics.fetch()
basics.filter(basics.titleType == "movie", basics.isAdult == 0).join(
ratings, "tconst"
).filter(ratings.numVotes > 100_000).order_by(ratings.averageRating.desc()).select(
"primaryTitle", "averageRating"
).limit(10)import glob
import os
import pandas as pd
def main():
df = pd.read_parquet(
min(glob.glob("/home/gil/databog/parquet/pypi/*.parquet"), key=os.path.getsize),
columns=["path", "uploaded_on", "project_name"],
)
df = df[
df.path.str.contains(
r"\.(?:asm|c|cc|cpp|cxx|h|hpp|rs|[Ff][0-9]{0,2}(?:or)?|go)$"
)
& ~df.path.str.contains(r"(?:(?:^|/)test(?:|s|ing)|/site-packages/)")
]
return (
df.assign(
month=df.uploaded_on.dt.to_period("M").dt.to_timestamp(),
ext=df.path.str.extract(r"\.([a-z0-9]+)$", 0)
.iloc[:, 0]
.str.replace(r"cxx|cpp|cc|c|hpp|h", "C/C++", regex=True)
.str.replace("^f.*$", "Fortran", regex=True)
.str.replace("rs", "Rust")
.str.replace("go", "Go")
.str.replace("asm", "Assembly"),
)
.groupby(["month", "ext"])
.project_name.nunique()
.rename("project_count")
.reset_index()
.sort_values(["month", "project_count"], ascending=False)
)import glob
import os.path
import ibis
from ibis import _
con = ibis.datafusion.connect()
expr = (
con.read_parquet(
min(glob.glob("/home/gil/databog/parquet/pypi/*.parquet"), key=os.path.getsize)
)
.filter(
[
_.path.re_search(
r"\.(asm|c|cc|cpp|cxx|h|hpp|rs|[Ff][0-9]{0,2}(?:or)?|go)$"
),
~_.path.re_search(r"(^|/)test(|s|ing)"),
~_.path.contains("/site-packages/"),
]
)
.group_by(
month=_.uploaded_on.truncate("M"),
ext=_.path.re_extract(r"\.([a-z0-9]+)$", 1)
.re_replace(r"cxx|cpp|cc|c|hpp|h", "C/C++")
.re_replace("^f.*$", "Fortran")
.replace("rs", "Rust")
.replace("go", "Go")
.replace("asm", "Assembly")
.nullif(""),
)
.aggregate(project_count=_.project_name.nunique())
.drop_null("ext")
.order_by([_.month.desc(), _.project_count.desc()])
)import ibis
from ibis import _
con = ibis.datafusion.connect()
expr = (
con.read_parquet("/home/gil/databog/parquet/pypi/*.parquet")
.filter(
[
_.path.re_search(
r"\.(asm|c|cc|cpp|cxx|h|hpp|rs|[Ff][0-9]{0,2}(?:or)?|go)$"
),
~_.path.re_search(r"(^|/)test(|s|ing)"),
~_.path.contains("/site-packages/"),
]
)
.group_by(
month=_.uploaded_on.truncate("M"),
ext=_.path.re_extract(r"\.([a-z0-9]+)$", 1)
.re_replace(r"cxx|cpp|cc|c|hpp|h", "C/C++")
.re_replace("^f.*$", "Fortran")
.replace("rs", "Rust")
.replace("go", "Go")
.replace("asm", "Assembly")
.nullif(""),
)
.aggregate(project_count=_.project_name.nunique())
.drop_null("ext")
.order_by([_.month.desc(), _.project_count.desc()])
)PyPI dataset: Instructions for downloading the (large) dataset from Seth M. Larson