跳转到内容

性能基准

python
import pandas as pd
import polars as pl
import numpy as np
import time

# Benchmark: the same query is run once with pandas and once with Polars.
# Query: read the Parquet file, keep rows where a > 0, average `a` per value
# of `b`, and sort the averages in descending order.
n = 5_000_000
# 'a' holds n random floats; 'b' holds n labels drawn from the ten letters A-J.
data = {'a': np.random.randn(n), 'b': np.random.choice(list('ABCDEFGHIJ'), n)}

# Both libraries read this same file, so each timing below includes Parquet I/O.
pd.DataFrame(data).to_parquet('bench.pq')

# Intervals are measured with time.perf_counter(): it is monotonic and uses the
# highest-resolution clock available. time.time() is wall-clock time, which can
# jump when the system clock is adjusted and may be too coarse to resolve a
# fast run (yielding 0.0 and a ZeroDivisionError in the ratio printed below).
start = time.perf_counter()
df_pd = pd.read_parquet('bench.pq')
r1 = df_pd[df_pd['a'] > 0].groupby('b')['a'].mean().sort_values(ascending=False)
t_pandas = time.perf_counter() - start

start = time.perf_counter()
# scan_parquet builds a lazy query; nothing is executed until .collect().
df_pl = (
    pl.scan_parquet('bench.pq')
    .filter(pl.col('a') > 0)
    .group_by('b')
    .agg(pl.col('a').mean())
    .sort('a', descending=True)
    .collect()
)
t_polars = time.perf_counter() - start

# The last field is the speed-up factor: pandas time divided by Polars time.
print(f"Pandas: {t_pandas:.2f}s | Polars: {t_polars:.2f}s | 加速: {t_pandas/t_polars:.1f}x")

基于 MIT 许可发布