生成数据集
第一步是生成一个用于测试的大型数据集。我们创建一个包含许多列的数据集,并将其存储在一个 Parquet 文件中。此步骤需要 pandas 和 numpy 库;写入 Parquet 文件还需要安装 pyarrow 或 fastparquet 作为引擎。
import pandas as pd
import numpy as np
def make_timeseries(start="2000-01-01", end="2000-12-31", freq="1D", seed=None):
    """Generate a DataFrame of random data on a regular time index.

    The index is named ``timestamp`` and covers the half-open interval
    ``[start, end)``: when ``end`` falls exactly on the frequency grid, the
    row for ``end`` itself is dropped.

    Parameters
    ----------
    start, end : str or datetime-like
        Bounds of the range, forwarded to ``pd.date_range``.
    freq : str
        Frequency string forwarded to ``pd.date_range``.
    seed : int or None
        Seed for ``np.random.RandomState``; a fixed seed gives reproducible
        output.

    Returns
    -------
    pandas.DataFrame
        Columns, in sorted order: ``id`` (Poisson draws with mean 1000),
        ``name`` (one of "Alice", "Bob", "Charlie"), ``x`` and ``y``
        (uniform values in [-1, 1)). An empty range yields an empty frame.
    """
    index = pd.date_range(start=start, end=end, freq=freq, name="timestamp")
    n = len(index)
    state = np.random.RandomState(seed)
    # The draw order (name, id, x, y) determines the values produced for a
    # given seed, so it must not be rearranged.
    columns = {
        "name": state.choice(["Alice", "Bob", "Charlie"], size=n),
        "id": state.poisson(1000, size=n),
        "x": state.rand(n) * 2 - 1,
        "y": state.rand(n) * 2 - 1,
    }
    df = pd.DataFrame(columns, index=index, columns=sorted(columns))
    # Parse `end` before comparing: a Timestamp never compares equal to a
    # plain string, so comparing against the raw argument would silently
    # keep the end point. The `n` guard avoids indexing an empty index.
    if n and df.index[-1] == pd.Timestamp(end):
        df = df.iloc[:-1]
    return df
# Build ten per-minute series, one per seed, and suffix every column name
# with its seed so the frames can be placed side by side without clashes.
# "1min" is used instead of the "T" minute alias, which is deprecated in
# recent pandas releases.
timeseries = [
    make_timeseries(freq="1min", seed=i).rename(columns=lambda x: f"{x}_{i}")
    for i in range(10)
]
# Concatenate along the column axis into one wide frame and write it out.
ts_wide = pd.concat(timeseries, axis=1)
ts_wide.to_parquet("timeseries_wide.parquet")