使用 Scikit-Learn 估计器的预测延迟

Machine LearningMachine LearningBeginner
立即练习

This tutorial is from open-source community. Access the source code

💡 本教程由 AI 辅助翻译自英文原版。如需查看原文,您可以 切换至英文原版

简介

机器学习模型的预测延迟是实际应用中的一个关键因素。在本实验中,我们将使用 Scikit-Learn 估计器来对各种回归器的预测延迟进行基准测试。我们将测量批量或原子模式下进行预测时的延迟。这些图表将以箱线图的形式表示预测延迟的分布。

虚拟机使用提示

虚拟机启动完成后,点击左上角切换到笔记本标签页,以访问 Jupyter Notebook 进行练习。

有时,你可能需要等待几秒钟让 Jupyter Notebook 完成加载。由于 Jupyter Notebook 的限制,操作的验证无法自动化。

如果你在学习过程中遇到问题,随时向 Labby 提问。课程结束后提供反馈,我们会及时为你解决问题。


Skills Graph

%%%%{init: {'theme':'neutral'}}%%%% flowchart RL ml(("Machine Learning")) -.-> ml/FrameworkandSoftwareGroup(["Framework and Software"]) ml/FrameworkandSoftwareGroup -.-> ml/sklearn("scikit-learn") subgraph Lab Skills ml/sklearn -.-> lab-49250{{"使用 Scikit-Learn 估计器的预测延迟"}} end

生成回归数据集

我们将使用 Scikit-Learn 的 make_regression 函数,根据给定参数生成一个回归数据集。该数据集将包含 n_train 个训练实例、n_test 个测试实例、n_features 个特征,噪声(noise)为 0.1。

X, y, coef = make_regression(
    n_samples=n_train + n_test, n_features=n_features, noise=noise, coef=True
)

random_seed = 13
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=n_train, test_size=n_test, random_state=random_seed
)
X_train, y_train = shuffle(X_train, y_train, random_state=random_seed)

X_scaler = StandardScaler()
X_train = X_scaler.fit_transform(X_train)
X_test = X_scaler.transform(X_test)

y_scaler = StandardScaler()
y_train = y_scaler.fit_transform(y_train[:, None])[:, 0]
y_test = y_scaler.transform(y_test[:, None])[:, 0]

基准测试并绘制原子和批量预测延迟图

我们将使用 Scikit-Learn 的 predict() 方法来测量每个实例和整个输入的运行时预测。我们将使用 benchmark_estimator() 函数来测量原子和批量模式下的预测运行时。然后,我们将使用 boxplot_runtimes() 函数以箱线图的形式绘制预测延迟的分布。

def benchmark_estimator(estimator, X_test, n_bulk_repeats=30, verbose=False):
    atomic_runtimes = atomic_benchmark_estimator(estimator, X_test, verbose)
    bulk_runtimes = bulk_benchmark_estimator(estimator, X_test, n_bulk_repeats, verbose)
    return atomic_runtimes, bulk_runtimes

def boxplot_runtimes(runtimes, pred_type, configuration):
    fig, ax1 = plt.subplots(figsize=(10, 6))
    bp = plt.boxplot(
        runtimes,
    )
    cls_infos = [
        "%s\n(%d %s)"
        % (
            estimator_conf["name"],
            estimator_conf["complexity_computer"](estimator_conf["instance"]),
            estimator_conf["complexity_label"],
        )
        for estimator_conf in configuration["estimators"]
    ]
    plt.setp(ax1, xticklabels=cls_infos)
    plt.setp(bp["boxes"], color="black")
    plt.setp(bp["whiskers"], color="black")
    plt.setp(bp["fliers"], color="red", marker="+")
    ax1.yaxis.grid(True, linestyle="-", which="major", color="lightgrey", alpha=0.5)
    ax1.set_axisbelow(True)
    ax1.set_title(
        "Prediction Time per Instance - %s, %d feats."
        % (pred_type.capitalize(), configuration["n_features"])
    )
    ax1.set_ylabel("Prediction Time (us)")
    plt.show()

configuration = {
    "n_train": int(1e3),
    "n_test": int(1e2),
    "n_features": int(1e2),
    "estimators": [
        {
            "name": "线性模型",
            "instance": SGDRegressor(
                penalty="elasticnet", alpha=0.01, l1_ratio=0.25, tol=1e-4
            ),
            "complexity_label": "非零系数",
            "complexity_computer": lambda clf: np.count_nonzero(clf.coef_),
        },
        {
            "name": "随机森林",
            "instance": RandomForestRegressor(),
            "complexity_label": "估计器",
            "complexity_computer": lambda clf: clf.n_estimators,
        },
        {
            "name": "支持向量回归",
            "instance": SVR(kernel="rbf"),
            "complexity_label": "支持向量",
            "complexity_computer": lambda clf: len(clf.support_vectors_),
        },
    ],
}
X_train, y_train, X_test, y_test = generate_dataset(
    configuration["n_train"], configuration["n_test"], configuration["n_features"]
)
stats = {}
for estimator_conf in configuration["estimators"]:
    estimator_conf["instance"].fit(X_train, y_train)
    gc.collect()
    a, b = benchmark_estimator(estimator_conf["instance"], X_test)
    stats[estimator_conf["name"]] = {"原子": a, "批量": b}
cls_names = [estimator_conf["name"] for estimator_conf in configuration["estimators"]]
runtimes = [1e6 * stats[clf_name]["原子"] for clf_name in cls_names]
boxplot_runtimes(runtimes, "原子", configuration)
runtimes = [1e6 * stats[clf_name]["批量"] for clf_name in cls_names]
boxplot_runtimes(runtimes, "批量 (%d)" % configuration["n_test"], configuration)

基准测试:特征数量对预测延迟的影响

我们将使用 Scikit-Learn 的 Ridge() 估计器来评估特征数量对预测时间的影响。我们会使用 n_feature_influence() 函数来评估这种影响,并使用 plot_n_features_influence() 函数来绘制预测时间随特征数量的变化情况。

def n_feature_influence(estimators, n_train, n_test, n_features, percentile):
    percentiles = defaultdict(defaultdict)
    for n in n_features:
        X_train, y_train, X_test, y_test = generate_dataset(n_train, n_test, n)
        for cls_name, estimator in estimators.items():
            estimator.fit(X_train, y_train)
            gc.collect()
            runtimes = bulk_benchmark_estimator(estimator, X_test, 30, False)
            percentiles[cls_name][n] = 1e6 * np.percentile(runtimes, percentile)
    return percentiles

def plot_n_features_influence(percentiles, percentile):
    fig, ax1 = plt.subplots(figsize=(10, 6))
    colors = ["r", "g", "b"]
    for i, cls_name in enumerate(percentiles.keys()):
        x = np.array(sorted([n for n in percentiles[cls_name].keys()]))
        y = np.array([percentiles[cls_name][n] for n in x])
        plt.plot(
            x,
            y,
            color=colors[i],
        )
    ax1.yaxis.grid(True, linestyle="-", which="major", color="lightgrey", alpha=0.5)
    ax1.set_axisbelow(True)
    ax1.set_title("预测时间随特征数量的变化")
    ax1.set_xlabel("特征数量")
    ax1.set_ylabel("第 %d 百分位数的预测时间 (微秒)" % percentile)
    plt.show()

percentile = 90
percentiles = n_feature_influence(
    {"ridge": Ridge()},
    configuration["n_train"],
    configuration["n_test"],
    [100, 250, 500],
    percentile,
)
plot_n_features_influence(percentiles, percentile)

基准测试吞吐量

我们将使用 Scikit-Learn 的 predict() 方法来测量不同估计器的吞吐量。我们会使用 benchmark_throughputs() 函数来进行吞吐量的基准测试,并使用 plot_benchmark_throughput() 函数来绘制不同估计器的预测吞吐量。

def benchmark_throughputs(configuration, duration_secs=0.1):
    X_train, y_train, X_test, y_test = generate_dataset(
        configuration["n_train"], configuration["n_test"], configuration["n_features"]
    )
    throughputs = dict()
    for estimator_config in configuration["estimators"]:
        estimator_config["instance"].fit(X_train, y_train)
        start_time = time.time()
        n_predictions = 0
        while (time.time() - start_time) < duration_secs:
            estimator_config["instance"].predict(X_test[[0]])
            n_predictions += 1
        throughputs[estimator_config["name"]] = n_predictions / duration_secs
    return throughputs

def plot_benchmark_throughput(throughputs, configuration):
    fig, ax = plt.subplots(figsize=(10, 6))
    colors = ["r", "g", "b"]
    cls_infos = [
        "%s\n(%d %s)"
        % (
            estimator_conf["name"],
            estimator_conf["complexity_computer"](estimator_conf["instance"]),
            estimator_conf["complexity_label"],
        )
        for estimator_conf in configuration["estimators"]
    ]
    cls_values = [
        throughputs[estimator_conf["name"]]
        for estimator_conf in configuration["estimators"]
    ]
    plt.bar(range(len(throughputs)), cls_values, width=0.5, color=colors)
    ax.set_xticks(np.linspace(0.25, len(throughputs) - 0.75, len(throughputs)))
    ax.set_xticklabels(cls_infos, fontsize=10)
    ymax = max(cls_values) * 1.2
    ax.set_ylim((0, ymax))
    ax.set_ylabel("吞吐量 (预测次数/秒)")
    ax.set_title(
        "不同估计器的预测吞吐量 (%d 个特征)"
        % configuration["n_features"]
    )
    plt.show()

throughputs = benchmark_throughputs(configuration)
plot_benchmark_throughput(throughputs, configuration)

总结

在本实验中,我们学习了如何使用 Scikit-Learn 估计器来对各种回归器的预测延迟进行基准测试。我们测量了批量或原子模式下进行预测时的延迟,并以箱线图的形式绘制了预测延迟的分布。我们还评估了特征数量对预测时间的影响,并绘制了预测时间随特征数量的变化情况。最后,我们测量了不同估计器的吞吐量,并绘制了不同估计器的预测吞吐量。