患病率不变性
我们将证明类别似然比与疾病患病率无关,并且无论任何可能的类别不平衡情况,都可以在不同人群之间进行外推。
准备数据
我们将使用scikit-learn中的make_classification
函数生成一个合成数据集。这个数据集将模拟一个少数受试者患有疾病的人群。
from sklearn.datasets import make_classification
X, y = make_classification(n_samples=10_000, weights=[0.9, 0.1], random_state=0)
print(f"Percentage of people carrying the disease: {100*y.mean():.2f}%")
测试前与测试后分析
我们将对数据拟合一个逻辑回归模型,并在一个留出的测试集上评估其性能。我们将计算正似然比,以评估这个分类器作为疾病诊断工具的有用性。
from sklearn.model_selection import train_test_split
from sklearn.metrics import class_likelihood_ratios
from sklearn.linear_model import LogisticRegression
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
estimator = LogisticRegression().fit(X_train, y_train)
y_pred = estimator.predict(X_test)
pos_LR, neg_LR = class_likelihood_ratios(y_test, y_pred)
print(f"LR+: {pos_LR:.3f}")
似然比的交叉验证
我们将在某些特定情况下,使用交叉验证来评估类别似然比测量值的可变性。
import pandas as pd
from sklearn.model_selection import cross_validate
from sklearn.dummy import DummyClassifier
def scoring(estimator, X, y):
y_pred = estimator.predict(X)
pos_lr, neg_lr = class_likelihood_ratios(y, y_pred, raise_warning=False)
return {"positive_likelihood_ratio": pos_lr, "negative_likelihood_ratio": neg_lr}
def extract_score(cv_results):
lr = pd.DataFrame(
{
"positive": cv_results["test_positive_likelihood_ratio"],
"negative": cv_results["test_negative_likelihood_ratio"],
}
)
return lr.aggregate(["mean", "std"])
estimator = LogisticRegression()
extract_score(cross_validate(estimator, X, y, scoring=scoring, cv=10))
estimator = DummyClassifier(strategy="stratified", random_state=1234)
extract_score(cross_validate(estimator, X, y, scoring=scoring, cv=10))
estimator = DummyClassifier(strategy="most_frequent")
extract_score(cross_validate(estimator, X, y, scoring=scoring, cv=10))
患病率不变性
我们将证明类别似然比与疾病患病率无关,并且无论任何可能的类别不平衡情况,都可以在不同人群之间进行外推。
import numpy as np
import matplotlib.pyplot as plt
from sklearn.inspection import DecisionBoundaryDisplay
from collections import defaultdict
populations = defaultdict(list)
common_params = {
"n_samples": 10_000,
"n_features": 2,
"n_informative": 2,
"n_redundant": 0,
"random_state": 0,
}
weights = np.linspace(0.1, 0.8, 6)
weights = weights[::-1]
## 对平衡类别的基础模型进行拟合和评估
X, y = make_classification(**common_params, weights=[0.5, 0.5])
estimator = LogisticRegression().fit(X, y)
lr_base = extract_score(cross_validate(estimator, X, y, scoring=scoring, cv=10))
pos_lr_base, pos_lr_base_std = lr_base["positive"].values
neg_lr_base, neg_lr_base_std = lr_base["negative"].values
## 我们现在将展示每个患病率水平的决策边界。请注意,
## 我们只绘制原始数据的一个子集,以便更好地评估线性模型
## 的决策边界。
fig, axs = plt.subplots(nrows=3, ncols=2, figsize=(15, 12))
for ax, (n, weight) in zip(axs.ravel(), enumerate(weights)):
X, y = make_classification(
**common_params,
weights=[weight, 1 - weight],
)
prevalence = y.mean()
populations["prevalence"].append(prevalence)
populations["X"].append(X)
populations["y"].append(y)
## 为了绘图进行下采样
rng = np.random.RandomState(1)
plot_indices = rng.choice(np.arange(X.shape[0]), size=500, replace=True)
X_plot, y_plot = X[plot_indices], y[plot_indices]
## 绘制基础模型在不同患病率下的固定决策边界
disp = DecisionBoundaryDisplay.from_estimator(
estimator,
X_plot,
response_method="predict",
alpha=0.5,
ax=ax,
)
scatter = disp.ax_.scatter(X_plot[:, 0], X_plot[:, 1], c=y_plot, edgecolor="k")
disp.ax_.set_title(f"prevalence = {y_plot.mean():.2f}")
disp.ax_.legend(*scatter.legend_elements())
def scoring_on_bootstrap(estimator, X, y, rng, n_bootstrap=100):
results_for_prevalence = defaultdict(list)
for _ in range(n_bootstrap):
bootstrap_indices = rng.choice(
np.arange(X.shape[0]), size=X.shape[0], replace=True
)
for key, value in scoring(
estimator, X[bootstrap_indices], y[bootstrap_indices]
).items():
results_for_prevalence[key].append(value)
return pd.DataFrame(results_for_prevalence)
results = defaultdict(list)
n_bootstrap = 100
rng = np.random.default_rng(seed=0)
for prevalence, X, y in zip(
populations["prevalence"], populations["X"], populations["y"]
):
results_for_prevalence = scoring_on_bootstrap(
estimator, X, y, rng, n_bootstrap=n_bootstrap
)
results["prevalence"].append(prevalence)
results["metrics"].append(
results_for_prevalence.aggregate(["mean", "std"]).unstack()
)
results = pd.DataFrame(results["metrics"], index=results["prevalence"])
results.index.name = "prevalence"
results
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(15, 6))
results["positive_likelihood_ratio"]["mean"].plot(
ax=ax1, color="r", label="extrapolation through populations"
)
ax1.axhline(y=pos_lr_base + pos_lr_base_std, color="r", linestyle="--")
ax1.axhline(
y=pos_lr_base - pos_lr_base_std,
color="r",
linestyle="--",
label="base model confidence band"
)
ax1.fill_between(
results.index,
results["positive_likelihood_ratio"]["mean"]
- results["positive_likelihood_ratio"]["std"],
results["positive_likelihood_ratio"]["mean"]
+ results["positive_likelihood_ratio"]["std"],
color="r",
alpha=0.3
)
ax1.set(
title="Positive likelihood ratio",
ylabel="LR+",
ylim=[0, 5]
)
ax1.legend(loc="lower right")
ax2 = results["negative_likelihood_ratio"]["mean"].plot(
ax=ax2, color="b", label="extrapolation through populations"
)
ax2.axhline(y=neg_lr_base + neg_lr_base_std, color="b", linestyle="--")
ax2.axhline(
y=neg_lr_base - neg_lr_base_std,
color="b",
linestyle="--",
label="base model confidence band"
)
ax2.fill_between(
results.index,
results["negative_likelihood_ratio"]["mean"]
- results["negative_likelihood_ratio"]["std"],
results["negative_likelihood_ratio"]["mean"]
+ results["negative_likelihood_ratio"]["std"],
color="b",
alpha=0.3
)
ax2.set(
title="Negative likelihood ratio",
ylabel="LR-",
ylim=[0, 0.5]
)
ax2.legend(loc="lower right")
plt.show()