简介
在本实验中,我们将学习如何使用 Python 的 scikit-learn 库在 t-SNE 中使用近似最近邻算法。
虚拟机使用提示
虚拟机启动完成后,点击左上角切换到“笔记本”标签页,以访问 Jupyter Notebook 进行练习。
有时,你可能需要等待几秒钟让 Jupyter Notebook 完成加载。由于 Jupyter Notebook 的限制,操作验证无法自动化。
如果你在学习过程中遇到问题,随时向 Labby 提问。课程结束后提供反馈,我们会及时为你解决问题。
在本实验中,我们将学习如何使用 Python 的 scikit-learn 库在 t-SNE 中使用近似最近邻算法。
虚拟机启动完成后,点击左上角切换到“笔记本”标签页,以访问 Jupyter Notebook 进行练习。
有时,你可能需要等待几秒钟让 Jupyter Notebook 完成加载。由于 Jupyter Notebook 的限制,操作验证无法自动化。
如果你在学习过程中遇到问题,随时向 Labby 提问。课程结束后提供反馈,我们会及时为你解决问题。
我们需要安装 nmslib 和 pynndescent 软件包。这些软件包可以使用 pip 命令进行安装。
!pip install nmslib pynndescent
我们需要导入所需的库,包括 nmslib、pynndescent、sklearn、numpy、scipy 和 matplotlib。
import sys
import joblib
import numpy as np
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.datasets import fetch_openml
from sklearn.utils import shuffle
from sklearn.manifold import TSNE
from sklearn.neighbors import KNeighborsTransformer
from sklearn.pipeline import make_pipeline
from pynndescent import PyNNDescentTransformer
import nmslib
我们为 nmslib 定义一个包装类,以实现 scikit-learn API 到 nmslib 的接口,以及一个加载函数。NMSlibTransformer 类以 n_neighbors、metric、method 和 n_jobs 作为参数。fit() 方法初始化 nmslib 并将数据点添加到其中。transform() 方法找到最近邻并返回一个稀疏矩阵。
class NMSlibTransformer(TransformerMixin, BaseEstimator):
"""Wrapper for using nmslib as sklearn's KNeighborsTransformer"""
def __init__(self, n_neighbors=5, metric="euclidean", method="sw-graph", n_jobs=-1):
self.n_neighbors = n_neighbors
self.method = method
self.metric = metric
self.n_jobs = n_jobs
def fit(self, X):
self.n_samples_fit_ = X.shape[0]
## see more metric in the manual
## https://github.com/nmslib/nmslib/tree/master/manual
space = {
"euclidean": "l2",
"cosine": "cosinesimil",
"l1": "l1",
"l2": "l2",
}[self.metric]
self.nmslib_ = nmslib.init(method=self.method, space=space)
self.nmslib_.addDataPointBatch(X.copy())
self.nmslib_.createIndex()
return self
def transform(self, X):
n_samples_transform = X.shape[0]
## For compatibility reasons, as each sample is considered as its own
## neighbor, one extra neighbor will be computed.
n_neighbors = self.n_neighbors + 1
if self.n_jobs < 0:
## Same handling as done in joblib for negative values of n_jobs:
## in particular, `n_jobs == -1` means "as many threads as CPUs".
num_threads = joblib.cpu_count() + self.n_jobs + 1
else:
num_threads = self.n_jobs
results = self.nmslib_.knnQueryBatch(
X.copy(), k=n_neighbors, num_threads=num_threads
)
indices, distances = zip(*results)
indices, distances = np.vstack(indices), np.vstack(distances)
indptr = np.arange(0, n_samples_transform * n_neighbors + 1, n_neighbors)
kneighbors_graph = csr_matrix(
(distances.ravel(), indices.ravel(), indptr),
shape=(n_samples_transform, self.n_samples_fit_),
)
return kneighbors_graph
我们定义一个函数 load_mnist() 来加载 MNIST 数据集,对数据进行洗牌(打乱顺序),并仅返回指定数量的样本。
def load_mnist(n_samples):
"""Load MNIST, shuffle the data, and return only n_samples."""
mnist = fetch_openml("mnist_784", as_frame=False, parser="pandas")
X, y = shuffle(mnist.data, mnist.target, random_state=2)
return X[:n_samples] / 255, y[:n_samples]
我们对不同的精确/近似最近邻变换器进行基准测试。我们定义数据集、变换器和参数,包括 n_iter、perplexity、metric 和 n_neighbors。我们测量每个变换器在每个数据集上进行拟合和变换所需的时间。我们打印出每个变换器进行拟合和变换所花费的时间。
datasets = [
("MNIST_10000", load_mnist(n_samples=10_000)),
("MNIST_20000", load_mnist(n_samples=20_000)),
]
n_iter = 500
perplexity = 30
metric = "euclidean"
n_neighbors = int(3.0 * perplexity + 1) + 1
tsne_params = dict(
init="random", ## pca not supported for sparse matrices
perplexity=perplexity,
method="barnes_hut",
random_state=42,
n_iter=n_iter,
learning_rate="auto",
)
transformers = [
(
"KNeighborsTransformer",
KNeighborsTransformer(n_neighbors=n_neighbors, mode="distance", metric=metric),
),
(
"NMSlibTransformer",
NMSlibTransformer(n_neighbors=n_neighbors, metric=metric),
),
(
"PyNNDescentTransformer",
PyNNDescentTransformer(
n_neighbors=n_neighbors, metric=metric, parallel_batch_queries=True
),
),
]
for dataset_name, (X, y) in datasets:
msg = f"Benchmarking on {dataset_name}:"
print(f"\n{msg}\n" + str("-" * len(msg)))
for transformer_name, transformer in transformers:
longest = np.max([len(name) for name, model in transformers])
start = time.time()
transformer.fit(X)
fit_duration = time.time() - start
print(f"{transformer_name:<{longest}} {fit_duration:.3f} sec (fit)")
start = time.time()
Xt = transformer.transform(X)
transform_duration = time.time() - start
print(f"{transformer_name:<{longest}} {transform_duration:.3f} sec (transform)")
if transformer_name == "PyNNDescentTransformer":
start = time.time()
Xt = transformer.transform(X)
transform_duration = time.time() - start
print(
f"{transformer_name:<{longest}} {transform_duration:.3f} sec"
" (transform)"
)
我们使用不同的最近邻变换器来可视化 t-SNE 嵌入。我们将 transformers 定义为一个列表,其中包含三个管道:「带有内部最近邻的 t-SNE」、「带有 KNeighborsTransformer 的 t-SNE」和「带有 NMSlibTransformer 的 t-SNE」。我们遍历数据集和变换器,并绘制 t-SNE 嵌入图,不同方法得到的嵌入图应该相似。最后我们展示该图。
transformers = [
("TSNE with internal NearestNeighbors", TSNE(metric=metric, **tsne_params)),
(
"TSNE with KNeighborsTransformer",
make_pipeline(
KNeighborsTransformer(
n_neighbors=n_neighbors, mode="distance", metric=metric
),
TSNE(metric="precomputed", **tsne_params),
),
),
(
"TSNE with NMSlibTransformer",
make_pipeline(
NMSlibTransformer(n_neighbors=n_neighbors, metric=metric),
TSNE(metric="precomputed", **tsne_params),
),
),
]
nrows = len(datasets)
ncols = np.sum([1 for name, model in transformers if "TSNE" in name])
fig, axes = plt.subplots(
nrows=nrows, ncols=ncols, squeeze=False, figsize=(5 * ncols, 4 * nrows)
)
axes = axes.ravel()
i_ax = 0
for dataset_name, (X, y) in datasets:
msg = f"Benchmarking on {dataset_name}:"
print(f"\n{msg}\n" + str("-" * len(msg)))
for transformer_name, transformer in transformers:
longest = np.max([len(name) for name, model in transformers])
start = time.time()
Xt = transformer.fit_transform(X)
transform_duration = time.time() - start
print(
f"{transformer_name:<{longest}} {transform_duration:.3f} sec"
" (fit_transform)"
)
## plot TSNE embedding which should be very similar across methods
axes[i_ax].set_title(transformer_name + "\non " + dataset_name)
axes[i_ax].scatter(
Xt[:, 0],
Xt[:, 1],
c=y.astype(np.int32),
alpha=0.2,
cmap=plt.cm.viridis,
)
axes[i_ax].xaxis.set_major_formatter(NullFormatter())
axes[i_ax].yaxis.set_major_formatter(NullFormatter())
axes[i_ax].axis("tight")
i_ax += 1
fig.tight_layout()
plt.show()
在本实验中,我们学习了如何使用 Python 的 scikit-learn 库在 t-SNE 中使用近似最近邻。我们导入了所需的库,为 nmslib 定义了一个包装类,定义了一个加载 MNIST 数据集的函数,对不同的最近邻变换器进行了基准测试,并可视化了 t-SNE 嵌入。我们了解到,默认的带有内部 NearestNeighbors 实现的 TSNE 估计器在性能方面大致等同于带有 TSNE 和 KNeighborsTransformer 的管道。我们还了解到,近似的 NMSlibTransformer 在最小的数据集上已经比精确搜索略快,但在样本数量较多的数据集上,这种速度差异预计会变得更加显著。