计算邻接矩阵
我们将提取邻接图作为一个SciPy稀疏矩阵。首先解析重定向。返回X,即SciPy稀疏邻接矩阵,重定向信息作为从文章名称到文章名称的Python字典,以及index_map,一个从文章名称到Python整数(文章索引)的Python字典。
import numpy as np
from scipy import sparse
def get_adjacency_matrix(redirects_filename, page_links_filename, limit=None):
"""Extract the adjacency graph as a scipy sparse matrix"""
index_map = dict()
links = list()
for l, line in enumerate(BZ2File(page_links_filename)):
split = line.split()
if len(split)!= 4:
print("ignoring malformed line: " + line)
continue
i = index(redirects, index_map, short_name(split[0]))
j = index(redirects, index_map, short_name(split[2]))
links.append((i, j))
if l % 1000000 == 0:
print("[%s] line: %08d" % (datetime.now().isoformat(), l))
if limit is not None and l >= limit - 1:
break
print("Computing the adjacency matrix")
X = sparse.lil_matrix((len(index_map), len(index_map)), dtype=np.float32)
for i, j in links:
X[i, j] = 1.0
del links
print("Converting to CSR representation")
X = X.tocsr()
print("CSR conversion done")
return X, redirects, index_map
## stop after 5M links to make it possible to work in RAM
X, redirects, index_map = get_adjacency_matrix(
redirects_filename, page_links_filename, limit=5000000
)
names = {i: name for name, i in index_map.items()}