import numpy as np
import networkx as nx
import pandas as pd
import scipy.sparse as sps
import pickle as pkl
import os.path
from sklearn.preprocessing import LabelBinarizer
from functools import partial
from pathlib import Path
[docs]def load_cora(data_home="datasets/legacy/cora"):
base = Path(data_home)
df = pd.read_csv(base.joinpath("cora.content"),
sep=r"\s+", header=None, index_col=0)
features_df = df.iloc[:, :-1]
labels_df = df.iloc[:, -1]
X_all = features_df.values
y_all = LabelBinarizer().fit_transform(labels_df.values)
edge_list_df = pd.read_csv(base.joinpath("cora.cites"),
sep=r"\s+", header=None)
idx_map = {j: i for i, j in enumerate(df.index)}
H = nx.from_pandas_edgelist(edge_list_df, 0, 1)
G = nx.relabel.relabel_nodes(H, idx_map)
A = nx.to_scipy_sparse_matrix(G, nodelist=sorted(G.nodes()), format='coo')
return (X_all, y_all, A)
[docs]def load_pickle(name, ext, data_home="datasets", encoding='latin1'):
path = os.path.join(data_home, name, "ind.{0}.{1}".format(name, ext))
with open(path, "rb") as f:
return pkl.load(f, encoding=encoding)
[docs]def load_test_indices(name, data_home="datasets"):
indices_df = pd.read_csv(os.path.join(data_home, name, "ind.{0}.test.index".format(name)), header=None)
indices = indices_df.values.squeeze()
return indices
[docs]def load_dataset(name, data_home="datasets"):
exts = ['tx', 'ty', 'allx', 'ally', 'graph']
(X_test,
y_test,
X_rest,
y_rest,
G_dict) = map(partial(load_pickle, name, data_home=data_home), exts)
_, D = X_test.shape
_, K = y_test.shape
ind_test_perm = load_test_indices(name, data_home)
ind_test = np.sort(ind_test_perm)
num_test = len(ind_test)
num_test_full = ind_test[-1] - ind_test[0] + 1
# TODO: Issue warning if `num_isolated` is non-zero.
num_isolated = num_test_full - num_test
# normalized zero-based indices
ind_test_norm = ind_test - np.min(ind_test)
# features
X_test_full = sps.lil_matrix((num_test_full, D))
X_test_full[ind_test_norm] = X_test
X_all = sps.vstack((X_rest, X_test_full)).toarray()
X_all[ind_test_perm] = X_all[ind_test]
# targets
y_test_full = np.zeros((num_test_full, K))
y_test_full[ind_test_norm] = y_test
y_all = np.vstack((y_rest, y_test_full))
y_all[ind_test_perm] = y_all[ind_test]
# graph
G = nx.from_dict_of_lists(G_dict)
A = nx.to_scipy_sparse_matrix(G, format='coo')
return (X_all, y_all, A)