Source code for etudes.datasets.networks

import numpy as np
import networkx as nx
import pandas as pd
import scipy.sparse as sps

import pickle as pkl
import os.path

from sklearn.preprocessing import LabelBinarizer
from functools import partial
from pathlib import Path


def load_cora(data_home="datasets/legacy/cora"):
    """Load the Cora citation dataset from its raw text files.

    Reads ``cora.content`` (one whitespace-separated row per paper: paper id,
    feature columns, label in the last column) and ``cora.cites`` (one
    whitespace-separated edge per row) from ``data_home``.

    Parameters
    ----------
    data_home : str or path-like
        Directory containing ``cora.content`` and ``cora.cites``.

    Returns
    -------
    X_all : numpy.ndarray
        Feature values, one row per line of ``cora.content``.
    y_all : numpy.ndarray
        Labels from the last column, binarized with ``LabelBinarizer``.
    A : scipy.sparse.coo_matrix
        Adjacency matrix of the citation graph, with nodes ordered by their
        relabelled (sorted) identifiers.

    Notes
    -----
    The graph is built from ``cora.cites`` only, so a paper that is listed in
    ``cora.content`` but never appears in an edge has no row/column in ``A``.
    """
    base = Path(data_home)

    df = pd.read_csv(base.joinpath("cora.content"), sep=r"\s+",
                     header=None, index_col=0)
    features_df = df.iloc[:, :-1]
    labels_df = df.iloc[:, -1]

    X_all = features_df.values
    y_all = LabelBinarizer().fit_transform(labels_df.values)

    edge_list_df = pd.read_csv(base.joinpath("cora.cites"), sep=r"\s+",
                               header=None)

    # Map each paper id to the position of its row in ``cora.content`` so
    # that graph nodes can be relabelled to row indices.
    idx_map = {paper_id: row for row, paper_id in enumerate(df.index)}

    H = nx.from_pandas_edgelist(edge_list_df, 0, 1)
    G = nx.relabel.relabel_nodes(H, idx_map)

    nodelist = sorted(G.nodes())
    if hasattr(nx, "to_scipy_sparse_matrix"):
        # NetworkX < 3.0: keep the exact call the code was written against.
        A = nx.to_scipy_sparse_matrix(G, nodelist=nodelist, format='coo')
    else:
        # NetworkX >= 3.0 removed ``to_scipy_sparse_matrix``; its replacement
        # returns a sparse *array*, so convert back to a ``coo_matrix`` to
        # keep the return type callers already rely on.
        A = sps.coo_matrix(
            nx.to_scipy_sparse_array(G, nodelist=nodelist, format='coo'))

    return (X_all, y_all, A)
def load_pickle(name, ext, data_home="datasets", encoding='latin1'):
    """Unpickle one component file of a dataset.

    The file read is ``<data_home>/<name>/ind.<name>.<ext>``.

    Parameters
    ----------
    name : str
        Dataset name; used both as the sub-directory and inside the filename.
    ext : str
        Trailing component of the filename (e.g. ``"tx"``, ``"graph"``).
    data_home : str
        Root directory that holds one sub-directory per dataset.
    encoding : str
        Passed straight through to ``pickle.load``.
        NOTE(review): the ``latin1`` default presumably targets pickles
        written by Python 2 -- confirm against the data source.

    Returns
    -------
    object
        Whatever object the pickle file contains.
    """
    filename = "ind.{0}.{1}".format(name, ext)
    dataset_dir = os.path.join(data_home, name)

    # NOTE: unpickling can execute arbitrary code; only point this at
    # dataset files obtained from a trusted source.
    with open(os.path.join(dataset_dir, filename), "rb") as stream:
        payload = pkl.load(stream, encoding=encoding)

    return payload
def load_test_indices(name, data_home="datasets"):
    """Read the test-node indices of a dataset.

    The file read is ``<data_home>/<name>/ind.<name>.test.index``, parsed as
    a header-less CSV.

    Parameters
    ----------
    name : str
        Dataset name; used both as the sub-directory and inside the filename.
    data_home : str
        Root directory that holds one sub-directory per dataset.

    Returns
    -------
    numpy.ndarray
        The indices in file order, with singleton dimensions removed but
        always at least one-dimensional.
    """
    path = os.path.join(data_home, name, "ind.{0}.test.index".format(name))
    indices_df = pd.read_csv(path, header=None)

    # ``squeeze`` alone turns a one-line file into a 0-d array, which breaks
    # ``len()`` and indexing in callers; ``atleast_1d`` restores shape (1,)
    # in that case and is a no-op otherwise.
    indices = np.atleast_1d(indices_df.values.squeeze())

    return indices
def load_dataset(name, data_home="datasets"):
    """Load a dataset stored as ``ind.<name>.*`` pickle files.

    Reads the ``tx``, ``ty``, ``allx``, ``ally`` and ``graph`` pickles plus
    the ``test.index`` file, stacks the test rows underneath the remaining
    rows, and builds the adjacency matrix from the pickled graph.

    Parameters
    ----------
    name : str
        Dataset name; selects the sub-directory and the filenames.
    data_home : str
        Root directory that holds one sub-directory per dataset.

    Returns
    -------
    X_all : numpy.ndarray
        Dense feature matrix: rows of ``allx`` followed by one row for every
        index in the contiguous range spanned by the test indices.
    y_all : numpy.ndarray
        Target matrix laid out the same way as ``X_all``.
    A : scipy.sparse.coo_matrix
        Adjacency matrix of the graph built from the ``graph`` pickle.

    Warns
    -----
    UserWarning
        If some indices inside the range spanned by the test indices are
        missing from the test index file; their rows are left as zeros.

    Notes
    -----
    Assumes the smallest test index equals the number of rows in ``allx``,
    so that test index ``i`` addresses row ``i`` of the stacked matrices --
    TODO confirm against the data files.
    """
    import warnings

    exts = ['tx', 'ty', 'allx', 'ally', 'graph']

    (X_test, y_test, X_rest, y_rest, G_dict) = map(
        partial(load_pickle, name, data_home=data_home), exts)

    _, D = X_test.shape
    _, K = y_test.shape

    # Indices in file order (a permutation) and in ascending order.
    ind_test_perm = load_test_indices(name, data_home)
    ind_test = np.sort(ind_test_perm)

    num_test = len(ind_test)
    num_test_full = ind_test[-1] - ind_test[0] + 1

    # Indices inside [min, max] that the index file does not list.
    num_isolated = num_test_full - num_test
    if num_isolated > 0:
        warnings.warn(
            "{0} indices within the test index range of dataset '{1}' are "
            "not listed in its test index file; their feature and target "
            "rows are zero-filled.".format(num_isolated, name),
            stacklevel=2)

    # normalized zero-based indices
    ind_test_norm = ind_test - np.min(ind_test)

    # features: place the test rows at the sorted positions first, then move
    # each one to the position given by the file order. The right-hand side
    # is a fancy-indexed copy, so the in-place permutation is safe.
    X_test_full = sps.lil_matrix((num_test_full, D))
    X_test_full[ind_test_norm] = X_test
    X_all = sps.vstack((X_rest, X_test_full)).toarray()
    X_all[ind_test_perm] = X_all[ind_test]

    # targets: same layout and permutation as the features
    y_test_full = np.zeros((num_test_full, K))
    y_test_full[ind_test_norm] = y_test
    y_all = np.vstack((y_rest, y_test_full))
    y_all[ind_test_perm] = y_all[ind_test]

    # graph
    G = nx.from_dict_of_lists(G_dict)
    if hasattr(nx, "to_scipy_sparse_matrix"):
        # NetworkX < 3.0: keep the exact call the code was written against.
        A = nx.to_scipy_sparse_matrix(G, format='coo')
    else:
        # NetworkX >= 3.0 removed ``to_scipy_sparse_matrix``; its replacement
        # returns a sparse *array*, so convert back to a ``coo_matrix`` to
        # keep the return type callers already rely on.
        A = sps.coo_matrix(nx.to_scipy_sparse_array(G, format='coo'))

    return (X_all, y_all, A)