-
Notifications
You must be signed in to change notification settings - Fork 14
/
Copy pathutil.py
executable file
·131 lines (112 loc) · 5.8 KB
/
util.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
from __future__ import print_function
import numpy as np
import networkx as nx
import argparse
def _str2bool(value):
    """Convert a command-line string into a bool for argparse.

    ``type=bool`` does not work for flags: argparse hands the converter a
    string and ``bool('False')`` is ``True`` (any non-empty string is truthy),
    so a boolean option could never be switched off from the command line.

    The common "false" spellings (and the empty string) map to ``False``.
    Every other value maps to ``True``, which is what the previous
    ``type=bool`` converter returned for any non-empty string, so existing
    invocations such as ``-dropout True`` keep working unchanged.
    """
    if isinstance(value, bool):
        return value
    return value.strip().lower() not in ('', 'false', 'f', 'no', 'n', '0', 'off')


# Command-line options shared by the graph-classification scripts.
cmd_opt = argparse.ArgumentParser(description='Argparser for graph_classification')
cmd_opt.add_argument('-mode', default='cpu', help='cpu/gpu')
cmd_opt.add_argument('-gm', default='mean_field', help='mean_field/loopy_bp')
cmd_opt.add_argument('-data', default=None, help='data folder name')
cmd_opt.add_argument('-batch_size', type=int, default=50, help='minibatch size')
cmd_opt.add_argument('-seed', type=int, default=1, help='seed')
cmd_opt.add_argument('-feat_dim', type=int, default=0,
                     help='dimension of discrete node feature (maximum node tag)')
cmd_opt.add_argument('-num_class', type=int, default=0, help='#classes')
cmd_opt.add_argument('-fold', type=int, default=1, help='fold (1..10)')
cmd_opt.add_argument('-test_number', type=int, default=0,
                     help='if specified, will overwrite -fold and use the last -test_number graphs as testing data')
cmd_opt.add_argument('-num_epochs', type=int, default=1000, help='number of epochs')
# Kept as a string here; it is split on '-' into integers right after parsing.
cmd_opt.add_argument('-latent_dim', type=str, default='64', help='dimension(s) of latent layers')
# Parsed as float so fractional values are accepted; how a fractional value is
# interpreted is up to the consumer of cmd_args (not visible in this file).
cmd_opt.add_argument('-sortpooling_k', type=float, default=30,
                     help='number of nodes kept after SortPooling')
cmd_opt.add_argument('-out_dim', type=int, default=1024, help='s2v output size')
cmd_opt.add_argument('-hidden', type=int, default=100, help='dimension of regression')
cmd_opt.add_argument('-max_lv', type=int, default=4, help='max rounds of message passing')
cmd_opt.add_argument('-learning_rate', type=float, default=0.0001, help='init learning_rate')
cmd_opt.add_argument('-dropout', type=_str2bool, default=False,
                     help='whether add dropout after dense layer')
cmd_opt.add_argument('-printAUC', type=_str2bool, default=False,
                     help='whether to print AUC (for binary classification only)')
cmd_opt.add_argument('-extract_features', type=_str2bool, default=False,
                     help='whether to extract final graph features')

# parse_known_args tolerates options that belong to other modules/scripts.
cmd_args, _ = cmd_opt.parse_known_args()

# '-latent_dim 32-32-1' -> [32, 32, 1]; a single value collapses to a plain int.
cmd_args.latent_dim = [int(x) for x in cmd_args.latent_dim.split('-')]
if len(cmd_args.latent_dim) == 1:
    cmd_args.latent_dim = cmd_args.latent_dim[0]
class S2VGraph(object):
    '''Container for one labelled graph: tags, features, degrees and edges.'''

    def __init__(self, g, label, node_tags=None, node_features=None):
        '''
        g: a networkx graph
        label: an integer graph label
        node_tags: a list of integer node tags
        node_features: a numpy array of continuous node features

        Attributes set here:
            num_nodes:  len(node_tags), or the node count of ``g`` when
                        node_tags is None
            degs:       list of node degrees, in the iteration order of
                        ``g.degree``
            num_edges:  number of edges reported by ``g.edges()``
            edge_pairs: flat int32 array [u0, v0, u1, v1, ...] of edge
                        endpoints (empty when the graph has no edges)
        '''
        # node_tags is optional in the signature, so len(None) must be avoided;
        # fall back to the graph's own node count in that case.
        if node_tags is not None:
            self.num_nodes = len(node_tags)
        else:
            self.num_nodes = g.number_of_nodes()
        self.node_tags = node_tags
        self.label = label
        self.node_features = node_features  # numpy array (node_num * feature_dim)
        self.degs = list(dict(g.degree).values())

        edges = list(g.edges())
        self.num_edges = len(edges)
        if edges:
            # (num_edges, 2) -> row-major flatten gives u0, v0, u1, v1, ...
            self.edge_pairs = np.array(edges, dtype=np.int32).flatten()
        else:
            # Keep the dtype consistent with the non-empty case (the default
            # for an empty array would be float64).
            self.edge_pairs = np.array([], dtype=np.int32)
def load_data(data_root='/mnt/dive/shared/hao.yuan/CRF_Graph/data'):
    '''Load the dataset named by cmd_args.data and split it into train/test.

    The file ``<data_root>/<data>/<data>.txt`` is parsed as follows:
        line 1:            number of graphs
        per graph:         one line ``n l`` (node count, graph label)
        per node (n rows): ``tag k nb_1 ... nb_k [attr_1 attr_2 ...]``
                           where ``tag`` is the discrete node tag, ``k`` the
                           number of neighbour ids that follow, and any
                           remaining values are continuous node attributes.

    Graph labels and node tags are remapped to consecutive integers in order
    of first appearance.

    data_root: directory containing one sub-folder per dataset; defaults to
               the path this module originally hard-coded.

    Side effects: sets cmd_args.num_class, cmd_args.feat_dim and
    cmd_args.attr_dim, and prints a short summary.

    Returns (train_graphs, test_graphs), two lists of S2VGraph. When
    cmd_args.test_number is 0 the split comes from the 10-fold index files
    for cmd_args.fold; otherwise the last cmd_args.test_number graphs are
    the test set.
    '''
    import os  # local import so this function does not rely on file-level changes

    print('loading data')
    g_list = []
    label_dict = {}
    feat_dict = {}
    # Attribute dimension as seen on the most recently parsed graph; stays 0
    # when the file declares no graphs at all.
    attr_dim = 0

    data_dir = os.path.join(data_root, cmd_args.data)
    with open(os.path.join(data_dir, '%s.txt' % cmd_args.data), 'r') as f:
        n_g = int(f.readline().strip())
        for _ in range(n_g):
            n, l = [int(w) for w in f.readline().strip().split()]
            if l not in label_dict:
                label_dict[l] = len(label_dict)

            g = nx.Graph()
            node_tags = []
            node_features = []
            for j in range(n):
                g.add_node(j)
                row = f.readline().strip().split()
                # Index one past the neighbour list: tag + count + k neighbours.
                tmp = int(row[1]) + 2
                if tmp == len(row):
                    # no node attributes
                    row = [int(w) for w in row]
                    attr = None
                else:
                    row, attr = ([int(w) for w in row[:tmp]],
                                 np.array([float(w) for w in row[tmp:]]))
                if row[0] not in feat_dict:
                    feat_dict[row[0]] = len(feat_dict)
                node_tags.append(feat_dict[row[0]])

                # `row` has already been truncated to `tmp` entries above, so
                # comparing tmp with len(row) can no longer detect attributes;
                # test the parsed attribute vector itself.
                if attr is not None:
                    node_features.append(attr)

                for k in range(2, len(row)):
                    g.add_edge(j, row[k])

            if node_features:
                node_features = np.stack(node_features)
                attr_dim = node_features.shape[1]
            else:
                node_features = None
                attr_dim = 0

            # NOTE: no edge-count check here -- some graphs in COLLAB have
            # self-loops, so 2 * len(g.edges()) need not equal the listed
            # neighbour total.
            # Fails if a neighbour id outside 0..n-1 created an extra node.
            assert len(g) == n
            g_list.append(S2VGraph(g, l, node_tags, node_features))

    for g in g_list:
        g.label = label_dict[g.label]

    cmd_args.num_class = len(label_dict)
    cmd_args.feat_dim = len(feat_dict)  # maximum node label (tag)
    cmd_args.attr_dim = attr_dim  # dim of node features (attributes)

    print('# classes: %d' % cmd_args.num_class)
    print('# maximum node tag: %d' % cmd_args.feat_dim)

    if cmd_args.test_number == 0:
        fold_dir = os.path.join(data_dir, '10fold_idx')
        # atleast_1d: a single-entry index file would otherwise load as a
        # 0-d array whose .tolist() is a bare int, not a list.
        train_idxes = np.atleast_1d(np.loadtxt(
            os.path.join(fold_dir, 'train_idx-%d.txt' % cmd_args.fold),
            dtype=np.int32)).tolist()
        test_idxes = np.atleast_1d(np.loadtxt(
            os.path.join(fold_dir, 'test_idx-%d.txt' % cmd_args.fold),
            dtype=np.int32)).tolist()
        return [g_list[i] for i in train_idxes], [g_list[i] for i in test_idxes]

    split = n_g - cmd_args.test_number
    return g_list[:split], g_list[split:]