本文分享一个对Tensorflow 版GCN源码的分析。
keras版可以看:https://blog.csdn.net/tszupup/article/details/89004637
源代码 github:https://github.com/tkipf/gcn
代码分析
代码结构
├── __init__
├── data // 图数据
├── inits // 初始化的一些公用函数
├── layers // GCN层的定义
├── metrics // 评测指标的计算
├── models // 模型结构定义
├── train // 训练
└── utils // 工具函数的定义
一些具体代码的含义都在注释里。下面基于Cora数据集为例。
__init__.py
from __future__ import division
#即使在python2.X,使用print就得像python3.X那样加括号使用。
from __future__ import print_function
# 导入python未来支持的语言特征division(精确除法),
# 当我们没有在程序中导入该特征时,"/"操作符执行的是截断除法(Truncating Division);
# 当我们导入精确除法之后,"/"执行的是精确除法,"//"执行的是截断除法
train.py
-
通过flags = tf.app.flags模式设置参数,可以在命令行运行时指定参数,例如:
python train.py --model gcn
-
提供了可供选择的三个模型:‘gcn’, ‘gcn_cheby’, ‘dense’。MLP是由两层的dense层构成
-
FLAGS.weight_decay(权重衰减):目的就是为了让权重减少到更小的值,在一定程度上减少模型过拟合的问题
-
FLAGS.hidden1:卷积层第一层的output_dim,第二层的input_dim
-
FLAGS.max_degree:K阶的切比雪夫近似矩阵的参数k
-
FLAGS.dropout:避免过拟合(按照一定的概率随机丢弃一部分神经元)
-
输入维度input_dim=features[2][1](1433),也就是每个节点特征的维度
from __future__ import division
#即使在python2.X,使用print就得像python3.X那样加括号使用。
from __future__ import print_function
# 导入python未来支持的语言特征division(精确除法),
# 当我们没有在程序中导入该特征时,"/"操作符执行的是截断除法(Truncating Division);
# 当我们导入精确除法之后,"/"执行的是精确除法,"//"执行的是截断除法
import time
import tensorflow as tf
from gcn.utils import *
from gcn.models import GCN, MLP
# Set random seed
# Seed both NumPy and TensorFlow so that repeated runs draw the same random
# numbers. `np` is not imported explicitly in this script; presumably it comes
# in through `from gcn.utils import *` -- verify against the utils module.
seed = 123
np.random.seed(seed)
tf.set_random_seed(seed)
# Settings
# Command-line flags; every value can be overridden at launch time,
# e.g. `python train.py --model gcn`.
flags = tf.app.flags
FLAGS = flags.FLAGS
flags.DEFINE_string('dataset', 'cora', 'Dataset string.') # 'cora', 'citeseer', 'pubmed'
flags.DEFINE_string('model', 'gcn', 'Model string.') # 'gcn', 'gcn_cheby', 'dense'
flags.DEFINE_float('learning_rate', 0.01, 'Initial learning rate.')
flags.DEFINE_integer('epochs', 200, 'Number of epochs to train.')
# Output dimension of the first layer (and therefore input dimension of the second).
flags.DEFINE_integer('hidden1', 16, 'Number of units in hidden layer 1.')
flags.DEFINE_float('dropout', 0.5, 'Dropout rate (1 - keep probability).')
# Weight decay (L2 regularisation) to reduce over-fitting. The models add it
# to the loss as: self.loss += FLAGS.weight_decay * tf.nn.l2_loss(var)
flags.DEFINE_float('weight_decay', 5e-4, 'Weight for L2 loss on embedding matrix.')
flags.DEFINE_integer('early_stopping', 10, 'Tolerance for early stopping (# of epochs).')
flags.DEFINE_integer('max_degree', 3, 'Maximum Chebyshev polynomial degree.') # order k of the Chebyshev approximation
# Load data
adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask = load_data(FLAGS.dataset)
# Raw feature matrix as observed by the author on Cora:
# print(features)
# (0, 19) 1.0
# (0, 81) 1.0
# ...
# (2707, 1412) 1.0
# (2707, 1414) 1.0
# print(type(features))
# <class 'scipy.sparse.lil.lil_matrix'>
# Preprocess the feature matrix: row-normalise it and convert it to a
# (coords, values, shape) tuple.
features = preprocess_features(features)
# print(features)
# (array([[ 0, 1274],
# [ 0, 1247],
# [ 0, 1194],
# ...,
# [2707, 329],
# [2707, 186],
# [2707, 19]], dtype=int32), array([0.11111111, 0.11111111, 0.11111111, ..., 0.07692308, 0.07692308,
# 0.07692308], dtype=float32), (2708, 1433))
# print(type(features))
# <class 'tuple'>
# print("features[1]",features[1])
# features[1] [0.11111111 0.11111111 0.11111111 ... 0.07692308 0.07692308 0.07692308]
# print("features[1].shape",features[1].shape)
# features[1].shape (49216,)
# Pick the support matrices (normalised forms of the adjacency matrix) and the
# model class according to --model.
if FLAGS.model == 'gcn':
    support = [preprocess_adj(adj)] # single support: adjacency with self-loops, symmetrically normalised
    # print("support:",support)
    # support: [(array([[0, 0],
    # [633, 0],
    # [1862, 0],
    # ...,
    # [1473, 2707],
    # [2706, 2707],
    # [2707, 2707]], dtype=int32), array([0.25, 0.25, 0.2236068, ..., 0.2, 0.2,
    # 0.2]), (2708, 2708))]
    num_supports = 1
    model_func = GCN
elif FLAGS.model == 'gcn_cheby':
    # One support per Chebyshev polynomial T_0 .. T_k, hence 1 + max_degree.
    support = chebyshev_polynomials(adj, FLAGS.max_degree)
    num_supports = 1 + FLAGS.max_degree
    model_func = GCN
elif FLAGS.model == 'dense':
    support = [preprocess_adj(adj)] # Not used
    num_supports = 1
    model_func = MLP
else:
    raise ValueError('Invalid argument for model: ' + str(FLAGS.model))
# print("num_supports:",num_supports)
#num_supports: 1
# Define placeholders
placeholders = {
    # One sparse placeholder per support matrix. The supports are fed as
    # (coords, values, shape) tuples, so the dense adjacency matrix is never
    # materialised.
    'support': [tf.sparse_placeholder(tf.float32) for _ in range(num_supports)],
    # The features are sparse as well and are fed as a (coords, values, shape)
    # tuple; features[2] is the dense shape.
    # print(features[2])
    # (2708, 1433)
    'features': tf.sparse_placeholder(tf.float32, shape=tf.constant(features[2], dtype=tf.int64)),
    # Label matrix; the second dimension is the number of classes.
    # print(y_train.shape[1])
    # 7
    'labels': tf.placeholder(tf.float32, shape=(None, y_train.shape[1])),
    # Mask selecting which rows of `labels` contribute to loss/accuracy.
    'labels_mask': tf.placeholder(tf.int32),
    # Defaults to 0 so that evaluation runs without dropout unless it is fed.
    'dropout': tf.placeholder_with_default(0., shape=()),
    'num_features_nonzero': tf.placeholder(tf.int32) # helper variable for sparse dropout
}
# Create model
# input_dim is the per-node feature dimension (second entry of the dense shape).
# print(features[2][1])
# 1433
model = model_func(placeholders, input_dim=features[2][1], logging=True)
# print("GCN output_dim:",model.output_dim)
#GCN output_dim: 7
# Initialize session
sess = tf.Session()
# Define model evaluation function
def evaluate(features, support, labels, mask, placeholders):
    """Run one forward pass and report loss, accuracy and wall-clock time.

    Builds a feed dictionary from the given inputs (dropout is not fed, so the
    placeholder's default applies) and evaluates the module-level ``model`` in
    the module-level ``sess``.

    Returns:
        Tuple ``(loss, accuracy, elapsed_seconds)``.
    """
    started = time.time()
    feed = construct_feed_dict(features, support, labels, mask, placeholders)
    loss_value, accuracy_value = sess.run([model.loss, model.accuracy], feed_dict=feed)
    return loss_value, accuracy_value, (time.time() - started)
# Init variables
sess.run(tf.global_variables_initializer())
# Validation loss of every epoch so far; used by the early-stopping check.
cost_val = []
# Train model
for epoch in range(FLAGS.epochs):
    t = time.time()
    # Construct feed dictionary
    feed_dict = construct_feed_dict(features, support, y_train, train_mask, placeholders)
    # Dropout is only switched on for the training step.
    feed_dict.update({placeholders['dropout']: FLAGS.dropout})
    # Training step
    outs = sess.run([model.opt_op, model.loss, model.accuracy], feed_dict=feed_dict)
    # print("outs:",outs) #outs: [None, 0.57948196, 0.9642857]
    # Validation
    cost, acc, duration = evaluate(features, support, y_val, val_mask, placeholders)
    cost_val.append(cost)
    # Print results
    print("Epoch:", '%04d' % (epoch + 1), "train_loss=", "{:.5f}".format(outs[1]),
          "train_acc=", "{:.5f}".format(outs[2]), "val_loss=", "{:.5f}".format(cost),
          "val_acc=", "{:.5f}".format(acc), "time=", "{:.5f}".format(time.time() - t))
    # Early stopping: stop once the latest validation loss is above the mean
    # of the `early_stopping` validation losses that preceded it.
    if epoch > FLAGS.early_stopping and cost_val[-1] > np.mean(cost_val[-(FLAGS.early_stopping+1):-1]):
        print("Early stopping...")
        break
print("Optimization Finished!")
# Testing
test_cost, test_acc, test_duration = evaluate(features, support, y_test, test_mask, placeholders)
print("Test set results:", "cost=", "{:.5f}".format(test_cost),
      "accuracy=", "{:.5f}".format(test_acc), "time=", "{:.5f}".format(test_duration))
models.py
定义了一个model基类,以及两个继承自model类的MLP、GCN类。
- 需要注意self.outputs、self.activations、self.layers的计算方式(看注释)
from gcn.layers import *
from gcn.metrics import *
flags = tf.app.flags
FLAGS = flags.FLAGS
# Base class that assembles a sequential model from Layer objects. It owns the
# layer list, the chain of activations, and the loss / accuracy / optimizer
# slots that subclasses fill in.
class Model(object):
    """Abstract sequential model.

    Subclasses append layers in ``_build()`` and implement ``_loss()`` and
    ``_accuracy()``; they must also set ``self.inputs`` and ``self.optimizer``
    before calling ``build()``.

    Keyword arguments:
        name: variable-scope name; defaults to the lower-cased class name.
        logging: flag stored on the model (default ``False``).
    """

    def __init__(self, **kwargs):
        permitted = {'name', 'logging'}
        for key in kwargs:
            assert key in permitted, 'Invalid keyword argument: ' + key

        # Fall back to the lower-cased class name when no name is supplied.
        self.name = kwargs.get('name') or self.__class__.__name__.lower()
        self.logging = kwargs.get('logging', False)

        self.vars = {}
        self.placeholders = {}

        # Layers appended by the subclass' _build().
        self.layers = []
        # Input of every layer followed by the output of the last layer.
        self.activations = []

        self.inputs = None
        self.outputs = None

        self.loss = 0
        self.accuracy = 0
        self.optimizer = None
        self.opt_op = None

    def _build(self):
        """Append the model's layers to ``self.layers`` (subclass hook)."""
        raise NotImplementedError

    def build(self):
        """ Wrapper for _build() """
        with tf.variable_scope(self.name):
            self._build()

        # Chain the layers: activations[0] is the model input, activations[i]
        # is the output of layer i-1, and the last entry is the model output.
        # Layer instances are callable, so each one is applied like a function.
        current = self.inputs
        self.activations.append(current)
        for layer in self.layers:
            current = layer(current)
            self.activations.append(current)
        self.outputs = current

        # Store model variables for easy access
        scoped = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.name)
        self.vars = dict((variable.name, variable) for variable in scoped)

        # Build metrics
        self._loss()
        self._accuracy()

        self.opt_op = self.optimizer.minimize(self.loss)

    def predict(self):
        """Return predictions; the base class has none."""
        return None

    def _loss(self):
        """Accumulate the training loss into ``self.loss`` (subclass hook)."""
        raise NotImplementedError

    def _accuracy(self):
        """Set ``self.accuracy`` (subclass hook)."""
        raise NotImplementedError

    def save(self, sess=None):
        """Save the model variables to ``tmp/<name>.ckpt`` using ``sess``."""
        if not sess:
            raise AttributeError("TensorFlow session not provided.")
        checkpoint_writer = tf.train.Saver(self.vars)
        save_path = checkpoint_writer.save(sess, "tmp/%s.ckpt" % self.name)
        print("Model saved in file: %s" % save_path)

    def load(self, sess=None):
        """Restore the model variables from ``tmp/<name>.ckpt`` into ``sess``."""
        if not sess:
            raise AttributeError("TensorFlow session not provided.")
        checkpoint_reader = tf.train.Saver(self.vars)
        save_path = "tmp/%s.ckpt" % self.name
        checkpoint_reader.restore(sess, save_path)
        print("Model restored from file: %s" % save_path)
# Multi-layer perceptron derived from Model: it supplies the hooks the base
# class leaves unimplemented. The loss is the L2 weight decay of the first
# layer plus a masked softmax cross-entropy (masked because only part of the
# nodes carry labels in the semi-supervised setting).
class MLP(Model):
    """Two-layer dense network."""

    def __init__(self, placeholders, input_dim, **kwargs):
        super(MLP, self).__init__(**kwargs)

        # Dictionary of named placeholders shared with the layers.
        self.placeholders = placeholders
        self.inputs = placeholders['features']
        self.input_dim = input_dim
        # self.input_dim = self.inputs.get_shape().as_list()[1]  # To be supported in future Tensorflow versions
        self.output_dim = placeholders['labels'].get_shape().as_list()[1]

        self.optimizer = tf.train.AdamOptimizer(learning_rate=FLAGS.learning_rate)

        self.build()

    def _loss(self):
        """Add first-layer weight decay and the masked cross-entropy to the loss."""
        # Weight decay loss (regularisation term)
        for weight in self.layers[0].vars.values():
            self.loss = self.loss + FLAGS.weight_decay * tf.nn.l2_loss(weight)

        # Cross entropy error
        labels = self.placeholders['labels']
        labels_mask = self.placeholders['labels_mask']
        self.loss = self.loss + masked_softmax_cross_entropy(self.outputs, labels, labels_mask)

    def _accuracy(self):
        """Compute the masked accuracy of the model outputs."""
        labels = self.placeholders['labels']
        labels_mask = self.placeholders['labels_mask']
        self.accuracy = masked_accuracy(self.outputs, labels, labels_mask)

    def _build(self):
        """Stack the hidden ReLU layer and the linear output layer."""
        hidden_layer = Dense(input_dim=self.input_dim,
                             output_dim=FLAGS.hidden1,
                             placeholders=self.placeholders,
                             act=tf.nn.relu,
                             dropout=True,
                             sparse_inputs=True,
                             logging=self.logging)
        self.layers.append(hidden_layer)

        # Identity activation: the outputs are raw logits.
        output_layer = Dense(input_dim=FLAGS.hidden1,
                             output_dim=self.output_dim,
                             placeholders=self.placeholders,
                             act=lambda x: x,
                             dropout=True,
                             logging=self.logging)
        self.layers.append(output_layer)

    def predict(self):
        """Return the softmax of the model outputs."""
        return tf.nn.softmax(self.outputs)
# Graph convolutional network derived from Model.
class GCN(Model):
    """Two-layer graph convolutional network."""

    def __init__(self, placeholders, input_dim, **kwargs):
        super(GCN, self).__init__(**kwargs)

        self.placeholders = placeholders
        self.inputs = placeholders['features']
        self.input_dim = input_dim
        # self.input_dim = self.inputs.get_shape().as_list()[1]  # To be supported in future Tensorflow versions
        self.output_dim = placeholders['labels'].get_shape().as_list()[1]

        self.optimizer = tf.train.AdamOptimizer(learning_rate=FLAGS.learning_rate)

        self.build()

    # Loss
    def _loss(self):
        """Add first-layer weight decay and the masked cross-entropy to the loss."""
        # Weight decay loss
        for weight in self.layers[0].vars.values():
            self.loss = self.loss + FLAGS.weight_decay * tf.nn.l2_loss(weight)

        # Cross entropy error
        labels = self.placeholders['labels']
        labels_mask = self.placeholders['labels_mask']
        self.loss = self.loss + masked_softmax_cross_entropy(self.outputs, labels, labels_mask)

    # Accuracy
    def _accuracy(self):
        """Compute the masked accuracy of the model outputs."""
        labels = self.placeholders['labels']
        labels_mask = self.placeholders['labels_mask']
        self.accuracy = masked_accuracy(self.outputs, labels, labels_mask)

    # Model construction: two graph-convolution layers
    def _build(self):
        """Stack the hidden ReLU layer and the linear output layer."""
        # Layer 1: input_dim -> FLAGS.hidden1 (1433 -> 16 on Cora with the
        # default flags), ReLU activation, sparse feature input.
        first_conv = GraphConvolution(input_dim=self.input_dim,
                                      output_dim=FLAGS.hidden1,
                                      placeholders=self.placeholders,
                                      act=tf.nn.relu,
                                      dropout=True,
                                      sparse_inputs=True,
                                      logging=self.logging)
        self.layers.append(first_conv)

        # Layer 2: FLAGS.hidden1 -> number of classes (taken from the labels
        # placeholder; 7 on Cora). `lambda x: x` means no activation is applied.
        second_conv = GraphConvolution(input_dim=FLAGS.hidden1,
                                       output_dim=self.output_dim,
                                       placeholders=self.placeholders,
                                       act=lambda x: x,
                                       dropout=True,
                                       logging=self.logging)
        self.layers.append(second_conv)

    # Prediction
    def predict(self):
        """Return the softmax of the model outputs (every row sums to 1)."""
        return tf.nn.softmax(self.outputs)
#test.py
#tf.enable_eager_execution()
# ones = tf.ones(shape=[2,3])
# print(ones)
# temp3 = tf.nn.softmax(ones)
# print(temp3)
# tf.Tensor(
# [[0.33333334 0.33333334 0.33333334]
# [0.33333334 0.33333334 0.33333334]], shape=(2, 3), dtype=float32)
layers.py
- 定义基类 Layer
- 属性:name (String) => 定义了变量范围;logging (Boolean) => 打开或关闭TensorFlow直方图日志记录
- 方法:`__init__()`(初始化),`_call()`(定义计算),`__call__()`(调用`_call()`函数的包装),`_log_vars()`(记录所有变量的直方图)
- 定义Dense Layer类,继承自Layer类
- 定义GraphConvolution类,继承自Layer类。
from gcn.inits import *
import tensorflow as tf
flags = tf.app.flags
FLAGS = flags.FLAGS
# global unique layer ID dictionary for layer name assignment
_LAYER_UIDS = {}
def get_layer_uid(layer_name=''):
"""Helper function, assigns unique layer IDs."""
if layer_name not in _LAYER_UIDS:
_LAYER_UIDS[layer_name] = 1
return 1
else:
_LAYER_UIDS[layer_name] += 1
return _LAYER_UIDS[layer_name]
# Dropout for sparse tensors
def sparse_dropout(x, keep_prob, noise_shape):
    """Dropout for sparse tensors.

    Draws one uniform sample per entry of ``noise_shape``; flooring
    ``keep_prob + sample`` yields the boolean keep-mask handed to
    ``tf.sparse_retain``. The retained values are scaled by ``1 / keep_prob``.
    """
    noise = keep_prob + tf.random_uniform(noise_shape)
    keep_mask = tf.cast(tf.floor(noise), dtype=tf.bool)
    retained = tf.sparse_retain(x, keep_mask)
    return retained * (1./keep_prob)
def dot(x, y, sparse=False):
    """Wrapper for tf.matmul (sparse vs dense).

    When ``sparse`` is true, ``x`` is multiplied with
    ``tf.sparse_tensor_dense_matmul``; otherwise ``tf.matmul`` is used.
    """
    if sparse:
        return tf.sparse_tensor_dense_matmul(x, y)
    return tf.matmul(x, y)
# Base layer: assigns every layer a name and carries a flag that decides
# whether histogram summaries are logged.
class Layer(object):
    """Base layer class. Defines basic API for all layer objects.
    Implementation inspired by keras (http://keras.io).

    # Properties
        name: String, defines the variable scope of the layer.
        logging: Boolean, switches Tensorflow histogram logging on/off

    # Methods
        _call(inputs): Defines computation graph of layer
            (i.e. takes input, returns output)
        __call__(inputs): Wrapper for _call()
        _log_vars(): Log all variables
    """

    def __init__(self, **kwargs):
        permitted = {'name', 'logging'}
        for key in kwargs:
            assert key in permitted, 'Invalid keyword argument: ' + key

        layer_name = kwargs.get('name')
        if not layer_name:
            # Auto-generate "<classname>_<uid>" when no name is given.
            class_key = self.__class__.__name__.lower()
            layer_name = class_key + '_' + str(get_layer_uid(class_key))
        self.name = layer_name

        self.vars = {}
        self.logging = kwargs.get('logging', False)
        self.sparse_inputs = False

    def _call(self, inputs):
        """Identity computation; subclasses override this."""
        return inputs

    # __call__ makes Layer instances callable like functions.
    def __call__(self, inputs):
        with tf.name_scope(self.name):
            # Input histograms are only logged for non-sparse inputs.
            log_inputs = self.logging and not self.sparse_inputs
            if log_inputs:
                tf.summary.histogram(self.name + '/inputs', inputs)
            outputs = self._call(inputs)
            if self.logging:
                tf.summary.histogram(self.name + '/outputs', outputs)
            return outputs

    def _log_vars(self):
        """Log a histogram summary for every variable of the layer."""
        for var_name, variable in self.vars.items():
            tf.summary.histogram(self.name + '/vars/' + var_name, variable)
# Fully connected layer derived from Layer
class Dense(Layer):
    """Dense layer."""

    def __init__(self, input_dim, output_dim, placeholders, dropout=0., sparse_inputs=False,
                 act=tf.nn.relu, bias=False, featureless=False, **kwargs):
        super(Dense, self).__init__(**kwargs)

        # A truthy `dropout` means "use the rate fed through the placeholder".
        self.dropout = placeholders['dropout'] if dropout else 0.

        self.act = act                      # activation function
        self.sparse_inputs = sparse_inputs  # whether the inputs are sparse
        self.featureless = featureless      # stored only; not read by this layer
        self.bias = bias                    # whether to add a bias term

        # helper variable for sparse dropout
        self.num_features_nonzero = placeholders['num_features_nonzero']

        with tf.variable_scope(self.name + '_vars'):
            self.vars['weights'] = glorot([input_dim, output_dim],
                                          name='weights')
            if self.bias:
                self.vars['bias'] = zeros([output_dim], name='bias')

        if self.logging:
            self._log_vars()

    # Overrides _call(); sparse inputs go through sparse_dropout().
    def _call(self, inputs):
        """Apply dropout, the linear transform, the optional bias and the activation."""
        keep_prob = 1-self.dropout

        # dropout
        if self.sparse_inputs:
            dropped = sparse_dropout(inputs, keep_prob, self.num_features_nonzero)
        else:
            dropped = tf.nn.dropout(inputs, keep_prob)

        # transform
        output = dot(dropped, self.vars['weights'], sparse=self.sparse_inputs)

        # bias
        if self.bias:
            output = output + self.vars['bias']

        return self.act(output)
# Graph convolution layer derived from Layer. Compared with Dense it also reads
# placeholders['support'] in __init__, keeps one weight matrix per support, and
# multiplies by the supports in _call().
class GraphConvolution(Layer):
    """Graph convolution layer."""

    def __init__(self, input_dim, output_dim, placeholders, dropout=0.,
                 sparse_inputs=False, act=tf.nn.relu, bias=False,
                 featureless=False, **kwargs):
        super(GraphConvolution, self).__init__(**kwargs)

        # A truthy `dropout` means "use the rate fed through the placeholder".
        self.dropout = placeholders['dropout'] if dropout else 0.

        self.act = act
        self.support = placeholders['support']
        self.sparse_inputs = sparse_inputs
        self.featureless = featureless
        self.bias = bias

        # helper variable for sparse dropout
        self.num_features_nonzero = placeholders['num_features_nonzero']

        # Variables: one Glorot-initialised weight matrix per support matrix
        # (glorot and zeros come from the inits module).
        with tf.variable_scope(self.name + '_vars'):
            for index in range(len(self.support)):
                weight_key = 'weights_' + str(index)
                self.vars[weight_key] = glorot([input_dim, output_dim],
                                               name=weight_key)
            if self.bias:
                self.vars['bias'] = zeros([output_dim], name='bias')

        if self.logging:
            self._log_vars()

    def _call(self, inputs):
        """Apply dropout, the graph convolution, the optional bias and the activation."""
        keep_prob = 1-self.dropout

        # dropout
        if self.sparse_inputs:
            x = sparse_dropout(inputs, keep_prob, self.num_features_nonzero)
        else:
            x = tf.nn.dropout(inputs, keep_prob)

        # convolve
        # Computes sum_i support_i . (X . W_i). For the plain 'gcn' model the
        # single support is the renormalised adjacency matrix, which gives
        # Z = \tilde{D}^{-1/2} \tilde{A} \tilde{D}^{-1/2} X \Theta.
        convolved = []
        for index, support_matrix in enumerate(self.support):
            weights = self.vars['weights_' + str(index)]
            if self.featureless:
                # No feature matrix: the weights are used directly.
                pre_sup = weights
            else:
                pre_sup = dot(x, weights, sparse=self.sparse_inputs)
            convolved.append(dot(support_matrix, pre_sup, sparse=True))
        output = tf.add_n(convolved)

        # bias
        if self.bias:
            output = output + self.vars['bias']

        return self.act(output)
utils.py
LIL(Row-Based Linked List Format)-基于行的链表格式
稀疏矩阵转化成两个链表data和rows:
- 列表.data: data[k]是行k中的非零元素的列表。如果该行中的所有元素都为0,则它包含一个空列表。
- 列表.rows: 是在位置k包含了在行k中的非零元素列索引列表。
import numpy as np
import scipy.sparse as sp
# Small demonstration of the LIL layout: a 4x4 matrix whose second row is empty.
A=np.array([[1,0,2,0],[0,0,0,0],[3,0,0,0],[1,0,0,4]])
AS=sp.lil_matrix(A)
# .data holds, per row, the list of non-zero values (an empty list for an all-zero row).
print(AS.data)
# [list([1, 2]) list([]) list([3]) list([1, 4])]
# .rows holds, per row, the column indices of those non-zero values.
print(AS.rows)
# [list([0, 2]) list([]) list([0]) list([0, 3])]
载入数据的维度(以Cora数据集为例)
-
adj(邻接矩阵):由于比较稀疏,邻接矩阵格式是LIL的,并且shape为(2708, 2708)
-
features(特征矩阵):每个节点的特征向量也是稀疏的,也用LIL格式存储,features.shape: (2708, 1433)
-
labels:ally, ty数据集叠加构成,labels.shape:(2708, 7)
-
train_mask, val_mask, test_mask:shape都为(2708, )的向量。train_mask中下标[0, 140)范围的是True,其余是False;val_mask中下标[140, 640)范围的是True,其余是False;test_mask中下标[1708, 2707]范围的是True,其余是False
-
y_train, y_val, y_test:shape都是(2708, 7) 。y_train的值为对应与labels中train_mask为True的行,其余全是0;y_val的值为对应与labels中val_mask为True的行,其余全是0;y_test的值为对应与labels中test_mask为True的行,其余全是0
-
特征矩阵进行归一化并返回一个格式为(coords, values, shape)的元组
-
将邻接矩阵加上自环以后,对称归一化,并存储为COO模式,最后返回格式为(coords, values, shape)的元组
import numpy as np
import pickle as pkl
import networkx as nx
import scipy.sparse as sp
from scipy.sparse.linalg.eigen.arpack import eigsh
import sys
def parse_index_file(filename):
    """Parse index file.

    Reads one integer per line from ``filename`` and returns them as a list,
    in file order. Blank lines are skipped and an empty file yields ``[]``.

    :param filename: path of a text file containing one integer per line.
    :return: list of ints.
    :raises ValueError: if a non-blank line is not a valid integer.
    """
    # The context manager closes the handle even when a line fails to parse.
    with open(filename) as index_file:
        return [int(line) for line in index_file if line.strip()]
def sample_mask(idx, l):
    """Create mask.

    :param idx: indices (list, range or integer array) to switch on.
    :param l: length of the mask.
    :return: boolean numpy array of length ``l`` that is True exactly at ``idx``.
    """
    # The builtin `bool` is used as dtype: the `np.bool` alias was removed in
    # NumPy 1.24 and raises AttributeError there.
    mask = np.zeros(l, dtype=bool)
    mask[idx] = True
    return mask
# Data loading: reads the pickled dataset files and returns the adjacency
# matrix, the feature matrix, and the per-split label matrices and masks.
def load_data(dataset_str):
    """
    Loads input data from gcn/data directory

    ind.dataset_str.x => the feature vectors of the training instances as scipy.sparse.csr.csr_matrix object;
    ind.dataset_str.tx => the feature vectors of the test instances as scipy.sparse.csr.csr_matrix object;
    ind.dataset_str.allx => the feature vectors of both labeled and unlabeled training instances
        (a superset of ind.dataset_str.x) as scipy.sparse.csr.csr_matrix object;
    ind.dataset_str.y => the one-hot labels of the labeled training instances as numpy.ndarray object;
    ind.dataset_str.ty => the one-hot labels of the test instances as numpy.ndarray object;
    ind.dataset_str.ally => the labels for instances in ind.dataset_str.allx as numpy.ndarray object;
    ind.dataset_str.graph => a dict in the format {index: [index_of_neighbor_nodes]} as collections.defaultdict
        object;
    ind.dataset_str.test.index => the indices of test instances in graph, for the inductive setting as list object.

    All objects above must be saved using python pickle module.

    :param dataset_str: Dataset name
    :return: All data input files loaded (as well the training/test data).
    """
    names = ['x', 'y', 'tx', 'ty', 'allx', 'ally', 'graph']
    objects = []
    # NOTE(review): unpickling can execute arbitrary code, so only trusted
    # dataset files should be placed in the data directory.
    for i in range(len(names)):
        with open("data/ind.{}.{}".format(dataset_str, names[i]), 'rb') as f:
            if sys.version_info > (3, 0): # get python version
                objects.append(pkl.load(f, encoding='latin1'))
            else:
                objects.append(pkl.load(f))
    # Shapes recorded by the author on Cora:
    # x.shape: (140, 1433); y.shape: (140, 7); tx.shape: (1000, 1433); ty.shape: (1000, 7);
    # allx.shape: (1708, 1433); ally.shape: (1708, 7)
    x, y, tx, ty, allx, ally, graph = tuple(objects) # unpack the loaded objects
    # Training instances
    # print(x[0][0],x.shape,type(x)) ## x is a sparse matrix: 140 instances with 1433-dimensional features (140,1433)
    # print(y[0],y.shape) ## y holds the one-hot labels: 7 classes, 140 instances (140,7)
    ## Test instances
    # print(tx[0][0],tx.shape,type(tx)) ## tx is a sparse matrix: 1000 instances with 1433-dimensional features (1000,1433)
    # print(ty[0],ty.shape) ## ty holds the one-hot labels: 7 classes, 1000 instances (1000,7)
    ## allx / ally have the same layout as above
    # print(allx[0][0],allx.shape,type(allx)) ## allx is a sparse matrix: 1708 instances with 1433-dimensional features (1708,1433)
    # print(ally[0],ally.shape) ## ally holds the one-hot labels: 7 classes, 1708 instances (1708,7)
    ## graph is a dict; the whole graph has 2708 nodes
    # for i in graph:
    # print(i,graph[i])
    # Test indices in the (shuffled) order in which they are stored in the file
    test_idx_reorder = parse_index_file("data/ind.{}.test.index".format(dataset_str))
    # print(test_idx_reorder)
    # [2488, 2644, 3261, 2804, 3176, 2432, 3310, 2410, 2812,...]
    # The same indices sorted in ascending order
    test_idx_range = np.sort(test_idx_reorder)
    # Handle the isolated nodes of citeseer
    if dataset_str == 'citeseer':
        # Fix citeseer dataset (there are some isolated nodes in the graph)
        # Find isolated nodes, add them as zero-vecs into the right position
        test_idx_range_full = range(min(test_idx_reorder), max(test_idx_reorder) + 1)
        # print("test_idx_range_full.length",len(test_idx_range_full))
        # test_idx_range_full.length 1015
        # Empty LIL matrix with one row per index in the full range
        tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1]))
        # Subtracting min(test_idx_range) re-bases the test indices at 0, so the
        # rows of tx land at their positions inside the full range; rows that
        # are not assigned (the isolated nodes) stay all-zero.
        tx_extended[test_idx_range - min(test_idx_range), :] = tx
        # print(tx_extended.shape) #(1015, 3703)
        # print(tx_extended)
        # (0, 19) 1.0
        # (0, 21) 1.0
        # (0, 169) 1.0
        # (0, 170) 1.0
        # (0, 425) 1.0
        # ...
        # (1014, 3243) 1.0
        # (1014, 3351) 1.0
        # (1014, 3472) 1.0
        tx = tx_extended
        # print(tx.shape)
        # (1015, 3703)
        # Per the author's notes, 15 rows (e.g. 997, 994, 993, 980, 938) are all zero
        ty_extended = np.zeros((len(test_idx_range_full), y.shape[1]))
        ty_extended[test_idx_range - min(test_idx_range), :] = ty
        ty = ty_extended
        # for i in range(ty.shape[0]):
        # print(i," ",ty[i])
        # # 980 [0. 0. 0. 0. 0. 0.]
        # # 994 [0. 0. 0. 0. 0. 0.]
        # # 993 [0. 0. 0. 0. 0. 0.]
    # Stack allx and tx into one LIL feature matrix covering the whole graph
    features = sp.vstack((allx, tx)).tolil()
    # Copy the rows at the sorted test positions to the positions given by the
    # shuffled test indices, so the features line up with the graph's node ids.
    features[test_idx_reorder, :] = features[test_idx_range, :]
    # print("features.shape:",features.shape)
    # features.shape: (2708, 1433)
    # Sparse adjacency matrix built by networkx from the neighbour dict;
    # shape (2708, 2708) on Cora per the author's notes.
    adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph))
    # labels.shape:(2708, 7)
    labels = np.vstack((ally, ty))
    labels[test_idx_reorder, :] = labels[test_idx_range, :]
    # len(list(idx_val)) + len(list(idx_train)) + len(idx_test) = 1640
    idx_test = test_idx_range.tolist()
    # print(idx_test)
    # [1708, 1709, 1710, 1711, 1712, 1713,...,2705, 2706, 2707]
    # print(len(idx_test))
    # 1000
    idx_train = range(len(y))
    # print(idx_train)
    # range(0, 140)
    # The 500 nodes that follow the training nodes form the validation set
    idx_val = range(len(y), len(y) + 500)
    # print(idx_val,len(idx_val))
    # range(140, 640) 500
    # Training mask: True for indices [0, 140), False elsewhere
    train_mask = sample_mask(idx_train, labels.shape[0]) # mask length = labels.shape[0] (2708 on Cora)
    # print(train_mask,train_mask.shape)
    # [True True True... False False False] (2708,)
    # Validation mask: True for indices [140, 640), False elsewhere
    val_mask = sample_mask(idx_val, labels.shape[0]) # mask length = labels.shape[0] (2708 on Cora)
    # Test mask: True for indices [1708, 2707], False elsewhere
    test_mask = sample_mask(idx_test, labels.shape[0])
    y_train = np.zeros(labels.shape)
    y_val = np.zeros(labels.shape)
    y_test = np.zeros(labels.shape)
    # print(y_train.shape," ",y_test.shape," ",y_val.shape)
    # (2708, 7)(2708, 7)(2708, 7)
    # Copy the labels of each split into its own matrix; all other rows stay zero
    y_train[train_mask, :] = labels[train_mask, :]
    y_val[val_mask, :] = labels[val_mask, :]
    y_test[test_mask, :] = labels[test_mask, :]
    return adj, features, y_train, y_val, y_test, train_mask, val_mask, test_mask
# Convert a sparse matrix (or a list of them) to tuple form and return it
def sparse_to_tuple(sparse_mx):
    """Convert sparse matrix to tuple representation.

    Each matrix becomes ``(coords, values, shape)`` where ``coords`` is an
    ``(nnz, 2)`` array of (row, col) pairs taken from the COO form. A list
    argument is converted element by element in place and returned.
    """
    def to_tuple(mx):
        coo = mx if sp.isspmatrix_coo(mx) else mx.tocoo()
        coords = np.vstack((coo.row, coo.col)).transpose()
        return coords, coo.data, coo.shape

    if not isinstance(sparse_mx, list):
        return to_tuple(sparse_mx)

    for position, matrix in enumerate(sparse_mx):
        sparse_mx[position] = to_tuple(matrix)
    return sparse_mx
# Feature preprocessing: row-normalise the feature matrix and return it as a
# (coords, values, shape) tuple. Every element is divided by the sum of its
# row, so each non-empty row sums to 1 afterwards.
def preprocess_features(features):
    """Row-normalize feature matrix and convert to tuple representation

    Worked example of the arithmetic on a dense 3x2 matrix:

        b = [[1., 3.], [2., 4.], [3., 5.]]
        row sums            -> [4., 6., 8.]
        inverse row sums    -> [0.25, 0.16666667, 0.125]
        diag(inverse) . b   -> [[0.25, 0.75],
                                [0.33333333, 0.66666667],
                                [0.375, 0.625]]

    :param features: scipy sparse feature matrix, one row per node.
    :return: ``(coords, values, shape)`` tuple of the row-normalised matrix.
    """
    # features.sum(1) adds up each row (axis=1); axis=0 would add up each column.
    rowsum = np.array(features.sum(1))
    # np.power rejects "integer ** negative integer", so integer-valued feature
    # matrices are promoted to float first. Float inputs keep their dtype.
    if not np.issubdtype(rowsum.dtype, np.floating):
        rowsum = rowsum.astype(np.float64)
    # All-zero rows give 1/0 = inf; that is expected and repaired on the next
    # line, so the divide-by-zero warning is silenced here.
    with np.errstate(divide='ignore'):
        r_inv = np.power(rowsum, -1).flatten()
    # print("r_inv:", r_inv)
    # r_inv: [0.11111111 0.04347826 0.05263158... 0.05555556 0.07142857 0.07692308]
    # np.isinf yields a boolean array marking the infinite entries; they are
    # set to 0 so that empty rows stay empty.
    r_inv[np.isinf(r_inv)] = 0.
    # Sparse diagonal matrix holding the inverse row sums
    r_mat_inv = sp.diags(r_inv)
    # Left-multiplying by the diagonal matrix scales row i by r_inv[i]
    features = r_mat_inv.dot(features)
    return sparse_to_tuple(features)
# Symmetric normalisation of the adjacency matrix; the result is in COO format
def normalize_adj(adj):
    """Symmetrically normalize adjacency matrix.

    Scales the matrix on both sides by the inverse square root of its row
    sums; rows whose sum is 0 get a scaling factor of 0.
    """
    coo = sp.coo_matrix(adj)
    degrees = np.array(coo.sum(1))
    inv_sqrt_degrees = np.power(degrees, -0.5).flatten()
    # A zero row sum produces inf; treat such nodes as having scale 0.
    inv_sqrt_degrees[np.isinf(inv_sqrt_degrees)] = 0.
    scaling = sp.diags(inv_sqrt_degrees)
    right_scaled = coo.dot(scaling)
    return right_scaled.transpose().dot(scaling).tocoo()
# Add self-loops to the adjacency matrix, normalise it symmetrically (COO
# format) and return the tuple representation
def preprocess_adj(adj):
    """Preprocessing of adjacency matrix for simple GCN model and conversion to tuple representation."""
    # Adding the identity gives every node a self-loop before normalisation.
    with_self_loops = adj + sp.eye(adj.shape[0])
    return sparse_to_tuple(normalize_adj(with_self_loops))
# Build and return the feed dictionary.
# labels and labels_mask receive the concrete arrays of one split, e.g.
# labels=y_train, labels_mask=train_mask;
# labels=y_val, labels_mask=val_mask;
# labels=y_test, labels_mask=test_mask;
def construct_feed_dict(features, support, labels, labels_mask, placeholders):
    """Construct feed dictionary.

    Maps every placeholder in ``placeholders`` to its value. ``features`` is
    the ``(coords, values, shape)`` tuple of the feature matrix and
    ``support`` is a list with one entry per support placeholder.
    """
    feed_dict = {}
    feed_dict[placeholders['labels']] = labels
    feed_dict[placeholders['labels_mask']] = labels_mask
    feed_dict[placeholders['features']] = features
    # One entry per support matrix, paired with its placeholder by position.
    for position, matrix in enumerate(support):
        feed_dict[placeholders['support'][position]] = matrix
    # features[1] is the values array of the tuple, so its shape is the number
    # of stored (non-zero) feature entries. On Cora the author recorded
    # features[1].shape == (49216,), i.e. 49216 non-zeros out of 2708*1433
    # entries (about 1.3%).
    feed_dict[placeholders['num_features_nonzero']] = features[1].shape
    return feed_dict
# Chebyshev polynomial approximation: compute the matrices T_0 .. T_k
def chebyshev_polynomials(adj, k):
    """Calculate Chebyshev polynomials up to order k. Return a list of sparse matrices (tuple representation).

    :param adj: scipy sparse adjacency matrix.
    :param k: highest polynomial order.
    :return: list ``[T_0, ..., T_k]`` of ``(coords, values, shape)`` tuples,
        i.e. ``k + 1`` entries.
    """
    print("Calculating Chebyshev polynomials up to order {}...".format(k))

    identity = sp.eye(adj.shape[0])
    adj_normalized = normalize_adj(adj)                 # D^{-1/2} A D^{-1/2}
    laplacian = identity - adj_normalized               # L = I_N - D^{-1/2} A D^{-1/2}
    largest_eigval, _ = eigsh(laplacian, 1, which='LM') # \lambda_{max}
    # Rescaled Laplacian: (2 / \lambda_{max}) L - I_N
    scaled_laplacian = (2. / largest_eigval[0]) * laplacian - identity

    # Seed the recurrence with T_0 = I and T_1 = rescaled Laplacian.
    t_k = [identity, scaled_laplacian]

    # Recurrence T_n = 2 * L_scaled * T_{n-1} - T_{n-2}, for n = 2 .. k.
    # The CSR copy does not change between iterations, so it is made once.
    s_lap = sp.csr_matrix(scaled_laplacian, copy=True)
    for _ in range(2, k + 1):
        t_k.append(2 * s_lap.dot(t_k[-1]) - t_k[-2])

    # For k >= 1 the slice keeps every element. For k == 0 it drops T_1, so the
    # result has exactly k + 1 entries, matching the `1 + max_degree` support
    # placeholders the training script creates.
    return sparse_to_tuple(t_k[:k + 1])
# load_data('cora')
metrics.py
import tensorflow as tf
# `mask` is an indicator vector: 1 marks the nodes whose label takes part in
# the loss. Multiplying the per-node loss by the mask zeroes the loss of all
# other nodes, so only the selected nodes contribute.
# Note: the per-node loss and the mask have the same shape (one entry per
# node), so the product is element-wise.
def masked_softmax_cross_entropy(preds, labels, mask):
    """Softmax cross-entropy loss with masking."""
    per_node_loss = tf.nn.softmax_cross_entropy_with_logits(logits=preds, labels=labels)
    weights = tf.cast(mask, dtype=tf.float32)
    # Dividing the mask by its mean makes the final mean over all nodes equal
    # to the average loss over the masked nodes only.
    weights = weights / tf.reduce_mean(weights)
    per_node_loss = per_node_loss * weights
    return tf.reduce_mean(per_node_loss)
def masked_accuracy(preds, labels, mask):
    """Accuracy with masking."""
    is_correct = tf.equal(tf.argmax(preds, 1), tf.argmax(labels, 1))
    per_node_accuracy = tf.cast(is_correct, tf.float32)
    weights = tf.cast(mask, dtype=tf.float32)
    # Dividing the mask by its mean makes the final mean over all nodes equal
    # to the accuracy over the masked nodes only.
    weights = weights / tf.reduce_mean(weights)
    per_node_accuracy = per_node_accuracy * weights
    return tf.reduce_mean(per_node_accuracy)
inits.py
- glorot初始化方法:它为了保证前向传播和反向传播时每一层的方差一致:在正向传播时,每层的激活值的方差保持不变;在反向传播时,每层的梯度值的方差保持不变。根据每层的输入个数和输出个数来决定参数随机初始化的分布范围,是一个通过该层的输入和输出参数个数得到的分布范围内的均匀分布。
(推导见:https://blog.csdn.net/yyl424525/article/details/100823398#4_Xavier_21)
import tensorflow as tf
import numpy as np
# Variable of the given shape with values drawn uniformly between -scale and
# scale (-0.05 and 0.05 with the default scale)
def uniform(shape, scale=0.05, name=None):
    """Uniform init."""
    return tf.Variable(
        tf.random_uniform(shape, minval=-scale, maxval=scale, dtype=tf.float32),
        name=name)
def glorot(shape, name=None):
    """Glorot & Bengio (AISTATS 2010) init."""
    # Uniform range sqrt(6 / (shape[0] + shape[1])).
    fan_sum = shape[0]+shape[1]
    limit = np.sqrt(6.0/fan_sum)
    return tf.Variable(
        tf.random_uniform(shape, minval=-limit, maxval=limit, dtype=tf.float32),
        name=name)
# Variable of the given shape filled with zeros
def zeros(shape, name=None):
    """All zeros."""
    return tf.Variable(tf.zeros(shape, dtype=tf.float32), name=name)
# Variable of the given shape filled with ones
def ones(shape, name=None):
    """All ones."""
    return tf.Variable(tf.ones(shape, dtype=tf.float32), name=name)
问题总结&欢迎讨论
Q1:总共2708个节点,但是训练数据仅用了140个,下标范围是[0, 140);验证集用了500个,下标范围是[140, 640);测试集用了1000个,下标范围是[1708, 2707]。其余下标在[640, 1707]范围内的节点没有被用作带标签的数据,它们去哪了?以及这样分配数据集合理吗?
Q2:增加GCN层数,为何准确率还降低了?
# Model construction: a deeper GCN (four hidden graph-convolution layers plus
# the output layer)
def _build(self):
    """Stack four ReLU graph-convolution layers and a linear output layer.

    Layer widths are input_dim -> 124 -> 64 -> 32 -> 16 -> output_dim. Every
    layer takes the previous layer's output width as its input width, so the
    stack stays consistent regardless of FLAGS.hidden1.
    """
    # (output_dim, dropout) for each hidden layer, in order.
    hidden_specs = [(124, True), (64, True), (32, False), (16, False)]

    in_dim = self.input_dim
    for depth, (out_dim, use_dropout) in enumerate(hidden_specs):
        # Only the first layer consumes the sparse feature matrix.
        self.layers.append(GraphConvolution(input_dim=in_dim,
                                            output_dim=out_dim,
                                            placeholders=self.placeholders,
                                            act=tf.nn.relu,
                                            dropout=use_dropout,
                                            sparse_inputs=(depth == 0),
                                            logging=self.logging))
        in_dim = out_dim

    # Output layer: last hidden width (16) -> number of classes, taken from
    # the labels placeholder. `lambda x: x` means no activation is applied.
    self.layers.append(GraphConvolution(input_dim=in_dim,
                                        output_dim=self.output_dim,
                                        placeholders=self.placeholders,
                                        act=lambda x: x,
                                        dropout=True,
                                        logging=self.logging))
    print("GCN调用了此函数:_build")
在“A Comprehensive Survey on Graph Neural Networks”和“Deeper Insights into Graph Convolutional Networks for Semi-Supervised Learning”中的解释是over smoothing,也就是层数多了,反而使远处的节点和近处的节点相似而更难以区分,当层数到达一定时,整个网络呈一个稳定的不动点,达到平衡。
有错误的地方还望不吝指出,欢迎进群交流GNNs&GCNs(入群备注信息!!!,格式:姓名 -(学校或其他机构信息)- 研究方向)。