代码地址 https://github.com/AITTSMD/MTCNN-Tensorflow
这里我就不再进行MTCNN的介绍了。分析得再清楚,都不如直接从源码的实现去分析。
Talk is cheap, just show me the code。
MTCNN主要分为三个网络 PNet RNet ONet
其中PNet是个全卷积网络 这是和RNet ONet最大的区别
由于篇幅有限 分成多篇进行分析
MTCNN源码详细解读(2)- PNet的训练和数据集的构建
MTCNN源码详细解读(3)- RNet的训练和数据集的构建
def P_Net(inputs,label=None,bbox_target=None,landmark_target=None,training=True):
    """Build the fully convolutional P-Net graph.

    Args:
        inputs: Image batch tensor. The shape comments below trace the
            training-time case of (batch_size, 12, 12, 3).
        label: Per-sample labels, forwarded to ``cls_ohem``, ``bbox_ohem``,
            ``landmark_ohem`` and ``cal_accuracy``. Only used when
            ``training`` is true.
        bbox_target: Box regression targets, forwarded to ``bbox_ohem``.
        landmark_target: Landmark regression targets, forwarded to
            ``landmark_ohem``.
        training: When true, build the losses; otherwise return the raw
            head outputs.

    Returns:
        When ``training`` is true, the tuple
        ``(cls_loss, bbox_loss, landmark_loss, L2_loss, accuracy)``.
        Otherwise ``(cls_prob, bbox_pred, landmark_pred)`` with the leading
        axis squeezed away.
    """
    # Share the same activation (prelu), initializers, L2 regularizer and
    # padding across every slim.conv2d call below.
    with slim.arg_scope([slim.conv2d],
                        activation_fn=prelu,
                        weights_initializer=slim.xavier_initializer(),
                        biases_initializer=tf.zeros_initializer(),
                        weights_regularizer=slim.l2_regularizer(0.0005),
                        padding='valid'):
        # Training input is (batch_size, 12, 12, 3).
        # -> (batch_size, 10, 10, 10)
        net = slim.conv2d(inputs, 10, 3, stride=1, scope='conv1')
        # -> (batch_size, 5, 5, 10)
        net = slim.max_pool2d(net, kernel_size=[2, 2], stride=2, scope='pool1', padding='SAME')
        # -> (batch_size, 3, 3, 16)
        net = slim.conv2d(net, num_outputs=16, kernel_size=[3, 3], stride=1, scope='conv2')
        # -> (batch_size, 1, 1, 32)
        net = slim.conv2d(net, num_outputs=32, kernel_size=[3, 3], stride=1, scope='conv3')

        # All three heads are 1x1 convolutions, so the network stays fully
        # convolutional.
        # batch*H*W*2: classification head. Although the task is binary, two
        # channels are used: channel 0 is the non-face confidence and
        # channel 1 is the face confidence.
        conv4_1 = slim.conv2d(net, num_outputs=2, kernel_size=[1, 1], stride=1,
                              scope='conv4_1', activation_fn=tf.nn.softmax)
        # batch*H*W*4: bounding-box offsets (4 regression values).
        bbox_pred = slim.conv2d(net, num_outputs=4, kernel_size=[1, 1], stride=1,
                                scope='conv4_2', activation_fn=None)
        # batch*H*W*10: five landmark points, i.e. 10 regression values.
        landmark_pred = slim.conv2d(net, num_outputs=10, kernel_size=[1, 1], stride=1,
                                    scope='conv4_3', activation_fn=None)

        if training:
            # (batch, 1, 1, 2) -> (batch, 2): drop the two spatial axes.
            # The same is done for the other two heads below.
            cls_prob = tf.squeeze(conv4_1, [1, 2], name='cls_prob')
            # Classification loss.
            cls_loss = cls_ohem(cls_prob, label)
            # (batch, 4)
            bbox_pred = tf.squeeze(bbox_pred, [1, 2], name='bbox_pred')
            # Box regression loss.
            bbox_loss = bbox_ohem(bbox_pred, bbox_target, label)
            # (batch, 10)
            landmark_pred = tf.squeeze(landmark_pred, [1, 2], name="landmark_pred")
            # Landmark regression loss.
            landmark_loss = landmark_ohem(landmark_pred, landmark_target, label)
            accuracy = cal_accuracy(cls_prob, label)
            L2_loss = tf.add_n(slim.losses.get_regularization_losses())
            return cls_loss, bbox_loss, landmark_loss, L2_loss, accuracy
        else:
            # Without this branch the function returned None at inference
            # time, unlike R_Net which returns its three predictions.
            # NOTE(review): squeezing axis 0 assumes a single image per
            # inference call (batch size 1) - confirm against the detector
            # that consumes these outputs.
            cls_pro_test = tf.squeeze(conv4_1, axis=0)
            bbox_pred_test = tf.squeeze(bbox_pred, axis=0)
            landmark_pred_test = tf.squeeze(landmark_pred, axis=0)
            return cls_pro_test, bbox_pred_test, landmark_pred_test
网络结构看上去简单清晰 下面分析下三个损失函数
1 分类损失cls_ohem 常用的交叉熵损失
def cls_ohem(cls_prob, label):
    """Cross-entropy classification loss over the hardest valid samples.

    Args:
        cls_prob: (batch, 2) class probabilities; column 0 is the non-face
            confidence and column 1 is the face confidence.
        label: (batch,) labels. Per the original author's notes the values
            are 0 (negative), 1 (positive), -1 (part face) and -2
            (landmark); only samples with label >= 0 contribute here.

    Returns:
        Scalar tensor: mean negative log-likelihood of the ``keep_num``
        largest per-sample losses among valid samples, where ``keep_num`` is
        the valid count scaled by the module-level ``num_keep_radio``.
        Returns 0 (instead of NaN) when nothing is kept.
    """
    # Samples with a negative label do not take part in this loss. Map their
    # label to 0 so it is still a usable gather index; they are masked out
    # of the loss further down.
    ignored = tf.less(label, 0)
    label_filter_invalid = tf.where(ignored, tf.zeros_like(label), label)
    label_int = tf.cast(label_filter_invalid, tf.int32)

    # Flatten (batch, 2) -> (batch * 2,). The probability of class c for
    # sample i then lives at index 2 * i + c.
    # Example with batch 5 and filtered labels [1, 0, 0, 0, 1]:
    #   row offsets [0, 2, 4, 6, 8] + labels -> [1, 2, 4, 6, 9]
    # i.e. column 1 is picked for positives and column 0 for negatives.
    # Flattening to rank 1 (rather than (batch * 2, 1) followed by an
    # axis-less squeeze) keeps the result rank 1 even for a batch of one,
    # which tf.nn.top_k below requires.
    cls_prob_flat = tf.reshape(cls_prob, [-1])
    # Use the dynamic shape so an unknown static batch dimension also works.
    num_row = tf.shape(cls_prob)[0]
    indices_ = tf.range(num_row) * 2 + label_int
    label_prob = tf.gather(cls_prob_flat, indices_)

    # Negative log-likelihood; the epsilon avoids log(0).
    loss = -tf.log(label_prob + 1e-10)

    # Mask: 1 for samples with label >= 0, 0 for the ignored ones.
    mask_zeros = tf.zeros_like(label_prob, dtype=tf.float32)
    mask_ones = tf.ones_like(label_prob, dtype=tf.float32)
    valid_inds = tf.where(ignored, mask_zeros, mask_ones)
    num_valid = tf.reduce_sum(valid_inds)
    keep_num = tf.cast(num_valid * num_keep_radio, dtype=tf.int32)

    # Zero the loss of invalid samples, then keep only the hardest ones.
    loss = loss * valid_inds
    loss, _ = tf.nn.top_k(loss, k=keep_num)

    # Equivalent to reduce_mean over the kept losses, but yields 0 rather
    # than NaN when keep_num is 0 (no valid sample in the batch).
    return tf.reduce_sum(loss) / tf.maximum(tf.cast(keep_num, tf.float32), 1.0)
2 边框回归损失bbox_ohem 这里的实现用的是平方误差(对4个偏移量的平方差求和),也可以换成 smooth L1(即 Fast R-CNN 中提出的 smooth L1 损失)
#label=1 or label=-1 then do regression
def bbox_ohem(bbox_pred,bbox_target,label):
    """Squared-error box regression loss over samples with |label| == 1.

    Args:
        bbox_pred: (batch, 4) predicted box offsets.
        bbox_target: (batch, 4) target box offsets.
        label: (batch,) labels; only entries equal to 1 or -1 are regressed.

    Returns:
        Scalar tensor: mean over the valid samples of the per-sample sum of
        squared errors. Returns 0 (instead of NaN) when the batch contains
        no valid sample.
    """
    zeros_index = tf.zeros_like(label, dtype=tf.float32)
    ones_index = tf.ones_like(label, dtype=tf.float32)
    # 1.0 where label is 1 or -1, else 0.0.
    valid_inds = tf.where(tf.equal(tf.abs(label), 1), ones_index, zeros_index)

    # Per-sample sum of squared errors, shape (batch,).
    square_error = tf.square(bbox_pred - bbox_target)
    square_error = tf.reduce_sum(square_error, axis=1)

    # Every valid sample is kept: no keep ratio is applied here, so top_k
    # only serves to drop the masked-out entries.
    num_valid = tf.reduce_sum(valid_inds)
    keep_num = tf.cast(num_valid, dtype=tf.int32)

    # Zero the error of invalid samples, then pick the keep_num largest.
    square_error = square_error * valid_inds
    _, k_index = tf.nn.top_k(square_error, k=keep_num)
    square_error = tf.gather(square_error, k_index)

    # Equivalent to reduce_mean over the selected errors, but yields 0
    # rather than NaN when no sample is valid.
    return tf.reduce_sum(square_error) / tf.maximum(num_valid, 1.0)
3 就是landmark损失 landmark_ohem
def landmark_ohem(landmark_pred,landmark_target,label):
    """Squared-error landmark regression loss over samples with label == -2.

    :param landmark_pred: (batch, 10) predicted landmark coordinates.
    :param landmark_target: (batch, 10) target landmark coordinates.
    :param label: (batch,) labels; only entries equal to -2 are regressed.
    :return: scalar tensor, the mean over the valid samples of the per-sample
        sum of squared errors; 0 (instead of NaN) when the batch contains no
        landmark sample.
    """
    # 1.0 where label is -2 (landmark sample), else 0.0.
    ones = tf.ones_like(label, dtype=tf.float32)
    zeros = tf.zeros_like(label, dtype=tf.float32)
    valid_inds = tf.where(tf.equal(label, -2), ones, zeros)

    # Per-sample sum of squared errors, shape (batch,).
    square_error = tf.square(landmark_pred - landmark_target)
    square_error = tf.reduce_sum(square_error, axis=1)

    # Every valid sample is kept: no keep ratio is applied here, so top_k
    # only serves to drop the masked-out entries.
    num_valid = tf.reduce_sum(valid_inds)
    keep_num = tf.cast(num_valid, dtype=tf.int32)

    # Zero the error of invalid samples, then pick the keep_num largest.
    square_error = square_error * valid_inds
    _, k_index = tf.nn.top_k(square_error, k=keep_num)
    square_error = tf.gather(square_error, k_index)

    # Equivalent to reduce_mean over the selected errors, but yields 0
    # rather than NaN when no sample is valid.
    return tf.reduce_sum(square_error) / tf.maximum(num_valid, 1.0)
最后,RNet和ONet的网络结构和PNet基本差不多,最大的差别就是RNet和ONet不是全卷积网络,最后接了全连接层(fc)
def R_Net(inputs,label=None,bbox_target=None,landmark_target=None,training=True):
    """Build the R-Net graph: three conv layers followed by dense heads.

    Args:
        inputs: Image batch tensor fed to the first convolution.
        label: Per-sample labels, forwarded to the loss helpers and to
            ``cal_accuracy``. Only used when ``training`` is true.
        bbox_target: Box regression targets, forwarded to ``bbox_ohem``.
        landmark_target: Landmark regression targets, forwarded to
            ``landmark_ohem``.
        training: When true, return the losses; otherwise return the
            predictions.

    Returns:
        When ``training`` is true, the tuple
        ``(cls_loss, bbox_loss, landmark_loss, L2_loss, accuracy)``.
        Otherwise ``(cls_prob, bbox_pred, landmark_pred)``.
    """
    # Defaults applied to slim.conv2d only; the fully connected layers are
    # not part of this scope and use slim's own defaults for anything not
    # passed explicitly.
    conv_defaults = dict(
        activation_fn=prelu,
        weights_initializer=slim.xavier_initializer(),
        biases_initializer=tf.zeros_initializer(),
        weights_regularizer=slim.l2_regularizer(0.0005),
        padding='valid',
    )
    with slim.arg_scope([slim.conv2d], **conv_defaults):
        # Convolutional trunk.
        features = slim.conv2d(inputs, num_outputs=28, kernel_size=[3, 3], stride=1, scope="conv1")
        features = slim.max_pool2d(features, kernel_size=[3, 3], stride=2, scope="pool1", padding='SAME')
        features = slim.conv2d(features, num_outputs=48, kernel_size=[3, 3], stride=1, scope="conv2")
        features = slim.max_pool2d(features, kernel_size=[3, 3], stride=2, scope="pool2")
        features = slim.conv2d(features, num_outputs=64, kernel_size=[2, 2], stride=1, scope="conv3")

        # Unlike P-Net, the heads are dense layers on a flattened feature map.
        hidden = slim.fully_connected(slim.flatten(features), num_outputs=128, scope="fc1")
        # batch*2: softmax class probabilities.
        cls_prob = slim.fully_connected(hidden, num_outputs=2, scope="cls_fc",
                                        activation_fn=tf.nn.softmax)
        # batch*4: box offsets (linear output).
        bbox_pred = slim.fully_connected(hidden, num_outputs=4, scope="bbox_fc",
                                         activation_fn=None)
        # batch*10: landmark coordinates (linear output).
        landmark_pred = slim.fully_connected(hidden, num_outputs=10, scope="landmark_fc",
                                             activation_fn=None)

        # Inference: hand back the raw predictions.
        if not training:
            return cls_prob, bbox_pred, landmark_pred

        # Training: build the three task losses, accuracy and the L2 term.
        cls_loss = cls_ohem(cls_prob, label)
        bbox_loss = bbox_ohem(bbox_pred, bbox_target, label)
        accuracy = cal_accuracy(cls_prob, label)
        landmark_loss = landmark_ohem(landmark_pred, landmark_target, label)
        L2_loss = tf.add_n(slim.losses.get_regularization_losses())
        return cls_loss, bbox_loss, landmark_loss, L2_loss, accuracy