SE
Squeeze-and-Excitation Networks
The feature map is first squeezed by global average pooling, then passed through a dimensionality-reducing and dimensionality-restoring transform; a sigmoid produces per-channel weights that recalibrate the features, and these weights are finally multiplied back onto the input.
import torch
import torch.nn as nn

class SELayer(nn.Module):
    def __init__(self, channel, reduction=16):
        super(SELayer, self).__init__()
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.fc = nn.Sequential(
            nn.Linear(channel, channel // reduction, bias=False),
            nn.ReLU(inplace=True),
            nn.Linear(channel // reduction, channel, bias=False),
            nn.Sigmoid()
        )

    def forward(self, x):
        b, c, _, _ = x.size()
        y = self.avg_pool(x).view(b, c)     # squeeze: (b, c)
        y = self.fc(y).view(b, c, 1, 1)     # excitation: per-channel weights in (0, 1)
        return x * y.expand_as(x)           # recalibrate the input
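A quick sanity check (a hypothetical snippet, not from the original post) shows that the layer preserves the input shape, which is what makes it a drop-in addition:

# SELayer keeps the (b, c, h, w) shape unchanged.
x = torch.randn(2, 64, 32, 32)
se = SELayer(channel=64, reduction=16)
print(se(x).shape)   # torch.Size([2, 64, 32, 32])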
When the SE module is combined with a residual block, it is usually placed on the residual branch, just before the addition with the identity input. This reweights the features learned by the residual branch and strengthens the representation of high-level features.
class SEBasicBlock(nn.Module):
    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
                 base_width=64, dilation=1, norm_layer=None,
                 *, reduction=16):
        super(SEBasicBlock, self).__init__()
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=3, stride=stride, padding=1)
        self.bn1 = nn.BatchNorm2d(planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1)
        self.bn2 = nn.BatchNorm2d(planes)
        self.se = SELayer(planes, reduction)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        residual = x
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.se(self.bn2(self.conv2(out)))   # use SE
        if self.downsample is not None:
            residual = self.downsample(x)
        out += residual
        out = self.relu(out)
        return out
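A minimal usage sketch (my own example, not part of the original code): when the block changes the resolution or the channel count, a 1x1-conv downsample branch must be supplied so that the residual addition still lines up.

# Hypothetical stride-2 block going from 64 to 128 channels.
downsample = nn.Sequential(
    nn.Conv2d(64, 128, kernel_size=1, stride=2, bias=False),
    nn.BatchNorm2d(128),
)
block = SEBasicBlock(inplanes=64, planes=128, stride=2, downsample=downsample)
print(block(torch.randn(2, 64, 56, 56)).shape)   # torch.Size([2, 128, 28, 28])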
ECA
Efficient Channel Attention for Deep Convolutional Neural Networks
The authors show experimentally that the dimensionality reduction in SENet hurts channel attention prediction. They propose a local cross-channel interaction strategy without dimensionality reduction, replacing the MLP in SENet with a 1D convolution.
class ECALayer(nn.Module):
    """Constructs an ECA module.
    Args:
        channel: Number of channels of the input feature map
        k_size: Adaptive selection of kernel size
    """
    def __init__(self, channel, k_size=3):
        super(ECALayer, self).__init__()
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.conv = nn.Conv1d(1, 1, kernel_size=k_size, padding=(k_size - 1) // 2, bias=False)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        # x: input features with shape [b, c, h, w]
        b, c, h, w = x.size()
        # feature descriptor on the global spatial information
        y = self.avg_pool(x)                      # [b, c, 1, 1]
        # 1D convolution across channels (no dimensionality reduction)
        y = y.squeeze(-1).transpose(-1, -2)       # [b, c, 1] => [b, 1, c]
        y = self.conv(y)                          # [b, 1, c]
        y = y.transpose(-1, -2).unsqueeze(-1)     # [b, c, 1] => [b, c, 1, 1]
        # channel weights
        y = self.sigmoid(y)
        return x * y.expand_as(x)
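k_size is fixed to 3 above; the ECA paper also gives an adaptive rule that ties the kernel size to the channel count, roughly the nearest odd number to (log2(C) + b) / gamma with gamma = 2 and b = 1. A small helper along those lines (my sketch of that rule, not code from the post):

import math

def eca_kernel_size(channel, gamma=2, b=1):
    # nearest odd number to (log2(C) + b) / gamma
    t = int(abs((math.log2(channel) + b) / gamma))
    return t if t % 2 else t + 1

eca = ECALayer(channel=256, k_size=eca_kernel_size(256))   # k_size = 5 for 256 channels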
CA
Coordinate Attention for Efficient Mobile Network Design
Instead of global average pooling, the feature map is pooled separately along the X and Y directions; the two results are concatenated and passed through a dimensionality-reducing transform, and the dimension is then restored separately for each direction.
import torch.nn.functional as F

class h_swish(nn.Module):
    def forward(self, x):
        return x * F.relu6(x + 3, inplace=True) / 6

class CoordAtt(nn.Module):
    def __init__(self, inp, oup, groups=32):
        super(CoordAtt, self).__init__()
        self.pool_h = nn.AdaptiveAvgPool2d((None, 1))   # pool along W -> (n, c, h, 1)
        self.pool_w = nn.AdaptiveAvgPool2d((1, None))   # pool along H -> (n, c, 1, w)
        mip = max(8, inp // groups)
        self.conv1 = nn.Conv2d(inp, mip, kernel_size=1, stride=1, padding=0)
        self.bn1 = nn.BatchNorm2d(mip)
        self.conv2 = nn.Conv2d(mip, oup, kernel_size=1, stride=1, padding=0)
        self.conv3 = nn.Conv2d(mip, oup, kernel_size=1, stride=1, padding=0)
        self.relu = h_swish()

    def forward(self, x):
        identity = x
        n, c, h, w = x.size()
        x_h = self.pool_h(x)                        # (n, c, h, 1)
        x_w = self.pool_w(x).permute(0, 1, 3, 2)    # (n, c, w, 1)
        y = torch.cat([x_h, x_w], dim=2)            # (n, c, h + w, 1)
        y = self.conv1(y)
        y = self.bn1(y)
        y = self.relu(y)
        x_h, x_w = torch.split(y, [h, w], dim=2)
        x_w = x_w.permute(0, 1, 3, 2)
        x_h = self.conv2(x_h).sigmoid()             # attention along the height axis
        x_w = self.conv3(x_w).sigmoid()             # attention along the width axis
        x_h = x_h.expand(-1, -1, h, w)
        x_w = x_w.expand(-1, -1, h, w)
        y = identity * x_w * x_h                    # requires oup == inp
        return y
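In the typical use case inp and oup are equal, so the two direction-aware attention maps can be multiplied back onto the identity branch; a quick hypothetical check:

ca = CoordAtt(inp=64, oup=64)
print(ca(torch.randn(2, 64, 32, 32)).shape)   # torch.Size([2, 64, 32, 32])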
CBAM
Convolutional Block Attention Module
CBAM combines channel attention with spatial attention, and uses both global average pooling and max pooling. For spatial attention, the maximum and mean are taken over the channel dimension, the two maps are fused by a convolution, and a sigmoid then computes the weights.
# Channel attention module
class ChannelAttention(nn.Module):
    def __init__(self, in_planes, ratio=16):
        super(ChannelAttention, self).__init__()
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.max_pool = nn.AdaptiveMaxPool2d(1)
        self.fc1 = nn.Conv2d(in_planes, in_planes // ratio, 1, bias=False)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Conv2d(in_planes // ratio, in_planes, 1, bias=False)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        avg_out = self.fc2(self.relu1(self.fc1(self.avg_pool(x))))   # (b, c, 1, 1)
        max_out = self.fc2(self.relu1(self.fc1(self.max_pool(x))))   # (b, c, 1, 1)
        out = avg_out + max_out
        return self.sigmoid(out)

# Spatial attention module
class SpatialAttention(nn.Module):
    def __init__(self, kernel_size=7):
        super(SpatialAttention, self).__init__()
        assert kernel_size in (3, 7), 'kernel size must be 3 or 7'
        padding = 3 if kernel_size == 7 else 1
        self.conv1 = nn.Conv2d(2, 1, kernel_size, padding=padding, bias=False)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        avg_out = torch.mean(x, dim=1, keepdim=True)     # (b, 1, h, w)
        max_out, _ = torch.max(x, dim=1, keepdim=True)   # (b, 1, h, w)
        x = torch.cat([avg_out, max_out], dim=1)         # (b, 2, h, w)
        x = self.conv1(x)                                # (b, 1, h, w)
        return self.sigmoid(x)

class CBAM(nn.Module):
    def __init__(self, in_planes, ratio=16, kernel_size=7):
        super(CBAM, self).__init__()
        self.ca = ChannelAttention(in_planes, ratio)
        self.sa = SpatialAttention(kernel_size)

    def forward(self, x):
        out = x * self.ca(x)          # (b, c, h, w) * (b, c, 1, 1) = (b, c, h, w)
        result = out * self.sa(out)   # (b, c, h, w) * (b, 1, h, w) = (b, c, h, w)
        return result
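CBAM is usually applied after a convolution stage (or on the residual branch of a block), channel attention first and spatial attention second, which is exactly the order hard-coded in forward above. A minimal sketch of wrapping a conv stage (the layer sizes here are my own illustration):

stage = nn.Sequential(
    nn.Conv2d(3, 64, kernel_size=3, padding=1, bias=False),
    nn.BatchNorm2d(64),
    nn.ReLU(inplace=True),
    CBAM(in_planes=64, ratio=16, kernel_size=7),
)
print(stage(torch.randn(2, 3, 32, 32)).shape)   # torch.Size([2, 64, 32, 32])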
SRM
A Style-based Recalibration Module for Convolutional Neural Networks
Style pooling extracts the mean and standard deviation of each channel; the two statistics are then fused by a channel-wise 1D convolution, which outputs a weight for each channel.
class SRMLayer(nn.Module):
    def __init__(self, channel, reduction=None):
        # Reduction for compatibility with layer_block interface
        super(SRMLayer, self).__init__()
        # CFC: channel-wise fully connected layer
        self.cfc = nn.Conv1d(channel, channel, kernel_size=2, bias=False,
                             groups=channel)
        self.bn = nn.BatchNorm1d(channel)

    def forward(self, x):
        b, c, _, _ = x.size()
        # Style pooling
        mean = x.view(b, c, -1).mean(-1).unsqueeze(-1)
        std = x.view(b, c, -1).std(-1).unsqueeze(-1)
        u = torch.cat((mean, std), -1)   # (b, c, 2)
        # Style integration
        z = self.cfc(u)                  # (b, c, 1)
        z = self.bn(z)
        g = torch.sigmoid(z)
        g = g.view(b, c, 1, 1)
        return x * g.expand_as(x)
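To make the "style" statistics concrete: the pooling step is nothing more than the per-channel mean and standard deviation over the spatial dimensions, and the output again keeps the input shape (a hypothetical check, not from the paper's code):

x = torch.randn(2, 64, 32, 32)
print(x.view(2, 64, -1).mean(-1).shape)   # per-channel mean: torch.Size([2, 64])
print(x.view(2, 64, -1).std(-1).shape)    # per-channel std:  torch.Size([2, 64])
print(SRMLayer(channel=64)(x).shape)      # torch.Size([2, 64, 32, 32])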
SimAM
A Simple, Parameter-Free Attention Module for Convolutional Neural Networks
A parameter-free attention mechanism: attention weights are generated dynamically by evaluating an energy function over the feature map, so no extra parameters are introduced.
class SimAM(nn.Module):
    # X: input feature [N, C, H, W]
    # e_lambda: coefficient lambda in Eqn (5); passed to the constructor because
    #           `lambda` is a reserved keyword in Python
    def __init__(self, e_lambda=1e-4):
        super(SimAM, self).__init__()
        self.e_lambda = e_lambda

    def forward(self, X):
        # spatial size minus one
        n = X.shape[2] * X.shape[3] - 1
        # square of (t - u)
        d = (X - X.mean(dim=[2, 3], keepdim=True)).pow(2)
        # d.sum() / n is channel variance
        v = d.sum(dim=[2, 3], keepdim=True) / n
        # E_inv groups all importance of X
        E_inv = d / (4 * (v + self.e_lambda)) + 0.5
        # return attended features
        return X * torch.sigmoid(E_inv)
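Because the module defines no learnable tensors, the parameter-free claim is easy to verify directly (a small hypothetical check):

sim = SimAM(e_lambda=1e-4)
print(sum(p.numel() for p in sim.parameters()))   # 0 -- no extra parameters
print(sim(torch.randn(2, 64, 32, 32)).shape)      # torch.Size([2, 64, 32, 32])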
SOCA
Second-order Attention Network for Single Image Super-resolution
A second-order channel attention mechanism. The computation is relatively involved; the full code is available on GitHub and depends on the MPN-COV covariance pooling layers.
class SOCA(nn.Module):
    def __init__(self, channel, reduction=8):
        super(SOCA, self).__init__()
        # global average pooling: feature --> point
        # self.avg_pool = nn.AdaptiveAvgPool2d(1)
        # self.max_pool = nn.AdaptiveMaxPool2d(1)
        self.max_pool = nn.MaxPool2d(kernel_size=2)
        # feature channel downscale and upscale --> channel weight
        self.conv_du = nn.Sequential(
            nn.Conv2d(channel, channel // reduction, 1, padding=0, bias=True),
            nn.ReLU(inplace=True),
            nn.Conv2d(channel // reduction, channel, 1, padding=0, bias=True),
            nn.Sigmoid()
            # nn.BatchNorm2d(channel)
        )

    def forward(self, x):
        batch_size, C, h, w = x.shape   # x: NxCxHxW
        N = int(h * w)
        min_h = min(h, w)
        h1 = 1000
        w1 = 1000
        # crop the feature map to at most 1000 x 1000 before covariance pooling
        if h < h1 and w < w1:
            x_sub = x
        elif h < h1 and w > w1:
            # H = (h - h1) // 2
            W = (w - w1) // 2
            x_sub = x[:, :, :, W:(W + w1)]
        elif w < w1 and h > h1:
            H = (h - h1) // 2
            # W = (w - w1) // 2
            x_sub = x[:, :, H:H + h1, :]
        else:
            H = (h - h1) // 2
            W = (w - w1) // 2
            x_sub = x[:, :, H:(H + h1), W:(W + w1)]
        # MPN-COV
        cov_mat = MPNCOV.CovpoolLayer(x_sub)          # global covariance pooling layer
        cov_mat_sqrt = MPNCOV.SqrtmLayer(cov_mat, 5)  # matrix square root layer (pre-norm, Newton-Schulz iteration and post-compensation, 5 iterations)
        cov_mat_sum = torch.mean(cov_mat_sqrt, 1)
        cov_mat_sum = cov_mat_sum.view(batch_size, C, 1, 1)
        y_cov = self.conv_du(cov_mat_sum)
        return y_cov * x
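MPNCOV.CovpoolLayer and MPNCOV.SqrtmLayer come from the MPN-COV repository and are not reproduced here. Conceptually, the covariance pooling step computes a C x C channel covariance matrix from the flattened feature map; a rough, simplified sketch of that step (my own approximation, ignoring the iterative matrix square root) is:

def channel_covariance(x):
    # x: (b, c, h, w) -> per-image channel covariance of shape (b, c, c)
    b, c, h, w = x.shape
    feat = x.reshape(b, c, h * w)
    feat = feat - feat.mean(dim=2, keepdim=True)   # center each channel
    return feat @ feat.transpose(1, 2) / (h * w - 1)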
EMA
Efficient Multi-Scale Attention Module with Cross-Spatial Learning
Feature grouping, parallel sub-networks, and cross-spatial learning.
class EMA_attention(nn.Module):
    def __init__(self, channels, c2=None, factor=32):
        super(EMA_attention, self).__init__()
        self.groups = factor
        assert channels // self.groups > 0
        self.softmax = nn.Softmax(-1)
        self.agp = nn.AdaptiveAvgPool2d((1, 1))
        self.pool_h = nn.AdaptiveAvgPool2d((None, 1))
        self.pool_w = nn.AdaptiveAvgPool2d((1, None))
        self.gn = nn.GroupNorm(channels // self.groups, channels // self.groups)
        self.conv1x1 = nn.Conv2d(channels // self.groups, channels // self.groups, kernel_size=1, stride=1, padding=0)
        self.conv3x3 = nn.Conv2d(channels // self.groups, channels // self.groups, kernel_size=3, stride=1, padding=1)

    def forward(self, x):
        b, c, h, w = x.size()
        group_x = x.reshape(b * self.groups, -1, h, w)    # b*g, c//g, h, w
        x_h = self.pool_h(group_x)                        # b*g, c//g, h, 1
        x_w = self.pool_w(group_x).permute(0, 1, 3, 2)    # b*g, c//g, w, 1
        hw = self.conv1x1(torch.cat([x_h, x_w], dim=2))   # b*g, c//g, h+w, 1
        x_h, x_w = torch.split(hw, [h, w], dim=2)
        x1 = self.gn(group_x * x_h.sigmoid() * x_w.permute(0, 1, 3, 2).sigmoid())   # b*g, c//g, h, w
        x2 = self.conv3x3(group_x)                        # b*g, c//g, h, w
        x11 = self.softmax(self.agp(x1).reshape(b * self.groups, -1, 1).permute(0, 2, 1))
        x12 = x2.reshape(b * self.groups, c // self.groups, -1)   # b*g, c//g, hw
        x21 = self.softmax(self.agp(x2).reshape(b * self.groups, -1, 1).permute(0, 2, 1))
        x22 = x1.reshape(b * self.groups, c // self.groups, -1)   # b*g, c//g, hw
        weights = (torch.matmul(x11, x12) + torch.matmul(x21, x22)).reshape(b * self.groups, 1, h, w)
        return (group_x * weights.sigmoid()).reshape(b, c, h, w)
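factor is the number of feature groups, so channels must be divisible by it; a quick hypothetical check:

ema = EMA_attention(channels=64, factor=32)    # 64 channels split into 32 groups of 2
print(ema(torch.randn(2, 64, 32, 32)).shape)   # torch.Size([2, 64, 32, 32])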
ShuffleAttention
SA-Net: Shuffle Attention for Deep Convolutional Neural Networks
Shuffle Attention first divides the channels into groups; each group is split into two branches, one computing channel attention and the other spatial attention. The two branches are concatenated, a channel shuffle is applied, and the result is output.
from torch.nn import Parameter

class ShuffleAttention(nn.Module):
    """Constructs a Channel-Spatial Group (Shuffle Attention) module.
    Args:
        channel: Number of channels of the input feature map
        groups: Number of groups the channels are divided into
    """
    def __init__(self, channel, groups=8):
        super().__init__()
        self.groups = groups
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.cweight = Parameter(torch.zeros(1, channel // (2 * groups), 1, 1))
        self.cbias = Parameter(torch.ones(1, channel // (2 * groups), 1, 1))
        self.sweight = Parameter(torch.zeros(1, channel // (2 * groups), 1, 1))
        self.sbias = Parameter(torch.ones(1, channel // (2 * groups), 1, 1))
        self.sigmoid = nn.Sigmoid()
        self.gn = nn.GroupNorm(channel // (2 * groups), channel // (2 * groups))

    @staticmethod
    def channel_shuffle(x, groups):
        b, c, h, w = x.shape
        x = x.reshape(b, groups, -1, h, w)
        x = x.permute(0, 2, 1, 3, 4)
        # flatten
        x = x.reshape(b, -1, h, w)
        return x

    def forward(self, x):
        b, c, h, w = x.shape
        x = x.reshape(b * self.groups, -1, h, w)   # (b*g, c//g, h, w)
        x_0, x_1 = x.chunk(2, dim=1)               # two branches of (b*g, c//(2*g), h, w)
        # channel attention
        xn = self.avg_pool(x_0)                    # (b*g, c//(2*g), 1, 1)
        xn = self.cweight * xn + self.cbias
        xn = x_0 * self.sigmoid(xn)
        # spatial attention
        xs = self.gn(x_1)                          # (b*g, c//(2*g), h, w)
        xs = self.sweight * xs + self.sbias
        xs = x_1 * self.sigmoid(xs)
        # concatenate along channel axis
        out = torch.cat([xn, xs], dim=1)           # (b*g, c//g, h, w)
        out = out.reshape(b, -1, h, w)             # (b, c, h, w)
        out = self.channel_shuffle(out, 2)
        return out
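channel must be divisible by 2 * groups so that each group can be split into its channel branch and its spatial branch; for example (a quick hypothetical check):

sa = ShuffleAttention(channel=64, groups=8)   # 64 / (2 * 8) = 4 channels per branch
print(sa(torch.randn(2, 64, 32, 32)).shape)   # torch.Size([2, 64, 32, 32])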
Summary:
All of these CNN attention mechanisms let the network focus dynamically on the important parts of the input, improving both accuracy and efficiency. Each module's output has the same shape as its input, so they can be conveniently inserted into existing models and swapped for one another to find the best fit, as sketched below.
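Since every module above maps a (b, c, h, w) tensor to a tensor of the same shape, one convenient pattern is to pass the attention module into a block as an argument and swap it freely. The block below is only an illustrative sketch (the name ConvAttnBlock and its layout are my own, not from any of the papers):

class ConvAttnBlock(nn.Module):
    def __init__(self, in_ch, out_ch, attention=None):
        super().__init__()
        self.conv = nn.Conv2d(in_ch, out_ch, kernel_size=3, padding=1, bias=False)
        self.bn = nn.BatchNorm2d(out_ch)
        self.relu = nn.ReLU(inplace=True)
        # any of the modules above (SELayer, ECALayer, CBAM, SimAM, ...) fits here
        self.attn = attention if attention is not None else nn.Identity()

    def forward(self, x):
        return self.attn(self.relu(self.bn(self.conv(x))))

block = ConvAttnBlock(3, 64, attention=SELayer(64))   # or ECALayer(64), CBAM(64), ...
print(block(torch.randn(2, 3, 32, 32)).shape)         # torch.Size([2, 64, 32, 32])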