
Attention Mechanisms in CNNs

SE

Squeeze-and-Excitation Networks

The SE module applies global average pooling, followed by a reduce-then-restore (squeeze-and-excitation) transformation and a sigmoid that recalibrates the features, producing a separate weight for each channel. These weights are then multiplied back onto the input.

import torch
import torch.nn as nn


class SELayer(nn.Module):
    def __init__(self, channel, reduction=16):
        super(SELayer, self).__init__()
        # Squeeze: global average pooling over the spatial dimensions
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        # Excitation: reduce to channel // reduction, then restore to channel
        self.fc = nn.Sequential(
            nn.Linear(channel, channel // reduction, bias=False),
            nn.ReLU(inplace=True),
            nn.Linear(channel // reduction, channel, bias=False),
            nn.Sigmoid()
        )

    def forward(self, x):
        b, c, _, _ = x.size()
        y = self.avg_pool(x).view(b, c)      # (b, c)
        y = self.fc(y).view(b, c, 1, 1)      # per-channel weights
        return x * y.expand_as(x)            # recalibrate the input
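A minimal usage sketch (illustrative only, not from the original paper): the module preserves the input shape, so it can be placed after any convolutional stage, and reduction controls the width of the bottleneck in the excitation MLP.

# Minimal usage sketch: SELayer is shape-preserving.
x = torch.randn(2, 64, 32, 32)          # (b, c, h, w)
se = SELayer(channel=64, reduction=16)
print(se(x).shape)                      # torch.Size([2, 64, 32, 32])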

When combined with a residual block, the SE module is usually placed on the residual branch, right before the addition with the identity input. This lets it reweight the features learned by the residual branch and strengthen the expressiveness of the high-level features.

class SEBasicBlock(nn.Module):
    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
                 base_width=64, dilation=1, norm_layer=None,
                 *, reduction=16):
        # groups, base_width, dilation and norm_layer are unused here; they are
        # kept so the signature matches the standard ResNet BasicBlock.
        super(SEBasicBlock, self).__init__()
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=3, stride=stride, padding=1)
        self.bn1 = nn.BatchNorm2d(planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1)
        self.bn2 = nn.BatchNorm2d(planes)
        self.se = SELayer(planes, reduction)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        residual = x
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.se(self.bn2(self.conv2(out)))  # apply SE on the residual branch

        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)

        return out

ECA

Efficient Channel Attention for Deep Convolutional Neural Networks

The authors show experimentally that the dimensionality reduction in SENet has a negative effect on channel attention prediction. They propose a local cross-channel interaction strategy without dimensionality reduction, replacing the MLP in SENet with a 1D convolution.

class ECALayer(nn.Module):
    """Constructs an ECA module.
    Args:
        channel: Number of channels of the input feature map
        k_size: Adaptive selection of kernel size
    """
    def __init__(self, channel, k_size=3):
        super(ECALayer, self).__init__()
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.conv = nn.Conv1d(1, 1, kernel_size=k_size, padding=(k_size - 1) // 2, bias=False)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        # x: input features with shape [b, c, h, w]
        b, c, h, w = x.size()

        # feature descriptor on the global spatial information
        y = self.avg_pool(x)                    # [b, c, 1, 1]

        # local cross-channel interaction with a 1D convolution
        y = y.squeeze(-1).transpose(-1, -2)     # [b, c, 1, 1] -> [b, 1, c]
        y = self.conv(y)                        # [b, 1, c]
        y = y.transpose(-1, -2).unsqueeze(-1)   # [b, 1, c] -> [b, c, 1, 1]

        # channel weights
        y = self.sigmoid(y)

        return x * y.expand_as(x)
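In the ECA paper the kernel size k is not fixed at 3 but chosen adaptively from the channel count, k = |log2(C)/γ + b/γ| rounded to the nearest odd number. A small sketch of that rule, assuming the paper's defaults γ = 2 and b = 1:

import math

def eca_kernel_size(channel, gamma=2, b=1):
    # adaptive kernel size: k = |log2(C)/gamma + b/gamma|, forced to be odd
    t = int(abs((math.log2(channel) + b) / gamma))
    return t if t % 2 else t + 1

eca = ECALayer(channel=256, k_size=eca_kernel_size(256))  # k_size = 5 for 256 channels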

CA

Coordinate Attention for Efficient Mobile Network Design

Instead of a single global average pooling, the authors pool along the X and Y directions separately, concatenate the two results and reduce the dimensionality, then split them again and restore the dimensionality for each direction.

import torch.nn.functional as F


class h_swish(nn.Module):
    def forward(self, x):
        return x * F.relu6(x + 3, inplace=True) / 6


class CoordAtt(nn.Module):
    def __init__(self, inp, oup, groups=32):
        super(CoordAtt, self).__init__()
        # pool along each spatial direction separately
        self.pool_h = nn.AdaptiveAvgPool2d((None, 1))   # (n, c, h, 1)
        self.pool_w = nn.AdaptiveAvgPool2d((1, None))   # (n, c, 1, w)

        mip = max(8, inp // groups)

        self.conv1 = nn.Conv2d(inp, mip, kernel_size=1, stride=1, padding=0)
        self.bn1 = nn.BatchNorm2d(mip)
        self.conv2 = nn.Conv2d(mip, oup, kernel_size=1, stride=1, padding=0)
        self.conv3 = nn.Conv2d(mip, oup, kernel_size=1, stride=1, padding=0)
        self.relu = h_swish()

    def forward(self, x):
        identity = x
        n, c, h, w = x.size()
        x_h = self.pool_h(x)                        # (n, c, h, 1)
        x_w = self.pool_w(x).permute(0, 1, 3, 2)    # (n, c, w, 1)

        y = torch.cat([x_h, x_w], dim=2)            # (n, c, h + w, 1)
        y = self.conv1(y)                           # reduce to mip channels
        y = self.bn1(y)
        y = self.relu(y)
        x_h, x_w = torch.split(y, [h, w], dim=2)
        x_w = x_w.permute(0, 1, 3, 2)               # back to (n, mip, 1, w)

        x_h = self.conv2(x_h).sigmoid()             # (n, oup, h, 1)
        x_w = self.conv3(x_w).sigmoid()             # (n, oup, 1, w)
        x_h = x_h.expand(-1, -1, h, w)
        x_w = x_w.expand(-1, -1, h, w)

        y = identity * x_w * x_h

        return y
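A minimal usage sketch (illustrative only). Because the attention weights are multiplied onto the identity input, inp and oup are set to the same value in practice:

# Minimal usage sketch: inp == oup so the weights broadcast against the input.
x = torch.randn(2, 64, 32, 32)
ca = CoordAtt(inp=64, oup=64, groups=32)
print(ca(x).shape)   # torch.Size([2, 64, 32, 32])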

CBAM

Convolutional Block Attention Module

CBAM combines channel attention and spatial attention, and uses both global average pooling and max pooling. For spatial attention, the max and mean are taken over the channel dimension, the two maps are fused with a convolution, and the weights are computed with a sigmoid.

# Channel attention module
class ChannelAttention(nn.Module):
    def __init__(self, in_planes, ratio=16):
        super(ChannelAttention, self).__init__()
        self.avg_pool = nn.AdaptiveAvgPool2d(1)  
        self.max_pool = nn.AdaptiveMaxPool2d(1)  

        self.fc1 = nn.Conv2d(in_planes, in_planes // ratio, 1, bias=False)  
        self.relu1 = nn.ReLU()  
        self.fc2 = nn.Conv2d(in_planes // ratio, in_planes, 1, bias=False)  
        self.sigmoid = nn.Sigmoid()  

    def forward(self, x):
        avg_out = self.fc2(self.relu1(self.fc1(self.avg_pool(x)))) # (b,c,1,1) 
        max_out = self.fc2(self.relu1(self.fc1(self.max_pool(x))))  # (b,c,1,1)
        out = avg_out + max_out  
        return self.sigmoid(out)  


# Spatial attention module
class SpatialAttention(nn.Module):
    def __init__(self, kernel_size=7):
        super(SpatialAttention, self).__init__()

        assert kernel_size in (3, 7), 'kernel size must be 3 or 7' 
        padding = 3 if kernel_size == 7 else 1  

        self.conv1 = nn.Conv2d(2, 1, kernel_size, padding=padding, bias=False)
        self.sigmoid = nn.Sigmoid() 
        
    def forward(self, x):
        avg_out = torch.mean(x, dim=1, keepdim=True) #(b,1,h,w)
        max_out, _ = torch.max(x, dim=1, keepdim=True)  #(b,1,h,w)
        x = torch.cat([avg_out, max_out], dim=1)  #(b,2,h,w)
        x = self.conv1(x)   #(b,1,h,w)
        return self.sigmoid(x)  


class CBAM(nn.Module):
    def __init__(self, in_planes, ratio=16, kernel_size=7):
        super(CBAM, self).__init__()
        self.ca = ChannelAttention(in_planes, ratio)  
        self.sa = SpatialAttention(kernel_size) 

    def forward(self, x):
        out = x * self.ca(x)  #(b,c,h,w) * (b,c,1,1) = (b,c,h,w)
        result = out * self.sa(out) #(b,c,h,w) * (b,1,h,w) = (b,c,h,w)
        return result  
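Like SE, CBAM keeps the feature shape unchanged and is typically applied on the residual branch before the addition. A hypothetical sketch mirroring the SEBasicBlock above (not code from the paper):

# Hypothetical sketch: CBAM dropped into the residual branch of a basic block,
# in the same position where SELayer was used above.
class CBAMBasicBlock(nn.Module):
    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(CBAMBasicBlock, self).__init__()
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=3, stride=stride, padding=1)
        self.bn1 = nn.BatchNorm2d(planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1)
        self.bn2 = nn.BatchNorm2d(planes)
        self.cbam = CBAM(planes)
        self.downsample = downsample

    def forward(self, x):
        residual = x if self.downsample is None else self.downsample(x)
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.cbam(self.bn2(self.conv2(out)))  # apply CBAM before the addition
        return self.relu(out + residual)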

SRM 

A Style-based Recalibration Module for Convolutional Neural Networks

Style pooling extracts the mean and standard deviation of each channel; style integration then fuses the two statistics with a channel-wise 1D convolution and outputs a weight for each channel.

class SRMLayer(nn.Module):
    def __init__(self, channel, reduction=None):
        # Reduction for compatibility with layer_block interface
        super(SRMLayer, self).__init__()

        # CFC: channel-wise fully connected layer
        self.cfc = nn.Conv1d(channel, channel, kernel_size=2, bias=False,
                             groups=channel)
        self.bn = nn.BatchNorm1d(channel)

    def forward(self, x):
        b, c, _, _ = x.size()

        # Style pooling
        mean = x.view(b, c, -1).mean(-1).unsqueeze(-1)
        std = x.view(b, c, -1).std(-1).unsqueeze(-1)
        u = torch.cat((mean, std), -1)  # (b, c, 2)

        # Style integration
        z = self.cfc(u)  # (b, c, 1)
        z = self.bn(z)
        g = torch.sigmoid(z)
        g = g.view(b, c, 1, 1)

        return x * g.expand_as(x)

SimAM

A Simple, Parameter-Free Attention Module for Convolutional Neural Networks

A parameter-free attention mechanism: attention weights are generated dynamically from an energy function computed on the feature map, so no extra parameters are needed.

class SimAM(nn.Module):
    def __init__(self, e_lambda=1e-4):
        super(SimAM, self).__init__()
        # coefficient λ in Eqn (5) of the paper
        self.e_lambda = e_lambda

    def forward(self, x):
        # x: input feature [N, C, H, W]
        # spatial size minus one
        n = x.shape[2] * x.shape[3] - 1
        # square of (t - u): squared deviation from the per-channel mean
        d = (x - x.mean(dim=[2, 3], keepdim=True)).pow(2)
        # d.sum() / n is the channel variance
        v = d.sum(dim=[2, 3], keepdim=True) / n
        # E_inv groups all importance of x
        e_inv = d / (4 * (v + self.e_lambda)) + 0.5
        # return attended features
        return x * torch.sigmoid(e_inv)
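A quick check of the parameter-free claim (illustrative only): the module registers no learnable weights, and the output keeps the input shape.

simam = SimAM()
print(sum(p.numel() for p in simam.parameters()))   # 0 learnable parameters
print(simam(torch.randn(2, 64, 32, 32)).shape)      # torch.Size([2, 64, 32, 32])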

SOCA

Second-order Attention Network for Single Image Super-resolution

A second-order channel attention mechanism. The computation is relatively involved; the full code is available on GitHub.

class SOCA(nn.Module):
    # Depends on MPNCOV (global covariance pooling and iterative matrix square
    # root), an external module shipped with the authors' repository.
    def __init__(self, channel, reduction=8):
        super(SOCA, self).__init__()
        self.max_pool = nn.MaxPool2d(kernel_size=2)

        # feature channel downscale and upscale --> channel weight
        self.conv_du = nn.Sequential(
            nn.Conv2d(channel, channel // reduction, 1, padding=0, bias=True),
            nn.ReLU(inplace=True),
            nn.Conv2d(channel // reduction, channel, 1, padding=0, bias=True),
            nn.Sigmoid()
        )

    def forward(self, x):
        batch_size, C, h, w = x.shape  # x: NxCxHxW

        # crop the feature map to at most 1000 x 1000 to bound the cost of
        # covariance pooling
        h1 = 1000
        w1 = 1000
        if h < h1 and w < w1:
            x_sub = x
        elif h < h1 and w > w1:
            W = (w - w1) // 2
            x_sub = x[:, :, :, W:(W + w1)]
        elif w < w1 and h > h1:
            H = (h - h1) // 2
            x_sub = x[:, :, H:H + h1, :]
        else:
            H = (h - h1) // 2
            W = (w - w1) // 2
            x_sub = x[:, :, H:(H + h1), W:(W + w1)]

        # MPN-COV: global covariance pooling followed by the matrix square root
        # (pre-norm, Newton-Schulz iteration and post-compensation, 5 iterations)
        cov_mat = MPNCOV.CovpoolLayer(x_sub)
        cov_mat_sqrt = MPNCOV.SqrtmLayer(cov_mat, 5)

        cov_mat_sum = torch.mean(cov_mat_sqrt, 1)
        cov_mat_sum = cov_mat_sum.view(batch_size, C, 1, 1)
        y_cov = self.conv_du(cov_mat_sum)
        return y_cov * x

EMA

Efficient Multi-Scale Attention Module with Cross-Spatial Learning

EMA uses feature grouping, parallel sub-networks, and cross-spatial learning: the channels are split into groups, each group is processed by a 1x1 branch and a 3x3 branch in parallel, and the two branch outputs are fused through cross-spatial learning.

class EMA_attention(nn.Module):
    def __init__(self, channels, c2=None, factor=32):
        super(EMA_attention, self).__init__()
        self.groups = factor
        assert channels // self.groups > 0
        self.softmax = nn.Softmax(-1)
        self.agp = nn.AdaptiveAvgPool2d((1, 1))
        self.pool_h = nn.AdaptiveAvgPool2d((None, 1))
        self.pool_w = nn.AdaptiveAvgPool2d((1, None))
        self.gn = nn.GroupNorm(channels // self.groups, channels // self.groups)
        self.conv1x1 = nn.Conv2d(channels // self.groups, channels // self.groups, kernel_size=1, stride=1, padding=0)
        self.conv3x3 = nn.Conv2d(channels // self.groups, channels // self.groups, kernel_size=3, stride=1, padding=1)

    def forward(self, x):
        b, c, h, w = x.size()
        group_x = x.reshape(b * self.groups, -1, h, w)  # b*g,c//g,h,w
        x_h = self.pool_h(group_x) # b*g,c//g,h,1
        x_w = self.pool_w(group_x).permute(0, 1, 3, 2) # b*g,c//g,w,1
        hw = self.conv1x1(torch.cat([x_h, x_w], dim=2)) # b*g,c//g,h+w,1
        x_h, x_w = torch.split(hw, [h, w], dim=2) 
        x1 = self.gn(group_x * x_h.sigmoid() * x_w.permute(0, 1, 3, 2).sigmoid())  # b*g,c//g,h,w
        x2 = self.conv3x3(group_x) # b*g,c//g,h,w
        x11 = self.softmax(self.agp(x1).reshape(b * self.groups, -1, 1).permute(0, 2, 1))
        x12 = x2.reshape(b * self.groups, c // self.groups, -1)  # b*g, c//g, hw
        x21 = self.softmax(self.agp(x2).reshape(b * self.groups, -1, 1).permute(0, 2, 1))
        x22 = x1.reshape(b * self.groups, c // self.groups, -1)  # b*g, c//g, hw
        weights = (torch.matmul(x11, x12) + torch.matmul(x21, x22)).reshape(b * self.groups, 1, h, w)
        return (group_x * weights.sigmoid()).reshape(b, c, h, w)
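A minimal usage sketch (illustrative only). Since the channels are reshaped into factor groups, the channel count must be divisible by factor (32 by default):

# Minimal usage sketch: channels must be divisible by factor (the group count).
x = torch.randn(2, 64, 32, 32)
ema = EMA_attention(channels=64, factor=32)
print(ema(x).shape)   # torch.Size([2, 64, 32, 32])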

ShuffleAttention

SA-Net: Shuffle Attention for Deep Convolutional Neural Networks

Shuffle Attention (code available on GitHub) first splits the channels into groups, then divides each group into two branches: one computes channel attention and the other spatial attention. The two branch outputs are concatenated, a channel shuffle mixes information across groups, and the result is returned.

from torch.nn import Parameter


class ShuffleAttention(nn.Module):
    """Constructs a Channel-Spatial Group (Shuffle Attention) module.

    Args:
        channel: Number of channels of the input feature map
        groups: Number of groups the channels are split into
    """

    def __init__(self, channel, groups=8):
        super().__init__()
        self.groups = groups
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        # learnable per-channel scale and bias for the two branches
        self.cweight = Parameter(torch.zeros(1, channel // (2 * groups), 1, 1))
        self.cbias = Parameter(torch.ones(1, channel // (2 * groups), 1, 1))
        self.sweight = Parameter(torch.zeros(1, channel // (2 * groups), 1, 1))
        self.sbias = Parameter(torch.ones(1, channel // (2 * groups), 1, 1))

        self.sigmoid = nn.Sigmoid()
        self.gn = nn.GroupNorm(channel // (2 * groups), channel // (2 * groups))

    @staticmethod
    def channel_shuffle(x, groups):
        b, c, h, w = x.shape

        x = x.reshape(b, groups, -1, h, w)
        x = x.permute(0, 2, 1, 3, 4)

        # flatten
        x = x.reshape(b, -1, h, w)

        return x

    def forward(self, x):
        b, c, h, w = x.shape

        x = x.reshape(b * self.groups, -1, h, w) #(b*g,c//g,h,w)
        x_0, x_1 = x.chunk(2, dim=1) #(b*g,c//(2*g),h,w),(b*g,c//(2*g),h,w)

        # channel attention
        xn = self.avg_pool(x_0) #(b*g,c//(2*g),h,w)
        xn = self.cweight * xn + self.cbias
        xn = x_0 * self.sigmoid(xn)

        # spatial attention
        xs = self.gn(x_1) #(b*g,c//(2*g),h,w)
        xs = self.sweight * xs + self.sbias
        xs = x_1 * self.sigmoid(xs)

        # concatenate along channel axis
        out = torch.cat([xn, xs], dim=1) #(b*g,c//g,h,w)
        out = out.reshape(b, -1, h, w) #(b,c,h,w)

        out = self.channel_shuffle(out, 2)
        return out

Summary

All of these CNN attention mechanisms let the network focus dynamically on the important parts of the input, improving the model's accuracy and efficiency. Each module keeps its output the same shape as its input, so it can be conveniently inserted into an existing model, and different modules can be swapped in and out to find the one that works best.
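A small sketch of this drop-in property (the module list is only an example; SOCA is left out because it needs the external MPNCOV code): every module below maps a (b, c, h, w) tensor to a tensor of the same shape, so they can be swapped behind a common interface.

# Sketch: each attention module is shape-preserving, so swapping one for another
# only requires constructing a different module for the same channel count.
x = torch.randn(2, 64, 32, 32)
modules = [SELayer(64), ECALayer(64), CoordAtt(64, 64), CBAM(64),
           SRMLayer(64), SimAM(), EMA_attention(64), ShuffleAttention(64)]
for attn in modules:
    assert attn(x).shape == x.shape, type(attn).__name__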

References:

An overview of various attention mechanisms in deep learning (1998-2020)

ECA-Net: a plug-and-play lightweight attention mechanism (CSDN blog)

Coordinate Attention (CA) explained (CSDN blog)

CBAM: a dual attention mechanism for boosting CNN performance, with explanation and code (CSDN blog)

SRM: A Style-based Recalibration Module for Convolutional Neural Networks, paper notes (CSDN blog)

SimAM: A Simple, Parameter-Free Attention Module for Convolutional Neural Networks, notes (CSDN blog)

EMA: Efficient Multi-Scale Attention Module with Cross-Spatial Learning, plug-and-play module with paper and code (CSDN blog)

SA-Net: Shuffle Attention for Deep Convolutional Neural Networks (CSDN blog)
