Bootstrap

MaskRCNN-Benchmark PyTorch Source-Code Reading Notes

Fair warning: this is a long post...

Source code: https://github.com/facebookresearch/maskrcnn-benchmark

The modeling analysis draws on the blog series 模型定义(modeling)之骨架网络(backbone), whose author covers the implementation details of resnet, fpn, etc. very thoroughly; it has been lightly adapted here to the latest maskrcnn-benchmark release.

Let's read the official PyTorch implementation of Mask R-CNN from the outside in. The first step is the training command:

Single-GPU training:

python /path_to_maskrcnn_benchmark/tools/train_net.py --config-file "/path/to/config/file.yaml"

For example, with the batch size and learning rate scaled down to fit a single GPU:

python tools/train_net.py --config-file "configs/e2e_mask_rcnn_R_50_FPN_1x.yaml" SOLVER.IMS_PER_BATCH 2 SOLVER.BASE_LR 0.0025 SOLVER.MAX_ITER 720000 SOLVER.STEPS "(480000, 640000)" TEST.IMS_PER_BATCH 1 MODEL.RPN.FPN_POST_NMS_TOP_N_TRAIN 2000

Multi-GPU training goes through torch.distributed.launch, which sets WORLD_SIZE and passes --local_rank to each process, e.g. with 8 GPUs:

export NGPUS=8
python -m torch.distributed.launch --nproc_per_node=$NGPUS tools/train_net.py --config-file "configs/e2e_mask_rcnn_R_50_FPN_1x.yaml"

Both run tools/train_net.py, followed by the path to a config file and optional config overrides.

Looking at train_net.py: it defines three functions, train, run_test and main, with main invoked from the command line.

Reading these three functions from the outside in, first main:

def main():
    # Define the argument list
    parser = argparse.ArgumentParser(description="PyTorch Object Detection Training")
    
    # Config file
    parser.add_argument(
        "--config-file",
        default="",
        metavar="FILE",
        help="path to config file",
        type=str,
    )

    parser.add_argument("--local_rank", type=int, default=0)

    # Whether to skip the test phase
    parser.add_argument(
        "--skip-test",
        dest="skip_test",
        help="Do not test the final model",
        action="store_true",
    )
    
    # Whether to use TensorBoard to visualize the training process
    parser.add_argument(
        "--use-tensorboard",
        dest="use_tensorboard",
        help="Use tensorboardX logger (Requires tensorboardX installed)",
        action="store_true",
        default=False
    )

    # Override config options from the command line
    parser.add_argument(
        "opts",
        help="Modify config options using the command-line",
        default=None,
        nargs=argparse.REMAINDER,
    )

    args = parser.parse_args()

    num_gpus = int(os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1
    args.distributed = num_gpus > 1

    # Use distributed training when there is more than one GPU
    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group(
            backend="nccl", init_method="env://"
        )
        synchronize()

    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()

    output_dir = cfg.OUTPUT_DIR
    if output_dir:
        mkdir(output_dir)

    # Logging setup
    logger = setup_logger("maskrcnn_benchmark", output_dir, get_rank())
    logger.info("Using {} GPUs".format(num_gpus))
    logger.info(args)

    logger.info("Collecting env info (might take some time)")
    logger.info("\n" + collect_env_info())

    logger.info("Loaded configuration file {}".format(args.config_file))
    with open(args.config_file, "r") as cf:
        config_str = "\n" + cf.read()
        logger.info(config_str)
    logger.info("Running with config:\n{}".format(cfg))

    # Path where the merged config will be written
    output_config_path = os.path.join(cfg.OUTPUT_DIR, 'config.yml')
    logger.info("Saving config into: {}".format(output_config_path))
    # save overloaded model config in the output directory
    save_config(cfg, output_config_path)

    # Train the model
    # model = train(cfg, args.local_rank, args.distributed)
    model = train(
        cfg=cfg,
        local_rank=args.local_rank,
        distributed=args.distributed,
        use_tensorboard=args.use_tensorboard
    )

    # Optionally test the model
    if not args.skip_test:
        run_test(cfg, model, args.distributed)

The train function:

# See if we can use apex.DistributedDataParallel instead of the torch default,
# and enable mixed-precision via apex.amp
try:
    from apex import amp
except ImportError:
    raise ImportError('Use APEX for multi-precision via apex.amp')


def train(cfg, local_rank, distributed, use_tensorboard=False):
    # Build the model from cfg; from maskrcnn_benchmark.modeling.detector import build_detection_model
    model = build_detection_model(cfg)        
    device = torch.device(cfg.MODEL.DEVICE)
    model.to(device)

    # Optimizer: from maskrcnn_benchmark.solver import make_optimizer
    # LR scheduler: from maskrcnn_benchmark.solver import make_lr_scheduler
    optimizer = make_optimizer(cfg, model)
    scheduler = make_lr_scheduler(cfg, optimizer)

    # Initialize mixed-precision training
    use_mixed_precision = cfg.DTYPE == "float16"
    amp_opt_level = 'O1' if use_mixed_precision else 'O0'
    # Wrap model and optimizer with amp (default single-GPU path)
    model, optimizer = amp.initialize(model, optimizer, opt_level=amp_opt_level)

    # Distributed training wrapper
    if distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[local_rank], output_device=local_rank,
            # this should be removed if we update BatchNorm stats
            broadcast_buffers=False,
        )

    arguments = {}
    arguments["iteration"] = 0
    output_dir = cfg.OUTPUT_DIR
    save_to_disk = get_rank() == 0

    # Checkpointing helper
    # from maskrcnn_benchmark.utils.checkpoint import DetectronCheckpointer
    checkpointer = DetectronCheckpointer(
        cfg, model, optimizer, scheduler, output_dir, save_to_disk
    )

    # Load external (pre-trained) weights
    extra_checkpoint_data = checkpointer.load(cfg.MODEL.WEIGHT)
    arguments.update(extra_checkpoint_data)

    # Build the data loader
    data_loader = make_data_loader(
        cfg,
        is_train=True,
        is_distributed=distributed,
        start_iter=arguments["iteration"],
    )

    checkpoint_period = cfg.SOLVER.CHECKPOINT_PERIOD
    # Whether to log metrics to TensorBoard
    if use_tensorboard:
        meters = TensorboardLogger(
            log_dir=cfg.TENSORBOARD_EXPERIMENT,
            start_iter=arguments['iteration'],
            delimiter="  ")
    else:
        meters = MetricLogger(delimiter="  ")

    # Train; from maskrcnn_benchmark.engine.trainer import do_train
    do_train(
        model,
        data_loader,
        optimizer,
        scheduler,
        checkpointer,
        device,
        checkpoint_period,
        arguments,
        meters
    )

    return model

So the first step of training is build_detection_model, defined in maskrcnn_benchmark/modeling/detector/detector.py:

from .generalized_rcnn import GeneralizedRCNN


_DETECTION_META_ARCHITECTURES = {"GeneralizedRCNN": GeneralizedRCNN}


# Given the configuration cfg, this function instantiates a GeneralizedRCNN object
def build_detection_model(cfg):
    # look up the model class in the meta-architecture dict
    meta_arch = _DETECTION_META_ARCHITECTURES[cfg.MODEL.META_ARCHITECTURE]
    # equivalent to: return GeneralizedRCNN(cfg)
    return meta_arch(cfg)
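
As a quick illustration (a usage sketch, not code from the repository; the config path is just an example), building a model from a shipped config looks like this:

from maskrcnn_benchmark.config import cfg
from maskrcnn_benchmark.modeling.detector import build_detection_model

# merge a shipped yaml into the default config, then build the model
cfg.merge_from_file("configs/e2e_mask_rcnn_R_50_FPN_1x.yaml")
model = build_detection_model(cfg)
print(type(model).__name__)   # GeneralizedRCNN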

The GeneralizedRCNN class is defined in ./maskrcnn_benchmark/modeling/detector/generalized_rcnn.py:

from torch import nn
from maskrcnn_benchmark.structures.image_list import to_image_list
from ..backbone import build_backbone
from ..rpn.rpn import build_rpn
from ..roi_heads.roi_heads import build_roi_heads


class GeneralizedRCNN(nn.Module):
    """
    maskrcnn-benchmark中所有模型的共同模板类,支持 boxes, masks;
    该类包括:
    -backbone 主干网
    -rpn 区域推荐网络,可选
    -heads 关于 roi 部分的 head
    """

    def __init__(self, cfg):
        super(GeneralizedRCNN, self).__init__()
        # build backbone, rpn and roi_heads from the configuration cfg
        self.backbone = build_backbone(cfg)
        self.rpn = build_rpn(cfg, self.backbone.out_channels)
        self.roi_heads = build_roi_heads(cfg, self.backbone.out_channels)

    def forward(self, images, targets=None):
        """
        images (list[Tensor] or ImageList): images to be processed
        targets (list[BoxList]): ground-truth boxes present in the image (optional)

        Returns: result (list[BoxList] or dict[Tensor]): the output from the model.
        During training, returns a dict of model losses; during testing, returns the
        model predictions, including scores, labels and masks (for Mask R-CNN models).
        """
        # targets must be provided in training mode
        if self.training and targets is None:
            raise ValueError("In training mode, targets should be passed")
        images = to_image_list(images)    # convert the input to an ImageList
        # the image tensors go through the backbone to produce the feature maps
        features = self.backbone(images.tensors)
        # images, features and targets go through the rpn to get the proposals and the rpn losses
        proposals, proposal_losses = self.rpn(images, features, targets)
        
        if self.roi_heads:    # if roi_heads is not None, compute its outputs
            x, result, detector_losses = self.roi_heads(features, proposals, targets)
        else:
            # RPN-only models don't have roi_heads
            x = features
            result = proposals
            detector_losses = {}

        if self.training:    # in training mode, return the losses
            losses = {}
            losses.update(detector_losses)
            losses.update(proposal_losses)
            return losses

        return result
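
A usage sketch of the two call paths (model, images and targets are placeholders; the behavior follows the forward() contract above):

# training: the model returns a dict of losses
model.train()
loss_dict = model(images, targets)
total_loss = sum(loss for loss in loss_dict.values())

# inference: the model returns a list[BoxList] with scores and labels
# (plus masks for Mask R-CNN models)
model.eval()
with torch.no_grad():
    predictions = model(images)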

As the code shows, the class relies on three builder functions: build_backbone(cfg), build_rpn(cfg) and build_roi_heads(cfg).

The backbone is defined in modeling/backbone/, which contains backbone.py, fpn.py and resnet.py; build_backbone lives in backbone.py:

from collections import OrderedDict
from torch import nn
# registry that manages module registration, so modules can be looked up like dict entries
from maskrcnn_benchmark.modeling import registry
# kaiming_uniform initialization for convolution layers
from maskrcnn_benchmark.modeling.make_layers import conv_with_kaiming_uniform
from . import fpn as fpn_module    # same package
from . import resnet               # same package


# Build a plain resnet backbone; dispatched to by build_backbone() below according to the config
@registry.BACKBONES.register("R-50-C4")
@registry.BACKBONES.register("R-50-C5")
@registry.BACKBONES.register("R-101-C4")
@registry.BACKBONES.register("R-101-C5")
def build_resnet_backbone(cfg):
    body = resnet.ResNet(cfg)        # class ResNet(cfg) from resnet.py
    model = nn.Sequential(OrderedDict([("body", body)]))    # wrap the body in an nn.Sequential
    model.out_channels = cfg.MODEL.RESNETS.BACKBONE_OUT_CHANNELS
    return model

# Build a resnet-fpn backbone; dispatched to by build_backbone() below according to the config
@registry.BACKBONES.register("R-50-FPN")
@registry.BACKBONES.register("R-101-FPN")
@registry.BACKBONES.register("R-152-FPN")
def build_resnet_fpn_backbone(cfg):
    body = resnet.ResNet(cfg)    # first build the resnet

    # channel parameters needed by the fpn
    in_channels_stage2 = cfg.MODEL.RESNETS.RES2_OUT_CHANNELS
    out_channels = cfg.MODEL.RESNETS.BACKBONE_OUT_CHANNELS

    fpn = fpn_module.FPN(    # build the fpn with class FPN from fpn.py
        in_channels_list=[
            in_channels_stage2,
            in_channels_stage2 * 2,
            in_channels_stage2 * 4,
            in_channels_stage2 * 8,
        ],
        out_channels=out_channels,
        conv_block=conv_with_kaiming_uniform(
            cfg.MODEL.FPN.USE_GN, cfg.MODEL.FPN.USE_RELU
        ),
        top_blocks=fpn_module.LastLevelMaxPool(),
    )
    model = nn.Sequential(OrderedDict([("body", body), ("fpn", fpn)]))
    model.out_channels = out_channels
    return model


# Build a retinanet-style fpn (P3-P7); dispatched to by build_backbone() below, same flow as above
@registry.BACKBONES.register("R-50-FPN-RETINANET")
@registry.BACKBONES.register("R-101-FPN-RETINANET")
def build_resnet_fpn_p3p7_backbone(cfg):
    body = resnet.ResNet(cfg)
    in_channels_stage2 = cfg.MODEL.RESNETS.RES2_OUT_CHANNELS
    out_channels = cfg.MODEL.RESNETS.BACKBONE_OUT_CHANNELS
    in_channels_p6p7 = in_channels_stage2 * 8 if cfg.MODEL.RETINANET.USE_C5 \
        else out_channels
    fpn = fpn_module.FPN(
        in_channels_list=[
            0,
            in_channels_stage2 * 2,
            in_channels_stage2 * 4,
            in_channels_stage2 * 8,
        ],
        out_channels=out_channels,
        conv_block=conv_with_kaiming_uniform(
            cfg.MODEL.FPN.USE_GN, cfg.MODEL.FPN.USE_RELU
        ),
        top_blocks=fpn_module.LastLevelP6P7(in_channels_p6p7, out_channels),
    )
    model = nn.Sequential(OrderedDict([("body", body), ("fpn", fpn)]))
    model.out_channels = out_channels
    return model


def build_backbone(cfg):
    assert cfg.MODEL.BACKBONE.CONV_BODY in registry.BACKBONES, \
        "cfg.MODEL.BACKBONE.CONV_BODY: {} are not registered in registry".format(
            cfg.MODEL.BACKBONE.CONV_BODY
        )
    return registry.BACKBONES[cfg.MODEL.BACKBONE.CONV_BODY](cfg)
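
So build_backbone is nothing more than a registry lookup keyed by cfg.MODEL.BACKBONE.CONV_BODY. A sketch of the dispatch (assuming the default R-50-FPN configuration):

# cfg.MODEL.BACKBONE.CONV_BODY == "R-50-FPN", as set in the yaml config
backbone = build_backbone(cfg)   # dispatches to build_resnet_fpn_backbone(cfg)
print(backbone.out_channels)     # 256 with the default BACKBONE_OUT_CHANNELS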

All three functions in backbone.py above (build_resnet_backbone(), build_resnet_fpn_backbone() and build_resnet_fpn_p3p7_backbone()) create the network trunk with body = resnet.ResNet(cfg). That code lives in ./maskrcnn_benchmark/modeling/backbone/resnet.py; it is fairly long, with the following overall structure:

# ./maskrcnn_benchmark/modeling/backbone/resnet.py

# imports
# ...
from maskrcnn_benchmark.layers import FrozenBatchNorm2d
# ...
# ...

# ResNet stage specification
StageSpec = #...

# ResNet
class ResNet(nn.Module):
    def __init__(self, cfg):
        super(ResNet, self).__init__()
        # initialization
        # ...

    def _freeze_backbone(self, freeze_at):
        # set the given parameters to requires_grad = False
        # ...

    def forward(self, x):
        # the forward pass of the resnet
        # ...

# ResNetHead
class ResNetHead(nn.Module):
    def __init__(...):
        # ...

    def forward(self, x):
        # ...

def _make_stage(...):
    # build the residual blocks of one ResNet stage
    # ...

class BottleneckWithFixedBatchNorm(nn.Module):
    # uses frozen BN
    def __init__(...):
        # ...
    def forward(self, x):
        # ...

class StemWithFixedBatchNorm(nn.Module):
    def __init__(self, cfg):
        # ...

    def forward(self, x):
        # ...

_TRANSFORMATION_MODULES = Registry({
    "BottleneckWithFixedBatchNorm": BottleneckWithFixedBatchNorm
})

_STEM_MODULES = Registry({"StemWithFixedBatchNorm": StemWithFixedBatchNorm})

_STAGE_SPECS = Registry({        # maps config names to their stage specifications
    "R-50-C4": ResNet50StagesTo4,
    "R-50-C5": ResNet50StagesTo5,
    "R-50-FPN": ResNet50FPNStagesTo5,
    "R-101-FPN": ResNet101FPNStagesTo5,
})

The StageSpec used at the top is defined as follows:

# ResNet stage specification
StageSpec = namedtuple(
    "StageSpec",
    [
        "index",            # stage的索引, eg 1, 2, ..,. 5
        "block_count",      # stage 中的 residual blocks 数量
        "return_features",  # True => 返回当前 stage 最后一层 feature map 
    ],
)

# ResNet-50 full stages: stages 2~5 contain 3, 4, 6 and 3 blocks respectively
ResNet50StagesTo5 = tuple(        # the tuple elements are StageSpec instances
    StageSpec(index=i, block_count=c, return_features=r)
    for (i, c, r) in ((1, 3, False), (2, 4, False), (3, 6, False), (4, 3, True))
)

# ResNet-50-C4: only the feature map of stage 4 is used
ResNet50StagesTo4 = tuple(
    StageSpec(index=i, block_count=c, return_features=r)
    for (i, c, r) in ((1, 3, False), (2, 4, False), (3, 6, True))
)
# ResNet-101 (including all stages)
ResNet101StagesTo5 = tuple(
    StageSpec(index=i, block_count=c, return_features=r)
    for (i, c, r) in ((1, 3, False), (2, 4, False), (3, 23, False), (4, 3, True))
)
# ResNet-101 up to stage 4 (excludes stage 5)
ResNet101StagesTo4 = tuple(
    StageSpec(index=i, block_count=c, return_features=r)
    for (i, c, r) in ((1, 3, False), (2, 4, False), (3, 23, True))
)

# ResNet-50-FPN full stages; the FPN consumes every stage's output, so return_features is True everywhere
ResNet50FPNStagesTo5 = tuple(
    StageSpec(index=i, block_count=c, return_features=r)
    for (i, c, r) in ((1, 3, True), (2, 4, True), (3, 6, True), (4, 3, True))
)

# ResNet-101-FPN full stages: 3, 4, 23, 3 blocks
ResNet101FPNStagesTo5 = tuple(
    StageSpec(index=i, block_count=c, return_features=r)
    for (i, c, r) in ((1, 3, True), (2, 4, True), (3, 23, True), (4, 3, True))
)

# ResNet-152-FPN full stages: 3, 8, 36, 3 blocks
ResNet152FPNStagesTo5 = tuple(
    StageSpec(index=i, block_count=c, return_features=r)
    for (i, c, r) in ((1, 3, True), (2, 8, True), (3, 36, True), (4, 3, True))
)
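
To see how a spec drives the channel widths, here is a small walkthrough (assuming the default RES2_OUT_CHANNELS = 256; the doubling rule comes from ResNet.__init__ below):

for spec in ResNet50FPNStagesTo5:
    out_channels = 256 * 2 ** (spec.index - 1)
    print(spec.index, spec.block_count, spec.return_features, out_channels)
# 1 3 True 256
# 2 4 True 512
# 3 6 True 1024
# 4 3 True 2048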

The ResNet class:

class ResNet(nn.Module):
    def __init__(self, cfg):
        super(ResNet, self).__init__()

        # if we want to use cfg in forward(), we should make a copy of it for later use
        # self.cfg = cfg.clone()

        # Translate the config strings into concrete implementations; the three
        # registries used below are defined at the end of this file
        stem_module = _STEM_MODULES[cfg.MODEL.RESNETS.STEM_FUNC]
        stage_specs = _STAGE_SPECS[cfg.MODEL.BACKBONE.CONV_BODY]
        transformation_module = _TRANSFORMATION_MODULES[cfg.MODEL.RESNETS.TRANS_FUNC]


        # the stem, i.e. stage 1 (conv1) of the resnet
        # cfg.MODEL.RESNETS.STEM_FUNC = "StemWithFixedBatchNorm"
        self.stem = stem_module(cfg)

        # stages conv2_x ~ conv5_x of the resnet
        # e.g. cfg.MODEL.BACKBONE.CONV_BODY = "R-50-FPN"
        num_groups = cfg.MODEL.RESNETS.NUM_GROUPS    # num_groups=1 gives ResNet, >1 gives ResNeXt
        width_per_group = cfg.MODEL.RESNETS.WIDTH_PER_GROUP

        # in_channels is the channel count fed into stage 2, i.e. the stem's output channels, 64 by default
        in_channels = cfg.MODEL.RESNETS.STEM_OUT_CHANNELS

        # bottleneck (compressed) channel count of stage 2
        stage2_bottleneck_channels = num_groups * width_per_group

        # Output channels of stage 2. For the standard resnet family the later stages follow
        # from this value: 256 -> 512, 1024, 2048; if it were 64, they would be 128, 256, 512
        stage2_out_channels = cfg.MODEL.RESNETS.RES2_OUT_CHANNELS

        # create an empty stage list and the matching return-features dict
        self.stages = []
        self.return_features = {}

        for stage_spec in stage_specs:
            name = "layer" + str(stage_spec.index)
            
            # output channels per stage double from one stage to the next
            stage2_relative_factor = 2 ** (stage_spec.index - 1)

            # bottleneck channels of this stage
            bottleneck_channels = stage2_bottleneck_channels * stage2_relative_factor

            # output channels of this stage
            out_channels = stage2_out_channels * stage2_relative_factor
            stage_with_dcn = cfg.MODEL.RESNETS.STAGE_WITH_DCN[stage_spec.index - 1]

            # With all parameters collected, call this file's `_make_stage`, which
            # builds the module for the given stage (a module, not a full model)
            module = _make_stage(
                transformation_module,
                in_channels,            # input channels
                bottleneck_channels,    # compressed channels
                out_channels,           # output channels
                stage_spec.block_count, # number of blocks in this stage
                num_groups,             # 1 for ResNet, >1 for ResNeXt
                cfg.MODEL.RESNETS.STRIDE_IN_1X1,

                # stages 3~5 start with a stride-2 convolution to downsample
                first_stride=int(stage_spec.index > 1) + 1,
                dcn_config={
                    "stage_with_dcn": stage_with_dcn,
                    "with_modulated_dcn": cfg.MODEL.RESNETS.WITH_MODULATED_DCN,
                    "deformable_groups": cfg.MODEL.RESNETS.DEFORMABLE_GROUPS,
                }
            )
            # the next stage's input channels are the current stage's output channels
            in_channels = out_channels

            # register the stage module on the model
            self.add_module(name, module)

            # record the stage name
            self.stages.append(name)

            # record whether this stage's features should be returned
            self.return_features[name] = stage_spec.return_features

        # optionally freeze some layers (requires_grad=False) according to the config
        self._freeze_backbone(cfg.MODEL.BACKBONE.FREEZE_CONV_BODY_AT)

    def _freeze_backbone(self, freeze_at):
        # freeze parameter updates for the given stages
        if freeze_at < 0:
            return
        for stage_index in range(freeze_at):
            if stage_index == 0:
                m = self.stem   # stage 1 of the resnet, i.e. the stem
            else:
                m = getattr(self, "layer" + str(stage_index))
            # set all parameters of m to not update
            for p in m.parameters():
                p.requires_grad = False

    def forward(self, x):
        outputs = []    
        x = self.stem(x)     # first pass through the stem (stage 1)
        for stage_name in self.stages:
            x = getattr(self, stage_name)(x)
            if self.return_features[stage_name]:
                # collect the outputs (feature maps) of stages 2~5 in a list
                outputs.append(x)

        # outputs is a list of per-stage feature maps, exactly the input the FPN expects
        return outputs
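
A quick shape check (a sketch assuming cfg is configured for R-50-FPN and a 224×224 input; the cumulative strides 4/8/16/32 of stages 2~5 give the resolutions below):

import torch

body = ResNet(cfg)                        # cfg with CONV_BODY = "R-50-FPN"
feats = body(torch.randn(1, 3, 224, 224))
print([tuple(f.shape) for f in feats])
# [(1, 256, 56, 56), (1, 512, 28, 28), (1, 1024, 14, 14), (1, 2048, 7, 7)]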

The ResNetHead class:

class ResNetHead(nn.Module):
    def __init__(
        self,
        block_module,
        stages,
        num_groups=1,
        width_per_group=64,
        stride_in_1x1=True,
        stride_init=None,
        res2_out_channels=256,
        dilation=1,
        dcn_config={}
    ):
        super(ResNetHead, self).__init__()
        
        # factor of this stage's channels relative to stage 2
        stage2_relative_factor = 2 ** (stages[0].index - 1)

        # bottleneck channels of stage 2
        stage2_bottleneck_channels = num_groups * width_per_group

        # output channels
        out_channels = res2_out_channels * stage2_relative_factor

        # input channels
        in_channels = out_channels // 2

        # bottleneck (compressed) channels
        bottleneck_channels = stage2_bottleneck_channels * stage2_relative_factor

        # look up the block module by its name;
        # _TRANSFORMATION_MODULES currently only contains "BottleneckWithFixedBatchNorm"
        block_module = _TRANSFORMATION_MODULES[block_module]

        # initialize stages and stride
        self.stages = []
        stride = stride_init

        for stage in stages:
            name = "layer" + str(stage.index)
            if not stride:
                # stages 3~5 start with a stride-2 convolution to downsample
                stride = int(stage.index > 1) + 1
            module = _make_stage(
                block_module,
                in_channels,
                bottleneck_channels,
                out_channels,
                stage.block_count,
                num_groups,
                stride_in_1x1,
                first_stride=stride,
                dilation=dilation,
                dcn_config=dcn_config
            )
            stride = None
            self.add_module(name, module)
            self.stages.append(name)
        self.out_channels = out_channels

    def forward(self, x):
        for stage in self.stages:
            x = getattr(self, stage)(x)
        return x

Both classes create their stages with _make_stage(), which stacks the requested number of blocks and wraps them in an nn.Sequential:

# ./maskrcnn_benchmark/modeling/backbone/resnet.py

def _make_stage(
    transformation_module,
    in_channels,
    bottleneck_channels,
    out_channels,
    block_count,
    num_groups,
    stride_in_1x1,
    first_stride,
    dilation=1,
    dcn_config={}
):
    blocks = []
    stride = first_stride
    # stack block_count bottleneck blocks; only the first block of a stage
    # downsamples (stride = first_stride), the remaining blocks use stride 1
    for _ in range(block_count):
        blocks.append(
            transformation_module(
                in_channels,
                bottleneck_channels,
                out_channels,
                num_groups,
                stride_in_1x1,
                stride,
                dilation=dilation,
                dcn_config=dcn_config,
            )
        )
        stride = 1
        in_channels = out_channels
    return nn.Sequential(*blocks)

The StemWithFixedBatchNorm class builds the ResNet stem, which can be viewed as stage 1 (or stage 0) of the network. In ResNet-50 this stage mainly consists of a 7×7 convolution; to keep the per-stage code easy to reuse, the 3×3 max-pooling layer that formally opens stage 2 is also placed in the stem's forward function (layers without parameters are generally kept in forward). The class is implemented as follows:

# ./maskrcnn_benchmark/modeling/backbone/resnet.py

class StemWithFixedBatchNorm(nn.Module):
    def __init__(self, cfg):
        super(StemWithFixedBatchNorm, self).__init__()

        # for resnet-50, out_channels = 64
        out_channels = cfg.MODEL.RESNETS.STEM_OUT_CHANNELS

        # 3 input channels, 64 output channels
        self.conv1 = Conv2d(
            3, out_channels, kernel_size=7, stride=2, padding=3, bias=False
        )

        # BN layer with frozen parameters
        self.bn1 = FrozenBatchNorm2d(out_channels)

    # the forward pass
    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = F.relu_(x) # in-place ReLU; parameter-free, so defined here in forward rather than in __init__
        x = F.max_pool2d(x, kernel_size=3, stride=2, padding=1)
        return x
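
The FrozenBatchNorm2d imported from maskrcnn_benchmark.layers keeps the BN statistics and affine parameters fixed as buffers, so it reduces to a per-channel linear transform; an illustrative re-implementation (a sketch, not the repository file):

import torch
from torch import nn

class FrozenBatchNorm2dSketch(nn.Module):
    # BatchNorm2d in which all statistics and affine parameters are frozen buffers
    def __init__(self, n):
        super(FrozenBatchNorm2dSketch, self).__init__()
        self.register_buffer("weight", torch.ones(n))
        self.register_buffer("bias", torch.zeros(n))
        self.register_buffer("running_mean", torch.zeros(n))
        self.register_buffer("running_var", torch.ones(n))

    def forward(self, x):
        # y = (x - mean) / sqrt(var) * weight + bias, folded into one scale and bias
        scale = self.weight * self.running_var.rsqrt()
        bias = self.bias - self.running_mean * scale
        return x * scale.view(1, -1, 1, 1) + bias.view(1, -1, 1, 1)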

The BottleneckWithFixedBatchNorm class: with the stem (stage 1) built, what remains are stages 2~5 of the resnet. These stages all share the same overall structure: each is a stack of the basic resnet bottleneck block, differing only in how many blocks are stacked (3, 4, 6, 3 for resnet-50) and in the channel counts, which double from one stage to the next, so each stage's widths follow directly from the previous stage's. The bottleneck block is implemented as follows (the dilation/DCN-related arguments are omitted in this abridged version):

# ./maskrcnn_benchmark/modeling/backbone/resnet.py

class BottleneckWithFixedBatchNorm(nn.Module):
    def __init__(
        self,
        in_channels, # input channels of the bottleneck
        bottleneck_channels, # compressed channels inside the bottleneck
        out_channels, # output channels of the current stage
        num_groups=1,
        stride_in_1x1=True,
        stride=1,
    ):
        super(BottleneckWithFixedBatchNorm, self).__init__()

        # downsample: when the bottleneck's input and output channels differ, a strategy is
        # needed; of the A, B, C options in the paper, option B (the recommended one) is used:
        # projection shortcuts only where the channel counts differ, i.e. a learned
        # projection maps the input channels onto the output channels
        self.downsample = None

        # when input and output channels differ, add a 1x1 conv mapping input channels to output channels
        if in_channels != out_channels:
            self.downsample = nn.Sequential(
                Conv2d(
                    in_channels, out_channels, kernel_size=1, stride=stride, bias=False
                ),
                FrozenBatchNorm2d(out_channels), # followed by a frozen-parameter BN layer
            )

        # The resnet paper uses stride-2 convolutions at conv3_1, conv4_1 and conv5_1,
        # whereas fb.torch.resnet and the caffe2 implementation give the subsequent 3x3
        # convolution stride 2 instead. stride defaults to 1 below, but the caller sets it to 2 for stages 3~5
        stride_1x1, stride_3x3 = (stride, 1) if stride_in_1x1 else (1, stride)

        # with the stage parameters collected, create the convolutions; see the resnet-50 definition for the layout
        self.conv1 = Conv2d(
            in_channels,
            bottleneck_channels,
            kernel_size=1,
            stride=stride_1x1,
            bias=False,
        )
        self.bn1 = FrozenBatchNorm2d(bottleneck_channels) # followed by a frozen-parameter BN layer

        # the second convolution of the bottleneck
        self.conv2 = Conv2d(
            bottleneck_channels,
            bottleneck_channels,
            kernel_size=3,
            stride=stride_3x3,
            padding=1,
            bias=False,
            groups=num_groups,
        )
        self.bn2 = FrozenBatchNorm2d(bottleneck_channels) # followed by a BN layer

        # the last convolution of the bottleneck
        self.conv3 = Conv2d(
            bottleneck_channels, out_channels, kernel_size=1, bias=False
        )
        self.bn3 = FrozenBatchNorm2d(out_channels)

    def forward(self, x):
        # One forward pass corresponds to one bottleneck:
        # three convolutions plus an identity connection, each convolution followed by BN and ReLU.
        # Note that the final activation comes after the identity connection is added

        residual = x # the identity branch is simply x

        # conv1, bn1
        out = self.conv1(x)
        out = self.bn1(out)
        out = F.relu_(out)

        # conv2, bn2
        out = self.conv2(out)
        out = self.bn2(out)
        out = F.relu_(out)

        # conv3, bn3
        out0 = self.conv3(out) # (the 0 suffix on out0 seems unnecessary)
        out = self.bn3(out0)

        if self.downsample is not None:
            # if input and output channels differ, project the residual to match
            residual = self.downsample(x)

        out += residual # H = F + x
        out = F.relu_(out) # final activation

        return out # the convolution result plus the residual

fpn.py, the feature pyramid network: for ResNet-50-C4 the ResNet above is all the feature extraction we need, but ResNet-50-FPN additionally needs an FPN for stronger features. In backbone.py, build_resnet_fpn_backbone(cfg) calls fpn = fpn_module.FPN(...) to create an FPN instance and chains ResNet and FPN into one model with nn.Sequential(). The FPN implementation lives in ./maskrcnn_benchmark/modeling/backbone/fpn.py:

# ./maskrcnn_benchmark/modeling/backbone/fpn.py

import torch
import torch.nn.functional as F
from torch import nn

class FPN(nn.Module):
    # Adds an FPN on top of a list of feature maps (in practice, the last layer of each of
    # stages 2~5). The maps are assumed to have increasing depth and to come from consecutive stages

    def __init__(self, in_channels_list, out_channels, top_blocks=None):
        # in_channels_list (list[int]): channel count of each feature map fed to the fpn
        # out_channels (int): channel count of the FPN representation; all feature maps are mapped to it
        # top_blocks (nn.Module or None): if provided, an extra op is applied to the last
        # FPN output, and its result is appended to the returned result list
        super(FPN, self).__init__()

        # two empty lists
        self.inner_blocks = []
        self.layer_blocks = []

        # With ResNet-50-FPN and its config, in_channels_list is:
        # [256, 512, 1024, 2048]
        for idx, in_channels in enumerate(in_channels_list, 1): # index starts at 1
            # names derived from the index: fpn_inner1, fpn_inner2, fpn_inner3, fpn_inner4
            inner_block = "fpn_inner{}".format(idx)

            # fpn_layer1, fpn_layer2, fpn_layer3, fpn_layer4
            layer_block = "fpn_layer{}".format(idx)

            # Create the inner_block module; in_channels is the stage's output channel count,
            # out_channels is 256, set in the user config.
            # A 1x1 convolution whose main job is changing the channel count to out_channels (reducing dimensions)
            inner_block_module = nn.Conv2d(in_channels, out_channels, 1)

            # after the channel change, a 3x3 convolution over each stage's map; channels unchanged
            layer_block_module = nn.Conv2d(out_channels, out_channels, 3, 1, 1)

            for module in [inner_block_module, layer_block_module]:
                # Caffe2's implementation uses XavierFill, which actually
                # corresponds to kaiming_uniform_ in PyTorch
                nn.init.kaiming_uniform_(module.weight, a=1)
                nn.init.constant_(module.bias, 0)

            # register the FPN layers for the current feature map
            self.add_module(inner_block, inner_block_module) # name, module
            self.add_module(layer_block, layer_block_module)

            # record the names of this stage's fpn modules in the lists
            self.inner_blocks.append(inner_block)
            self.layer_blocks.append(layer_block)

        # keep top_blocks as a member of the FPN
        self.top_blocks = top_blocks

    def forward(self, x):
        # x (list[Tensor]): 每个 feature level 的 feature maps,
        # ResNet的计算结果正好满足 FPN 的输入要求, 也因此可以使用 nn.Sequential 将二者直接结合
        # results (tuple[Tensor]): 经过FPN后的特征图谱组成的列表, 排列顺序是高分辨率的在前

        # first compute the fpn result of the last (lowest-resolution) feature map
        last_inner = getattr(self, self.inner_blocks[-1])(x[-1])

        # an empty result list
        results = []

        # append the last level's result
        results.append(getattr(self, self.layer_blocks[-1])(last_inner))


        # [:-1] drops the last entry and [::-1] reverses the list; e.g.
        # self.inner_blocks[:-1][::-1] evaluates to
        # [fpn_inner3, fpn_inner2, fpn_inner1], i.e. the reversed list
        for feature, inner_block, layer_block in zip(
            x[:-1][::-1], self.inner_blocks[:-1][::-1], self.layer_blocks[:-1][::-1]
        ):
            # resize the feature map by the given scale factor; scale=2 here, so upsample
            inner_top_down = F.interpolate(last_inner, scale_factor=2, mode="nearest")

            # the inner_block result
            inner_lateral = getattr(self, inner_block)(feature)

            # sum the two: the current level's output and the next iteration's input
            last_inner = inner_lateral + inner_top_down

            # run layer_block on it and store the output; results are kept with the
            # highest resolution first, so insert at position 0
            results.insert(0, getattr(self, layer_block)(last_inner))

        # if top_blocks is provided, run the extra ops
        if self.top_blocks is not None:
            last_results = self.top_blocks(results[-1])
            results.extend(last_results) # append the extra results

        # return as a (read-only) tuple
        return tuple(results)

# max-pool layer for the extra top level
class LastLevelMaxPool(nn.Module):
    def forward(self, x):
        return [F.max_pool2d(x, 1, 2, 0)]
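
Putting the shapes together for an 800×800 input with R-50-FPN (an illustrative walkthrough; P6 comes from the LastLevelMaxPool above):

# inputs  C2..C5: 256x200x200, 512x100x100, 1024x50x50, 2048x25x25
# outputs P2..P5: 256 channels at the same four resolutions,
# plus    P6 = max_pool2d(P5, 1, 2) -> 256x13x13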

That wraps up the backbone; next comes the RPN. rpn.py lives in ./maskrcnn_benchmark/modeling/rpn/ and is implemented as follows:

# ./maskrcnn_benchmark/modeling/rpn/rpn.py

import torch
import torch.nn.functional as F
from torch import nn

from maskrcnn_benchmark.modeling import registry
from maskrcnn_benchmark.modeling.box_coder import BoxCoder
from maskrcnn_benchmark.modeling.rpn.retinanet.retinanet import build_retinanet
from .loss import make_rpn_loss_evaluator
from .anchor_generator import make_anchor_generator
from .inference import make_rpn_postprocessor


class RPNHeadConvRegressor(nn.Module):
    # Basic RPN head for classification (anchor foreground vs background) and bbox regression
    def __init__(self, cfg, in_channels, num_anchors):
        """
        cfg        : the config
        in_channels: number of input feature channels
        num_anchors: number of anchors to predict
        """
        super(RPNHeadConvRegressor, self).__init__()

        # 1x1 conv mapping the input features to num_anchors objectness scores
        self.cls_logits = nn.Conv2d(in_channels, num_anchors, kernel_size=1, stride=1)

        # 1x1 conv mapping the input features to the anchors' box information, num_anchors * 4 values
        self.bbox_pred = nn.Conv2d(
            in_channels, num_anchors * 4, kernel_size=1, stride=1
        )

        # initialize cls_logits and bbox_pred
        for l in [self.cls_logits, self.bbox_pred]:
            torch.nn.init.normal_(l.weight, std=0.01)
            torch.nn.init.constant_(l.bias, 0)

    def forward(self, x):
        assert isinstance(x, (list, tuple))
        logits = [self.cls_logits(y) for y in x]
        bbox_reg = [self.bbox_pred(y) for y in x]

        # logits holds each anchor's objectness score (how likely it is foreground rather than background)
        return logits, bbox_reg


class RPNHeadFeatureSingleConv(nn.Module):
    # RPN head with a single conv layer, used for feature extraction
    def __init__(self, cfg, in_channels):
        super(RPNHeadFeatureSingleConv, self).__init__()

        # 3x3 conv for feature extraction
        self.conv = nn.Conv2d(
            in_channels, in_channels, kernel_size=3, stride=1, padding=1
        )

        # initialize the parameters
        for l in [self.conv]:
            torch.nn.init.normal_(l.weight, std=0.01)
            torch.nn.init.constant_(l.bias, 0)

        self.out_channels = in_channels

    def forward(self, x):
        assert isinstance(x, (list, tuple))
        x = [F.relu(self.conv(z)) for z in x]

        return x


@registry.RPN_HEADS.register("SingleConvRPNHead")
class RPNHead(nn.Module):
    # RPN head with both classification and bbox regression
    def __init__(self, cfg, in_channels, num_anchors):
        super(RPNHead, self).__init__()

        # conv layer for feature extraction
        self.conv = nn.Conv2d(
            in_channels, in_channels, kernel_size=3, stride=1, padding=1
        )

        # 1x1 conv layer for classification
        self.cls_logits = nn.Conv2d(in_channels, num_anchors, kernel_size=1, stride=1)

        # conv layer for anchor box regression; * 4 for the box encoding
        self.bbox_pred = nn.Conv2d(
            in_channels, num_anchors * 4, kernel_size=1, stride=1
        )

        # initialization
        for l in [self.conv, self.cls_logits, self.bbox_pred]:
            torch.nn.init.normal_(l.weight, std=0.01)
            torch.nn.init.constant_(l.bias, 0)

    def forward(self, x):
        logits = []
        bbox_reg = []
        for feature in x:
            t = F.relu(self.conv(feature))
            logits.append(self.cls_logits(t))
            bbox_reg.append(self.bbox_pred(t))
        return logits, bbox_reg
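
# Shape sketch (illustrative, R-50-FPN defaults, A = 3 anchors per location):
# for each FPN level l, logits[l] has shape [N, 3, H_l, W_l] and
# bbox_reg[l] has shape [N, 12, H_l, W_l].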


class RPNModule(torch.nn.Module):
    # RPN module: takes the images and the backbone's feature maps as input and
    # computes proposals and losses. Works for both FPN and non-FPN setups
    def __init__(self, cfg, in_channels):
        super(RPNModule, self).__init__()
        self.cfg = cfg.clone()

        # anchor generation, from anchor_generator.py in the same package
        anchor_generator = make_anchor_generator(cfg)

        # the RPN head, i.e. the anchor classification and box-regression layers
        rpn_head = registry.RPN_HEADS[cfg.MODEL.RPN.RPN_HEAD]
        head = rpn_head(
            cfg, in_channels, anchor_generator.num_anchors_per_location()[0]
        )

        # box coder: encodes box deltas and decodes predicted boxes from them
        rpn_box_coder = BoxCoder(weights=(1.0, 1.0, 1.0, 1.0))

        # helpers that turn the raw outputs into proposal boxes, for training and testing
        box_selector_train = make_rpn_postprocessor(cfg, rpn_box_coder, is_train=True)
        box_selector_test = make_rpn_postprocessor(cfg, rpn_box_coder, is_train=False)

        # loss computation helper
        loss_evaluator = make_rpn_loss_evaluator(cfg, rpn_box_coder)

        # store everything
        self.anchor_generator = anchor_generator
        self.head = head
        self.box_selector_train = box_selector_train
        self.box_selector_test = box_selector_test
        self.loss_evaluator = loss_evaluator

    def forward(self, images, features, targets=None):
        """
        Inputs:
            images: list of image tensors
            features: feature maps computed from the backbone
            targets: ground-truth boxes in the images

        Returns:
            boxes: the RPN's predicted boxes, one box list per image
            losses: the corresponding training losses
        """

        # objectness score maps and anchor box-regression maps from the RPN head
        objectness, rpn_box_regression = self.head(features)

        # generate the anchors
        anchors = self.anchor_generator(images, features)

        if self.training:
            # training: targets are provided so the loss can be computed
            return self._forward_train(anchors, objectness, rpn_box_regression, targets)
        else:
            return self._forward_test(anchors, objectness, rpn_box_regression)

    def _forward_train(self, anchors, objectness, rpn_box_regression, targets):
        if self.cfg.MODEL.RPN_ONLY:
            # For RPN-only models the loss is computed from the objectness and
            # box-regression maps directly; the anchors need not be decoded into boxes
            boxes = anchors
        else:
            # For end-to-end models the anchors must be decoded into boxes and sampled within the training batch
            with torch.no_grad():
                # decode the predicted boxes
                boxes = self.box_selector_train(
                    anchors, objectness, rpn_box_regression, targets
                )
        loss_objectness, loss_rpn_box_reg = self.loss_evaluator(
            anchors, objectness, rpn_box_regression, targets
        )
        losses = {
            "loss_objectness": loss_objectness,
            "loss_rpn_box_reg": loss_rpn_box_reg,
        }
        return boxes, losses

    def _forward_test(self, anchors, objectness, rpn_box_regression):
        boxes = self.box_selector_test(anchors, objectness, rpn_box_regression)
        if self.cfg.MODEL.RPN_ONLY:
            # For RPN-only models the proposals (anchors) are the final output, so sort them
            # by objectness, highest first; for end-to-end models the RPN proposals are an
            # intermediate state and need no sorting
            inds = [
                box.get_field("objectness").sort(descending=True)[1] for box in boxes
            ]
            boxes = [box[ind] for box, ind in zip(boxes, inds)]
        return boxes, {}


def build_rpn(cfg, in_channels):
    # build the rpn module
    if cfg.MODEL.RETINANET_ON:
        return build_retinanet(cfg, in_channels)

    return RPNModule(cfg, in_channels)

Building the RPN module relies on the anchor generation in anchor_generator.py and the loss computation in loss.py, both in the same folder. anchor_generator.py:

# ./maskrcnn_benchmark/modeling/rpn/anchor_generator.py

class BufferList(nn.Module):
    # holds the anchors as a list of registered buffers
    def __init__(self, buffers=None):
        super(BufferList, self).__init__()
        if buffers is not None:
            self.extend(buffers)

    def extend(self, buffers):
        offset = len(self)
        for i, buffer in enumerate(buffers):
            self.register_buffer(str(offset + i), buffer)
        return self

    def __len__(self):
        return len(self._buffers)

    def __iter__(self):
        return iter(self._buffers.values())


class AnchorGenerator(nn.Module):
    # generates a set of anchors for given images and their feature maps
    def __init__(
        self,
        sizes=(128, 256, 512),             # anchor sizes
        aspect_ratios=(0.5, 1.0, 2.0),     # anchor aspect ratios
        anchor_strides=(8, 16, 32),        # anchor strides
        straddle_thresh=0,
    ):
        super(AnchorGenerator, self).__init__()

        # a single feature map, i.e. no FPN
        if len(anchor_strides) == 1:
            anchor_stride = anchor_strides[0]
            cell_anchors = [
                generate_anchors(anchor_stride, sizes, aspect_ratios).float()
            ]
        else:
            # sizes and strides must correspond one-to-one
            if len(anchor_strides) != len(sizes):
                raise RuntimeError("FPN should have #anchor_strides == #sizes")
            # build anchors of every aspect ratio for each (stride, size) pair
            cell_anchors = [
                generate_anchors(
                    anchor_stride,
                    size if isinstance(size, (tuple, list)) else (size,),
                    aspect_ratios
                ).float()
                for anchor_stride, size in zip(anchor_strides, sizes)
            ]
        self.strides = anchor_strides
        self.cell_anchors = BufferList(cell_anchors)
        self.straddle_thresh = straddle_thresh

    # number of anchors generated at each location, for every feature map
    def num_anchors_per_location(self):
        return [len(cell_anchors) for cell_anchors in self.cell_anchors]

    # Slide the anchor grid over the image from the top-left, stepping by the stride until the whole image is covered
    def grid_anchors(self, grid_sizes):
        anchors = []

        # grid_sizes holds the spatial size of each feature map;
        # the numbers of strides, anchor sizes and FPN output levels must match
        for size, stride, base_anchors in zip(
            grid_sizes, self.strides, self.cell_anchors
        ):
            # height and width of the feature map
            grid_height, grid_width = size
            device = base_anchors.device

            # the anchor shift offsets along the x axis
            shifts_x = torch.arange(
                0, grid_width * stride, step=stride, dtype=torch.float32, device=device
            )

            # the anchor shift offsets along the y axis
            shifts_y = torch.arange(
                0, grid_height * stride, step=stride, dtype=torch.float32, device=device
            )

            # build a mesh grid from the x and y offset lists
            shift_y, shift_x = torch.meshgrid(shifts_y, shifts_x)

            # flatten the x and y shift grids into 1-D arrays
            shift_x = shift_x.reshape(-1)
            shift_y = shift_y.reshape(-1)

            # stack the shifts into per-position (x1, y1, x2, y2) offsets
            shifts = torch.stack((shift_x, shift_y, shift_x, shift_y), dim=1)

            # place the base anchors at every grid position; many will extend beyond the image
            anchors.append(
                (shifts.view(-1, 1, 4) + base_anchors.view(1, -1, 4)).reshape(-1, 4)
            )

        return anchors

    # Record which of the anchors produced by grid_anchors lie inside the image:
    # anchors that stay within the image (up to the threshold) can be drawn and are
    # marked visible; anchors sticking out beyond it are marked not visible
    def add_visibility_to(self, boxlist):
        # boxlist wraps all the anchors produced by grid_anchors
        image_width, image_height = boxlist.size

        # all generated anchors
        anchors = boxlist.bbox

        # 1 if the anchor lies inside the image (within the threshold), 0 otherwise
        if self.straddle_thresh >= 0:
            inds_inside = (
                (anchors[..., 0] >= -self.straddle_thresh)
                & (anchors[..., 1] >= -self.straddle_thresh)
                & (anchors[..., 2] < image_width + self.straddle_thresh)
                & (anchors[..., 3] < image_height + self.straddle_thresh)
            )
        else:
            # a negative straddle_thresh disables the check: every anchor is marked visible
            device = anchors.device
            inds_inside = torch.ones(anchors.shape[0], dtype=torch.uint8, device=device)
        boxlist.add_field("visibility", inds_inside)

    def forward(self, image_list, feature_maps):
        '''
        :param image_list: the images passed to the model (n * w * h)
        :param feature_maps: (n_layer * n_pic * map_width * map_height)
        :return: generated anchors
        '''

        # the spatial size (h, w) of each feature map
        grid_sizes = [feature_map.shape[-2:] for feature_map in feature_maps]
        anchors_over_all_feature_maps = self.grid_anchors(grid_sizes)
        anchors = []
        for i, (image_height, image_width) in enumerate(image_list.image_sizes):
            anchors_in_image = []
            for anchors_per_feature_map in anchors_over_all_feature_maps:
                # build a BoxList from the anchors and the original image size
                boxlist = BoxList(
                    anchors_per_feature_map, (image_width, image_height), mode="xyxy"
                )
                # record whether each anchor lies within the original image;
                # stored in the boxlist's extra_fields under "visibility"
                self.add_visibility_to(boxlist)
                anchors_in_image.append(boxlist)
            anchors.append(anchors_in_image)
        return anchors
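
# Rough count (illustrative): for an 800x800 input with R-50-FPN and 3 aspect
# ratios, grid_anchors produces 3 * (200^2 + 100^2 + 50^2 + 25^2 + 13^2)
# = 159,882 anchors per image, before any filtering.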


def make_anchor_generator(config):
    # anchor sizes (possibly several), relative to the source image
    anchor_sizes = config.MODEL.RPN.ANCHOR_SIZES

    # anchor aspect ratios, usually three: 1:2, 1:1, 2:1
    aspect_ratios = config.MODEL.RPN.ASPECT_RATIOS

    # strides at which the anchors slide over the feature maps; differs per feature map
    anchor_stride = config.MODEL.RPN.ANCHOR_STRIDE

    # Tolerance for anchors that straddle the image border: how far an anchor may stick
    # out of the image (in position or size) before it is treated as invalid
    straddle_thresh = config.MODEL.RPN.STRADDLE_THRESH

    if config.MODEL.RPN.USE_FPN:
        # with FPN, the number of anchor strides must equal the number of anchor sizes
        assert len(anchor_stride) == len(
            anchor_sizes
        ), "FPN should have len(ANCHOR_STRIDE) == len(ANCHOR_SIZES)"
    else:
        # without FPN there must be exactly one anchor stride
        assert len(anchor_stride) == 1, "Non-FPN should have a single ANCHOR_STRIDE"

    # build the anchor generator
    anchor_generator = AnchorGenerator(
        anchor_sizes, aspect_ratios, anchor_stride, straddle_thresh
    )
    return anchor_generator


def make_anchor_generator_retinanet(config):
    # retinanet-specific anchor parameters
    anchor_sizes = config.MODEL.RETINANET.ANCHOR_SIZES
    aspect_ratios = config.MODEL.RETINANET.ASPECT_RATIOS
    anchor_strides = config.MODEL.RETINANET.ANCHOR_STRIDES
    straddle_thresh = config.MODEL.RETINANET.STRADDLE_THRESH
    octave = config.MODEL.RETINANET.OCTAVE
    scales_per_octave = config.MODEL.RETINANET.SCALES_PER_OCTAVE

    assert len(anchor_strides) == len(anchor_sizes), "Only support FPN now"
    new_anchor_sizes = []
    for size in anchor_sizes:
        per_layer_anchor_sizes = []
        for scale_per_octave in range(scales_per_octave):
            octave_scale = octave ** (scale_per_octave / float(scales_per_octave))
            per_layer_anchor_sizes.append(octave_scale * size)
        new_anchor_sizes.append(tuple(per_layer_anchor_sizes))

    anchor_generator = AnchorGenerator(
        tuple(new_anchor_sizes), aspect_ratios, anchor_strides, straddle_thresh
    )
    return anchor_generator


def generate_anchors(
    stride=16, sizes=(32, 64, 128, 256, 512), aspect_ratios=(0.5, 1, 2)
):
    # Build the base anchors (x1, y1, x2, y2) for every size and aspect ratio at the given stride.
    # The anchors are centered at stride / 2
    return _generate_anchors(
        stride,
        np.array(sizes, dtype=np.float) / stride,
        np.array(aspect_ratios, dtype=np.float),
    )


def _generate_anchors(base_size, scales, aspect_ratios):
    # the initial base_size x base_size square anchor, with coordinates (0, 0, base_size-1, base_size-1)
    anchor = np.array([1, 1, base_size, base_size], dtype=np.float) - 1

    # enumerate the aspect-ratio variants of that anchor
    anchors = _ratio_enum(anchor, aspect_ratios)

    # then enumerate the scale (size) variants of each aspect-ratio anchor
    anchors = np.vstack(
        [_scale_enum(anchors[i, :], scales) for i in range(anchors.shape[0])]
    )
    return torch.from_numpy(anchors)


def _whctrs(anchor):
    # input: an (x1, y1, x2, y2) anchor
    # output: its (w, h, x_center, y_center) representation
    w = anchor[2] - anchor[0] + 1
    h = anchor[3] - anchor[1] + 1
    x_ctr = anchor[0] + 0.5 * (w - 1)
    y_ctr = anchor[1] + 0.5 * (h - 1)
    return w, h, x_ctr, y_ctr


def _mkanchors(ws, hs, x_ctr, y_ctr):
    # input: vectors of widths (ws) and heights (hs) around a center (x_ctr, y_ctr)
    # output: the corresponding anchor windows
    ws = ws[:, np.newaxis]
    hs = hs[:, np.newaxis]
    anchors = np.hstack(
        (
            x_ctr - 0.5 * (ws - 1),
            y_ctr - 0.5 * (hs - 1),
            x_ctr + 0.5 * (ws - 1),
            y_ctr + 0.5 * (hs - 1),
        )
    )

    # (x1, y1, x2, y2)
    return anchors


# For a fixed size, build anchors of the different aspect ratios
def _ratio_enum(anchor, ratios):
    w, h, x_ctr, y_ctr = _whctrs(anchor)

    # anchor area
    size = w * h

    # divide the area by the ratio
    size_ratios = size / ratios

    # anchor width for this ratio
    ws = np.round(np.sqrt(size_ratios))

    # anchor height for this ratio
    hs = np.round(ws * ratios)

    # convert back to corner form (top-left and bottom-right coordinates)
    anchors = _mkanchors(ws, hs, x_ctr, y_ctr)
    return anchors


# After the per-ratio anchors are built, scale each of them to the different sizes
def _scale_enum(anchor, scales):
    w, h, x_ctr, y_ctr = _whctrs(anchor)
    ws = w * scales
    hs = h * scales
    anchors = _mkanchors(ws, hs, x_ctr, y_ctr)
    return anchors
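
A quick sanity check with the classic Faster R-CNN settings (illustrative; the ordering is ratio-major, because _generate_anchors enumerates the ratios first and then the scales):

anchors = generate_anchors(stride=16, sizes=(128, 256, 512),
                           aspect_ratios=(0.5, 1, 2))
print(anchors.shape)   # torch.Size([9, 4]) -- 3 ratios x 3 sizes
print(anchors[4])      # the 256^2, 1:1 anchor: (-120, -120, 135, 135)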

loss.py under rpn:

# ./maskrcnn_benchmark/modeling/rpn/loss.py

class RPNLossComputation(object):
    # computes the RPN loss
    def __init__(self, proposal_matcher, fg_bg_sampler, box_coder,
                 generate_labels_func):
        # anchor matcher, used to match anchors to targets
        self.proposal_matcher = proposal_matcher

        # fg/bg sampler, used to pick a fixed ratio of object and background anchors
        self.fg_bg_sampler = fg_bg_sampler

        # box coder, used for box-regression targets and for decoding predicted boxes
        self.box_coder = box_coder

        # fields that need to be copied from the targets
        self.copied_fields = []

        # the label-generating function, producing the index of each anchor's matched ground-truth box
        self.generate_labels_func = generate_labels_func

        # the kinds of anchors to discard
        self.discard_cases = ['not_visibility', 'between_thresholds']

    def match_targets_to_anchors(self, anchor, target, copied_fields=[]):
        # IoU matrix between the ground truth and the anchors
        match_quality_matrix = boxlist_iou(target, anchor)

        # index of the ground-truth box matched to each anchor; background anchors get -1
        matched_idxs = self.proposal_matcher(match_quality_matrix)

        # RPN doesn't need any fields from target
        # for creating the labels, so clear them all
        # copy the requested fields onto the ground-truth boxlist
        target = target.copy_with_fields(copied_fields)

        # get the targets corresponding GT for each anchor
        # NB: need to clamp the indices because we can have a single
        # GT in the image, and matched_idxs can be -2, which goes out of bounds
        # ground-truth box for each anchor; unmatched (background etc.) anchors are all mapped to the first ground-truth box
        matched_targets = target[matched_idxs.clamp(min=0)]

        # the ground-truth boxlist matched to all anchors
        matched_targets.add_field("matched_idxs", matched_idxs)
        return matched_targets

    # Label every anchor: -1 = discarded, 0 = background, otherwise its matched gt.
    # Also compute each anchor's regression target against its matched gt
    def prepare_targets(self, anchors, targets):
        # initialize the anchor labels and the box-regression targets
        labels = []
        regression_targets = []

        # loop over each image's anchors and ground truth
        for anchors_per_image, targets_per_image in zip(anchors, targets):
            # the gt matched to each anchor
            matched_targets = self.match_targets_to_anchors(
                anchors_per_image, targets_per_image, self.copied_fields
            )
            # the index of the gt matched to each anchor
            matched_idxs = matched_targets.get_field("matched_idxs")

            # per-anchor label list from the label function: 1 = usable, 0 = discarded
            labels_per_image = self.generate_labels_func(matched_targets)
            labels_per_image = labels_per_image.to(dtype=torch.float32)

            # set background anchors' labels to 0
            bg_indices = matched_idxs == Matcher.BELOW_LOW_THRESHOLD
            labels_per_image[bg_indices] = 0

            # discard anchors that fall outside the image: label -1
            if "not_visibility" in self.discard_cases:
                labels_per_image[~anchors_per_image.get_field("visibility")] = -1

            # discard anchors whose IoU lies between the background and foreground thresholds
            if "between_thresholds" in self.discard_cases:
                inds_to_discard = matched_idxs == Matcher.BETWEEN_THRESHOLDS
                labels_per_image[inds_to_discard] = -1

            # the box-regression targets between the anchors and their matched targets
            regression_targets_per_image = self.box_coder.encode(
                matched_targets.bbox, anchors_per_image.bbox
            )

            # store the labels and regression targets in the lists initialized above
            labels.append(labels_per_image)
            regression_targets.append(regression_targets_per_image)

        return labels, regression_targets

    def __call__(self, anchors, objectness, box_regression, targets):
        """
        Arguments:
            anchors: the generated anchors
            objectness: objectness score maps from the RPN head
            box_regression: box-regression maps from the RPN head
            targets: the ground-truth boxes of each image

        Returns:
            objectness_loss (Tensor): classification loss
            box_loss (Tensor): box-regression loss
        """
        # merge each image's anchors across the FPN levels
        anchors = [cat_boxlist(anchors_per_image) for anchors_per_image in anchors]

        # the per-anchor gt labels and regression targets for every image
        labels, regression_targets = self.prepare_targets(anchors, targets)

        # From the anchor labels, sample the anchors used as background and as objects;
        # in the returned masks 0 = not selected, 1 = selected
        sampled_pos_inds, sampled_neg_inds = self.fg_bg_sampler(labels)

        # turn the sample masks into anchor indices and squeeze out the extra dimension
        sampled_pos_inds = torch.nonzero(torch.cat(sampled_pos_inds, dim=0)).squeeze(1)
        sampled_neg_inds = torch.nonzero(torch.cat(sampled_neg_inds, dim=0)).squeeze(1)

        # concatenate the selected positive and negative anchor indices
        sampled_inds = torch.cat([sampled_pos_inds, sampled_neg_inds], dim=0)

        # Merge the RPN head's objectness and box-regression layers over all images into
        # single tensors of classification values and box-regression values
        objectness, box_regression = \
                concat_box_prediction_layers(objectness, box_regression)

        # squeeze out extra dimensions
        objectness = objectness.squeeze()

        # concatenate the anchor labels of all images
        labels = torch.cat(labels, dim=0)

        # concatenate the anchor box-regression targets of all images
        regression_targets = torch.cat(regression_targets, dim=0)

        # Box loss, computed on the sampled positive anchors.
        # smooth L1 = 0.5 * x**2 / beta if abs(x) < beta else abs(x) - 0.5 * beta.
        # Compared with L1 it converges faster (and L1 is not differentiable at 0);
        # compared with L2 it is robust to outliers, with milder gradients, so training is less likely to diverge
        box_loss = smooth_l1_loss(
            box_regression[sampled_pos_inds],
            regression_targets[sampled_pos_inds],
            beta=1.0 / 9,
            size_average=False,
        ) / (sampled_inds.numel())

        # classification loss, computed on the sampled positive and negative anchors
        objectness_loss = F.binary_cross_entropy_with_logits(
            objectness[sampled_inds], labels[sampled_inds]
        )

        # return the final anchor classification loss and box-regression loss
        return objectness_loss, box_loss


# This function is meant to be overridden for RetinaNet.
# Generate the anchor labels: positive anchors get 1, the rest 0
def generate_rpn_labels(matched_targets):
    # the gt index matched to each anchor
    matched_idxs = matched_targets.get_field("matched_idxs")

    # positive anchors get label 1
    labels_per_image = matched_idxs >= 0
    return labels_per_image


# Build the matcher that assigns gt boxes to anchors, with its two IoU thresholds
def make_rpn_loss_evaluator(cfg, box_coder):
    matcher = Matcher(
        cfg.MODEL.RPN.FG_IOU_THRESHOLD,     # foreground IoU threshold
        cfg.MODEL.RPN.BG_IOU_THRESHOLD,     # background IoU threshold
        allow_low_quality_matches=True,
    )

    # the fg/bg sampler picks a fixed number and ratio of background and foreground anchors
    fg_bg_sampler = BalancedPositiveNegativeSampler(
        cfg.MODEL.RPN.BATCH_SIZE_PER_IMAGE, cfg.MODEL.RPN.POSITIVE_FRACTION
    )

    # build the RPN loss evaluator
    loss_evaluator = RPNLossComputation(
        matcher,
        fg_bg_sampler,
        box_coder,
        generate_rpn_labels
    )
    return loss_evaluator
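
For reference, the smooth_l1_loss used above (imported from maskrcnn_benchmark.layers) is essentially the following (a sketch matching the formula in the comment):

import torch

def smooth_l1_loss_sketch(input, target, beta=1. / 9, size_average=True):
    n = torch.abs(input - target)
    loss = torch.where(n < beta, 0.5 * n ** 2 / beta, n - 0.5 * beta)
    return loss.mean() if size_average else loss.sum()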

That covers the bulk of the backbone and RPN code; for the various utils helpers, see the official sources. Next comes roi_heads. The folder contains three subfolders, box_head, keypoint_head and mask_head, one per head; roi_heads.py at the same level merges the individual heads into a single unified head:

# ./maskrcnn_benchmark/modeling/roi_heads/roi_heads.py

class CombinedROIHeads(torch.nn.ModuleDict):
    # combines a set of individual heads (box / mask) into a single head
    def __init__(self, cfg, heads):
        super(CombinedROIHeads, self).__init__(heads)
        self.cfg = cfg.clone()

        # if the box and mask heads share features, reuse the box head's feature extractor for the mask head
        if cfg.MODEL.MASK_ON and cfg.MODEL.ROI_MASK_HEAD.SHARE_BOX_FEATURE_EXTRACTOR:
            self.mask.feature_extractor = self.box.feature_extractor

        # likewise for the keypoint head
        if cfg.MODEL.KEYPOINT_ON and cfg.MODEL.ROI_KEYPOINT_HEAD.SHARE_BOX_FEATURE_EXTRACTOR:
            self.keypoint.feature_extractor = self.box.feature_extractor

    def forward(self, features, proposals, targets=None):
        losses = {}
        # the box head loss
        x, detections, loss_box = self.box(features, proposals, targets)

        # record the box head loss
        losses.update(loss_box)

        # if there is a mask branch
        if self.cfg.MODEL.MASK_ON:
            mask_features = features

            # optimization: during training, if the box and mask heads share the feature extractor, reuse the features already computed
            if (
                self.training
                and self.cfg.MODEL.ROI_MASK_HEAD.SHARE_BOX_FEATURE_EXTRACTOR
            ):
                mask_features = x

            # During training, self.box() returns the unaltered proposals as the detections,
            # which keeps the API consistent between training and testing.
            # the mask head loss
            x, detections, loss_mask = self.mask(mask_features, detections, targets)

            # record the mask head loss
            losses.update(loss_mask)

        if self.cfg.MODEL.KEYPOINT_ON:
            keypoint_features = features
            # optimization: during training, if we share the feature extractor between
            # the box and the mask heads, then we can reuse the features already computed
            if (
                self.training
                and self.cfg.MODEL.ROI_KEYPOINT_HEAD.SHARE_BOX_FEATURE_EXTRACTOR
            ):
                keypoint_features = x
            # During training, self.box() will return the unaltered proposals as "detections"
            # this makes the API consistent during training and testing
            x, detections, loss_keypoint = self.keypoint(keypoint_features, detections, targets)
            losses.update(loss_keypoint)
        return x, detections, losses


def build_roi_heads(cfg, in_channels):
    # build the individual roi heads, to be combined afterwards
    roi_heads = []
    if cfg.MODEL.RETINANET_ON:
        return []

    # add each head according to the config
    if not cfg.MODEL.RPN_ONLY:
        roi_heads.append(("box", build_roi_box_head(cfg, in_channels)))
    if cfg.MODEL.MASK_ON:
        roi_heads.append(("mask", build_roi_mask_head(cfg, in_channels)))
    if cfg.MODEL.KEYPOINT_ON:
        roi_heads.append(("keypoint", build_roi_keypoint_head(cfg, in_channels)))

    # combine the individual heads
    if roi_heads:
        roi_heads = CombinedROIHeads(cfg, roi_heads)

    return roi_heads

Next, the box_head and mask_head parts in turn. (My knowledge of keypoint_head is limited, and the first two heads cover ordinary detection and segmentation, so it is not described.) First, box_head.py in the box_head folder:

# ./maskrcnn_benchmark/modeling/roi_heads/box_head/box_head.py

class ROIBoxHead(torch.nn.Module):
    def __init__(self, cfg, in_channels):
        super(ROIBoxHead, self).__init__()
        # the feature extractor of the RoI layer
        self.feature_extractor = make_roi_box_feature_extractor(cfg, in_channels)

        # the box predictor of the RoI layer
        self.predictor = make_roi_box_predictor(
            cfg, self.feature_extractor.out_channels)

        # post-processing of the RoI layer, including box decoding
        self.post_processor = make_roi_box_post_processor(cfg)

        # loss computation for the box head
        self.loss_evaluator = make_roi_box_loss_evaluator(cfg)

    def forward(self, features, proposals, targets=None):
        """
        Arguments:
            features: feature maps, possibly from multiple levels (FPN)
            proposals: proposal boxes
            targets: ground-truth targets

        Returns:
            x: the features output by the feature extractor
            proposals: during training, the sampled proposals; during testing, the predicted boxlists
            losses: during training, the box head losses; during testing, an empty dict
        """

        if self.training:
            # during training, Faster R-CNN subsamples the proposals with a fixed positive/negative ratio
            with torch.no_grad():
                proposals = self.loss_evaluator.subsample(proposals, targets)

        # extract the features that feed the final classifier; the feature extractor corresponds to pooler + head
        x = self.feature_extractor(features, proposals)

        # the final classifier converts the features into predictions (class scores + box regression)
        class_logits, box_regression = self.predictor(x)

        if not self.training:
            # at test time, decode the boxes directly
            result = self.post_processor((class_logits, box_regression), proposals)
            return x, result, {}

        # compute the classification and box regression losses
        loss_classifier, loss_box_reg = self.loss_evaluator(
            [class_logits], [box_regression]
        )
        return (
            x,
            proposals,
            dict(loss_classifier=loss_classifier, loss_box_reg=loss_box_reg),
        )


def build_roi_box_head(cfg, in_channels):
    # build a new box head; ROIBoxHead is used by default
    return ROIBoxHead(cfg, in_channels)

This file relies on roi_box_feature_extractors.py, roi_box_predictors.py, loss.py and inference.py in the same folder. First, roi_box_feature_extractors.py:

# ./maskrcnn_benchmark/modeling/roi_heads/box_head/roi_box_feature_extractors.py

# extract RoI features with conv5 (stage 5) of ResNet-50
@registry.ROI_BOX_FEATURE_EXTRACTORS.register("ResNet50Conv5ROIFeatureExtractor")
class ResNet50Conv5ROIFeatureExtractor(nn.Module):
    def __init__(self, config, in_channels):
        super(ResNet50Conv5ROIFeatureExtractor, self).__init__()

        # resolution is the spatial size of the feature map after RoI pooling, typically 7
        resolution = config.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION

        # scale factors between the input image and each feature map
        scales = config.MODEL.ROI_BOX_HEAD.POOLER_SCALES

        # sampling_ratio is the number of sampling points per output bin in RoIAlign; usually left at its default
        sampling_ratio = config.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO

        # pooler, implemented on top of RoIAlign
        pooler = Pooler(
            output_size=(resolution, resolution),       # output size 7 x 7
            scales=scales,
            sampling_ratio=sampling_ratio,
        )

        # stage specification used to build ResNet stage 5
        stage = resnet.StageSpec(index=4, block_count=3, return_features=False)

        # build the ResNet stage-5 head
        head = resnet.ResNetHead(
            block_module=config.MODEL.RESNETS.TRANS_FUNC,
            stages=(stage,),
            num_groups=config.MODEL.RESNETS.NUM_GROUPS,
            width_per_group=config.MODEL.RESNETS.WIDTH_PER_GROUP,
            stride_in_1x1=config.MODEL.RESNETS.STRIDE_IN_1X1,
            stride_init=None,
            res2_out_channels=config.MODEL.RESNETS.RES2_OUT_CHANNELS,
            dilation=config.MODEL.RESNETS.RES5_DILATION
        )

        self.pooler = pooler
        self.head = head
        self.out_channels = head.out_channels

    def forward(self, x, proposals):
        x = self.pooler(x, proposals)
        x = self.head(x)
        return x


# extract RoI features with an MLP of fully-connected layers
@registry.ROI_BOX_FEATURE_EXTRACTORS.register("FPN2MLPFeatureExtractor")
class FPN2MLPFeatureExtractor(nn.Module):
    # head used for classification with FPN

    def __init__(self, cfg, in_channels):
        super(FPN2MLPFeatureExtractor, self).__init__()

        resolution = cfg.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION
        scales = cfg.MODEL.ROI_BOX_HEAD.POOLER_SCALES
        sampling_ratio = cfg.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO
        pooler = Pooler(
            output_size=(resolution, resolution),
            scales=scales,
            sampling_ratio=sampling_ratio,
        )

        # input size of the fully-connected layer: the pooled map flattened into a vector (e.g. 256 * 7 * 7 = 12544)
        input_size = in_channels * resolution ** 2

        # output dimension of the fully-connected (MLP) layers
        representation_size = cfg.MODEL.ROI_BOX_HEAD.MLP_HEAD_DIM

        # whether to use GroupNorm
        use_gn = cfg.MODEL.ROI_BOX_HEAD.USE_GN
        self.pooler = pooler
        self.fc6 = make_fc(input_size, representation_size, use_gn)
        self.fc7 = make_fc(representation_size, representation_size, use_gn)
        self.out_channels = representation_size

    def forward(self, x, proposals):
        x = self.pooler(x, proposals)
        x = x.view(x.size(0), -1)

        x = F.relu(self.fc6(x))
        x = F.relu(self.fc7(x))

        return x


# several stacked conv layers that further process the RoI-pooled features
@registry.ROI_BOX_FEATURE_EXTRACTORS.register("FPNXconv1fcFeatureExtractor")
class FPNXconv1fcFeatureExtractor(nn.Module):
    # head used for classification with FPN
    def __init__(self, cfg, in_channels):
        super(FPNXconv1fcFeatureExtractor, self).__init__()

        resolution = cfg.MODEL.ROI_BOX_HEAD.POOLER_RESOLUTION
        scales = cfg.MODEL.ROI_BOX_HEAD.POOLER_SCALES
        sampling_ratio = cfg.MODEL.ROI_BOX_HEAD.POOLER_SAMPLING_RATIO
        pooler = Pooler(
            output_size=(resolution, resolution),
            scales=scales,
            sampling_ratio=sampling_ratio,
        )
        self.pooler = pooler
        use_gn = cfg.MODEL.ROI_BOX_HEAD.USE_GN

        # output channels of the conv layers
        conv_head_dim = cfg.MODEL.ROI_BOX_HEAD.CONV_HEAD_DIM

        # number of stacked conv layers
        num_stacked_convs = cfg.MODEL.ROI_BOX_HEAD.NUM_STACKED_CONVS

        # dilation of the conv layers (dilated convolution)
        dilation = cfg.MODEL.ROI_BOX_HEAD.DILATION

        # build the stacked conv layers in a loop
        xconvs = []
        for ix in range(num_stacked_convs):
            xconvs.append(
                nn.Conv2d(
                    in_channels,
                    conv_head_dim,
                    kernel_size=3,
                    stride=1,
                    padding=dilation,
                    dilation=dilation,
                    bias=False if use_gn else True
                )
            )
            in_channels = conv_head_dim
            if use_gn:
                # add GroupNorm
                xconvs.append(group_norm(in_channels))

            # add a ReLU after each conv layer
            xconvs.append(nn.ReLU(inplace=True))

        # register the stacked convs as a submodule
        self.add_module("xconvs", nn.Sequential(*xconvs))

        # initialize the model parameters
        for modules in [self.xconvs,]:
            for l in modules.modules():
                if isinstance(l, nn.Conv2d):
                    torch.nn.init.normal_(l.weight, std=0.01)
                    if not use_gn:
                        torch.nn.init.constant_(l.bias, 0)

        # add the fully-connected layer
        input_size = conv_head_dim * resolution ** 2
        representation_size = cfg.MODEL.ROI_BOX_HEAD.MLP_HEAD_DIM
        self.fc6 = make_fc(input_size, representation_size, use_gn=False)
        self.out_channels = representation_size

    def forward(self, x, proposals):
        x = self.pooler(x, proposals)
        x = self.xconvs(x)
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc6(x))
        return x


# instantiate the RoI box feature extractor
def make_roi_box_feature_extractor(cfg, in_channels):
    # look up the class registered under the name specified in the config
    func = registry.ROI_BOX_FEATURE_EXTRACTORS[
        cfg.MODEL.ROI_BOX_HEAD.FEATURE_EXTRACTOR
    ]
    return func(cfg, in_channels)
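
The registry used here is essentially a dict that maps a config string to a class, with register() usable as a decorator. A simplified sketch of the idea (the real implementation lives in maskrcnn_benchmark/utils/registry.py; this is a stripped-down stand-in, not the actual class):

class Registry(dict):
    # simplified stand-in for maskrcnn_benchmark.utils.registry.Registry
    def register(self, name):
        def wrapper(cls):
            self[name] = cls
            return cls
        return wrapper

ROI_BOX_FEATURE_EXTRACTORS = Registry()

@ROI_BOX_FEATURE_EXTRACTORS.register("FPN2MLPFeatureExtractor")
class FPN2MLPFeatureExtractor(object):
    pass

# make_roi_box_feature_extractor then only has to look up the name
# found in cfg.MODEL.ROI_BOX_HEAD.FEATURE_EXTRACTOR
func = ROI_BOX_FEATURE_EXTRACTORS["FPN2MLPFeatureExtractor"]
assert func is FPN2MLPFeatureExtractor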

Next is roi_box_predictors.py in the same folder:

# ./maskrcnn_benchmark/modeling/roi_heads/box_head/roi_box_predictors.py

# average-pool the box features, then use a classifier and a regressor to predict the class scores and the box regression deltas
@registry.ROI_BOX_PREDICTOR.register("FastRCNNPredictor")
class FastRCNNPredictor(nn.Module):
    def __init__(self, config, in_channels):
        super(FastRCNNPredictor, self).__init__()
        assert in_channels is not None

        # number of input channels
        num_inputs = in_channels

        # number of box classes, usually the actual classes + 1 (background)
        num_classes = config.MODEL.ROI_BOX_HEAD.NUM_CLASSES

        # average-pool the input features first
        self.avgpool = nn.AdaptiveAvgPool2d(1)

        # linear layer predicting the class scores: input channels -> num_classes
        self.cls_score = nn.Linear(num_inputs, num_classes)

        # with CLS_AGNOSTIC_BBOX_REG, only 2 sets of box coordinates are regressed (background / foreground); otherwise one set per class
        num_bbox_reg_classes = 2 if config.MODEL.CLS_AGNOSTIC_BBOX_REG else num_classes

        # linear layer predicting the box deltas: input channels -> num_bbox_reg_classes * 4
        self.bbox_pred = nn.Linear(num_inputs, num_bbox_reg_classes * 4)

        # parameter initialization
        nn.init.normal_(self.cls_score.weight, mean=0, std=0.01)
        nn.init.constant_(self.cls_score.bias, 0)
        nn.init.normal_(self.bbox_pred.weight, mean=0, std=0.001)
        nn.init.constant_(self.bbox_pred.bias, 0)

    def forward(self, x):
        x = self.avgpool(x)
        x = x.view(x.size(0), -1)
        cls_logit = self.cls_score(x)
        bbox_pred = self.bbox_pred(x)
        return cls_logit, bbox_pred


# predict the class scores and box deltas directly, without pooling (the input features are already vectors)
@registry.ROI_BOX_PREDICTOR.register("FPNPredictor")
class FPNPredictor(nn.Module):
    def __init__(self, cfg, in_channels):
        super(FPNPredictor, self).__init__()
        num_classes = cfg.MODEL.ROI_BOX_HEAD.NUM_CLASSES
        representation_size = in_channels

        self.cls_score = nn.Linear(representation_size, num_classes)
        num_bbox_reg_classes = 2 if cfg.MODEL.CLS_AGNOSTIC_BBOX_REG else num_classes
        self.bbox_pred = nn.Linear(representation_size, num_bbox_reg_classes * 4)

        nn.init.normal_(self.cls_score.weight, std=0.01)
        nn.init.normal_(self.bbox_pred.weight, std=0.001)
        for l in [self.cls_score, self.bbox_pred]:
            nn.init.constant_(l.bias, 0)

    def forward(self, x):
        if x.ndimension() == 4:
            assert list(x.shape[2:]) == [1, 1]
            x = x.view(x.size(0), -1)
        scores = self.cls_score(x)
        bbox_deltas = self.bbox_pred(x)

        return scores, bbox_deltas


# instantiate the box predictor
def make_roi_box_predictor(cfg, in_channels):
    func = registry.ROI_BOX_PREDICTOR[cfg.MODEL.ROI_BOX_HEAD.PREDICTOR]
    return func(cfg, in_channels)
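
To make the output shapes concrete, here is a quick sketch of FPNPredictor on dummy inputs; the sizes (1024-dimensional box features, 81 COCO classes) are common FPN defaults assumed for illustration, not read from a real config:

import torch
import torch.nn as nn

num_boxes, representation_size, num_classes = 512, 1024, 81
cls_score = nn.Linear(representation_size, num_classes)
bbox_pred = nn.Linear(representation_size, num_classes * 4)

x = torch.randn(num_boxes, representation_size)  # output of fc6/fc7
scores = cls_score(x)       # (512, 81): one logit per class, incl. background
bbox_deltas = bbox_pred(x)  # (512, 324): 4 deltas for every class
print(scores.shape, bbox_deltas.shape)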

Then loss.py in the same folder:

# ./maskrcnn_benchmark/modeling/roi_heads/box_head/loss.py

class FastRCNNLossComputation(object):
    # computes the loss for Faster R-CNN (also supports FPN)
    def __init__(
        self,
        proposal_matcher,
        fg_bg_sampler,
        box_coder,
        cls_agnostic_bbox_reg=False
    ):
        self.proposal_matcher = proposal_matcher
        self.fg_bg_sampler = fg_bg_sampler
        self.box_coder = box_coder
        self.cls_agnostic_bbox_reg = cls_agnostic_bbox_reg

    def match_targets_to_proposals(self, proposal, target):
        # IoU between the ground-truth boxes and the proposals
        match_quality_matrix = boxlist_iou(target, proposal)

        # index of the matched gt for each proposal; background boxes get -1 (below the low threshold), ambiguous ones get -2 (between the thresholds)
        matched_idxs = self.proposal_matcher(match_quality_matrix)

        # keep only the "labels" field of the ground truth
        target = target.copy_with_fields("labels")

        # gather the matched gt box for each proposal; background and ambiguous proposals are clamped to the first gt
        matched_targets = target[matched_idxs.clamp(min=0)]

        # store the match indices in the matched targets
        matched_targets.add_field("matched_idxs", matched_idxs)
        return matched_targets

    # compute the gt box corresponding to each proposal
    def prepare_targets(self, proposals, targets):
        # class labels and regression targets for all images
        labels = []
        regression_targets = []

        # match proposals to gt boxes image by image
        for proposals_per_image, targets_per_image in zip(proposals, targets):
            # matched gt box for each proposal; background and ambiguous proposals are clamped to the first gt
            matched_targets = self.match_targets_to_proposals(
                proposals_per_image, targets_per_image
            )

            # match indices
            matched_idxs = matched_targets.get_field("matched_idxs")

            # class label of each proposal in this image, cast to int64
            labels_per_image = matched_targets.get_field("labels")
            labels_per_image = labels_per_image.to(dtype=torch.int64)

            # background proposals (IoU below the low threshold)
            bg_inds = matched_idxs == Matcher.BELOW_LOW_THRESHOLD

            # set the label of background proposals to 0
            labels_per_image[bg_inds] = 0

            # ambiguous proposals (IoU between the low and high thresholds)
            ignore_inds = matched_idxs == Matcher.BETWEEN_THRESHOLDS

            # set the label of ambiguous proposals to -1
            labels_per_image[ignore_inds] = -1  # -1 is ignored by sampler

            # compute the regression targets (box deltas)
            regression_targets_per_image = self.box_coder.encode(
                matched_targets.bbox, proposals_per_image.bbox
            )

            labels.append(labels_per_image)
            regression_targets.append(regression_targets_per_image)

        return labels, regression_targets

    # this method performs the positive/negative sampling: it selects a fixed
    # number of proposals with a fixed ratio of foreground and background boxes
    def subsample(self, proposals, targets):
        # class labels and regression targets of the proposals
        labels, regression_targets = self.prepare_targets(proposals, targets)

        # sample foreground and background proposals with the configured ratio; the sampler returns boolean masks marking the sampled positives and negatives
        sampled_pos_inds, sampled_neg_inds = self.fg_bg_sampler(labels)
        proposals = list(proposals)

        # for each image, store the class labels and regression targets in the BoxList
        for labels_per_image, regression_targets_per_image, proposals_per_image in zip(
            labels, regression_targets, proposals
        ):
            proposals_per_image.add_field("labels", labels_per_image)
            proposals_per_image.add_field(
                "regression_targets", regression_targets_per_image
            )

        # distribute the sampled proposals (obtained on all feature maps,
        # concatenated via the fg_bg_sampler) back to the individual images,
        # keeping only the sampled indices
        for img_idx, (pos_inds_img, neg_inds_img) in enumerate(
            zip(sampled_pos_inds, sampled_neg_inds)
        ):
            img_sampled_inds = torch.nonzero(pos_inds_img | neg_inds_img).squeeze(1)
            proposals_per_image = proposals[img_idx][img_sampled_inds]
            proposals[img_idx] = proposals_per_image
        self._proposals = proposals

        return proposals

    def __call__(self, class_logits, box_regression):
        # compute the Faster R-CNN losses; subsample() must have been called beforehand
        class_logits = cat(class_logits, dim=0)
        box_regression = cat(box_regression, dim=0)
        device = class_logits.device

        if not hasattr(self, "_proposals"):
            raise RuntimeError("subsample needs to be called before")

        proposals = self._proposals
        labels = cat([proposal.get_field("labels") for proposal in proposals], dim=0)
        regression_targets = cat(
            [proposal.get_field("regression_targets") for proposal in proposals], dim=0
        )
        # classification loss: cross entropy
        classification_loss = F.cross_entropy(class_logits, labels)

        # indices of proposals with label > 0, i.e. the foreground proposals
        sampled_pos_inds_subset = torch.nonzero(labels > 0).squeeze(1)

        # class labels of the foreground proposals
        labels_pos = labels[sampled_pos_inds_subset]

        if self.cls_agnostic_bbox_reg:
            # in class-agnostic mode only one foreground box is regressed, stored in slots 4-7
            map_inds = torch.tensor([4, 5, 6, 7], device=device)
        else:
            # otherwise, pick the 4 regression values belonging to each proposal's class
            map_inds = 4 * labels_pos[:, None] + torch.tensor(
                [0, 1, 2, 3], device=device)

        # box regression loss: smooth L1
        box_loss = smooth_l1_loss(
            box_regression[sampled_pos_inds_subset[:, None], map_inds],
            regression_targets[sampled_pos_inds_subset],
            size_average=False,
            beta=1,
        )
        box_loss = box_loss / labels.numel()

        return classification_loss, box_loss


# build the loss evaluator of the RoI box head
def make_roi_box_loss_evaluator(cfg):
    # matcher between proposals and gt boxes
    matcher = Matcher(
        cfg.MODEL.ROI_HEADS.FG_IOU_THRESHOLD,
        cfg.MODEL.ROI_HEADS.BG_IOU_THRESHOLD,
        allow_low_quality_matches=False,
    )

    # weights of the individual components of the box regression
    bbox_reg_weights = cfg.MODEL.ROI_HEADS.BBOX_REG_WEIGHTS

    # box coder: encodes gt boxes into regression deltas and decodes predicted deltas back into boxes
    box_coder = BoxCoder(weights=bbox_reg_weights)

    # sampler that balances foreground and background proposals
    fg_bg_sampler = BalancedPositiveNegativeSampler(
        cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE, cfg.MODEL.ROI_HEADS.POSITIVE_FRACTION
    )

    # whether to use class-agnostic box regression (foreground/background only)
    cls_agnostic_bbox_reg = cfg.MODEL.CLS_AGNOSTIC_BBOX_REG

    # loss computation class of the box head
    loss_evaluator = FastRCNNLossComputation(
        matcher,
        fg_bg_sampler,
        box_coder,
        cls_agnostic_bbox_reg
    )

    return loss_evaluator
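
BoxCoder implements the standard Faster R-CNN box parameterization: the targets are the normalized center offsets and the log of the size ratios. A sketch of the encoding for a single box pair, ignoring the TO_REMOVE = 1 offset and the per-coordinate weights that the real BoxCoder applies:

import math

def encode(gt, proposal):
    # boxes in (x1, y1, x2, y2); returns the regression targets (dx, dy, dw, dh)
    px, py = (proposal[0] + proposal[2]) / 2, (proposal[1] + proposal[3]) / 2
    pw, ph = proposal[2] - proposal[0], proposal[3] - proposal[1]
    gx, gy = (gt[0] + gt[2]) / 2, (gt[1] + gt[3]) / 2
    gw, gh = gt[2] - gt[0], gt[3] - gt[1]
    return ((gx - px) / pw, (gy - py) / ph,
            math.log(gw / pw), math.log(gh / ph))

# a proposal slightly off a gt box yields small, well-scaled deltas
print(encode((10, 10, 50, 50), (12, 8, 52, 52)))

decode() simply inverts these formulas, turning the deltas predicted by the network back into absolute boxes.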

Finally, inference.py in the same folder:

# ./maskrcnn_benchmark/modeling/roi_heads/box_head/inference.py

# post-processor of the box head
class PostProcessor(nn.Module):
    # from the classification scores cls_scores, the box regression bbox_reg and
    # the proposals, computes the post-processed boxes and applies NMS to obtain
    # the final results
    def __init__(
        self,
        score_thresh=0.05,
        nms=0.5,
        detections_per_img=100,
        box_coder=None,
        cls_agnostic_bbox_reg=False,
        bbox_aug_enabled=False
    ):
        super(PostProcessor, self).__init__()
        self.score_thresh = score_thresh
        self.nms = nms
        self.detections_per_img = detections_per_img
        if box_coder is None:
            box_coder = BoxCoder(weights=(10., 10., 5., 5.))
        self.box_coder = box_coder
        self.cls_agnostic_bbox_reg = cls_agnostic_bbox_reg
        self.bbox_aug_enabled = bbox_aug_enabled

    def forward(self, x, boxes):
        """
        Arguments:
            x : contains the class logits and the box_regression from the model.
            boxes: bboxes that are used as reference, one for each image

        Returns:
            results: one BoxList for each image, containing the extra fields labels and scores
        """
        class_logits, box_regression = x
        class_prob = F.softmax(class_logits, -1)

        # TODO think about a representation of batch of boxes
        image_shapes = [box.size for box in boxes]
        boxes_per_image = [len(box) for box in boxes]
        concat_boxes = torch.cat([a.bbox for a in boxes], dim=0)

        if self.cls_agnostic_bbox_reg:
            # class-agnostic mode: keep only the last 4 regression values (the foreground box)
            box_regression = box_regression[:, -4:]

        # decode the deltas into boxes
        proposals = self.box_coder.decode(
            box_regression.view(sum(boxes_per_image), -1), concat_boxes
        )

        # repeat so that class-agnostic proposals have the same shape as class-specific ones
        if self.cls_agnostic_bbox_reg:
            proposals = proposals.repeat(1, class_prob.shape[1])

        # number of classes
        num_classes = class_prob.shape[1]

        # split the concatenated tensors back into per-image chunks
        proposals = proposals.split(boxes_per_image, dim=0)
        class_prob = class_prob.split(boxes_per_image, dim=0)

        results = []
        for prob, boxes_per_img, image_shape in zip(
            class_prob, proposals, image_shapes
        ):
            # build a BoxList with the boxes and their class probabilities
            boxlist = self.prepare_boxlist(boxes_per_img, prob, image_shape)

            # clip the boxes to the image boundaries
            boxlist = boxlist.clip_to_image(remove_empty=False)
            if not self.bbox_aug_enabled:  # If bbox aug is enabled, we will do it later
                boxlist = self.filter_results(boxlist, num_classes)
            results.append(boxlist)
        return results

    def prepare_boxlist(self, boxes, scores, image_shape):
        # build a BoxList and attach the class scores as an extra field
        # boxes has shape (#detections, 4 * #classes)
        # scores has shape (#detections, #classes)
        # scores[i, j] corresponds to box boxes[i, j * 4 : (j + 1) * 4]
        boxes = boxes.reshape(-1, 4)
        scores = scores.reshape(-1)

        # boxes in (x1, y1, x2, y2) format
        boxlist = BoxList(boxes, image_shape, mode="xyxy")

        # attach the class scores to the BoxList
        boxlist.add_field("scores", scores)
        return boxlist

    def filter_results(self, boxlist, num_classes):
        # filter the results by score thresholding and per-class NMS
        # unwrap the boxlist to avoid extra overhead
        boxes = boxlist.bbox.reshape(-1, num_classes * 4)
        scores = boxlist.get_field("scores").reshape(-1, num_classes)

        device = scores.device
        result = []
        # indices of the scores above the threshold
        inds_all = scores > self.score_thresh

        # skip j = 0 (the background class)
        for j in range(1, num_classes):
            inds = inds_all[:, j].nonzero().squeeze(1)

            # scores and boxes of class j
            scores_j = scores[inds, j]
            boxes_j = boxes[inds, j * 4 : (j + 1) * 4]
            boxlist_for_class = BoxList(boxes_j, boxlist.size, mode="xyxy")
            boxlist_for_class.add_field("scores", scores_j)

            # apply NMS within this class
            boxlist_for_class = boxlist_nms(
                boxlist_for_class, self.nms
            )

            num_labels = len(boxlist_for_class)
            boxlist_for_class.add_field(
                "labels", torch.full((num_labels,), j, dtype=torch.int64, device=device)
            )
            result.append(boxlist_for_class)

        result = cat_boxlist(result)
        number_of_detections = len(result)

        # cap the number of detections per image
        if number_of_detections > self.detections_per_img > 0:
            cls_scores = result.get_field("scores")

            # keep the k highest-scoring detections; kthvalue yields the score threshold
            image_thresh, _ = torch.kthvalue(
                cls_scores.cpu(), number_of_detections - self.detections_per_img + 1
            )
            keep = cls_scores >= image_thresh.item()
            keep = torch.nonzero(keep).squeeze(1)
            result = result[keep]
        return result


def make_roi_box_post_processor(cfg):
    # whether FPN is used
    use_fpn = cfg.MODEL.ROI_HEADS.USE_FPN

    # box regression weights
    bbox_reg_weights = cfg.MODEL.ROI_HEADS.BBOX_REG_WEIGHTS
    
    # box coder
    box_coder = BoxCoder(weights=bbox_reg_weights)

    # class score threshold
    score_thresh = cfg.MODEL.ROI_HEADS.SCORE_THRESH
    
    # NMS threshold
    nms_thresh = cfg.MODEL.ROI_HEADS.NMS
    
    # maximum number of detections per image
    detections_per_img = cfg.MODEL.ROI_HEADS.DETECTIONS_PER_IMG
    
    # class-agnostic box regression (foreground/background only)
    cls_agnostic_bbox_reg = cfg.MODEL.CLS_AGNOSTIC_BBOX_REG
    
    # whether bbox test-time augmentation is enabled
    bbox_aug_enabled = cfg.TEST.BBOX_AUG.ENABLED

    # box head post-processor
    postprocessor = PostProcessor(
        score_thresh,
        nms_thresh,
        detections_per_img,
        box_coder,
        cls_agnostic_bbox_reg,
        bbox_aug_enabled
    )
    return postprocessor
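
The detections_per_img cap at the end of filter_results uses torch.kthvalue rather than a full sort: with n detections and a budget of k, the (n - k + 1)-th smallest score is exactly the k-th largest, so keeping scores >= that value keeps the top k (ties at the threshold can keep slightly more). A small sketch of the trick:

import torch

scores = torch.tensor([0.9, 0.3, 0.7, 0.2, 0.8])
detections_per_img = 3
n = scores.numel()

# the (n - k + 1)-th smallest value is the k-th largest
image_thresh, _ = torch.kthvalue(scores, n - detections_per_img + 1)
keep = torch.nonzero(scores >= image_thresh).squeeze(1)
print(image_thresh.item())  # 0.7
print(keep)                 # tensor([0, 2, 4])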

That covers box_head within roi_heads. The other head, mask_head, is the segmentation head of the Mask R-CNN family. The roi_heads/mask_head/ folder is likewise split into mask_head.py, roi_mask_feature_extractors.py, roi_mask_predictors.py, loss.py and inference.py, mirroring box_head. Going through them in turn, first mask_head.py:

# ./maskrcnn_benchmark/modeling/roi_heads/mask_head/mask_head.py

# given BoxLists with a "labels" field, returns only the boxes with labels > 0 (non-background)
def keep_only_positive_boxes(boxes):
    assert isinstance(boxes, (list, tuple))
    assert isinstance(boxes[0], BoxList)
    assert boxes[0].has_field("labels")
    
    # positive (non-background) boxes and their indices
    positive_boxes = []
    positive_inds = []
    num_boxes = 0
    for boxes_per_image in boxes:
        labels = boxes_per_image.get_field("labels")
        
        # boolean mask of the non-background boxes
        inds_mask = labels > 0
        inds = inds_mask.nonzero().squeeze(1)
        positive_boxes.append(boxes_per_image[inds])
        positive_inds.append(inds_mask)
    return positive_boxes, positive_inds


class ROIMaskHead(torch.nn.Module):
    def __init__(self, cfg, in_channels):
        super(ROIMaskHead, self).__init__()
        self.cfg = cfg.clone()
        self.feature_extractor = make_roi_mask_feature_extractor(cfg, in_channels)
        self.predictor = make_roi_mask_predictor(
            cfg, self.feature_extractor.out_channels)
        self.post_processor = make_roi_mask_post_processor(cfg)
        self.loss_evaluator = make_roi_mask_loss_evaluator(cfg)

    def forward(self, features, proposals, targets=None):
        """
        Arguments:
            features: feature maps, possibly from multiple levels (FPN)
            proposals: proposal boxes
            targets: ground-truth targets

        Returns:
            x: the extracted mask features
            proposals: during training, the input proposals; during testing, the predicted boxlists with a "mask" field
            losses: during training, the mask loss; during testing, an empty dict
        """

        if self.training:
            # the mask head is trained only on positive (foreground) proposals
            all_proposals = proposals
            proposals, positive_inds = keep_only_positive_boxes(proposals)
        if self.training and self.cfg.MODEL.ROI_MASK_HEAD.SHARE_BOX_FEATURE_EXTRACTOR:
            # if, during training, the mask head shares the feature extractor with the box head, reuse the box features of the positive proposals
            x = features
            x = x[torch.cat(positive_inds, dim=0)]
        else:
            # otherwise run the mask feature extractor on the feature maps
            x = self.feature_extractor(features, proposals)
        
        # predict the mask logits (one map per class)
        mask_logits = self.predictor(x)

        if not self.training:
            # at test time the losses are an empty dict
            result = self.post_processor(mask_logits, proposals)
            return x, result, {}
        
        # compute the mask loss
        loss_mask = self.loss_evaluator(proposals, mask_logits, targets)
        return x, all_proposals, dict(loss_mask=loss_mask)


def build_roi_mask_head(cfg, in_channels):
    return ROIMaskHead(cfg, in_channels)
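
The filtering done by keep_only_positive_boxes boils down to a boolean mask over the "labels" field. A toy version of the indexing involved, using plain tensors instead of BoxList objects:

import torch

labels = torch.tensor([0, 3, 0, 7, 1])  # 0 = background
inds_mask = labels > 0
inds = inds_mask.nonzero().squeeze(1)
print(inds)       # tensor([1, 3, 4]): the positive boxes kept for mask training
print(inds_mask)  # the boolean mask, also kept to index shared box features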

Next is roi_mask_feature_extractors.py:

# ./maskrcnn_benchmark/modeling/roi_heads/mask_head/roi_mask_feature_extractors.py

# reuse the ResNet-50 conv5 extractor from the box head
registry.ROI_MASK_FEATURE_EXTRACTORS.register(
    "ResNet50Conv5ROIFeatureExtractor", ResNet50Conv5ROIFeatureExtractor
)


@registry.ROI_MASK_FEATURE_EXTRACTORS.register("MaskRCNNFPNFeatureExtractor")
class MaskRCNNFPNFeatureExtractor(nn.Module):
    # head used with FPN
    def __init__(self, cfg, in_channels):
        """
        Arguments:
            num_classes: number of output classes
            input_size: number of channels of the flattened input
            representation_size: size of the intermediate representation
        """
        super(MaskRCNNFPNFeatureExtractor, self).__init__()

        # resolution of the pooled feature map in the mask head (default 14)
        resolution = cfg.MODEL.ROI_MASK_HEAD.POOLER_RESOLUTION

        # pooling scales
        scales = cfg.MODEL.ROI_MASK_HEAD.POOLER_SCALES

        # RoIAlign sampling ratio
        sampling_ratio = cfg.MODEL.ROI_MASK_HEAD.POOLER_SAMPLING_RATIO

        # pooler
        pooler = Pooler(
            output_size=(resolution, resolution),       # pooled to 14 x 14
            scales=scales,
            sampling_ratio=sampling_ratio,
        )
        input_size = in_channels
        self.pooler = pooler

        # whether to use GroupNorm
        use_gn = cfg.MODEL.ROI_MASK_HEAD.USE_GN

        # channel sizes of the conv layers
        layers = cfg.MODEL.ROI_MASK_HEAD.CONV_LAYERS

        # dilation of the conv layers
        dilation = cfg.MODEL.ROI_MASK_HEAD.DILATION

        # initial value for the loop below
        next_feature = input_size
        self.blocks = []
        # build the conv tower in a loop
        for layer_idx, layer_features in enumerate(layers, 1):
            layer_name = "mask_fcn{}".format(layer_idx)
            module = make_conv3x3(
                next_feature, layer_features,
                dilation=dilation, stride=1, use_gn=use_gn
            )
            self.add_module(layer_name, module)
            next_feature = layer_features
            self.blocks.append(layer_name)
        self.out_channels = layer_features

    def forward(self, x, proposals):
        x = self.pooler(x, proposals)

        for layer_name in self.blocks:
            x = F.relu(getattr(self, layer_name)(x))

        return x


def make_roi_mask_feature_extractor(cfg, in_channels):
    func = registry.ROI_MASK_FEATURE_EXTRACTORS[
        cfg.MODEL.ROI_MASK_HEAD.FEATURE_EXTRACTOR
    ]
    return func(cfg, in_channels)

Then roi_mask_predictors.py:

# ./maskrcnn_benchmark/modeling/roi_heads/mask_head/roi_mask_predictors.py

# Mask R-CNN C4 predictor: deconv upsampling followed by a 1x1 conv
@registry.ROI_MASK_PREDICTOR.register("MaskRCNNC4Predictor")
class MaskRCNNC4Predictor(nn.Module):
    def __init__(self, cfg, in_channels):
        super(MaskRCNNC4Predictor, self).__init__()
        
        # number of output mask channels, one per box class (including background)
        num_classes = cfg.MODEL.ROI_BOX_HEAD.NUM_CLASSES
        
        # number of channels after the deconv
        dim_reduced = cfg.MODEL.ROI_MASK_HEAD.CONV_LAYERS[-1]
        
        # number of input channels
        num_inputs = in_channels
        
        # transposed conv: upsamples 2x while reducing the channel dimension
        self.conv5_mask = ConvTranspose2d(num_inputs, dim_reduced, 2, 2, 0)
        
        # 1x1 conv producing one mask logit map per class
        self.mask_fcn_logits = Conv2d(dim_reduced, num_classes, 1, 1, 0)

        # weight initialization
        for name, param in self.named_parameters():
            if "bias" in name:
                nn.init.constant_(param, 0)
            elif "weight" in name:
                # Caffe2 implementation uses MSRAFill, which in fact
                # corresponds to kaiming_normal_ in PyTorch
                nn.init.kaiming_normal_(param, mode="fan_out", nonlinearity="relu")

    def forward(self, x):
        x = F.relu(self.conv5_mask(x))
        return self.mask_fcn_logits(x)


# predict the masks with a single 1x1 conv (no upsampling)
@registry.ROI_MASK_PREDICTOR.register("MaskRCNNConv1x1Predictor")
class MaskRCNNConv1x1Predictor(nn.Module):
    def __init__(self, cfg, in_channels):
        super(MaskRCNNConv1x1Predictor, self).__init__()
        num_classes = cfg.MODEL.ROI_BOX_HEAD.NUM_CLASSES
        num_inputs = in_channels
        # map the input channels to the class channels directly with a 1x1 conv, without upsampling
        self.mask_fcn_logits = Conv2d(num_inputs, num_classes, 1, 1, 0)

        for name, param in self.named_parameters():
            if "bias" in name:
                nn.init.constant_(param, 0)
            elif "weight" in name:
                # Caffe2 implementation uses MSRAFill, which in fact
                # corresponds to kaiming_normal_ in PyTorch
                nn.init.kaiming_normal_(param, mode="fan_out", nonlinearity="relu")

    def forward(self, x):
        return self.mask_fcn_logits(x)


def make_roi_mask_predictor(cfg, in_channels):
    func = registry.ROI_MASK_PREDICTOR[cfg.MODEL.ROI_MASK_HEAD.PREDICTOR]
    return func(cfg, in_channels)
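
Shape-wise, MaskRCNNC4Predictor turns the pooled 14 x 14 RoI features into a 28 x 28 logit map per class. A sketch with dummy tensors; the 256 channels and 81 classes are assumed values matching a common FPN/COCO configuration:

import torch
import torch.nn as nn
import torch.nn.functional as F

num_boxes, in_channels, dim_reduced, num_classes = 8, 256, 256, 81
conv5_mask = nn.ConvTranspose2d(in_channels, dim_reduced, 2, 2, 0)
mask_fcn_logits = nn.Conv2d(dim_reduced, num_classes, 1, 1, 0)

x = torch.randn(num_boxes, in_channels, 14, 14)  # pooled RoI features
x = F.relu(conv5_mask(x))                        # (8, 256, 28, 28): 2x upsampling
logits = mask_fcn_logits(x)                      # (8, 81, 28, 28): one map per class
print(logits.shape)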

Then inference.py:

# ./maskrcnn_benchmark/modeling/roi_heads/mask_head/inference.py


# TODO check if want to return a single BoxList or a composite object
class MaskPostProcessor(nn.Module):
    # the mask post-processor keeps a single mask per box: the channel of the
    # predicted class; the masks have a fixed resolution and come directly from
    # the CNN; the resulting binary masks are added to the BoxList as "mask"
    def __init__(self, masker=None):
        super(MaskPostProcessor, self).__init__()
        self.masker = masker

    def forward(self, x, boxes):
        """
        Arguments:
            x: the mask logits
            boxes: reference boxes, one BoxList per image

        Returns:
            results (list[BoxList]): BoxLists with the extra "mask" field
        """
        # sigmoid gives the per-pixel foreground probability of each mask
        mask_prob = x.sigmoid()

        # number of masks, i.e. the number of detections
        num_masks = x.shape[0]

        # predicted class label of each box
        labels = [bbox.get_field("labels") for bbox in boxes]
        labels = torch.cat(labels)

        # select, for each detection, the mask channel of its predicted class
        index = torch.arange(num_masks, device=labels.device)
        mask_prob = mask_prob[index, labels][:, None]

        # split the masks per image according to the number of detected boxes
        boxes_per_image = [len(box) for box in boxes]
        mask_prob = mask_prob.split(boxes_per_image, dim=0)

        # if a masker is given, paste the masks into the image frame at the box locations
        if self.masker:
            mask_prob = self.masker(mask_prob, boxes)

        results = []
        for prob, box in zip(mask_prob, boxes):
            bbox = BoxList(box.bbox, box.size, mode="xyxy")
            for field in box.fields():
                bbox.add_field(field, box.get_field(field))
            bbox.add_field("mask", prob)
            results.append(bbox)

        return results


# mask post-processor producing COCO-format output (inherits from the class above)
class MaskPostProcessorCOCOFormat(MaskPostProcessor):
    # segment, then encode the masks as COCO RLEs
    def forward(self, x, boxes):
        import pycocotools.mask as mask_util
        import numpy as np

        results = super(MaskPostProcessorCOCOFormat, self).forward(x, boxes)
        for result in results:
            masks = result.get_field("mask").cpu()
            rles = [
                mask_util.encode(np.array(mask[0, :, :, np.newaxis], order="F"))[0]
                for mask in masks
            ]
            for rle in rles:
                rle["counts"] = rle["counts"].decode("utf-8")
            result.add_field("mask", rles)
        return results


# the next two functions should be merged inside Masker
# but are kept here for the moment while we need them
# temporarily for paste_mask_in_image
def expand_boxes(boxes, scale):
    w_half = (boxes[:, 2] - boxes[:, 0]) * .5
    h_half = (boxes[:, 3] - boxes[:, 1]) * .5
    x_c = (boxes[:, 2] + boxes[:, 0]) * .5
    y_c = (boxes[:, 3] + boxes[:, 1]) * .5

    w_half *= scale
    h_half *= scale

    boxes_exp = torch.zeros_like(boxes)
    boxes_exp[:, 0] = x_c - w_half
    boxes_exp[:, 2] = x_c + w_half
    boxes_exp[:, 1] = y_c - h_half
    boxes_exp[:, 3] = y_c + h_half
    return boxes_exp


def expand_masks(mask, padding):
    N = mask.shape[0]
    M = mask.shape[-1]
    pad2 = 2 * padding
    scale = float(M + pad2) / M
    padded_mask = mask.new_zeros((N, 1, M + pad2, M + pad2))

    padded_mask[:, :, padding:-padding, padding:-padding] = mask
    return padded_mask, scale


def paste_mask_in_image(mask, box, im_h, im_w, thresh=0.5, padding=1):
    # Need to work on the CPU, where fp16 isn't supported - cast to float to avoid this
    mask = mask.float()
    box = box.float()

    padded_mask, scale = expand_masks(mask[None], padding=padding)
    mask = padded_mask[0, 0]
    box = expand_boxes(box[None], scale)[0]
    box = box.to(dtype=torch.int32)

    TO_REMOVE = 1
    w = int(box[2] - box[0] + TO_REMOVE)
    h = int(box[3] - box[1] + TO_REMOVE)
    w = max(w, 1)
    h = max(h, 1)

    # Set shape to [batchxCxHxW]
    mask = mask.expand((1, 1, -1, -1))

    # Resize mask
    mask = mask.to(torch.float32)
    mask = interpolate(mask, size=(h, w), mode='bilinear', align_corners=False)
    mask = mask[0][0]

    if thresh >= 0:
        mask = mask > thresh
    else:
        # for visualization and debugging, we also
        # allow it to return an unmodified mask
        mask = (mask * 255).to(torch.uint8)

    im_mask = torch.zeros((im_h, im_w), dtype=torch.uint8)
    x_0 = max(box[0], 0)
    x_1 = min(box[2] + 1, im_w)
    y_0 = max(box[1], 0)
    y_1 = min(box[3] + 1, im_h)

    im_mask[y_0:y_1, x_0:x_1] = mask[
        (y_0 - box[1]) : (y_1 - box[1]), (x_0 - box[0]) : (x_1 - box[0])
    ]
    return im_mask


class Masker(object):
    # pastes the fixed-resolution masks into the image at the locations given by the boxes

    def __init__(self, threshold=0.5, padding=1):
        self.threshold = threshold
        self.padding = padding

    def forward_single_image(self, masks, boxes):
        # convert the boxes to (x1, y1, x2, y2) format
        boxes = boxes.convert("xyxy")
        im_w, im_h = boxes.size
        # resize + pad each low-resolution mask (e.g. 28 x 28) and paste it into the full image at its box location
        res = [
            paste_mask_in_image(mask[0], box, im_h, im_w, self.threshold, self.padding)
            for mask, box in zip(masks, boxes.bbox)
        ]
        if len(res) > 0:
            res = torch.stack(res, dim=0)[:, None]
        else:
            res = masks.new_empty((0, 1, masks.shape[-2], masks.shape[-1]))
        return res

    def __call__(self, masks, boxes):
        if isinstance(boxes, BoxList):
            boxes = [boxes]

        # run some sanity checks
        assert len(boxes) == len(masks), "Masks and boxes should have the same length."

        # TODO:  Is this JIT compatible?
        # If not we should make it compatible.
        results = []
        for mask, box in zip(masks, boxes):
            assert mask.shape[0] == len(box), "Number of objects should be the same."
            result = self.forward_single_image(mask, box)
            results.append(result)
        return results


def make_roi_mask_post_processor(cfg):
    if cfg.MODEL.ROI_MASK_HEAD.POSTPROCESS_MASKS:
        mask_threshold = cfg.MODEL.ROI_MASK_HEAD.POSTPROCESS_MASKS_THRESHOLD
        masker = Masker(threshold=mask_threshold, padding=1)
    else:
        masker = None
    mask_post_processor = MaskPostProcessor(masker)
    return mask_post_processor
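
The per-class selection in MaskPostProcessor.forward is plain advanced indexing: for detection i with predicted label l_i, it keeps channel l_i of mask i. A small sketch with toy shapes (the sizes are illustrative only):

import torch

num_masks, num_classes, M = 3, 5, 4
mask_prob = torch.rand(num_masks, num_classes, M, M)
labels = torch.tensor([2, 0, 4])  # predicted class of each detection

index = torch.arange(num_masks)
selected = mask_prob[index, labels][:, None]
print(selected.shape)  # (3, 1, 4, 4): one single-channel mask per detection
assert torch.equal(selected[0, 0], mask_prob[0, 2])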

Finally, loss.py:

# ./maskrcnn_benchmark/modeling/roi_heads/mask_head/loss.py

def project_masks_on_boxes(segmentation_masks, proposals, discretization_size):
    # given the segmentation masks and the boxes, crops and rescales the masks
    # to the locations of the corresponding boxes
    # segmentation_masks: the segmentation masks of the instances
    # proposals: BoxList of the instances

    masks = []
    M = discretization_size
    device = proposals.bbox.device
    proposals = proposals.convert("xyxy")
    assert segmentation_masks.size == proposals.size, "{}, {}".format(
        segmentation_masks, proposals
    )

    # FIXME: CPU computation bottleneck, this should be parallelized
    proposals = proposals.bbox.to(torch.device("cpu"))
    for segmentation_mask, proposal in zip(segmentation_masks, proposals):
        # crop the gt mask to the proposal, resize it to M x M and convert it
        # to a tensor; this becomes the mask target (gt_mask)
        cropped_mask = segmentation_mask.crop(proposal)
        scaled_mask = cropped_mask.resize((M, M))
        mask = scaled_mask.get_mask_tensor()
        masks.append(mask)
    if len(masks) == 0:
        return torch.empty(0, dtype=torch.float32, device=device)
    return torch.stack(masks, dim=0).to(device, dtype=torch.float32)


class MaskRCNNLossComputation(object):
    def __init__(self, proposal_matcher, discretization_size):
        self.proposal_matcher = proposal_matcher
        self.discretization_size = discretization_size

    def match_targets_to_proposals(self, proposal, target):
        # IoU between the gt boxes and the proposals
        match_quality_matrix = boxlist_iou(target, proposal)

        # matched gt index of each proposal
        matched_idxs = self.proposal_matcher(match_quality_matrix)

        # Mask R-CNN needs "labels" and "masks" to build the targets
        target = target.copy_with_fields(["labels", "masks"])
        # gather the matched gt for each proposal; negative match indices (unmatched or ambiguous proposals) are clamped to 0 so the indexing works
        matched_targets = target[matched_idxs.clamp(min=0)]
        matched_targets.add_field("matched_idxs", matched_idxs)
        return matched_targets

    def prepare_targets(self, proposals, targets):
        # match proposals to targets (build the gt labels and masks)
        labels = []
        masks = []
        for proposals_per_image, targets_per_image in zip(proposals, targets):
            matched_targets = self.match_targets_to_proposals(
                proposals_per_image, targets_per_image
            )
            matched_idxs = matched_targets.get_field("matched_idxs")

            labels_per_image = matched_targets.get_field("labels")
            labels_per_image = labels_per_image.to(dtype=torch.int64)

            # this can probably be removed, but is left here for clarity
            # and completeness
            neg_inds = matched_idxs == Matcher.BELOW_LOW_THRESHOLD
            labels_per_image[neg_inds] = 0

            # mask targets are only computed on positive proposals
            positive_inds = torch.nonzero(labels_per_image > 0).squeeze(1)

            # gt segmentation masks
            segmentation_masks = matched_targets.get_field("masks")

            # keep only the masks of the positive proposals
            segmentation_masks = segmentation_masks[positive_inds]

            # the corresponding positive proposals
            positive_proposals = proposals_per_image[positive_inds]

            # project the gt masks onto the proposals (crop + resize to M x M)
            masks_per_image = project_masks_on_boxes(
                segmentation_masks, positive_proposals, self.discretization_size
            )

            labels.append(labels_per_image)
            masks.append(masks_per_image)

        return labels, masks

    def __call__(self, proposals, mask_logits, targets):
        """
        Arguments:
            proposals (list[BoxList])
            mask_logits (Tensor)
            targets (list[BoxList])

        Return:
            mask_loss (Tensor): scalar tensor containing the loss
        """
        # build the class labels and mask targets, then concatenate them across images
        labels, mask_targets = self.prepare_targets(proposals, targets)
        labels = cat(labels, dim=0)
        mask_targets = cat(mask_targets, dim=0)

        # labels of the positive proposals
        positive_inds = torch.nonzero(labels > 0).squeeze(1)
        labels_pos = labels[positive_inds]

        # torch.mean (used inside the binary cross entropy) does not accept empty tensors, so handle that case separately
        if mask_targets.numel() == 0:
            return mask_logits.sum() * 0

        # mask loss: binary cross entropy ("with logits" means the sigmoid is applied internally)
        mask_loss = F.binary_cross_entropy_with_logits(
            mask_logits[positive_inds, labels_pos], mask_targets
        )
        return mask_loss


def make_roi_mask_loss_evaluator(cfg):
    matcher = Matcher(
        cfg.MODEL.ROI_HEADS.FG_IOU_THRESHOLD,   # foreground IoU threshold
        cfg.MODEL.ROI_HEADS.BG_IOU_THRESHOLD,   # background IoU threshold
        allow_low_quality_matches=False,
    )

    loss_evaluator = MaskRCNNLossComputation(
        matcher, cfg.MODEL.ROI_MASK_HEAD.RESOLUTION
    )

    return loss_evaluator
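
The loss relies on the same advanced indexing seen in the post-processor: mask_logits[positive_inds, labels_pos] pulls out, for each positive proposal, the logit map of its ground-truth class, and the binary cross entropy is averaged over all pixels. A sketch with toy shapes (the sizes are illustrative only):

import torch
import torch.nn.functional as F

num_pos, num_classes, M = 4, 5, 14
mask_logits = torch.randn(num_pos, num_classes, M, M)
labels_pos = torch.tensor([1, 3, 3, 2])  # gt class of each positive proposal
mask_targets = (torch.rand(num_pos, M, M) > 0.5).float()  # projected gt masks

positive_inds = torch.arange(num_pos)
loss = F.binary_cross_entropy_with_logits(
    mask_logits[positive_inds, labels_pos], mask_targets
)
print(loss)  # scalar loss, averaged over every pixel of every positive box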
