YOLOv8 takes feature maps of different sizes from different feature levels and, for each grid cell of each feature map, predicts the class probabilities of that cell's anchor as well as the object's bounding box, i.e., the offsets of the box edges from the cell's center point.
The shapes are [(1, 144, 80, 80), (1, 144, 40, 40), (1, 144, 20, 20)].
The input x is the list of results obtained from the different upsampling levels:
x_cat = torch.cat([xi.view(shape[0], self.no, -1) for xi in x], 2)
box, cls = x_cat.split((self.reg_max * 4, self.nc), 1)  # (1, 64, 8400), (1, 80, 8400)
Concatenating these results yields a tensor of shape (1, 144, 8400), where:
8400 = 80×80 + 40×40 + 20×20 is the total number of predictions;
144 covers the 80 classes plus the 4×16 box channels;
4 stands for the four predicted distances from the center point to the box edges, the Anchor-Free regression target, in [left, top, right, bottom] format.
self.reg_max = 16 is the maximum prediction range from the center point, i.e., a box edge can be at most 16 units from the center. These are not 16 pixels, because the predictions are rescaled by the stride of each level. This parameter therefore also caps the largest detectable box at reg_max * stride * 2.
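To make the shape bookkeeping concrete, here is a minimal sketch with dummy tensors standing in for the three head outputs (the values are random; only the shapes matter):

import torch

nc, reg_max = 80, 16
no = nc + reg_max * 4                                  # 144 outputs per anchor
x = [torch.randn(1, no, s, s) for s in (80, 40, 20)]   # dummy outputs of the three levels

shape = x[0].shape                                     # (1, 144, 80, 80)
x_cat = torch.cat([xi.view(shape[0], no, -1) for xi in x], 2)
print(x_cat.shape)                                     # torch.Size([1, 144, 8400])

box, cls = x_cat.split((reg_max * 4, nc), 1)
print(box.shape, cls.shape)                            # torch.Size([1, 64, 8400]) torch.Size([1, 80, 8400])

# Largest detectable box by the rule above, on the stride-32 level:
# reg_max * stride * 2 = 16 * 32 * 2 = 1024 pixels per axis.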
self.anchors, self.strides = (x.transpose(0, 1) for x in make_anchors(x, self.stride, 0.5))
# (2, 8400), (1, 8400)
self.shape = shape  # (1, 144, 80, 80)
import torch

TORCH_1_10 = True  # stand-in for the ultralytics version check: torch >= 1.10 (meshgrid supports `indexing`)

def make_anchors(feats, strides, grid_cell_offset=0.5):
    """Generate anchors from features."""
    anchor_points, stride_tensor = [], []
    assert feats is not None
    dtype, device = feats[0].dtype, feats[0].device
    for i, stride in enumerate(strides):
        _, _, h, w = feats[i].shape
        sx = torch.arange(end=w, device=device, dtype=dtype) + grid_cell_offset  # shift x
        sy = torch.arange(end=h, device=device, dtype=dtype) + grid_cell_offset  # shift y
        sy, sx = torch.meshgrid(sy, sx, indexing="ij") if TORCH_1_10 else torch.meshgrid(sy, sx)
        anchor_points.append(torch.stack((sx, sy), -1).view(-1, 2))  # cell centers, flattened to (h*w, 2)
        stride_tensor.append(torch.full((h * w, 1), stride, dtype=dtype, device=device))
    return torch.cat(anchor_points), torch.cat(stride_tensor)
self.anchors[:, :10]
tensor([[0.5000, 1.5000, 2.5000, 3.5000, 4.5000, 5.5000, 6.5000, 7.5000, 8.5000, 9.5000],
        [0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.5000]], device='cuda:0')
self.strides[:, :10]
tensor([[8., 8., 8., 8., 8., 8., 8., 8., 8., 8.]], device='cuda:0')
make_anchors generates the grid points used for prediction. Here x has shapes [(1, 144, 80, 80), (1, 144, 40, 40), (1, 144, 20, 20)] and self.stride is tensor([8., 16., 32.]).
For the 80×80 feature map it produces 80×80 anchors and 80×80 stride entries; each anchor is the center of a 1×1 grid cell, and the stride is the scaling factor back to input-image pixels. Larger feature maps have smaller strides and are used to predict small objects.
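A quick sanity check, calling make_anchors on dummy feature maps (only the shapes and grid geometry matter); the first anchors are the cell centers of the stride-8 level, matching the tensors printed above:

import torch

feats = [torch.randn(1, 144, s, s) for s in (80, 40, 20)]
anchor_points, stride_tensor = make_anchors(feats, torch.tensor([8., 16., 32.]), 0.5)
print(anchor_points.shape, stride_tensor.shape)  # torch.Size([8400, 2]) torch.Size([8400, 1])
print(anchor_points[:3])                         # tensor([[0.5000, 0.5000], [1.5000, 0.5000], [2.5000, 0.5000]])
print(stride_tensor[0], stride_tensor[-1])       # tensor([8.]) tensor([32.])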
dbox = self.decode_bboxes(self.dfl(box), self.anchors.unsqueeze(0)) * self.strides
# (1, 4, 8400), (1, 2, 8400) => (1, 4, 8400)
import torch
import torch.nn as nn

class DFL(nn.Module):
    """Integral module of Distribution Focal Loss (DFL)."""

    def __init__(self, c1=16):
        """Initialize a convolutional layer with a given number of input channels."""
        super().__init__()
        self.conv = nn.Conv2d(c1, 1, 1, bias=False).requires_grad_(False)
        x = torch.arange(c1, dtype=torch.float)
        self.conv.weight.data[:] = nn.Parameter(x.view(1, c1, 1, 1))  # frozen weights 0..c1-1
        self.c1 = c1

    def forward(self, x):
        """Applies the DFL integral to input tensor 'x' and returns the expected distances."""
        # (1, 64, 8400) => (1, 4, 16, 8400) => (1, 16, 4, 8400) => (1, 1, 4, 8400) => (1, 4, 8400)
        b, _, a = x.shape  # batch, channels, anchors
        return self.conv(x.view(b, 4, self.c1, a).transpose(2, 1).softmax(1)).view(b, 4, a)
def decode_bboxes(self, bboxes, anchors):
    """Decode bounding boxes."""
    if self.export:
        return dist2bbox(bboxes, anchors, xywh=False, dim=1)
    return dist2bbox(bboxes, anchors, xywh=True, dim=1)
def dist2bbox(distance, anchor_points, xywh=True, dim=-1):
    """Transform distance (ltrb) to box (xywh or xyxy)."""
    lt, rb = distance.chunk(2, dim)
    x1y1 = anchor_points - lt
    x2y2 = anchor_points + rb
    if xywh:
        c_xy = (x1y1 + x2y2) / 2
        wh = x2y2 - x1y1
        return torch.cat((c_xy, wh), dim)  # xywh bbox
    return torch.cat((x1y1, x2y2), dim)  # xyxy bbox
self.dfl(box) computes the box offsets:
x.view(b, 4, self.c1, a).transpose(2, 1).softmax(1) first reshapes box from (1, 64, 8400) => (1, 4, 16, 8400) => (1, 16, 4, 8400), then applies softmax over dim=1, assigning a weight to each of the 16 candidate distances.
Because self.conv was created with requires_grad_(False), its weights stay fixed at x = torch.arange(c1, dtype=torch.float), i.e., tensor([0., 1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15.]). The nn.Conv2d(c1, 1, 1, bias=False) then multiplies the softmax weights by these bin values and sums them, producing the final offset.
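In other words, DFL outputs the expectation of a 16-bin discrete distribution for each of the four sides; the frozen 1×1 convolution is just a compact way of writing the weighted sum. A minimal check, reusing the DFL class above:

import torch

dfl = DFL(16)
box = torch.randn(1, 64, 8400)
out_conv = dfl(box)                                     # (1, 4, 8400)

# The same computation written out as an explicit expectation over the 16 bins:
p = box.view(1, 4, 16, 8400).softmax(2)                 # per-side distribution over bins
bins = torch.arange(16, dtype=torch.float).view(1, 1, 16, 1)
out_manual = (p * bins).sum(2)                          # (1, 4, 8400)

print(torch.allclose(out_conv, out_manual, atol=1e-6))  # True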
decode_bboxes uses the dist2bbox function. The box format is [left, top, right, bottom]; the distances are split into two halves: subtracting (left, top) from the center point gives the top-left corner x1y1, and adding (right, bottom) gives the bottom-right corner x2y2. This yields coordinates in xyxy format (which can also be converted to xywh), and multiplying by the corresponding stride gives the final coordinates, of shape (1, 4, 8400).
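A tiny numeric example of dist2bbox: for an anchor at (4.5, 4.5) and predicted distances [1, 2, 3, 4], the corners land at (4.5 - 1, 4.5 - 2) and (4.5 + 3, 4.5 + 4); these grid-unit coordinates would then be scaled by the stride:

import torch

anchor = torch.tensor([[4.5, 4.5]])         # one anchor point (x, y), in grid units
dist = torch.tensor([[1., 2., 3., 4.]])     # [left, top, right, bottom]

print(dist2bbox(dist, anchor, xywh=False))  # tensor([[3.5000, 2.5000, 7.5000, 8.5000]])
print(dist2bbox(dist, anchor, xywh=True))   # tensor([[5.5000, 5.5000, 4.0000, 6.0000]])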
y = torch.cat((dbox, cls.sigmoid()), 1)  # (1, 84, 8400)
The predicted coordinates and class probabilities are concatenated to produce the final returned result.
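Downstream code (e.g., NMS) can slice y back apart; a minimal sketch of how the 84 channels are laid out:

boxes = y[:, :4]                # (1, 4, 8400)  xywh in input-image pixels
scores = y[:, 4:]               # (1, 80, 8400) class probabilities after sigmoid
conf, cls_id = scores.max(1)    # best score and class index per prediction, each (1, 8400)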
Full code:
import math

import torch
import torch.nn as nn

# Conv, DFL, dist2bbox and make_anchors are the ultralytics building blocks shown above.

class Detect(nn.Module):
    """YOLOv8 Detect head for detection models."""

    dynamic = False  # force grid reconstruction
    export = False  # export mode
    shape = None
    anchors = torch.empty(0)  # init
    strides = torch.empty(0)  # init

    def __init__(self, nc=80, ch=()):
        """Initializes the YOLOv8 detection layer with specified number of classes and channels."""
        super().__init__()
        self.nc = nc  # number of classes
        self.nl = len(ch)  # number of detection layers
        self.reg_max = 16  # DFL channels (ch[0] // 16 to scale 4/8/12/16/20 for n/s/m/l/x)
        self.no = nc + self.reg_max * 4  # number of outputs per anchor
        self.stride = torch.zeros(self.nl)  # strides computed during build
        c2, c3 = max((16, ch[0] // 4, self.reg_max * 4)), max(ch[0], min(self.nc, 100))  # channels
        self.cv2 = nn.ModuleList(
            nn.Sequential(Conv(x, c2, 3), Conv(c2, c2, 3), nn.Conv2d(c2, 4 * self.reg_max, 1)) for x in ch
        )
        self.cv3 = nn.ModuleList(nn.Sequential(Conv(x, c3, 3), Conv(c3, c3, 3), nn.Conv2d(c3, self.nc, 1)) for x in ch)
        self.dfl = DFL(self.reg_max) if self.reg_max > 1 else nn.Identity()

    def inference(self, x):  # [(1, 144, 80, 80), (1, 144, 40, 40), (1, 144, 20, 20)]
        # Inference path
        shape = x[0].shape  # BCHW (1, 144, 80, 80)
        x_cat = torch.cat([xi.view(shape[0], self.no, -1) for xi in x], 2)  # (1, 144, 8400)
        if self.dynamic or self.shape != shape:
            self.anchors, self.strides = (x.transpose(0, 1) for x in make_anchors(x, self.stride, 0.5))  # (2, 8400), (1, 8400)
            self.shape = shape  # (1, 144, 80, 80)

        if self.export and self.format in ("saved_model", "pb", "tflite", "edgetpu", "tfjs"):  # avoid TF FlexSplitV ops
            box = x_cat[:, : self.reg_max * 4]
            cls = x_cat[:, self.reg_max * 4 :]
        else:
            box, cls = x_cat.split((self.reg_max * 4, self.nc), 1)  # (1, 64, 8400), (1, 80, 8400)

        if self.export and self.format in ("tflite", "edgetpu"):
            # Precompute normalization factor to increase numerical stability
            # See https://github.com/ultralytics/ultralytics/issues/7371
            grid_h = shape[2]
            grid_w = shape[3]
            grid_size = torch.tensor([grid_w, grid_h, grid_w, grid_h], device=box.device).reshape(1, 4, 1)
            norm = self.strides / (self.stride[0] * grid_size)
            dbox = self.decode_bboxes(self.dfl(box) * norm, self.anchors.unsqueeze(0) * norm[:, :2])
        else:
            dbox = self.decode_bboxes(self.dfl(box), self.anchors.unsqueeze(0)) * self.strides  # (1, 4, 8400), (1, 2, 8400) => (1, 4, 8400)

        y = torch.cat((dbox, cls.sigmoid()), 1)  # (1, 84, 8400)
        return y if self.export else (y, x)

    def forward_feat(self, x, cv2, cv3):
        y = []
        for i in range(self.nl):
            y.append(torch.cat((cv2[i](x[i]), cv3[i](x[i])), 1))
        return y

    def forward(self, x):
        """Concatenates and returns predicted bounding boxes and class probabilities."""
        y = self.forward_feat(x, self.cv2, self.cv3)
        if self.training:
            return y
        return self.inference(y)

    def bias_init(self):
        """Initialize Detect() biases, WARNING: requires stride availability."""
        m = self  # self.model[-1]  # Detect() module
        # cf = torch.bincount(torch.tensor(np.concatenate(dataset.labels, 0)[:, 0]).long(), minlength=nc) + 1
        # ncf = math.log(0.6 / (m.nc - 0.999999)) if cf is None else torch.log(cf / cf.sum())  # nominal class frequency
        for a, b, s in zip(m.cv2, m.cv3, m.stride):  # from
            a[-1].bias.data[:] = 1.0  # box
            b[-1].bias.data[: m.nc] = math.log(5 / m.nc / (640 / s) ** 2)  # cls (.01 objects, 80 classes, 640 img)

    def decode_bboxes(self, bboxes, anchors):
        """Decode bounding boxes."""
        if self.export:
            return dist2bbox(bboxes, anchors, xywh=False, dim=1)
        return dist2bbox(bboxes, anchors, xywh=True, dim=1)
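Finally, a minimal smoke test of the Detect head above. It assumes the ultralytics package is installed so its Conv block can be imported (the module path may vary across versions), and sets the strides by hand since they are normally filled in during model build:

import torch
from ultralytics.nn.modules.conv import Conv  # Conv block used by cv2/cv3

m = Detect(nc=80, ch=(64, 128, 256))
m.stride = torch.tensor([8., 16., 32.])       # normally computed during model build
m.eval()                                      # training mode would return the raw per-level maps

feats = [torch.randn(1, 64, 80, 80), torch.randn(1, 128, 40, 40), torch.randn(1, 256, 20, 20)]
y, raw = m(feats)
print(y.shape)                                # torch.Size([1, 84, 8400])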
Reference:
YOLOv8详解:损失函数、Anchor-Free、样本分配策略,以及与v5的对比 (CSDN blog)