This is an original article. Please credit the source if you repost it.
Original address: https://blog.csdn.net/qq_30270773/article/details/143529308
The project's GitHub address: https://github.com/IntptrMax/YoloSharp
The "Deep Learning in C#" column: https://blog.csdn.net/qq_30270773/category_12829217.html
Follow me on GitHub for more material, and please give a star to any project you find interesting: https://github.com/IntptrMax
Project Background
YOLOv8 is an upgraded version of YOLOv5. It has a certain advantage over YOLOv5 in training and inference speed and is also a very widely used deep learning model for visual recognition. As with most deep learning models used across platforms, it is usually trained on the Python platform and then exported as an ONNX model for other platforms to consume. This approach always requires a conversion step, and the deployment platform usually differs from the training platform, so there is a clear gap between the two.
I previously published material on how to train a YOLOv5 model on a platform other than Python, which is probably the only publicly available material of its kind. Here I continue by publishing material on how to train a YOLOv8 model outside Python, again the only such material on the web. If this material helps you, please give me a star on GitHub. The project's GitHub link is https://github.com/IntptrMax/YoloSharp
Algorithm Implementation
Since the YOLOv8 algorithm is derived from YOLOv5 and the two are similar in most places, this article only covers the differences. For the YOLOv5 implementation, please refer to https://blog.csdn.net/qq_30270773/article/details/143529308
Model Structure
The YOLOv8 model consists of two main modules, the Backbone and the Head. The official YOLOv8 models also come in several sizes: n, s, m, l and x. The larger the size, the more complex the model and the better the results, but also the more resources it consumes. In this project, models of different sizes can be obtained by adjusting the two parameters depth_multiple and width_multiple.
The official YOLOv8 scale definition is as follows (a short sketch of how these scales are applied follows the configuration):
scales: # model compound scaling constants, i.e. 'model=yolov8n.yaml' will call yolov8.yaml with scale 'n'
# [depth, width, max_channels]
n: [0.33, 0.25, 1024] # YOLOv8n summary: 225 layers, 3157200 parameters, 3157184 gradients, 8.9 GFLOPs
s: [0.33, 0.50, 1024] # YOLOv8s summary: 225 layers, 11166560 parameters, 11166544 gradients, 28.8 GFLOPs
m: [0.67, 0.75, 768] # YOLOv8m summary: 295 layers, 25902640 parameters, 25902624 gradients, 79.3 GFLOPs
l: [1.00, 1.00, 512] # YOLOv8l summary: 365 layers, 43691520 parameters, 43691504 gradients, 165.7 GFLOPs
x: [1.00, 1.25, 512] # YOLOv8x summary: 365 layers, 68229648 parameters, 68229632 gradients, 258.5 GFLOPs
# YOLOv8.0n backbone
backbone:
# [from, repeats, module, args]
- [-1, 1, Conv, [64, 3, 2]] # 0-P1/2
- [-1, 1, Conv, [128, 3, 2]] # 1-P2/4
- [-1, 3, C2f, [128, True]]
- [-1, 1, Conv, [256, 3, 2]] # 3-P3/8
- [-1, 6, C2f, [256, True]]
- [-1, 1, Conv, [512, 3, 2]] # 5-P4/16
- [-1, 6, C2f, [512, True]]
- [-1, 1, Conv, [1024, 3, 2]] # 7-P5/32
- [-1, 3, C2f, [1024, True]]
- [-1, 1, SPPF, [1024, 5]] # 9
# YOLOv8.0n head
head:
- [-1, 1, nn.Upsample, [None, 2, "nearest"]]
- [[-1, 6], 1, Concat, [1]] # cat backbone P4
- [-1, 3, C2f, [512]] # 12
- [-1, 1, nn.Upsample, [None, 2, "nearest"]]
- [[-1, 4], 1, Concat, [1]] # cat backbone P3
- [-1, 3, C2f, [256]] # 15 (P3/8-small)
- [-1, 1, Conv, [256, 3, 2]]
- [[-1, 12], 1, Concat, [1]] # cat head P4
- [-1, 3, C2f, [512]] # 18 (P4/16-medium)
- [-1, 1, Conv, [512, 3, 2]]
- [[-1, 9], 1, Concat, [1]] # cat head P5
- [-1, 3, C2f, [1024]] # 21 (P5/32-large)
- [[15, 18, 21], 1, Detect, [nc]] # Detect(P3, P4, P5)
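As a rough illustration of how the depth and width constants above translate into concrete layer repeats and channel counts, here is a minimal sketch of my own (the helper names are made up and this is not code from the YoloSharp repository):
// Illustrative only: how depth/width scaling is commonly applied in YOLOv8-style models.
static int ScaleDepth(int repeats, double depthMultiple) =>
    repeats > 1 ? Math.Max(1, (int)Math.Round(repeats * depthMultiple)) : repeats;

static int ScaleWidth(int channels, double widthMultiple, int maxChannels = 1024) =>
    (int)(Math.Ceiling(Math.Min(channels, maxChannels) * widthMultiple / 8.0) * 8); // round up to a multiple of 8

// Example for the 'n' scale (depth 0.33, width 0.25):
// the backbone entry "- [-1, 6, C2f, [256, True]]" becomes ScaleDepth(6, 0.33) = 2 repeats
// with ScaleWidth(256, 0.25) = 64 channels.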
Compared with YOLOv5, YOLOv8 replaces the C3 module with the C2f module; the other parts stay the same. The Head also drops two Conv layers, which speeds up training and inference.
The C2f layer is implemented as follows:
public class C2f : Module<Tensor, Tensor>
{
    private readonly Conv cv1;
    private readonly Conv cv2;
    private readonly Sequential m = Sequential();

    public C2f(int inChannels, int outChannels, int n = 1, bool shortcut = false, int groups = 1, float e = 0.5f) : base("C2f")
    {
        int c = (int)(outChannels * e); // hidden channels
        this.cv1 = new Conv(inChannels, 2 * c, 1, 1);
        this.cv2 = new Conv((2 + n) * c, outChannels, 1); // optional act=FReLU(c2)
        for (int i = 0; i < n; i++)
        {
            m = m.append(new Bottleneck(c, c, (3, 3), shortcut, groups, e));
        }
        RegisterComponents();
    }

    public override Tensor forward(Tensor input)
    {
        // Split cv1's output into two halves along the channel dimension, then pass
        // the last branch through each Bottleneck in turn, keeping every intermediate result.
        var y = this.cv1.forward(input).chunk(2, 1).ToList();
        for (int i = 0; i < m.Count; i++)
        {
            y.Add(m[i].call(y.Last()));
        }
        // Concatenate all branches along the channel dimension and fuse them with cv2.
        return cv2.forward(cat(y, 1));
    }
}
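As a quick sanity check (a usage sketch of my own, not taken from the repository), a C2f block keeps the spatial size and maps inChannels to outChannels:
var c2f = new C2f(inChannels: 64, outChannels: 64, n: 2, shortcut: true);
Tensor output = c2f.forward(torch.randn(1, 64, 80, 80)); // expected shape: (1, 64, 80, 80)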
Compared with the C3 layer, one Conv operation is removed here as well, so the speed can be improved further.
From the above, YOLOv8 puts more emphasis on speed than YOLOv5 does.
The Detect layer of YOLOv8 also differs from YOLOv5's. YOLOv5's Detect layer carries an objectness (confidence) term for each prediction; YOLOv8 drops this term and uses the per-class probabilities directly as the final result, so the output is smaller. Overall, YOLOv8's Detect is conceptually more involved than YOLOv5's, but the results coming out of the Detect layer are easier to process. The detailed code is as follows:
public class Yolov8Detect : Module<Tensor[], Tensor[]>
{
    internal bool end2end = false; // end2end
    private int max_det = 300; // max_det
    private long[] shape = null;
    private Tensor anchors = torch.empty(0); // init
    private Tensor strides = torch.empty(0); // init
    private readonly int nc;
    private readonly int nl;
    private readonly int reg_max;
    private readonly int no;
    private readonly int[] stride;
    private readonly ModuleList<Sequential> cv2 = new ModuleList<Sequential>();
    private readonly ModuleList<Sequential> cv3 = new ModuleList<Sequential>();
    private readonly Module<Tensor, Tensor> dfl;

    public Yolov8Detect(int nc, int[] ch, bool end2end = true) : base("Yolov8Detect")
    {
        this.end2end = end2end;
        this.nc = nc; // number of classes
        this.nl = ch.Length; // number of detection layers
        this.reg_max = 16; // DFL channels (ch[0] // 16 to scale 4/8/12/16/20 for n/s/m/l/x)
        this.no = nc + this.reg_max * 4; // number of outputs per anchor
        this.stride = new int[] { 8, 16, 32 }; // strides computed during build
        int c2 = Math.Max(Math.Max(16, ch[0] / 4), this.reg_max * 4);
        int c3 = Math.Max(ch[0], Math.Min(this.nc, 100)); // channels
        foreach (int x in ch)
        {
            cv2.append(Sequential(new Conv(x, c2, 3), new Conv(c2, c2, 3), nn.Conv2d(c2, 4 * this.reg_max, 1)));
            cv3.append(nn.Sequential(new Conv(x, c3, 3), new Conv(c3, c3, 3), nn.Conv2d(c3, this.nc, 1)));
        }
        this.dfl = this.reg_max > 1 ? new DFL(this.reg_max) : nn.Identity();
        RegisterComponents();
    }

    public override Tensor[] forward(Tensor[] x)
    {
        for (int i = 0; i < nl; i++)
        {
            x[i] = torch.cat(new[] { cv2[i].forward(x[i]), cv3[i].forward(x[i]) }, 1);
        }
        if (training)
        {
            return x;
        }
        else
        {
            var shape = x[0].shape; // BCHW
            Tensor y = _inference(x);
            return new Tensor[] { y }.Concat(x).ToArray();
        }
    }

    // Decode predicted bounding boxes and class probabilities based on multiple-level feature maps.
    private Tensor _inference(Tensor[] x)
    {
        long[] shape = x[0].shape; // BCHW
        List<Tensor> xi_mix = new List<Tensor>();
        foreach (var xi in x)
        {
            xi_mix.Add(xi.view(shape[0], this.no, -1));
        }
        Tensor x_cat = torch.cat(xi_mix, 2);
        // Recompute anchors and strides only when the feature-map shape actually changes
        // (compare array contents, not references).
        if (this.shape == null || !this.shape.SequenceEqual(shape))
        {
            var (anchors, strides) = make_anchors(x, this.stride, 0.5f);
            this.anchors = anchors.transpose(0, 1);
            this.strides = strides.transpose(0, 1);
            this.shape = shape;
        }
        Tensor[] box_cls = x_cat.split([this.reg_max * 4, this.nc], 1);
        Tensor box = box_cls[0];
        Tensor cls = box_cls[1];
        Tensor dbox = decode_bboxes(this.dfl.forward(box), this.anchors.unsqueeze(0)) * this.strides;
        return torch.cat([dbox, cls.sigmoid()], 1);
    }

    // Decode bounding boxes.
    private Tensor decode_bboxes(Tensor bboxes, Tensor anchors)
    {
        return dist2bbox(bboxes, anchors, xywh: true, dim: 1);
    }

    // Transform distance (ltrb) to box (xywh or xyxy).
    private Tensor dist2bbox(Tensor distance, Tensor anchor_points, bool xywh = true, int dim = -1)
    {
        Tensor[] ltrb = distance.chunk(2, dim);
        Tensor lt = ltrb[0];
        Tensor rb = ltrb[1];
        Tensor x1y1 = anchor_points - lt;
        Tensor x2y2 = anchor_points + rb;
        if (xywh)
        {
            Tensor c_xy = (x1y1 + x2y2) / 2;
            Tensor wh = x2y2 - x1y1;
            return torch.cat([c_xy, wh], dim); // xywh bbox
        }
        return torch.cat([x1y1, x2y2], dim); // xyxy bbox
    }

    private (Tensor, Tensor) make_anchors(Tensor[] feats, int[] strides, float grid_cell_offset = 0.5f)
    {
        ScalarType dtype = feats[0].dtype;
        Device device = feats[0].device;
        List<Tensor> anchor_points = new List<Tensor>();
        List<Tensor> stride_tensor = new List<Tensor>();
        for (int i = 0; i < strides.Length; i++)
        {
            long h = feats[i].shape[2];
            long w = feats[i].shape[3];
            Tensor sx = torch.arange(w, device: device, dtype: dtype) + grid_cell_offset; // shift x
            Tensor sy = torch.arange(h, device: device, dtype: dtype) + grid_cell_offset; // shift y
            Tensor[] sy_sx = torch.meshgrid([sy, sx], indexing: "ij");
            sy = sy_sx[0];
            sx = sy_sx[1];
            anchor_points.Add(torch.stack([sx, sy], -1).view(-1, 2));
            stride_tensor.Add(torch.full([h * w, 1], strides[i], dtype: dtype, device: device));
        }
        return (torch.cat(anchor_points), torch.cat(stride_tensor));
    }

    private Tensor postprocess(Tensor preds, int max_det, int nc = 80)
    {
        // Post-processes YOLO model predictions.
        // Args:
        //   preds (torch.Tensor): Raw predictions with shape (batch_size, num_anchors, 4 + nc) with last
        //     dimension format [x, y, w, h, class_probs].
        //   max_det (int): Maximum detections per image.
        //   nc (int, optional): Number of classes. Default: 80.
        // Returns:
        //   (torch.Tensor): Processed predictions with shape (batch_size, min(max_det, num_anchors), 6) and
        //     last dimension format [x, y, w, h, max_class_prob, class_index].
        long batch_size = preds.shape[0]; // i.e. shape(16,8400,84)
        int anchors = (int)preds.shape[1];
        Tensor[] boxes_scores = preds.split([4, nc], dim: -1);
        Tensor boxes = boxes_scores[0];
        Tensor scores = boxes_scores[1];
        Tensor index = scores.amax(-1).topk(Math.Min(max_det, anchors)).indexes.unsqueeze(-1);
        boxes = boxes.gather(dim: 1, index: index.repeat(1, 1, 4));
        scores = scores.gather(dim: 1, index: index.repeat(1, 1, nc));
        (scores, index) = scores.flatten(1).topk(Math.Min(max_det, anchors));
        Tensor i = torch.arange(batch_size)[TensorIndex.Ellipsis, TensorIndex.None]; // batch indices
        return torch.cat([boxes[i, index / nc], scores[TensorIndex.Ellipsis, TensorIndex.None], (index % nc)[TensorIndex.Ellipsis, TensorIndex.None].@float()], dim: -1);
    }
}
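Because the class probabilities already leave Detect with sigmoid applied, turning the eval-mode output into detections for one image only needs a confidence filter (plus NMS, which is omitted here). The following is a minimal sketch of my own, assuming the output shape (batch, 4 + nc, anchors) produced by the forward method above; it is not code from the repository:
// Hedged sketch: decode the first tensor returned by Yolov8Detect.forward() in eval mode.
// Its shape is (batch, 4 + nc, anchors); each column is [cx, cy, w, h, class_probs...]
// already scaled back to input-image pixels. A real pipeline would still run NMS afterwards.
Tensor DecodeDetections(Tensor pred, int nc, float confThreshold = 0.25f)
{
    Tensor p = pred[0].transpose(0, 1);                   // first image: (anchors, 4 + nc)
    Tensor[] parts = p.split(new long[] { 4, nc }, 1);
    Tensor boxes = parts[0];                              // cx, cy, w, h
    Tensor scores = parts[1];                             // per-class probabilities (already sigmoid-ed)
    Tensor conf = scores.amax(-1);                        // best class score per anchor
    Tensor cls = scores.argmax(-1);                       // best class index per anchor
    Tensor keep = conf > confThreshold;                   // simple confidence filter
    return torch.cat(new[] { boxes[keep], conf[keep].unsqueeze(-1), cls[keep].unsqueeze(-1).@float() }, 1);
    // result: (n, 6) rows of [cx, cy, w, h, score, class]
}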
The parts that are the same as YOLOv5 will not be explained again; please refer to the material I published earlier.
Project Outlook
Training and inference for YOLOv5 and YOLOv8 have already been implemented, and I am now working on implementations of the Tsinghua YOLOv10 and YOLOv11. Compared with the current YOLOv5 and YOLOv8, these two models change the Detect layer and the C2PSA layer respectively, and the C2PSA layer also introduces a self-attention mechanism. If you are interested, you are welcome to help me improve this project.
Closing Remarks
Doing deep learning projects in C# is something many people hope for, but there is little material in this direction and development is difficult. The usual approach is to train in Python, convert the model to ONNX, and then call it from C#.
I hope to change this and to make training and inference possible entirely on the C# platform. The road ahead is long and difficult, and I hope interested readers will join me in making the C# deep learning development environment more complete so that it can help more people.
I have published the complete code on GitHub; the project address is https://github.com/IntptrMax/YoloSharp, and I hope you will give it a star. My GitHub also hosts the GGMLSharp project, another deep learning toolkit for the C# platform, and I hope it can get your support as well.
Project Download Link
https://download.csdn.net/download/qq_30270773/89958343
The download links for the weights supported by this project are on my GitHub project; five sets of initial weights (n, s, m, l, x) are provided and can be downloaded as needed.