
MindSpore 25-Day Study Camp, Day 21 | Popular LLMs and Other AI Applications: Garbage Classification Based on MobileNetV2

MobileNetV2

Depthwise separable convolution is performed in two steps:

  1. Depthwise convolution: each input channel is convolved separately, without mixing information across channels.
  2. Pointwise convolution: a 1x1 convolution mixes the information from different channels.

This decomposition greatly reduces both computation and parameter count, because it splits the expensive standard convolution into two much simpler operations.
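As a rough illustration of the savings (a minimal sketch with arbitrary example channel counts, not part of the original notes), the following compares the parameter count of a standard 3x3 convolution with that of its depthwise separable replacement in MindSpore:

import mindspore.nn as nn

in_c, out_c = 32, 64

# Standard 3x3 convolution: 3*3*32*64 = 18432 weights
standard = nn.Conv2d(in_c, out_c, kernel_size=3, pad_mode='pad', padding=1)

# Depthwise 3x3 convolution (group=in_c): 3*3*32 = 288 weights
depthwise = nn.Conv2d(in_c, in_c, kernel_size=3, pad_mode='pad', padding=1, group=in_c)
# Pointwise 1x1 convolution: 32*64 = 2048 weights
pointwise = nn.Conv2d(in_c, out_c, kernel_size=1)

count = lambda cell: sum(p.size for p in cell.get_parameters())
print(count(standard))                      # 18432
print(count(depthwise) + count(pointwise))  # 2336, roughly 8x fewer parameters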

Inverted Residuals: a structure that is narrow at both ends and wide in the middle, combined with depthwise convolution.

A traditional (bottleneck) residual block first reduces the channel dimension with a 1x1 convolution, performs the main computation, and then expands back; the inverted residual structure does the opposite. It first expands the channels with a 1x1 convolution, applies a depthwise convolution, and then compresses the channels with another 1x1 convolution. The main steps are (see the worked example after this list):

  1. Expand channels: a 1x1 convolution raises the number of input channels to a higher dimension, so the network can capture richer feature information in the high-dimensional space.
  2. Depthwise convolution: a 3x3 depthwise convolution is applied in the expanded high-dimensional space.
  3. Compress channels: another 1x1 convolution projects the channels back down to the block's output dimension.
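As a quick worked example (channel sizes are illustrative only; this relies on the InvertedResidual cell defined in the model-construction section below): with 24 input channels, 24 output channels, stride 1 and expand_ratio=6, the block expands to 24*6 = 144 channels, applies the 3x3 depthwise convolution at 144 channels, compresses back to 24 channels with a linear 1x1 convolution, and adds the input back through the residual connection:

block = InvertedResidual(inp=24, oup=24, stride=1, expand_ratio=6)
x = Tensor(np.ones((1, 24, 56, 56), np.float32))
print(block(x).shape)   # (1, 24, 56, 56): spatial size and channel count preserved, expansion happens internally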

Dataset Preparation

from download import download
# Download the data_en dataset
url = "https://ascend-professional-construction-dataset.obs.cn-north-4.myhuaweicloud.com:443/MindStudio-pc/data_en.zip" 
path = download(url, "./", kind="zip", replace=True)
# Parameter settings
garbage_classes = {
    '干垃圾': ['贝壳', '打火机', '旧镜子', '扫把', '陶瓷碗', '牙刷', '一次性筷子', '脏污衣服'],
    '可回收物': ['报纸', '玻璃制品', '篮球', '塑料瓶', '硬纸板', '玻璃瓶', '金属制品', '帽子', '易拉罐', '纸张'],
    '湿垃圾': ['菜叶', '橙皮', '蛋壳', '香蕉皮'],
    '有害垃圾': ['电池', '药片胶囊', '荧光灯', '油漆桶']
}

class_cn = ['贝壳', '打火机', '旧镜子', '扫把', '陶瓷碗', '牙刷', '一次性筷子', '脏污衣服',
            '报纸', '玻璃制品', '篮球', '塑料瓶', '硬纸板', '玻璃瓶', '金属制品', '帽子', '易拉罐', '纸张',
            '菜叶', '橙皮', '蛋壳', '香蕉皮',
            '电池', '药片胶囊', '荧光灯', '油漆桶']
class_en = ['Seashell', 'Lighter','Old Mirror', 'Broom','Ceramic Bowl', 'Toothbrush','Disposable Chopsticks','Dirty Cloth',
            'Newspaper', 'Glassware', 'Basketball', 'Plastic Bottle', 'Cardboard','Glass Bottle', 'Metalware', 'Hats', 'Cans', 'Paper',
            'Vegetable Leaf','Orange Peel', 'Eggshell','Banana Peel',
            'Battery', 'Tablet capsules','Fluorescent lamp', 'Paint bucket']

index_en = {'Seashell': 0, 'Lighter': 1, 'Old Mirror': 2, 'Broom': 3, 'Ceramic Bowl': 4, 'Toothbrush': 5, 'Disposable Chopsticks': 6, 'Dirty Cloth': 7,
            'Newspaper': 8, 'Glassware': 9, 'Basketball': 10, 'Plastic Bottle': 11, 'Cardboard': 12, 'Glass Bottle': 13, 'Metalware': 14, 'Hats': 15, 'Cans': 16, 'Paper': 17,
            'Vegetable Leaf': 18, 'Orange Peel': 19, 'Eggshell': 20, 'Banana Peel': 21,
            'Battery': 22, 'Tablet capsules': 23, 'Fluorescent lamp': 24, 'Paint bucket': 25}

from easydict import EasyDict

# Training hyperparameters
config = EasyDict({
    "num_classes": 26,
    "image_height": 224,
    "image_width": 224,
    #"data_split": [0.9, 0.1],
    "backbone_out_channels":1280,
    "batch_size": 16,
    "eval_batch_size": 8,
    "epochs": 10,
    "lr_max": 0.05,
    "momentum": 0.9,
    "weight_decay": 1e-4,
    "save_ckpt_epochs": 1,
    "dataset_path": "./data_en",
    "class_index": index_en,
    "pretrained_ckpt": "./mobilenetV2-200_1067.ckpt" # mobilenetV2-200_1067.ckpt 
})

Data Loading

import math
import numpy as np
import os
import random
import time

from matplotlib import pyplot as plt
from easydict import EasyDict
from PIL import Image
import mindspore.nn as nn
from mindspore import ops as P
from mindspore.ops import add
from mindspore import Tensor
import mindspore.common.dtype as mstype
import mindspore.dataset as de
import mindspore.dataset.vision as C
import mindspore.dataset.transforms as C2
import mindspore as ms
from mindspore import set_context, nn, Tensor, load_checkpoint, save_checkpoint, export
from mindspore.train import Model
from mindspore.train import Callback, LossMonitor, ModelCheckpoint, CheckpointConfig

os.environ['GLOG_v'] = '3' # Log level: 3(ERROR), 2(WARNING), 1(INFO), 0(DEBUG).
os.environ['GLOG_logtostderr'] = '0' # 0: write logs to file, 1: write logs to stderr
os.environ['GLOG_log_dir'] = '../../log' # log directory
os.environ['GLOG_stderrthreshold'] = '2' # threshold for also printing to stderr: 3(ERROR), 2(WARNING), 1(INFO), 0(DEBUG).
set_context(mode=ms.GRAPH_MODE, device_target="CPU", device_id=0) # run in graph mode; device_target is CPU here (use "Ascend" on Ascend hardware)

Data Processing

Normalization and conversion of the image channel layout

def create_dataset(dataset_path, config, training=True, buffer_size=1000):
    """
    create a train or eval dataset

    Args:
        dataset_path(string): the path of dataset.
        config(struct): the config of train and eval in different platform.
    Returns:
        ds: processed dataset ready for training or evaluation
    """

    # Choose the data path: the 'train' or 'test' subdirectory, depending on the training flag
    data_path = os.path.join(dataset_path, 'train' if training else 'test')

    # Create an ImageFolderDataset to load the data; class_indexing maps class names to label indices
    ds = de.ImageFolderDataset(data_path, num_parallel_workers=4, class_indexing=config.class_index)

    # Target image height and width from the config
    resize_height = config.image_height
    resize_width = config.image_width

    # Image normalization with the ImageNet mean and standard deviation
    normalize_op = C.Normalize(mean=[0.485*255, 0.456*255, 0.406*255], std=[0.229*255, 0.224*255, 0.225*255])

    # Convert images from HWC layout to CHW layout
    change_swap_op = C.HWC2CHW()

    # Cast labels to int32
    type_cast_op = C2.TypeCast(mstype.int32)

    if training:
        # Training-set augmentation: random crop, decode and resize
        crop_decode_resize = C.RandomCropDecodeResize(resize_height, scale=(0.08, 1.0), ratio=(0.75, 1.333))

        # Random horizontal flip with probability 0.5
        horizontal_flip_op = C.RandomHorizontalFlip(prob=0.5)

        # Random adjustment of brightness, contrast and saturation
        color_adjust = C.RandomColorAdjust(brightness=0.4, contrast=0.4, saturation=0.4)

        # Full list of training transforms
        train_trans = [crop_decode_resize, horizontal_flip_op, color_adjust, normalize_op, change_swap_op]

        # Apply the transforms to the image column; num_parallel_workers sets the number of parallel workers
        train_ds = ds.map(input_columns="image", operations=train_trans, num_parallel_workers=4)

        # Apply the type cast to the label column
        train_ds = train_ds.map(input_columns="label", operations=type_cast_op, num_parallel_workers=4)

        # Shuffle the dataset with the given buffer size
        train_ds = train_ds.shuffle(buffer_size=buffer_size)

        # Batch the data; drop_remainder=True drops the last incomplete batch
        ds = train_ds.batch(config.batch_size, drop_remainder=True)
    else:
        # Evaluation-set preprocessing: decode, resize, center crop
        decode_op = C.Decode()
        resize_op = C.Resize((int(resize_width / 0.875), int(resize_width / 0.875)))
        center_crop = C.CenterCrop(resize_width)

        # Full list of evaluation transforms
        eval_trans = [decode_op, resize_op, center_crop, normalize_op, change_swap_op]

        # Apply the transforms to the image column
        eval_ds = ds.map(input_columns="image", operations=eval_trans, num_parallel_workers=4)

        # Apply the type cast to the label column
        eval_ds = eval_ds.map(input_columns="label", operations=type_cast_op, num_parallel_workers=4)

        # Batch the data with eval_batch_size; drop_remainder=True drops the last incomplete batch
        ds = eval_ds.batch(config.eval_batch_size, drop_remainder=True)

    return ds
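For reference, a quick sanity check of this pipeline (a usage sketch only; the training section below builds its datasets in the same way):

train_ds = create_dataset(dataset_path=config.dataset_path, config=config)
eval_ds = create_dataset(dataset_path=config.dataset_path, config=config, training=False)
for images, labels in train_ds.create_tuple_iterator():
    print(images.shape, labels.shape)  # expected: (16, 3, 224, 224) (16,)
    break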

Model Construction

# Objects exported by this module
__all__ = ['MobileNetV2', 'MobileNetV2Backbone', 'MobileNetV2Head', 'mobilenet_v2']
# Round a channel count so that it is divisible by the given divisor
def _make_divisible(v, divisor, min_value=None):
    if min_value is None:
        min_value = divisor
    new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
    if new_v < 0.9 * v:
        new_v += divisor
    return new_v
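# A couple of worked examples (illustrative values, not from the original notes):
#   _make_divisible(32 * 1.0, 8) -> 32   (already a multiple of 8)
#   _make_divisible(24 * 1.4, 8) -> 32   (33.6 is rounded to the nearest multiple of 8)
# The final check guarantees that rounding never drops the value more than 10% below the original.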
# Global average pooling layer
class GlobalAvgPooling(nn.Cell):
    

    def __init__(self):
        super(GlobalAvgPooling, self).__init__()

    def construct(self, x):
        x = P.mean(x, (2, 3))
        return x
# Convolution + batch normalization + ReLU6 layer
class ConvBNReLU(nn.Cell):
    def __init__(self, in_planes, out_planes, kernel_size=3, stride=1, groups=1):
        super(ConvBNReLU, self).__init__()
        padding = (kernel_size - 1) // 2
        in_channels = in_planes
        out_channels = out_planes
        if groups == 1:
            conv = nn.Conv2d(in_channels, out_channels, kernel_size, stride, pad_mode='pad', padding=padding)
        else:
            out_channels = in_planes
            conv = nn.Conv2d(in_channels, out_channels, kernel_size, stride, pad_mode='pad',
                             padding=padding, group=in_channels)
        # Convolution, batch normalization and ReLU6 layers
        layers = [conv, nn.BatchNorm2d(out_planes), nn.ReLU6()]
        self.features = nn.SequentialCell(layers)

    def construct(self, x):
        output = self.features(x)
        return output
# Inverted residual block
class InvertedResidual(nn.Cell):

    def __init__(self, inp, oup, stride, expand_ratio):
        super(InvertedResidual, self).__init__()
        assert stride in [1, 2]

        hidden_dim = int(round(inp * expand_ratio))
        self.use_res_connect = stride == 1 and inp == oup

        layers = []
        if expand_ratio != 1:
            layers.append(ConvBNReLU(inp, hidden_dim, kernel_size=1))
        layers.extend([
            ConvBNReLU(hidden_dim, hidden_dim,
                       stride=stride, groups=hidden_dim),
            nn.Conv2d(hidden_dim, oup, kernel_size=1,
                      stride=1, has_bias=False),
            nn.BatchNorm2d(oup),
        ])
        self.conv = nn.SequentialCell(layers)
        self.cast = P.Cast()

    def construct(self, x):
        identity = x
        x = self.conv(x)
        if self.use_res_connect:
            return P.add(identity, x)
        return x
# MobileNetV2 backbone network
class MobileNetV2Backbone(nn.Cell):

    def __init__(self, width_mult=1., inverted_residual_setting=None, round_nearest=8,
                 input_channel=32, last_channel=1280):
        super(MobileNetV2Backbone, self).__init__()
        block = InvertedResidual
        # setting of inverted residual blocks
        self.cfgs = inverted_residual_setting
        if inverted_residual_setting is None:
            self.cfgs = [
                # t, c, n, s
                [1, 16, 1, 1],
                [6, 24, 2, 2],
                [6, 32, 3, 2],
                [6, 64, 4, 2],
                [6, 96, 3, 1],
                [6, 160, 3, 2],
                [6, 320, 1, 1],
            ]

        # building first layer
        input_channel = _make_divisible(input_channel * width_mult, round_nearest)
        self.out_channels = _make_divisible(last_channel * max(1.0, width_mult), round_nearest)
        features = [ConvBNReLU(3, input_channel, stride=2)]
        # building inverted residual blocks
        for t, c, n, s in self.cfgs:
            output_channel = _make_divisible(c * width_mult, round_nearest)
            # Build the corresponding number of inverted residual blocks
            for i in range(n):
                stride = s if i == 0 else 1
                # Build an inverted residual block and append it to the features list
                features.append(block(input_channel, output_channel, stride, expand_ratio=t))
                # Update the input channel count so the next block receives the right number of channels
                input_channel = output_channel
        features.append(ConvBNReLU(input_channel, self.out_channels, kernel_size=1))
        # Assemble the features list into a sequential network, self.features
        self.features = nn.SequentialCell(features)
        self._initialize_weights()

    def construct(self, x):
        x = self.features(x)
        return x

    def _initialize_weights(self):
        """
        Initialize weights.

        Args:

        Returns:
            None.

        Examples:
            >>> _initialize_weights()
        """
        self.init_parameters_data()
        for _, m in self.cells_and_names():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.set_data(Tensor(np.random.normal(0, np.sqrt(2. / n),
                                                          m.weight.data.shape).astype("float32")))
                if m.bias is not None:
                    m.bias.set_data(
                        Tensor(np.zeros(m.bias.data.shape, dtype="float32")))
            elif isinstance(m, nn.BatchNorm2d):
                m.gamma.set_data(
                    Tensor(np.ones(m.gamma.data.shape, dtype="float32")))
                m.beta.set_data(
                    Tensor(np.zeros(m.beta.data.shape, dtype="float32")))

    @property
    def get_features(self):
        return self.features
# MobileNetV2 head
class MobileNetV2Head(nn.Cell):
    
    def __init__(self, input_channel=1280, num_classes=1000, has_dropout=False, activation="None"):
        super(MobileNetV2Head, self).__init__()
        # mobilenet head
        head = ([GlobalAvgPooling(), nn.Dense(input_channel, num_classes, has_bias=True)] if not has_dropout else
                [GlobalAvgPooling(), nn.Dropout(0.2), nn.Dense(input_channel, num_classes, has_bias=True)])
        self.head = nn.SequentialCell(head)
        self.need_activation = True
        if activation == "Sigmoid":
            self.activation = nn.Sigmoid()
        elif activation == "Softmax":
            self.activation = nn.Softmax()
        else:
            self.need_activation = False
        self._initialize_weights()

    def construct(self, x):
        x = self.head(x)
        if self.need_activation:
            x = self.activation(x)
        return x
    # Initialize the model's weight parameters
    def _initialize_weights(self):
        """
        Initialize weights.

        Args:

        Returns:
            None.

        Examples:
            >>> _initialize_weights()
        """
        self.init_parameters_data()
        for _, m in self.cells_and_names():
            if isinstance(m, nn.Dense):
                m.weight.set_data(Tensor(np.random.normal(
                    0, 0.01, m.weight.data.shape).astype("float32")))
                if m.bias is not None:
                    m.bias.set_data(
                        Tensor(np.zeros(m.bias.data.shape, dtype="float32")))
    @property
    def get_head(self):
        return self.head

class MobileNetV2(nn.Cell):
   
    def __init__(self, num_classes=1000, width_mult=1., has_dropout=False, inverted_residual_setting=None, \
        round_nearest=8, input_channel=32, last_channel=1280):
        super(MobileNetV2, self).__init__()
        # Build the backbone once, keep its feature extractor, and read its output channel count
        backbone_net = MobileNetV2Backbone(width_mult=width_mult, \
            inverted_residual_setting=inverted_residual_setting, \
            round_nearest=round_nearest, input_channel=input_channel, last_channel=last_channel)
        self.backbone = backbone_net.get_features
        self.head = MobileNetV2Head(input_channel=backbone_net.out_channels, num_classes=num_classes, \
            has_dropout=has_dropout).get_head

    def construct(self, x):
        x = self.backbone(x)
        x = self.head(x)
        return x
# Component-based design (optional)
class MobileNetV2Combine(nn.Cell):

    def __init__(self, backbone, head):
        super(MobileNetV2Combine, self).__init__(auto_prefix=False)
        self.backbone = backbone
        self.head = head

    def construct(self, x):
        x = self.backbone(x)
        x = self.head(x)
        return x

def mobilenet_v2(backbone, head):
    return MobileNetV2Combine(backbone, head)

How the internal methods of a network class execute, using MobileNetV2Head as the example:

__init__(self, input_channel=1280, num_classes=1000, has_dropout=False, activation="None"):

  • The class initializer, executed first when the class is instantiated. It sets up the attributes, builds the head structure, and configures the components according to the arguments, including whether to use Dropout and which activation function to apply. Finally, it calls _initialize_weights() to initialize the model's weights and parameters.

_initialize_weights(self):

  • A private method that initializes the model's weights. It is called from __init__ so that the model starts training from sensible initial weights.

construct(self, x):

  • The key method of nn.Cell, defining the model's forward pass. After instantiation, calling the model (i.e. model(x), which triggers __call__) executes this method. In this example, construct passes the input x through the head structure self.head and applies the activation function only if self.need_activation is set.

get_head(self):

  • A property that exposes the head structure self.head. Accessing get_head on an instance returns the head's sequential cell.
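A minimal usage sketch of this call sequence (the input shape is an illustrative stand-in for a backbone output, not taken from the notes):

head = MobileNetV2Head(input_channel=1280, num_classes=26)  # __init__ runs and calls _initialize_weights()
features = Tensor(np.ones((1, 1280, 7, 7), np.float32))     # fake backbone output
logits = head(features)                                     # head(features) -> __call__ -> construct(features)
print(logits.shape)      # (1, 26)
print(head.get_head)     # the get_head property returns the internal SequentialCell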

Training

# Create the training and evaluation datasets
train_dataset = create_dataset(dataset_path=config.dataset_path, config=config)
eval_dataset = create_dataset(dataset_path=config.dataset_path, config=config, training=False)

# Get the number of batches in the training dataset
step_size = train_dataset.get_dataset_size()

# Initialize the MobileNetV2 backbone (the feature-extraction part)
backbone = MobileNetV2Backbone() #last_channel=config.backbone_out_channels

# Freeze the backbone parameters (they will not be trained); comment out these two lines to leave them unfrozen
for param in backbone.get_parameters():
    param.requires_grad = False

# Load the pretrained parameters into the backbone
load_checkpoint(config.pretrained_ckpt, backbone)

# Initialize the MobileNetV2 head (the classification part)
head = MobileNetV2Head(input_channel=backbone.out_channels, num_classes=config.num_classes)

# Combine the backbone and the head into the full MobileNetV2 model
network = mobilenet_v2(backbone, head)
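# NOTE: LOSS_SCALE and cosine_decay are used below but are not defined anywhere in these
# notes. The definitions here are an assumed, minimal reconstruction (a fixed loss scale
# and a standard warmup + cosine-decay schedule); adjust them to your own setup.
from mindspore import FixedLossScaleManager  # import path may vary; also available as mindspore.amp.FixedLossScaleManager

LOSS_SCALE = 1024  # assumed value

def cosine_decay(total_steps, lr_init=0.0, lr_end=0.0, lr_max=0.1, warmup_steps=0):
    """Return one learning rate per step: linear warmup followed by cosine decay (assumed implementation)."""
    lrs = []
    for i in range(total_steps):
        if i < warmup_steps:
            lr = lr_init + (lr_max - lr_init) * i / warmup_steps
        else:
            progress = (i - warmup_steps) / max(1, total_steps - warmup_steps)
            lr = lr_end + (lr_max - lr_end) * (1 + math.cos(math.pi * progress)) / 2
        lrs.append(lr)
    return np.array(lrs).astype(np.float32)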

# Define the loss function, loss scale, learning-rate schedule and optimizer
loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
loss_scale = FixedLossScaleManager(LOSS_SCALE, drop_overflow_update=False)
lrs = cosine_decay(config.epochs * step_size, lr_max=config.lr_max)
opt = nn.Momentum(network.trainable_params(), lrs, config.momentum, config.weight_decay, loss_scale=LOSS_SCALE)

# Define the train_loop function used for training
def train_loop(model, dataset, loss_fn, optimizer):
    # Forward computation function
    def forward_fn(data, label):
        logits = model(data)
        loss = loss_fn(logits, label)
        return loss
    grad_fn = ms.value_and_grad(forward_fn, None, optimizer.parameters)

    # Define the one-step training function
    def train_step(data, label):
        loss, grads = grad_fn(data, label)
        optimizer(grads)
        return loss

    size = dataset.get_dataset_size()
    model.set_train()
    for batch, (data, label) in enumerate(dataset.create_tuple_iterator()):
        loss = train_step(data, label)

        if batch % 10 == 0:
            loss, current = loss.asnumpy(), batch
            print(f"loss: {loss:>7f}  [{current:>3d}/{size:>3d}]")

# Define the test_loop function used for evaluation
def test_loop(model, dataset, loss_fn):
    num_batches = dataset.get_dataset_size()
    model.set_train(False)
    # Initialize the running statistics
    total, test_loss, correct = 0, 0, 0
    for data, label in dataset.create_tuple_iterator():
        pred = model(data)
        total += len(data)
        test_loss += loss_fn(pred, label).asnumpy()
        correct += (pred.argmax(1) == label).asnumpy().sum()
    # Compute the average loss and the accuracy
    test_loss /= num_batches
    correct /= total
    print(f"Test: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

# Start the training process
print("============== Starting Training ==============")
epoch_begin_time = time.time()
epochs = 2
for t in range(epochs):
    begin_time = time.time()
    print(f"Epoch {t+1}\n-------------------------------")
    train_loop(network, train_dataset, loss, opt)
    ms.save_checkpoint(network, "save_mobilenetV2_model.ckpt")
    end_time = time.time()
    times = end_time - begin_time
    print(f"per epoch time: {times}s")
    test_loop(network, eval_dataset, loss)
epoch_end_time = time.time()
times = epoch_end_time - epoch_begin_time
print(f"total time:  {times}s")
print("============== Training Success ==============")

  1. The for t in range(epochs) loop and train_loop are separate:
    • epochs controls how many training epochs are run
    • train_loop executes the concrete training steps within each epoch

Loading and Prediction

# Path of the saved model checkpoint
CKPT = "save_mobilenetV2_model.ckpt"

def image_process(image):
    """
    Process a single image: normalize it and convert it to a Tensor.

    Args:
        image: numpy array of shape (H, W, C), i.e. height, width, channels.

    Returns:
        img_tensor: Tensor of shape (1, C, H, W).
    """
    # Mean and standard deviation used for normalization
    mean = [0.485 * 255, 0.456 * 255, 0.406 * 255]
    std = [0.229 * 255, 0.224 * 255, 0.225 * 255]

    # Convert the image to a numpy array and normalize it
    image = (np.array(image) - mean) / std

    # Reorder the channels from (H, W, C) to (C, H, W)
    image = image.transpose((2, 0, 1))

    # Convert to a Tensor and add a batch dimension to match the network input
    img_tensor = Tensor(np.array([image], np.float32))
    return img_tensor

def infer_one(network, image_path):
    """
    Run inference on a single image and print the prediction.

    Args:
        network: the trained network model.
        image_path: str, path to the image.
    """
    # Open the image and resize it
    image = Image.open(image_path).resize((config.image_height, config.image_width))

    # Preprocess the image and run it through the network
    logits = network(image_process(image))

    # Take the class with the highest logit as the prediction
    pred = np.argmax(logits.asnumpy(), axis=1)[0]

    # Print the image path and the predicted class
    print(image_path, class_en[pred])

def infer():
    """
    Run inference on several images and print the predictions.
    """
    # Initialize the MobileNetV2 backbone and head
    backbone = MobileNetV2Backbone(last_channel=config.backbone_out_channels)
    head = MobileNetV2Head(input_channel=backbone.out_channels, num_classes=config.num_classes)

    # Combine the backbone and the head into the full MobileNetV2 model
    network = mobilenet_v2(backbone, head)

    # Load the trained parameters from the checkpoint
    load_checkpoint(CKPT, network)

    # Run inference on the images under this path
    for i in range(91, 100):
        infer_one(network, f'data_en/test/Cardboard/000{i}.jpg')

# Run the inference function
infer()

Saving and Exporting the Model

backbone = MobileNetV2Backbone(last_channel=config.backbone_out_channels)
head = MobileNetV2Head(input_channel=backbone.out_channels, num_classes=config.num_classes)
network = mobilenet_v2(backbone, head)
load_checkpoint(CKPT, network)

input = np.random.uniform(0.0, 1.0, size=[1, 3, 224, 224]).astype(np.float32)
# export(network, Tensor(input), file_name='mobilenetv2.air', file_format='AIR')
# export(network, Tensor(input), file_name='mobilenetv2.pb', file_format='GEIR')
export(network, Tensor(input), file_name='mobilenetv2.onnx', file_format='ONNX')

  1. The AIR format targets the Huawei Ascend platform; it offers efficient runtime performance but is largely confined to the Ascend ecosystem.
  2. The GEIR format is used within the MindSpore framework; it supports efficient training and inference but has limited cross-framework compatibility.
  3. The ONNX format is a general intermediate representation with good cross-platform compatibility, well suited to exchanging models between different deep learning frameworks.
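To illustrate the cross-platform point, a minimal sketch of loading the exported ONNX model with onnxruntime (onnxruntime is an extra dependency, not used anywhere else in these notes):

import numpy as np
import onnxruntime as ort

session = ort.InferenceSession('mobilenetv2.onnx')
input_name = session.get_inputs()[0].name
dummy = np.random.uniform(0.0, 1.0, size=[1, 3, 224, 224]).astype(np.float32)
outputs = session.run(None, {input_name: dummy})
print(outputs[0].shape)  # expected (1, 26): one logit per garbage class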