使用Pytorch和卷积神经网络进行简单的数字识别(MNIST)
说明:
- 首次发表日期:2024-10-30
- 参考:
- https://github.com/bentrevett/pytorch-image-classification/blob/master/1_mlp.ipynb
- https://www.sciencedirect.com/science/article/pii/S2214579620300502?via%3Dihub#se0070
- https://pytorch.org/tutorials/beginner/basics/buildmodel_tutorial.html
- https://blog.csdn.net/zylooooooooong/article/details/122805833
- https://www.markovml.com/blog/normalization-in-machine-learning
准备CONDA环境
# 在润建WORKBENCH上使用
conda create -n py310torch python=3.10
conda activate py310torch
conda install pytorch==2.5.0 torchvision==0.20.0 torchaudio==2.5.0 pytorch-cuda=12.1 -c pytorch -c nvidia
pip install matplotlib tqdm ipywidgets
下载MNIST数据集
Kaggle上有MNIST数据集: https://www.kaggle.com/datasets/hojjatk/mnist-dataset
apt-get update
apt-get install curl
mkdir data
cd data
curl -L -o mnist.zip https://www.kaggle.com/api/v1/datasets/download/hojjatk/mnist-dataset
unzip mnist.zip
将MNIST数据集保存为图片和标注
借鉴 https://www.kaggle.com/code/hojjatk/read-mnist-dataset 中的读取MNIST数据的代码,并添加保存为图片和标注(按PaddleOCR格式)
import numpy as np # linear algebra
import struct
from array import array
from os.path import join
import os
from PIL import Image
#
# MNIST Data Loader Class
#
class MnistDataloader(object):
    """Loader for the raw MNIST IDX files (train/test images and labels).

    Each images file starts with a 16-byte big-endian header
    (magic=2051, count, rows, cols); each labels file starts with an
    8-byte header (magic=2049, count).
    """

    def __init__(self, training_images_filepath, training_labels_filepath,
                 test_images_filepath, test_labels_filepath):
        self.training_images_filepath = training_images_filepath
        self.training_labels_filepath = training_labels_filepath
        self.test_images_filepath = test_images_filepath
        self.test_labels_filepath = test_labels_filepath

    def read_images_labels(self, images_filepath, labels_filepath):
        """Parse one (images, labels) IDX file pair.

        Returns (images, labels): ``images`` is a list of uint8 arrays of
        shape (rows, cols); ``labels`` is an array('B') of digit labels.
        Raises ValueError when a file's magic number does not match.
        """
        with open(labels_filepath, 'rb') as file:
            magic, size = struct.unpack(">II", file.read(8))
            if magic != 2049:
                raise ValueError('Magic number mismatch, expected 2049, got {}'.format(magic))
            labels = array("B", file.read())

        with open(images_filepath, 'rb') as file:
            magic, size, rows, cols = struct.unpack(">IIII", file.read(16))
            if magic != 2051:
                raise ValueError('Magic number mismatch, expected 2051, got {}'.format(magic))
            image_data = array("B", file.read())

        # Build each image directly in one pass.  The original code
        # pre-allocated placeholder zero lists in a first loop and
        # hard-coded reshape(28, 28); use the rows/cols values read from
        # the header so non-28x28 IDX files also work.
        n_pixels = rows * cols
        images = [
            np.array(image_data[i * n_pixels:(i + 1) * n_pixels],
                     dtype=np.uint8).reshape(rows, cols)
            for i in range(size)
        ]
        return images, labels

    def load_data(self):
        """Return ((x_train, y_train), (x_test, y_test))."""
        x_train, y_train = self.read_images_labels(self.training_images_filepath,
                                                   self.training_labels_filepath)
        x_test, y_test = self.read_images_labels(self.test_images_filepath,
                                                 self.test_labels_filepath)
        return (x_train, y_train), (x_test, y_test)
input_path = './data'
training_images_filepath = join(input_path, 'train-images-idx3-ubyte/train-images-idx3-ubyte')
training_labels_filepath = join(input_path, 'train-labels-idx1-ubyte/train-labels-idx1-ubyte')
test_images_filepath = join(input_path, 't10k-images-idx3-ubyte/t10k-images-idx3-ubyte')
test_labels_filepath = join(input_path, 't10k-labels-idx1-ubyte/t10k-labels-idx1-ubyte')

mnist_dataloader = MnistDataloader(training_images_filepath, training_labels_filepath, test_images_filepath, test_labels_filepath)
(x_train, y_train), (x_test, y_test) = mnist_dataloader.load_data()


def _save_split(images, labels, split_name):
    """Save one split as PNG files plus a PaddleOCR-style label file.

    Each label-file line is ``mnist_<split>/<fname>\t<label>``.
    """
    dest_dir = f"./dataset/mnist_{split_name}"
    os.makedirs(dest_dir, exist_ok=True)
    label_lines = []
    for i, (image, label) in enumerate(zip(images, labels)):
        fname = f"{split_name}_{i}_{label}.png"
        Image.fromarray(np.stack(image)).save(os.path.join(dest_dir, fname))
        label_lines.append(f"mnist_{split_name}/{fname}\t{label}")
    with open(f"./dataset/mnist_{split_name}_rec.txt", "w") as f:
        f.write("\n".join(label_lines))


_save_split(x_train, y_train, "train")
# BUG FIX: the original loop saved test images with a "train_" filename
# prefix (copy-paste error); the helper derives the prefix from the split.
_save_split(x_test, y_test, "test")
注意:需要保存为PNG格式,JPG格式保存时会压缩导致和原np.array不一致
准备数据DataLoader
自定义Dataset子类
按照官方教程 https://pytorch.org/tutorials/beginner/basics/data_tutorial.html#creating-a-custom-dataset-for-your-files , 一个自定义的Dataset子类需要实现 __init__, __len__ 和 __getitem__ 这三个方法。
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as data
from torch.utils.data import Dataset
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from torchvision.io import decode_image
import numpy as np
from PIL import Image
from tqdm.notebook import trange, tqdm
import time
class CustomRecDataset(Dataset):
    """Dataset for PaddleOCR-style recognition label files.

    Each line of a label file is ``<relative image path>\t<label>``;
    images are resolved against ``data_dir`` and returned as uint8
    numpy arrays (before ``transform`` is applied).
    """

    def __init__(self, data_dir, label_file_or_file_list, transform=None, target_transform=None):
        assert os.path.exists(data_dir), f"data_dir {data_dir} does not exists"
        self.data_dir = data_dir
        self.transform = transform
        self.target_transform = target_transform
        self.label_list = []
        # BUG FIX: accept either a single path or a list of paths; the
        # original ignored a list argument, silently yielding an empty dataset.
        if isinstance(label_file_or_file_list, str):
            label_file_list = [label_file_or_file_list]
        else:
            label_file_list = list(label_file_or_file_list)
        for label_file in label_file_list:
            with open(label_file, "r") as f:
                # Skip blank lines so a trailing newline cannot create a bogus entry.
                self.label_list.extend(line for line in f.readlines() if line.strip())

    def __len__(self):
        return len(self.label_list)

    def __getitem__(self, idx):
        img_rel_path, label = self.label_list[idx].split("\t")
        label = int(label.strip())
        img_path = os.path.join(self.data_dir, img_rel_path)
        # BUG FIX: PIL Image objects have no .numpy() method; the original
        # raised AttributeError here.  Convert via np.array instead.
        image = np.array(Image.open(img_path))
        if self.transform:
            image = self.transform(image)
        if self.target_transform:
            label = self.target_transform(label)
        return image, label
Normalization
通常,我们需要将图片数据进行Normalization处理。
为何要做Normalization:
- 每个feature具有不同的scale,即有不同的取值范围(比如房子的价格和面积),Normalization 可以移除scale的影响,使得不同的feature对模型的贡献是平等的,防止偏见。
- Normalization可以防止因数值过大造成的梯度爆炸或者梯度消失。
关于Normalization,了解更多请参考 https://www.markovml.com/blog/normalization-in-machine-learning
进行Normalization处理需要有MEAN和STD值,CV领域经常使用来自ImageNet的MEAN和STD值。
博客 https://blog.csdn.net/zylooooooooong/article/details/122805833 指出是否使用ImageNet的均值和标准差取决于你的数据:
- 假设你的数据是“自然场景”的普通照片(人,建筑,动物,不同的照明/角度/背景等等),并且假设你的数据集和 ImageNet 存在类似的偏差(在类别平衡方面),那么使用 ImageNet 的场景统计数据进行规范化就可以了。
- 如果照片是“特殊的”(颜色过滤,对比度调整,不寻常的光线,等等)或“非自然的主题”(医学图像,卫星地图,手绘等),我建议在模型训练之前正确地规范化你的数据集(计算新的平均值和标准差)。
本文不使用来自ImageNet的预训练权重,所以我们需要计算MEAN和STD值:
# Compute the normalization statistics from the TRAINING split only
# (no transform, so each item is a raw uint8 image array).
ds = CustomRecDataset('dataset', 'dataset/mnist_train_rec.txt')
pixel_stack = np.stack([img for img, _ in ds])
mean = pixel_stack.mean() / 255.0
std = pixel_stack.std() / 255.0
print(mean, std)
0.1306604762738429 0.30810780385646264
注意:计算MEAN和STD值的时候,只使用训练集的数据,避免泄露来自测试集的数据。
Transforms
# Train-time pipeline: light augmentation (rotation + padded random crop),
# then convert to a [0, 1] tensor and normalize with the MNIST statistics.
train_transforms = transforms.Compose([
    transforms.RandomRotation(5, fill=(0,)),
    transforms.RandomCrop(28, padding=2),
    transforms.ToTensor(),
    transforms.Normalize(mean=[mean], std=[std]),
])

# Test-time pipeline: no augmentation, only tensor conversion + normalization.
test_transforms = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[mean], std=[std]),
])
其中:
- RandomRotation 随机旋转图片 (-x, +x) 度
- RandomCrop 应用 padding 之后,随机裁剪
- ToTensor 转换为 Tensor 并 scale 到 [0.0, 1.0] 之间
准备训练集,验证集,测试集
# Full MNIST splits, each wired to its own transform pipeline.
train_data = CustomRecDataset('dataset', 'dataset/mnist_train_rec.txt',
                              transform=train_transforms)
test_data = CustomRecDataset('dataset', 'dataset/mnist_test_rec.txt',
                             transform=test_transforms)
MNIST数据集没有直接提供验证集,我们从训练集里面取出10%来当做验证集。
注意:我们可以从训练集取部分数据当验证集,但是不能从测试集取。
import copy

generator1 = torch.Generator().manual_seed(42)
train_data, valid_data = data.random_split(train_data, [0.9, 0.1], generator=generator1)
# BUG FIX: random_split returns Subset objects, and a Subset ignores a
# ".transform" attribute — the original assignment was a silent no-op, so
# the validation set kept the training-time augmentation.  Deep-copy the
# subset (so the shared underlying dataset is duplicated) and override the
# copy's transform with the deterministic test-time pipeline.
valid_data = copy.deepcopy(valid_data)
valid_data.dataset.transform = test_transforms
DataLoader
因为随机梯度下降算法需要,我们将训练集的DataLoader的shuffle设为True。
BATCH_SIZE = 64

# Only the training loader shuffles (required for stochastic gradient descent);
# validation and test order does not matter.
train_iterator = data.DataLoader(train_data, shuffle=True, batch_size=BATCH_SIZE)
valid_iterator = data.DataLoader(valid_data, batch_size=BATCH_SIZE)
test_iterator = data.DataLoader(test_data, batch_size=BATCH_SIZE)
定义模型
定义一个模型,参考自 https://www.sciencedirect.com/science/article/pii/S2214579620300502?via%3Dihub#se0070
class DigitNet(nn.Module):
    """Small CNN classifier for 28x28 single-channel digit images.

    Spatial sizes: 28 -> conv3 -> 26 -> pool -> 13 -> conv5 -> 9 ->
    pool -> 4 -> conv3 -> 2, so the flattened feature size is 64*2*2 = 256.
    """

    def __init__(self, output_dim=10):
        super().__init__()
        layers = [
            nn.Conv2d(1, 16, 3, 1),
            nn.MaxPool2d(2, 2),
            nn.ReLU(inplace=True),
            nn.Conv2d(16, 32, 5, 1),
            nn.MaxPool2d(2, 2),
            nn.ReLU(inplace=True),
            nn.Conv2d(32, 64, 3, 1),
            nn.Flatten(),
            nn.Linear(256, 128),
            nn.Linear(128, output_dim),
        ]
        # Keep the attribute name "classifier" so saved state_dict keys
        # (e.g. tut1-model.pt) remain loadable.
        self.classifier = nn.Sequential(*layers)

    def forward(self, x):
        """Return raw (un-softmaxed) class logits of shape (N, output_dim)."""
        return self.classifier(x)
计算模型参数量:
def count_parameters(model):
    """Return the number of trainable parameters in *model*."""
    total = 0
    for p in model.parameters():
        if p.requires_grad:
            total += p.numel()
    return total
# Build the 10-class digit model and report its size.
model = DigitNet(output_dim=10)
print(f'The model has {count_parameters(model):,} trainable parameters')
0.1306604762738429 0.30810780385646264
The model has 65,674 trainable parameters
训练
# Adam with default hyperparameters; cross-entropy for 10-way classification.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss()

# Move both the model and the loss module to the selected device.
model = model.to(device)
criterion = criterion.to(device)
def calculate_accuracy(y_pred, y):
    """Return the fraction of rows in y_pred whose argmax matches y."""
    predicted = y_pred.argmax(dim=1, keepdim=True)
    n_correct = predicted.eq(y.view_as(predicted)).sum().float()
    return n_correct / y.shape[0]
def train(model, iterator, optimizer, criterion, device):
    """Run one training epoch; return (mean loss, mean accuracy) over batches."""
    model.train()
    total_loss, total_acc = 0.0, 0.0
    for (x, y) in tqdm(iterator, desc="Training", leave=False):
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad()
        y_pred = model(x)
        loss = criterion(y_pred, y)
        # Accuracy is computed on the same forward pass, before the update.
        acc = calculate_accuracy(y_pred, y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        total_acc += acc.item()
    n_batches = len(iterator)
    return total_loss / n_batches, total_acc / n_batches
def evaluate(model, iterator, criterion, device):
    """Evaluate without gradient tracking; return (mean loss, mean accuracy)."""
    model.eval()
    total_loss, total_acc = 0.0, 0.0
    with torch.no_grad():
        for (x, y) in tqdm(iterator, desc="Evaluating", leave=False):
            x, y = x.to(device), y.to(device)
            y_pred = model(x)
            total_loss += criterion(y_pred, y).item()
            total_acc += calculate_accuracy(y_pred, y).item()
    n_batches = len(iterator)
    return total_loss / n_batches, total_acc / n_batches
def epoch_time(start_time, end_time):
    """Split an elapsed interval into whole (minutes, seconds)."""
    return divmod(int(end_time - start_time), 60)
EPOCHS = 10
best_valid_loss = float('inf')

for epoch in trange(EPOCHS):
    t_start = time.monotonic()

    train_loss, train_acc = train(model, train_iterator, optimizer, criterion, device)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion, device)

    # Checkpoint whenever validation loss improves.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut1-model.pt')

    t_end = time.monotonic()
    epoch_mins, epoch_secs = epoch_time(t_start, t_end)

    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}%')
Epoch: 01 | Epoch Time: 0m 22s
Train Loss: 0.285 | Train Acc: 91.15%
Val. Loss: 0.134 | Val. Acc: 96.06%
Epoch: 02 | Epoch Time: 0m 21s
Train Loss: 0.112 | Train Acc: 96.62%
Val. Loss: 0.115 | Val. Acc: 96.63%
Epoch: 03 | Epoch Time: 0m 21s
Train Loss: 0.089 | Train Acc: 97.26%
Val. Loss: 0.085 | Val. Acc: 97.64%
Epoch: 04 | Epoch Time: 0m 21s
Train Loss: 0.074 | Train Acc: 97.71%
Val. Loss: 0.079 | Val. Acc: 97.59%
Epoch: 05 | Epoch Time: 0m 19s
Train Loss: 0.066 | Train Acc: 97.97%
Val. Loss: 0.105 | Val. Acc: 96.66%
Epoch: 06 | Epoch Time: 0m 21s
Train Loss: 0.061 | Train Acc: 98.06%
Val. Loss: 0.073 | Val. Acc: 97.73%
Epoch: 07 | Epoch Time: 0m 21s
Train Loss: 0.056 | Train Acc: 98.29%
Val. Loss: 0.066 | Val. Acc: 97.94%
Epoch: 08 | Epoch Time: 0m 21s
Train Loss: 0.054 | Train Acc: 98.32%
Val. Loss: 0.087 | Val. Acc: 97.23%
Epoch: 09 | Epoch Time: 0m 22s
Train Loss: 0.051 | Train Acc: 98.40%
Val. Loss: 0.065 | Val. Acc: 98.39%
Epoch: 10 | Epoch Time: 0m 22s
Train Loss: 0.049 | Train Acc: 98.51%
Val. Loss: 0.065 | Val. Acc: 98.15%