Using the AFQMC dataset as the corpus
Subclass Dataset to build a custom dataset
First, we subclass Dataset to build a custom dataset that organizes the samples and labels. AFQMC samples are stored as JSON, one object per line, so we read them line by line with the json library and use the line number as the index when building the dataset. Each sample is stored as a dictionary.
from torch.utils.data import Dataset
import json

class AFQMC(Dataset):
    def __init__(self, data_file):
        self.data = self.load_data(data_file)

    def load_data(self, data_file):
        Data = {}
        with open(data_file, 'rt', encoding='utf-8') as f:
            for idx, line in enumerate(f):
                sample = json.loads(line.strip())
                Data[idx] = sample
        return Data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

train_data = AFQMC('data/afqmc_public/train.json')
valid_data = AFQMC('data/afqmc_public/dev.json')
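As a quick sanity check (assuming the AFQMC files sit at the paths above), the map-style dataset supports len() and direct indexing:

print(len(train_data))
print(train_data[0])  # a dict with the AFQMC fields 'sentence1', 'sentence2', 'label'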
If the dataset is too large to fit in memory, you can instead subclass IterableDataset to build an iterable-style dataset:
from torch.utils.data import IterableDataset
import json

class IterableAFQMC(IterableDataset):
    def __init__(self, data_file):
        self.data_file = data_file

    def __iter__(self):
        with open(self.data_file, 'rt', encoding='utf-8') as f:
            for line in f:
                sample = json.loads(line.strip())
                yield sample

# Each time the generator function reaches the yield statement, it saves its current state
# (e.g. the file read position and local variables) and resumes execution from there on the
# next iteration request.

train_data = IterableAFQMC('data/afqmc_public/train.json')
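A quick way to peek at the stream (a sketch; an IterableDataset has no __len__ and no random indexing):

it = iter(train_data)
print(next(it))
print(next(it))

Note that the DataLoader built below uses shuffle=True, which requires random access, so it assumes the map-style AFQMC datasets defined earlier; the iterable version here is only an alternative for data that does not fit in memory.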
Load data in batches with DataLoader
For NLP tasks, this step encodes the text in each batch into the format expected by the pretrained model (including padding, truncation, and so on).
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer

checkpoint = 'bert-base-chinese'  # assumed checkpoint; the original text does not show this definition
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

batch_size = 4

def collate_fn(batch_samples):
    batch_sentence_1, batch_sentence_2 = [], []
    batch_label = []
    for sample in batch_samples:
        batch_sentence_1.append(sample['sentence1'])
        batch_sentence_2.append(sample['sentence2'])
        batch_label.append(int(sample['label']))
    # Encode all sentence pairs of the batch at once (with padding and truncation)
    X = tokenizer(
        batch_sentence_1,
        batch_sentence_2,
        padding=True,
        truncation=True,
        return_tensors="pt"
    )
    y = torch.tensor(batch_label)
    return X, y

train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
valid_dataloader = DataLoader(valid_data, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
We implement this with a hand-written batching function collate_fn for the DataLoader: we first load the tokenizer, then encode all sentence pairs in each batch and convert the labels to a tensor.
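To check the batching function, you can pull one batch from the dataloader (a sketch; the padded sequence length depends on the longest pair in the batch):

batch_X, batch_y = next(iter(train_dataloader))
print(batch_X['input_ids'].shape)       # (batch_size, padded sequence length)
print(batch_X['attention_mask'].shape)  # same shape as input_ids
print(batch_y)                          # tensor of integer labels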
Build the model
In most cases you need to write the model yourself.
A common approach is to inherit from a pretrained model in the Transformers library. For example, here we can inherit from the BERT model (the BertPreTrainedModel class) to build a classifier with exactly the same structure as the model above:
from torch import nn
from transformers import AutoConfig
from transformers import BertPreTrainedModel, BertModel

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Using {device} device')

class BertForPairwiseCLS(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.bert = BertModel(config, add_pooling_layer=False)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(768, 2)
        self.post_init()

    def forward(self, x):
        outputs = self.bert(**x)
        cls_vectors = outputs.last_hidden_state[:, 0, :]  # representation of the [CLS] token
        cls_vectors = self.dropout(cls_vectors)
        logits = self.classifier(cls_vectors)
        return logits

config = AutoConfig.from_pretrained(checkpoint)
model = BertForPairwiseCLS.from_pretrained(checkpoint, config=config).to(device)
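As a quick shape check (a sketch that reuses one batch pulled from the dataloader above):

batch_X, batch_y = next(iter(train_dataloader))
batch_X = batch_X.to(device)
print(model(batch_X).shape)  # expected: (batch_size, 2) logits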
Optimize the model parameters
When training the model, each epoch is split into a training loop and a validation/test loop. The training loop computes the loss and optimizes the model parameters; the validation/test loop evaluates the model's performance.
from tqdm.auto import tqdm

def train_loop(dataloader, model, loss_fn, optimizer, lr_scheduler, epoch, total_loss):
    progress_bar = tqdm(range(len(dataloader)))
    progress_bar.set_description(f'loss: {0:>7f}')
    finish_step_num = (epoch - 1) * len(dataloader)

    model.train()
    for step, (X, y) in enumerate(dataloader, start=1):
        X, y = X.to(device), y.to(device)
        pred = model(X)
        loss = loss_fn(pred, y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        total_loss += loss.item()
        progress_bar.set_description(f'loss: {total_loss / (finish_step_num + step):>7f}')
        progress_bar.update(1)
    return total_loss

def test_loop(dataloader, model, mode='Test'):
    assert mode in ['Valid', 'Test']
    size = len(dataloader.dataset)
    correct = 0

    model.eval()
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()

    correct /= size
    print(f"{mode} Accuracy: {(100 * correct):>0.1f}%\n")
    return correct
Add an optimizer
The optimizers in the Transformers library, paired with a learning-rate scheduler, gradually reduce the learning rate during training (which usually gives better results), for example the AdamW optimizer we used earlier.
from transformers import AdamW
optimizer = AdamW(model.parameters(), lr=5e-5)
To define the learning-rate scheduler correctly, we need to know the total number of training steps, which equals the number of epochs multiplied by the number of steps per epoch (i.e. the length of the training dataloader):
from transformers import get_scheduler

epochs = 3
num_training_steps = epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
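To see what the "linear" schedule does, here is a throwaway sketch with a dummy parameter (so the real optimizer state is untouched): with num_warmup_steps=0 the learning rate decays linearly from its initial value to zero over num_training_steps.

from torch.optim import AdamW as TorchAdamW  # plain PyTorch AdamW, used only for this illustration

dummy_opt = TorchAdamW([torch.nn.Parameter(torch.zeros(1))], lr=5e-5)
dummy_sched = get_scheduler("linear", optimizer=dummy_opt, num_warmup_steps=0, num_training_steps=10)
for step in range(10):
    print(step, dummy_opt.param_groups[0]['lr'])  # 5e-5, 4.5e-5, ..., 5e-6
    dummy_opt.step()
    dummy_sched.step()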
The complete training procedure is as follows:
from transformers import AdamW, get_scheduler

learning_rate = 1e-5
epoch_num = 3

loss_fn = nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=learning_rate)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=epoch_num * len(train_dataloader),
)

total_loss = 0.
for t in range(epoch_num):
    print(f"Epoch {t+1}/{epoch_num}\n-------------------------------")
    total_loss = train_loop(train_dataloader, model, loss_fn, optimizer, lr_scheduler, t+1, total_loss)
    test_loop(valid_dataloader, model, mode='Valid')
print("Done!")
Save the best model
Reusing the test_loop defined above, we track the best validation accuracy and save the model weights whenever it improves:

total_loss = 0.
best_acc = 0.
for t in range(epoch_num):
    print(f"Epoch {t+1}/{epoch_num}\n-------------------------------")
    total_loss = train_loop(train_dataloader, model, loss_fn, optimizer, lr_scheduler, t+1, total_loss)
    valid_acc = test_loop(valid_dataloader, model, mode='Valid')
    if valid_acc > best_acc:
        best_acc = valid_acc
        print('saving new weights...\n')
        torch.save(model.state_dict(), f'epoch_{t+1}_valid_acc_{(100*valid_acc):0.1f}_model_weights.bin')
print("Done!")