# coding: utf-8
"""Character-level RNN that classifies a person's name by country of origin.

Pipeline: read ``name<TAB>country`` pairs, one-hot encode each name character
by character, train a single ``nn.RNN`` + linear classifier, dump the training
curves to JSON, plot them against LSTM/GRU runs, and predict the top-3
countries for a new name.
"""
import os

# Set before torch / matplotlib are imported, exactly as in the original file.
# NOTE(review): presumably the usual workaround for the "duplicate OpenMP
# runtime" abort on some installs -- confirm it is still needed.
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

import json
import string
import time
from io import open

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

try:
    import matplotlib.pyplot as plt
except ImportError:
    # Plotting is optional: only plt_RNN() needs matplotlib, so training and
    # prediction must keep working on machines that do not have it.
    plt = None


# 1. Vocabulary: every character is one token, represented as a one-hot row.
all_letters = string.ascii_letters + " ,.;'"
print(all_letters)
n_letter = len(all_letters)  # vocabulary size == RNN input width
print('字符表的长度:', n_letter)

# O(1) character -> column lookup (all_letters contains no duplicates).
_LETTER_TO_INDEX = {letter: idx for idx, letter in enumerate(all_letters)}

# 2. Target classes; their count is the output width of the linear layer.
categorys = ['Italian', 'English', 'Arabic', 'Spanish', 'Scottish', 'Irish',
             'Chinese', 'Vietnamese', 'Japanese', 'French', 'Greek', 'Dutch',
             'Korean', 'Polish', 'Portuguese', 'Russian', 'Czech', 'German']
categorynum = len(categorys)
print('categorys--->', categorys)

_DATA_FILE = './data/name_classfication.txt'
_MODEL_FILE = './model/my_rnn.bin'
_HIDDEN_SIZE = 128


def line2tensor(x):
    """One-hot encode the string *x* into a ``(len(x), n_letter)`` float tensor.

    Characters that are not in ``all_letters`` produce an all-zero row.
    (``str.find`` returns -1 for them, which used to light up the last
    column, i.e. the apostrophe.)
    """
    tensor_x = torch.zeros(len(x), n_letter)
    for pos, letter in enumerate(x):
        col = _LETTER_TO_INDEX.get(letter)
        if col is not None:
            tensor_x[pos][col] = 1
    return tensor_x


# 3. Read the raw data.
def read_data(filename):
    """Read ``name<TAB>country`` lines from *filename*.

    Lines of five characters or fewer (newline included) are treated as
    abnormal samples and skipped.

    Returns:
        A pair ``(names, countries)`` of equally long lists of strings.

    Raises:
        ValueError: if a kept line does not split into exactly two
            tab-separated fields.
    """
    my_list_x, my_list_y = [], []
    with open(filename, 'r', encoding='utf-8') as fr:
        for line in fr:  # stream the file instead of materialising readlines()
            if len(line) <= 5:
                continue
            x, y = line.strip().split('\t')
            my_list_x.append(x)
            my_list_y.append(y)
    return my_list_x, my_list_y


# 4. Dataset.
class NameClsDataset(Dataset):
    """Dataset of (one-hot name tensor, class-index tensor) pairs."""

    def __init__(self, mylist_x, mylist_y):
        self.mylist_x = mylist_x
        self.mylist_y = mylist_y

    def __len__(self):
        return len(self.mylist_x)

    def __getitem__(self, item):
        """Return ``(tensor_x, tensor_y)`` for sample *item*.

        Out-of-range indices are clamped into ``[0, len - 1]`` rather than
        raising, as in the original implementation.

        Raises:
            ValueError: if the sample's label is not in ``categorys``.
        """
        index = min(max(item, 0), len(self.mylist_x) - 1)
        x = self.mylist_x[index]
        y = self.mylist_y[index]
        tensor_x = line2tensor(x)
        tensor_y = torch.tensor(categorys.index(y), dtype=torch.long)
        return tensor_x, tensor_y


# 5. DataLoader.
def get_dataloader():
    """Build a shuffled, batch-size-1 DataLoader over the name data file.

    The batch size stays at 1 because each sample has ``len(name)`` rows and
    no ``collate_fn`` is supplied to pad them to a common length.
    """
    my_list_x, my_list_y = read_data(_DATA_FILE)
    mydataset = NameClsDataset(my_list_x, my_list_y)
    return DataLoader(mydataset, batch_size=1, shuffle=True)


# 6. Model.
class MyRNN(nn.Module):
    """``nn.RNN`` -> ``nn.Linear`` -> ``LogSoftmax`` sequence classifier."""

    def __init__(self, input_size, hidden_size, output_size, num_layers=1):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.num_layers = num_layers

        self.rnn = nn.RNN(self.input_size, self.hidden_size, self.num_layers,
                          batch_first=True)
        self.linear = nn.Linear(self.hidden_size, self.output_size)
        self.softmax = nn.LogSoftmax(dim=-1)

    def forward(self, input):
        """Classify a batch of one-hot sequences.

        Args:
            input: tensor of shape ``(batch, seq_len, input_size)``
                (the RNN is built with ``batch_first=True``).

        Returns:
            ``(log_probs, h_n)`` where ``log_probs`` is
            ``(batch, output_size)`` and ``h_n`` is the RNN's final hidden
            state, ``(num_layers, batch, hidden_size)``.
        """
        _, rnn_hn = self.rnn(input)
        # Classify from the TOP layer's final state. Index 0 is the bottom
        # layer, which is only the same thing when num_layers == 1.
        last_hidden = rnn_hn[-1]
        output = self.linear(last_hidden)
        return self.softmax(output), rnn_hn


# 7. Smoke test for the model.
def ceshiRNN():
    """Push one sample through an untrained model and print output shapes."""
    my_dataloader = get_dataloader()
    my_rnn = MyRNN(n_letter, _HIDDEN_SIZE, len(categorys))
    x, y = next(iter(my_dataloader))
    output, hn = my_rnn(x)
    print(output.shape)
    print(hn.shape)


# 8. Training.
def train_my_rnn():
    """Train ``MyRNN`` for one epoch and save its weights.

    Running averages of loss and accuracy (over all samples seen so far) are
    recorded every 100 steps and printed every 1000 steps.

    Returns:
        ``(total_loss_list, total_acc_list, all_time)``: the recorded
        running-average losses, running-average accuracies, and the total
        wall-clock training time in seconds.
    """
    epochs = 1
    my_lr = 1e-3

    my_dataloader = get_dataloader()
    # Sizes come from the vocabulary / label list, not hard-coded 57 and 18.
    my_rnn = MyRNN(n_letter, _HIDDEN_SIZE, categorynum)
    # The model already emits log-probabilities, hence NLLLoss.
    my_crossentropy = nn.NLLLoss()
    my_optimizer = optim.Adam(my_rnn.parameters(), lr=my_lr)

    start_time = time.time()
    total_iter_num = 0   # samples trained so far
    total_loss = 0       # summed loss over those samples
    total_loss_list = []
    total_acc_num = 0    # correctly classified samples so far
    total_acc_list = []

    for epoch_idx in range(epochs):
        for x, y in my_dataloader:
            output, _ = my_rnn(x)
            my_loss = my_crossentropy(output, y)

            my_optimizer.zero_grad()
            my_loss.backward()
            my_optimizer.step()

            total_iter_num += 1
            total_loss += my_loss.item()
            # .item() is valid because the batch size is 1.
            if torch.argmax(output, dim=-1).item() == y.item():
                total_acc_num += 1

            if total_iter_num % 100 == 0:
                total_loss_list.append(total_loss / total_iter_num)
                total_acc_list.append(total_acc_num / total_iter_num)

            if total_iter_num % 1000 == 0:
                loss_avg = total_loss / total_iter_num
                acc_avg = total_acc_num / total_iter_num
                use_time = time.time() - start_time
                print(
                    '当前的训练批次:%d, 平均损失:%.5f, 训练时间:%.3f, 准确率:%.2f' % (
                        epoch_idx + 1, loss_avg, use_time, acc_avg
                    )
                )

    # torch.save does not create missing directories.
    os.makedirs(os.path.dirname(_MODEL_FILE), exist_ok=True)
    torch.save(my_rnn.state_dict(), _MODEL_FILE)

    all_time = time.time() - start_time
    print('总耗时:', all_time)
    return total_loss_list, total_acc_list, all_time


# 9. Persist the training curves so they can be plotted later.
def save_rnn_res():
    """Train the model and write loss/accuracy/time to a JSON file."""
    total_loss_list, total_acc_list, all_time = train_my_rnn()
    dict1 = {
        'loss': total_loss_list,
        'time': all_time,
        'acc': total_acc_list
    }
    with open('./data/rnn_result.json', 'w') as fw:
        fw.write(json.dumps(dict1))


# 10. Load a result JSON.
def read_json(json_path):
    """Return the parsed content of the JSON file at *json_path*."""
    with open(json_path, 'r') as fr:
        return json.load(fr)


# 11. Plots.
def plt_RNN():
    """Plot loss, training time and accuracy for the RNN/LSTM/GRU runs.

    Reads the three ``*_result-epoch3.json`` files under ``./data`` and
    writes ``loss.png``, ``use_time.png`` and ``acc.png`` under ``./picture``
    (created if missing), showing each figure as it is produced.

    Raises:
        ImportError: if matplotlib is not installed.
    """
    if plt is None:
        raise ImportError('matplotlib is required for plt_RNN()')

    # (legend label, result file, extra plot kwargs); RNN keeps the default
    # colour, as before.
    runs = [
        ('RNN', './data/rnn_result-epoch3.json', {}),
        ('LSTM', './data/lstm_result-epoch3.json', {'color': 'red'}),
        ('GRU', './data/gru_result-epoch3.json', {'color': 'orange'}),
    ]
    loaded = [(label, read_json(path), style) for label, path, style in runs]

    os.makedirs('./picture', exist_ok=True)

    # Loss curves.
    plt.figure(0)
    for label, result, style in loaded:
        plt.plot(result['loss'], label=label, **style)
    plt.legend(loc='upper right')
    plt.savefig('./picture/loss.png')
    plt.show()

    # Training time bars.
    plt.figure(1)
    x_data = [label for label, _, _ in loaded]
    y_data = [result['time'] for _, result, _ in loaded]
    plt.bar(range(len(x_data)), y_data, tick_label=x_data)
    plt.savefig('./picture/use_time.png')
    plt.show()

    # Accuracy curves.
    plt.figure(2)
    for label, result, style in loaded:
        plt.plot(result['acc'], label=label, **style)
    plt.legend(loc='upper right')
    plt.savefig('./picture/acc.png')
    plt.show()


# 12. Prediction.
def rnn_predict(x):
    """Print and return the three most likely countries for the name *x*.

    Args:
        x: the name to classify; must be non-empty.

    Returns:
        A list of three ``(country, log_probability)`` tuples, best first.

    Raises:
        ValueError: if *x* is empty (the RNN needs at least one time step).
    """
    if not x:
        raise ValueError('name must be a non-empty string')

    tensor_x = line2tensor(x)

    my_rnn = MyRNN(input_size=n_letter, hidden_size=_HIDDEN_SIZE,
                   output_size=categorynum)
    # NOTE: torch.load unpickles the file -- only load checkpoints you trust.
    my_rnn.load_state_dict(torch.load(_MODEL_FILE, map_location='cpu'))
    my_rnn.eval()

    with torch.no_grad():  # inference only, no gradients needed
        input0 = tensor_x.unsqueeze(0)  # add the batch dimension
        output, _ = my_rnn(input0)
        topv, topi = output.topk(3, dim=1, largest=True)

    print('人名是', x)
    predictions = []
    for rank in range(3):
        cate = categorys[topi[0][rank].item()]
        print('国家名是:', cate)
        predictions.append((cate, topv[0][rank].item()))
    return predictions


if __name__ == '__main__':
    # Other entry points: get_dataloader(), ceshiRNN(), train_my_rnn(),
    # save_rnn_res(), plt_RNN().
    rnn_predict('zhang')