目录
一、学习目标
- 理解循环神经网络RNN的基本原理。
- 掌握利用循环神经网络进行文本分类的方法。
二、学习内容
利用循环神经网络进行新闻话题分类的代码,设置Embedding的trainable=True,并调整网络结构,看是否可以提高识别率。
三、学习过程
经过不断的调参,得出下表结果,当设置Embedding的trainable=True时,并经过下表的参数调整,识别率有所下降。
SimpleRNN:
optimizer | lr | batch_ size | trainable | epochs | 验证集识别率 |
Adam | 0.001 | 512 | False | 10 | 0.4666 |
Adam | 0.001 | 512 | True | 10 | 0.4974 |
RMSprop | 0.001 | 512 | True | 10 | 0.5083 |
SGD | 0.001 | 512 | True | 10 | 0.2692 |
Lstm:
optimizer | lr | batch_ size | trainable | epochs | 验证集识别率 |
Adam | 0.001 | 512 | False | 10 | 0.5904 |
Adam | 0.001 | 512 | True | 10 | 0.5670 |
RMSprop | 0.001 | 512 | True | 10 | 0.5940 |
SGD | 0.001 | 512 | True | 10 | 0.2674 |
GRU:
optimizer | lr | batch_ size | trainable | epochs | 验证集识别率 |
Adam | 0.001 | 512 | False | 10 | 0.5973 |
Adam | 0.001 | 512 | True | 10 | 0.5518 |
RMSprop | 0.001 | 512 | True | 10 | 0.5916 |
SGD | 0.001 | 512 | True | 10 | 0.2800 |
网络结构:
优化器为Adam,trainable=True,各种循环神经网络的识别率:
SimpleRNN | |
LSTM | |
GRU |
优化器为RMSprop,trainable=True,各种循环神经网络的识别率:
SimpleRNN | |
LSTM | |
GRU |
优化器为SGD,trainable=True,各种循环神经网络的识别率:
SimpleRNN | |
LSTM | |
GRU |
可见,SGD并不适合当做循环神经网络的优化器。
四、源码
# In[1]: 读取新闻话题分类数据
import pandas as pd
df = pd.read_json(r'D:\Cadabra_tools002\course_data\News_Category_Dataset.json', lines=True)
df.head()
# In[2]: 预处理,合并 "WORLDPOST"和"THE WORLDPOST"两种类别
df.category = df.category.map(lambda x:"WORLDPOST" if x == "THE WORLDPOST" else x)
categories = df.groupby('category')
print("total categories: ", categories.ngroups)
print(categories.size())
# In[3]: 将单词进行标号
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
# 将标题和正文合并
df['text'] = df.headline + " " + df.short_description
# 将单词进行标号
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df.text)
X = tokenizer.texts_to_sequences(df.text)
df['words'] = X
#记录每条数据的单词数
df['word_length'] = df.words.apply(lambda i: len(i))
#清除单词数不足5个的数据条目
df = df[df.word_length >= 5]
df.word_length.describe()
# In[4]: 将类别进行编号
maxlen = 50
X = list(sequence.pad_sequences(df.words, maxlen=maxlen))
# 将类别进行编号,
categories = df.groupby('category').size().index.tolist()
category_int = {}
int_category = {}
for i, k in enumerate(categories):
category_int.update({k:i}) # 类别 -> 编号
int_category.update({i:k}) # 编号 -> 类别
df['c2id'] = df['category'].apply(lambda x: category_int[x])
# In[5]: 随机选取训练样本
import numpy as np
import keras.utils as utils
from sklearn.model_selection import train_test_split
X = np.array(X)
Y = utils.to_categorical(list(df.c2id))
# 将数据分成两部分,随机取80%用于训练,20%用于测试
seed = 29 # 随机种子
x_train, x_val, y_train, y_val = train_test_split(X, Y, test_size=0.2, random_state=seed)
# In[6]: 加载预先训练好的单词向量
EMBEDDING_DIM = 100
embeddings_index = {}
f = open(r'D:\Cadabra_tools002\course_data\glove.6B.100d.txt',errors='ignore') # 每个单词用100个数字的向量表示
for line in f:
values = line.split()
word = values[0]
coefs = np.asarray(values[1:], dtype='float32')
embeddings_index[word] = coefs
f.close()
print('Total %s word vectors.' %len(embeddings_index)) #399913
# In[7]: 构造Embedding层,并用预训练好的单词向量初始化,注意该层不用训练
from keras.initializers import Constant
from keras.layers.embeddings import Embedding
word_index = tokenizer.word_index
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
embedding_vector = embeddings_index.get(word)
#根据单词挑选出对应向量
if embedding_vector is not None:
embedding_matrix[i] = embedding_vector
embedding_layer = Embedding(len(word_index)+1, EMBEDDING_DIM,
embeddings_initializer=Constant(embedding_matrix),
input_length = maxlen,
trainable=True #可改为True
)
# In[8]: 构造神经网络并训练,对单词向量进行分类
from keras import optimizers
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
model = Sequential()
model.add(embedding_layer) # 注意该层的 trainable=False
model.add(Flatten())
model.add(Dense(64, activation='relu')) #
#当结果是输出多个分类的概率时,用softmax激活函数,它将为30个分类提供不同的可能性概率值
model.add(Dense(len(int_category), activation='softmax'))
#对于输出多个分类结果,最好的损失函数是categorical_crossentropy
optimizer = optimizers.Adam(lr=0.001)#调整识别率
#optimizer = optimizers.RMSprop(lr=0.001)#调整识别率
model.compile(optimizer, loss='categorical_crossentropy', metrics=['acc'])
model.summary()
history = model.fit(x_train, y_train, epochs=3, validation_data=(x_val, y_val), batch_size=512)
# val_accuracy: 0.4611
# 识别率并不高,这是因为全连接网络不适合用于识别单词序列
# In[9]: 绘制训练过程中识别率和损失的变化
import matplotlib.pyplot as plt
acc = history.history['acc']
val_acc = history.history['val_acc']
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(1, len(acc) + 1)
plt.title('Training and validation accuracy')
plt.plot(epochs, acc, 'red', label='Training acc')
plt.plot(epochs, val_acc, 'blue', label='Validation acc')
plt.legend()
plt.figure()
plt.title('Training and validation loss')
plt.plot(epochs, loss, 'red', label='Training loss')
plt.plot(epochs, val_loss, 'blue', label='Validation loss')
plt.legend()
plt.show()
# In[10]: 使用循环神经网络 RNN 进行分类
from keras.layers import SimpleRNN
model = Sequential()
model.add(embedding_layer)
model.add(SimpleRNN(64)) #可改变单词向量的维度
#model.add(Dense(32, activation='Relu'))
model.add(Dense(len(int_category), activation='softmax'))
optimizer = optimizers.Adam(lr=0.001)#调整识别率
#optimizer = optimizers.RMSprop(lr=0.001)#调整识别率
#optimizer = optimizers.SGD(lr=0.001)#调整识别率
model.compile(optimizer, loss='categorical_crossentropy', metrics=['acc'])
history = model.fit(x_train, y_train, epochs=10, validation_data=(x_val, y_val), batch_size=512)
# val_accuracy: 0.4486
# 绘制训练过程中识别率和损失的变化
acc = history.history['acc']
val_acc = history.history['val_acc']
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(1, len(acc) + 1)
plt.title('Training and validation accuracy')
plt.plot(epochs, acc, 'red', label='Training acc')
plt.plot(epochs, val_acc, 'blue', label='Validation acc')
plt.legend()
plt.show()
# In[11]: long short term memory(长短期记忆网络)
from keras.layers import LSTM
model = Sequential()
model.add(embedding_layer)
model.add(LSTM(64,activation='relu',return_sequences=True)) #输出每一个句子,激活函数换成sigmoid,或者activation='relu'
model.add(LSTM(32)) #再添加一个LSTM,识别率反而下降
#model.add(Flatten()) #进行展平
model.add(Dense(len(int_category), activation='softmax')) #
#optimizer = optimizers.RMSprop(lr=0.001)#调整识别率
optimizer = optimizers.Adam(lr=0.001)#调整识别率
#optimizer = optimizers.SGD(lr=0.001)#调整识别率
model.compile(optimizer, loss='categorical_crossentropy', metrics=['acc'])
history = model.fit(x_train, y_train, epochs=10, validation_data=(x_val, y_val), batch_size=512)
# val_acc: 0.5920
# 绘制训练过程中识别率和损失的变化
acc = history.history['acc']
val_acc = history.history['val_acc']
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(1, len(acc) + 1)
plt.title('Training and validation accuracy')
plt.plot(epochs, acc, 'red', label='Training acc')
plt.plot(epochs, val_acc, 'blue', label='Validation acc')
plt.legend()
plt.show()
# In[12]: Gate Recurrent Unit (门控循环单元)
from keras.layers import GRU, Conv1D, MaxPooling1D
model = Sequential()
model.add(embedding_layer)
#使用一维卷积网络切割输入数据,参数5表示每各个单词作为切割小段
model.add(Conv1D(64, 5, activation='relu'))
#参数3表示,上层传下来的数据中,从每3个数值中抽取最大值
model.add(MaxPooling1D(3))
#添加一个有记忆性的GRU层,其原理与LSTM相同,运行速度更快,准确率有所降低
model.add(GRU(64, dropout=0.1))
model.add(Dense(len(int_category), activation='softmax'))
#optimizer = optimizers.RMSprop(lr=0.001)#调整识别率
optimizer = optimizers.Adam(lr=0.001)#调整识别率
#optimizer = optimizers.SGD(lr=0.001)#调整识别率
model.compile(optimizer, loss='categorical_crossentropy', metrics=['acc'])
history = model.fit(x_train, y_train, epochs=5, validation_data=(x_val, y_val), batch_size=512)
# val_acc: 0.5741
# 绘制训练过程中识别率和损失的变化
acc = history.history['acc']
val_acc = history.history['val_acc']
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(1, len(acc) + 1)
plt.title('Training and validation accuracy')
plt.plot(epochs, acc, 'red', label='Training acc')
plt.plot(epochs, val_acc, 'blue', label='Validation acc')
plt.legend()
plt.show()
五、学习产出
- LSTM的识别率到了60%就已经封顶了,调参试过很多次,识别率仍无法上去,迭代次数过多也会造成识别率的下降。