实验4 RNN&LSTM
学号:111
姓名:XXX
实验目的
通过本实验让学生了解RNN&LSTM网络的基本原理,并通过Keras和Tensorflow构建相关模型完成文本情感分析识别和时间序列预测。
实验要求:
(1)编码规范
(2)代码高效
(3)注释充分,程序可读性好
(4)程序无bug
(5)方法接口规范定义
实验内容
(1)基于keras,使用LSTM进行文本情感识别(from keras.datasets import imdb),并对分类性能进行评估和可视化;(2)基于keras,使用LSTM进行时间序列预测,并对预测性能进行评估和可视化。
实验步骤
1文本情感识别
#导入第三方库
from __future__ import print_function
import numpy as np
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional
from keras.datasets import imdb
from keras.utils import pad_sequences
1.1 数据集读取
# Vocabulary size handed to imdb.load_data as num_words and to the Embedding layer.
max_features = 20000
# Fixed review length (in tokens) used when padding the sequences.
maxlen = 100
# Mini-batch size used by model.fit.
batch_size = 32
print('Loading data...')
# Load the IMDB reviews (already encoded as integer word indices) and their labels.
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
# Report how many training reviews were loaded.
print(len(x_train), 'train sequences')
# Report how many test reviews were loaded.
print(len(x_test), 'test sequences')
print('Pad sequences (samples x time)')
Loading data...
25000 train sequences
25000 test sequences
Pad sequences (samples x time)
1.2数据预处理
# Pad (or truncate) every review to exactly `maxlen` tokens so the samples
# form a rectangular (samples, maxlen) integer matrix.
x_train = pad_sequences(x_train, maxlen=maxlen)
x_test = pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)
# Make sure the labels are NumPy arrays before they are handed to Keras.
y_train = np.array(y_train)
y_test = np.array(y_test)
x_train shape: (25000, 100)
x_test shape: (25000, 100)
1.3模型构建
# Bidirectional-LSTM binary sentiment classifier.
model = Sequential()
# Embedding layer: maps each word index to a dense vector.
model.add(Embedding(max_features, # vocabulary size, i.e. number of rows of the embedding matrix
128, # dimension of each word vector, i.e. number of columns of the embedding matrix
input_length=maxlen)) # number of tokens in one (padded) review
# Recurrent hidden layer.
model.add(Bidirectional(LSTM(64))) # 64 units per direction; Bidirectional concatenates both directions by default, so the layer outputs a 128-dim vector
# Dropout layer to reduce over-fitting.
model.add(Dropout(0.5))
# Output layer.
model.add(Dense(1, activation='sigmoid')) # single sigmoid output in [0, 1]
# Adam optimizer, binary cross-entropy loss, accuracy reported as the metric.
model.compile('adam', 'binary_crossentropy', metrics=['accuracy'])
1.4模型训练
# Train the sentiment classifier.
print('Train...')
history = model.fit(
    x_train, y_train,
    # Samples per gradient step; batches per epoch = ceil(len(x_train) / batch_size).
    batch_size=batch_size,
    # Number of full passes over the training set.
    epochs=4,
    # Keras expects validation_data as a tuple and ignores validation_split
    # whenever validation_data is supplied, so the ineffective
    # validation_split=0.2 argument is no longer passed.
    # NOTE(review): the test set doubles as the validation set here, so the
    # later model.evaluate(x_test, y_test) is not an independent estimate.
    validation_data=(x_test, y_test),
)
Train...
Epoch 1/4
782/782 [==============================] - 93s 115ms/step - loss: 0.4111 - accuracy: 0.8065 - val_loss: 0.3329 - val_accuracy: 0.8551
Epoch 2/4
782/782 [==============================] - 91s 117ms/step - loss: 0.2180 - accuracy: 0.9155 - val_loss: 0.3607 - val_accuracy: 0.8498
Epoch 3/4
782/782 [==============================] - 89s 114ms/step - loss: 0.1240 - accuracy: 0.9559 - val_loss: 0.4296 - val_accuracy: 0.8394
Epoch 4/4
782/782 [==============================] - 90s 116ms/step - loss: 0.0632 - accuracy: 0.9778 - val_loss: 0.5341 - val_accuracy: 0.8359
1.5 模型性能评估
# Evaluate on the test set.
# evaluate() returns the loss followed by the metrics given to compile();
# the model was compiled with metrics=['accuracy'], so the value stored in
# `precision` (and printed below) is classification accuracy, not precision
# in the precision/recall sense.
loss, precision = model.evaluate(x_test, y_test, verbose=1)
print("precision=", precision)
782/782 [==============================] - 17s 22ms/step - loss: 0.5341 - accuracy: 0.8359
precision= 0.835919976234436
1.6.模型可视化
import matplotlib.pyplot as plt

# Plot per-epoch accuracy on the training data and on the validation data
# recorded by model.fit.
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
# Legend entries follow the order of the two plot() calls above
# (the second label was previously misspelled as 'valivation').
plt.legend(['training', 'validation'], loc='upper left')
plt.show()
![训练集与验证集准确率曲线](%E5%AE%9E%E9%AA%8C%E5%9B%9B%20%E5%BE%AA%E7%8E%AF%E7%A5%9E%E7%BB%8F%E7%BD%91%E7%BB%9C_files/%E5%AE%9E%E9%AA%8C%E5%9B%9B%20%E5%BE%AA%E7%8E%AF%E7%A5%9E%E7%BB%8F%E7%BD%91%E7%BB%9C_23_0.png)
2.时间序列预测
2.1 数据获取
from math import sqrt
from numpy import concatenate
from matplotlib import pyplot
from pandas import read_csv
from pandas import DataFrame
from pandas import concat
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
# Load the pollution data set: row 0 is the header, column 0 becomes the index.
dataset = read_csv('pollution1.csv', header=0, index_col=0) # pandas DataFrame
# Raw cell values of the DataFrame (header and index are not part of it).
values = dataset.values # NumPy array of the table contents
2.2 数据预处理
# Data visualisation: draw every selected column as its own stacked subplot.
# Column indices to plot. Index 4 is skipped; it is the wind-direction column
# that is label-encoded later, so it presumably holds non-numeric values here.
groups = [0, 1, 2, 3, 5, 6, 7]
pyplot.figure()
# enumerate(..., start=1) yields the 1-based subplot position directly,
# replacing the hand-maintained counter.
for position, group in enumerate(groups, start=1):
    pyplot.subplot(len(groups), 1, position)
    pyplot.plot(values[:, group])
    pyplot.title(dataset.columns[group], y=0.5, loc='right')
pyplot.show()
![各特征随时间变化曲线](%E5%AE%9E%E9%AA%8C%E5%9B%9B%20%E5%BE%AA%E7%8E%AF%E7%A5%9E%E7%BB%8F%E7%BD%91%E7%BB%9C_files/%E5%AE%9E%E9%AA%8C%E5%9B%9B%20%E5%BE%AA%E7%8E%AF%E7%A5%9E%E7%BB%8F%E7%BD%91%E7%BB%9C_29_0.png)
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    """Frame a time series as a supervised-learning table.

    Every output row holds the observations of the ``n_in`` preceding time
    steps, followed by the observations of the current step and of the
    ``n_out - 1`` steps after it.

    Args:
        data: Observations ordered by time, in any form ``DataFrame``
            accepts (a flat list, a list of rows, a 2-D array with one
            column per variable, or a DataFrame).
        n_in: Number of lagged steps (t-n_in ... t-1) used as input columns.
        n_out: Number of steps (t ... t+n_out-1) used as output columns.
        dropnan: When true, drop every row containing NaN (the shifts
            introduce NaN at the start and end of the table).

    Returns:
        DataFrame whose columns are named ``var<j>(t-<i>)``, ``var<j>(t)``
        and ``var<j>(t+<i>)``, with lag columns first (oldest lag leftmost).
    """
    df = DataFrame(data)
    # Take the variable count from the frame itself rather than guessing from
    # the input type: a list of multi-variable rows has more than one column,
    # and a 1-D array has no shape[1] to read.
    n_vars = df.shape[1]
    cols, names = [], []
    # Input sequence: t-n_in, ..., t-1. Shifting down by i rows aligns the
    # observation from i steps earlier with the current row.
    for i in range(n_in, 0, -1):
        cols.append(df.shift(i))
        names += ['var%d(t-%d)' % (j + 1, i) for j in range(n_vars)]
    # Forecast sequence: t, t+1, ..., t+n_out-1 (shifting up by i rows).
    for i in range(n_out):
        cols.append(df.shift(-i))
        if i == 0:
            names += ['var%d(t)' % (j + 1) for j in range(n_vars)]
        else:
            names += ['var%d(t+%d)' % (j + 1, i) for j in range(n_vars)]
    # Place all shifted copies side by side and label the columns.
    agg = concat(cols, axis=1)
    agg.columns = names
    # Drop rows with NaN values.
    if dropnan:
        agg.dropna(inplace=True)
    return agg
# integer encode direction
encoder = LabelEncoder()
# Column 4 holds the wind direction as categorical labels; replace each label with an integer code.
values[:,4] = encoder.fit_transform(values[:,4])
# Convert every column to float32.
values = values.astype('float32')
# Normalise every feature to the range [0, 1].
# NOTE(review): the scaler is fitted on the full data set before the train/test
# split performed later, so test-set statistics influence the scaling — confirm
# this is acceptable for the experiment.
scaler = MinMaxScaler(feature_range=(0, 1))
scaled = scaler.fit_transform(values)
# Number of input time steps and number of features per time step.
#specify the number of lag hours
n_hours = 3
n_features = 8
# frame as supervised learning
reframed = series_to_supervised(scaled, n_hours, 1)
print(reframed.shape)
(43797, 32)
# split into train and test sets
values = reframed.values
# The first 365 * 24 rows form the training set; all remaining rows form the test set.
n_train_hours = 365 * 24
train = values[:n_train_hours, :]
test = values[n_train_hours:, :]
# split into input and outputs
n_obs = n_hours * n_features # number of lagged input columns (3 * 8 = 24)
# Inputs are the first n_obs (lagged) columns; the target is the column
# n_features from the end, i.e. the first variable at time t.
train_X, train_y = train[:, :n_obs], train[:, -n_features]
test_X, test_y = test[:, :n_obs], test[:, -n_features]
print(train_X.shape, len(train_X), train_y.shape)
# reshape input to be 3D [samples, timesteps, features]
train_X = train_X.reshape((train_X.shape[0], n_hours, n_features))
test_X = test_X.reshape((test_X.shape[0], n_hours, n_features))
print(train_X.shape, train_y.shape, test_X.shape, test_y.shape)
(8760, 24) 8760 (8760,)
(8760, 3, 8) (8760,) (35037, 3, 8) (35037,)
2.3 模型构建
# LSTM network design
model = Sequential()
# LSTM arguments: 50 units (output dimension) and input_shape=(timesteps, features) taken from train_X
model.add(LSTM(50, input_shape=(train_X.shape[1], train_X.shape[2])))# LSTM layer
# Fully connected output layer producing a single value
model.add(Dense(1))
# Optimizer: adam; loss: mean absolute error ('mae'), not mean squared error
model.compile(loss='mae', optimizer='adam')# set the loss function and the optimizer
model.summary()
Model: "sequential"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
lstm (LSTM) (None, 50) 11800
dense (Dense) (None, 1) 51
=================================================================
Total params: 11,851
Trainable params: 11,851
Non-trainable params: 0
_________________________________________________________________
2.4 模型训练
# fit network
# 50 epochs with batches of 72 samples; the test split is passed as validation
# data, and shuffle=False keeps the samples in the order they appear in train_X.
history = model.fit(train_X, train_y, epochs=50, batch_size=72, validation_data=(test_X, test_y), verbose=2, shuffle=False)
Epoch 1/50
122/122 - 4s - loss: 0.0469 - val_loss: 0.0717 - 4s/epoch - 29ms/step
Epoch 2/50
122/122 - 1s - loss: 0.0269 - val_loss: 0.0492 - 1s/epoch - 8ms/step
Epoch 3/50
122/122 - 1s - loss: 0.0215 - val_loss: 0.0303 - 779ms/epoch - 6ms/step
Epoch 4/50
122/122 - 1s - loss: 0.0210 - val_loss: 0.0268 - 811ms/epoch - 7ms/step
Epoch 5/50
122/122 - 1s - loss: 0.0204 - val_loss: 0.0234 - 788ms/epoch - 6ms/step
Epoch 6/50
122/122 - 1s - loss: 0.0199 - val_loss: 0.0206 - 859ms/epoch - 7ms/step
Epoch 7/50
122/122 - 1s - loss: 0.0196 - val_loss: 0.0201 - 975ms/epoch - 8ms/step
Epoch 8/50
122/122 - 1s - loss: 0.0192 - val_loss: 0.0191 - 941ms/epoch - 8ms/step
Epoch 9/50
122/122 - 1s - loss: 0.0189 - val_loss: 0.0184 - 967ms/epoch - 8ms/step
Epoch 10/50
122/122 - 1s - loss: 0.0184 - val_loss: 0.0179 - 951ms/epoch - 8ms/step
Epoch 11/50
122/122 - 1s - loss: 0.0181 - val_loss: 0.0175 - 997ms/epoch - 8ms/step
Epoch 12/50
122/122 - 1s - loss: 0.0177 - val_loss: 0.0173 - 1s/epoch - 9ms/step
Epoch 13/50
122/122 - 1s - loss: 0.0173 - val_loss: 0.0171 - 1s/epoch - 8ms/step
Epoch 14/50
122/122 - 1s - loss: 0.0170 - val_loss: 0.0168 - 967ms/epoch - 8ms/step
Epoch 15/50
122/122 - 1s - loss: 0.0168 - val_loss: 0.0169 - 982ms/epoch - 8ms/step
Epoch 16/50
122/122 - 1s - loss: 0.0164 - val_loss: 0.0175 - 931ms/epoch - 8ms/step
Epoch 17/50
122/122 - 1s - loss: 0.0160 - val_loss: 0.0172 - 958ms/epoch - 8ms/step
Epoch 18/50
122/122 - 1s - loss: 0.0157 - val_loss: 0.0170 - 979ms/epoch - 8ms/step
Epoch 19/50
122/122 - 1s - loss: 0.0152 - val_loss: 0.0175 - 946ms/epoch - 8ms/step
Epoch 20/50
122/122 - 1s - loss: 0.0153 - val_loss: 0.0171 - 931ms/epoch - 8ms/step
Epoch 21/50
122/122 - 1s - loss: 0.0149 - val_loss: 0.0174 - 957ms/epoch - 8ms/step
Epoch 22/50
122/122 - 1s - loss: 0.0152 - val_loss: 0.0163 - 928ms/epoch - 8ms/step
Epoch 23/50
122/122 - 1s - loss: 0.0150 - val_loss: 0.0154 - 955ms/epoch - 8ms/step
Epoch 24/50
122/122 - 1s - loss: 0.0149 - val_loss: 0.0159 - 966ms/epoch - 8ms/step
Epoch 25/50
122/122 - 1s - loss: 0.0146 - val_loss: 0.0159 - 965ms/epoch - 8ms/step
Epoch 26/50
122/122 - 1s - loss: 0.0148 - val_loss: 0.0150 - 981ms/epoch - 8ms/step
Epoch 27/50
122/122 - 1s - loss: 0.0146 - val_loss: 0.0157 - 1s/epoch - 10ms/step
Epoch 28/50
122/122 - 1s - loss: 0.0147 - val_loss: 0.0148 - 1s/epoch - 8ms/step
Epoch 29/50
122/122 - 1s - loss: 0.0145 - val_loss: 0.0154 - 1s/epoch - 8ms/step
Epoch 30/50
122/122 - 1s - loss: 0.0147 - val_loss: 0.0148 - 986ms/epoch - 8ms/step
Epoch 31/50
122/122 - 1s - loss: 0.0145 - val_loss: 0.0147 - 924ms/epoch - 8ms/step
Epoch 32/50
122/122 - 1s - loss: 0.0145 - val_loss: 0.0143 - 931ms/epoch - 8ms/step
Epoch 33/50
122/122 - 1s - loss: 0.0145 - val_loss: 0.0144 - 930ms/epoch - 8ms/step
Epoch 34/50
122/122 - 1s - loss: 0.0145 - val_loss: 0.0142 - 947ms/epoch - 8ms/step
Epoch 35/50
122/122 - 1s - loss: 0.0145 - val_loss: 0.0142 - 951ms/epoch - 8ms/step
Epoch 36/50
122/122 - 1s - loss: 0.0145 - val_loss: 0.0140 - 949ms/epoch - 8ms/step
Epoch 37/50
122/122 - 1s - loss: 0.0144 - val_loss: 0.0139 - 925ms/epoch - 8ms/step
Epoch 38/50
122/122 - 1s - loss: 0.0144 - val_loss: 0.0140 - 932ms/epoch - 8ms/step
Epoch 39/50
122/122 - 1s - loss: 0.0144 - val_loss: 0.0139 - 942ms/epoch - 8ms/step
Epoch 40/50
122/122 - 1s - loss: 0.0144 - val_loss: 0.0139 - 980ms/epoch - 8ms/step
Epoch 41/50
122/122 - 1s - loss: 0.0145 - val_loss: 0.0140 - 917ms/epoch - 8ms/step
Epoch 42/50
122/122 - 1s - loss: 0.0144 - val_loss: 0.0138 - 924ms/epoch - 8ms/step
Epoch 43/50
122/122 - 1s - loss: 0.0144 - val_loss: 0.0139 - 929ms/epoch - 8ms/step
Epoch 44/50
122/122 - 1s - loss: 0.0144 - val_loss: 0.0140 - 941ms/epoch - 8ms/step
Epoch 45/50
122/122 - 1s - loss: 0.0144 - val_loss: 0.0137 - 981ms/epoch - 8ms/step
Epoch 46/50
122/122 - 1s - loss: 0.0143 - val_loss: 0.0139 - 1s/epoch - 9ms/step
Epoch 47/50
122/122 - 1s - loss: 0.0144 - val_loss: 0.0138 - 1s/epoch - 10ms/step
Epoch 48/50
122/122 - 1s - loss: 0.0145 - val_loss: 0.0139 - 1s/epoch - 9ms/step
Epoch 49/50
122/122 - 1s - loss: 0.0142 - val_loss: 0.0136 - 1s/epoch - 10ms/step
Epoch 50/50
122/122 - 1s - loss: 0.0143 - val_loss: 0.0136 - 1s/epoch - 12ms/step
2.5模型测试
2.6.评估和可视化
# plot history
# Per-epoch training loss recorded by model.fit.
pyplot.plot(history.history['loss'], label='train')
# Per-epoch loss on the validation data (the test split was passed to fit).
pyplot.plot(history.history['val_loss'], label='test')
pyplot.legend()
pyplot.show()
![训练集与测试集损失曲线](%E5%AE%9E%E9%AA%8C%E5%9B%9B%20%E5%BE%AA%E7%8E%AF%E7%A5%9E%E7%BB%8F%E7%BD%91%E7%BB%9C_files/%E5%AE%9E%E9%AA%8C%E5%9B%9B%20%E5%BE%AA%E7%8E%AF%E7%A5%9E%E7%BB%8F%E7%BD%91%E7%BB%9C_40_0.png)
2.7 模型调参
调整模型参数,以获得更好的预测效果
# make a prediction (values are still in the scaler's [0, 1] range)
yhat = model.predict(test_X)
# Flatten the 3D input back to 2D so its columns can be concatenated below.
test_X = test_X.reshape((test_X.shape[0], n_hours * n_features))
# The scaler was fitted on n_features columns, so inverse_transform needs rows
# of that width: the target value goes first and the remaining n_features - 1
# columns are filled from the tail of the flattened input. Deriving the slice
# start from n_features replaces the previous hard-coded -7.
filler_start = test_X.shape[1] - (n_features - 1)
filler = test_X[:, filler_start:]
# invert scaling for forecast
inv_yhat = concatenate((yhat, filler), axis=1)
inv_yhat = scaler.inverse_transform(inv_yhat)
inv_yhat = inv_yhat[:, 0]
# invert scaling for actual
test_y = test_y.reshape((len(test_y), 1))
inv_y = concatenate((test_y, filler), axis=1)
inv_y = scaler.inverse_transform(inv_y)
inv_y = inv_y[:, 0]
# Root-mean-square error (RMSE) between actual and predicted values, in the
# original (unscaled) units.
rmse = sqrt(mean_squared_error(inv_y, inv_yhat))
print('Test RMSE: %.3f' % rmse)
1095/1095 [==============================] - 2s 2ms/step
Test RMSE: 27.032
实验总结
- 情感分析
- 情感是分类为积极与消极的二分类
- 设置数据中有20000个不同的单词,每句话由不超过100个单词组成。
- 使用sequence.pad_sequences固定输入长度
- 再通过np.array将数据转换为数组
- 构建模型,定义embedding,负责将数字编码成向量,将单词编码成128维的向量;双向LSTM每个方向的隐状态维度为64,两个方向拼接后输出128维
- 网络构建好后就是上数据训练了,用 4 个 epochs 和 batch_size 取 32 来训练这个网络
- 时间序列
- 在数据处理中,对不连续的数字或文本编号,使用LabelEncoder的fit_transform方法将风向数字化编码
- 在自定义series_to_supervised函数中,通过df.shift位移得到时间序列
- 利用concat函数(axis=1)将各位移后的序列按列方向横向拼接
- 然后通过dropna删除包含缺失值的行,从而生成一个不含缺失值的新数据框。
- 在模型构建中选择adam方法进行优化,选择平均绝对误差(MAE)作为损失函数
- 情感分析
- 情感是分类为积极与消极的二分类
- 设置数据中有20000个不同的单词,每句话由不超过100个单词组成。
- 使用sequence.pad_sequences固定输入长度
- 再通过np.array将数据转换为数组
- 构建模型,定义embedding,负责将数字编码成向量,将单词编码成128维的向量;双向LSTM每个方向的隐状态维度为64,两个方向拼接后输出128维
- 网络构建好后就是上数据训练了,用 4 个 epochs 和 batch_size 取 32 来训练这个网络
- 时间序列
- 在数据处理中,对不连续的数字或文本编号,使用LabelEncoder的fit_transform方法将风向数字化编码
- 在自定义series_to_supervised函数中,通过df.shift位移得到时间序列
- 利用concat函数(axis=1)将各位移后的序列按列方向横向拼接
- 然后通过dropna删除包含缺失值的行,从而生成一个不含缺失值的新数据框。
- 在模型构建中选择adam方法进行优化,选择平均绝对误差(MAE)作为损失函数