"""
word embedding
文本情感分类
数据下载地址:https://ai.stanford.edu/~amaas/data/sentiment/
思路分析:准备数据 构建模型 模型训练 模型评估
"""
from torch.utils.data import DataLoader, Dataset
from demo0421002 import ws,max_len
import os
import re
import torch
def tokenlize(content):
    """Input: a raw review string; output: a list of lowercase word tokens."""
    # '\\\\' matches a literal backslash and '\^' a literal caret once the list is joined into one regex.
    filters = ['!', '"', '#', '$', '%', '&', '\(', '\)', '\*', '\+', ',', '-', '\.', '/', ':', ';', '<', '=', '>',
               '\?', '@', '\[', '\\\\', '\]', '\^', '_', '`', '\{', '\|', '\}', '~', '\t', '\n', '\x97', '\x96',
               '”', '“']
    content = re.sub("<.*?>", " ", content)            # strip HTML tags such as <br />
    content = re.sub("|".join(filters), " ", content)  # replace punctuation/control characters with spaces
    tokens = [i.strip().lower() for i in content.split()]
    return tokens
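# A quick, made-up illustration of tokenlize (output shown as a comment, not executed):
#   tokenlize("I loved it!<br />Great acting.")
#   -> ['i', 'loved', 'it', 'great', 'acting']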
class ImdbDataset(Dataset):
    def __init__(self, train=True):
        self.train_data_path = r"D:\pythonProject\aclImdb\train"
        self.test_data_path = r"D:\pythonProject\aclImdb\test"
        data_path = self.train_data_path if train else self.test_data_path
        # Collect the paths of every review file under the pos/ and neg/ sub-directories.
        temp_data_path = [os.path.join(data_path, "pos"), os.path.join(data_path, "neg")]
        self.total_file_path = []
        for path in temp_data_path:
            file_name_list = os.listdir(path)
            file_path_list = [os.path.join(path, i) for i in file_name_list if i.endswith(".txt")]
            self.total_file_path.extend(file_path_list)

    def __getitem__(self, index):
        file_path = self.total_file_path[index]
        # The parent directory name ("pos" or "neg") is the label.
        label_str = file_path.split("\\")[-2]
        label = 0 if label_str == "neg" else 1
        content = open(file_path, errors='ignore').read()
        tokens = tokenlize(content)
        return tokens, label

    def __len__(self):
        return len(self.total_file_path)
def collate_fn(batch):
    """
    Custom collate function: the samples are variable-length token lists, so the
    default collate cannot stack them; each token list is first converted into a
    fixed-length index sequence with ws.transform.
    :param batch: [(tokens, label), (tokens, label), ...]
    :return: (content, label) LongTensors of shape [batch_size, max_len] and [batch_size]
    """
    content, label = list(zip(*batch))
    content = [ws.transform(i, max_len=max_len) for i in content]
    content = torch.LongTensor(content)
    label = torch.LongTensor(label)
    return content, label
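# A toy illustration of what zip(*batch) above does (made-up values):
#   batch = [(["good", "movie"], 1), (["bad", "plot"], 0)]
#   list(zip(*batch)) -> [(["good", "movie"], ["bad", "plot"]), (1, 0)]
# i.e. the token lists and the labels are separated before every token list is
# padded to max_len and everything is stacked into LongTensors.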
def get_dataloader(train=True):
    imdb_dataset = ImdbDataset(train)
    data_loader = DataLoader(imdb_dataset, batch_size=128, shuffle=True, collate_fn=collate_fn)
    return data_loader


if __name__ == "__main__":
    for idx, (input, target) in enumerate(get_dataloader()):
        print(idx)
        print(input.size())
        print(target.size())
        break
"""
实现的是:构建词典,实现方法把句子转化为数字序列和其反转
"""
class Word2Sequence:
    UNK_TAG = "UNK"
    PAD_TAG = "PAD"
    UNK = 0
    PAD = 1

    def __init__(self):
        self.dict = {
            self.UNK_TAG: self.UNK,
            self.PAD_TAG: self.PAD
        }
        self.inverse_dict = {}
        self.count = {}

    def fit(self, sentence):
        """Accumulate word counts for a single sentence.
        :param sentence: [word1, word2, word3, ...]
        """
        for word in sentence:
            self.count[word] = self.count.get(word, 0) + 1

    def build_vocab(self, min=5, max=None, max_feature=None):
        """
        Build the vocabulary from the accumulated counts.
        :param min: keep only words that appear more than `min` times
        :param max: keep only words that appear fewer than `max` times
        :param max_feature: keep at most this many words (most frequent first)
        :return:
        """
        if min is not None:
            self.count = {word: value for word, value in self.count.items() if value > min}
        if max is not None:
            self.count = {word: value for word, value in self.count.items() if value < max}
        if max_feature is not None:
            temp = sorted(self.count.items(), key=lambda x: x[-1], reverse=True)[:max_feature]
            self.count = dict(temp)
        for word in self.count:
            self.dict[word] = len(self.dict)
        self.inverse_dict = dict(zip(self.dict.values(), self.dict.keys()))

    def transform(self, sentence, max_len=None):
        """
        Turn a sentence into a sequence of indices.
        :param sentence: [word1, word2, word3, ...]
        :param max_len: int, pad or truncate the sentence to this length
        :return:
        """
        if max_len is not None:
            if max_len > len(sentence):
                sentence = sentence + [self.PAD_TAG] * (max_len - len(sentence))
            if max_len < len(sentence):
                sentence = sentence[:max_len]
        return [self.dict.get(word, self.UNK) for word in sentence]

    def inverse_transform(self, indices):
        """
        Turn a sequence of indices back into words.
        :param indices: [1, 2, 3, 4, ...]
        :return:
        """
        return [self.inverse_dict.get(idx) for idx in indices]

    def __len__(self):
        return len(self.dict)
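# A minimal usage sketch of Word2Sequence on two made-up sentences (not part of the
# IMDB pipeline); it walks through fit -> build_vocab -> transform -> inverse_transform.
if __name__ == "__main__":
    ws_demo = Word2Sequence()
    ws_demo.fit(["i", "love", "this", "movie"])
    ws_demo.fit(["i", "hate", "this", "movie"])
    ws_demo.build_vocab(min=0)                         # keep every word seen more than 0 times
    seq = ws_demo.transform(["i", "love", "cats"], max_len=5)
    print(seq)                                         # e.g. [2, 3, 0, 1, 1]: "cats" -> UNK, padding -> PAD
    print(ws_demo.inverse_transform(seq))              # ['i', 'love', 'UNK', 'PAD', 'PAD']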
# Vocabulary-building script: tokenize every training review, fit a Word2Sequence
# vocabulary on them, and pickle it to ./model/ws.pkl (which demo0421002 loads).
from demo0415002 import Word2Sequence
from demo0415001 import tokenlize
import os
import pickle
from tqdm import tqdm

if __name__ == "__main__":
    ws = Word2Sequence()
    path = r"D:\pythonProject\aclImdb\train"
    temp_data_path = [os.path.join(path, "pos"), os.path.join(path, "neg")]
    for data_path in temp_data_path:
        file_paths = [os.path.join(data_path, file_name) for file_name in os.listdir(data_path)
                      if file_name.endswith(".txt")]
        for file_path in tqdm(file_paths):
            sentence = tokenlize(open(file_path, errors='ignore').read())
            ws.fit(sentence)
    ws.build_vocab(min=10, max_feature=10000)
    pickle.dump(ws, open("./model/ws.pkl", "wb"))
    print(len(ws))
# demo0421002: load the pickled vocabulary built by the script above and fix the
# maximum sequence length used by the dataset and the model.
import pickle

ws = pickle.load(open("./model/ws.pkl", "rb"))
max_len = 20  # every review is padded or truncated to 20 tokens
"""
定义模型
"""
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
from demo0421002 import ws, max_len
from demo0415001 import get_dataloader
class MyModel(nn.Module):
    def __init__(self):
        super(MyModel, self).__init__()
        self.embedding = nn.Embedding(len(ws), 100)   # one 100-dim vector per vocabulary entry
        self.fc = nn.Linear(max_len * 100, 2)         # flattened sentence -> 2 classes

    def forward(self, input):
        """
        :param input: [batch_size, max_len]
        :return: log-probabilities, [batch_size, 2]
        """
        x = self.embedding(input)          # [batch_size, max_len, 100]
        x = x.view([-1, max_len * 100])    # flatten to [batch_size, max_len*100]
        out = self.fc(x)
        return F.log_softmax(out, dim=-1)
model = MyModel()
optimizer = Adam(model.parameters(), 0.001)
def train(epoch):
    for idx, (input, target) in enumerate(get_dataloader(train=True)):
        optimizer.zero_grad()
        output = model(input)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        print(loss.item())
if __name__ == "__main__":
for i in range(1):
train(i)
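# The approach at the top also lists model evaluation, which the code above stops short
# of; below is a minimal sketch (an assumption, not part of the original scripts) that
# scores the test split returned by get_dataloader(train=False).
import torch  # torch itself is needed for no_grad(); the script above only imports torch.nn / functional


def evaluate():
    model.eval()
    total_loss, total_correct, total_samples = 0.0, 0, 0
    with torch.no_grad():
        for input, target in get_dataloader(train=False):
            output = model(input)
            total_loss += F.nll_loss(output, target, reduction="sum").item()
            total_correct += (output.argmax(dim=-1) == target).sum().item()
            total_samples += target.size(0)
    print("test loss:", total_loss / total_samples, "test accuracy:", total_correct / total_samples)


if __name__ == "__main__":
    evaluate()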