循环神经网络,被广泛应用在自然语言处理领域(NLP),本文就使用RNN的一个改进模型LSTM来做一个小案例,生成藏头古诗。
代码示例
1、训练词向量
from gensim.models import Word2Vec
class Corpus():
def __init__(self, file_path):
self.file_path = file_path
def __iter__(self):
with open(self.file_path) as file:
while True:
line = file.readline()
if not line:
break
# 将句子拆分成字
yield ' '.join(line).split(' ')
corpus = Corpus('./datas/poetry_7.txt')
model = Word2Vec(corpus, min_count=1, vector_size=100)
# 保存词向量相关数据
vectors = model.wv.vectors
key_to_index = model.wv.key_to_index
index_to_key = model.wv.index_to_key
2、构建数据集
import torch.utils.data as data
import numpy as np
class PoetryDataset(data.Dataset):
def __init__(self, file_path, vectors, key_to_index):
super().__init__()
self.vectors = vectors
self.key_to_index = key_to_index
# 读取文件
with open(file_path) as file:
self.lines = file.readlines()
def __len__(self):
return len(self.lines)
def __getitem__(self, index):
line = self.lines[index].strip()
x_idx = []
y_idx = []
# 错位,建立前后关系
for xs, ys in zip(line[:-1], line[1:]):
x_idx.append(self.key_to_index[xs])
y_idx.append(self.key_to_index[ys])
x = vectors[x_idx]
return x, np.array(y_idx)
dataset = PoetryDataset('./datas/poetry_7.txt', vectors, key_to_index)
loader = data.DataLoader(dataset, shuffle=True, batch_size=100)
3、定义模型
import torch.nn as nn
import torch
class Net(nn.Module):
def __init__(self, param):
super().__init__()
# 2层,双向LSTM
self.lstm = nn.LSTM(param['D_in'], param['D_hidden'], num_layers=2, bidirectional=True, batch_first=True)
self.dropout = nn.Dropout(0.3)
self.flatten = nn.Flatten(0, 1)
# 双向,输出隐层x2
self.linear = nn.Linear(2 * param['D_hidden'], param['D_out'])
self.loss_fn = nn.CrossEntropyLoss()
self.optimizer = torch.optim.RMSprop(self.parameters(), lr=param['lr'])
def forward(self, x, h_n, c_n):
x, (h_n, c_n) = self.lstm(x)
x = self.dropout(x)
# 展平后才能输入fc层
x = self.flatten(x)
out = self.linear(x)
return out, (h_n, c_n)
4、模型训练
计算量不是一般的大,GPU自然少不了。
device = "cuda" if torch.cuda.is_available() else "cpu"
print('device', device)
param = {
'D_in': 100,
'D_hidden': 128,
'D_out': len(index_to_key),
'lr': 1e-4,
}
net = Net(param).to(device)
# 定义初始参数
h_n = c_n = None
for e in range(1000):
for i, (x, y) in enumerate(loader):
# 数据迁移到GPU
x = x.to(device)
y = y.to(device)
# 训练模型
y_pred, (h_n, c_n) = net(x, h_n, c_n)
# 要注意和输出的维度保持一致
loss = net.loss_fn(y_pred, y.view(-1))
net.optimizer.zero_grad()
loss.backward()
net.optimizer.step()
if e % 50 ==0 and e % 50 == 0:
print(e, i, loss)
torch.save(net, f'./net_{e}.m')
torch.save(net, './net.m')
5、古诗生成
# 随机生成一首诗
word_idx = np.random.randint(len(key_to_index))
result = index_to_key[word_idx]
# 初始化输入参数
h_g = torch.zeros(4, 100, 128)
c_g = torch.zeros(4, 100, 128)
# 根据第一个字,生成后面的31个字
for i in range(31):
x_g = torch.tensor(vectors[word_idx][None][None]).to(device)
out, (h_g, c_g) = net(x_g, h_g, c_g)
word_idx = torch.argmax(out).item()
result += index_to_key[word_idx]
print(result)
# 藏头诗
word_list = ['独', '每', '遥', '遍']
points = [',', ';', ',', '。']
result = ''
for w,p in zip(word_list, points):
result += w
# 防止出现生僻字
try:
word_idx = key_to_index[w]
except KeyError:
word_idx = np.random.randint(len(key_to_index))
h_g = torch.zeros(4, 100, 128)
c_g = torch.zeros(4, 100, 128)
# 生成后面6个字
for i in range(6):
x_g = torch.tensor(vectors[word_idx][None][None]).to(device)
out, (h_g, c_g) = net(x_g, h_g, c_g)
word_idx = torch.argmax(out).item()
result += index_to_key[word_idx]
result += p
print(result)
很遗憾,不论是完整的,还是藏头的诗,效果都不是很好。原因可能是模型太简单,没有Attention机制,后面再优化。
本文的主要目的,还是熟悉LSTM结构,和序列生成的套路,生成句子的场景大多是自己玩玩,实际生产环境中用到的可能性不大。
本文为 陈华 原创,欢迎转载,但请注明出处:http://www.chenhuax.com/read/318