I tried to build a text-proofreading model with Seq2seq in PyTorch, but the decoder only outputs the same word over and over.
This is main.py.
It runs the training and related setup.
#main.py
import copy
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import torch.optim as optim
import argparse
import sentencepiece as spm
from loader import Seq2seqDataset
from train import Operator
from models.EncoderDecoder import EncoderDecoder, Encoder, Decoder
from models.Decoder import Decoder as attentionGRU
from models.GRU import GRU

sp = spm.SentencePieceProcessor()
sp.load("./data/index.model")

def make_model(vocab, maxlen, d_model=512):
    "Helper: construct a model from hyperparameters."
    c = copy.deepcopy
    decoder_gru = attentionGRU(d_model, vocab, d_model, maxlen)
    gru = GRU(d_model, vocab)
    model = EncoderDecoder(Encoder(gru), Decoder(decoder_gru, maxlen))
    return model.cuda()

def data_load(maxlen, source_size, batch_size):
    data_set = Seq2seqDataset(maxlen=maxlen)
    data_num = len(data_set)
    train_ratio = int(data_num * 0.8)
    eval_ratio = int(data_num * 0.1)
    test_ratio = int(data_num * 0.1)
    res = int(data_num - (train_ratio + eval_ratio + test_ratio))
    train_ratio += res
    ratio = [train_ratio, eval_ratio, test_ratio]
    train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(data_set, ratio)
    dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=True)
    del train_dataset
    del data_set
    return dataloader, val_dataloader

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Parse training parameters')
    parser.add_argument('--batch_size', type=int, default=128,
                        help='number of examples in a batch')
    parser.add_argument('--maxlen', type=int, default=20,
                        help='sequences will be padded or truncated to this size')
    parser.add_argument('--epochs', type=int, default=10,
                        help='number of epochs to train')
    parser.add_argument('--hidden_size', type=int, default=512,
                        help='number of hidden dimensions')
    args = parser.parse_args()

    vocab = sp.get_piece_size()
    train_loader, val_loader = data_load(args.maxlen, vocab, args.batch_size)
    model = make_model(vocab, args.maxlen, args.hidden_size)
    criterion = nn.NLLLoss(ignore_index=0).cuda()
    optimizer = optim.Adam(model.parameters(), lr=0.0001)
    training_operator = Operator(model, optimizer, criterion)
    for epoch in range(args.epochs):
        training_operator.run_epoch(epoch, train_loader, val_loader)
This is train.py.
It handles training, validation, and so on.
#train.py
from utils import to_np, trim_seqs
from torch.autograd import Variable
import torch
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
import sentencepiece as spm

sp = spm.SentencePieceProcessor()
sp.load("./data/index.model")
clip = 5.0

class Operator:
    def __init__(self, model, optimizer, criterion):
        self.model = model
        self.optimizer = optimizer
        self.criterion = criterion

    def run_epoch(self, epoch, train_loader, eval_loader):
        self.model.train()
        losses = []
        for idx, data in enumerate(train_loader):
            self.optimizer.zero_grad()
            input_data = Variable(data[0].cuda())
            target = data[1].cuda()
            target_y = Variable(target[:, 1:])
            target = Variable(target[:, :-1])
            out, _ = self.model(input_data, target)
            loss = self.loss_compute(out, target_y, True)
            losses.append(to_np(loss))
        train_loss = sum(losses) / len(losses)
        eval_loss, bleuscore = self.evaluate(out, eval_loader)
        print("epochs:{} train_loss:{} eval_loss:{} val_bleuscore:{}".format(
            epoch + 1, train_loss, eval_loss, bleuscore))

    def loss_compute(self, out, target, flag=False):
        loss = self.criterion(out.contiguous().view(-1, out.size(-1)),
                              target.contiguous().view(-1))
        if flag:
            self.optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm(self.model.parameters(), clip)
            self.optimizer.step()
        return loss

    def evaluate(self, out, loader):
        self.model.eval()
        losses = []
        all_output_seqs = []
        all_target_seqs = []
        for idx, data in enumerate(loader):
            with torch.no_grad():
                decoder_outputs = []
                sampled_idxs = []
                input_data = data[0].cuda()
                target = data[1].cuda()
                target_y = target[:, 1:]
                _, hidden = self.model.encode(input_data)
                start_symbol = [[sp.PieceToId("<s>")] for i in range(input_data.size(0))]
                decoder_input = torch.tensor(start_symbol).cuda()
                for i in range(input_data.size(1)):
                    decoder_output, hidden = self.model.decode(decoder_input, hidden)
                    _, topi = torch.topk(decoder_output, 1, dim=-1)
                    decoder_outputs.append(decoder_output)
                    sampled_idxs.append(topi)
                    decoder_input = topi.squeeze(1)
                sampled_idxs = torch.stack(sampled_idxs, dim=1)
                decoder_outputs = torch.stack(decoder_outputs, dim=1)
                sampled_idxs = sampled_idxs.squeeze()
                decoder_outputs = decoder_outputs.squeeze()
                loss = self.loss_compute(decoder_outputs, target_y)
                all_output_seqs.extend(trim_seqs(sampled_idxs))
                all_target_seqs.extend([list(seq[seq > 0])] for seq in to_np(target))
                losses.append(to_np(loss))
        bleu_score = corpus_bleu(all_target_seqs, all_output_seqs,
                                 smoothing_function=SmoothingFunction().method1)
        mean_loss = sum(losses) / len(losses)
        self.generator(all_output_seqs, all_target_seqs, input_data.size(1))
        return mean_loss, bleu_score

    def generator(self, all_output_seqs, all_target_seqs, maxlen):
        with open("./log/result.txt", "w", encoding="utf-8") as f:
            for sentence in all_output_seqs:
                for tok in sentence:
                    f.write(sp.IdToPiece(int(tok)))
                f.write("\n")
        with open("./log/target.txt", "w", encoding="utf-8") as f:
            for sentence in all_target_seqs:
                for tok in sentence[0]:
                    f.write(sp.IdToPiece(int(tok)))
                f.write("\n")
This is loader.py.
It shapes the corpus stored in a CSV file and returns it.
#loader.py
import torch
import numpy as np
import csv
import sentencepiece as spm

class Seq2seqDataset(torch.utils.data.Dataset):
    def __init__(self, maxlen):
        self.sp = spm.SentencePieceProcessor()
        self.sp.load("./data/index.model")
        self.maxlen = maxlen

        # with open('./data/sample.csv', mode='r', newline='', encoding='utf-8') as f:
        with open('./data/parallel_data.csv', mode='r', newline='', encoding='utf-8') as f:
            csv_file = csv.reader(f)
            read_data = [row for row in csv_file]
        self.data_num = len(read_data) - 1

        jp_data = []
        es_data = []
        for i in range(1, self.data_num):
            jp_data.append(read_data[i][1:2])
            es_data.append(read_data[i][2:3])

        self.en_data_idx = np.zeros((len(jp_data), maxlen))
        self.de_data_idx = np.zeros((len(es_data), maxlen + 1))
        for i, sentence in enumerate(jp_data):
            for j, idx in enumerate(self.sp.EncodeAsIds(sentence[0])[0:]):
                self.en_data_idx[i][j] = idx
                if j == maxlen - 1:
                    break
            if j < maxlen - 1:
                self.en_data_idx[i][j:] = self.sp.PieceToId("<unk>")
                self.en_data_idx[i][-1] = self.sp.PieceToId("</s>")
        for i, sentence in enumerate(es_data):
            self.de_data_idx[i][0] = self.sp.PieceToId("<s>")
            for j, idx in enumerate(self.sp.EncodeAsIds(sentence[0])[0:]):
                self.de_data_idx[i][j + 1] = idx
                if j + 1 == maxlen:
                    break
            if j + 1 < maxlen:
                self.de_data_idx[i][j + 1:] = self.sp.PieceToId("<unk>")
                self.de_data_idx[i][-1] = self.sp.PieceToId("</s>")

    def __len__(self):
        return self.data_num

    def __getitem__(self, idx):
        en_data = torch.tensor(self.en_data_idx[idx - 1][:], dtype=torch.long)
        de_data = torch.tensor(self.de_data_idx[idx - 1][:], dtype=torch.long)
        target = torch.zeros(self.maxlen + 1, dtype=torch.long)
        for i, data in enumerate(de_data[:]):
            target[i] = data
        target[0] = self.sp.PieceToId("<s>")
        return en_data, target
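For reference, here is a quick way to sanity-check the shapes the dataset returns (a throwaway sketch, not part of the project; the expected sizes simply follow from maxlen and maxlen + 1 above):

#shape_check.py -- throwaway sketch to confirm the tensor shapes from loader.py
import torch
from loader import Seq2seqDataset

ds = Seq2seqDataset(maxlen=20)
en, target = ds[0]
print(en.shape, target.shape)            # expected: torch.Size([20]) torch.Size([21])

loader = torch.utils.data.DataLoader(ds, batch_size=4, shuffle=True)
batch = next(iter(loader))
print(batch[0].shape, batch[1].shape)    # expected: torch.Size([4, 20]) torch.Size([4, 21])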
This is GRU.py.
The encoder.
#GRU.py
import torch
import torch.nn as nn

class GRU(nn.Module):
    def __init__(self, hidden_size, output_size, num_layers=1):
        super(GRU, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.output_size = output_size
        self.embedding = nn.Embedding(self.output_size, self.hidden_size, padding_idx=0)
        self.embedding.weight.data.normal_(0, 1 / self.hidden_size ** 0.5)
        self.embedding.weight.data[0, :] = 0.0
        self.gru_source = nn.GRU(hidden_size, hidden_size, num_layers=num_layers,
                                 bidirectional=True, batch_first=True, dropout=0.2)

    def forward(self, sentence_words):
        self.gru_source.flatten_parameters()
        embedded = self.embedding(sentence_words)
        hx = self.init_hidden(sentence_words.size(0))
        encoder_output, hx = self.gru_source(embedded, hx)
        # average the forward and backward directions
        encoder_output = (encoder_output[:, :, :self.hidden_size] + encoder_output[:, :, self.hidden_size:]) / 2
        hx = (hx[0] + hx[1]) / 2
        hx = hx.unsqueeze(0)
        return encoder_output, hx

    def init_hidden(self, bc):
        hx = torch.zeros(self.num_layers * 2, bc, self.hidden_size)
        hx = hx.cuda()
        return hx
This is Decoder.py.
The decoder.
#Decoder.py
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F
import torchvision

class Decoder(nn.Module):
    def __init__(self, hidden_dim, vocab_size, embedding_dim, max_length):
        super(Decoder, self).__init__()
        self.hidden_dim = hidden_dim
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.gru = nn.GRU(embedding_dim, hidden_dim, batch_first=True)
        self.hidden2linear = nn.Linear(hidden_dim, vocab_size)

    def forward(self, sequence, encoder_state):
        embedding = self.word_embeddings(sequence)
        embedding = F.relu(embedding)
        output, state = self.gru(embedding, encoder_state)
        output = self.hidden2linear(output)
        output = F.log_softmax(output, dim=-1)
        return output, state
This is EncoderDecoder.py.
The Encoder and Decoder are wrapped together to make them easier to handle.
#EncoderDecoder.py
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data
import torch.nn.functional as F
from torch.autograd import Variable

class EncoderDecoder(nn.Module):
    def __init__(self, encoder, decoder):
        super(EncoderDecoder, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, data, target):
        out, hx = self.encoder(data)
        return self.decoder(target, hx)

    def decode(self, target, hx):
        return self.decoder(target, hx)

    def encode(self, input_data):
        return self.encoder(input_data)

class Encoder(nn.Module):
    def __init__(self, base_module):
        super(Encoder, self).__init__()
        self.base_module = base_module

    def forward(self, data):
        return self.base_module(data)

class Decoder(nn.Module):
    def __init__(self, base_module, maxlen):
        super(Decoder, self).__init__()
        self.base_module = base_module
        self.maxlen = maxlen

    def forward(self, data, hx):
        return self.base_module(data, hx)
When I train this and generate sentences, every generated sentence consists only of pad tokens.
The loss decreases without any problem, but the generated sentences are unusable.
Here is some sample output:
epochs:1 train_loss:6.804564086012185 eval_loss:6.84909054090885 val_bleuscore:0.00021425393895175578
epochs:2 train_loss:6.16480832815932 eval_loss:7.3809502720832825 val_bleuscore:5.9522479993669076e-05
epochs:3 train_loss:5.995282799291154 eval_loss:7.583597528934479 val_bleuscore:6.663744208921663e-05
epochs:4 train_loss:5.8298093747026245 eval_loss:7.464733970165253 val_bleuscore:0.00021450632944840199
epochs:5 train_loss:5.752497753777062 eval_loss:7.193604528903961 val_bleuscore:0.00013741750714360893
I would appreciate it if you could point out the problem.
Personally, I suspect the issue is that the Encoder and Decoder are handled as one integrated module, and that training goes through that wrapper via self.
I can share all the other files if needed.
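For reference, below is a rough sketch (not part of the training code) of how I can confirm that the decoder really collapses onto the pad id; count_sampled_ids is just a hypothetical helper name, and it assumes the pad id is 0 because of padding_idx=0 / ignore_index=0 above.

#check_outputs.py -- rough diagnostic sketch, assumes the model from main.py and a DataLoader from loader.py
import torch

def count_sampled_ids(model, loader, sp, maxlen):
    model.eval()
    counts = torch.zeros(sp.get_piece_size(), dtype=torch.long).cuda()
    with torch.no_grad():
        for data in loader:
            input_data = data[0].cuda()
            _, hidden = model.encode(input_data)
            start_symbol = [[sp.PieceToId("<s>")] for _ in range(input_data.size(0))]
            decoder_input = torch.tensor(start_symbol).cuda()
            for _ in range(maxlen):
                out, hidden = model.decode(decoder_input, hidden)
                _, topi = torch.topk(out, 1, dim=-1)   # greedy pick, as in evaluate()
                counts += torch.bincount(topi.view(-1), minlength=sp.get_piece_size())
                decoder_input = topi.squeeze(1)
    # If almost all of the mass sits on id 0, the decoder has indeed collapsed to pad.
    top = torch.topk(counts, 5)
    print("most frequent ids:", top.indices.tolist(), "counts:", top.values.tolist())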
I'm trying something similar.
The following paper describes how to constrain a seq2seq model so that it does not keep outputting the same word.
Sparse and Constrained Attention for Neural Machine Translation
A summary of the paper:
https://github.com/ymym3412/acl-papers/issues/218
With ordinary attention, every word receives a small weight at every time step t, which causes repetition during decoding. The paper therefore proposes constrained sparsemax, which combines sparsemax with constrained softmax so that attention is restricted to a small, sparse set of words.
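As a rough illustration of the sparsity part only (plain sparsemax, not the constrained variant the paper actually proposes), attention weights can be computed so that most words get exactly zero weight; this is just a sketch of the projection step:

#sparsemax_sketch.py -- minimal sketch of plain sparsemax (Martins & Astudillo, 2016)
import torch

def sparsemax(z, dim=-1):
    # Euclidean projection of the scores z onto the probability simplex.
    # Unlike softmax, low-scoring entries get exactly zero probability.
    dim = dim if dim >= 0 else z.dim() + dim
    sorted_z, _ = torch.sort(z, descending=True, dim=dim)
    cumsum = sorted_z.cumsum(dim)
    k = torch.arange(1, z.size(dim) + 1, device=z.device, dtype=z.dtype)
    shape = [1] * z.dim()
    shape[dim] = -1
    k = k.view(shape)
    support = (1 + k * sorted_z) > cumsum                  # which sorted entries stay non-zero
    k_z = support.sum(dim=dim, keepdim=True).to(z.dtype)   # size of the support
    tau = (cumsum.gather(dim, k_z.long() - 1) - 1) / k_z   # threshold
    return torch.clamp(z - tau, min=0.0)

# e.g. sparsemax(torch.tensor([2.0, 1.0, 0.1])) gives tensor([1., 0., 0.]),
# whereas softmax would still give the last word a small non-zero weight.

The constrained variant in the paper additionally puts an upper bound (a fertility budget) on how much total attention each source word can receive across time steps.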