Seq2seq built with PyTorch prints only the same word

Asked 2 years ago, Updated 2 years ago, 146 views

I tried to build a text proofreading model using Seq2seq in PyTorch, but
the decoder outputs only the same word.

This is main.py.
Running it performs the training and so on.

#main.py
import copy
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import torch.optim as optim
import argparse
import sentencepiece as spm

from loader import Seq2seqDataset
from train import Operator

from models.EncoderDecoder import EncoderDecoder, Encoder, Decoder

from models.Decoder import Decoder as AttentionGRU
from models.GRU import GRU

sp=spm.SentencePieceProcessor()
sp.load("./data/index.model")

def make_model(vocab, maxlen, d_model=512):
    "Helper: Construct a model from hyperparameters."
    c=copy.deepcopy
    decoder_gru=AttentionGRU(d_model, vocab, d_model, maxlen)
    gru=GRU(d_model, vocab)
    model=EncoderDecoder(Encoder(gru), Decoder(decoder_gru, maxlen))
    return model.cuda()

def data_load(maxlen, source_size, batch_size):
    data_set=Seq2seqDataset(maxlen=maxlen)
    data_num=len(data_set)
    train_ratio=int(data_num*0.8)
    eval_ratio=int(data_num*0.1)
    test_ratio=int(data_num*0.1)
    res=int(data_num-(train_ratio+eval_ratio+test_ratio))
    train_ratio+=res
    ratio=[train_ratio, eval_ratio, test_ratio]
    train_dataset, val_dataset, test_dataset=torch.utils.data.random_split(data_set, ratio)
    dataloader=torch.utils.data.DataLoader(train_dataset,batch_size=batch_size,shuffle=True)
    val_dataloader=torch.utils.data.DataLoader(val_dataset,batch_size=batch_size,shuffle=True)
    del(train_dataset)
    del(data_set)
    return dataloader, val_dataloader

if __name__=="__main__":
    parser=argparse.ArgumentParser(description='Parse training parameters')
    parser.add_argument('--batch_size', type=int, default=128,
                        help='number of examples in a batch')
    parser.add_argument('--maxlen', type=int, default=20,
                        help='Sequences will be padded or truncated to this size.')    
    parser.add_argument('--epochs', type=int, default=10,
                        help='the number of epochs to train')   
    parser.add_argument('--hidden_size', type=int, default=512,
                        help='the number of hidden dim')                       
    args=parser.parse_args()             

    vocab=sp.get_piece_size()

    train_loader, val_loader=data_load(args.maxlen,vocab,args.batch_size)
    model=make_model(vocab, args.maxlen, args.hidden_size)
    criterion=nn.NLLLoss(ignore_index=0).cuda()
    optimizer=optim.Adam(model.parameters(), lr=0.0001)
    training_operator=Operator(model, optimizer, criterion)
    for epoch in range (args.epochs):
        training_operator.run_epoch(epoch, train_loader, val_loader)

This is train.py.
Training, validation, etc.

#train.py

from utils import to_np, trim_seqs
from torch.autograd import Variable
import torch
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
import sentencepiece as spm

sp=spm.SentencePieceProcessor()
sp.load("./data/index.model")

clip = 5.0

class Operator:
    def __init__(self, model, optimizer, criterion):
        self.model=model
        self.optimizer=optimizer
        self.criterion=criterion

    def run_epoch(self, epoch, train_loader, eval_loader):
        self.model.train()
        losses = [ ]
        sampled_idxs = [ ]
        for idx, data in enumerate (train_loader):
            self.optimizer.zero_grad()

            input_data=Variable(data[0].cuda())
            target=data[1].cuda()
            target_y=Variable(target[:,1:])
            target=Variable(target[:,:-1])
            
            out,_=self.model(input_data, target)

            loss=self.loss_compute(out, target_y, True)
            losses.append(to_np(loss))
        train_loss=sum(losses)/len(losses)
        eval_loss, bleuscore=self.evaluate(out, eval_loader)
        print("epochs:{}train_loss:{}eval_loss:{}val_bleuscore:{}.format(epoch+1,train_loss,eval_loss,bleuscore))

    def loss_compute(self, out, target, flag=False):
        loss=self.criterion(out.contiguous().view(-1, out.size(-1)), target.contiguous().view(-1))

        if flag:
            self.optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), clip)
            self.optimizer.step()
        return loss

    def evaluate(self, out, loader):
        self.model.eval()
        losses = [ ]
        all_output_seqs = [ ]
        all_target_seqs = [ ]
        for idx, data in enumerate (loader):
            with torch.no_grad():
                sampled_index=[]
                decoder_outputs=[ ]
                sampled_idxs = [ ]
                input_data=data[0].cuda()
                target=data[1].cuda()
                target_y = target [:,1:]
                _, hidden=self.model.encode(input_data)
                start_symbol=[[sp.PieceToId("<s>")] for i in range(input_data.size(0))]
                decoder_input=torch.tensor(start_symbol).cuda()
                for i in range (input_data.size(1)):
                    decoder_output, hidden=self.model.decode(decoder_input, hidden)
                    _,topi=torch.topk(decoder_output,1,dim=-1)
                    decoder_outputs.append(decoder_output)
                    sampled_idxs.append(topi)
                    decoder_input=topi.squeeze(1)
                sampled_idxs=torch.stack(sampled_idxs, dim=1)
                decoder_outputs=torch.stack(decoder_outputs, dim=1)
                sampled_idxs=sampled_idxs.squeeze()
                decoder_outputs=decoder_outputs.squeeze()
                loss=self.loss_compute(decoder_outputs, target_y)
                all_output_seqs.extend(trim_seqs(sampled_idxs))
                all_target_seqs.extend([list(seq[seq>0])] for seq in to_np(target))
                losses.append(to_np(loss))
        bleu_score=corpus_bleu(all_target_seqs, all_output_seqs, smoothing_function=SmoothingFunction().method1)
        mean_loss=sum(losses)/len(losses)
        self.generator(all_output_seqs, all_target_seqs, input_data.size(1))
        return mean_loss, bleu_score

    def generator(self, all_output_seqs, all_target_seqs, maxlen):
        with open("."/log/result.txt", "w", encoding="utf-8") asf:
            for presence in all_output_seqs:
                for token presence:
                    f.write(sp.IdToPiece(int(tok)))
                f.write("\n")

        with open("."/log/target.txt", "w", encoding="utf-8") asf:
            for presence in all_target_seqs:
                for token presence [0]:
                    f.write(sp.IdToPiece(int(tok)))
                f.write("\n")

This is loader.py.
It shapes the corpus from a CSV file and returns it.

#loader.py
import torch
import numpy as np
import csv
import sentencepiece as spm

class Seq2seqDataset(torch.utils.data.Dataset):
    def __init__(self, maxlen):
        self.sp=spm.SentencePieceProcessor()
        self.sp.load("./data/index.model")
        self.maxlen=maxlen
        

        with open('./data/parallel_data.csv', mode='r', newline='', encoding='utf-8') as f:
        # with open('./data/sample.csv', mode='r', newline='', encoding='utf-8') as f:
            csv_file=csv.reader(f)
            read_data = [row for row in csv_file ]
        self.data_num=len(read_data)-1
        jp_data=[]
        es_data=[ ]
        for i in range(1,self.data_num):    
            jp_data.append(read_data[i][1:2])
            es_data.append(read_data[i][2:3])


        self.en_data_idx=np.zeros((len(jp_data), maxlen))
        self.de_data_idx=np.zeros((len(es_data), maxlen+1))

        for i, sentence in enumerate(jp_data):
            for j, idx in enumerate (self.sp.EncodeAsIds(sentence[0])[0:]):
                self.en_data_idx[i][j] = idx
                if j == maxlen-1:
                    break
            if j<maxlen-1:
                self.en_data_idx[i][j:] = self.sp.PieceToId("<unk>")
            self.en_data_idx[i][-1] = self.sp.PieceToId("</s>")
        for i, sentence in enumerate(es_data):
            self.de_data_idx[i][0]=self.sp.PieceToId("<s>")
            for j, idx in enumerate (self.sp.EncodeAsIds(sentence[0])[0:]):
                self.de_data_idx[i][j+1]=idx
                if j+1 == maxlen:
                    break
            if j+1<maxlen:
                self.de_data_idx[i][j+1:]=self.sp.PieceToId("<unk>")
            self.de_data_idx[i][-1]=self.sp.PieceToId("</s>")
        
    def __len__(self):
        return self.data_num

    def __getitem__(self, idx):
        en_data=torch.tensor(self.en_data_idx[idx-1][:], dtype=torch.long)
        de_data=torch.tensor(self.de_data_idx[idx-1][:], dtype=torch.long)
        target=torch.zeros(self.maxlen+1, dtype=torch.long)
        for i, data in enumerate (de_data[:]):
            target[i] = data
        target[0] = self.sp.PieceToId("<s>")

        return en_data, target

This is GRU.py.
The encoder.

#GRU.py
import torch
import torch.nn as nn

class GRU(nn.Module):
    def __init__(self, hidden_size, output_size, num_layers=1):
        super(GRU,self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.output_size = output_size
        self.embedding=nn.Embedding(self.output_size, self.hidden_size, padding_idx=0)
        self.embedding.weight.data.normal_(0,1/self.hidden_size**0.5)
        self.embedding.weight.data[0,:] = 0.0
        self.gru_source=nn.GRU(hidden_size, hidden_size, num_layers=num_layers,
                                bidirectional=True, batch_first=True, dropout=0.2)

    def forward (self, sentence_words):
        self.gru_source.flatten_parameters()
        embedded=self.embedding(sentence_words)
        hx = self.init_hidden(sentence_words.size(0))
        encoder_output, hx = self.gru_source(embedded, hx)
        encoder_output=(encoder_output[:, :, :self.hidden_size] + encoder_output[:, :, self.hidden_size:])/2
        hx=(hx[0]+hx[1])/2 
        hx = hx.unsqueeze(0)
        return encoder_output, hx

    def init_hidden(self, bc):
        hx=torch.zeros(self.num_layers*2, bc, self.hidden_size)
        hx = hx.cuda()
        return hx

This is Decoder.py.
Decoder.

#Decoder.py
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F
import torchvision

class Decoder (nn.Module):
    def __init__(self, hidden_dim, vocab_size, embedding_dim, max_length):
        super(Decoder, self).__init__()
        self.hidden_dim = hidden_dim
        self.word_embeddings=nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.gru=nn.GRU(embedding_dim, hidden_dim, batch_first=True)
        self.hidden2linear=nn.Linear(hidden_dim, vocab_size)
        
    def forward (self, sequence, encoder_state):
        embedding = self.word_embeddings (sequence)
        embedding = F.relu(embedding)
        output, state=self.gru(embedding, encoder_state)
        output = self.hidden2linear(output)
        output=F.log_softmax(output, dim=-1)
        return output, state

This is EncoderDecoder.py.
The Encoder and Decoder are grouped together to make them easier to handle.

#EncoderDecoder.py
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data
import torch.nn.functional as F
from torch.autograd import Variable

class EncoderDecoder(nn.Module):
    def __init__(self, encoder, decoder):
        super(EncoderDecoder,self).__init__()
        self.encoder=encoder
        self.decoder=decoder

    def forward (self, data, target):
        out,hx =self.encoder(data)
        return self.decoder(target,hx)
    def decode(self, target, hx):
        return self.decoder(target, hx)
    def encode(self, input_data):
        return self.encoder(input_data)

class Encoder (nn.Module):
    def __init__(self, base_module):
        super(Encoder,self).__init__()
        self.base_module=base_module

    def forward (self, data):
        return self.base_module(data)

class Decoder (nn.Module):
    def __init__(self, base_module, maxlen):
        super(Decoder, self).__init__()
        self.base_module=base_module
        self.maxlen=maxlen

    def forward (self, data, hx):
        return self.base_module(data,hx)

When I train this model and generate sentences,
every sentence consists only of pad tokens.
The loss decreases without any problems, but the generated sentences are no good.

The following is sample output:

epochs:1 train_loss:6.804564086012185 eval_loss:6.84909054090885 val_bleuscore:0.00021425393895175578
epochs:2 train_loss:6.16480832815932 eval_loss:7.3809502720832825 val_bleuscore:5.9522479993669076e-05
epochs:3 train_loss:5.995282799291154 eval_loss:7.583597528934479 val_bleuscore:6.663744208921663e-05
epochs:4 train_loss:5.8298093747026245 eval_loss:7.464733970165253 val_bleuscore:0.00021450632944840199
epochs:5 train_loss:5.752497753777062 eval_loss:7.193604528903961 val_bleuscore:0.00013741750714360893

I would appreciate it if you could point out what is wrong.
Personally, I suspect the problem is that the Encoder and Decoder are handled as one integrated model, and that training passes the model around through self.
I can share all the files if needed.

python deep-learning natural-language-processing pytorch

2022-09-30 15:52

1 Answer

I'm trying something similar.

The following paper describes how to constrain a seq2seq model so that it does not output the same word repeatedly.

Sparse and Constrained Attention for Neural Machine Translation

A summary of the paper:
https://github.com/ymym3412/acl-papers/issues/218

With normal attention, every source word receives some weight at every time step t, which causes repetition during decoding. The paper therefore proposes constrained sparsemax, a combination of sparsemax and constrained softmax, which restricts attention to a sparse set of source words.
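For intuition, here is a minimal sketch of plain sparsemax (Martins & Astudillo, 2016), the building block behind the constrained sparsemax attention mentioned above. It is only an illustration of how a sparse projection zeroes out low-scoring words, not the paper's full constrained variant (which additionally caps the total attention each source word can receive over the whole output); the function name and the toy scores below are my own.

#sparsemax_sketch.py (illustrative only, not part of the question's code)
import torch

def sparsemax(scores, dim=-1):
    # Project scores onto the probability simplex.
    # Unlike softmax, low-scoring positions get exactly zero weight.
    z_sorted, _ = torch.sort(scores, dim=dim, descending=True)
    z_cumsum = z_sorted.cumsum(dim=dim)
    k = torch.arange(1, scores.size(dim) + 1, device=scores.device, dtype=scores.dtype)
    shape = [1] * scores.dim()
    shape[dim] = -1
    k = k.view(shape)
    # support = sorted positions that keep non-zero probability
    support = (1 + k * z_sorted) > z_cumsum
    k_z = support.to(scores.dtype).sum(dim=dim, keepdim=True)
    # threshold tau = (sum of kept scores - 1) / |support|
    tau = ((z_sorted * support.to(scores.dtype)).sum(dim=dim, keepdim=True) - 1) / k_z
    return torch.clamp(scores - tau, min=0)

# Toy attention scores over three source words:
# softmax would give roughly [0.66, 0.24, 0.10] (every word keeps some weight),
# while sparsemax gives [1.0, 0.0, 0.0] (the low-scoring words are cut to zero).
print(sparsemax(torch.tensor([2.0, 1.0, 0.1])))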


2022-09-30 15:52
