Task04: Machine Translation and Related Techniques; Attention Mechanism and Seq2seq Models; Transformer
Machine Translation
Machine translation (MT) automatically translates a piece of text from one language into another; solving this problem with neural networks is usually called neural machine translation (NMT). Its main characteristics: the output is a sequence of words rather than a single word, and the length of the output sequence may differ from that of the source sequence.
```python
import sys
sys.path.append('/home/kesci/input/d2l9528/')
import collections
import math            # used by the attention and Transformer sections below
import time
import zipfile

import numpy as np     # used by the multi-head attention and positional encoding code
import d2l
from d2l.data.base import Vocab
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils import data
from torch import optim

# Read the raw English-French corpus.
with open('/home/kesci/input/fraeng6506/fra.txt', 'r') as f:
    raw_text = f.read()

def preprocess_raw(text):
    # Replace non-breaking spaces, lowercase, and insert a space before punctuation.
    text = text.replace('\u202f', ' ').replace('\xa0', ' ')
    out = ''
    for i, char in enumerate(text.lower()):
        if char in (',', '!', '.') and i > 0 and text[i-1] != ' ':
            out += ' '
        out += char
    return out

text = preprocess_raw(raw_text)

# Split each line into a source (English) and a target (French) token list.
num_examples = 50000
source, target = [], []
for i, line in enumerate(text.split('\n')):
    if i > num_examples:
        break
    parts = line.split('\t')
    if len(parts) >= 2:
        source.append(parts[0].split(' '))
        target.append(parts[1].split(' '))

def build_vocab(tokens):
    tokens = [token for line in tokens for token in line]
    return d2l.data.base.Vocab(tokens, min_freq=3, use_special_tokens=True)

src_vocab = build_vocab(source)

def pad(line, max_len, padding_token):
    if len(line) > max_len:
        return line[:max_len]
    return line + [padding_token] * (max_len - len(line))

def build_array(lines, vocab, max_len, is_source):
    lines = [vocab[line] for line in lines]
    if not is_source:
        lines = [[vocab.bos] + line + [vocab.eos] for line in lines]
    array = torch.tensor([pad(line, max_len, vocab.pad) for line in lines])
    valid_len = (array != vocab.pad).sum(1)  # count non-pad tokens along dim 1
    return array, valid_len

def load_data_nmt(batch_size, max_len):
    # This function is saved in d2l.
    src_vocab, tgt_vocab = build_vocab(source), build_vocab(target)
    src_array, src_valid_len = build_array(source, src_vocab, max_len, True)
    tgt_array, tgt_valid_len = build_array(target, tgt_vocab, max_len, False)
    train_data = data.TensorDataset(src_array, src_valid_len, tgt_array, tgt_valid_len)
    train_iter = data.DataLoader(train_data, batch_size, shuffle=True)
    return src_vocab, tgt_vocab, train_iter

src_vocab, tgt_vocab, train_iter = load_data_nmt(batch_size=2, max_len=8)

# Generic encoder-decoder interfaces.
class Encoder(nn.Module):
    def __init__(self, **kwargs):
        super(Encoder, self).__init__(**kwargs)

    def forward(self, X, *args):
        raise NotImplementedError

class Decoder(nn.Module):
    def __init__(self, **kwargs):
        super(Decoder, self).__init__(**kwargs)

    def init_state(self, enc_outputs, *args):
        raise NotImplementedError

    def forward(self, X, state):
        raise NotImplementedError

class EncoderDecoder(nn.Module):
    def __init__(self, encoder, decoder, **kwargs):
        super(EncoderDecoder, self).__init__(**kwargs)
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, enc_X, dec_X, *args):
        enc_outputs = self.encoder(enc_X, *args)
        dec_state = self.decoder.init_state(enc_outputs, *args)
        return self.decoder(dec_X, dec_state)

class Seq2SeqEncoder(d2l.Encoder):
    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers,
                 dropout=0, **kwargs):
        super(Seq2SeqEncoder, self).__init__(**kwargs)
        self.num_hiddens = num_hiddens
        self.num_layers = num_layers
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.LSTM(embed_size, num_hiddens, num_layers, dropout=dropout)

    def begin_state(self, batch_size, device):
        return [torch.zeros(size=(self.num_layers, batch_size, self.num_hiddens), device=device),
                torch.zeros(size=(self.num_layers, batch_size, self.num_hiddens), device=device)]

    def forward(self, X, *args):
        X = self.embedding(X)   # X shape: (batch_size, seq_len, embed_size)
        X = X.transpose(0, 1)   # the RNN needs the first axis to be time
        # state = self.begin_state(X.shape[1], device=X.device)
        out, state = self.rnn(X)
        # The shape of out is (seq_len, batch_size, num_hiddens).
        # state contains the hidden state and the memory cell of the last
        # time step, each with shape (num_layers, batch_size, num_hiddens).
        return out, state

class Seq2SeqDecoder(d2l.Decoder):
    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers,
                 dropout=0, **kwargs):
        super(Seq2SeqDecoder, self).__init__(**kwargs)
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.LSTM(embed_size, num_hiddens, num_layers, dropout=dropout)
        self.dense = nn.Linear(num_hiddens, vocab_size)

    def init_state(self, enc_outputs, *args):
        return enc_outputs[1]

    def forward(self, X, state):
        X = self.embedding(X).transpose(0, 1)
        out, state = self.rnn(X, state)
        # Make batch the first dimension to simplify the loss computation.
        out = self.dense(out).transpose(0, 1)
        return out, state

def SequenceMask(X, X_len, value=0):
    maxlen = X.size(1)
    mask = torch.arange(maxlen)[None, :].to(X_len.device) < X_len[:, None]
    X[~mask] = value
    return X

class MaskedSoftmaxCELoss(nn.CrossEntropyLoss):
    # pred shape: (batch_size, seq_len, vocab_size)
    # label shape: (batch_size, seq_len)
    # valid_length shape: (batch_size, )
    def forward(self, pred, label, valid_length):
        # the sample weights shape should be (batch_size, seq_len)
        weights = torch.ones_like(label)
        weights = SequenceMask(weights, valid_length).float()
        self.reduction = 'none'
        output = super(MaskedSoftmaxCELoss, self).forward(pred.transpose(1, 2), label)
        return (output * weights).mean(dim=1)

def train_ch7(model, data_iter, lr, num_epochs, device):  # Saved in d2l
    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    loss = MaskedSoftmaxCELoss()
    tic = time.time()
    for epoch in range(1, num_epochs + 1):
        l_sum, num_tokens_sum = 0.0, 0.0
        for batch in data_iter:
            optimizer.zero_grad()
            X, X_vlen, Y, Y_vlen = [x.to(device) for x in batch]
            Y_input, Y_label, Y_vlen = Y[:, :-1], Y[:, 1:], Y_vlen - 1
            Y_hat, _ = model(X, Y_input, X_vlen, Y_vlen)
            l = loss(Y_hat, Y_label, Y_vlen).sum()
            l.backward()
            with torch.no_grad():
                d2l.grad_clipping_nn(model, 5, device)
            num_tokens = Y_vlen.sum().item()
            optimizer.step()
            l_sum += l.sum().item()
            num_tokens_sum += num_tokens
        if epoch % 50 == 0:
            print("epoch {0:4d}, loss {1:.3f}, time {2:.1f} sec".format(
                epoch, (l_sum / num_tokens_sum), time.time() - tic))
            tic = time.time()

embed_size, num_hiddens, num_layers, dropout = 32, 32, 2, 0.0
batch_size, num_examples, max_len = 64, 1e3, 10
lr, num_epochs, ctx = 0.005, 300, d2l.try_gpu()
src_vocab, tgt_vocab, train_iter = d2l.load_data_nmt(batch_size, max_len, num_examples)
encoder = Seq2SeqEncoder(len(src_vocab), embed_size, num_hiddens, num_layers, dropout)
decoder = Seq2SeqDecoder(len(tgt_vocab), embed_size, num_hiddens, num_layers, dropout)
model = d2l.EncoderDecoder(encoder, decoder)
train_ch7(model, train_iter, lr, num_epochs, ctx)

def translate_ch7(model, src_sentence, src_vocab, tgt_vocab, max_len, device):
    src_tokens = src_vocab[src_sentence.lower().split(' ')]
    src_len = len(src_tokens)
    if src_len < max_len:
        src_tokens += [src_vocab.pad] * (max_len - src_len)
    enc_X = torch.tensor(src_tokens, device=device)
    enc_valid_length = torch.tensor([src_len], device=device)
    # Add the batch dimension with unsqueeze.
    enc_outputs = model.encoder(enc_X.unsqueeze(dim=0), enc_valid_length)
    dec_state = model.decoder.init_state(enc_outputs, enc_valid_length)
    dec_X = torch.tensor([tgt_vocab.bos], device=device).unsqueeze(dim=0)
    predict_tokens = []
    for _ in range(max_len):
        Y, dec_state = model.decoder(dec_X, dec_state)
        # The token with the highest score is used as the next time step input.
        dec_X = Y.argmax(dim=2)
        py = dec_X.squeeze(dim=0).int().item()
        if py == tgt_vocab.eos:
            break
        predict_tokens.append(py)
    return ' '.join(tgt_vocab.to_tokens(predict_tokens))
```
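After training, the greedy translator can be tried on a few short sentences (as in the course notebook; the exact outputs depend on the training run):

```python
# Greedy decoding on a few sample sentences; outputs vary between runs.
for sentence in ['Go .', 'Wow !', "I'm OK .", 'I won !']:
    print(sentence + ' => ' + translate_ch7(
        model, sentence, src_vocab, tgt_vocab, max_len, ctx))
```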
Attention Mechanism and Seq2seq Models
In the "encoder-decoder (seq2seq)" section, the decoder relies on the same context vector at every time step to obtain information about the input sequence. When the encoder is a recurrent neural network, the context vector comes from the hidden state of its final time step: the source sequence is encoded into the state of the recurrent unit and then passed to the decoder to generate the target sequence. This structure has problems. In particular, RNNs suffer from long-range vanishing gradients in practice, and for longer sentences it is hard to expect a fixed-length vector to preserve all of the useful information in the input, so the effectiveness of this architecture drops significantly as the sentences to be translated get longer.
At the same time, a target word being decoded may be related to only part of the original input rather than to all of it. For example, when translating "Hello world" into "Bonjour le monde", "Hello" maps to "Bonjour" and "world" maps to "monde". In the seq2seq model, the decoder can only select the relevant information implicitly from the encoder's final state, whereas the attention mechanism models this selection process explicitly.
Attention is a general weighted-pooling method. Its input consists of two parts: a query and key-value pairs. For a given query, the attention layer computes an attention score against every key and normalizes the scores into weights; the output vector is the weighted sum of the values, with each key's weight paired with its corresponding value.
Two commonly used attention layers are dot-product attention and multilayer perceptron (MLP) attention.
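As a compact summary (notation introduced here, not in the original notes): given a query $\mathbf{q}$ and $n$ key-value pairs $(\mathbf{k}_1, \mathbf{v}_1), \dots, (\mathbf{k}_n, \mathbf{v}_n)$, the attention layer computes

$$\alpha_i = \frac{\exp\big(a(\mathbf{q}, \mathbf{k}_i)\big)}{\sum_{j=1}^{n}\exp\big(a(\mathbf{q}, \mathbf{k}_j)\big)}, \qquad \mathbf{o} = \sum_{i=1}^{n} \alpha_i \, \mathbf{v}_i,$$

where the scoring function is $a(\mathbf{q}, \mathbf{k}) = \mathbf{q}^\top \mathbf{k} / \sqrt{d}$ for scaled dot-product attention, and $a(\mathbf{q}, \mathbf{k}) = \mathbf{v}^\top \tanh(\mathbf{W}_q \mathbf{q} + \mathbf{W}_k \mathbf{k})$ for the standard MLP (additive) attention. The `MLPAttention` implementation below follows the course notebook and omits the $\tanh$.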
```python
import math

class DotProductAttention(nn.Module):
    def __init__(self, dropout, **kwargs):
        super(DotProductAttention, self).__init__(**kwargs)
        self.dropout = nn.Dropout(dropout)

    # query: (batch_size, #queries, d)
    # key: (batch_size, #kv_pairs, d)
    # value: (batch_size, #kv_pairs, dim_v)
    # valid_length: either (batch_size, ) or (batch_size, #queries)
    def forward(self, query, key, value, valid_length=None):
        d = query.shape[-1]
        # Swap the last two dimensions of key so that bmm computes q·k scores.
        scores = torch.bmm(query, key.transpose(1, 2)) / math.sqrt(d)
        attention_weights = self.dropout(masked_softmax(scores, valid_length))
        print("attention_weight\n", attention_weights)
        return torch.bmm(attention_weights, value)

class MLPAttention(nn.Module):
    def __init__(self, units, ipt_dim, dropout, **kwargs):
        super(MLPAttention, self).__init__(**kwargs)
        # nn.Linear acts on the last dimension, so query and key keep their 3-D shapes.
        self.W_k = nn.Linear(ipt_dim, units, bias=False)
        self.W_q = nn.Linear(ipt_dim, units, bias=False)
        self.v = nn.Linear(units, 1, bias=False)
        self.dropout = nn.Dropout(dropout)

    def forward(self, query, key, value, valid_length):
        query, key = self.W_q(query), self.W_k(key)
        # Expand query to (batch_size, #queries, 1, units) and key to
        # (batch_size, 1, #kv_pairs, units), then add them with broadcasting.
        features = query.unsqueeze(2) + key.unsqueeze(1)
        scores = self.v(features).squeeze(-1)
        attention_weights = self.dropout(masked_softmax(scores, valid_length))
        return torch.bmm(attention_weights, value)
```
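Both layers above call a `masked_softmax` helper (not shown in the snippet; in the course it is provided by d2l) that zeroes out the weights of padded positions. A minimal sketch of such a helper, assuming the shape conventions stated in the comments above, could look like this:

```python
import torch
import torch.nn.functional as F

def masked_softmax(X, valid_length):
    """Softmax over the last axis, ignoring positions beyond valid_length.

    X: (batch_size, #queries, #kv_pairs)
    valid_length: None, (batch_size,) or (batch_size, #queries)
    """
    if valid_length is None:
        return F.softmax(X, dim=-1)
    shape = X.shape
    if valid_length.dim() == 1:
        # One valid length per batch element, shared by all of its queries.
        valid_length = torch.repeat_interleave(valid_length, repeats=shape[1], dim=0)
    else:
        valid_length = valid_length.reshape(-1)
    # Fill padded positions with a large negative value so softmax gives them ~0 weight.
    X = X.reshape(-1, shape[-1])
    mask = torch.arange(shape[-1], device=X.device)[None, :] < valid_length[:, None]
    X = X.masked_fill(~mask, -1e6)
    return F.softmax(X, dim=-1).reshape(shape)
```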
Seq2seq Model with Attention Mechanism
In a seq2seq model with attention, the decoder's hidden state at each time step serves as the query, while the encoder's outputs at all time steps serve as the keys and values, so every decoding step gets its own context vector instead of reusing a single fixed one. A sketch of such a decoder follows.
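The sketch below is not code from the original notes: it reuses the `MLPAttention` layer defined above together with the `Seq2SeqEncoder` from the machine translation section, and the class name `Seq2SeqAttentionDecoder` and all sizes are illustrative.

```python
# Sketch of an attention-based decoder (illustrative, following the d2l design).
class Seq2SeqAttentionDecoder(d2l.Decoder):
    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers,
                 dropout=0, **kwargs):
        super(Seq2SeqAttentionDecoder, self).__init__(**kwargs)
        self.attention = MLPAttention(num_hiddens, num_hiddens, dropout)
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.LSTM(embed_size + num_hiddens, num_hiddens, num_layers,
                           dropout=dropout)
        self.dense = nn.Linear(num_hiddens, vocab_size)

    def init_state(self, enc_outputs, enc_valid_len, *args):
        outputs, hidden_state = enc_outputs
        # Keep encoder outputs as (batch_size, seq_len, num_hiddens) so they
        # can serve as attention keys and values.
        return (outputs.permute(1, 0, 2), hidden_state, enc_valid_len)

    def forward(self, X, state):
        enc_outputs, hidden_state, enc_valid_len = state
        X = self.embedding(X).transpose(0, 1)  # (seq_len, batch_size, embed_size)
        outputs = []
        for x in X:
            # The top layer's hidden state at the current step is the query.
            query = hidden_state[0][-1].unsqueeze(1)  # (batch_size, 1, num_hiddens)
            context = self.attention(query, enc_outputs, enc_outputs, enc_valid_len)
            # Concatenate the context vector with the current input embedding.
            x = torch.cat((context, x.unsqueeze(1)), dim=-1)
            out, hidden_state = self.rnn(x.transpose(0, 1), hidden_state)
            outputs.append(out)
        outputs = self.dense(torch.cat(outputs, dim=0))
        return outputs.transpose(0, 1), (enc_outputs, hidden_state, enc_valid_len)
```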
Transformer
CNNs are easy to parallelize but are not well suited to capturing dependencies within variable-length sequences.
RNNs are good at capturing long-range dependencies in variable-length sequences, but they are hard to parallelize over the sequence.
To combine the strengths of CNNs and RNNs, [Vaswani et al., 2017] designed the Transformer model around the attention mechanism. It captures sequence dependencies with attention while processing the tokens at every position of the sequence in parallel, which lets the Transformer achieve strong performance while greatly reducing training time.
- Transformer blocks: the recurrent layers of the seq2seq model are replaced by Transformer blocks. Each block contains a multi-head attention layer and a position-wise feed-forward network (FFN, itself composed of two dense layers). In the decoder, another multi-head attention layer is used to attend to the encoder's hidden states.
- Add and norm: the outputs of the multi-head attention layer and of the FFN are fed into two "add and norm" layers, each consisting of a residual connection followed by layer normalization.
- Position encoding: since the self-attention layer does not distinguish the order of elements, a positional encoding layer is used to add positional information to the sequence elements.
Multi-Head Attention Layer
```python
class MultiHeadAttention(nn.Module):
    def __init__(self, input_size, hidden_size, num_heads, dropout, **kwargs):
        super(MultiHeadAttention, self).__init__(**kwargs)
        self.num_heads = num_heads
        self.attention = DotProductAttention(dropout)
        self.W_q = nn.Linear(input_size, hidden_size, bias=False)
        self.W_k = nn.Linear(input_size, hidden_size, bias=False)
        self.W_v = nn.Linear(input_size, hidden_size, bias=False)
        self.W_o = nn.Linear(hidden_size, hidden_size, bias=False)

    def forward(self, query, key, value, valid_length):
        # query, key, and value shape: (batch_size, seq_len, dim),
        # where seq_len is the length of the input sequence.
        # valid_length shape is either (batch_size, ) or (batch_size, seq_len).

        # Project query, key, and value, then reshape them from
        # (batch_size, seq_len, hidden_size) to
        # (batch_size * num_heads, seq_len, hidden_size / num_heads).
        query = transpose_qkv(self.W_q(query), self.num_heads)
        key = transpose_qkv(self.W_k(key), self.num_heads)
        value = transpose_qkv(self.W_v(value), self.num_heads)

        if valid_length is not None:
            # After transpose_qkv, the num_heads copies of each batch element
            # are adjacent, so repeat each element's valid length num_heads
            # times (np.repeat rather than np.tile).
            device = valid_length.device
            valid_length = valid_length.cpu().numpy() if valid_length.is_cuda \
                else valid_length.numpy()
            valid_length = torch.FloatTensor(
                np.repeat(valid_length, self.num_heads, axis=0))
            valid_length = valid_length.to(device)

        output = self.attention(query, key, value, valid_length)
        output_concat = transpose_output(output, self.num_heads)
        return self.W_o(output_concat)

def transpose_qkv(X, num_heads):
    # Original X shape: (batch_size, seq_len, hidden_size).
    # -1 means inferring its value; after the first reshape, X shape:
    # (batch_size, seq_len, num_heads, hidden_size / num_heads)
    X = X.view(X.shape[0], X.shape[1], num_heads, -1)
    # After transpose, X shape: (batch_size, num_heads, seq_len, hidden_size / num_heads)
    X = X.transpose(2, 1).contiguous()
    # Merge the first two dimensions.
    # Output shape: (batch_size * num_heads, seq_len, hidden_size / num_heads)
    output = X.view(-1, X.shape[2], X.shape[3])
    return output

# Saved in the d2l package for later use
def transpose_output(X, num_heads):
    # A reversed version of transpose_qkv
    X = X.view(-1, num_heads, X.shape[1], X.shape[2])
    X = X.transpose(2, 1).contiguous()
    return X.view(X.shape[0], X.shape[1], -1)
```
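A quick shape check (the values 5, 9, 3 are chosen here for illustration, not taken from the original notes):

```python
# 5-dim inputs, 9-dim hidden size split across 3 heads.
cell = MultiHeadAttention(5, 9, 3, 0.5)
X = torch.ones((2, 4, 5))
valid_length = torch.FloatTensor([2, 3])
cell(X, X, X, valid_length).shape  # expected: torch.Size([2, 4, 9])
```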
Position-Wise Feed-Forward Network
```python
class PositionWiseFFN(nn.Module):
    def __init__(self, input_size, ffn_hidden_size, hidden_size_out, **kwargs):
        super(PositionWiseFFN, self).__init__(**kwargs)
        self.ffn_1 = nn.Linear(input_size, ffn_hidden_size)
        self.ffn_2 = nn.Linear(ffn_hidden_size, hidden_size_out)

    def forward(self, X):
        return self.ffn_2(F.relu(self.ffn_1(X)))
```
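The FFN acts only on the last dimension, so it is applied independently at every position. A small shape check (illustrative values):

```python
ffn = PositionWiseFFN(4, 4, 8)
ffn(torch.ones((2, 3, 4))).shape  # expected: torch.Size([2, 3, 8])
```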
Add and Norm
```python
class AddNorm(nn.Module):
    def __init__(self, hidden_size, dropout, **kwargs):
        super(AddNorm, self).__init__(**kwargs)
        self.dropout = nn.Dropout(dropout)
        self.norm = nn.LayerNorm(hidden_size)

    def forward(self, X, Y):
        # Residual connection followed by layer normalization.
        return self.norm(self.dropout(Y) + X)
```
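Because of the residual connection, the output keeps the shape of the input (illustrative values):

```python
add_norm = AddNorm(4, 0.5)
add_norm(torch.ones((2, 3, 4)), torch.ones((2, 3, 4))).shape  # torch.Size([2, 3, 4])
```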
Positional Encoding
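The code below implements the sinusoidal encoding from the Transformer paper. Written out (notation introduced here), for position $i$ and dimension pair $(2j, 2j+1)$ of the encoding matrix $P \in \mathbb{R}^{1 \times l_{\max} \times d}$:

$$P_{i,2j} = \sin\!\left(\frac{i}{10000^{2j/d}}\right), \qquad P_{i,2j+1} = \cos\!\left(\frac{i}{10000^{2j/d}}\right).$$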
```python
class PositionalEncoding(nn.Module):
    def __init__(self, embedding_size, dropout, max_len=1000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(dropout)
        self.P = np.zeros((1, max_len, embedding_size))
        X = np.arange(0, max_len).reshape(-1, 1) / np.power(
            10000, np.arange(0, embedding_size, 2) / embedding_size)
        self.P[:, :, 0::2] = np.sin(X)
        self.P[:, :, 1::2] = np.cos(X)
        self.P = torch.FloatTensor(self.P)

    def forward(self, X):
        if X.is_cuda and not self.P.is_cuda:
            self.P = self.P.cuda()
        X = X + self.P[:, :X.shape[1], :]
        return self.dropout(X)
```
Encoder
```python
class EncoderBlock(nn.Module):
    def __init__(self, embedding_size, ffn_hidden_size, num_heads,
                 dropout, **kwargs):
        super(EncoderBlock, self).__init__(**kwargs)
        self.attention = MultiHeadAttention(embedding_size, embedding_size,
                                            num_heads, dropout)
        self.addnorm_1 = AddNorm(embedding_size, dropout)
        self.ffn = PositionWiseFFN(embedding_size, ffn_hidden_size, embedding_size)
        self.addnorm_2 = AddNorm(embedding_size, dropout)

    def forward(self, X, valid_length):
        Y = self.addnorm_1(X, self.attention(X, X, X, valid_length))
        return self.addnorm_2(Y, self.ffn(Y))

class TransformerEncoder(d2l.Encoder):
    def __init__(self, vocab_size, embedding_size, ffn_hidden_size,
                 num_heads, num_layers, dropout, **kwargs):
        super(TransformerEncoder, self).__init__(**kwargs)
        self.embedding_size = embedding_size
        self.embed = nn.Embedding(vocab_size, embedding_size)
        self.pos_encoding = PositionalEncoding(embedding_size, dropout)
        self.blks = nn.ModuleList()
        for i in range(num_layers):
            self.blks.append(
                EncoderBlock(embedding_size, ffn_hidden_size, num_heads, dropout))

    def forward(self, X, valid_length, *args):
        # Scale the embeddings before adding the positional encoding.
        X = self.pos_encoding(self.embed(X) * math.sqrt(self.embedding_size))
        for blk in self.blks:
            X = blk(X, valid_length)
        return X
```
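A shape check of the full encoder (the sizes below are chosen here for illustration, not from the original notes):

```python
# 2-layer encoder with 24-dim embeddings and 8 heads on a length-100 batch of 2.
encoder = TransformerEncoder(vocab_size=200, embedding_size=24, ffn_hidden_size=48,
                             num_heads=8, num_layers=2, dropout=0.5)
X = torch.ones((2, 100), dtype=torch.long)
valid_length = torch.FloatTensor([2, 3])
encoder(X, valid_length).shape  # expected: torch.Size([2, 100, 24])
```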
Decoder
```python
class DecoderBlock(nn.Module):
    def __init__(self, embedding_size, ffn_hidden_size, num_heads,
                 dropout, i, **kwargs):
        super(DecoderBlock, self).__init__(**kwargs)
        self.i = i
        self.attention_1 = MultiHeadAttention(embedding_size, embedding_size,
                                              num_heads, dropout)
        self.addnorm_1 = AddNorm(embedding_size, dropout)
        self.attention_2 = MultiHeadAttention(embedding_size, embedding_size,
                                              num_heads, dropout)
        self.addnorm_2 = AddNorm(embedding_size, dropout)
        self.ffn = PositionWiseFFN(embedding_size, ffn_hidden_size, embedding_size)
        self.addnorm_3 = AddNorm(embedding_size, dropout)

    def forward(self, X, state):
        enc_outputs, enc_valid_length = state[0], state[1]
        # state[2][self.i] stores all the previous t-1 query states of layer i;
        # len(state[2]) = num_layers.
        # If training:
        #     state[2] is unused.
        # If predicting:
        #     In the t-th timestep:
        #         state[2][self.i].shape = (batch_size, t-1, hidden_size)
        # Demo:
        #     love dogs !  [EOS]
        #       |    |  |    |
        #     Transformer Decoder
        #       |    |  |    |
        #      I   love dogs !
        if state[2][self.i] is None:
            key_values = X
        else:
            # shape of key_values = (batch_size, t, hidden_size)
            key_values = torch.cat((state[2][self.i], X), dim=1)
        state[2][self.i] = key_values

        if self.training:
            batch_size, seq_len, _ = X.shape
            # Shape: (batch_size, seq_len); the values in the j-th column are j+1,
            # so each position attends only to itself and earlier positions.
            valid_length = torch.FloatTensor(
                np.tile(np.arange(1, seq_len + 1), (batch_size, 1)))
            valid_length = valid_length.to(X.device)
        else:
            valid_length = None

        X2 = self.attention_1(X, key_values, key_values, valid_length)
        Y = self.addnorm_1(X, X2)
        Y2 = self.attention_2(Y, enc_outputs, enc_outputs, enc_valid_length)
        Z = self.addnorm_2(Y, Y2)
        return self.addnorm_3(Z, self.ffn(Z)), state

class TransformerDecoder(d2l.Decoder):
    def __init__(self, vocab_size, embedding_size, ffn_hidden_size,
                 num_heads, num_layers, dropout, **kwargs):
        super(TransformerDecoder, self).__init__(**kwargs)
        self.embedding_size = embedding_size
        self.num_layers = num_layers
        self.embed = nn.Embedding(vocab_size, embedding_size)
        self.pos_encoding = PositionalEncoding(embedding_size, dropout)
        self.blks = nn.ModuleList()
        for i in range(num_layers):
            self.blks.append(
                DecoderBlock(embedding_size, ffn_hidden_size, num_heads, dropout, i))
        self.dense = nn.Linear(embedding_size, vocab_size)

    def init_state(self, enc_outputs, enc_valid_length, *args):
        return [enc_outputs, enc_valid_length, [None] * self.num_layers]

    def forward(self, X, state):
        X = self.pos_encoding(self.embed(X) * math.sqrt(self.embedding_size))
        for blk in self.blks:
            X, state = blk(X, state)
        return self.dense(X), state
```
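To put the pieces together, the Transformer encoder and decoder can be trained with the same `EncoderDecoder` wrapper and `train_ch7` loop used for the plain seq2seq model. The sketch below follows that pattern; the hyperparameter values are illustrative, not taken from the original notes.

```python
# Sketch: assemble and train the Transformer with the utilities defined earlier.
embedding_size, ffn_hidden_size, num_heads, num_layers, dropout = 32, 64, 4, 2, 0.05
batch_size, max_len, num_examples = 64, 10, 1e3
lr, num_epochs, ctx = 0.005, 250, d2l.try_gpu()

src_vocab, tgt_vocab, train_iter = d2l.load_data_nmt(batch_size, max_len, num_examples)
encoder = TransformerEncoder(len(src_vocab), embedding_size, ffn_hidden_size,
                             num_heads, num_layers, dropout)
decoder = TransformerDecoder(len(tgt_vocab), embedding_size, ffn_hidden_size,
                             num_heads, num_layers, dropout)
model = d2l.EncoderDecoder(encoder, decoder)
train_ch7(model, train_iter, lr, num_epochs, ctx)
```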