Attention is all you need PyTorch implementation - source code walkthrough 03 - model training (2): the Transformer model's code and structure
We continue our walkthrough of the PyTorch implementation of the well-known paper Attention Is All You Need.
Because the project is fairly large, the walkthrough is split across several posts.
The earlier parts are here:
Attention is all you need PyTorch implementation - source code walkthrough 01 - data preprocessing and vocabulary construction - https://blog.csdn.net/weixin_42744102/article/details/87006081
Attention is all you need PyTorch implementation - source code walkthrough 02 - model training (1): the training code - https://blog.csdn.net/weixin_42744102/article/details/87076089
First, the GitHub source: https://github.com/Eathoublu/attention-is-all-you-need-pytorch
Project structure:
-transformer
—__init__.py
—Beam.py
—Constants.py
—Layers.py
—Models.py
—Modules.py
—Optim.py
—SubLayers.py
—Translator.py
Without further ado, let's look at the code and walk through it. The walkthrough moves top-down: the Transformer model contains an Encoder and a Decoder; the Encoder is a stack of n_layers EncoderLayer modules; and each EncoderLayer contains a MultiHeadAttention sub-layer and a PositionwiseFeedForward sub-layer. The source below spans Models.py, Layers.py and SubLayers.py in the transformer folder; please follow the numbered comments, which walk through the code in order.
First, Models.py - the Transformer class:

```python
# Models.py (excerpt). The top of the file imports torch.nn as nn, numpy as np,
# transformer.Constants as Constants, and EncoderLayer / DecoderLayer from Layers.py.

class Transformer(nn.Module):
    ''' A sequence to sequence model with attention mechanism. '''
    # 1 - Transformer inherits from PyTorch's nn.Module (the base class for all models;
    #     subclasses must implement forward). The constructor takes the source/target
    #     vocabulary sizes and the maximum sequence length; the rest are optional
    #     hyperparameters, e.g. the number of attention heads defaults to 8.

    def __init__(
            self,
            n_src_vocab, n_tgt_vocab, len_max_seq,
            d_word_vec=512, d_model=512, d_inner=2048,
            n_layers=6, n_head=8, d_k=64, d_v=64, dropout=0.1,
            tgt_emb_prj_weight_sharing=True,
            emb_src_tgt_weight_sharing=True):

        super().__init__()

        self.encoder = Encoder(  # 2 - build the Encoder, one half of the encoder-decoder (covered below)
            n_src_vocab=n_src_vocab, len_max_seq=len_max_seq,
            d_word_vec=d_word_vec, d_model=d_model, d_inner=d_inner,
            n_layers=n_layers, n_head=n_head, d_k=d_k, d_v=d_v,
            dropout=dropout)

        self.decoder = Decoder(  # 3 - build the Decoder, the other half (covered below)
            n_tgt_vocab=n_tgt_vocab, len_max_seq=len_max_seq,
            d_word_vec=d_word_vec, d_model=d_model, d_inner=d_inner,
            n_layers=n_layers, n_head=n_head, d_k=d_k, d_v=d_v,
            dropout=dropout)

        # 4 - remaining setup: the output projection and the weight-sharing options
        self.tgt_word_prj = nn.Linear(d_model, n_tgt_vocab, bias=False)
        nn.init.xavier_normal_(self.tgt_word_prj.weight)

        assert d_model == d_word_vec, \
            'To facilitate the residual connections, ' \
            'the dimensions of all module outputs shall be the same.'

        if tgt_emb_prj_weight_sharing:
            # Share the weight matrix between target word embedding & the final logit dense layer
            self.tgt_word_prj.weight = self.decoder.tgt_word_emb.weight
            self.x_logit_scale = (d_model ** -0.5)
        else:
            self.x_logit_scale = 1.

        if emb_src_tgt_weight_sharing:
            # Share the weight matrix between source & target word embeddings
            assert n_src_vocab == n_tgt_vocab, \
                "To share word embedding table, the vocabulary size of src/tgt shall be the same."
            self.encoder.src_word_emb.weight = self.decoder.tgt_word_emb.weight

    def forward(self, src_seq, src_pos, tgt_seq, tgt_pos):
        # 5 - forward is what runs when the model is called, i.e. at every training step.

        # 6 - drop the last position of the target sequence and of its position indices:
        #     the decoder input is the target shifted right (teacher forcing), while the
        #     gold labels used for the loss in train.py are tgt_seq[:, 1:].
        tgt_seq, tgt_pos = tgt_seq[:, :-1], tgt_pos[:, :-1]

        # 7 - feed the source batch through the encoder to get the encoder output
        enc_output, *_ = self.encoder(src_seq, src_pos)
        # 8 - feed the (shifted) target, the source sequence and the encoder output into the decoder
        dec_output, *_ = self.decoder(tgt_seq, tgt_pos, src_seq, enc_output)
        # 9 - project the decoder output linearly onto the target vocabulary and return the logits;
        #     next we look at the Encoder and Decoder in detail.
        seq_logit = self.tgt_word_prj(dec_output) * self.x_logit_scale

        return seq_logit.view(-1, seq_logit.size(2))
```
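To make the data flow concrete, here is a small, hypothetical usage sketch. The batch contents are invented for illustration and are not from the repo; it assumes the repo package is importable and that Constants.PAD is 0, as in the project. It also shows how the shift in comment 6 pairs with the gold labels that train.py builds:

```python
import torch
from transformer.Models import Transformer

# Hypothetical tiny batch: 2 sentences already mapped to vocabulary indices,
# padded with PAD = 0 and paired with 1-based position indices (0 for padding).
src_seq = torch.tensor([[2, 5, 7, 3, 0], [2, 9, 3, 0, 0]])   # <BOS> ... <EOS> <PAD>
src_pos = torch.tensor([[1, 2, 3, 4, 0], [1, 2, 3, 0, 0]])
tgt_seq = torch.tensor([[2, 6, 8, 3, 0], [2, 4, 3, 0, 0]])
tgt_pos = torch.tensor([[1, 2, 3, 4, 0], [1, 2, 3, 0, 0]])

model = Transformer(n_src_vocab=100, n_tgt_vocab=100, len_max_seq=5)

# Inside forward the decoder sees tgt_seq[:, :-1]; the matching gold labels
# (built by the training loop, not by the model) are tgt_seq[:, 1:].
logits = model(src_seq, src_pos, tgt_seq, tgt_pos)
gold = tgt_seq[:, 1:].contiguous().view(-1)

print(logits.shape)  # torch.Size([8, 100]) -> (batch * (tgt_len - 1), n_tgt_vocab)
print(gold.shape)    # torch.Size([8])
```

With this pairing, the cross-entropy loss compares the prediction at each decoder position with the token that actually comes next, which is why both the target sequence and its positions lose their last element.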
Next, the Encoder and Decoder, also in Models.py:

```python
class Encoder(nn.Module):
    # 10 - the Encoder model
    ''' A encoder model with self attention mechanism. '''

    def __init__(
            self,
            n_src_vocab, len_max_seq, d_word_vec,
            n_layers, n_head, d_k, d_v,
            d_model, d_inner, dropout=0.1):
        # 11 - the constructor receives a subset of the hyperparameters that Transformer was given

        super().__init__()

        n_position = len_max_seq + 1  # 12 - positions are indexed from 1 (0 is reserved for padding), so the table needs one extra row

        self.src_word_emb = nn.Embedding(
            # 13 - nn.Embedding is, per the PyTorch docs, a simple lookup table that stores embeddings
            #      of a fixed dictionary and size; given a list of indices, it returns the corresponding
            #      word vectors. It is commonly used to store and retrieve word embeddings.
            n_src_vocab, d_word_vec, padding_idx=Constants.PAD)

        self.position_enc = nn.Embedding.from_pretrained(
            get_sinusoid_encoding_table(n_position, d_word_vec, padding_idx=0),
            freeze=True)

        self.layer_stack = nn.ModuleList([
            EncoderLayer(d_model, d_inner, n_head, d_k, d_v, dropout=dropout)
            for _ in range(n_layers)])
        # 14 - stack n_layers identically configured EncoderLayer modules (each with its own
        #      parameters); we look at EncoderLayer next.

    def forward(self, src_seq, src_pos, return_attns=False):

        enc_slf_attn_list = []

        # -- Prepare masks
        slf_attn_mask = get_attn_key_pad_mask(seq_k=src_seq, seq_q=src_seq)
        non_pad_mask = get_non_pad_mask(src_seq)

        # -- Forward
        enc_output = self.src_word_emb(src_seq) + self.position_enc(src_pos)

        for enc_layer in self.layer_stack:
            enc_output, enc_slf_attn = enc_layer(
                enc_output,
                non_pad_mask=non_pad_mask,
                slf_attn_mask=slf_attn_mask)
            if return_attns:
                enc_slf_attn_list += [enc_slf_attn]

        if return_attns:
            return enc_output, enc_slf_attn_list
        return enc_output,


class Decoder(nn.Module):
    ''' A decoder model with self attention mechanism. '''

    def __init__(
            self,
            n_tgt_vocab, len_max_seq, d_word_vec,
            n_layers, n_head, d_k, d_v,
            d_model, d_inner, dropout=0.1):

        super().__init__()

        n_position = len_max_seq + 1

        self.tgt_word_emb = nn.Embedding(
            n_tgt_vocab, d_word_vec, padding_idx=Constants.PAD)

        self.position_enc = nn.Embedding.from_pretrained(
            get_sinusoid_encoding_table(n_position, d_word_vec, padding_idx=0),
            freeze=True)

        self.layer_stack = nn.ModuleList([
            DecoderLayer(d_model, d_inner, n_head, d_k, d_v, dropout=dropout)
            for _ in range(n_layers)])

    def forward(self, tgt_seq, tgt_pos, src_seq, enc_output, return_attns=False):

        dec_slf_attn_list, dec_enc_attn_list = [], []

        # -- Prepare masks
        non_pad_mask = get_non_pad_mask(tgt_seq)

        slf_attn_mask_subseq = get_subsequent_mask(tgt_seq)
        slf_attn_mask_keypad = get_attn_key_pad_mask(seq_k=tgt_seq, seq_q=tgt_seq)
        slf_attn_mask = (slf_attn_mask_keypad + slf_attn_mask_subseq).gt(0)

        dec_enc_attn_mask = get_attn_key_pad_mask(seq_k=src_seq, seq_q=tgt_seq)

        # -- Forward
        dec_output = self.tgt_word_emb(tgt_seq) + self.position_enc(tgt_pos)

        for dec_layer in self.layer_stack:
            dec_output, dec_slf_attn, dec_enc_attn = dec_layer(
                dec_output, enc_output,
                non_pad_mask=non_pad_mask,
                slf_attn_mask=slf_attn_mask,
                dec_enc_attn_mask=dec_enc_attn_mask)

            if return_attns:
                dec_slf_attn_list += [dec_slf_attn]
                dec_enc_attn_list += [dec_enc_attn]

        if return_attns:
            return dec_output, dec_slf_attn_list, dec_enc_attn_list
        return dec_output,
```

A single layer of the encoder stack lives in Layers.py:

```python
# Layers.py (excerpt). The file imports torch.nn as nn and
# MultiHeadAttention / PositionwiseFeedForward from SubLayers.py.

class EncoderLayer(nn.Module):
    # 15 - a single EncoderLayer
    ''' Compose with two layers '''

    def __init__(self, d_model, d_inner, n_head, d_k, d_v, dropout=0.1):
        super(EncoderLayer, self).__init__()
        self.slf_attn = MultiHeadAttention(  # 16 - instantiate the multi-head attention sub-layer
            n_head, d_model, d_k, d_v, dropout=dropout)
        self.pos_ffn = PositionwiseFeedForward(d_model, d_inner, dropout=dropout)  # the position-wise feed-forward sub-layer

    def forward(self, enc_input, non_pad_mask=None, slf_attn_mask=None):
        # 17 - run self-attention: queries, keys and values are all the layer input
        enc_output, enc_slf_attn = self.slf_attn(
            enc_input, enc_input, enc_input, mask=slf_attn_mask)
        enc_output *= non_pad_mask

        # 18 - pass the attention output through the feed-forward sub-layer to get the layer output
        enc_output = self.pos_ffn(enc_output)
        enc_output *= non_pad_mask

        return enc_output, enc_slf_attn
```
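The Encoder and Decoder forward passes above, and the non_pad_mask multiplications inside EncoderLayer, rely on three small mask helpers that are defined earlier in Models.py but not quoted in this post. The sketch below is a paraphrase of what they roughly do; treat it as an approximation rather than a verbatim copy of the repo:

```python
import torch
import transformer.Constants as Constants

def get_non_pad_mask(seq):
    # 1.0 where the token is real, 0.0 where it is padding; shape (batch, len, 1)
    return seq.ne(Constants.PAD).type(torch.float).unsqueeze(-1)

def get_attn_key_pad_mask(seq_k, seq_q):
    # Mask attention to padded key positions; shape (batch, len_q, len_k), True = masked
    len_q = seq_q.size(1)
    padding_mask = seq_k.eq(Constants.PAD)
    return padding_mask.unsqueeze(1).expand(-1, len_q, -1)

def get_subsequent_mask(seq):
    # Upper-triangular mask so position i cannot attend to positions > i (decoder self-attention)
    sz_b, len_s = seq.size()
    subsequent_mask = torch.triu(
        torch.ones((len_s, len_s), device=seq.device, dtype=torch.uint8), diagonal=1)
    return subsequent_mask.unsqueeze(0).expand(sz_b, -1, -1)
```

In the Decoder, the padding mask and the subsequent (look-ahead) mask are added and thresholded with .gt(0), so a key position is masked whenever either rule says it should be.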
Finally, the multi-head attention sub-layer that EncoderLayer (and DecoderLayer) instantiate, from SubLayers.py:

```python
# SubLayers.py (excerpt). The file imports numpy as np, torch.nn as nn and
# ScaledDotProductAttention from Modules.py.

class MultiHeadAttention(nn.Module):
    # 19 - the multi-head attention sub-layer instantiated inside EncoderLayer
    ''' Multi-Head Attention module '''

    def __init__(self, n_head, d_model, d_k, d_v, dropout=0.1):
        super().__init__()

        self.n_head = n_head
        self.d_k = d_k
        self.d_v = d_v

        self.w_qs = nn.Linear(d_model, n_head * d_k)
        self.w_ks = nn.Linear(d_model, n_head * d_k)
        self.w_vs = nn.Linear(d_model, n_head * d_v)
        nn.init.normal_(self.w_qs.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_k)))
        nn.init.normal_(self.w_ks.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_k)))
        nn.init.normal_(self.w_vs.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_v)))

        self.attention = ScaledDotProductAttention(temperature=np.power(d_k, 0.5))
        self.layer_norm = nn.LayerNorm(d_model)

        self.fc = nn.Linear(n_head * d_v, d_model)
        nn.init.xavier_normal_(self.fc.weight)

        self.dropout = nn.Dropout(dropout)

    def forward(self, q, k, v, mask=None):

        d_k, d_v, n_head = self.d_k, self.d_v, self.n_head

        sz_b, len_q, _ = q.size()
        sz_b, len_k, _ = k.size()
        sz_b, len_v, _ = v.size()

        residual = q

        # project the inputs and split them into n_head heads
        q = self.w_qs(q).view(sz_b, len_q, n_head, d_k)
        k = self.w_ks(k).view(sz_b, len_k, n_head, d_k)
        v = self.w_vs(v).view(sz_b, len_v, n_head, d_v)

        q = q.permute(2, 0, 1, 3).contiguous().view(-1, len_q, d_k)  # (n*b) x lq x dk
        k = k.permute(2, 0, 1, 3).contiguous().view(-1, len_k, d_k)  # (n*b) x lk x dk
        v = v.permute(2, 0, 1, 3).contiguous().view(-1, len_v, d_v)  # (n*b) x lv x dv

        mask = mask.repeat(n_head, 1, 1)  # (n*b) x .. x ..
        output, attn = self.attention(q, k, v, mask=mask)

        # merge the heads back together, then project, apply dropout and add the residual
        output = output.view(n_head, sz_b, len_q, d_v)
        output = output.permute(1, 2, 0, 3).contiguous().view(sz_b, len_q, -1)  # b x lq x (n*dv)

        output = self.dropout(self.fc(output))
        output = self.layer_norm(output + residual)

        return output, attn
```
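Two building blocks referenced above are not quoted in this post: ScaledDotProductAttention (defined in Modules.py and instantiated by MultiHeadAttention) and PositionwiseFeedForward (defined in SubLayers.py and used by EncoderLayer). The sketches below give the gist of the usual implementations in this repo's style; they are reconstructions rather than verbatim copies, so check Modules.py and SubLayers.py for the authoritative code:

```python
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

class ScaledDotProductAttention(nn.Module):
    ''' Scaled dot-product attention: softmax(Q K^T / temperature) V '''

    def __init__(self, temperature, attn_dropout=0.1):
        super().__init__()
        self.temperature = temperature      # sqrt(d_k), passed in by MultiHeadAttention
        self.dropout = nn.Dropout(attn_dropout)
        self.softmax = nn.Softmax(dim=2)

    def forward(self, q, k, v, mask=None):
        attn = torch.bmm(q, k.transpose(1, 2)) / self.temperature  # (n*b) x lq x lk
        if mask is not None:
            attn = attn.masked_fill(mask, -np.inf)  # masked positions get zero weight after softmax
        attn = self.dropout(self.softmax(attn))
        output = torch.bmm(attn, v)                 # (n*b) x lq x dv
        return output, attn


class PositionwiseFeedForward(nn.Module):
    ''' Two position-wise linear transforms with a ReLU in between (implemented as 1x1 convolutions) '''

    def __init__(self, d_in, d_hid, dropout=0.1):
        super().__init__()
        self.w_1 = nn.Conv1d(d_in, d_hid, 1)
        self.w_2 = nn.Conv1d(d_hid, d_in, 1)
        self.layer_norm = nn.LayerNorm(d_in)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        residual = x
        output = self.w_2(F.relu(self.w_1(x.transpose(1, 2)))).transpose(1, 2)
        output = self.dropout(output)
        return self.layer_norm(output + residual)
```

Note that MultiHeadAttention passes temperature = sqrt(d_k), so dividing the attention scores by it is exactly the 1/sqrt(d_k) scaling from the paper, and the feed-forward sub-layer's 1x1 convolutions are equivalent to applying the same two-layer MLP independently at every position.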
Next in this series: Attention is all you need PyTorch implementation - source code walkthrough 04 - model testing and translation.