
Attention Is All You Need PyTorch Implementation - Source Code Walkthrough 03 - Model Training (2): The Transformer Model's Code and Structure

2019-02-12 14:37

We continue our walkthrough of the source code of the PyTorch implementation of the famous Attention Is All You Need paper.
Since the project is fairly large, the walkthrough is split across several parts.

Links to the previous parts:
Attention Is All You Need PyTorch Implementation - Source Code Walkthrough 01 - Data Preprocessing and Vocabulary Construction - https://blog.csdn.net/weixin_42744102/article/details/87006081

Attention Is All You Need PyTorch Implementation - Source Code Walkthrough 02 - Model Training (1): The Training Code - https://blog.csdn.net/weixin_42744102/article/details/87076089

First, the GitHub source: https://github.com/Eathoublu/attention-is-all-you-need-pytorch

Project structure:

-transformer
—__init__.py
—Beam.py
—Constants.py
—Layers.py
—Models.py
—Module.py
—Optim.py
—SubLayers.py
—Translator.py

datasets.py
preprocess.py
train.py
translate.py

Without further ado, let's walk through the code and its commentary. The code is presented top-down: the Transformer model contains an Encoder and a Decoder; the Encoder is a stack of n_layers identical EncoderLayer modules; and each EncoderLayer in turn has a MultiHeadAttention sub-layer and a PositionwiseFeedForward sub-layer. The source below spans Models.py, Layers.py and SubLayers.py in the transformer folder; please follow the numbered comments 1 through 19.

class Transformer(nn.Module):   # 1 - The Transformer model inherits from PyTorch's nn.Module (the base class for all network modules; a subclass must implement forward). __init__ takes the source/target vocabulary sizes and the maximum sequence length; the remaining arguments are optional hyper-parameters, e.g. 8 attention heads by default.
    ''' A sequence to sequence model with attention mechanism. '''

    def __init__(
            self,
            n_src_vocab, n_tgt_vocab, len_max_seq,
            d_word_vec=512, d_model=512, d_inner=2048,
            n_layers=6, n_head=8, d_k=64, d_v=64, dropout=0.1,
            tgt_emb_prj_weight_sharing=True,
            emb_src_tgt_weight_sharing=True):

        super().__init__()

        self.encoder = Encoder(  # 2 - Instantiate the Encoder (discussed below), one half of the encoder-decoder architecture
            n_src_vocab=n_src_vocab, len_max_seq=len_max_seq,
            d_word_vec=d_word_vec, d_model=d_model, d_inner=d_inner,
            n_layers=n_layers, n_head=n_head, d_k=d_k, d_v=d_v,
            dropout=dropout)

        self.decoder = Decoder(  # 3 - Instantiate the Decoder (discussed below), the other half of the encoder-decoder architecture
            n_tgt_vocab=n_tgt_vocab, len_max_seq=len_max_seq,
            d_word_vec=d_word_vec, d_model=d_model, d_inner=d_inner,
            n_layers=n_layers, n_head=n_head, d_k=d_k, d_v=d_v,
            dropout=dropout)

        # 4 - The rest of __init__ sets up the output projection and the optional weight sharing; we won't dwell on it here

        self.tgt_word_prj = nn.Linear(d_model, n_tgt_vocab, bias=False)
        nn.init.xavier_normal_(self.tgt_word_prj.weight)

        assert d_model == d_word_vec, \
            'To facilitate the residual connections, \
             the dimensions of all module outputs shall be the same.'

        if tgt_emb_prj_weight_sharing:
            # Share the weight matrix between target word embedding & the final logit dense layer
            self.tgt_word_prj.weight = self.decoder.tgt_word_emb.weight
            self.x_logit_scale = (d_model ** -0.5)
        else:
            self.x_logit_scale = 1.

        if emb_src_tgt_weight_sharing:
            # Share the weight matrix between source & target word embeddings
            assert n_src_vocab == n_tgt_vocab, \
                "To share word embedding table, the vocabulary size of src/tgt shall be the same."
            self.encoder.src_word_emb.weight = self.decoder.tgt_word_emb.weight

    def forward(self, src_seq, src_pos, tgt_seq, tgt_pos):  # 5 - The forward method; this is what gets called when the model is run during training

        tgt_seq, tgt_pos = tgt_seq[:, :-1], tgt_pos[:, :-1]  # 6 - Drop the last position of the target sequence and its positions: with teacher forcing the decoder predicts token t+1 from tokens 1..t, so the final token (<EOS>) is never fed as input; the gold sequence used in the loss is shifted correspondingly in the training code

        enc_output, *_ = self.encoder(src_seq, src_pos)  # 7 - Feed the source batch into the encoder and obtain the encoder output
        dec_output, *_ = self.decoder(tgt_seq, tgt_pos, src_seq, enc_output)   # 8 - Feed the target batch, the source sequence and the encoder output into the decoder and obtain the decoder output
        seq_logit = self.tgt_word_prj(dec_output) * self.x_logit_scale  # 9 - Apply a linear projection (scaled by x_logit_scale) to the decoder output to get per-token vocabulary logits, returned below. Next, let's look at the Encoder and Decoder in detail.

        return seq_logit.view(-1, seq_logit.size(2))
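Before moving on to the Encoder and Decoder, here is a minimal usage sketch (mine, not part of the original project) showing how the Transformer above is called. It assumes the repo's package layout (from transformer.Models import Transformer, with Constants.PAD == 0) and uses made-up hyper-parameters and dummy index tensors purely to illustrate the expected input and output shapes:

import torch
from transformer.Models import Transformer   # assumes the repo's package layout, as in train.py

model = Transformer(n_src_vocab=1000, n_tgt_vocab=1000, len_max_seq=50)

# dummy batch of 2 sentences with no padding; token ids avoid 0, the PAD index
src_seq = torch.randint(1, 1000, (2, 10))                  # (batch, src_len)
src_pos = torch.arange(1, 11).unsqueeze(0).repeat(2, 1)    # positions start at 1; 0 is reserved for PAD
tgt_seq = torch.randint(1, 1000, (2, 8))                   # (batch, tgt_len)
tgt_pos = torch.arange(1, 9).unsqueeze(0).repeat(2, 1)

seq_logit = model(src_seq, src_pos, tgt_seq, tgt_pos)
print(seq_logit.shape)   # torch.Size([14, 1000]) = (batch * (tgt_len - 1), n_tgt_vocab)

The flattened (batch * seq_len, vocab) shape is what the loss computation in train.py works with, after the gold target has been shifted by one position.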

class Encoder(nn.Module): # 10 - This is the Encoder model; let's look at it:
    ''' An encoder model with self attention mechanism. '''

    def __init__(  # 11 - Initialization; the hyper-parameters are passed in by the Transformer constructor above
            self,
            n_src_vocab, len_max_seq, d_word_vec,
            n_layers, n_head, d_k, d_v,
            d_model, d_inner, dropout=0.1):

        super().__init__()

        n_position = len_max_seq + 1  # 12 - The +1 is because position index 0 is reserved for padding, so real positions run from 1 to len_max_seq and the table needs len_max_seq + 1 rows

        self.src_word_emb = nn.Embedding(  # 13 - First, what nn.Embedding is. As the PyTorch docs put it: a simple lookup table that stores embeddings of a fixed dictionary and size. It is often used to store word embeddings and retrieve them by index: the input is a list of indices, the output the corresponding word embeddings.
            n_src_vocab, d_word_vec, padding_idx=Constants.PAD)

        self.position_enc = nn.Embedding.from_pretrained(
            get_sinusoid_encoding_table(n_position, d_word_vec, padding_idx=0),
            freeze=True)

        self.layer_stack = nn.ModuleList([
            EncoderLayer(d_model, d_inner, n_head, d_k, d_v, dropout=dropout)
            for _ in range(n_layers)]) # 14 - Stack n_layers identical EncoderLayer modules; we look at EncoderLayer further below

    def forward(self, src_seq, src_pos, return_attns=False):

        enc_slf_attn_list = []

        # -- Prepare masks
        slf_attn_mask = get_attn_key_pad_mask(seq_k=src_seq, seq_q=src_seq)
        non_pad_mask = get_non_pad_mask(src_seq)

        # -- Forward
        enc_output = self.src_word_emb(src_seq) + self.position_enc(src_pos)

        for enc_layer in self.layer_stack:
            enc_output, enc_slf_attn = enc_layer(
                enc_output,
                non_pad_mask=non_pad_mask,
                slf_attn_mask=slf_attn_mask)
            if return_attns:
                enc_slf_attn_list += [enc_slf_attn]

        if return_attns:
            return enc_output, enc_slf_attn_list
        return enc_output,
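One helper worth pausing on: get_sinusoid_encoding_table builds the fixed sinusoidal position table that position_enc loads via nn.Embedding.from_pretrained. The repo's implementation is not reproduced in this post, so the following is only my own sketch of the standard formulation (sine on even dimensions, cosine on odd ones, with the padding row zeroed out):

import numpy as np
import torch

def sinusoid_table_sketch(n_position, d_hid, padding_idx=None):
    # angle(pos, i) = pos / 10000^(2 * (i // 2) / d_hid)
    pos = np.arange(n_position)[:, None]                    # (n_position, 1)
    dim = np.arange(d_hid)[None, :]                         # (1, d_hid)
    angles = pos / np.power(10000, 2 * (dim // 2) / d_hid)  # (n_position, d_hid)
    table = np.zeros((n_position, d_hid))
    table[:, 0::2] = np.sin(angles[:, 0::2])                # sine on even dimensions
    table[:, 1::2] = np.cos(angles[:, 1::2])                # cosine on odd dimensions
    if padding_idx is not None:
        table[padding_idx] = 0.                             # the PAD position maps to an all-zero vector
    return torch.FloatTensor(table)

print(sinusoid_table_sketch(50 + 1, 512, padding_idx=0).shape)   # torch.Size([51, 512])

Because the table is passed to from_pretrained with freeze=True, the position encoding is never updated during training.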

class Decoder(nn.Module):
    ''' A decoder model with self attention mechanism. '''

    def __init__(
            self,
            n_tgt_vocab, len_max_seq, d_word_vec,
            n_layers, n_head, d_k, d_v,
            d_model, d_inner, dropout=0.1):

        super().__init__()
        n_position = len_max_seq + 1

        self.tgt_word_emb = nn.Embedding(
            n_tgt_vocab, d_word_vec, padding_idx=Constants.PAD)

        self.position_enc = nn.Embedding.from_pretrained(
            get_sinusoid_encoding_table(n_position, d_word_vec, padding_idx=0),
            freeze=True)

        self.layer_stack = nn.ModuleList([
            DecoderLayer(d_model, d_inner, n_head, d_k, d_v, dropout=dropout)
            for _ in range(n_layers)])

    def forward(self, tgt_seq, tgt_pos, src_seq, enc_output, return_attns=False):

        dec_slf_attn_list, dec_enc_attn_list = [], []

        # -- Prepare masks
        non_pad_mask = get_non_pad_mask(tgt_seq)

        slf_attn_mask_subseq = get_subsequent_mask(tgt_seq)
        slf_attn_mask_keypad = get_attn_key_pad_mask(seq_k=tgt_seq, seq_q=tgt_seq)
        slf_attn_mask = (slf_attn_mask_keypad + slf_attn_mask_subseq).gt(0)

        dec_enc_attn_mask = get_attn_key_pad_mask(seq_k=src_seq, seq_q=tgt_seq)

        # -- Forward
        dec_output = self.tgt_word_emb(tgt_seq) + self.position_enc(tgt_pos)

        for dec_layer in self.layer_stack:
            dec_output, dec_slf_attn, dec_enc_attn = dec_layer(
                dec_output, enc_output,
                non_pad_mask=non_pad_mask,
                slf_attn_mask=slf_attn_mask,
                dec_enc_attn_mask=dec_enc_attn_mask)

            if return_attns:
                dec_slf_attn_list += [dec_slf_attn]
                dec_enc_attn_list += [dec_enc_attn]

        if return_attns:
            return dec_output, dec_slf_attn_list, dec_enc_attn_list
        return dec_output,
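The most interesting part of the Decoder is how its self-attention mask is assembled: a key-padding mask (PAD keys may never be attended to) is combined with a subsequent mask (position i may not attend to any position after i), and every entry greater than 0 is blocked. The toy example below mirrors that logic in a standalone way; it is not the repo's get_attn_key_pad_mask / get_subsequent_mask code, just an illustration with a length-4 target whose last token is padding:

import torch

PAD = 0                                             # matches Constants.PAD in the repo
tgt_seq = torch.tensor([[2, 5, 7, PAD]])            # batch of 1, length 4, last token is padding

# key-padding mask: every query position is forbidden from attending to PAD keys
keypad_mask = tgt_seq.eq(PAD).unsqueeze(1).expand(-1, 4, -1).int()                         # (1, 4, 4)

# subsequent mask: query position i is forbidden from attending to keys j > i
subseq_mask = torch.triu(torch.ones((4, 4), dtype=torch.int32), diagonal=1).unsqueeze(0)   # (1, 4, 4)

slf_attn_mask = (keypad_mask + subseq_mask).gt(0)   # True where attention must be blocked
print(slf_attn_mask[0].int())
# tensor([[0, 1, 1, 1],
#         [0, 0, 1, 1],
#         [0, 0, 0, 1],
#         [0, 0, 0, 1]], dtype=torch.int32)

This combined mask is what keeps the decoder from peeking at future target tokens during training.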

class EncoderLayer(nn.Module):  # 15 - A single EncoderLayer
    ''' Compose with two layers '''

    def __init__(self, d_model, d_inner, n_head, d_k, d_v, dropout=0.1):
        super(EncoderLayer, self).__init__()
        self.slf_attn = MultiHeadAttention(  # 16 - Instantiate the multi-head attention sub-layer
            n_head, d_model, d_k, d_v, dropout=dropout)
        self.pos_ffn = PositionwiseFeedForward(d_model, d_inner, dropout=dropout) # Define the position-wise feed-forward sub-layer

    def forward(self, enc_input, non_pad_mask=None, slf_attn_mask=None): # 17 - Run multi-head self-attention over the layer input
        enc_output, enc_slf_attn = self.slf_attn(
            enc_input, enc_input, enc_input, mask=slf_attn_mask)
        enc_output *= non_pad_mask

        enc_output = self.pos_ffn(enc_output)  # 18 - Pass the attention output through the feed-forward sub-layer to get the layer output
        enc_output *= non_pad_mask

        return enc_output, enc_slf_attn
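PositionwiseFeedForward, the second sub-layer of each EncoderLayer, is defined in SubLayers.py and is not reproduced in this post. As a reference point, here is a functionally equivalent sketch of the standard position-wise FFN, FFN(x) = max(0, xW1 + b1)W2 + b2, wrapped with dropout, a residual connection and layer normalization; it is my own Linear-based sketch, not the repo's exact code:

import torch
import torch.nn as nn
import torch.nn.functional as F

class PositionwiseFeedForwardSketch(nn.Module):
    ''' A two-layer position-wise feed-forward sketch (not the repo's exact code). '''

    def __init__(self, d_model, d_inner, dropout=0.1):
        super().__init__()
        self.w_1 = nn.Linear(d_model, d_inner)    # expand: d_model -> d_inner
        self.w_2 = nn.Linear(d_inner, d_model)    # project back: d_inner -> d_model
        self.layer_norm = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):                          # x: (batch, len, d_model)
        residual = x
        output = self.w_2(F.relu(self.w_1(x)))
        output = self.dropout(output)
        return self.layer_norm(output + residual)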

class MultiHeadAttention(nn.Module):  # 19 - The multi-head attention sub-layer instantiated inside EncoderLayer (and DecoderLayer)
    ''' Multi-Head Attention module '''

    def __init__(self, n_head, d_model, d_k, d_v, dropout=0.1):
        super().__init__()

        self.n_head = n_head
        self.d_k = d_k
        self.d_v = d_v

        self.w_qs = nn.Linear(d_model, n_head * d_k)
        self.w_ks = nn.Linear(d_model, n_head * d_k)
        self.w_vs = nn.Linear(d_model, n_head * d_v)
        nn.init.normal_(self.w_qs.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_k)))
        nn.init.normal_(self.w_ks.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_k)))
        nn.init.normal_(self.w_vs.weight, mean=0, std=np.sqrt(2.0 / (d_model + d_v)))

        self.attention = ScaledDotProductAttention(temperature=np.power(d_k, 0.5))
        self.layer_norm = nn.LayerNorm(d_model)

        self.fc = nn.Linear(n_head * d_v, d_model)
        nn.init.xavier_normal_(self.fc.weight)

        self.dropout = nn.Dropout(dropout)

    def forward(self, q, k, v, mask=None):

        d_k, d_v, n_head = self.d_k, self.d_v, self.n_head

        sz_b, len_q, _ = q.size()
        sz_b, len_k, _ = k.size()
        sz_b, len_v, _ = v.size()

        residual = q

        q = self.w_qs(q).view(sz_b, len_q, n_head, d_k)
        k = self.w_ks(k).view(sz_b, len_k, n_head, d_k)
        v = self.w_vs(v).view(sz_b, len_v, n_head, d_v)

        q = q.permute(2, 0, 1, 3).contiguous().view(-1, len_q, d_k) # (n*b) x lq x dk
        k = k.permute(2, 0, 1, 3).contiguous().view(-1, len_k, d_k) # (n*b) x lk x dk
        v = v.permute(2, 0, 1, 3).contiguous().view(-1, len_v, d_v) # (n*b) x lv x dv

        mask = mask.repeat(n_head, 1, 1) # (n*b) x .. x ..
        output, attn = self.attention(q, k, v, mask=mask)

        output = output.view(n_head, sz_b, len_q, d_v)
        output = output.permute(1, 2, 0, 3).contiguous().view(sz_b, len_q, -1) # b x lq x (n*dv)

        output = self.dropout(self.fc(output))
        output = self.layer_norm(output + residual)

        return output, attn
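Finally, ScaledDotProductAttention, instantiated in the constructor above with temperature = sqrt(d_k), lives elsewhere in the transformer package and is not shown in this post. The sketch below implements the standard formulation softmax(QK^T / temperature)V with optional masking, matching the (n_head * batch, len, d) shapes produced by the reshaping in MultiHeadAttention.forward; it is my illustration, not the repo's exact code:

import torch
import torch.nn as nn

class ScaledDotProductAttentionSketch(nn.Module):
    ''' Standard scaled dot-product attention (illustrative sketch). '''

    def __init__(self, temperature, attn_dropout=0.1):
        super().__init__()
        self.temperature = temperature             # usually sqrt(d_k)
        self.dropout = nn.Dropout(attn_dropout)
        self.softmax = nn.Softmax(dim=2)

    def forward(self, q, k, v, mask=None):
        # q: (n*b, len_q, d_k), k: (n*b, len_k, d_k), v: (n*b, len_v, d_v)
        attn = torch.bmm(q, k.transpose(1, 2)) / self.temperature   # (n*b, len_q, len_k)
        if mask is not None:
            attn = attn.masked_fill(mask, -float('inf'))            # blocked positions get zero weight after softmax
        attn = self.dropout(self.softmax(attn))
        output = torch.bmm(attn, v)                                  # (n*b, len_q, d_v)
        return output, attn

With this, the whole chain is covered: Transformer -> Encoder/Decoder -> EncoderLayer/DecoderLayer -> MultiHeadAttention + PositionwiseFeedForward -> scaled dot-product attention.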