In the previous article we used the Transformer, one of the most widely adopted models today. At its core the Transformer is built on the Encoder-Decoder framework, and its decoding procedure is essentially the same as that of a seq2seq model. One important weakness of seq2seq is that it can only generate words from a fixed vocabulary and cannot reproduce out-of-vocabulary words from the source, whereas the PGN (pointer-generator network) handles this well with its copy-and-generate mechanism. This article therefore combines the Transformer's strong feature-extraction ability with the PGN's generation mechanism, in the hope that the two will complement each other. The principles of both models have already been covered in detail in earlier articles in this series; this article focuses on the implementation.
The overall structure of the project (data loading, the training loop, the testing flow, and so on) is largely the same as in the earlier models. Since this article builds directly on the Transformer implementation from the previous one, most of the code is identical; the difference lies in how the PGN mechanism is integrated.
The model architecture is basically the same as the Transformer described earlier: an Encoder, a Decoder, and a final output layer.
```python
class PGN_TRANSFORMER(tf.keras.Model):

    def __init__(self, params):
        super(PGN_TRANSFORMER, self).__init__()

        self.num_blocks = params["num_blocks"]
        self.batch_size = params["batch_size"]
        self.vocab_size = params["vocab_size"]
        self.num_heads = params["num_heads"]

        self.embedding = Embedding(params["vocab_size"], params["d_model"])
        self.encoder = Encoder(params["num_blocks"],
                               params["d_model"],
                               params["num_heads"],
                               params["dff"],
                               params["vocab_size"],
                               params["dropout_rate"])
        self.decoder = Decoder(params["num_blocks"],
                               params["d_model"],
                               params["num_heads"],
                               params["dff"],
                               params["vocab_size"],
                               params["dropout_rate"])
        self.final_layer = tf.keras.layers.Dense(params["vocab_size"])

    def call(self, inp, extended_inp, max_oov_len, tar, training,
             enc_padding_mask, look_ahead_mask, dec_padding_mask):
        # Shared embedding for encoder and decoder inputs
        embed_x = self.embedding(inp)
        embed_dec = self.embedding(tar)

        enc_output = self.encoder(embed_x, training, enc_padding_mask)  # (batch_size, inp_seq_len, d_model)

        # dec_output.shape == (batch_size, tar_seq_len, d_model)
        dec_output, attention_weights, p_gens = self.decoder(
            embed_dec, enc_output, training, look_ahead_mask, dec_padding_mask)

        final_output = self.final_layer(dec_output)  # (batch_size, tar_seq_len, target_vocab_size)
        final_output = tf.nn.softmax(final_output)

        # Encoder-decoder attention of the last decoder block
        attn_dists = attention_weights['decoder_layer{}_block2'.format(self.num_blocks)]
        # (batch_size, num_heads, targ_seq_len, inp_seq_len)
        attn_dists = tf.reduce_sum(attn_dists, axis=1) / self.num_heads  # (batch_size, targ_seq_len, inp_seq_len)

        final_dists = calc_final_dist(extended_inp,
                                      tf.unstack(final_output, axis=1),
                                      tf.unstack(attn_dists, axis=1),
                                      tf.unstack(p_gens, axis=1),
                                      max_oov_len,
                                      self.vocab_size,
                                      self.batch_size)

        outputs = dict(logits=tf.stack(final_dists, 1), attentions=attn_dists)
        return outputs
```
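To make the interface concrete, here is a minimal, hypothetical usage sketch. The params values, sequence lengths and mask construction are illustrative assumptions (the real values come from the project's config and batcher), and Encoder, Decoder and Embedding are the project classes referenced above:

```python
import tensorflow as tf

# Hypothetical hyper-parameters; the real values come from the project config.
params = {
    "num_blocks": 2, "d_model": 128, "num_heads": 8, "dff": 512,
    "vocab_size": 30000, "dropout_rate": 0.1, "batch_size": 4,
}
model = PGN_TRANSFORMER(params)

batch, inp_len, tar_len = 4, 50, 40
inp = tf.random.uniform((batch, inp_len), maxval=params["vocab_size"], dtype=tf.int32)
extended_inp = inp                 # identical when the batch contains no OOV words
max_oov_len = tf.constant(0)       # number of in-batch OOV words (0 in this toy batch)
tar = tf.random.uniform((batch, tar_len), maxval=params["vocab_size"], dtype=tf.int32)

# Standard Transformer masks are assumed here: padding masks of shape
# (batch, 1, 1, inp_len) and a (tar_len, tar_len) look-ahead mask.
enc_padding_mask = tf.zeros((batch, 1, 1, inp_len))
dec_padding_mask = tf.zeros((batch, 1, 1, inp_len))
look_ahead_mask = 1 - tf.linalg.band_part(tf.ones((tar_len, tar_len)), -1, 0)

outputs = model(inp, extended_inp, max_oov_len, tar, True,
                enc_padding_mask, look_ahead_mask, dec_padding_mask)
print(outputs["logits"].shape)     # (batch, tar_len, vocab_size + max_oov_len)
```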
The difference from the plain Transformer lies in the call function: during decoding, the decoder must additionally return the generation probability p_gen and the attention distribution of the last decoder block. When the final distribution is computed (calc_final_dist), the extended vocabulary distribution and the attention (copy) scores are combined.
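For reference, this is the standard pointer-generator mixture from See et al. (2017), which calc_final_dist below implements: p_gen interpolates between the vocabulary distribution and the copy distribution over the extended vocabulary,

$$P_{\text{final}}(w) = p_{\text{gen}}\, P_{\text{vocab}}(w) + (1 - p_{\text{gen}}) \sum_{i:\, w_i = w} a_i$$

where $a_i$ is the attention weight on the $i$-th source token at the current decoder step.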
```python
def calc_final_dist(_enc_batch_extend_vocab, vocab_dists, attn_dists,
                    p_gens, batch_oov_len, vocab_size, batch_size):
    """Calculate the final distribution for the pointer-generator model.

    Args:
        vocab_dists: The vocabulary distributions. List length max_dec_steps of
            (batch_size, vsize) arrays. The words are in the order they appear
            in the vocabulary file.
        attn_dists: The attention distributions. List length max_dec_steps of
            (batch_size, attn_len) arrays.

    Returns:
        final_dists: The final distributions. List length max_dec_steps of
            (batch_size, extended_vsize) arrays.
    """
    # Multiply vocab dists by p_gen and attention dists by (1 - p_gen)
    vocab_dists = [p_gen * dist for (p_gen, dist) in zip(p_gens, vocab_dists)]
    attn_dists = [(1 - p_gen) * dist for (p_gen, dist) in zip(p_gens, attn_dists)]

    # Concatenate some zeros to each vocabulary dist, to hold the probabilities
    # for in-article OOV words.
    # extended_size is the maximum (over the batch) size of the extended vocabulary.
    extended_size = vocab_size + batch_oov_len
    extra_zeros = tf.zeros((batch_size, batch_oov_len))
    # list length max_dec_steps of shape (batch_size, extended_size)
    vocab_dists_extended = [tf.concat(axis=1, values=[dist, extra_zeros])
                            for dist in vocab_dists]

    # Project the values in the attention distributions onto the appropriate
    # entries in the final distributions. This means that if a_i = 0.1 and the
    # ith encoder word is w, and w has index 500 in the vocabulary, then we add
    # 0.1 onto the 500th entry of the final distribution. This is done for each
    # decoder timestep.
    # This is fiddly; we use tf.scatter_nd to do the projection.
    batch_nums = tf.range(0, limit=batch_size)       # shape (batch_size)
    batch_nums = tf.expand_dims(batch_nums, 1)       # shape (batch_size, 1)
    attn_len = tf.shape(_enc_batch_extend_vocab)[1]  # number of states we attend over
    batch_nums = tf.tile(batch_nums, [1, attn_len])  # shape (batch_size, attn_len)
    indices = tf.stack((batch_nums, _enc_batch_extend_vocab), axis=2)  # shape (batch_size, enc_t, 2)
    shape = [batch_size, extended_size]
    # list length max_dec_steps of (batch_size, extended_size)
    attn_dists_projected = [tf.scatter_nd(indices, copy_dist, shape)
                            for copy_dist in attn_dists]

    # Add the vocab distributions and the copy distributions together to get
    # the final distributions. final_dists is a list length max_dec_steps; each
    # entry is a tensor of shape (batch_size, extended_size) giving the final
    # distribution for that decoder timestep.
    # Note that for decoder timesteps and examples corresponding to a [PAD]
    # token, this is junk - ignore.
    final_dists = [vocab_dist + copy_dist for (vocab_dist, copy_dist)
                   in zip(vocab_dists_extended, attn_dists_projected)]

    return final_dists
```
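The trickiest step above is the projection of the copy distribution onto the extended vocabulary. A tiny toy run of tf.scatter_nd with made-up numbers shows what the indices and shape arguments do, including how a source word that appears twice has its attention mass summed:

```python
import tensorflow as tf

# Toy setting: batch_size=1, a 3-word source whose ids in the extended vocab
# are [4, 0, 4] (the word with id 4 appears twice), extended vocab size 6.
enc_batch_extend_vocab = tf.constant([[4, 0, 4]])
copy_dist = tf.constant([[0.5, 0.2, 0.3]])      # one decoder step's attention

batch_nums = tf.tile(tf.expand_dims(tf.range(1), 1), [1, 3])       # (1, 3)
indices = tf.stack((batch_nums, enc_batch_extend_vocab), axis=2)   # (1, 3, 2)

projected = tf.scatter_nd(indices, copy_dist, shape=[1, 6])
print(projected.numpy())  # [[0.2 0.  0.  0.  0.8 0. ]] -- repeated ids are summed
```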
The Decoder differs from the plain Transformer Decoder in the computation of the context vector and of the generation probability p_gen.
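Concretely, p_gen is the standard pointer-generator switch, computed from the decoder input $x_t$, the decoder output $s_t$ (out in the code) and the context vector $h_t^*$; the Wx, Ws, Wh and V Dense layers in the code below implement

$$p_{\text{gen}} = \sigma\big(V(W_x x_t + W_s s_t + W_h h_t^*)\big)$$

with the bias terms folded into the Dense layers.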
```python
class Decoder(tf.keras.layers.Layer):

    def __init__(self, num_layers, d_model, num_heads, dff, target_vocab_size, rate=0.1):
        super(Decoder, self).__init__()

        self.d_model = d_model
        self.num_layers = num_layers
        self.num_heads = num_heads
        self.depth = self.d_model // self.num_heads

        self.dec_layers = [DecoderLayer(d_model, num_heads, dff, rate)
                           for _ in range(num_layers)]
        self.dropout = tf.keras.layers.Dropout(rate)

        # Linear layers used to compute the generation probability p_gen
        self.Wh = tf.keras.layers.Dense(1)
        self.Ws = tf.keras.layers.Dense(1)
        self.Wx = tf.keras.layers.Dense(1)
        self.V = tf.keras.layers.Dense(1)

    def call(self, x, enc_output, training, look_ahead_mask, padding_mask):
        attention_weights = {}
        out = self.dropout(x, training=training)

        for i in range(self.num_layers):
            out, block1, block2 = self.dec_layers[i](out, enc_output, training,
                                                     look_ahead_mask, padding_mask)
            attention_weights['decoder_layer{}_block1'.format(i + 1)] = block1
            attention_weights['decoder_layer{}_block2'.format(i + 1)] = block2

        # out.shape == (batch_size, target_seq_len, d_model)

        # Context vectors: weight the encoder outputs by the encoder-decoder
        # attention of the last block, head by head.
        enc_out_shape = tf.shape(enc_output)
        context = tf.reshape(enc_output, (enc_out_shape[0], enc_out_shape[1],
                                          self.num_heads, self.depth))  # (batch_size, input_seq_len, num_heads, depth)
        context = tf.transpose(context, [0, 2, 1, 3])   # (batch_size, num_heads, input_seq_len, depth)
        context = tf.expand_dims(context, axis=2)       # (batch_size, num_heads, 1, input_seq_len, depth)

        attn = tf.expand_dims(block2, axis=-1)          # (batch_size, num_heads, target_seq_len, input_seq_len, 1)
        context = context * attn                        # (batch_size, num_heads, target_seq_len, input_seq_len, depth)
        context = tf.reduce_sum(context, axis=3)        # (batch_size, num_heads, target_seq_len, depth)
        context = tf.transpose(context, [0, 2, 1, 3])   # (batch_size, target_seq_len, num_heads, depth)
        context = tf.reshape(context, (tf.shape(context)[0], tf.shape(context)[1],
                                       self.d_model))   # (batch_size, target_seq_len, d_model)

        # p_gen: computed from the decoder input, the decoder output
        # and the context vector.
        a = self.Wx(x)
        b = self.Ws(out)
        c = self.Wh(context)
        p_gens = tf.sigmoid(self.V(a + b + c))

        return out, attention_weights, p_gens
```
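As a side note, the reshape/transpose/reduce_sum sequence that builds the context vectors can be written more compactly with tf.einsum. This is only an equivalent reformulation for readability, not part of the original project code:

```python
import tensorflow as tf

def context_vectors(enc_output, attn, num_heads, depth, d_model):
    """Equivalent, more compact computation of the per-head context vectors.

    enc_output: (batch, input_seq_len, d_model)
    attn:       (batch, num_heads, target_seq_len, input_seq_len)
    returns:    (batch, target_seq_len, d_model)
    """
    shape = tf.shape(enc_output)
    # Split the model dimension into heads: (batch, input_seq_len, num_heads, depth)
    enc = tf.reshape(enc_output, (shape[0], shape[1], num_heads, depth))
    # Weighted sum over input positions, per head: (batch, target_seq_len, num_heads, depth)
    ctx = tf.einsum('bhti,bihd->bthd', attn, enc)
    # Merge the heads back into d_model
    return tf.reshape(ctx, (shape[0], tf.shape(attn)[2], d_model))
```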
GitHub code