我在 Transformer 中实现了多头注意力(Multi-Head Attention)。网上的实现版本太多,让人很困惑。有人可以帮我验证一下我的实现是否正确吗:
DotProductAttention 引用自:https://www.tensorflow.org/tutorials/text/transformer#setup
import tensorflow as tf
def scaled_dot_product(q, k, v):
    """Compute scaled dot-product attention: softmax(Q K^T / sqrt(d_k)) V.

    Args:
        q: Query tensor; its last dimension must match the last dimension
            of ``k`` so that ``q @ k^T`` is defined.
        k: Key tensor; its second-to-last dimension must match that of ``v``.
        v: Value tensor.

    Returns:
        The attention output ``softmax(q @ k^T / sqrt(d_k)) @ v``. Its
        leading dimensions follow ``q`` and its last dimension follows ``v``
        (for self-attention with equal depths this is the same shape as
        ``q``, ``k`` and ``v``).
    """
    # Raw attention scores: Q . K(transpose).
    qkt = tf.matmul(q, k, transpose_b=True)
    # Scaling factor sqrt(d_k). The dynamic shape is used so this also works
    # when the static last dimension is unknown, and the cast targets the
    # score dtype because TensorFlow does not auto-promote mixed dtypes
    # (a hard-coded float32 would break float16/float64 inputs).
    dk = tf.math.sqrt(tf.cast(tf.shape(k)[-1], dtype=qkt.dtype))
    scaled_qkt = qkt / dk
    # Normalise over the key axis so each query's weights sum to 1.
    attention_weights = tf.nn.softmax(scaled_qkt, axis=-1)
    # Weighted sum of the values.
    z = tf.matmul(attention_weights, v)
    return z
class MultiAttention(tf.keras.layers.Layer):
def __init__(self,d_model,num_of_heads):
super(MultiAttention,self).__init__()
self.d_model = d_model
self.num_of_heads = num_of_heads
self.depth = d_model//num_of_heads
self.wq = [tf.keras.layers.Dense(self.depth) for i in range(num_of_heads)]
self.wk = [tf.keras.layers.Dense(self.depth) for i in range(num_of_heads)]
self.wv = [tf.keras.layers.Dense(self.depth) for i in …Run Code Online (Sandbox Code Playgroud)