I did not build this to actually train and run a model; I made it to deepen my understanding by implementing, in code, what I had studied earlier.
So please keep in mind that it may well fall apart the moment real data is fed in and training starts...
1. Implementing Attention
$$\mathrm{Attention}(Q, K, V) = \mathrm{softmax}\!\left(\frac{QK^{T}}{\sqrt{d_{k}}}\right)V$$
Implementing this formula as written is enough. Here, $d_{k}$ is the dimension of the key vectors.
The code below simply follows the formula step by step; how the masks themselves are constructed is covered later, in the padding-mask and look-ahead-mask sections.
import tensorflow as tf
from tensorflow.keras import layers

def scaled_dot_product_attention(query, key, value, mask=None):
    # QK^T
    matmul_qk = tf.matmul(query, key, transpose_b=True)  # transpose the second of the two operands (key)
    # QK^T / sqrt(d_k)
    d_k = tf.cast(tf.shape(key)[-1], tf.float32)
    scaled_attention_logits = matmul_qk / tf.math.sqrt(d_k)
    # if a mask is given, push the masked positions (marked 1) to a large negative value
    # so that softmax assigns them (near-)zero weight
    if mask is not None:
        scaled_attention_logits += mask * -1e9
    # softmax
    attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)
    # multiply with V
    output = tf.matmul(attention_weights, value)
    return output, attention_weights
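A quick shape check with toy tensors (the sizes here are arbitrary, just a minimal sketch):

q = tf.random.uniform((2, 4, 8))
k = tf.random.uniform((2, 4, 8))
v = tf.random.uniform((2, 4, 8))
out, weights = scaled_dot_product_attention(q, k, v)
print(out.shape)      # (2, 4, 8)
print(weights.shape)  # (2, 4, 4) -- one attention distribution per query position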
2. Implementing Multi-Head Attention
Multi-head attention calls the attention implemented above so that self-attention is computed independently by several heads. Since this class will itself be treated as a single layer, it inherits from tf.keras.layers.Layer.
class MultiHeadAttention(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.d_model = d_model
        # d_model % num_heads must always be 0 so the model dimension splits evenly across heads
        assert self.d_model % self.num_heads == 0, "d_model must be divisible by num_heads"
        self.depth = self.d_model // self.num_heads
        # projection weights that produce q, k, v
        self.wq = tf.keras.layers.Dense(d_model)
        self.wk = tf.keras.layers.Dense(d_model)
        self.wv = tf.keras.layers.Dense(d_model)
        self.dense = tf.keras.layers.Dense(d_model)

    def split_heads(self, x, batch_size):
        # split d_model across the heads so the input becomes multi-headed
        # shape = (batch_size, seq_len, num_heads, d_model // num_heads)
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    # forward pass of the multi-head attention layer
    def call(self, query, key, value, mask=None):
        batch_size = tf.shape(query)[0]
        # compute Q, K, V
        query = self.wq(query)
        key = self.wk(key)
        value = self.wv(value)
        # split and reshape to match the heads
        query = self.split_heads(query, batch_size)
        key = self.split_heads(key, batch_size)
        value = self.split_heads(value, batch_size)
        # compute the attention scores
        scaled_attention, attention_weights = scaled_dot_product_attention(query, key, value, mask)
        # restore the axis order that was swapped to make the computation easier
        scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])
        concat_attention = tf.reshape(scaled_attention, (batch_size, -1, self.d_model))
        output = self.dense(concat_attention)
        return output, attention_weights
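A quick shape check, with d_model=512 and num_heads=8 chosen arbitrarily:

mha = MultiHeadAttention(d_model=512, num_heads=8)
x = tf.random.uniform((2, 10, 512))   # (batch, seq_len, d_model)
out, weights = mha(x, x, x)           # self-attention: Q, K, V all come from x
print(out.shape)      # (2, 10, 512) -- d_model is preserved
print(weights.shape)  # (2, 8, 10, 10) -- one seq_len x seq_len map per head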
3. Implementing the Feed-Forward Network
The last stage of both the Encoder and the Decoder is a feed-forward layer.
class PositionwiseFeedForward(tf.keras.layers.Layer):
    def __init__(self, d_model, d_ffn=2048):
        super(PositionwiseFeedForward, self).__init__()
        self.d_model = d_model
        self.d_ffn = d_ffn
        # expand to d_ffn with a ReLU, then project back down to d_model
        self.dense1 = layers.Dense(self.d_ffn, activation="relu")
        self.dense2 = layers.Dense(self.d_model)

    def call(self, x):
        x = self.dense1(x)
        x = self.dense2(x)
        return x
- The feed-forward network is built simply by chaining two fully connected layers (see the formula below).
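For reference, this is the position-wise feed-forward equation from the Transformer paper: the first layer expands to $d_{ffn}$ with a ReLU, the second projects back to $d_{model}$:

$$\mathrm{FFN}(x) = \max(0,\; xW_{1} + b_{1})\,W_{2} + b_{2}$$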
4. Implementing the Encoder
4.1. Encoder layer
Looking at the Encoder layer we need to implement, there is a multi-head attention followed by a normalization layer, which then leads into a feed-forward layer. Residual connections add each sub-layer's input to its output.
Since multi-head attention and the feed-forward network were implemented in the previous steps, expressing this in code looks like the following.
class EncoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, d_ffn, dropout_rate=0.1):
        super(EncoderLayer, self).__init__()
        # declare the layers to use
        # Multi-Head Attention
        self.multi_head_attention = MultiHeadAttention(d_model, num_heads)
        # FeedForward Network
        self.feedforward = PositionwiseFeedForward(d_model, d_ffn)
        # Dropout
        self.dropout1 = layers.Dropout(dropout_rate)
        self.dropout2 = layers.Dropout(dropout_rate)
        # Layer Normalization
        self.layer_norm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layer_norm2 = layers.LayerNormalization(epsilon=1e-6)

    def call(self, x, mask, training):
        # run multi-head attention
        attn_output, _ = self.multi_head_attention(x, x, x, mask)
        # dropout
        attn_output = self.dropout1(attn_output, training=training)
        # residual connection: add the input x to the attention output, then normalize
        out1 = self.layer_norm1(x + attn_output)
        # feed forward network
        ffn_output = self.feedforward(out1)
        # dropout
        ffn_output = self.dropout2(ffn_output, training=training)
        # residual connection: add the attention output out1 to the feed-forward output, then normalize
        out2 = self.layer_norm2(out1 + ffn_output)
        return out2
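As a quick sanity check (the sizes are arbitrary), an encoder layer should leave the (batch, seq_len, d_model) shape unchanged:

enc_layer = EncoderLayer(d_model=512, num_heads=8, d_ffn=2048)
x = tf.random.uniform((2, 10, 512))
out = enc_layer(x, None, training=False)
print(out.shape)  # (2, 10, 512)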
Now the encoder is built by stacking several of these encoder layers.
First, the encoder input has to be embedded, which converts the input tokens into dense vectors of a fixed size.
The positional encoding is then added to the embedding, and the result is used as the input to the stack of encoder layers.
class Encoder(tf.keras.layers.Layer):
    def __init__(
        self, num_layers, d_model, num_heads, d_ffn, vocab_size, dropout_rate=0.1
    ):
        super(Encoder, self).__init__()
        # input embedding (e.g. vocab_size = 10000)
        self.embedding = tf.keras.layers.Embedding(
            input_dim=vocab_size, output_dim=d_model
        )
        # Positional Encoding: a fixed, non-trainable table (see the sinusoidal sketch below)
        self.pos_encoding = self.add_weight(
            "pos_encoding", shape=[1, 10000, d_model], trainable=False
        )
        self.encoder_layers = [
            EncoderLayer(d_model, num_heads, d_ffn, dropout_rate)
            for _ in range(num_layers)
        ]
        self.dropout = tf.keras.layers.Dropout(dropout_rate)

    def call(self, x, mask, training):
        # embedding + positional encoding
        x = self.embedding(x) + self.pos_encoding[:, : tf.shape(x)[1], :]
        x = self.dropout(x, training=training)
        for layer in self.encoder_layers:
            x = layer(x, mask, training)
        return x
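Note that the add_weight call above only reserves a fixed, non-trainable table filled by the default initializer; it is not the sinusoidal encoding from the paper. As a minimal sketch (the helper name positional_encoding and the max_len parameter are my own), the table could instead be precomputed like this and assigned to self.pos_encoding in place of the add_weight call:

import numpy as np

def positional_encoding(max_len, d_model):
    # angle for each (position, dimension) pair: pos / 10000^(2i / d_model)
    pos = np.arange(max_len)[:, np.newaxis]   # (max_len, 1)
    i = np.arange(d_model)[np.newaxis, :]     # (1, d_model)
    angles = pos / np.power(10000.0, (2 * (i // 2)) / np.float32(d_model))
    # sine on even indices, cosine on odd indices
    angles[:, 0::2] = np.sin(angles[:, 0::2])
    angles[:, 1::2] = np.cos(angles[:, 1::2])
    return tf.cast(angles[np.newaxis, ...], tf.float32)  # (1, max_len, d_model)

With this, self.pos_encoding = positional_encoding(10000, d_model) would give the paper's encoding while keeping the rest of the Encoder unchanged.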
4.2. Encoder Padding Mask
def create_padding_mask(seq):
    mask = tf.cast(tf.math.equal(seq, 0), tf.float32)
    return mask[:, tf.newaxis, tf.newaxis, :]  # shape = (batch_size, 1, 1, seq_len)
This is the padding introduced to equalize sequence lengths: the Transformer processes sequences of one fixed length per batch, so lengths cannot be ragged, and shorter sequences are filled with pad tokens. The mask marks those pad positions so attention cannot attend to them.
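A toy example of what the mask looks like, assuming token id 0 is the pad id (as the code above does):

seq = tf.constant([[7, 6, 0, 0],
                   [1, 2, 3, 0]])
print(create_padding_mask(seq))
# shape (2, 1, 1, 4); pad positions are marked with 1:
# first sequence  -> [0. 0. 1. 1.]
# second sequence -> [0. 0. 0. 1.]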
5. Implementing the Decoder
5.1. Decoder Layer
A decoder layer consists of two multi-head attentions and one feed-forward network. The first multi-head attention is called masked multi-head attention: since the decoder's input is the target sequence, the mask keeps the model from seeing any information beyond the position it currently has to predict. Seeing the future would be no different from already knowing the answer.
class DecoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, d_ffn, dropout_rate=0.1):
        super(DecoderLayer, self).__init__()
        # Masked Multi-Head Attention
        self.masked_multi_head_attention = MultiHeadAttention(
            d_model=d_model, num_heads=num_heads
        )
        # Multi-Head Attention
        self.multi_head_attention = MultiHeadAttention(
            d_model=d_model, num_heads=num_heads
        )
        # FeedForward Network
        self.feedforward = PositionwiseFeedForward(d_model=d_model, d_ffn=d_ffn)
        # Dropout
        self.dropout1 = layers.Dropout(dropout_rate)
        self.dropout2 = layers.Dropout(dropout_rate)
        self.dropout3 = layers.Dropout(dropout_rate)
        # Layer Normalization
        self.layer_norm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layer_norm2 = layers.LayerNormalization(epsilon=1e-6)
        self.layer_norm3 = layers.LayerNormalization(epsilon=1e-6)

    def call(self, x, encoder_output, look_ahead_mask, padding_mask, training):
        # 1) masked self-attention over the target sequence
        attn1, _ = self.masked_multi_head_attention(x, x, x, mask=look_ahead_mask)
        attn1 = self.dropout1(attn1, training=training)
        out1 = self.layer_norm1(x + attn1)
        # 2) cross-attention: queries from the decoder, keys/values from the encoder output
        attn2, _ = self.multi_head_attention(
            out1, encoder_output, encoder_output, mask=padding_mask
        )
        attn2 = self.dropout2(attn2, training=training)
        out2 = self.layer_norm2(out1 + attn2)
        # 3) feed-forward network with its own residual connection + norm
        ffn_output = self.feedforward(out2)
        ffn_output = self.dropout3(ffn_output, training=training)
        out3 = self.layer_norm3(out2 + ffn_output)
        return out3
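A shape check for a single decoder layer; the sizes and the random stand-in for encoder_output are arbitrary:

dec_layer = DecoderLayer(d_model=512, num_heads=8, d_ffn=2048)
y = tf.random.uniform((2, 12, 512))        # target-side input
enc_out = tf.random.uniform((2, 10, 512))  # pretend encoder output
out = dec_layer(y, enc_out, None, None, training=False)
print(out.shape)  # (2, 12, 512)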
5.2. Decoder padding mask
def create_padding_mask(seq):
    mask = tf.cast(tf.math.equal(seq, 0), tf.float32)
    return mask[:, tf.newaxis, tf.newaxis, :]  # shape = (batch_size, 1, 1, seq_len)
This part is the same as the encoder padding mask, only built from the decoder's input sequence. Just keep in mind that the decoder input is the target sequence shifted right, so it is one token shorter than the full target.
5.3. Look-ahead mask
def create_look_ahead_mask(size):
    """
    tf.ones((4, 4)) -> [[1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1]]
    tf.linalg.band_part with
        num_lower = -1 : keep every element below the diagonal
        num_upper = 0  : zero out every element above the diagonal
        -> [[1, 0, 0, 0], [1, 1, 0, 0], [1, 1, 1, 0], [1, 1, 1, 1]]
    1 - tf.linalg.band_part(...) : subtract from 1 to flip the values
        -> [[0, 1, 1, 1], [0, 0, 1, 1], [0, 0, 0, 1], [0, 0, 0, 0]]
    Positions marked 1 are masked: they later get -1e9 added to their attention
    logits, so they cannot be attended to.
    """
    mask = 1 - tf.linalg.band_part(tf.ones((size, size)), num_lower=-1, num_upper=0)
    return mask
The look-ahead mask keeps the decoder from seeing information that lies further in the future than the value it has to predict during training. When the decoder runs its first multi-head attention, the masked (future) positions have their logits pushed to a very large negative value, so their attention weights become effectively zero and they cannot influence the prediction.
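One thing to note before the final assembly: the Transformer class below uses a Decoder class that is not shown in this post. A minimal sketch, assuming it simply mirrors the Encoder (embedding + positional encoding + a stack of DecoderLayers), could look like this:

class Decoder(tf.keras.layers.Layer):
    def __init__(
        self, num_layers, d_model, num_heads, d_ffn, vocab_size, dropout_rate=0.1
    ):
        super(Decoder, self).__init__()
        # target-side embedding
        self.embedding = tf.keras.layers.Embedding(
            input_dim=vocab_size, output_dim=d_model
        )
        # fixed positional-encoding table, same scheme as the Encoder
        self.pos_encoding = self.add_weight(
            "pos_encoding", shape=[1, 10000, d_model], trainable=False
        )
        self.decoder_layers = [
            DecoderLayer(d_model, num_heads, d_ffn, dropout_rate)
            for _ in range(num_layers)
        ]
        self.dropout = tf.keras.layers.Dropout(dropout_rate)

    def call(self, x, encoder_output, look_ahead_mask, padding_mask, training):
        # embedding + positional encoding
        x = self.embedding(x) + self.pos_encoding[:, : tf.shape(x)[1], :]
        x = self.dropout(x, training=training)
        for layer in self.decoder_layers:
            x = layer(x, encoder_output, look_ahead_mask, padding_mask, training)
        return x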
6. Transformer
class Transformer(tf.keras.Model):
    def __init__(
        self,
        num_enc_layers,
        num_dec_layers,
        d_model,
        num_heads,
        d_ffn,
        input_vocab_size,
        target_vocab_size,
        max_pos_enc,
        dropout_rate=0.1,
    ):
        super(Transformer, self).__init__()
        # Encoder
        self.encoder = Encoder(
            num_enc_layers, d_model, num_heads, d_ffn, input_vocab_size, dropout_rate
        )
        # Decoder
        self.decoder = Decoder(
            num_dec_layers, d_model, num_heads, d_ffn, target_vocab_size, dropout_rate
        )
        # output
        self.dense = layers.Dense(target_vocab_size)

    def call(
        self,
        enc_input,
        dec_input,
        encoder_padding_mask,
        look_ahead_mask,
        decoder_padding_mask,
        training,
    ):
        # Encoder
        encoder_output = self.encoder(enc_input, encoder_padding_mask, training)
        # Decoder
        decoder_output = self.decoder(
            dec_input, encoder_output, look_ahead_mask, decoder_padding_mask, training
        )
        # output
        output = self.dense(decoder_output)
        return output
All that is left is to stack the layers implemented so far in order.
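For anyone who wants to try wiring the pieces together, here is a minimal sketch of how the masks and the model could be connected. The hyperparameters and the tf.maximum combination of the look-ahead and padding masks are assumptions following the common TensorFlow tutorial pattern; the shapes in the comments are what the code above should produce, not verified training output.

sample_transformer = Transformer(
    num_enc_layers=2, num_dec_layers=2, d_model=128, num_heads=8, d_ffn=512,
    input_vocab_size=8500, target_vocab_size=8000, max_pos_enc=10000,
)

enc_input = tf.random.uniform((4, 38), minval=1, maxval=8500, dtype=tf.int64)
dec_input = tf.random.uniform((4, 36), minval=1, maxval=8000, dtype=tf.int64)

# encoder self-attention only needs the padding mask
enc_padding_mask = create_padding_mask(enc_input)
# decoder self-attention: combine the look-ahead and padding masks element-wise
look_ahead = create_look_ahead_mask(tf.shape(dec_input)[1])
combined_mask = tf.maximum(create_padding_mask(dec_input), look_ahead)
# decoder cross-attention masks the encoder's pad tokens
dec_padding_mask = create_padding_mask(enc_input)

out = sample_transformer(
    enc_input, dec_input, enc_padding_mask, combined_mask, dec_padding_mask,
    training=False,
)
print(out.shape)  # (4, 36, 8000) -- logits over the target vocabulary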
To be honest, I did not really test(...) the model I built; the point was to implement the architecture in code.
If I ever get hold of data and train it, I will add that as well.