假设我们正在读一本书,并且这本书里的每个词都需要被理解和记住。接下来,我们将通过一个具体的例子和代码片段来说明Transformer编码器的工作流程。
场景:
想象你正在阅读一本关于动物行为学的书,其中有一句话:“猫追了狗。”我们要用Transformer编码器来理解这句话,并把它转换成机器可以处理的形式。
编码器工作流程
1. 把单词变成数字(嵌入与位置编码)
首先,我们需要将每个单词转换为计算机可以处理的数字形式。这包括两个步骤:
- 词汇嵌入(Embedding):将每个单词转换为一个向量,这个向量能够捕捉到单词的语义信息。
- 位置编码(Positional Encoding):为了保留单词在句子中的顺序信息,我们给每个单词添加一个位置编码。
import torch
import torch.nn as nn
import math
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a sequence-first tensor.

    The encoding table is built once in the constructor and stored as the
    non-trainable buffer ``pe`` with shape ``(max_len, 1, d_model)``.

    Args:
        d_model: Size of the last (feature) dimension of the input.
        dropout: Dropout probability applied after the encodings are added.
        max_len: Number of positions for which encodings are precomputed.
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)
        )

        pe = torch.zeros(max_len, d_model)
        # Even feature indices hold sines, odd feature indices hold cosines.
        pe[:, 0::2] = torch.sin(position * div_term)
        # With an odd d_model there is one fewer cosine column than sine
        # columns, so the frequency vector is trimmed to the number of odd
        # indices. For an even d_model the slice is a no-op.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])

        # Shape (max_len, 1, d_model): the middle axis broadcasts in forward().
        self.register_buffer('pe', pe.unsqueeze(1))

    def forward(self, x):
        """Return ``dropout(x + pe[:x.size(0)])``.

        Positions are taken along dim 0 of ``x``, so ``x`` is expected to be
        laid out as ``(seq_len, batch, d_model)``.
        """
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)
# Assume a simple tokenizer and vocabulary.
vocab = {'<pad>': 0, '猫': 1, '追': 2, '了': 3, '狗': 4}
# Token ids for the example sentence; shape (batch=1, seq_len=4).
input_ids = torch.tensor([[1, 2, 3, 4]])
# Initialise the embedding layer and the positional-encoding layer.
d_model = 512
embedding_layer = nn.Embedding(len(vocab), d_model)
positional_encoding_layer = PositionalEncoding(d_model)
# Look up the token embeddings: shape (1, 4, d_model).
embedded_input = embedding_layer(input_ids)
# PositionalEncoding picks positions along dim 0, but embedded_input keeps the
# batch on dim 0. Passing it directly would give every token the encoding of
# position 0, so swap to (seq_len, batch, d_model) for the call and swap back.
encoded_input = positional_encoding_layer(
    embedded_input.transpose(0, 1)
).transpose(0, 1)
2. 多头自注意力机制(Multi-Head Self-Attention)
接下来,编码器会尝试理解每个单词与其他所有单词之间的关系。它会问自己:“这个单词和其他哪些单词有联系?” 这个过程就像你在阅读时,不仅关注当前的词,还会思考它与其他词的关系。
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention over batch-first tensors.

    The feature dimension is split into ``num_heads`` chunks of ``head_dim``
    features. Each chunk is projected by the shared per-head ``queries`` /
    ``keys`` / ``values`` linear layers, attended independently, and the heads
    are concatenated and passed through ``fc_out``.

    Args:
        embed_size: Size of the feature dimension of the inputs and output.
        num_heads: Number of attention heads; must divide ``embed_size``.

    Raises:
        ValueError: If ``num_heads`` is not positive or does not divide
            ``embed_size``.
    """

    def __init__(self, embed_size, num_heads):
        super().__init__()
        if num_heads <= 0 or embed_size % num_heads != 0:
            raise ValueError(
                f"embed_size ({embed_size}) must be divisible by "
                f"a positive num_heads ({num_heads})"
            )
        self.embed_size = embed_size
        self.num_heads = num_heads
        self.head_dim = embed_size // num_heads
        self.values = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.keys = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.queries = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.fc_out = nn.Linear(embed_size, embed_size)

    def forward(self, values, keys, query, mask):
        """Attend ``query`` over ``keys``/``values``.

        Args:
            values: Tensor reshaped here as ``(N, value_len, embed_size)``.
            keys: Tensor reshaped here as ``(N, key_len, embed_size)``.
            query: Tensor reshaped here as ``(N, query_len, embed_size)``.
            mask: ``None``, or a tensor broadcastable to
                ``(N, num_heads, query_len, key_len)``; positions where it
                equals 0 are excluded from attention.

        Returns:
            Tensor of shape ``(N, query_len, embed_size)``.
        """
        N = query.shape[0]
        value_len, key_len, query_len = values.shape[1], keys.shape[1], query.shape[1]

        # Split the feature axis into heads, then apply the learned per-head
        # projections (previously created but never used).
        values = self.values(values.reshape(N, value_len, self.num_heads, self.head_dim))
        keys = self.keys(keys.reshape(N, key_len, self.num_heads, self.head_dim))
        queries = self.queries(query.reshape(N, query_len, self.num_heads, self.head_dim))

        # Dot product of every query with every key, per head, scaled by the
        # square root of the per-head key dimension.
        energy = torch.einsum("nqhd,nkhd->nhqk", queries, keys)
        energy = energy / (self.head_dim ** 0.5)

        if mask is not None:
            # Use the most negative finite value of the working dtype so the
            # fill stays representable in reduced-precision dtypes too.
            energy = energy.masked_fill(mask == 0, torch.finfo(energy.dtype).min)

        attention = torch.softmax(energy, dim=3)

        # Weighted sum of values, then merge the heads back together.
        out = torch.einsum("nhql,nlhd->nqhd", attention, values).reshape(
            N, query_len, self.embed_size
        )
        return self.fc_out(out)
# Run the encoded sentence through multi-head self-attention: the same tensor
# serves as values, keys and query, and no mask is applied.
num_heads = 8
multi_head_attention = MultiHeadAttention(embed_size=d_model, num_heads=num_heads)
attention_output = multi_head_attention(
    values=encoded_input,
    keys=encoded_input,
    query=encoded_input,
    mask=None,
)
3. 前馈神经网络(Feed-Forward Neural Network, FFNN)
经过自注意力机制后,编码器会对每个单词进行进一步的数学处理,以捕捉更复杂的模式。
class FeedForward(nn.Module):
    """Two-layer feed-forward network applied to the last dimension.

    Expands the features to ``forward_expansion * embed_size``, applies ReLU
    and dropout, then projects back to ``embed_size``.

    Args:
        embed_size: Size of the input and output feature dimension.
        forward_expansion: Multiplier giving the hidden layer width.
        dropout: Dropout probability applied to the hidden activations.
    """

    def __init__(self, embed_size, forward_expansion, dropout):
        super().__init__()
        hidden_size = forward_expansion * embed_size
        self.linear1 = nn.Linear(embed_size, hidden_size)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(hidden_size, embed_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return ``linear2(dropout(relu(linear1(x))))``."""
        hidden = self.linear1(x)
        hidden = self.relu(hidden)
        hidden = self.dropout(hidden)
        return self.linear2(hidden)
# Feed the attention output through the feed-forward network.
forward_expansion = 4
feed_forward = FeedForward(
    embed_size=d_model,
    forward_expansion=forward_expansion,
    dropout=0.1,
)
ffnn_output = feed_forward(attention_output)
4. 残差连接与层归一化(Residual Connections & Layer Normalization)
为了确保信息不会在传递过程中丢失,编码器会在每个步骤后添加一条直接的路径(残差连接),并且对数据进行调整(层归一化),以保持稳定性和一致性。
class EncoderLayer(nn.Module):
    """One encoder block: self-attention and feed-forward sub-layers.

    Each sub-layer output is added to its input (residual connection),
    layer-normalised, and then passed through dropout.

    Args:
        embed_size: Size of the feature dimension.
        num_heads: Number of heads for the self-attention sub-layer.
        forward_expansion: Hidden-width multiplier of the feed-forward part.
        dropout: Dropout probability used in this layer and its feed-forward
            sub-layer.
    """

    def __init__(self, embed_size, num_heads, forward_expansion, dropout):
        super().__init__()
        self.attention = MultiHeadAttention(embed_size, num_heads)
        self.norm1 = nn.LayerNorm(embed_size)
        self.norm2 = nn.LayerNorm(embed_size)
        self.feed_forward = FeedForward(embed_size, forward_expansion, dropout)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        """Apply self-attention then feed-forward to ``x`` using ``mask``."""
        # Self-attention sub-layer: x is used as values, keys and query.
        # NOTE(review): dropout is applied after the normalisation here, not
        # to the sub-layer output before the residual add — confirm intended.
        attended = self.attention(x, x, x, mask)
        attended = self.norm1(attended + x)
        attended = self.dropout(attended)

        # Feed-forward sub-layer with its own residual connection.
        transformed = self.feed_forward(attended)
        transformed = self.norm2(transformed + attended)
        return self.dropout(transformed)
# Pass the feed-forward output through a full encoder layer, without a mask.
# NOTE(review): EncoderLayer contains its own attention and feed-forward
# sub-layers, so ffnn_output goes through both a second time here — confirm
# that this chaining is intended for the walkthrough.
encoder_layer = EncoderLayer(
    embed_size=d_model,
    num_heads=num_heads,
    forward_expansion=forward_expansion,
    dropout=0.1,
)
layer_output = encoder_layer(ffnn_output, mask=None)
5. 堆叠多个编码器层
最后,整个过程会被重复多次,每次都会在之前的基础上进一步提炼信息。这就像是反复阅读同一段文字,每次都能发现更多细节和深层次的意思。
class Encoder(nn.Module):
    """Token embedding, positional encoding and a stack of encoder layers.

    Args:
        src_vocab_size: Number of rows in the token embedding table.
        embed_size: Size of the feature dimension.
        num_layers: Number of stacked ``EncoderLayer`` blocks.
        num_heads: Number of attention heads in each layer.
        device: Stored on the instance as ``self.device``.
        forward_expansion: Hidden-width multiplier of each feed-forward part.
        dropout: Dropout probability passed to the sub-modules.
        max_length: Number of positions the positional encoding precomputes.
    """

    def __init__(self, src_vocab_size, embed_size, num_layers, num_heads, device, forward_expansion, dropout, max_length):
        super().__init__()
        self.device = device
        self.max_length = max_length
        self.word_embedding = nn.Embedding(src_vocab_size, embed_size)
        self.position_embedding = PositionalEncoding(embed_size, dropout, max_length)
        self.layers = nn.ModuleList(
            [EncoderLayer(embed_size, num_heads, forward_expansion, dropout) for _ in range(num_layers)]
        )
        # Kept so the attribute still exists for existing users; forward()
        # does not apply it because PositionalEncoding already runs dropout
        # on the summed embeddings.
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        """Encode a batch of token ids.

        Args:
            x: Integer tensor of token ids with shape ``(N, seq_length)``.
            mask: Attention mask forwarded unchanged to every layer
                (``None`` for no masking).

        Returns:
            Tensor of shape ``(N, seq_length, embed_size)``.

        Raises:
            ValueError: If ``seq_length`` exceeds ``max_length``.
        """
        _, seq_length = x.shape
        if seq_length > self.max_length:
            raise ValueError(
                f"sequence length {seq_length} exceeds max_length {self.max_length}"
            )

        embedded = self.word_embedding(x)
        # PositionalEncoding adds its table to the tensor it is given and
        # selects positions along dim 0, so it must receive the embeddings
        # (not position indices) in (seq_length, N, embed_size) layout.
        out = self.position_embedding(embedded.transpose(0, 1)).transpose(0, 1)

        for layer in self.layers:
            out = layer(out, mask)
        return out
# Build the complete encoder and run the example sentence through it.
src_vocab_size = len(vocab)
num_layers = 6
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
max_length = 50
# Move the encoder's parameters and buffers to the same device as the input;
# otherwise the run fails on a CUDA machine with a device mismatch.
encoder = Encoder(
    src_vocab_size,
    d_model,
    num_layers,
    num_heads,
    device,
    forward_expansion,
    dropout=0.1,
    max_length=max_length,
).to(device)
final_output = encoder(input_ids.to(device), None)
总结
通过上述步骤,Transformer的编码器有效地捕捉了输入序列中的丰富信息。具体来说:
- 词汇嵌入与位置编码:将每个单词转换为包含其意义和位置的向量。
- 多头自注意力机制:理解每个单词与其他单词之间的关系。
- 前馈神经网络:对信息进行进一步的数学处理,捕捉更复杂的模式。
- 残差连接与层归一化:确保信息顺利流动,并保持稳定性和一致性。
- 堆叠多个编码器层:逐步提炼出更高级别的特征表示。
这样,编码器不仅能理解每个单词的意义,还能捕捉它们之间的复杂关系,从而为后续的任务(如翻译、文本生成等)提供强有力的支持。