1. Top-K Sampling
import torch


def topK_sampler(logits, k):
    # keep only the k largest logits and mask out the rest with -inf
    values, indices = torch.topk(logits, k, dim=-1)
    masked = torch.full_like(logits, float('-inf'))
    # scatter the kept logits back to their original positions
    masked.scatter_(-1, indices, values)
    # softmax over the masked logits gives a distribution restricted to the top-k tokens
    probs_to_sample = torch.softmax(masked, dim=-1)
    # draw one token id per row
    sample_token_id = torch.multinomial(probs_to_sample, 1)
    return sample_token_id
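A quick sanity check (a minimal sketch; the batch size and vocabulary size below are arbitrary):
if __name__ == "__main__":
    logits = torch.randn(2, 10)          # a batch of 2 over a vocabulary of 10 tokens
    print(topK_sampler(logits, k=3))     # two sampled token ids, shape [2, 1]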
2. Top-P Sampling (Nucleus Sampling)
import torch


def top_p_sampler(logits, p):
    probs = torch.softmax(logits, dim=-1)
    # sort in descending order
    sorted_probs, sorted_indices = torch.sort(probs, dim=-1, descending=True)
    # cumulative probability along the sorted axis
    cum_sum_probs = torch.cumsum(sorted_probs, dim=-1)
    sorted_indices_to_remove = cum_sum_probs > p
    # shift right so the first token that crosses the threshold is still kept
    sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
    # always keep at least one token
    sorted_indices_to_remove[..., 0] = False
    # zero out the removed tokens; torch.multinomial renormalizes the remaining weights
    probs_top_p = sorted_probs.clone()
    probs_top_p[sorted_indices_to_remove] = 0.0
    # sample a position in the sorted order
    sample_token_id = torch.multinomial(probs_top_p, 1)
    # map the sampled position back to the original vocabulary index
    token_index = sorted_indices.gather(dim=-1, index=sample_token_id)
    return token_index
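Likewise, a quick check for the nucleus sampler (arbitrary shapes):
if __name__ == "__main__":
    logits = torch.randn(2, 10)
    print(top_p_sampler(logits, p=0.9))  # token ids in the original vocabulary order, shape [2, 1]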
3. Temperature Softmax
import torch


def temperature_softmax(logits, t=1.0):
    # t > 1 flattens the distribution, t < 1 sharpens it
    logits = logits / t
    # subtract the row-wise max for numerical stability
    logits = logits - torch.max(logits, dim=-1, keepdim=True).values
    exp_logits = torch.exp(logits)
    sum_exp = torch.sum(exp_logits, dim=-1, keepdim=True)
    out = exp_logits / sum_exp
    return out
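A short demonstration of the effect of the temperature (the logits are arbitrary):
if __name__ == "__main__":
    logits = torch.tensor([2.0, 1.0, 0.1])
    print(temperature_softmax(logits, t=1.0))  # a plain softmax
    print(temperature_softmax(logits, t=0.5))  # sharper: more mass on the largest logit
    print(temperature_softmax(logits, t=2.0))  # flatter: closer to uniform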
4. Linear Regression
import torch
from torch import nn


class LinearReg(nn.Module):
    def __init__(self, input_dim, out_dim):
        super(LinearReg, self).__init__()
        self.input_dim = input_dim
        self.out_dim = out_dim
        self.linear = nn.Linear(input_dim, out_dim, bias=True)

    def forward(self, x):
        out = self.linear(x)
        return out


def main():
    data = torch.randn(100, 2) * 10
    # ground truth: y = 3*x_1 + 2*x_2 + 5
    weight = torch.tensor([[3.], [2.]])
    bias = torch.tensor([[5.]])
    # build the dataset and add some noise (the noise must be [100, 1] so y keeps that shape)
    y = (data @ weight + bias) + torch.randn(100, 1) * 2
    model = LinearReg(input_dim=2, out_dim=1)
    loss_func = nn.MSELoss(reduction="mean")
    optimizer = torch.optim.SGD(lr=5e-3, params=model.parameters())
    epochs = 1000
    for step in range(epochs):
        pred = model(data)
        loss = loss_func(pred, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if (step + 1) % 10 == 0:
            print(f"{step + 1}/{epochs} steps, loss: {loss.item():.4f}")
    print("train finished")
    # print the learned parameters
    print("weights after training:")
    print("weight", model.linear.weight)
    print("bias", model.linear.bias)


if __name__ == "__main__":
    main()
5. Linear Regression with Manual Gradient Updates
import torch


def diff_mse(x, y, w):
    """
    Gradient of the MSE loss with respect to w, computed by hand:
    grad_w = x.T @ (x @ w - y) / n
    """
    return x.transpose(0, 1) @ (x @ w - y) / x.shape[0]


def mse_loss(x, y, w):
    """
    MSE loss (with a factor of 0.5 so the gradient carries no extra 2).
    """
    return 0.5 * torch.mean(torch.square(x @ w - y))


def get_batch_data(x, y, batch_size, step):
    data_len = x.shape[0]
    start = step * batch_size
    end = min(start + batch_size, data_len)
    return x[start:end], y[start:end]


def train(epochs, batch_size, lr):
    data = torch.randn(100, 2) * 2       # [100, 2]
    weight = torch.tensor([[3.], [2.]])   # [2, 1]
    y = data @ weight + torch.randn(100, 1) * 2
    param_w = torch.randn(2, 1)
    steps = data.shape[0] // batch_size
    for epoch in range(epochs):
        for step in range(steps):
            x, lb = get_batch_data(data, y, batch_size, step)
            loss = mse_loss(x, lb, param_w)
            grad = diff_mse(x, lb, param_w)
            # plain gradient descent update
            param_w = param_w - lr * grad
            if step % 10 == 0:
                print(f"epoch:{epoch}; step:{step}; loss:{loss.item()}")
    print(f"train finished, param w: {param_w}")


if __name__ == "__main__":
    train(epochs=200, batch_size=8, lr=5e-4)
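For reference, the update above is plain batch gradient descent: with the loss L(w) = 0.5 * mean((x @ w - y)^2) used in mse_loss, the gradient is dL/dw = x.T @ (x @ w - y) / n, which is exactly what diff_mse returns, so param_w = param_w - lr * grad takes a small step against that gradient.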
6. K-Means
import numpy as np


def k_means(max_iter, tol, data, k):
    """
    max_iter: maximum number of iterations
    tol: minimum change of the cluster centers before stopping
    data: input data, shape [n, dim]
    k: number of clusters
    """
    # initialize the centers with k distinct random points from the data
    centers = data[np.random.choice(data.shape[0], k, replace=False)]
    for j in range(max_iter):
        # distance from every point to every center, dist.shape = [n, k]
        dist = np.linalg.norm(data[:, np.newaxis] - centers, axis=2)
        # assign every point to its nearest center, labels.shape = (n,)
        labels = np.argmin(dist, axis=1)
        # new center = mean of the points assigned to it (assumes no cluster becomes empty)
        new_centers = np.array([data[labels == i].mean(axis=0) for i in range(k)])
        # stop iterating once the centers barely move
        if np.all(np.linalg.norm(new_centers - centers, axis=1) < tol):
            print(f"less than tol, break down, iter num {j}")
            break
        centers = new_centers
    return centers, labels


if __name__ == "__main__":
    data = np.random.randn(100, 3)
    centers, labels = k_means(max_iter=100, tol=0.00001, data=data, k=3)
    print(centers)
    print(labels)
7. Beam Search
Conceptually similar to top-k: at every decoding step, instead of committing to a single greedy choice, the decoder keeps the beam_size highest-scoring partial sequences and expands each of them; a minimal sketch follows below.
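The sketch below assumes a callable step_fn that returns next-token log-probabilities for a given prefix; step_fn, bos_id, eos_id and the list-based bookkeeping are illustrative placeholders, not code from a particular library:
import torch


def beam_search(step_fn, bos_id, eos_id, beam_size, max_len):
    """
    step_fn(prefix_ids) -> log-probabilities over the vocabulary for the next token, shape [vocab_size].
    At every step only the beam_size highest-scoring partial sequences are kept.
    """
    # each beam entry is (list of token ids, accumulated log-probability)
    beams = [([bos_id], 0.0)]
    for _ in range(max_len):
        candidates = []
        for seq, score in beams:
            # finished sequences are carried over unchanged
            if seq[-1] == eos_id:
                candidates.append((seq, score))
                continue
            log_probs = step_fn(torch.tensor(seq))        # [vocab_size]
            top = torch.topk(log_probs, beam_size)
            for lp, idx in zip(top.values.tolist(), top.indices.tolist()):
                candidates.append((seq + [idx], score + lp))
        # prune: keep only the beam_size best candidates
        beams = sorted(candidates, key=lambda c: c[1], reverse=True)[:beam_size]
        if all(seq[-1] == eos_id for seq, _ in beams):
            break
    # return the best sequence and its score
    return beams[0]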
8. Layer Norm
import torch
from torch import nn


class LayerNorm(nn.Module):
    def __init__(self, dim):
        super(LayerNorm, self).__init__()
        # learnable scale and shift applied after normalization
        self.alpha = nn.Parameter(torch.ones(dim))
        self.beta = nn.Parameter(torch.zeros(dim))
        self.epsilon = 1e-6

    def forward(self, feature):
        """
        feature.shape = [batch_size, seq_len, embed_dim]
        """
        # normalize over the last (embedding) dimension
        mean = torch.mean(feature, dim=-1, keepdim=True)
        std = torch.std(feature, dim=-1, keepdim=True, unbiased=False)
        norm = (feature - mean) / (std + self.epsilon)
        out = self.alpha * norm + self.beta
        return out


if __name__ == "__main__":
    logits = torch.randn(2, 3, 5)
    ln = LayerNorm(5)
    print(ln(logits))
9. Batch Norm
import torch
from torch import nn


class BatchNorm(nn.Module):
    def __init__(self, momentum=0.01, eps=1e-6, dim=5):
        super(BatchNorm, self).__init__()
        # running statistics used at inference time (not learnable)
        self.run_mean = torch.zeros(dim)
        self.run_std = torch.ones(dim)
        self.momentum = momentum
        self.eps = eps
        # learnable scale (gamma) and shift (beta)
        self.gamma = nn.Parameter(torch.ones(dim))
        self.beta = nn.Parameter(torch.zeros(dim))

    def forward(self, feature, is_train=True):
        if is_train:
            # statistics over the batch dimension
            mean = torch.mean(feature, dim=0)
            std = torch.std(feature, dim=0, unbiased=False)
            # exponential moving average of the running statistics (momentum weights the new batch)
            self.run_mean = (1 - self.momentum) * self.run_mean + self.momentum * mean.detach()
            self.run_std = (1 - self.momentum) * self.run_std + self.momentum * std.detach()
            norm = (feature - mean) / (std + self.eps)
        else:
            norm = (feature - self.run_mean) / (self.run_std + self.eps)
        out = self.gamma * norm + self.beta
        return out


if __name__ == "__main__":
    logits = torch.randn(3, 5)
    bn = BatchNorm(momentum=0.01, eps=1e-6, dim=5)
    print(bn(logits, True))
10. Self-Attention
import torch
from torch import nn
import torch.nn.functional as F


class MultiHeadAttention(nn.Module):
    def __init__(self, embed_dim, num_heads):
        super(MultiHeadAttention, self).__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = self.embed_dim // self.num_heads
        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False)
        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False)
        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False)
        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=True)

    def forward(self, query, key, value, atten_mask=None, padding_mask=None):
        """
        :param query: [tgt_len, batch_size, embed_dim]
        :param key: [src_len, batch_size, embed_dim]
        :param value: [src_len, batch_size, embed_dim]
        :param atten_mask: additive mask used by the decoder's masked multi-head attention
        :param padding_mask: [batch_size, src_len], True where the input is padding
        :return: [tgt_len, batch_size, embed_dim]
        """
        q = self.q_proj(query)  # [tgt_len, bs, embed_dim]
        k = self.k_proj(key)    # [src_len, bs, embed_dim]
        v = self.v_proj(value)  # [src_len, bs, embed_dim]
        tgt_len, batch_size, embed_dim = query.size()
        src_len = key.size()[0]
        head_dim = embed_dim // self.num_heads
        # scale q by 1/sqrt(head_dim)
        scaling = float(head_dim) ** -0.5
        q = q * scaling
        if atten_mask is not None:
            if atten_mask.dim() == 2:
                atten_mask = atten_mask.unsqueeze(0)
                if list(atten_mask.size()) != [1, tgt_len, src_len]:
                    raise RuntimeError("The 2D attention mask is not correct!")
            elif atten_mask.dim() == 3:
                if list(atten_mask.size()) != [batch_size * self.num_heads, tgt_len, src_len]:
                    raise RuntimeError("The 3D attention mask is not correct!")
            else:
                raise RuntimeError("Attention mask dim is not correct")
        # split the heads: [bsz*num_heads, tgt_len/src_len, head_dim]
        q = q.contiguous().view(tgt_len, batch_size * self.num_heads, head_dim).transpose(0, 1)
        k = k.contiguous().view(src_len, batch_size * self.num_heads, head_dim).transpose(0, 1)
        v = v.contiguous().view(src_len, batch_size * self.num_heads, head_dim).transpose(0, 1)
        attn_output_weight = torch.bmm(q, k.transpose(1, 2))  # [bsz*num_heads, tgt_len, src_len]
        if atten_mask is not None:
            # additive mask: -inf at forbidden positions, 0 elsewhere
            attn_output_weight += atten_mask
        if padding_mask is not None:
            attn_output_weight = attn_output_weight.view(batch_size, self.num_heads, tgt_len, src_len)
            attn_output_weight = attn_output_weight.masked_fill(
                padding_mask.unsqueeze(1).unsqueeze(2), float("-inf"))
            attn_output_weight = attn_output_weight.view(batch_size * self.num_heads, tgt_len, src_len)
        attn_output_weight = F.softmax(attn_output_weight, dim=-1)
        # dropout on the attention weights; only active in training mode
        attn_output_weight = F.dropout(attn_output_weight, training=self.training)
        attn_output = torch.bmm(attn_output_weight, v)  # [bsz*num_heads, tgt_len, head_dim]
        # merge the heads back: [tgt_len, batch_size, embed_dim]
        attn_output = attn_output.transpose(0, 1).contiguous().view(tgt_len, batch_size, head_dim * self.num_heads)
        z = self.out_proj(attn_output)
        return z
def generate_square_subsequent_mask(size):
    """
    Generate the causal (subsequent-position) attention mask.
    :param size: sequence length; the returned mask is [size, size]
    :return: additive mask with 0 at allowed positions and -inf at future positions
    """
    # torch.triu keeps the main diagonal and the elements above it; everything else is set to 0
    mask = (torch.triu(torch.ones(size, size)) == 1).transpose(0, 1)
    mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
    return mask  # [size, size]
if __name__ == "__main__":
    source_len = 5
    bs = 2
    model_dim = 32
    num_head = 2
    src = torch.rand((source_len, bs, model_dim))
    src_padding_mask = torch.tensor([[False, False, False, True, True],
                                     [False, False, False, False, True]])
    mha = MultiHeadAttention(embed_dim=model_dim, num_heads=num_head)
    out = mha(src, src, src, padding_mask=src_padding_mask)
    print(out)
    print(out.shape)
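    # Minimal extension of the example above (not from the original article): also exercise
    # the causal-mask path via generate_square_subsequent_mask, which the demo otherwise leaves unused.
    causal_mask = generate_square_subsequent_mask(source_len)  # [5, 5], -inf above the diagonal
    out_masked = mha(src, src, src, atten_mask=causal_mask, padding_mask=src_padding_mask)
    print(out_masked.shape)  # torch.Size([5, 2, 32])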
11. Trainable Token Embedding
import torch
from torch import nn
import math


class TokenEmbed(nn.Module):
    def __init__(self, vocab_size, embed_dim):
        super(TokenEmbed, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.embed_dim = embed_dim

    def forward(self, tokens):
        """
        :param tokens: [length, bs]
        :return: embedding, [length, bs, embed_dim]
        """
        # Note: as described in the Transformer paper, the token embedding is scaled by sqrt(embed_dim)
        return self.embedding(tokens.long()) * math.sqrt(self.embed_dim)
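A quick shape check (a minimal sketch; the vocabulary size and sequence shape below are arbitrary):
if __name__ == "__main__":
    embed = TokenEmbed(vocab_size=1000, embed_dim=32)
    tokens = torch.randint(0, 1000, (5, 2))  # [length, bs]
    print(embed(tokens).shape)               # torch.Size([5, 2, 32])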
12. Sinusoidal Positional Encoding
import math
import torch


def position_embed(max_len, model_dim):
    # assumes model_dim is even
    pe = torch.zeros((max_len, model_dim))
    position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)  # [max_len, 1]
    # wavelengths form a geometric progression; dividing by div_term matches the 1/10000^(2i/d) terms
    div_term = torch.exp(torch.arange(0, model_dim, 2).float() * (math.log(10000) / model_dim))
    pe[:, 0::2] = torch.sin(position / div_term)
    pe[:, 1::2] = torch.cos(position / div_term)
    return pe
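A quick check of the table it produces (arbitrary sizes); in a Transformer this table is typically added to the scaled token embeddings of the same model_dim:
if __name__ == "__main__":
    pe = position_embed(max_len=10, model_dim=32)
    print(pe.shape)  # torch.Size([10, 32])
    print(pe[0])     # position 0: the sin columns are 0, the cos columns are 1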