Bootstrap

使用 Google Benchmark 测试 C++ 中的 AVX-512 SIMD 指令

cmake配置

cmake_minimum_required(VERSION 3.10)
project(BenchmarkTest)

# Build as C++11. CMAKE_CXX_STANDARD is the supported way to select the
# language level; add_definitions() is meant for preprocessor definitions,
# so the standard is not passed through it.
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

# Locate Google Benchmark
find_package(benchmark REQUIRED)

# Benchmark executable
add_executable(benchmark_test main.cc)

# SIMD code generation flags, scoped to this target only.
# -mavx512bw is required in addition to -mavx512f because the 16-bit integer
# intrinsics used by the benchmarks (e.g. _mm512_mullo_epi16) belong to the
# AVX512BW extension.
# NOTE: configure with -DCMAKE_BUILD_TYPE=Release to get optimized timings;
# no build type is forced here.
target_compile_options(benchmark_test PRIVATE -mavx512f -mavx512bw -march=native)

# Link against Google Benchmark
target_link_libraries(benchmark_test benchmark::benchmark)

示例

#include <immintrin.h>

#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <vector>

#include <benchmark/benchmark.h>

/// Multiplies two int16 arrays element-wise using 512-bit vectors.
///
/// @param a      First input; must hold at least `len * 32` elements.
/// @param b      Second input; must hold at least `len * 32` elements.
/// @param result Output; must have room for at least `len * 32` elements.
/// @param len    Number of 512-bit blocks (32 int16 values each) to process.
///
/// Each product keeps only its low 16 bits (`_mm512_mullo_epi16`). Unaligned
/// loads and stores are used, so the buffers need no particular alignment.
void mp_avx512(const int16_t * a, const int16_t * b, int16_t *result, size_t len){
    // Number of int16 lanes in one 512-bit register.
    constexpr size_t kLanes = sizeof(__m512i) / sizeof(int16_t);

    for(size_t block = 0; block < len; ++block){
        // Advance by a whole vector per iteration. Stepping by a single
        // element would recompute overlapping windows and leave most of the
        // output buffer unwritten.
        const size_t offset = block * kLanes;

        const __m512i va = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(a + offset));
        const __m512i vb = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(b + offset));
        const __m512i vc = _mm512_mullo_epi16(va, vb);

        _mm512_storeu_si512(reinterpret_cast<__m512i*>(result + offset), vc);
    }
}

/// Multiplies two int16 arrays element-wise using 256-bit vectors.
///
/// @param a      First input; must hold at least `len * 32` elements.
/// @param b      Second input; must hold at least `len * 32` elements.
/// @param result Output; must have room for at least `len * 32` elements.
/// @param len    Number of 512-bit-sized blocks (32 int16 values each); every
///               block is handled as two 256-bit vectors.
///
/// Each product keeps only its low 16 bits (`_mm256_mullo_epi16`). Unaligned
/// loads and stores are used, so the buffers need no particular alignment.
void mp_avx256(const int16_t * a, const int16_t * b, int16_t *result, size_t len){
    // Number of int16 lanes in one 256-bit register.
    constexpr size_t kLanes = sizeof(__m256i) / sizeof(int16_t);
    // Two 256-bit vectors cover one 32-element block.
    const size_t vectors = len * 2;

    for(size_t vec = 0; vec < vectors; ++vec){
        // Advance by a whole vector per iteration. Stepping by a single
        // element would recompute overlapping windows and leave most of the
        // output buffer unwritten.
        const size_t offset = vec * kLanes;

        const __m256i va = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(a + offset));
        const __m256i vb = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(b + offset));
        const __m256i vc = _mm256_mullo_epi16(va, vb);

        _mm256_storeu_si256(reinterpret_cast<__m256i*>(result + offset), vc);
    }
}

/// Scalar reference implementation: multiplies two int16 arrays element-wise.
///
/// @param a      First input; must hold at least `len * 32` elements.
/// @param b      Second input; must hold at least `len * 32` elements.
/// @param result Output; must have room for at least `len * 32` elements.
/// @param len    Number of 32-element blocks to process.
void mp_normal(const int16_t * a, const int16_t * b, int16_t *result, size_t len){
    // Index with size_t: an int counter is compared against an unsigned bound
    // and would overflow once len * 32 exceeds INT_MAX.
    const size_t count = len * 32;
    for(size_t idx = 0; idx < count; ++idx){
        // The operands are promoted to int for the multiplication; the cast
        // makes the narrowing back to 16 bits explicit.
        result[idx] = static_cast<int16_t>(a[idx] * b[idx]);
    }
}

/// Benchmarks mp_avx512 on inputs of 10000 blocks of 32 int16 values
/// (a filled with 2, b filled with 3).
static void Benchmark_mp_avx512(benchmark::State &state){
    constexpr size_t kBlocks = 10000;
    constexpr size_t kElems = 32 * kBlocks;

    // Heap storage: the three buffers total about 1.9 MB, and variable-length
    // stack arrays are a compiler extension rather than standard C++.
    const std::vector<int16_t> a(kElems, 2);
    const std::vector<int16_t> b(kElems, 3);
    std::vector<int16_t> c(kElems);

    for(auto _ : state){
        mp_avx512(a.data(), b.data(), c.data(), kBlocks);

        // Make the writes to c observable so an optimizing build cannot
        // discard the work being measured.
        int16_t *out = c.data();
        benchmark::DoNotOptimize(out);
        benchmark::ClobberMemory();
    }
}

/// Benchmarks mp_avx256 on inputs of 10000 blocks of 32 int16 values
/// (a filled with 2, b filled with 3).
static void Benchmark_mp_avx256(benchmark::State &state){
    constexpr size_t kBlocks = 10000;
    constexpr size_t kElems = 32 * kBlocks;

    // Heap storage: the three buffers total about 1.9 MB, and variable-length
    // stack arrays are a compiler extension rather than standard C++.
    const std::vector<int16_t> a(kElems, 2);
    const std::vector<int16_t> b(kElems, 3);
    std::vector<int16_t> c(kElems);

    for(auto _ : state){
        mp_avx256(a.data(), b.data(), c.data(), kBlocks);

        // Make the writes to c observable so an optimizing build cannot
        // discard the work being measured.
        int16_t *out = c.data();
        benchmark::DoNotOptimize(out);
        benchmark::ClobberMemory();
    }
}



/// Benchmarks the scalar mp_normal on inputs of 10000 blocks of 32 int16
/// values (a filled with 2, b filled with 3).
static void Benchmark_mp_normal(benchmark::State &state){
    constexpr size_t kBlocks = 10000;
    constexpr size_t kElems = 32 * kBlocks;

    // Heap storage: the three buffers total about 1.9 MB, and variable-length
    // stack arrays are a compiler extension rather than standard C++.
    const std::vector<int16_t> a(kElems, 2);
    const std::vector<int16_t> b(kElems, 3);
    std::vector<int16_t> c(kElems);

    for(auto _ : state){
        mp_normal(a.data(), b.data(), c.data(), kBlocks);

        // Make the writes to c observable so an optimizing build cannot
        // discard the work being measured.
        int16_t *out = c.data();
        benchmark::DoNotOptimize(out);
        benchmark::ClobberMemory();
    }
}
// int main(){


//     std::cout << std::endl;
//     return 0;
// }
// Register the three multiplication benchmarks with Google Benchmark.
BENCHMARK(Benchmark_mp_avx256);
BENCHMARK(Benchmark_mp_avx512);
BENCHMARK(Benchmark_mp_normal);
// Expands to the program's main(), which is why the hand-written main above
// is commented out.
BENCHMARK_MAIN();

测试结果

Running ./build/benchmark_test
Run on (56 X 2000 MHz CPU s)
CPU Caches:
  L1 Data 32 KiB (x28)
  L1 Instruction 32 KiB (x28)
  L2 Unified 1024 KiB (x28)
  L3 Unified 19712 KiB (x2)
Load Average: 16.80, 18.17, 11.84
***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
--------------------------------------------------------------
Benchmark                    Time             CPU   Iterations
--------------------------------------------------------------
Benchmark_mp_avx256     203131 ns       202640 ns         3275
Benchmark_mp_avx512     143850 ns       143203 ns         5232
Benchmark_mp_normal    1397004 ns      1393626 ns          489

avx512实现int8乘法的两种方法

/// Multiplies the 64 packed 8-bit integers of two 512-bit vectors
/// element-wise, keeping the low 8 bits of every product.
///
/// There is no 8-bit multiply instruction, so the work is done with 16-bit
/// multiplies on the even and odd bytes separately.
inline __m512i mul(__m512i const &vec_a, __m512i const &vec_b) noexcept {
    // Approach 1 (active).
    // 0x00FF in every 16-bit lane: all-ones shifted right by one byte.
    const __m512i low_byte_mask = _mm512_srli_epi16(_mm512_set1_epi16(-1), 8);

    // Even bytes: the low byte of a 16-bit product depends only on the low
    // bytes of its operands, so multiply the lanes as they are and mask.
    const __m512i even_products = _mm512_mullo_epi16(vec_a, vec_b);
    const __m512i even_bytes = _mm512_and_si512(even_products, low_byte_mask);

    // Odd bytes: bring them down to the low byte, multiply, then move the
    // resulting low byte back up into the high byte position.
    const __m512i a_odd = _mm512_srli_epi16(vec_a, 8);
    const __m512i b_odd = _mm512_srli_epi16(vec_b, 8);
    const __m512i odd_bytes = _mm512_slli_epi16(_mm512_mullo_epi16(a_odd, b_odd), 8);

    return _mm512_or_si512(even_bytes, odd_bytes);
    // Measured with approach 1:
    // Benchmark                   Time             CPU   Iterations
    // -------------------------------------------------------------
    // Benchmark_mul_int8       2510 ns         2504 ns       258753


    // Approach 2 (disabled).
    // Widen the int8 values to 16 bits, multiply, then pack back to 8 bits.
    // NOTE(review): _mm512_packs_epi16 is a saturating pack, so results can
    // differ from approach 1 when a product does not fit in int8 - confirm
    // before enabling.
    // __m512i vec_a_lo = _mm512_unpacklo_epi8(vec_a, _mm512_setzero_si512());
    // __m512i vec_a_hi = _mm512_unpackhi_epi8(vec_a, _mm512_setzero_si512());
    // __m512i vec_b_lo = _mm512_unpacklo_epi8(vec_b, _mm512_setzero_si512());
    // __m512i vec_b_hi = _mm512_unpackhi_epi8(vec_b, _mm512_setzero_si512());

    // // Multiply the widened halves
    // __m512i prod_lo = _mm512_mullo_epi16(vec_a_lo, vec_b_lo);
    // __m512i prod_hi = _mm512_mullo_epi16(vec_a_hi, vec_b_hi);

    // // Narrow the products back to int8
    // __m512i prod = _mm512_packs_epi16(prod_lo, prod_hi);

    // Measured with approach 2:
    // Benchmark                   Time             CPU   Iterations
    // -------------------------------------------------------------
    // Benchmark_mul_int8       3734 ns         3725 ns       187966

    // return prod;
}

bench测试

/// Benchmarks mul() over 100 blocks of 64 int8 values (a filled with 2,
/// b filled with 3), then checks that every output element equals 6.
static void Benchmark_mul_int8(benchmark::State &state) {
    constexpr size_t kBlocks = 100;
    constexpr size_t kBlockBytes = 64;  // int8 values per 512-bit vector

    // std::vector instead of variable-length stack arrays, which are a
    // compiler extension rather than standard C++.
    const std::vector<int8_t> a(kBlockBytes * kBlocks, 2);
    const std::vector<int8_t> b(kBlockBytes * kBlocks, 3);
    std::vector<int8_t> c(kBlockBytes * kBlocks);

    for (auto _ : state) {
        for (size_t block = 0; block < kBlocks; ++block) {
            const size_t offset = block * kBlockBytes;
            const __m512i va = _mm512_loadu_si512(reinterpret_cast<const __m512i *>(a.data() + offset));
            const __m512i vb = _mm512_loadu_si512(reinterpret_cast<const __m512i *>(b.data() + offset));
            const __m512i vc = mul(va, vb);
            _mm512_storeu_si512(reinterpret_cast<__m512i *>(c.data() + offset), vc);
        }
        // Make the writes to c observable on every iteration so an optimizing
        // build cannot collapse the repeated work.
        int8_t *out = c.data();
        benchmark::DoNotOptimize(out);
        benchmark::ClobberMemory();
    }

    // Sanity check of the last computed result.
    for (size_t i = 0; i < c.size(); ++i) {
        if (c[i] != 6) {
            // int8_t streams as a character; cast so the numeric value is shown.
            std::cout << "error c[" << i << "] " << static_cast<int>(c[i]) << std::endl;
            std::abort();
        }
    }
}