cmake配置
# Build script for the AVX int16-multiplication micro-benchmark.
cmake_minimum_required(VERSION 3.10)
project(BenchmarkTest)

# Select C++11 through CMake's own mechanism; a separate
# add_definitions(-std=c++11) would only duplicate (and possibly fight) it.
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

# Timings from an unoptimized build are meaningless, so default to Release
# when the user did not choose a build type.
if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
  set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
endif()

# Locate Google Benchmark.
find_package(benchmark REQUIRED)

# Add the benchmark executable.
add_executable(benchmark_test main.cc)

# _mm512_mullo_epi16 is an AVX-512BW instruction, so AVX-512F alone is not
# enough; request both explicitly instead of relying on -march=native.
target_compile_options(benchmark_test PRIVATE -mavx512f -mavx512bw -march=native)

# Link against the Google Benchmark library.
target_link_libraries(benchmark_test benchmark::benchmark)
示例
#include <immintrin.h>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <vector>
#include <benchmark/benchmark.h>
void mp_avx512(const int16_t * a, const int16_t * b, int16_t *result, size_t len){
for(int i = 0; i < len; ++i){
__m512i va = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(a + i));
__m512i vb = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(b + i));
__m512i vc = _mm512_mullo_epi16(va, vb);
_mm512_storeu_si512(reinterpret_cast<__m512i*>(result + i), vc);
}
}
void mp_avx256(const int16_t * a, const int16_t * b, int16_t *result, size_t len){
for(int i = 0; i < len * 2; ++i){
__m256i va = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(a + i));
__m256i vb = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(b + i));
__m256i vc = _mm256_mullo_epi16(va, vb);
_mm256_storeu_si256(reinterpret_cast<__m256i*>(result + i), vc);
}
}
/// Scalar reference implementation of element-wise int16 multiplication.
///
/// `len` counts blocks of 32 elements, so `len * 32` products are written.
/// Each product is computed in `int` and narrowed back to int16_t.
///
/// @param a      first operand; must hold at least `len * 32` elements
/// @param b      second operand; must hold at least `len * 32` elements
/// @param result destination; must hold at least `len * 32` elements
/// @param len    number of 32-element blocks (not the number of elements)
void mp_normal(const int16_t * a, const int16_t * b, int16_t *result, size_t len){
    const size_t count = len * 32;
    // Index with size_t: an int counter compared against a size_t bound mixes
    // signedness and overflows once the element count exceeds INT_MAX.
    for(size_t i = 0; i < count; ++i){
        result[i] = static_cast<int16_t>(a[i] * b[i]);
    }
}
/// Benchmarks mp_avx512 on buffers of 10000 * 32 int16 elements (a = 2, b = 3).
static void Benchmark_mp_avx512(benchmark::State &state){
    const size_t num = 10000;
    const size_t count = 32 * num;
    // Heap-backed buffers: an array sized by a runtime value is a
    // variable-length array, which is not standard C++, and three of them
    // would place about 1.9 MB on the stack.
    std::vector<int16_t> a(count, 2);
    std::vector<int16_t> b(count, 3);
    std::vector<int16_t> c(count);
    for(auto _ : state){
        mp_avx512(a.data(), b.data(), c.data(), num);
        // Make the output observable so an optimized build cannot discard the
        // stores (and with them the work being timed).
        int16_t *out = c.data();
        benchmark::DoNotOptimize(out);
        benchmark::ClobberMemory();
    }
}
/// Benchmarks mp_avx256 on buffers of 10000 * 32 int16 elements (a = 2, b = 3).
static void Benchmark_mp_avx256(benchmark::State &state){
    const size_t num = 10000;
    const size_t count = 32 * num;
    // Heap-backed buffers: an array sized by a runtime value is a
    // variable-length array, which is not standard C++, and three of them
    // would place about 1.9 MB on the stack.
    std::vector<int16_t> a(count, 2);
    std::vector<int16_t> b(count, 3);
    std::vector<int16_t> c(count);
    for(auto _ : state){
        mp_avx256(a.data(), b.data(), c.data(), num);
        // Make the output observable so an optimized build cannot discard the
        // stores (and with them the work being timed).
        int16_t *out = c.data();
        benchmark::DoNotOptimize(out);
        benchmark::ClobberMemory();
    }
}
/// Benchmarks the scalar mp_normal on buffers of 10000 * 32 int16 elements
/// (a = 2, b = 3).
static void Benchmark_mp_normal(benchmark::State &state){
    const size_t num = 10000;
    const size_t count = 32 * num;
    // Heap-backed buffers: an array sized by a runtime value is a
    // variable-length array, which is not standard C++, and three of them
    // would place about 1.9 MB on the stack.
    std::vector<int16_t> a(count, 2);
    std::vector<int16_t> b(count, 3);
    std::vector<int16_t> c(count);
    for(auto _ : state){
        mp_normal(a.data(), b.data(), c.data(), num);
        // Make the output observable so an optimized build cannot discard the
        // stores (and with them the work being timed).
        int16_t *out = c.data();
        benchmark::DoNotOptimize(out);
        benchmark::ClobberMemory();
    }
}
// A hand-written main() is kept here only for reference; it stays disabled
// because BENCHMARK_MAIN() below already provides the program entry point.
// int main(){
// std::cout << std::endl;
// return 0;
// }
// Register the three implementations with Google Benchmark.
BENCHMARK(Benchmark_mp_avx256);
BENCHMARK(Benchmark_mp_avx512);
BENCHMARK(Benchmark_mp_normal);
// Expands to a main() that runs every registered benchmark.
BENCHMARK_MAIN();
测试结果
Running ./build/benchmark_test
Run on (56 X 2000 MHz CPU s)
CPU Caches:
L1 Data 32 KiB (x28)
L1 Instruction 32 KiB (x28)
L2 Unified 1024 KiB (x28)
L3 Unified 19712 KiB (x2)
Load Average: 16.80, 18.17, 11.84
***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
--------------------------------------------------------------
Benchmark Time CPU Iterations
--------------------------------------------------------------
Benchmark_mp_avx256 203131 ns 202640 ns 3275
Benchmark_mp_avx512 143850 ns 143203 ns 5232
Benchmark_mp_normal 1397004 ns 1393626 ns 489
AVX-512 用 int16 乘法指令实现 int8 逐元素乘法的两种方法
inline __m512i mul(__m512i const &vec_a, __m512i const &vec_b) noexcept {
// way1
__m512i upper = _mm512_and_si512(_mm512_mullo_epi16(vec_a, vec_b), _mm512_srli_epi16(_mm512_set1_epi16(-1), 8));
__m512i lower = _mm512_slli_epi16(_mm512_mullo_epi16(_mm512_srli_epi16(vec_a, 8), _mm512_srli_epi16(vec_b, 8)), 8);
return _mm512_or_si512(upper, lower);
// Benchmark Time CPU Iterations
// -------------------------------------------------------------
// Benchmark_mul_int8 2510 ns 2504 ns 258753
// way2
// Convert int8_t to int16_t to avoid overflow during multiplication
// __m512i vec_a_lo = _mm512_unpacklo_epi8(vec_a, _mm512_setzero_si512());
// __m512i vec_a_hi = _mm512_unpackhi_epi8(vec_a, _mm512_setzero_si512());
// __m512i vec_b_lo = _mm512_unpacklo_epi8(vec_b, _mm512_setzero_si512());
// __m512i vec_b_hi = _mm512_unpackhi_epi8(vec_b, _mm512_setzero_si512());
// // Perform the multiplication
// __m512i prod_lo = _mm512_mullo_epi16(vec_a_lo, vec_b_lo);
// __m512i prod_hi = _mm512_mullo_epi16(vec_a_hi, vec_b_hi);
// // Pack the results back into int8_t
// __m512i prod = _mm512_packs_epi16(prod_lo, prod_hi);
// Benchmark Time CPU Iterations
// -------------------------------------------------------------
// Benchmark_mul_int8 3734 ns 3725 ns 187966
// return prod;
}
bench测试
/// Benchmarks mul() over 100 vectors of 64 int8 lanes (a = 2, b = 3) and then
/// verifies that every output lane equals 6, aborting on the first mismatch.
static void Benchmark_mul_int8(benchmark::State &state) {
    const int num = 100;
    const int count = 64 * num;
    // Heap-backed buffers: an array sized by a runtime value is a
    // variable-length array, which is not standard C++.
    std::vector<int8_t> a(count, 2);
    std::vector<int8_t> b(count, 3);
    std::vector<int8_t> c(count);
    for (auto _ : state) {
        for (int i = 0; i < num; i++) {
            const __m512i va = _mm512_loadu_si512(reinterpret_cast<const __m512i *>(a.data() + i * 64));
            const __m512i vb = _mm512_loadu_si512(reinterpret_cast<const __m512i *>(b.data() + i * 64));
            const __m512i vc = mul(va, vb);
            _mm512_storeu_si512(reinterpret_cast<__m512i *>(c.data() + i * 64), vc);
        }
        // Make the output observable so an optimized build cannot discard the
        // stores (and with them the work being timed).
        int8_t *out = c.data();
        benchmark::DoNotOptimize(out);
        benchmark::ClobberMemory();
    }
    for (int i = 0; i < count; ++i) {
        if (c[i] != 6) {
            // Widen to int: streaming an int8_t prints it as a character, not
            // as a number.
            std::cout << "error c[" << i << "] " << static_cast<int>(c[i]) << std::endl;
            std::abort();
        }
    }
}