项目简介
- 基于 CUDA 和 OpenCV 环境
- 目标:
- 单独使用,以加速图像处理操作;
- 结合 TensorRT 使用,进一步加快推理速度
加速效果
- 这里对比 Deeplabv3+ 使用 CUDA 预处理前后的 TensorRT 推理速度
- 未使用 CUDA 图像预处理的代码,可参考博主的另一个 TensorRT 项目
| | FP32 | FP16 | INT8 |
|---|---|---|---|
| C++图像预处理 | 22 ms | 12 ms | 10 ms |
| CUDA图像预处理 | 15 ms | 5 ms | 3 ms |
- 对比 YOLOv5-v5.0 使用 CUDA 预处理前后的 TensorRT 推理速度
| | FP32 | FP16 | INT8 |
|---|---|---|---|
| C++图像预处理 | 12 ms | 8 ms | 6 ms |
| CUDA图像预处理 | 6 ms | 3 ms | 3 ms |
YOLOv5 TensorRT 推理代码源自作者其他的项目:C++预处理、CUDA预处理
Cuda代码
核心的核函数代码如下所示:
- BGR to RGB
// Swaps the first and third channel of every pixel of an interleaved
// 3-channel 8-bit image (BGR -> RGB, or equally RGB -> BGR).
//
// Launch layout: 2D grid of 2D blocks, one thread per pixel; threads whose
// (ix, iy) fall outside the w x h image do nothing.
//
// srcData : input image, h * w * 3 bytes, 3 bytes per pixel
// tgtData : output image, same size and layout as srcData
// h, w    : image height and width in pixels
//
// All three channels of a pixel are read before any of them is written, so
// the kernel also gives the correct result when srcData and tgtData point to
// the same buffer (each thread only touches its own pixel).
__global__ void RGB(const uchar* srcData, uchar* tgtData, const int h, const int w)
{
    const int ix = threadIdx.x + blockIdx.x * blockDim.x;
    const int iy = threadIdx.y + blockIdx.y * blockDim.y;
    if (ix >= w || iy >= h)
    {
        return;
    }

    const int idx3 = (ix + iy * w) * 3;

    // Load first, store afterwards: with interleaved load/store an in-place
    // call would overwrite channel 0 before it is copied to channel 2.
    const uchar c0 = srcData[idx3];
    const uchar c1 = srcData[idx3 + 1];
    const uchar c2 = srcData[idx3 + 2];

    tgtData[idx3] = c2;
    tgtData[idx3 + 1] = c1;
    tgtData[idx3 + 2] = c0;
}
- Bilinear resize
// Bilinear resize of an interleaved 3-channel 8-bit image.
//
// Launch layout: 2D grid of 2D blocks, one thread per target pixel; threads
// whose (ix, iy) fall outside the tgtW x tgtH image do nothing.
//
// srcData    : source image, srcH * srcW * 3 bytes, 3 bytes per pixel
// srcH, srcW : source height and width in pixels
// tgtData    : target image, tgtH * tgtW * 3 bytes, same pixel layout
// tgtH, tgtW : target height and width in pixels
//
// Pixel centres are aligned: target (ix, iy) maps to source
// ((ix + 0.5) / scaleX - 0.5, (iy + 0.5) / scaleY - 0.5).
// The mapped coordinate is clamped to the source image so that
//   * coordinates slightly below 0 (which occur when upscaling) do not yield
//     negative interpolation weights and out-of-range values, and
//   * pixels at the right/bottom border replicate the edge pixel instead of
//     being darkened by weights that no longer sum to 1.
// The interpolated value is rounded to the nearest integer.
__global__ void linear(const uchar* srcData, const int srcH, const int srcW, uchar* tgtData, const int tgtH, const int tgtW)
{
    const int ix = threadIdx.x + blockIdx.x * blockDim.x;
    const int iy = threadIdx.y + blockIdx.y * blockDim.y;
    if (ix >= tgtW || iy >= tgtH || srcW <= 0 || srcH <= 0)
    {
        return;
    }

    const float scaleY = (float)tgtH / (float)srcH;
    const float scaleX = (float)tgtW / (float)srcW;

    // (ix, iy) is the target coordinate, (beforeX, beforeY) the matching
    // source coordinate, clamped to [0, src - 1].
    float beforeX = (ix + 0.5f) / scaleX - 0.5f;
    float beforeY = (iy + 0.5f) / scaleY - 0.5f;
    beforeX = fminf(fmaxf(beforeX, 0.0f), (float)(srcW - 1));
    beforeY = fminf(fmaxf(beforeY, 0.0f), (float)(srcH - 1));

    // The four neighbouring source pixels; the right/bottom neighbours are
    // clamped so that they never leave the image.
    const int leftX = static_cast<int>(beforeX);
    const int topY = static_cast<int>(beforeY);
    const int rightX = (leftX + 1 < srcW) ? leftX + 1 : srcW - 1;
    const int bottomY = (topY + 1 < srcH) ? topY + 1 : srcH - 1;

    // Fractional part of the source coordinate, always in [0, 1).
    const float u = beforeX - leftX;
    const float v = beforeY - topY;

    const float wTopLeft = (1.0f - u) * (1.0f - v);
    const float wTopRight = u * (1.0f - v);
    const float wBottomLeft = (1.0f - u) * v;
    const float wBottomRight = u * v;

    const size_t topLeft = ((size_t)topY * srcW + leftX) * 3;
    const size_t topRight = ((size_t)topY * srcW + rightX) * 3;
    const size_t bottomLeft = ((size_t)bottomY * srcW + leftX) * 3;
    const size_t bottomRight = ((size_t)bottomY * srcW + rightX) * 3;
    const size_t target = ((size_t)iy * tgtW + ix) * 3;

    for (int k = 0; k < 3; k++)
    {
        const float value = wTopLeft * srcData[topLeft + k]
                          + wTopRight * srcData[topRight + k]
                          + wBottomLeft * srcData[bottomLeft + k]
                          + wBottomRight * srcData[bottomRight + k];
        // +0.5f rounds to nearest; the value is non-negative, so truncation
        // after the offset is a correct rounding.
        tgtData[target + k] = static_cast<uchar>(value + 0.5f);
    }
}
- HWC to CHW
// Converts an interleaved 3-channel 8-bit image (HWC) into three consecutive
// single-channel planes (CHW).
//
// Launch layout: 2D grid of 2D blocks, one thread per pixel; threads whose
// position falls outside the w x h image do nothing.
//
// srcData : input, h * w * 3 bytes, 3 bytes per pixel
// tgtData : output, h * w * 3 bytes, plane c starts at offset c * h * w
// h, w    : image height and width in pixels
__global__ void toCHW(const uchar* srcData, uchar* tgtData, const int h, const int w)
{
    const int col = threadIdx.x + blockIdx.x * blockDim.x;
    const int row = threadIdx.y + blockIdx.y * blockDim.y;
    if (col >= w || row >= h)
    {
        return;
    }

    const int pixel = col + row * w;   // position inside one plane
    const int planeSize = h * w;       // number of elements per plane
    const uchar* in = srcData + pixel * 3;

    // Channel c of the pixel goes to the same position in plane c.
    for (int c = 0; c < 3; ++c)
    {
        tgtData[pixel + c * planeSize] = in[c];
    }
}
- Normalize
// Normalizes an interleaved 3-channel 8-bit image to float:
//     out = (in / 255 - mean) / std
// with
//     mean = [0.485, 0.456, 0.406]
//     std  = [0.229, 0.224, 0.225]
// The constants are listed in R, G, B order; the kernel applies them to
// channels 2, 1, 0 respectively, i.e. it treats the input as BGR.
//
// Launch layout: 2D grid of 2D blocks, one thread per pixel; threads whose
// (ix, iy) fall outside the w x h image do nothing.
//
// srcData : input, h * w * 3 bytes, 3 bytes per pixel
// tgtData : output, h * w * 3 floats, same interleaved layout as the input
// h, w    : image height and width in pixels
//
// All literals are single precision so that the arithmetic is not silently
// promoted to double before being stored as float.
__global__ void norm(const uchar* srcData, float* tgtData, const int h, const int w)
{
    const int ix = threadIdx.x + blockIdx.x * blockDim.x;
    const int iy = threadIdx.y + blockIdx.y * blockDim.y;
    if (ix >= w || iy >= h)
    {
        return;
    }

    const int idx3 = (ix + iy * w) * 3;

    tgtData[idx3] = ((float)srcData[idx3] / 255.0f - 0.406f) / 0.225f;         // B pixel
    tgtData[idx3 + 1] = ((float)srcData[idx3 + 1] / 255.0f - 0.456f) / 0.224f; // G pixel
    tgtData[idx3 + 2] = ((float)srcData[idx3 + 2] / 255.0f - 0.485f) / 0.229f; // R pixel
}
- 综合以上预处理操作(注意:并不是简单的拼接)
// cuda image preprocess
// Bilinear resize of an interleaved 3-channel 8-bit image (first stage of
// the combined preprocessing).
//
// Launch layout: 2D grid of 2D blocks, one thread per target pixel; threads
// whose (ix, iy) fall outside the tgtW x tgtH image do nothing.
//
// srcData    : source image, srcH * srcW * 3 bytes, 3 bytes per pixel
// srcH, srcW : source height and width in pixels
// tgtData    : target image, tgtH * tgtW * 3 bytes, same pixel layout
// tgtH, tgtW : target height and width in pixels
//
// Pixel centres are aligned: target (ix, iy) maps to source
// ((ix + 0.5) / scaleX - 0.5, (iy + 0.5) / scaleY - 0.5).
// The mapped coordinate is clamped to the source image so that
//   * coordinates slightly below 0 (which occur when upscaling) do not yield
//     negative interpolation weights and out-of-range values, and
//   * pixels at the right/bottom border replicate the edge pixel instead of
//     being darkened by weights that no longer sum to 1.
// The interpolated value is rounded to the nearest integer.
__global__ void resize(const uchar* srcData, const int srcH, const int srcW, uchar* tgtData, const int tgtH, const int tgtW)
{
    const int ix = threadIdx.x + blockIdx.x * blockDim.x;
    const int iy = threadIdx.y + blockIdx.y * blockDim.y;
    if (ix >= tgtW || iy >= tgtH || srcW <= 0 || srcH <= 0)
    {
        return;
    }

    const float scaleY = (float)tgtH / (float)srcH;
    const float scaleX = (float)tgtW / (float)srcW;

    // (ix, iy) is the target coordinate, (beforeX, beforeY) the matching
    // source coordinate, clamped to [0, src - 1].
    float beforeX = (ix + 0.5f) / scaleX - 0.5f;
    float beforeY = (iy + 0.5f) / scaleY - 0.5f;
    beforeX = fminf(fmaxf(beforeX, 0.0f), (float)(srcW - 1));
    beforeY = fminf(fmaxf(beforeY, 0.0f), (float)(srcH - 1));

    // The four neighbouring source pixels; the right/bottom neighbours are
    // clamped so that they never leave the image.
    const int leftX = static_cast<int>(beforeX);
    const int topY = static_cast<int>(beforeY);
    const int rightX = (leftX + 1 < srcW) ? leftX + 1 : srcW - 1;
    const int bottomY = (topY + 1 < srcH) ? topY + 1 : srcH - 1;

    // Fractional part of the source coordinate, always in [0, 1).
    const float u = beforeX - leftX;
    const float v = beforeY - topY;

    const float wTopLeft = (1.0f - u) * (1.0f - v);
    const float wTopRight = u * (1.0f - v);
    const float wBottomLeft = (1.0f - u) * v;
    const float wBottomRight = u * v;

    const size_t topLeft = ((size_t)topY * srcW + leftX) * 3;
    const size_t topRight = ((size_t)topY * srcW + rightX) * 3;
    const size_t bottomLeft = ((size_t)bottomY * srcW + leftX) * 3;
    const size_t bottomRight = ((size_t)bottomY * srcW + rightX) * 3;
    const size_t target = ((size_t)iy * tgtW + ix) * 3;

    for (int k = 0; k < 3; k++)
    {
        const float value = wTopLeft * srcData[topLeft + k]
                          + wTopRight * srcData[topRight + k]
                          + wBottomLeft * srcData[bottomLeft + k]
                          + wBottomRight * srcData[bottomRight + k];
        // +0.5f rounds to nearest; the value is non-negative, so truncation
        // after the offset is a correct rounding.
        tgtData[target + k] = static_cast<uchar>(value + 0.5f);
    }
}
// Fused channel swap + HWC -> CHW + normalization (second stage of the
// combined preprocessing).
//
// For every pixel the three interleaved input bytes are normalized with
//     out = (in / 255 - mean) / std
//     mean = [0.485, 0.456, 0.406]
//     std  = [0.229, 0.224, 0.225]
// and written to three separate planes: input channel 2 goes to plane 0,
// channel 1 to plane 1 and channel 0 to plane 2. With a BGR input (as the
// per-line labels assume) the output is planar RGB.
//
// Launch layout: 2D grid of 2D blocks, one thread per pixel; threads whose
// (ix, iy) fall outside the w x h image do nothing.
//
// srcData : input, h * w * 3 bytes, 3 bytes per pixel
// tgtData : output, 3 * h * w floats, plane c starts at offset c * h * w
// h, w    : image height and width in pixels
//
// All literals are single precision so that the arithmetic is not silently
// promoted to double before being stored as float.
__global__ void process(const uchar* srcData, float* tgtData, const int h, const int w)
{
    const int ix = threadIdx.x + blockIdx.x * blockDim.x;
    const int iy = threadIdx.y + blockIdx.y * blockDim.y;
    if (ix >= w || iy >= h)
    {
        return;
    }

    const int idx = ix + iy * w;
    const int idx3 = idx * 3;
    const int plane = h * w;

    tgtData[idx] = ((float)srcData[idx3 + 2] / 255.0f - 0.485f) / 0.229f;             // R pixel
    tgtData[idx + plane] = ((float)srcData[idx3 + 1] / 255.0f - 0.456f) / 0.224f;     // G pixel
    tgtData[idx + plane * 2] = ((float)srcData[idx3] / 255.0f - 0.406f) / 0.225f;     // B pixel
}
// Runs the complete preprocessing on the GPU: uploads the image, resizes it
// with the `resize` kernel, converts it with the `process` kernel and copies
// the float result back to the host.
//
// srcImg    : 8-bit, 3-channel host image (CV_8UC3); may be non-continuous
//             (e.g. a ROI), in which case a packed copy is made first
// dstData   : host buffer receiving dstHeight * dstWidth * 3 floats
// dstHeight : height of the output in pixels, must be > 0
// dstWidth  : width of the output in pixels, must be > 0
//
// Throws cv::Exception if an argument is invalid or if any CUDA call or
// kernel launch fails. All device buffers are released on every path.
void preprocess(const cv::Mat& srcImg, float* dstData, const int dstHeight, const int dstWidth)
{
    CV_Assert(dstData != nullptr);
    CV_Assert(dstHeight > 0 && dstWidth > 0);
    CV_Assert(!srcImg.empty() && srcImg.type() == CV_8UC3);

    // The upload below copies rows * cols * 3 consecutive bytes, which is only
    // valid for an image without row padding.
    const cv::Mat packedSrc = srcImg.isContinuous() ? srcImg : srcImg.clone();

    const int srcHeight = packedSrc.rows;
    const int srcWidth = packedSrc.cols;
    // size_t: the element counts of large images do not fit into an int.
    const size_t srcElements = (size_t)srcHeight * srcWidth * 3;
    const size_t dstElements = (size_t)dstHeight * dstWidth * 3;

    float* dstDevData = nullptr; // target data on device
    uchar* midDevData = nullptr; // resized image on device (output of the bilinear resize)
    uchar* srcDevData = nullptr; // source image on device

    // Every step runs only while all previous steps succeeded, so the first
    // error is kept in `status` and reported after cleanup.
    cudaError_t status = cudaMalloc((void**)&dstDevData, sizeof(float) * dstElements);
    if (status == cudaSuccess)
    {
        status = cudaMalloc((void**)&midDevData, sizeof(uchar) * dstElements);
    }
    if (status == cudaSuccess)
    {
        status = cudaMalloc((void**)&srcDevData, sizeof(uchar) * srcElements);
    }
    if (status == cudaSuccess)
    {
        status = cudaMemcpy(srcDevData, packedSrc.data, sizeof(uchar) * srcElements, cudaMemcpyHostToDevice);
    }

    if (status == cudaSuccess)
    {
        const dim3 blockSize(32, 32);
        const dim3 gridSize((dstWidth + blockSize.x - 1) / blockSize.x, (dstHeight + blockSize.y - 1) / blockSize.y);

        // bilinear resize
        resize<<<gridSize, blockSize>>>(srcDevData, srcHeight, srcWidth, midDevData, dstHeight, dstWidth);
        status = cudaGetLastError();

        if (status == cudaSuccess)
        {
            // hwc to chw / bgr to rgb / normalize
            // No device-wide synchronization is needed in between: both
            // kernels are launched on the same (default) stream and therefore
            // execute in order.
            process<<<gridSize, blockSize>>>(midDevData, dstDevData, dstHeight, dstWidth);
            status = cudaGetLastError();
        }
    }

    if (status == cudaSuccess)
    {
        // Blocking copy: waits for the kernels and reports their execution errors.
        status = cudaMemcpy(dstData, dstDevData, sizeof(float) * dstElements, cudaMemcpyDeviceToHost);
    }

    // cudaFree accepts null pointers, so this is safe after a partial setup.
    cudaFree(srcDevData);
    cudaFree(midDevData);
    cudaFree(dstDevData);

    if (status != cudaSuccess)
    {
        CV_Error(cv::Error::GpuApiCallError, cudaGetErrorString(status));
    }
}
Git地址
完整的cuda图像预处理代码链接:cuda-image-preprocess