Bootstrap

Cuda编程加速图像预处理

项目简介

  • 基于 CUDA 和 OpenCV 环境

  • 目标:

    • 单独使用,以加速图像处理操作;
    • 结合 TensorRT 使用,进一步加快推理速度

加速效果

  • 这里对比 Deeplabv3+ 使用 cuda 预处理前后的 tensorrt 推理速度
  • 未使用cuda图像预处理的代码,可参考博主的另一个 tensorrt 的项目
| 预处理方式 | FP32 | FP16 | INT8 |
| --- | --- | --- | --- |
| C++图像预处理 | 22 ms | 12 ms | 10 ms |
| CUDA图像预处理 | 15 ms | 5 ms | 3 ms |
  • 对比 YOLOv5-v5.0 使用 cuda 预处理前后的 tensorrt 推理速度
| 预处理方式 | FP32 | FP16 | INT8 |
| --- | --- | --- | --- |
| C++图像预处理 | 12 ms | 8 ms | 6 ms |
| CUDA图像预处理 | 6 ms | 3 ms | 3 ms |

YOLOv5 TensorRT 推理代码源自作者的其他项目,分为两个版本:C++预处理、CUDA预处理。

Cuda代码

核心的核函数代码如下所示:

  • BGR to RGB
// Swaps the first and third channel of every pixel (BGR <-> RGB) in an
// interleaved 3-channel 8-bit image.
//
// Launch layout: 2D grid, one thread per pixel; x indexes columns and y
// indexes rows. Threads that fall outside the w x h image do nothing.
//
// srcData : input pixels, addressed as (x + y * w) * 3 + channel
// tgtData : output pixels, same addressing as srcData
// h, w    : image height and width in pixels
__global__ void RGB(const uchar* srcData, uchar* tgtData, const int h, const int w)
{
    const int col = threadIdx.x + blockIdx.x * blockDim.x;
    const int row = threadIdx.y + blockIdx.y * blockDim.y;

    // Guard the grid tail: the launch may cover more threads than pixels.
    if (col >= w || row >= h)
    {
        return;
    }

    // Offset of this pixel's first channel in the interleaved buffers.
    const int base = (col + row * w) * 3;
    const uchar* in = srcData + base;
    uchar* out = tgtData + base;

    // Same store order as a straight channel-by-channel copy: 0, 1, 2.
    out[0] = in[2];
    out[1] = in[1];
    out[2] = in[0];
}
  • Bilinear resize
// Bilinear resize of an interleaved 3-channel 8-bit image.
//
// Launch layout: 2D grid, one thread per destination pixel; x indexes
// columns and y indexes rows. Threads outside tgtW x tgtH return at once.
//
// srcData    : source pixels, addressed as (x + y * srcW) * 3 + channel
// srcH, srcW : source height / width in pixels
// tgtData    : destination pixels, addressed as (x + y * tgtW) * 3 + channel
// tgtH, tgtW : destination height / width in pixels
//
// Sampling is pixel-centre aligned: src = (dst + 0.5) * (srcSize / tgtSize) - 0.5.
// The source coordinate is clamped to [0, size - 1] and the right / bottom
// neighbour is clamped to the last column / row, so the four weights always
// form a convex combination (they sum to 1 and none is negative). This
// replicates the edge instead of extrapolating or darkening border pixels.
// The interpolated value is rounded to nearest before being stored.
__global__ void linear(const uchar* srcData, const int srcH, const int srcW, uchar* tgtData, const int tgtH, const int tgtW)
{
    const int ix = threadIdx.x + blockIdx.x * blockDim.x;
    const int iy = threadIdx.y + blockIdx.y * blockDim.y;

    // Grid-tail guard, plus protection against an empty source image.
    if (ix >= tgtW || iy >= tgtH || srcW <= 0 || srcH <= 0)
    {
        return;
    }

    // Source pixels covered by one destination pixel along each axis.
    const float stepX = (float)srcW / (float)tgtW;
    const float stepY = (float)srcH / (float)tgtH;

    // (ix, iy) is the destination coordinate; (beforeX, beforeY) is the
    // matching source coordinate.
    float beforeX = (ix + 0.5f) * stepX - 0.5f;
    float beforeY = (iy + 0.5f) * stepY - 0.5f;

    // Clamp into the image. Without this the first rows / columns get a
    // negative coordinate when upscaling, which yields negative weights.
    beforeX = fminf(fmaxf(beforeX, 0.0f), (float)(srcW - 1));
    beforeY = fminf(fmaxf(beforeY, 0.0f), (float)(srcH - 1));

    // Four neighbouring source pixels. The coordinate is non-negative here,
    // so the integer cast is a floor.
    const int leftX = static_cast<int>(beforeX);
    const int topY = static_cast<int>(beforeY);
    const int rightX = min(leftX + 1, srcW - 1);
    const int bottomY = min(topY + 1, srcH - 1);

    // Fractional part of the source coordinate, in [0, 1).
    const float u = beforeX - leftX;
    const float v = beforeY - topY;

    const float wTopLeft = (1.0f - u) * (1.0f - v);
    const float wTopRight = u * (1.0f - v);
    const float wBottomLeft = (1.0f - u) * v;
    const float wBottomRight = u * v;

    const int topLeft = (leftX + topY * srcW) * 3;
    const int topRight = (rightX + topY * srcW) * 3;
    const int bottomLeft = (leftX + bottomY * srcW) * 3;
    const int bottomRight = (rightX + bottomY * srcW) * 3;
    const int idx3 = (ix + iy * tgtW) * 3;

    for (int k = 0; k < 3; k++)
    {
        const float value = wTopLeft * srcData[topLeft + k]
                          + wTopRight * srcData[topRight + k]
                          + wBottomLeft * srcData[bottomLeft + k]
                          + wBottomRight * srcData[bottomRight + k];
        // Round to nearest; the upper clamp absorbs float rounding error.
        tgtData[idx3 + k] = static_cast<uchar>(fminf(value + 0.5f, 255.0f));
    }
}
  • HWC to CHW
// Rearranges an interleaved 3-channel 8-bit image (HWC) into three
// consecutive planes (CHW).
//
// Launch layout: 2D grid, one thread per pixel; x indexes columns and y
// indexes rows. Threads that fall outside the w x h image do nothing.
//
// srcData : input pixels, addressed as (x + y * w) * 3 + channel
// tgtData : output, addressed as channel * h * w + (x + y * w)
// h, w    : image height and width in pixels
__global__ void toCHW(const uchar* srcData, uchar* tgtData, const int h, const int w)
{
    const int col = threadIdx.x + blockIdx.x * blockDim.x;
    const int row = threadIdx.y + blockIdx.y * blockDim.y;

    // Guard the grid tail: the launch may cover more threads than pixels.
    if (col >= w || row >= h)
    {
        return;
    }

    const int pixel = col + row * w;      // position inside one plane
    const uchar* in = srcData + pixel * 3;  // first channel of this pixel

    // Scatter the three interleaved channels into their planes.
    tgtData[pixel] = in[0];
    tgtData[pixel + h * w] = in[1];
    tgtData[pixel + h * w * 2] = in[2];
}
  • Normalize
// Normalizes an interleaved 3-channel 8-bit image to float:
//     out = (in / 255 - mean) / std
// with mean = [0.485, 0.456, 0.406] and std = [0.229, 0.224, 0.225] given in
// R, G, B order. The input is read in B, G, R channel order, so channel 0
// uses the B statistics and channel 2 uses the R statistics.
//
// Launch layout: 2D grid, one thread per pixel; x indexes columns and y
// indexes rows. Threads that fall outside the w x h image do nothing.
//
// srcData : input pixels, addressed as (x + y * w) * 3 + channel
// tgtData : output floats, same interleaved addressing as srcData
// h, w    : image height and width in pixels
//
// All constants are float literals so the per-pixel arithmetic stays in
// single precision instead of being promoted to double.
__global__ void norm(const uchar* srcData, float* tgtData, const int h, const int w)
{
    int ix = threadIdx.x + blockIdx.x * blockDim.x;
    int iy = threadIdx.y + blockIdx.y * blockDim.y;
    int idx = ix + iy * w;
    int idx3 = idx * 3;

    if (ix < w && iy < h)
    {
        tgtData[idx3] = ((float)srcData[idx3] / 255.0f - 0.406f) / 0.225f;  // B pixel
        tgtData[idx3 + 1] = ((float)srcData[idx3 + 1] / 255.0f - 0.456f) / 0.224f;  // G pixel
        tgtData[idx3 + 2] = ((float)srcData[idx3 + 2] / 255.0f - 0.485f) / 0.229f;  // R pixel
    }
}
  • 综合以上预处理操作(注意:并不是简单的拼接)
// cuda image preprocess

// Bilinear resize of an interleaved 3-channel 8-bit image.
//
// Launch layout: 2D grid, one thread per destination pixel; x indexes
// columns and y indexes rows. Threads outside tgtW x tgtH return at once.
//
// srcData    : source pixels, addressed as (x + y * srcW) * 3 + channel
// srcH, srcW : source height / width in pixels
// tgtData    : destination pixels, addressed as (x + y * tgtW) * 3 + channel
// tgtH, tgtW : destination height / width in pixels
//
// Sampling is pixel-centre aligned: src = (dst + 0.5) * (srcSize / tgtSize) - 0.5.
// The source coordinate is clamped to [0, size - 1] and the right / bottom
// neighbour is clamped to the last column / row, so the four weights always
// form a convex combination (they sum to 1 and none is negative). This
// replicates the edge instead of extrapolating or darkening border pixels.
// The interpolated value is rounded to nearest before being stored.
__global__ void resize(const uchar* srcData, const int srcH, const int srcW, uchar* tgtData, const int tgtH, const int tgtW)
{
    const int ix = threadIdx.x + blockIdx.x * blockDim.x;
    const int iy = threadIdx.y + blockIdx.y * blockDim.y;

    // Grid-tail guard, plus protection against an empty source image.
    if (ix >= tgtW || iy >= tgtH || srcW <= 0 || srcH <= 0)
    {
        return;
    }

    // Source pixels covered by one destination pixel along each axis.
    const float stepX = (float)srcW / (float)tgtW;
    const float stepY = (float)srcH / (float)tgtH;

    // (ix, iy) is the destination coordinate; (beforeX, beforeY) is the
    // matching source coordinate.
    float beforeX = (ix + 0.5f) * stepX - 0.5f;
    float beforeY = (iy + 0.5f) * stepY - 0.5f;

    // Clamp into the image. Without this the first rows / columns get a
    // negative coordinate when upscaling, which yields negative weights.
    beforeX = fminf(fmaxf(beforeX, 0.0f), (float)(srcW - 1));
    beforeY = fminf(fmaxf(beforeY, 0.0f), (float)(srcH - 1));

    // Four neighbouring source pixels. The coordinate is non-negative here,
    // so the integer cast is a floor.
    const int leftX = static_cast<int>(beforeX);
    const int topY = static_cast<int>(beforeY);
    const int rightX = min(leftX + 1, srcW - 1);
    const int bottomY = min(topY + 1, srcH - 1);

    // Fractional part of the source coordinate, in [0, 1).
    const float u = beforeX - leftX;
    const float v = beforeY - topY;

    const float wTopLeft = (1.0f - u) * (1.0f - v);
    const float wTopRight = u * (1.0f - v);
    const float wBottomLeft = (1.0f - u) * v;
    const float wBottomRight = u * v;

    const int topLeft = (leftX + topY * srcW) * 3;
    const int topRight = (rightX + topY * srcW) * 3;
    const int bottomLeft = (leftX + bottomY * srcW) * 3;
    const int bottomRight = (rightX + bottomY * srcW) * 3;
    const int idx3 = (ix + iy * tgtW) * 3;

    for (int k = 0; k < 3; k++)
    {
        const float value = wTopLeft * srcData[topLeft + k]
                          + wTopRight * srcData[topRight + k]
                          + wBottomLeft * srcData[bottomLeft + k]
                          + wBottomRight * srcData[bottomRight + k];
        // Round to nearest; the upper clamp absorbs float rounding error.
        tgtData[idx3 + k] = static_cast<uchar>(fminf(value + 0.5f, 255.0f));
    }
}


// Fused post-resize step: channel reorder, HWC -> CHW and normalization in a
// single pass.
//
// For every pixel the three interleaved input channels are read in B, G, R
// order and written to three consecutive output planes in R, G, B order as
//     out = (in / 255 - mean) / std
// with mean = [0.485, 0.456, 0.406] and std = [0.229, 0.224, 0.225] (R, G, B).
//
// Launch layout: 2D grid, one thread per pixel; x indexes columns and y
// indexes rows. Threads that fall outside the w x h image do nothing.
//
// srcData : input pixels, addressed as (x + y * w) * 3 + channel
// tgtData : output floats, addressed as plane * h * w + (x + y * w)
// h, w    : image height and width in pixels
//
// All constants are float literals so the per-pixel arithmetic stays in
// single precision instead of being promoted to double.
__global__ void process(const uchar* srcData, float* tgtData, const int h, const int w)
{
    int ix = threadIdx.x + blockIdx.x * blockDim.x;
    int iy = threadIdx.y + blockIdx.y * blockDim.y;
    int idx = ix + iy * w;
    int idx3 = idx * 3;

    if (ix < w && iy < h)
    {
        tgtData[idx] = ((float)srcData[idx3 + 2] / 255.0f - 0.485f) / 0.229f;  // R pixel
        tgtData[idx + h * w] = ((float)srcData[idx3 + 1] / 255.0f - 0.456f) / 0.224f;  // G pixel
        tgtData[idx + h * w * 2] = ((float)srcData[idx3] / 255.0f - 0.406f) / 0.225f;  // B pixel
    }
}


// Host entry point: uploads an 8-bit 3-channel image, resizes it on the GPU
// with the `resize` kernel, converts it with the `process` kernel and copies
// the float result back to host memory.
//
// srcImg    : source image; must be non-empty and of type CV_8UC3. It may be
//             non-continuous (e.g. an ROI): rows are then uploaded with a
//             pitched copy that honours srcImg.step.
// dstData   : host buffer receiving dstHeight * dstWidth * 3 floats
// dstHeight : target height in pixels (> 0)
// dstWidth  : target width in pixels (> 0)
//
// Throws cv::Exception when a precondition is violated or when any CUDA call
// or kernel launch fails. Device buffers are released on every path.
void preprocess(const cv::Mat& srcImg, float* dstData, const int dstHeight, const int dstWidth)
{
    CV_Assert(!srcImg.empty() && srcImg.type() == CV_8UC3);
    CV_Assert(dstData != nullptr && dstHeight > 0 && dstWidth > 0);

    const int srcHeight = srcImg.rows;
    const int srcWidth = srcImg.cols;
    // Sizes are computed in size_t so large images cannot overflow int.
    const size_t srcRowBytes = static_cast<size_t>(srcWidth) * 3 * sizeof(uchar);
    const size_t srcBytes = srcRowBytes * static_cast<size_t>(srcHeight);
    const size_t dstElements = static_cast<size_t>(dstHeight) * static_cast<size_t>(dstWidth) * 3;

    // Initialised to nullptr so the cleanup below is safe after a partial
    // failure (cudaFree on a null pointer is a no-op).
    float* dstDevData = nullptr;  // target data on device
    uchar* midDevData = nullptr;  // resized intermediate image on device
    uchar* srcDevData = nullptr;  // source image on device

    // Each step runs only if everything before it succeeded.
    cudaError_t status = cudaMalloc((void**)&dstDevData, sizeof(float) * dstElements);
    if (status == cudaSuccess)
    {
        status = cudaMalloc((void**)&midDevData, sizeof(uchar) * dstElements);
    }
    if (status == cudaSuccess)
    {
        status = cudaMalloc((void**)&srcDevData, srcBytes);
    }
    if (status == cudaSuccess)
    {
        // A continuous Mat is one dense block; otherwise rows are srcImg.step
        // bytes apart on the host and must be packed while copying.
        status = srcImg.isContinuous()
            ? cudaMemcpy(srcDevData, srcImg.data, srcBytes, cudaMemcpyHostToDevice)
            : cudaMemcpy2D(srcDevData, srcRowBytes, srcImg.data, srcImg.step,
                           srcRowBytes, srcHeight, cudaMemcpyHostToDevice);
    }

    if (status == cudaSuccess)
    {
        dim3 blockSize(32, 32);
        dim3 gridSize((dstWidth + blockSize.x - 1) / blockSize.x, (dstHeight + blockSize.y - 1) / blockSize.y);

        // bilinear resize
        resize<<<gridSize, blockSize>>>(srcDevData, srcHeight, srcWidth, midDevData, dstHeight, dstWidth);
        status = cudaGetLastError();

        if (status == cudaSuccess)
        {
            // hwc to chw / bgr to rgb / normalize
            // Both kernels are issued to the same stream, so this one starts
            // only after the resize has finished; no explicit sync is needed.
            process<<<gridSize, blockSize>>>(midDevData, dstDevData, dstHeight, dstWidth);
            status = cudaGetLastError();
        }
        if (status == cudaSuccess)
        {
            // Blocking copy: waits for the kernels and reports their errors.
            status = cudaMemcpy(dstData, dstDevData, sizeof(float) * dstElements, cudaMemcpyDeviceToHost);
        }
    }

    cudaFree(srcDevData);
    cudaFree(midDevData);
    cudaFree(dstDevData);

    if (status != cudaSuccess)
    {
        CV_Error(cv::Error::GpuApiCallError, cudaGetErrorString(status));
    }
}

Git地址

完整的cuda图像预处理代码链接:cuda-image-preprocess

;