ResNet C++ Deployment: PyTorch Functional Test (1)
ResNet C++ Deployment: Model Training (2)
ResNet C++ Deployment: Model Testing & ONNX Conversion (3)
ResNet C++ Deployment: TensorRT Deployment (4)
With that done, we move on to converting the ONNX model to a TensorRT engine and testing the deployment.
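As an optional sanity check before running the C++ code below, TensorRT's bundled trtexec tool can confirm that the ONNX file parses and builds into an engine. The file names here are placeholders:

```bash
# Placeholder file names; --fp16 mirrors the kFP16 flag set in the code below.
trtexec --onnx=resnet.onnx --saveEngine=resnet.engine --fp16
```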
1 Code
This is the core code. Adjust the parameters in the main function and in the inference function, and it is ready to run.
#include "NvInfer.h"
#include "cuda_runtime_api.h"
#include <fstream>
#include <iostream>
#include <map>
#include <sstream>
#include <vector>
#include <chrono>
#include <cmath>
#include <cassert>
#include <Windows.h>
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/opencv.hpp>
// Header for the ONNX parser
#include "NvOnnxParser.h"
#include "read_config.hpp"
#include "labels.hpp"
#include "NvInferPlugin.h"
using namespace nvonnxparser;
using namespace std;
#define CHECK(status) \
do \
{ \
    auto ret = (status); \
    if (ret != 0) \
    { \
        std::cerr << "Cuda failure: " << ret << std::endl; \
        abort(); \
    } \
} while (0)
// stuff we know about the network and the input/output blobs
static const int INPUT_H = 224;
static const int INPUT_W = 224;
static const int OUTPUT_SIZE = 2;
const char* INPUT_BLOB_NAME = "images";
const char* OUTPUT_BLOB_NAME = "prob";
void* global_buffers[2];
using namespace nvinfer1;
//static Logger gLogger;
// Construct the logger
class Logger : public ILogger
{
void log(Severity severity, const char* msg) noexcept override
{
// suppress info-level messages
if (severity <= Severity::kWARNING)
std::cout << msg << std::endl;
}
} gLogger;
// Create the engine by parsing the ONNX model.
ICudaEngine* createEngine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config,string onnx_name)
{
int dir_l = 0;
int dir_r = onnx_name.rfind(".");
string enginePath;
onnx_name = onnx_name.substr(dir_l, dir_r)+".onnx";
std::cout << "onnx_name:" << onnx_name << std::endl;
const char* onnx_path = onnx_name.c_str();
INetworkDefinition* network = builder->createNetworkV2(1U); // Important: 1U sets the explicit-batch flag; passing 0U causes problems here
IParser* parser = createParser(*network, gLogger);
parser->parseFromFile(onnx_path, static_cast<int32_t>(ILogger::Severity::kWARNING));
for (int32_t i = 0; i < parser->getNbErrors(); ++i) { std::cout << parser->getError(i)->desc() << std::endl; }
std::cout << "successfully load the onnx model" << std::endl;
// Build engine
builder->setMaxBatchSize(maxBatchSize);
config->setMaxWorkspaceSize(1 << 20);
config->setFlag(nvinfer1::BuilderFlag::kFP16); // enable FP16 precision
//config->setFlag(nvinfer1::BuilderFlag::kINT8);
ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
std::cout << "successfully create engine " << std::endl;
// Destroy intermediate objects
network->destroy();
parser->destroy();
return engine;
}
void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream, string trt_name)
{
// Create builder
IBuilder* builder = createInferBuilder(gLogger);
IBuilderConfig* config = builder->createBuilderConfig();
std::cout << "trt_name:" << trt_name << std::endl;
// Create model to populate the network, then set the outputs and create an engine
ICudaEngine* engine = createEngine(maxBatchSize, builder, config, trt_name);
assert(engine != nullptr);
// Serialize the engine
(*modelStream) = engine->serialize();
// Close everything down
engine->destroy();
builder->destroy();
config->destroy();
}
void do_Initial(int batchSize)
{
//void* buffers[2];
//buffers[0] = global_buffers[0];
//buffers[1] = global_buffers[1];
// Pointers to input and output device buffers to pass to engine.
// Engine requires exactly IEngine::getNbBindings() number of buffers.
// float* m_bindings[2];
// In order to bind the buffers, we need to know the names of the input and output tensors.
// Note that indices are guaranteed to be less than IEngine::getNbBindings()
const int inputIndex = 0; //inputIndex = 0
const int outputIndex = 1;//outputIndex = 1
// Create GPU buffers on device
CHECK(cudaMalloc(&global_buffers[inputIndex], batchSize * 3 * INPUT_H * INPUT_W * sizeof(float)));
CHECK(cudaMalloc(&global_buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float)));
}
void do_Inference(IExecutionContext *context_,float* input, float* output, int batchSize,cudaStream_t &stream)
{
const int inputIndex = 0; //inputIndex = 0
const int outputIndex = 1;//outputIndex = 1
//void* buffers[2];
//buffers[0] = global_buffers[0];
//buffers[1] = global_buffers[1];
// GPU buffers were already created once in do_Initial
//CHECK(cudaMalloc(&buffers[inputIndex], batchSize * 3 * INPUT_H * INPUT_W * sizeof(float)));
//CHECK(cudaMalloc(&buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float)));
// Create stream (note: a new stream is created on every call, but only the last one is destroyed in do_uninitial)
CHECK(cudaStreamCreate(&stream));
// DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
CHECK(cudaMemcpyAsync(global_buffers[inputIndex], input, batchSize * 3 * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream));
context_->enqueueV2(global_buffers, stream, nullptr);//Changed by xfx20241202
CHECK(cudaMemcpyAsync(output, global_buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream));
cudaStreamSynchronize(stream);
}
void do_uninitial(cudaStream_t &stream, void* buffers[2])
{
cudaStreamDestroy(stream);
CHECK(cudaFree(buffers[0]));
CHECK(cudaFree(buffers[1]));
}
// Pack the images into a single flat batched buffer in the NCHW layout TensorRT expects
void ProcessImage(vector<cv::Mat> images, float input_data[],const int batch_tem) {
// The result is a flat array of shape [batch * 3 * INPUT_H * INPUT_W]
// The code below takes a bit of a shortcut
std::vector<cv::Mat> InputImage;
if(images.size() != batch_tem)
{
std::cout << "image batch is unequal to batch_tem" << std::endl;
exit(-1);
}
for (int i = 0; i < batch_tem; ++i)
{
cv::resize(images[i], images[i], cv::Size(INPUT_W, INPUT_H), 0, 0, cv::INTER_LINEAR);
InputImage.push_back(images[i]);
}
int ImgCount = InputImage.size();
//std::cout <<"ImgCount:" << ImgCount << std::endl;
//float input_data[BatchSize * 3 * INPUT_H * INPUT_W];
for (int b = 0; b < ImgCount; b++) {
cv::Mat img = InputImage.at(b);
int w = img.cols;
int h = img.rows;
int i = 0;
for (int row = 0; row < h; ++row) { // after the resize above, h == INPUT_H and w == INPUT_W
uchar* uc_pixel = img.data + row * img.step;
for (int col = 0; col < INPUT_W; ++col) {
input_data[b * 3 * INPUT_H * INPUT_W + i] = (float)uc_pixel[2] / 255.0;
input_data[b * 3 * INPUT_H * INPUT_W + i + INPUT_H * INPUT_W] = (float)uc_pixel[1] / 255.0;
input_data[b * 3 * INPUT_H * INPUT_W + i + 2 * INPUT_H * INPUT_W] = (float)uc_pixel[0] / 255.0;
uc_pixel += 3;
++i;
}
}
}
}
int get_trtengine(string trt_name) {
int dir_l = 0;
int dir_r = trt_name.rfind(".");
trt_name = trt_name.substr(dir_l, dir_r) + ".engine";
IHostMemory* modelStream{ nullptr };
APIToModel(100, &modelStream, trt_name);
assert(modelStream != nullptr);
std::ofstream p(trt_name, std::ios::binary);
if (!p)
{
std::cerr << "could not open plan output file" << std::endl;
return -1;
}
p.write(reinterpret_cast<const char*>(modelStream->data()), modelStream->size());
modelStream->destroy();
return 0;
}
int infer(string trt_name, const int batch_tem_, int loop_time_) {
int batch_tem = batch_tem_;
int loop_time = loop_time_;
int dir_l = 0;
int dir_r = trt_name.rfind(".");
trt_name = trt_name.substr(dir_l, dir_r) + ".engine";
// Load the serialized engine
char* trtModelStream{ nullptr };
size_t size{ 0 };
std::ifstream file(trt_name, std::ios::binary);
if (file.good()) {
file.seekg(0, file.end);
size = file.tellg();
file.seekg(0, file.beg);
trtModelStream = new char[size];
assert(trtModelStream);
file.read(trtModelStream, size);
file.close();
}
else {
std::cerr << "could not open engine file: " << trt_name << std::endl;
return -1;
}
// Deserialize into an engine and create the execution context
IRuntime* runtime = createInferRuntime(gLogger);
assert(runtime != nullptr);
ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size, nullptr);
assert(engine != nullptr);
IExecutionContext* context = engine->createExecutionContext();
assert(context != nullptr);
delete[] trtModelStream;
//********************* Inference *********************//
// Inference loop
float time_read_img = 0.0;
float time_infer = 0.0;
float *prob = new float[batch_tem*OUTPUT_SIZE];
float *data = new float[batch_tem * 3 * INPUT_H * INPUT_W];
do_Initial(batch_tem);
cudaStream_t stream;
for (int loop = 0; loop < loop_time; loop++)
{
// Preprocess the images into the fixed-size input buffer
auto start = std::chrono::system_clock::now(); // start timing image reading/preprocessing
std::string path2 = "./data/cat.png";
vector<cv::Mat> images;
cv::Mat img2 = cv::imread(path2);
// Duplicate the same image to fill the batch
for (int i = static_cast<int>(images.size()); i < batch_tem; ++i)
{
images.push_back(img2);
}
ProcessImage(images, data, batch_tem);
auto end = std::chrono::system_clock::now();
time_read_img = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() + time_read_img;
start = std::chrono::system_clock::now(); // restart timing for inference
for (int i = 0; i < 1; ++i)
{
do_Inference(context, data, prob, batch_tem, stream);
}
end = std::chrono::system_clock::now();
time_infer = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() + time_infer;
//std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;
// Post-process the output
ImageNetLabels labels;
for (int batch = 0; batch < batch_tem; ++batch)
{
float cls_float = prob[batch * OUTPUT_SIZE]; // initialize argmax from this batch's first logit
int cls_id = batch * OUTPUT_SIZE;
for (int i = (0+batch)* OUTPUT_SIZE; i < (1+batch)*OUTPUT_SIZE; i++)
{
if (cls_float < prob[i])
{
cls_float = prob[i];
cls_id = i;
}
printf("ID %d %f: %s\n", i% OUTPUT_SIZE, prob[i],labels.imagenet_labelstring(i%1000).c_str());
}
//printf("LOOP_time:%d batch: %d result %d \n", loop,batch, cls_id % 100);
printf("Batch:%d ClassId:%d Class name:%s \n", batch, cls_id%OUTPUT_SIZE, labels.imagenet_labelstring(cls_id % 1000).c_str());
}
}
do_uninitial(stream, global_buffers);
std::cout << "C++ engine" << "mean read img time = " << time_read_img / loop_time << "ms\t" << "mean infer img time =" << time_infer / loop_time << "ms" << std::endl;
// Free host buffers and destroy the engine
delete[] prob;
delete[] data;
context->destroy();
engine->destroy();
runtime->destroy();
return 0;
}
int main(int argc, char** argv)
{
bool didInitPlugins = initLibNvInferPlugins(nullptr, "");
string init_config_path = "./config/config.yaml";
InitParameter m_init_para=yaml_read(init_config_path);
std::cout <<"batch size:" << m_init_para.batch_size << std::endl;
std::cout << "loop time:" << m_init_para.batch_size << std::endl;
std::cout << "deylay time:" << m_init_para.delay_time << std::endl;
std::cout << "model path:" << m_init_para.model_path << std::endl;
std::cout << "mode:" << m_init_para.mode<<std::endl;
// string mode = argv[1];
string mode = m_init_para.mode; // for Windows builds, the mode is fixed in the config file instead of argv
//if (std::string(argv[1]) == "-s") {
if (mode == "-s") {
std::cout << "m_init_para.model_path:" << m_init_para.model_path << std::endl;
get_trtengine(m_init_para.model_path);
}
//else if (std::string(argv[1]) == "-d") {
else if (mode == "-d") {
infer(m_init_para.model_path, m_init_para.batch_size, m_init_para.loop_time);
}
else {
return -1;
}
return 0;
}
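For reference, the config.yaml consumed by yaml_read might look like the sketch below. read_config.hpp is not shown in this post, so the key names and the example model path are assumptions:

```yaml
# Hypothetical config.yaml; key names must match what read_config.hpp actually parses.
mode: "-s"                          # "-s": build and serialize the engine; "-d": run inference
model_path: "./model/resnet.onnx"   # placeholder path
batch_size: 1
loop_time: 10
delay_time: 0
```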
2 Precision Comparison
| Image | .pt | .onnx | .trt |
|---|---|---|---|
| cat | [1.0000e+00, 1.4013e-45] | [1.0000e+00, 1.4013e-45] | [0.999920, 0.000080] |
3 Conclusion
As the table shows, the ONNX model suffers almost no precision loss, while the .trt engine shows a noticeably larger deviation. This is expected here, since the engine was built with the kFP16 flag and therefore runs in half precision.
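To verify that FP16 is the cause, the engine can be rebuilt in full FP32 by leaving out the kFP16 flag in createEngine above. A minimal sketch of the relevant lines (everything else in createEngine stays the same):

```cpp
// FP32 build: identical to the build code above, minus the kFP16 flag.
builder->setMaxBatchSize(maxBatchSize);
config->setMaxWorkspaceSize(1 << 20);
// config->setFlag(nvinfer1::BuilderFlag::kFP16); // left disabled for a full-precision build
ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
```

If the FP32 engine's outputs then match the ONNX results closely, the gap in the table is attributable to half precision rather than to the conversion itself.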