ResNet C++ Deployment: PyTorch Functional Test (1)
ResNet C++ Deployment: Model Training (2)
ResNet C++ Deployment: Model Testing & ONNX Conversion (3)
ResNet C++ Deployment: TensorRT Deployment (4)
With that done, we move on to converting the ONNX model to a TensorRT engine and testing the deployment.
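As an optional sanity check before running the C++ code below, TensorRT's bundled trtexec tool can confirm that the ONNX file parses and builds into an engine. The file names here are placeholders:

```bash
# Placeholder file names; --fp16 mirrors the kFP16 flag set in the code below.
trtexec --onnx=resnet.onnx --saveEngine=resnet.engine --fp16
```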
1 Code
This is the core code. Adjust the parameters in the main function and in the inference function, and it is ready to run.
#include "NvInfer.h"
#include "cuda_runtime_api.h"
#include <fstream>
#include <iostream>
#include <map>
#include <sstream>
#include <vector>
#include <chrono>
#include <cmath>
#include <cassert>
#include <Windows.h>
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/opencv.hpp>
// Header for the ONNX parser
#include "NvOnnxParser.h"
#include "read_config.hpp"
#include "labels.hpp"
#include "NvInferPlugin.h"
using namespace nvonnxparser;
using namespace std;
#define CHECK(status) \
do \
{ \
    auto ret = (status); \
    if (ret != 0) \
    { \
        std::cerr << "Cuda failure: " << ret << std::endl; \
        abort(); \
    } \
} while (0)
// stuff we know about the network and the input/output blobs
static const int INPUT_H = 224;
static const int INPUT_W = 224;
static const int OUTPUT_SIZE = 2;
const char* INPUT_BLOB_NAME = "images";
const char* OUTPUT_BLOB_NAME = "prob";
void* global_buffers[2];
using namespace nvinfer1;
//static Logger gLogger;
// Construct the logger
class Logger : public ILogger
{
void log(Severity severity, const char* msg) noexcept override
{
// suppress info-level messages
if (severity <= Severity::kWARNING)
std::cout << msg << std::endl;
}
} gLogger;
// Create the engine by parsing the ONNX model.
ICudaEngine* createEngine(unsigned int maxBatchSize, IBuilder* builder, IBuilderConfig* config,string onnx_name)
{
int dir_l = 0;
int dir_r = onnx_name.rfind(".");
string enginePath;
onnx_name = onnx_name.substr(dir_l, dir_r)+".onnx";
std::cout << "onnx_name:" << onnx_name << std::endl;
const char* onnx_path = onnx_name.c_str();
INetworkDefinition* network = builder->createNetworkV2(1U); // Important: 1U sets the explicit-batch flag; passing 0U causes problems here
IParser* parser = createParser(*network, gLogger);
parser->parseFromFile(onnx_path, static_cast<int32_t>(ILogger::Severity::kWARNING));
for (int32_t i = 0; i < parser->getNbErrors(); ++i) { std::cout << parser->getError(i)->desc() << std::endl; }
std::cout << "successfully load the onnx model" << std::endl;
// Build engine
builder->setMaxBatchSize(maxBatchSize);
config->setMaxWorkspaceSize(1 << 20);
config->setFlag(nvinfer1::BuilderFlag::kFP16); // enable FP16 precision
//config->setFlag(nvinfer1::BuilderFlag::kINT8);
ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
std::cout << "successfully create engine " << std::endl;
// Destroy intermediate objects
network->destroy();
parser->destroy();
return engine;
}
void APIToModel(unsigned int maxBatchSize, IHostMemory** modelStream, string trt_name)
{
// Create builder
IBuilder* builder = createInferBuilder(gLogger);
IBuilderConfig* config = builder->createBuilderConfig();
std::cout << "trt_name:" << trt_name << std::endl;
// Create model to populate the network, then set the outputs and create an engine
ICudaEngine* engine = createEngine(maxBatchSize, builder, config, trt_name);
assert(engine != nullptr);
// Serialize the engine
(*modelStream) = engine->serialize();
// Close everything down
engine->destroy();
builder->destroy();
config->destroy();
}
void do_Initial(int batchSize)
{
//void* buffers[2];
//buffers[0] = global_buffers[0];
//buffers[1] = global_buffers[1];
// Pointers to input and output device buffers to pass to engine.
// Engine requires exactly IEngine::getNbBindings() number of buffers.
// float* m_bindings[2];
// In order to bind the buffers, we need to know the names of the input and output tensors.
// Note that indices are guaranteed to be less than IEngine::getNbBindings()
const int inputIndex = 0; //inputIndex = 0
const int outputIndex = 1;//outputIndex = 1
// Create GPU buffers on device
CHECK(cudaMalloc(&global_buffers[inputIndex], batchSize * 3 * INPUT_H * INPUT_W * sizeof(float)));
CHECK(cudaMalloc(&global_buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float)));
}
void do_Inference(IExecutionContext *context_,float* input, float* output, int batchSize,cudaStream_t &stream)
{
const int inputIndex = 0; //inputIndex = 0
const int outputIndex = 1;//outputIndex = 1
//void* buffers[2];
//buffers[0] = global_buffers[0];
//buffers[1] = global_buffers[1];
// GPU buffers were already created once in do_Initial
//CHECK(cudaMalloc(&buffers[inputIndex], batchSize * 3 * INPUT_H * INPUT_W * sizeof(float)));
//CHECK(cudaMalloc(&buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float)));
// Create stream (note: a new stream is created on every call, but only the last one is destroyed in do_uninitial)
CHECK(cudaStreamCreate(&stream));
// DMA input batch data to device, infer on the batch asynchronously, and DMA output back to host
CHECK(cudaMemcpyAsync(global_buffers[inputIndex], input, batchSize * 3 * INPUT_H * INPUT_W * sizeof(float), cudaMemcpyHostToDevice, stream));
context_->enqueueV2(global_buffers, stream, nullptr);//Changed by xfx20241202
CHECK(cudaMemcpyAsync(output, global_buffers[outputIndex], batchSize * OUTPUT_SIZE * sizeof(float), cudaMemcpyDeviceToHost, stream));
cudaStreamSynchronize(stream);
}
void do_uninitial(cudaStream_t &stream, void* buffers[2])
{
cudaStreamDestroy(stream);
CHECK(cudaFree(buffers[0]));
CHECK(cudaFree(buffers[1]));
}
// Pack the images into a single flat batched buffer in the NCHW layout TensorRT expects
void ProcessImage(vector<cv::Mat> images, float input_data[],const int batch_tem) {
// The result is a flat array of shape [batch * 3 * INPUT_H * INPUT_W]
// The code below takes a bit of a shortcut
std::vector<cv::Mat> InputImage;
if(images.size() != batch_tem)
{
std::cout << "image batch is unequal to batch_tem" << std::endl;
exit(-1);
}
for (int i = 0; i < batch_tem; ++i)
{
cv::resize(images[i], images[i], cv::Size(INPUT_W, INPUT_H), 0, 0, cv::INTER_LINEAR);
InputImage.push_back(images[i]);
}
int ImgCount = InputImage.size();
//std::cout <<"ImgCount:" << ImgCount << std::endl;
//float input_data[BatchSize * 3 * INPUT_H * INPUT_W];
for (int b = 0; b < ImgCount; b++) {
cv::Mat img = InputImage.at(b);
int w = img.cols;
int h = img.rows;
int i = 0;
for (int row = 0; row < h; ++row) { // after the resize above, h == INPUT_H and w == INPUT_W
uchar* uc_pixel = img.data + row * img.step;
for (int col = 0; col < INPUT_W; ++col) {
input_data[b * 3 * INPUT_H * INPUT_W + i] = (float)uc_pixel[2] / 255.0;
input_data[b * 3 * INPUT_H * INPUT_W + i + INPUT_H * INPUT_W] = (float)uc_pixel[1] / 255.0;
input_data[b * 3 * INPUT_H * INPUT_W + i + 2 * INPUT_H * INPUT_W] = (float)uc_pixel[0] / 255.0;
uc_pixel += 3;
++i;
}
}
}
}
int get_trtengine(string trt_name) {
int dir_l = 0;
int dir_r = trt_name.rfind(".");
trt_name = trt_name.substr(dir_l, dir_r) + ".engine";
IHostMemory* modelStream{ nullptr };
APIToModel(100, &modelStream, trt_name);
assert(modelStream != nullptr);
std::ofstream p(trt_name, std::ios::binary);
if (!p)
{
std::cerr << "could not open plan output file" << std::endl;
return -1;
}
p.write(reinterpret_cast<const char*>(modelStream->data()), modelStream->size());
modelStream->destroy();
return 0;
}
int infer(string trt_name, const int batch_tem_, int loop_time_) {
int batch_tem = batch_tem_;
int loop_time = loop_time_;
int dir_l = 0;
int dir_r = trt_name.rfind(".");
trt_name = trt_name.substr(dir_l, dir_r) + ".engine";
// Load the serialized engine
char* trtModelStream{ nullptr };
size_t size{ 0 };
std::ifstream file(trt_name, std::ios::binary);
if (file.good()) {
file.seekg(0, file.end);
size = file.tellg();
file.seekg(0, file.beg);
trtModelStream = new char[size];
assert(trtModelStream);
file.read(trtModelStream, size);
file.close();
}
else {
std::cerr << "could not open engine file: " << trt_name << std::endl;
return -1;
}
// Deserialize into an engine and create the execution context
IRuntime* runtime = createInferRuntime(gLogger);
assert(runtime != nullptr);
ICudaEngine* engine = runtime->deserializeCudaEngine(trtModelStream, size, nullptr);
assert(engine != nullptr);
IExecutionContext* context = engine->createExecutionContext();
assert(context != nullptr);
delete[] trtModelStream;
//********************* Inference *********************//
// Inference loop
float time_read_img = 0.0;
float time_infer = 0.0;
float *prob = new float[batch_tem*OUTPUT_SIZE];
float *data = new float[batch_tem * 3 * INPUT_H * INPUT_W];
do_Initial(batch_tem);
cudaStream_t stream;
for (int loop = 0; loop < loop_time; loop++)
{
// Preprocess the images into the fixed-size input buffer
auto start = std::chrono::system_clock::now(); // start timing image reading/preprocessing
std::string path2 = "./data/cat.png";
vector<cv::Mat> images;
cv::Mat img2 = cv::imread(path2);
// Duplicate the same image to fill the batch
for (int i = static_cast<int>(images.size()); i < batch_tem; ++i)
{
images.push_back(img2);
}
ProcessImage(images, data, batch_tem);
auto end = std::chrono::system_clock::now();
time_read_img = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() + time_read_img;
start = std::chrono::system_clock::now(); // restart timing for inference
for (int i = 0; i < 1; ++i)
{
do_Inference(context, data, prob, batch_tem, stream);
}
end = std::chrono::system_clock::now();
time_infer = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() + time_infer;
//std::cout << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "ms" << std::endl;
// Post-process the output
ImageNetLabels labels;
for (int batch = 0; batch < batch_tem; ++batch)
{
float cls_float = prob[batch * OUTPUT_SIZE]; // initialize argmax from this batch's first logit
int cls_id = batch * OUTPUT_SIZE;
for (int i = (0+batch)* OUTPUT_SIZE; i < (1+batch)*OUTPUT_SIZE; i++)
{
if (cls_float < prob[i])
{
cls_float = prob[i];
cls_id = i;
}
printf("ID %d %f: %s\n", i% OUTPUT_SIZE, prob[i],labels.imagenet_labelstring(i%1000).c_str());
}
//printf("LOOP_time:%d batch: %d result %d \n", loop,batch, cls_id % 100);
printf("Batch:%d ClassId:%d Class name:%s \n", batch, cls_id%OUTPUT_SIZE, labels.imagenet_labelstring(cls_id % 1000).c_str());
}
}
do_uninitial(stream, global_buffers);
std::cout << "C++ engine" << "mean read img time = " << time_read_img / loop_time << "ms\t" << "mean infer img time =" << time_infer / loop_time << "ms" << std::endl;
// Free host buffers and destroy the engine
delete[] prob;
delete[] data;
context->destroy();
engine->destroy();
runtime->destroy();
return 0;
}
int main(int argc, char** argv)
{
bool didInitPlugins = initLibNvInferPlugins(nullptr, "");
string init_config_path = "./config/config.yaml";
InitParameter m_init_para=yaml_read(init_config_path);
std::cout <<"batch size:" << m_init_para.batch_size << std::endl;
std::cout << "loop time:" << m_init_para.batch_size << std::endl;
std::cout << "deylay time:" << m_init_para.delay_time << std::endl;
std::cout << "model path:" << m_init_para.model_path << std::endl;
std::cout << "mode:" << m_init_para.mode<<std::endl;
// string mode = argv[1];
string mode = m_init_para.mode; // for Windows builds, the mode is fixed in the config file instead of argv
//if (std::string(argv[1]) == "-s") {
if (mode == "-s") {
std::cout << "m_init_para.model_path:" << m_init_para.model_path << std::endl;
get_trtengine(m_init_para.model_path);
}
//else if (std::string(argv[1]) == "-d") {
else if (mode == "-d") {
infer(m_init_para.model_path, m_init_para.batch_size, m_init_para.loop_time);
}
else {
return -1;
}
return 0;
}
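For reference, the config.yaml consumed by yaml_read might look like the sketch below. read_config.hpp is not shown in this post, so the key names and the example model path are assumptions:

```yaml
# Hypothetical config.yaml; key names must match what read_config.hpp actually parses.
mode: "-s"                          # "-s": build and serialize the engine; "-d": run inference
model_path: "./model/resnet.onnx"   # placeholder path
batch_size: 1
loop_time: 10
delay_time: 0
```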
2 Precision Comparison
| Image | .pt | .onnx | .trt |
|---|---|---|---|
| cat | [1.0000e+00, 1.4013e-45] | [1.0000e+00, 1.4013e-45] | [0.999920, 0.000080] |
3 Conclusion
As the table shows, the ONNX model suffers almost no precision loss, while the .trt engine shows a noticeably larger deviation. This is expected here, since the engine was built with the kFP16 flag and therefore runs in half precision.
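To verify that FP16 is the cause, the engine can be rebuilt in full FP32 by leaving out the kFP16 flag in createEngine above. A minimal sketch of the relevant lines (everything else in createEngine stays the same):

```cpp
// FP32 build: identical to the build code above, minus the kFP16 flag.
builder->setMaxBatchSize(maxBatchSize);
config->setMaxWorkspaceSize(1 << 20);
// config->setFlag(nvinfer1::BuilderFlag::kFP16); // left disabled for a full-precision build
ICudaEngine* engine = builder->buildEngineWithConfig(*network, *config);
```

If the FP32 engine's outputs then match the ONNX results closely, the gap in the table is attributable to half precision rather than to the conversion itself.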