本文对该博客的推理部分进行了修改:https://blog.csdn.net/hello_dear_you/article/details/109744627
我的输入是(1,3,720,1280),输出是(1,2,720,1280)。源码来自 GitHub - milesial/Pytorch-UNet(PyTorch implementation of the U-Net for image semantic segmentation with high quality images),分类数量改为2,训练部分参考源码。源码中可能由于 PyTorch 版本不一样需要改一下,具体改法可以百度到。
重要推理部分如下
import tensorrt as trt
import numpy as np
import pycuda.driver as cuda
import pycuda.autoinit
import cv2
from PIL import Image
def preprocess(mask_values, pil_img, scale, is_mask):
    """Resize a PIL image and convert it to a numpy array.

    For masks (is_mask=True) every pixel value listed in ``mask_values`` is
    mapped to its class index and an int64 (H, W) label map is returned.
    For images the array is transposed to CHW and scaled to [0, 1] when it
    still holds raw 8-bit values.
    """
    width, height = pil_img.size
    new_w, new_h = int(scale * width), int(scale * height)
    assert new_w > 0 and new_h > 0, 'Scale is too small, resized images would have no pixel'

    # Nearest-neighbour keeps mask labels intact; bicubic is used for images.
    resample = Image.NEAREST if is_mask else Image.BICUBIC
    arr = np.asarray(pil_img.resize((new_w, new_h), resample=resample))

    if is_mask:
        mask = np.zeros((new_h, new_w), dtype=np.int64)
        for class_idx, value in enumerate(mask_values):
            if arr.ndim == 2:
                mask[arr == value] = class_idx
            else:
                # Colour masks: a pixel matches only if all channels match.
                mask[(arr == value).all(-1)] = class_idx
        return mask

    # HWC -> CHW (grayscale images get a singleton channel axis).
    chw = arr[np.newaxis, ...] if arr.ndim == 2 else arr.transpose((2, 0, 1))
    # Only normalise when the data still looks like raw 8-bit values.
    if (chw > 1).any():
        chw = chw / 255.0
    return chw
# Deserialize the TensorRT engine from disk.
engine_file_path = 'unet_engine_fp16.plan'
with open(engine_file_path, 'rb') as f, trt.Runtime(trt.Logger(trt.Logger.INFO)) as runtime:
    engine = runtime.deserialize_cuda_engine(f.read())
assert engine is not None, 'Failed to deserialize TensorRT engine: ' + engine_file_path

# Create the execution context.
context = engine.create_execution_context()

# Binding indices for the named input/output tensors.
input_index = engine.get_binding_index('input')
output_index = engine.get_binding_index('output')

# Load and preprocess the image to a contiguous (1, 3, H, W) float32 array in [0, 1].
img = Image.open("./1_enhance00096.jpg")
img = preprocess(None, img, 1, is_mask=False)
img = np.expand_dims(img, axis=0)
input_data = np.ascontiguousarray(img.astype(np.float32))

# Host-side output buffer shaped like the engine's output binding, e.g. (1, 2, 720, 1280).
output_data = np.empty(engine.get_binding_shape(output_index), dtype=np.float32)

# Allocate device memory for input and output.
d_input = cuda.mem_alloc(input_data.nbytes)
d_output = cuda.mem_alloc(output_data.nbytes)

# Copy input to the GPU, run inference, copy the result back (all blocking calls).
cuda.memcpy_htod(d_input, input_data)
# execute_v2 is the explicit-batch API; the implicit-batch execute(batch, ...) is deprecated.
context.execute_v2([int(d_input), int(d_output)])
cuda.memcpy_dtoh(output_data, d_output)

print(output_data)
print(output_data.shape)
print(type(output_data))

# Per-pixel argmax over the class channel -> (1, H, W) label map.
predicted_class = np.argmax(output_data, axis=1)

import matplotlib.pyplot as plt

# Visualize the predicted class map.
plt.imshow(predicted_class[0], cmap='viridis')
plt.colorbar()
plt.show()
print(img.shape)
#include "NvInfer.h"
#include "cuda_runtime_api.h"
#include <fstream>
#include <iostream>
#include <map>
#include <sstream>
#include <vector>
#include <chrono>
#include "NvInferPlugin.h"
#include "opencv2/opencv.hpp"
#include <NvInfer.h>
#include <NvOnnxParser.h>
using namespace nvinfer1;
#include <time.h>
// Path to the serialized TensorRT engine file.
const std::string enginePath = "C:\\mao\\code\\tensorrt\\Segmentation\\UNET_INFERENCE\\unet_engine_fp16.plan";

// Minimal ILogger implementation that routes TensorRT messages to the
// standard streams: errors and warnings go to stderr, info goes to stdout,
// and every other severity (e.g. verbose) is dropped.
class Logger : public nvinfer1::ILogger
{
public:
    void log(Severity severity, const char* msg) noexcept override
    {
        if (severity == Severity::kINTERNAL_ERROR)
            std::cerr << "INTERNAL_ERROR: " << msg << std::endl;
        else if (severity == Severity::kERROR)
            std::cerr << "ERROR: " << msg << std::endl;
        else if (severity == Severity::kWARNING)
            std::cerr << "WARNING: " << msg << std::endl;
        else if (severity == Severity::kINFO)
            std::cout << "INFO: " << msg << std::endl;
        // Remaining severities are intentionally ignored.
    }
};

static Logger gLogger;
int main(int argc, char** argv)
{
initLibNvInferPlugins(&gLogger, "");
// 创建TensorRT的运行时对象
IRuntime* runtime = createInferRuntime(gLogger);
// 从文件中反序列化Engine对象
std::ifstream engineFile(enginePath, std::ios::binary);
if (!engineFile)
{
std::cerr << "无法打开Engine文件进行读取。" << std::endl;
return 1;
}
engineFile.seekg(0, std::ios::end);
const size_t fileSize = engineFile.tellg();
engineFile.seekg(0, std::ios::beg);
std::vector<char> engineData(fileSize);
engineFile.read(engineData.data(), fileSize);
engineFile.close();
// 反序列化Engine对象
IPluginFactory* pluginFactory = nullptr; // 如果有自定义插件,可以传递一个插件工厂对象
ICudaEngine* engine = runtime->deserializeCudaEngine(engineData.data(), fileSize, pluginFactory);
if (!engine)
{
std::cerr << "无法反序列化Engine对象。" << std::endl;
return 1;
}
// 创建TensorRT的执行上下文对象
IExecutionContext* context = engine->createExecutionContext();
// 分配GPU内存
// 定义输入和输出的维度
const int batchSize = 1;
const int inputC = 3;
const int inputH = 720;
const int inputW = 1280;
const int outputC = 2;
const int outputH = 720;
const int outputW = 1280;
void* buffers[2];
cudaMalloc(&buffers[0], batchSize * inputC * inputH * inputW * sizeof(float)); // 分配输入内存
cudaMalloc(&buffers[1], batchSize * outputC * outputH * outputW * sizeof(float)); // 分配输出内存
// 创建CUDA流
cudaStream_t stream;
cudaStreamCreate(&stream);
// 读取图片作为输入
cv::Mat image = cv::imread("C:\\mao\\code\\tensorrt\\Segmentation\\UNET_INFERENCE\\1_enhance00096.jpg");
cv::cvtColor(image, image, cv::COLOR_BGR2RGB); // 将通道顺序从BGR转换为RGB
//cv::transpose(image, image); // 转置图像矩阵
//cv::flip(image, image, 0);
cv::resize(image, image, cv::Size(1280, 720));
//image.convertTo(image, CV_32F, 1.0 / 255.0);
float* inputData = new float[batchSize * inputC * inputH * inputW];
// 将inputBlob的数据复制到inputData中
for (int i = 0; i < inputH * inputW; i++) {
inputData[i] = image.at<cv::Vec3b>(i / inputW, i % inputW)[0] / 255.0; // R通道
inputData[i + inputH * inputW] = image.at<cv::Vec3b>(i / inputW, i % inputW)[1] / 255.0; // G通道
inputData[i + 2 * inputH * inputW] = image.at<cv::Vec3b>(i / inputW, i % inputW)[2] / 255.0; // B通道
}
// 将inputBlob的数据复制到inputData中
cudaMemcpyAsync(buffers[0], inputData, batchSize * inputC * inputH * inputW * sizeof(float), cudaMemcpyHostToDevice, stream);
context->enqueue(batchSize, buffers, stream, nullptr);
// 将输出从GPU复制到主机内存
float* outputData = new float[batchSize * outputC * outputH * outputW];
cudaMemcpyAsync(outputData, buffers[1], batchSize * outputC * outputH * outputW * sizeof(float), cudaMemcpyDeviceToHost, stream);
cudaStreamSynchronize(stream);
// 创建一个Mat对象来存储结果
cv::Mat resultMat(outputH, outputW, CV_8UC1);
// 遍历输出数据
for (int i = 0; i < outputH * outputW; i++) {
// 将输出数据转换为对应的类别标签
int class_idx = (outputData[i] > outputData[i + outputH * outputW]) ? 0 : 1;
resultMat.at<uchar>(i / outputW, i % outputW) = (class_idx == 0) ? 0 : 255;
}
// 显示结果
cv::imshow("Semantic Segmentation Result", resultMat);
cv::waitKey(0);
// 释放资源
delete[] inputData;
delete[] outputData;
// 释放资源
context->destroy();
engine->destroy();
runtime->destroy();
return 0;
}
上面是C++的代码,没有那么多的异常判断,但是流程比较简洁。