1 Introduction
(1) Overview
This article uses tkDNN, which is recommended by the YOLOv4 author AlexeyAB and supports accelerated builds of (tiny) YOLOv1 through YOLOv4. tkDNN-TensorRT can speed up YOLOv4 by roughly 2x (batch=1) and 3-4x (batch=4). Here a tiny YOLOv4 model is tested on a Jetson TX2; at batch=1 the speedup is roughly 50%.
(2) Requirements
- CUDA 10.0
- CUDNN 7.603
- TENSORRT 6.01 (TENSORRT 5.6.1 also tested and works)
- OPENCV 3.4
- yaml-cpp 0.5.2 (must be installed)
Other component versions are fine as long as the build succeeds. yaml-cpp does need to be installed, as follows:
git clone https://github.com/jbeder/yaml-cpp.git
cd yaml-cpp
mkdir build
cd build
cmake -DBUILD_SHARED_LIBS=ON ..
make
sudo make install
(3) Download links
tkDNN Github:https://github.com/ceccocats/tkDNN
https://pan.baidu.com/s/1GPGLRYgbv4YjvKGwwoMFFA
(extraction code: xqrb)
darknet weight export fork: https://git.hipert.unimore.it/fgatti/darknet
https://pan.baidu.com/s/1K02Cdr-yx050oVtoUBMP-g
(extraction code: kb31)
C++ demo and Python demo project files:
https://pan.baidu.com/s/1gzAhavHuvunMEIBh4WhdZg
(extraction code: fvl7)
2 How to Use
(1) Model preparation
A trained YOLO model (yolo4tiny.weights), its configuration file (yolo4tiny.cfg), and a class-name file voc.names (or coco.names) are required.
(2) Building tkDNN
git clone https://github.com/ceccocats/tkDNN
cd tkDNN
mkdir build
cd build
cmake ..
make
(3) Parsing the darknet weight file
Before building, set GPU=0 in the darknet Makefile:
git clone https://git.hipert.unimore.it/fgatti/darknet.git
cd darknet
make
mkdir layers debug
./darknet export <path-to-cfg-file> <path-to-weights> layers
The parsed output has the following layout:
model
|---- layers/ (per-layer weight parameters)
|----------- *.bin
|---- debug/ (per-layer output values)
|----------- *_out.bin
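For the tiny YOLOv4 cfg used here, debug/ ends up containing, among others, the outputs of the two YOLO heads (layer30_out.bin and layer37_out.bin); these same files are referenced as output_bins in the C++ wrapper in section 3.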
The example for accelerating tiny YOLOv4 is in tkDNN-master/tests/darknet/testyolo4tiny.cpp:
//downloadWeightsifDoNotExist(input_bins[0], bin_path, "https://cloud.hipert.unimore.it/s/iRnc4pSqmx78gJs/download");
// parse darknet network
tk::dnn::Network *net = tk::dnn::darknetParser(cfg_path, wgs_path, name_path);
net->print();
Comment out the downloadWeightsifDoNotExist call and replace cfg_path, wgs_path and name_path with your own paths (cfg_path: yolo4tiny.cfg, wgs_path: model/layers, name_path: voc.names), as in the sketch below.
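For reference, a minimal sketch of the edited section, assuming the parsed files were placed under a model/ directory (the paths are placeholders and must match your own layout):

// hypothetical paths: adjust to where your parsed model actually lives
std::string bin_path  = "model";
std::string cfg_path  = bin_path + "/yolo4tiny.cfg";   // darknet config
std::string wgs_path  = bin_path + "/layers";          // exported per-layer weights
std::string name_path = bin_path + "/voc.names";       // class names

// parse darknet network (unchanged from the original test)
tk::dnn::Network *net = tk::dnn::darknetParser(cfg_path, wgs_path, name_path);
net->print();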
(4) Creating the .rt engine file
On the first run, check whether the TensorRT engine file (.rt) has already been generated; if not, it is built from the parsed network. (tkDNN picks the engine precision from the TKDNN_MODE environment variable, e.g. export TKDNN_MODE=FP16, which is how an FP16 engine such as the yolo4tiny_fp16.rt used in section 3 is produced.)
//convert network to tensorRT
tk::dnn::NetworkRT *netRT = new tk::dnn::NetworkRT(net, net->getNetworkRTName(bin_path.c_str()));
int ret = testInference(input_bins, output_bins, net, netRT);
(5) Inference test
A concrete call sequence can be seen in tkDNN-master/demo/demo/demo.cpp:
cv::Mat frame = cv::imread(f.iFilename.c_str(), cv::IMREAD_COLOR);
std::vector<cv::Mat> batch_frames;
batch_frames.push_back(frame);
int height = frame.rows;
int width = frame.cols;
if(!frame.data)
    break;
std::vector<cv::Mat> batch_dnn_input;
batch_dnn_input.push_back(frame.clone());

//inference
detected_bbox.clear();
detNN->update(batch_dnn_input, 1, write_res_on_file, &times, write_coco_json);
detNN->draw(batch_frames);
detected_bbox = detNN->detected;

// save detections labels
for(auto d : detected_bbox){
    //convert detected bb in the same format as label
    //<x_center>/<image_width> <y_center>/<image_height> <width>/<image_width> <height>/<image_height>
    tk::dnn::BoundingBox b;
    b.x = (d.x + d.w/2) / width;
    b.y = (d.y + d.h/2) / height;
    b.w = d.w / width;
    b.h = d.h / height;
    b.prob = d.prob;
    b.cl = d.cl;
    f.det.push_back(b);
}
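As a quick check of the conversion: on a 640x480 image, a detection with d.x = 100, d.y = 50, d.w = 200 and d.h = 100 becomes b.x = (100 + 100)/640 = 0.3125, b.y = (50 + 50)/480 ≈ 0.208, b.w = 200/640 = 0.3125 and b.h = 100/480 ≈ 0.208.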
(6) Notes
(a) After installing yaml-cpp, make sure its install path is visible through the environment variables (e.g. add the library directory, typically /usr/local/lib, to LD_LIBRARY_PATH), otherwise the tkDNN build will fail.
(b) Before parsing the weight file, set GPU=0 in the darknet Makefile, then build and run the export to generate the layers/ folder.
3 C++ Wrapper
Below is a simple wrapper routine that takes a cv::Mat image as input and returns bounding-box results.
/// trt_v4tiny.h
#ifndef _TRT_V4TINY_H_
#define _TRT_V4TINY_H_

typedef struct objBoxInfo
{
    int label;
    int xmin;
    int xmax;
    int ymin;
    int ymax;
    float prob;
} objBoxInfo;

extern "C"
{
    int trtInit();
    int picRecog(unsigned char *data, int width, int height, int *num, objBoxInfo *res);
    int trtUnInit();
}

#endif
/// trt_v4tiny.cpp
#include <iostream>
#include <vector>
#include <stdlib.h>
#include <unistd.h>
#include <time.h>
#include "Yolo3Detection.h"
#include "DarknetParser.h"
#include "tkdnn.h"
#include "test.h"
#include "BoundingBox.h"
#include "trt_v4tiny.h"
int n_classes = 9;
int n_batch = 1;
std::string path_net = "/home/nvidia/tkDNN-master/model/yolo4tiny_fp16.rt";
std::string bin_path = "/home/nvidia/tkDNN-master/model";
std::vector<std::string> input_bins = {
bin_path + "/layers/input.bin"
};
std::vector<std::string> output_bins = {
bin_path + "/debug/layer30_out.bin",
bin_path + "/debug/layer37_out.bin"
};
std::string wgs_path = bin_path + "/layers";
std::string cfg_path = bin_path + "/yolo4tiny.cfg";
std::string name_path = bin_path + "/voc.names";
tk::dnn::DetectionNN *detNN;
tk::dnn::Yolo3Detection yolo;
int check_rt(std::string path_net);
int main(int argc, char* argv[]){
    // init
    trtInit();
    // check for rt
    check_rt(path_net);

    std::string path_file = "/home/nvidia/tkDNN-master/demo/0.jpg";
    cv::Mat img = cv::imread(path_file.c_str(), cv::IMREAD_COLOR);
    if(!img.data){
        std::cerr << "read image failed!" << std::endl;
        return -1;
    }

    int num = 50;          // capacity of the result buffer on input, detection count on output
    objBoxInfo res[50];
    for(int i = 0; i < 100; i++){
        time_t t0 = clock();
        picRecog(img.data, img.cols, img.rows, &num, res);
        time_t t1 = clock();
        std::cout << "[" << i << "]: time spent " << (double)(t1 - t0) / CLOCKS_PER_SEC << std::endl;
    }
    // cv::rectangle(img, cv::Point(res[0].xmin, res[0].ymin), cv::Point(res[0].xmax, res[0].ymax), cv::Scalar(255, 0, 0), 2);
    // cv::imwrite("dst.jpg", img);
    trtUnInit();
    return 0;
}
int check_rt(std::string path_net){
    if(!fileExist(path_net.c_str())){
        std::cout << "tensorrt file not exists: " << path_net << std::endl;
        // downloadWeightsifDoNotExist(input_bins[0], bin_path, "https://cloud.hipert.unimore.it/s/iRnc4pSqmx78gJs/download");

        // parse darknet network
        tk::dnn::Network *net = tk::dnn::darknetParser(cfg_path, wgs_path, name_path);
        net->print();

        // convert network to tensorRT
        tk::dnn::NetworkRT *netRT = new tk::dnn::NetworkRT(net, path_net.c_str());
        int ret = testInference(input_bins, output_bins, net, netRT);
        net->releaseLayers();
        delete net;
        delete netRT;

        if(!fileExist(path_net.c_str())){
            std::cerr << "TensorRT engine build failed!" << std::endl;
            return -1;
        }else{
            std::cout << "TensorRT engine built successfully!" << std::endl;
        }
    }else{
        std::cout << "tensorrt file exists: " << path_net << std::endl;
    }
    return 0;
}
int trtInit(){
    detNN = &yolo;
    detNN->init(path_net, n_classes, n_batch);
    std::cout << "init trt!" << std::endl;
    return 0;
}

int trtUnInit(){
    std::cout << "uninit trt!" << std::endl;
    return 0;
}
int picRecog(unsigned char *data, int width, int height, int *num, objBoxInfo *res){
    cv::Mat img(height, width, CV_8UC3, (void*)data);
    std::vector<cv::Mat> batch_dnn_input;
    std::vector<tk::dnn::box> detected_bbox;

    // inference
    detected_bbox.clear();
    batch_dnn_input.push_back(img);
    detNN->update(batch_dnn_input, n_batch);
    // detNN->draw(batch_frames);
    detected_bbox = detNN->detected;

    // copy detections into the caller's buffer (*num holds its capacity on input)
    int cnt = 0;
    for(auto d : detected_bbox){
        if(cnt >= *num)
            break;
        res[cnt].label = d.cl;
        res[cnt].prob = d.prob;
        res[cnt].xmin = d.x;
        res[cnt].ymin = d.y;
        res[cnt].xmax = d.x + d.w;
        res[cnt].ymax = d.y + d.h;
        cnt++;
    }
    *num = cnt;
    return 0;
}
4 Python Wrapper
Compile the C++ wrapper above into a shared library and call it from Python through ctypes to get the same speedup. The script below assumes the library was built as /home/nvidia/tkDNN-master/libtrt.so; the ready-made build files are included in the project archive linked in section 1.
#!/usr/bin/python
# -*- coding:utf-8 -*-
from __future__ import print_function
import os, time
import cv2
import numpy as np
from ctypes import *
class resultTrtBox(Structure):
    _fields_ = [
        ("label", c_int),
        ("xmin", c_int),
        ("xmax", c_int),
        ("ymin", c_int),
        ("ymax", c_int),
        ("prob", c_float)]

class resultTrtBoxArray(Array):
    _type_ = resultTrtBox
    _length_ = 50
cudaVer = os.popen("nvcc -V | grep release | awk '{print $5}' | awk -F ',' '{print $1}'")
cudaVer = cudaVer.read().split("\n")[0]
opencvVer = os.popen("pkg-config opencv --modversion")
opencvVer = opencvVer.read().split("\n")[0]
libKeyVIRutiliy = cdll.LoadLibrary("/home/nvidia/tkDNN-master/libtrt.so")
trtCarInit = libKeyVIRutiliy.trtInit
trtCarUninit = libKeyVIRutiliy.trtUnInit
trtCarRecogPic = libKeyVIRutiliy.picRecog
def testPic(path):
    trtCarInit()
    img = cv2.imdecode(np.fromfile(path, dtype=np.uint8), cv2.IMREAD_COLOR)
    data_ctypes_ptr = cast(img.ctypes.data, POINTER(c_char))
    res = resultTrtBoxArray()
    for i in range(1):
        recogNum = c_int(resultTrtBoxArray._length_)
        t = time.time()
        trtCarRecogPic(data_ctypes_ptr, img.shape[1], img.shape[0], pointer(recogNum), pointer(res))
        # print(i, "%.3f" % (time.time() - t), recogNum.value)
        for j in range(recogNum.value):
            print(res[j].label, res[j].prob, res[j].xmin, res[j].xmax, res[j].ymin, res[j].ymax)
    trtCarUninit()
if __name__ == '__main__':
    print("pid:", os.getpid())
    path = '/home/nvidia/tkDNN-master/demo/0.jpg'
    testPic(path)
The corresponding project files have been uploaded to the Baidu Pan links above; feel free to download them for testing, and reach out with any questions.