Bootstrap

tensorRT加速(tiny)YOLOv3、(tiny)YOLOv4(实测有效)

1 简介

(1) 简介

本文采用tkDNN的方法,根据YOLOv4 作者AlexeyAB推荐,可以支持(tiny) YOLO v1~YOLO v4的加速编译。其中,tkDNN-TensorRT 可以加速YOLOv4到 2倍(batch=1),3-4 倍(batch=4)。本文以tiny YOLOv4模型在TX2上测试,batch=1时大概提速50%左右。

(2) 环境需求

  • CUDA 10.0
  • CUDNN 7.6.3
  • TENSORRT 6.0.1(TENSORRT 5.6.1实测也可以)
  • OPENCV 3.4
  • yaml-cpp 0.5.2 (需要安装)
    其余依赖只要版本能编译通过即可;其中yaml-cpp需要手动安装,安装方式如下:
# Fetch yaml-cpp and build it as a shared library.
git clone https://github.com/jbeder/yaml-cpp.git
# Enter the cloned repository first; otherwise build/ is created next to it
# and `cmake ..` would not point at the yaml-cpp sources.
cd yaml-cpp
mkdir build
cd build
cmake -DBUILD_SHARED_LIBS=ON ..
make
# Install headers and the shared library system-wide.
sudo make install

(3) 下载链接

tkDNN Github:https://github.com/ceccocats/tkDNN
https://pan.baidu.com/s/1GPGLRYgbv4YjvKGwwoMFFA
(提取码:xqrb)

darknet权重解析:https://git.hipert.unimore.it/fgatti/darknet
https://pan.baidu.com/s/1K02Cdr-yx050oVtoUBMP-g
(提取码:kb31)

C++Demo和Python Demo工程文件:
https://pan.baidu.com/s/1gzAhavHuvunMEIBh4WhdZg
(提取码:fvl7)

2 如何使用

(1) 模型准备

训练好的yolo模型(yolo4tiny.weights)、配置文件(yolo4tiny.cfg)和分类名文件voc.names(或者coco.names)。

(2) tkDNN编译

# Fetch tkDNN and do an out-of-source CMake build in tkDNN/build.
git clone https://github.com/ceccocats/tkDNN
mkdir tkDNN/build
cd tkDNN/build
cmake ..
make

(2) darknet权重文件解析

编译时,设置Makefile里面的GPU=0

# Fetch the darknet fork that provides the `export` sub-command.
git clone https://git.hipert.unimore.it/fgatti/darknet.git
cd darknet
# Build darknet (set GPU=0 in the Makefile beforehand, as noted above).
make
# Create the two output directories before exporting.
mkdir layers debug
# Replace the two placeholders with the real .cfg and .weights paths;
# the last argument is the output folder for the per-layer weights.
./darknet export <path-to-cfg-file> <path-to-weights> layers

解析出来的格式如下:

    model
        |---- layers/ (包含每层的权重参数)
        |----------- *.bin
        |---- debug/  (包含每层的输出参数)
        |----------- *_out.bin

加速tinyYOLOv4的示例文件在tkDNN-master/tests/darknet/testyolo4tiny.cpp

    // Automatic weight download is disabled so that locally exported files are used instead.
    //downloadWeightsifDoNotExist(input_bins[0], bin_path, "https://cloud.hipert.unimore.it/s/iRnc4pSqmx78gJs/download");

    // Build the tkDNN network from the darknet cfg file, the exported
    // weights folder and the class-names file, then print it.
    tk::dnn::Network *net = tk::dnn::darknetParser(cfg_path, wgs_path, name_path);
    net->print();

注释掉downloadWeightsifDoNotExist部分,将cfg_path、wgs_path和name_path分别换为自己的路径(cfg_path——yolo4tiny.cfg, wgs_path——model/layers, name_path——voc.names)

(3) 创建rt文件

第一次调用的时候,要先检查一下是否生成引擎文件(.rt)

    // Convert the parsed network to a TensorRT network; the second argument is
    // the engine (.rt) file name derived from bin_path.
    tk::dnn::NetworkRT *netRT = new tk::dnn::NetworkRT(net, net->getNetworkRTName(bin_path.c_str()));
    
    // Run the reference inference test on the given input/output dumps
    // (presumably compares both back-ends against the exported outputs -- confirm in test.h).
    int ret = testInference(input_bins, output_bins, net, netRT);

(4) 推理测试

在tkDNN-master/demo/demo/demo.cpp中可以看到具体调用示例:

        // Load the image as 3-channel colour and remember its size for normalisation below.
        cv::Mat frame = cv::imread(f.iFilename.c_str(), cv::IMREAD_COLOR);
        std::vector<cv::Mat> batch_frames;
        batch_frames.push_back(frame);
        int height = frame.rows;
        int width = frame.cols;

        // Stop when the image could not be read.
        if(!frame.data) 
            break;
        // The detector gets its own copy; batch_frames keeps the frame that draw() receives.
        std::vector<cv::Mat> batch_dnn_input;
        batch_dnn_input.push_back(frame.clone());

        // Run inference on a batch of one image and draw the results.
        detected_bbox.clear();
        detNN->update(batch_dnn_input,1,write_res_on_file, &times, write_coco_json);
        detNN->draw(batch_frames);
        detected_bbox = detNN->detected;
        // Store the detections for this image.
        for(auto d:detected_bbox){
            // Convert each detection to the label format (box centre and size, normalised):
            // <x_center>/<image_width> <y_center>/<image_height> <width>/<image_width> <height>/<image_height>
            tk::dnn::BoundingBox b;
            b.x = (d.x + d.w/2) / width;
            b.y = (d.y + d.h/2) / height;
            b.w = d.w / width;
            b.h = d.h / height;
            b.prob = d.prob;
            b.cl = d.cl;
            f.det.push_back(b);         
        }

(5) 注意事项

(a) 安装yaml-cpp后,需要将路径放到环境变量中,不然编译tkDNN 会报错
(b) 解析权重文件之前,需要先将darknet的Makefile里设置为GPU=0并编译,然后再执行export导出layers文件夹

3 C++封装

封装一个简单的例程,可以输入Mat图片,输出bbox结果

// trt_v4tiny.h
//
// C-callable interface of the tiny-YOLOv4 TensorRT wrapper. The functions use
// C linkage so they can be resolved by name from a shared library.
#ifndef TRT_V4TINY_H_
#define TRT_V4TINY_H_

// One detected object. The field order is part of the binary interface:
// any foreign-language mirror of this struct must declare the same fields
// in the same order.
typedef struct objBoxInfo
{
    int      label;  // class index of the detection
    int      xmin;   // box corner coordinates, in pixels of the input image
    int      xmax;
    int      ymin;
    int      ymax;
    float    prob;   // detection score
}objBoxInfo;

// Only C++ understands `extern "C"`; guarding it keeps the header usable
// from plain C translation units as well.
#ifdef __cplusplus
extern "C"
{
#endif
	// Prepares the detector. Returns 0 on success.
	int trtInit();
	// Runs detection on a width x height, 3-channel, 8-bit image at `data`.
	// Detections are written to `res` and their count to `*num`.
	// Returns 0 on success.
	int picRecog(unsigned char*data, int width, int height, int *num, objBoxInfo* res);
	// Counterpart of trtInit(). Returns 0 on success.
	int trtUnInit();
#ifdef __cplusplus
}
#endif
#endif
/// trt_v4tiny.cpp
#include <iostream>
#include <vector>
#include <stdlib.h>
#include <unistd.h>
#include <time.h>

#include "Yolo3Detection.h"
#include "DarknetParser.h"
#include "tkdnn.h"
#include "test.h"
#include "BoundingBox.h"

#include "trt_v4tiny.h"


// Detector configuration handed to DetectionNN::init().
// NOTE(review): n_classes presumably has to match the number of entries in
// the names file below -- confirm for your model.
int n_classes{9};
int n_batch{1};

// TensorRT engine file, and the model folder every other path is derived from.
// bin_path must stay declared before the variables below that are built from it.
std::string path_net{"/home/nvidia/tkDNN-master/model/yolo4tiny_fp16.rt"};
std::string bin_path{"/home/nvidia/tkDNN-master/model"};

// Reference input and per-layer output dumps passed to testInference().
std::vector<std::string> input_bins{
	bin_path + "/layers/input.bin"
};
std::vector<std::string> output_bins{
	bin_path + "/debug/layer30_out.bin",
	bin_path + "/debug/layer37_out.bin"
};

// Inputs of the darknet parser: exported weights folder, cfg file, class names.
std::string wgs_path{bin_path + "/layers"};
std::string cfg_path{bin_path + "/yolo4tiny.cfg"};
std::string name_path{bin_path + "/voc.names"};

// Active detector (set by trtInit()) and the concrete instance it points to.
tk::dnn::DetectionNN *detNN = nullptr;
tk::dnn::Yolo3Detection yolo;

// Ensures the TensorRT engine file at path_net exists, building it if needed.
int check_rt(std::string path_net);

/**
 * Benchmark driver: makes sure the TensorRT engine exists, initialises the
 * detector, then runs picRecog() 100 times on one test image and prints the
 * CPU time reported by clock() for each call.
 *
 * @return 0 on success, -1 if the engine is unavailable, initialisation
 *         fails, or the test image cannot be read.
 */
int main(int argc, char* argv[]){
	// The engine file has to exist before trtInit() loads it, so build or
	// verify it first and stop if that fails.
	if(check_rt(path_net) != 0){
		return -1;
	}

	// init
	if(trtInit() != 0){
		std::cerr<<"init trt failed!"<<std::endl;
		return -1;
	}

	std::string path_file = "/home/nvidia/tkDNN-master/demo/0.jpg";
	cv::Mat img = cv::imread(path_file.c_str(), cv::IMREAD_COLOR);
	if(!img.data){
		std::cerr<<"read image failed!"<<std::endl;
		trtUnInit();
		return -1;
	}
	
	// Fixed-size result buffer (a compile-time constant instead of a
	// non-standard variable-length array).
	constexpr int kMaxBoxes = 50;
	objBoxInfo res[kMaxBoxes];
	for(int i = 0; i < 100; i++){
		// picRecog() overwrites num with the number of detections, so the
		// buffer capacity is restored before every call.
		int num = kMaxBoxes;
		clock_t t0 = clock();
		
		picRecog(img.data, img.cols, img.rows, &num, res);
		
		clock_t t1 = clock();
		std::cout<<"["<< i <<"]:time spend "<<(double)(t1 - t0)/CLOCKS_PER_SEC <<std::endl;
	}
	
	// cv::rectangle(img, cv::Point(res[0].xmin,res[0].ymin), cv::Point(res[0].xmax, res[0].ymax), cv::Scalar(255, 0, 0), 2);
	// cv::imwrite("dst.jpg", img);
	
	trtUnInit();
	
	return 0;
	
}

/**
 * Makes sure the TensorRT engine file exists, building it from the darknet
 * export when it is missing.
 *
 * @param path_net Path of the TensorRT engine (.rt) file.
 * @return 0 if the file exists (already present or freshly built),
 *         -1 if it is still missing after the build attempt.
 */
int check_rt(std::string path_net){
	
	// Nothing to do when the engine has already been generated.
	if(fileExist(path_net.c_str())){
		std::cout<< "tensorrt file exists: "<< path_net<<std::endl;
		return 0;
	}

	std::cout<< "tensorrt file not exists: "<< path_net<<std::endl;		
	// downloadWeightsifDoNotExist(input_bins[0], bin_path, "https://cloud.hipert.unimore.it/s/iRnc4pSqmx78gJs/download");
	// parse darknet network
	tk::dnn::Network *net = tk::dnn::darknetParser(cfg_path, wgs_path, name_path);
	net->print();
	//convert network to tensorRT
	tk::dnn::NetworkRT *netRT = new tk::dnn::NetworkRT(net, path_net.c_str());
	
	// The test result used to be dropped silently. It is now reported, but
	// it does not fail the build: success is still decided by the presence
	// of the engine file below.
	const int ret = testInference(input_bins, output_bins, net, netRT);
	if(ret != 0){
		std::cerr<<"testInference returned "<< ret <<", engine output may differ from the reference dumps"<<std::endl;
	}
	
	net->releaseLayers();
	delete net;
	delete netRT;
	
	if(!fileExist(path_net.c_str()))
	{
		std::cerr<<"TensorRT enigine build Failed!\n"<<std::endl;
		return -1;
	}
	std::cout<<"TensorRT enigine build Sucessfully!\n"<<std::endl;
	return 0;	
}

int trtInit(){
	detNN = &yolo;
	detNN->init(path_net, n_classes, n_batch);
	std::cout<<"init trt!"<<std::endl;
		
	return 0;
}

/**
 * Counterpart of trtInit(). It currently only logs a message; no resources
 * are released here.
 *
 * @return Always 0.
 */
int trtUnInit(){
	std::cout << "uninit trt!" << std::endl;
	return 0;
}

/**
 * Runs detection on one image and copies the resulting boxes to the caller.
 *
 * @param data   Pixel buffer wrapped as a height x width CV_8UC3 matrix
 *               (the buffer is not copied here).
 * @param width  Image width in pixels.
 * @param height Image height in pixels.
 * @param num    In: capacity of `res` in elements (both callers in this
 *               project pass their buffer length). Out: number of boxes
 *               actually written. Reset it before every call.
 * @param res    Output array receiving at most `*num` (input value) boxes.
 * @return 0 on success, -1 on invalid arguments or if trtInit() has not
 *         been called.
 */
int picRecog(unsigned char*data, int width, int height, int *num, objBoxInfo* res){
	
	if(num == nullptr){
		return -1;
	}
	// Remember the caller's buffer size, then report "no results" until
	// detection has actually succeeded.
	const int capacity = *num;
	*num = 0;
	if(data == nullptr || res == nullptr || width <= 0 || height <= 0 || detNN == nullptr){
		return -1;
	}

	cv::Mat img(height, width, CV_8UC3, static_cast<void*>(data));
	
	std::vector<cv::Mat> batch_dnn_input;
	batch_dnn_input.push_back(img);
	
	//inference 
	detNN->update(batch_dnn_input, n_batch);
	// detNN->draw(batch_frames);
		
	// Read the detections in place instead of copying the whole vector.
	const auto &detected_bbox = detNN->detected;

	int cnt = 0;
	for(const auto &d : detected_bbox){
		// Never write past the caller's buffer; surplus detections are dropped.
		if(cnt >= capacity){
			break;
		}
		res[cnt].label = d.cl;
		res[cnt].prob = d.prob;
		res[cnt].xmin = d.x;
		res[cnt].ymin = d.y;
		res[cnt].xmax = d.x + d.w;
		res[cnt].ymax = d.y + d.h;		
		cnt++;
	}	
	*num = cnt;

	return 0;
}

4 python封装

将C代码编译成动态链接库,通过Python调用,实现加速

#!/usr/bin/python
# -*- coding:utf-8 -*-
from __future__ import print_function
import os, datetime, logging, time, sys, glob, requests, getpass, \
    json, socket, threading, re, serial, shutil, pickle, math, \
    base64, subprocess, signal, cv2, numpy, traceback, multiprocessing, hashlib
from multiprocessing import Process, Pool
from ctypes import *
from struct import pack, unpack
import numpy as np


class resultTrtBox(Structure):
    """ctypes mirror of the C struct ``objBoxInfo`` (one detected object)."""
    # Field order and types must stay identical to the C declaration
    # (label, xmin, xmax, ymin, ymax as int; prob as float).
    _fields_ = [
        ("label", c_int),
        ("xmin", c_int),
        ("xmax", c_int),
        ("ymin", c_int),
        ("ymax", c_int),
        ("prob", c_float)]

class resultTrtBoxArray(Array):
    """Fixed-size ctypes array of ``resultTrtBox`` used as the result buffer."""
    _type_ = resultTrtBox
    # Buffer capacity in elements.
    _length_ = 50

# Query the installed CUDA and OpenCV versions through the shell and keep the
# first line of each command's output.
cudaVer = os.popen(
    "nvcc -V | grep release | awk '{print $5}' | awk -F ',' '{print $1}'"
).read().split("\n")[0]
opencvVer = os.popen("pkg-config opencv --modversion").read().split("\n")[0]

# Load the compiled wrapper and bind its three exported entry points.
libKeyVIRutiliy = cdll.LoadLibrary("/home/nvidia/tkDNN-master/libtrt.so")
trtCarInit, trtCarUninit, trtCarRecogPic = (
    libKeyVIRutiliy.trtInit,
    libKeyVIRutiliy.trtUnInit,
    libKeyVIRutiliy.picRecog,
)

def testPic(path):
    """Run detection on one image file and print every detected box.

    The image is decoded with OpenCV, handed to the native ``picRecog`` entry
    point, and each result is printed as
    ``label prob xmin xmax ymin ymax``.

    :param path: path of the image file to analyse.
    :raises IOError: if the file cannot be decoded as an image.
    """
    img = cv2.imdecode(np.fromfile(path, dtype=np.uint8), cv2.IMREAD_COLOR)
    if img is None:
        # imdecode signals failure by returning None; stop before touching
        # the native library with an invalid buffer.
        raise IOError("failed to decode image: %s" % path)

    # picRecog takes `unsigned char*`, so expose the pixel buffer as c_ubyte.
    data_ctypes_ptr = img.ctypes.data_as(POINTER(c_ubyte))
    res = resultTrtBoxArray()

    trtCarInit()
    try:
        for _ in range(1):
            # In: capacity of `res`; out: number of boxes written by picRecog.
            recogNum = c_int(resultTrtBoxArray._length_)
            trtCarRecogPic(data_ctypes_ptr, img.shape[1], img.shape[0], byref(recogNum), byref(res))
            for idx in range(recogNum.value):
                box = res[idx]
                print(box.label, box.prob, box.xmin, box.xmax, box.ymin, box.ymax)
    finally:
        # Always pair trtInit with trtUnInit, even if detection raised.
        trtCarUninit()

if __name__ == '__main__':
    # Print the process id, then run the demo on the bundled test image.
    print("pid:", os.getpid())
    testPic('/home/nvidia/tkDNN-master/demo/0.jpg')

相应的工程文件已放入百度网盘,可下载测试,有问题随时交流。

;