YOLOv5 post-processing code, CPU and GPU (CUDA) versions
2022-07-23 07:56:00 【HELLOWORLD2424】
Explanation
This is the YOLOv5 post-processing code: it loads the prediction array saved from the network's forward pass and decodes it on the CPU and on the GPU (CUDA) separately to obtain the detection boxes. In the project's root folder, create a src directory, put the source files below into it, and write the makefile to compile them; note that the CUDA-related paths must be changed to match your environment. The compiled binary is written to the workspace directory.
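For reference, each row of the saved prediction array is laid out as cx, cy, w, h, objectness, followed by the per-class confidences; with COCO's 80 classes that is 85 floats per row, which is exactly what main.cpp assumes (ncols = 85). A minimal sketch for sanity-checking the file before decoding, assuming predict.data is a raw float32 dump in that layout:

#include <cstdio>
#include <fstream>
#include <vector>

// Print the first prediction row of predict.data (assumed: raw float32, 85 floats per row)
int main(){
    std::ifstream in("predict.data", std::ios::binary);
    if(!in.is_open()){ std::printf("predict.data not found\n"); return 1; }
    std::vector<float> row(85);
    in.read((char*)row.data(), row.size() * sizeof(float));
    std::printf("cx=%.2f cy=%.2f w=%.2f h=%.2f objectness=%.4f\n",
        row[0], row[1], row[2], row[3], row[4]);
    return 0;
}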
box.hpp
#ifndef BOX_HPP
#define BOX_HPP
struct Box{
float left, top, right, bottom, confidence;
int label;
Box() = default;
Box(float left, float top, float right, float bottom, float confidence, int label):
left(left), top(top), right(right), bottom(bottom), confidence(confidence), label(label){
}
};
#endif // BOX_HPP
gpu_decode.cu
#include <cuda_runtime.h>
static __device__ void affine_project(float* matrix, float x, float y, float* ox, float* oy){
*ox = matrix[0] * x + matrix[1] * y + matrix[2];
*oy = matrix[3] * x + matrix[4] * y + matrix[5];
}
static __global__ void decode_kernel(
float* predict, int num_bboxes, int num_classes, float confidence_threshold,
float* invert_affine_matrix, float* parray, int max_objects, int NUM_BOX_ELEMENT
){
int position = blockDim.x * blockIdx.x + threadIdx.x;
if (position >= num_bboxes) return;
float* pitem = predict + (5 + num_classes) * position;
float objectness = pitem[4];
if(objectness < confidence_threshold)
return;
float* class_confidence = pitem + 5;
float confidence = *class_confidence++;
int label = 0;
for(int i = 1; i < num_classes; ++i, ++class_confidence){
if(*class_confidence > confidence){
confidence = *class_confidence;
label = i;
}
}
confidence *= objectness;
if(confidence < confidence_threshold)
return;
    int index = atomicAdd(parray, 1); // parray[0] is the box count (stored as a float); reserve a slot
if(index >= max_objects)
return;
float cx = *pitem++;
float cy = *pitem++;
float width = *pitem++;
float height = *pitem++;
float left = cx - width * 0.5f;
float top = cy - height * 0.5f;
float right = cx + width * 0.5f;
float bottom = cy + height * 0.5f;
// affine_project(invert_affine_matrix, left, top, &left, &top);
// affine_project(invert_affine_matrix, right, bottom, &right, &bottom);
// left, top, right, bottom, confidence, class, keepflag
float* pout_item = parray + 1 + index * NUM_BOX_ELEMENT;
*pout_item++ = left;
*pout_item++ = top;
*pout_item++ = right;
*pout_item++ = bottom;
*pout_item++ = confidence;
*pout_item++ = label;
*pout_item++ = 1; // 1 = keep, 0 = ignore
}
static __device__ float box_iou(
float aleft, float atop, float aright, float abottom,
float bleft, float btop, float bright, float bbottom
){
float cleft = max(aleft, bleft);
float ctop = max(atop, btop);
float cright = min(aright, bright);
float cbottom = min(abottom, bbottom);
float c_area = max(cright - cleft, 0.0f) * max(cbottom - ctop, 0.0f);
if(c_area == 0.0f)
return 0.0f;
float a_area = max(0.0f, aright - aleft) * max(0.0f, abottom - atop);
float b_area = max(0.0f, bright - bleft) * max(0.0f, bbottom - btop);
return c_area / (a_area + b_area - c_area);
}
static __global__ void fast_nms_kernel(float* bboxes, int max_objects, float threshold, int NUM_BOX_ELEMENT){
int position = (blockDim.x * blockIdx.x + threadIdx.x);
int count = min((int)*bboxes, max_objects);
if (position >= count)
return;
// left, top, right, bottom, confidence, class, keepflag
float* pcurrent = bboxes + 1 + position * NUM_BOX_ELEMENT;
for(int i = 0; i < count; ++i){
float* pitem = bboxes + 1 + i * NUM_BOX_ELEMENT;
if(i == position || pcurrent[5] != pitem[5]) continue;
if(pitem[4] >= pcurrent[4]){
if(pitem[4] == pcurrent[4] && i < position)
continue;
float iou = box_iou(
pcurrent[0], pcurrent[1], pcurrent[2], pcurrent[3],
pitem[0], pitem[1], pitem[2], pitem[3]
);
if(iou > threshold){
pcurrent[6] = 0; // 1=keep, 0=ignore
return;
}
}
}
}
void decode_kernel_invoker(
float* predict, int num_bboxes, int num_classes, float confidence_threshold,
float nms_threshold, float* invert_affine_matrix, float* parray, int max_objects, int NUM_BOX_ELEMENT, cudaStream_t stream){
auto block = num_bboxes > 512 ? 512 : num_bboxes;
auto grid = (num_bboxes + block - 1) / block;
    /* If your editor draws a squiggly line under the kernel launch, don't worry: the code is valid, the IDE just doesn't parse the <<<>>> syntax */
decode_kernel<<<grid, block, 0, stream>>>(
predict, num_bboxes, num_classes, confidence_threshold,
invert_affine_matrix, parray, max_objects, NUM_BOX_ELEMENT
);
block = max_objects > 512 ? 512 : max_objects;
grid = (max_objects + block - 1) / block;
fast_nms_kernel<<<grid, block, 0, stream>>>(parray, max_objects, nms_threshold, NUM_BOX_ELEMENT);
}
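For reference, here is the layout of the output buffer parray shared by decode_kernel and fast_nms_kernel, reconstructed from the code above (NUM_BOX_ELEMENT = 7):

// parray[0]               box count, stored as a float and incremented with atomicAdd
// parray[1 + i*7 + 0..3]  left, top, right, bottom
// parray[1 + i*7 + 4]     confidence (objectness * best class score)
// parray[1 + i*7 + 5]     class label
// parray[1 + i*7 + 6]     keep flag: 1 = keep, 0 = suppressed by NMS

fast_nms_kernel runs one thread per candidate box: each thread scans every other box of the same class and clears its own keep flag when a higher-confidence box (ties broken by index) overlaps it beyond the threshold. No thread ever writes another thread's box, so no synchronization is needed, at the cost of O(n²) IoU evaluations.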
main.cpp
#include <cuda_runtime.h>
#include <opencv2/opencv.hpp>
#include <stdio.h>
#include <algorithm> // std::sort, std::max_element
#include <chrono>
#include <fstream>
#include <vector>
#include "box.hpp"
using namespace std;
using namespace cv;
#define checkRuntime(op) __check_cuda_runtime((op), #op, __FILE__, __LINE__)
bool __check_cuda_runtime(cudaError_t code, const char* op, const char* file, int line){
if(code != cudaSuccess){
const char* err_name = cudaGetErrorName(code);
const char* err_message = cudaGetErrorString(code);
printf("runtime error %s:%d %s failed. \n code = %s, message = %s\n", file, line, op, err_name, err_message);
return false;
}
return true;
}
static std::vector<uint8_t> load_file(const string& file){
ifstream in(file, ios::in | ios::binary);
    if (!in.is_open())
        return {};
in.seekg(0, ios::end);
size_t length = in.tellg();
std::vector<uint8_t> data;
if (length > 0){
in.seekg(0, ios::beg);
data.resize(length);
in.read((char*)&data[0], length);
}
in.close();
return data;
}
vector<Box> cpu_decode(float* predict, int rows, int cols, float confidence_threshold = 0.25f, float nms_threshold = 0.45f){
vector<Box> boxes;
int num_classes = cols - 5;
for(int i = 0; i < rows; ++i){
float* pitem = predict + i * cols;
float objness = pitem[4];
if(objness < confidence_threshold)
continue;
float* pclass = pitem + 5;
int label = std::max_element(pclass, pclass + num_classes) - pclass;
float prob = pclass[label];
float confidence = prob * objness;
if(confidence < confidence_threshold)
continue;
float cx = pitem[0];
float cy = pitem[1];
float width = pitem[2];
float height = pitem[3];
        float left   = cx - width * 0.5f;
        float top    = cy - height * 0.5f;
        float right  = cx + width * 0.5f;
        float bottom = cy + height * 0.5f;
        boxes.emplace_back(left, top, right, bottom, confidence, label);
}
std::sort(boxes.begin(), boxes.end(), [](Box& a, Box& b){
return a.confidence > b.confidence;});
std::vector<bool> remove_flags(boxes.size());
std::vector<Box> box_result;
box_result.reserve(boxes.size());
auto iou = [](const Box& a, const Box& b){
float cross_left = std::max(a.left, b.left);
float cross_top = std::max(a.top, b.top);
float cross_right = std::min(a.right, b.right);
float cross_bottom = std::min(a.bottom, b.bottom);
float cross_area = std::max(0.0f, cross_right - cross_left) * std::max(0.0f, cross_bottom - cross_top);
float union_area = std::max(0.0f, a.right - a.left) * std::max(0.0f, a.bottom - a.top)
+ std::max(0.0f, b.right - b.left) * std::max(0.0f, b.bottom - b.top) - cross_area;
if(cross_area == 0 || union_area == 0) return 0.0f;
return cross_area / union_area;
};
for(int i = 0; i < boxes.size(); ++i){
if(remove_flags[i]) continue;
auto& ibox = boxes[i];
box_result.emplace_back(ibox);
for(int j = i + 1; j < boxes.size(); ++j){
if(remove_flags[j]) continue;
auto& jbox = boxes[j];
if(ibox.label == jbox.label){
// class matched
if(iou(ibox, jbox) >= nms_threshold)
remove_flags[j] = true;
}
}
}
return box_result;
}
void decode_kernel_invoker(
float* predict, int num_bboxes, int num_classes, float confidence_threshold,
float nms_threshold, float* invert_affine_matrix, float* parray, int max_objects, int NUM_BOX_ELEMENT, cudaStream_t stream);
vector<Box> gpu_decode(float* predict, int rows, int cols, float confidence_threshold = 0.25f, float nms_threshold = 0.45f){
vector<Box> box_result;
cudaStream_t stream = nullptr;
checkRuntime(cudaStreamCreate(&stream));
float* predict_device = nullptr;
float* output_device = nullptr;
float* output_host = nullptr;
int max_objects = 1000;
int NUM_BOX_ELEMENT = 7; // left, top, right, bottom, confidence, class, keepflag
checkRuntime(cudaMalloc(&predict_device, rows * cols * sizeof(float)));
checkRuntime(cudaMalloc(&output_device, sizeof(float) + max_objects * NUM_BOX_ELEMENT * sizeof(float)));
checkRuntime(cudaMallocHost(&output_host, sizeof(float) + max_objects * NUM_BOX_ELEMENT * sizeof(float)));
    checkRuntime(cudaMemcpyAsync(predict_device, predict, rows * cols * sizeof(float), cudaMemcpyHostToDevice, stream));
    // output_device[0] is the box counter that decode_kernel bumps with atomicAdd; it must start at zero
    checkRuntime(cudaMemsetAsync(output_device, 0, sizeof(float), stream));
decode_kernel_invoker(
predict_device, rows, cols - 5, confidence_threshold,
nms_threshold, nullptr, output_device, max_objects, NUM_BOX_ELEMENT, stream
);
    checkRuntime(cudaMemcpyAsync(output_host, output_device,
        sizeof(float) + max_objects * NUM_BOX_ELEMENT * sizeof(float),
        cudaMemcpyDeviceToHost, stream
    ));
checkRuntime(cudaStreamSynchronize(stream));
int num_boxes = min((int)output_host[0], max_objects);
for(int i = 0; i < num_boxes; ++i){
float* ptr = output_host + 1 + NUM_BOX_ELEMENT * i;
int keep_flag = ptr[6];
if(keep_flag){
box_result.emplace_back(
ptr[0], ptr[1], ptr[2], ptr[3], ptr[4], (int)ptr[5]
);
}
}
checkRuntime(cudaStreamDestroy(stream));
checkRuntime(cudaFree(predict_device));
checkRuntime(cudaFree(output_device));
checkRuntime(cudaFreeHost(output_host));
return box_result;
}
int main(){
    auto data = load_file("predict.data");
    auto image = cv::imread("input-image.jpg");
    if(data.empty() || image.empty()){
        printf("failed to load predict.data or input-image.jpg\n");
        return -1;
    }
    float* ptr = (float*)data.data();
int nelem = data.size() / sizeof(float);
int ncols = 85;
int nrows = nelem / ncols;
auto boxes = gpu_decode(ptr, nrows, ncols);
for(auto& box : boxes){
cv::rectangle(image, cv::Point(box.left, box.top), cv::Point(box.right, box.bottom), cv::Scalar(0, 255, 0), 2);
        cv::putText(image, cv::format("%.2f", box.confidence), cv::Point(box.left, box.top - 7), cv::FONT_HERSHEY_SIMPLEX, 0.8, cv::Scalar(0, 0, 255), 2, cv::LINE_AA);
}
cv::imwrite("image-draw.jpg", image);
return 0;
}
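main.cpp includes <chrono> but never uses it. To compare the two decoders, a minimal sketch (assuming it is dropped into main after ptr, nrows and ncols are set; the function names match the code above) could be:

// Time both decoders. The first gpu_decode call also pays CUDA context creation
// and allocation cost, so warm up once before timing.
gpu_decode(ptr, nrows, ncols);
auto t0 = std::chrono::steady_clock::now();
auto cpu_boxes = cpu_decode(ptr, nrows, ncols);
auto t1 = std::chrono::steady_clock::now();
auto gpu_boxes = gpu_decode(ptr, nrows, ncols);
auto t2 = std::chrono::steady_clock::now();
printf("cpu: %d boxes, %.3f ms; gpu: %d boxes, %.3f ms\n",
    (int)cpu_boxes.size(), std::chrono::duration<double, std::milli>(t1 - t0).count(),
    (int)gpu_boxes.size(), std::chrono::duration<double, std::milli>(t2 - t1).count());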
makefile
cc := g++
name := pro
workdir := workspace
srcdir := src
objdir := objs
stdcpp := c++11
cuda_home := /datav/software/anaconda3/lib/python3.9/site-packages/trtpy/trt8cuda112cudnn8
syslib := /datav/software/anaconda3/lib/python3.9/site-packages/trtpy/lib
cpp_pkg := /datav/software/anaconda3/lib/python3.9/site-packages/trtpy/cpp-packages
cuda_arch :=
nvcc := $(cuda_home)/bin/nvcc -ccbin=$(cc)
# Find .cpp sources and derive the object (.o) and dependency (.mk) file lists
cpp_srcs := $(shell find $(srcdir) -name "*.cpp")
cpp_objs := $(cpp_srcs:.cpp=.cpp.o)
cpp_objs := $(cpp_objs:$(srcdir)/%=$(objdir)/%)
cpp_mk := $(cpp_objs:.cpp.o=.cpp.mk)
# Find .cu sources and derive the object (.o) and dependency (.mk) file lists
cu_srcs := $(shell find $(srcdir) -name "*.cu")
cu_objs := $(cu_srcs:.cu=.cu.o)
cu_objs := $(cu_objs:$(srcdir)/%=$(objdir)/%)
cu_mk := $(cu_objs:.cu.o=.cu.mk)
# OpenCV and CUDA libraries to link against
link_cuda := cudart cublas
link_trtpro :=
link_tensorRT :=
link_opencv := opencv_core opencv_imgcodecs opencv_imgproc
link_sys := stdc++ dl
link_librarys := $(link_cuda) $(link_tensorRT) $(link_sys) $(link_opencv)
# Header search paths. Note: there must be no trailing spaces after the backslashes
# List bare paths only; -I is added automatically below
include_paths := src \
$(cuda_home)/include/cuda \
$(cuda_home)/include/tensorRT \
$(cpp_pkg)/opencv4.2/include
# Library search paths; bare paths only, -L is added automatically below
library_paths := $(cuda_home)/lib64 $(syslib) $(cpp_pkg)/opencv4.2/lib
# Join the library paths into one string, e.g. a b c => a:b:c,
# so it can be exported as LD_LIBRARY_PATH below
empty :=
library_path_export := $(subst $(empty) $(empty),:,$(library_paths))
# Prefix every include path, library path and library with -I, -L and -l, and build the rpath entries
run_paths := $(foreach item,$(library_paths),-Wl,-rpath=$(item))
include_paths := $(foreach item,$(include_paths),-I$(item))
library_paths := $(foreach item,$(library_paths),-L$(item))
link_librarys := $(foreach item,$(link_librarys),-l$(item))
# For a different GPU, set cuda_arch to -gencode=arch=compute_75,code=sm_75 adjusted to its compute capability
# The compute capability of each GPU is listed here: https://developer.nvidia.com/zh-cn/cuda-gpus#compute
# On Jetson Nano, if the -m64 option is reported as unknown, delete it; this does not affect the result
cpp_compile_flags := -std=$(stdcpp) -w -g -O0 -m64 -fPIC -fopenmp -pthread
cu_compile_flags := -std=$(stdcpp) -w -g -O0 -m64 $(cuda_arch) -Xcompiler "$(cpp_compile_flags)"
link_flags := -pthread -fopenmp -Wl,-rpath='$$ORIGIN'
cpp_compile_flags += $(include_paths)
cu_compile_flags += $(include_paths)
link_flags += $(library_paths) $(link_librarys) $(run_paths)
# When a header changes, the included .mk dependency files make the affected .cpp/.cu files recompile automatically
ifneq ($(MAKECMDGOALS), clean)
-include $(cpp_mk) $(cu_mk)
endif
$(name) : $(workdir)/$(name)
all : $(name)
run : $(name)
	@cd $(workdir) && ./$(name) $(run_args)
$(workdir)/$(name) : $(cpp_objs) $(cu_objs)
	@echo Link $@
	@mkdir -p $(dir $@)
	@$(cc) $^ -o $@ $(link_flags)
$(objdir)/%.cpp.o : $(srcdir)/%.cpp
	@echo Compile CXX $<
	@mkdir -p $(dir $@)
	@$(cc) -c $< -o $@ $(cpp_compile_flags)
$(objdir)/%.cu.o : $(srcdir)/%.cu
	@echo Compile CUDA $<
	@mkdir -p $(dir $@)
	@$(nvcc) -c $< -o $@ $(cu_compile_flags)
# Generate .cpp dependency (.mk) files
$(objdir)/%.cpp.mk : $(srcdir)/%.cpp
	@echo Compile depends C++ $<
	@mkdir -p $(dir $@)
	@$(cc) -M $< -MF $@ -MT $(@:.cpp.mk=.cpp.o) $(cpp_compile_flags)
# Generate .cu dependency (.mk) files
$(objdir)/%.cu.mk : $(srcdir)/%.cu
	@echo Compile depends CUDA $<
	@mkdir -p $(dir $@)
	@$(nvcc) -M $< -MF $@ -MT $(@:.cu.mk=.cu.o) $(cu_compile_flags)
# Clean-up target
clean :
	@rm -rf $(objdir) $(workdir)/$(name) $(workdir)/input-image-pytorch.jpg $(workdir)/image-draw.jpg
# Prevent these target names from being treated as files
.PHONY : clean run $(name)
# Export the library search path so the binary can find its shared libraries when run
export LD_LIBRARY_PATH:=$(library_path_export)
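With the source files in src and the paths above adjusted to your environment, running make builds the binary into workspace, and make run executes it there; main expects predict.data and input-image.jpg in the workspace directory and writes image-draw.jpg next to them.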