3. PicoDet C++ ONNX Runtime inference, with C++ implementations of reshape and transpose
1. Complete ONNX C++ inference
"Complete" here refers to the model exported with Paddle's export.py and benchmark=True; the exported model has no post-processing or NMS in the graph. The inference approach can follow this repository directly: https://github.com/hpc203/picodet-onnxruntime
I made a few small modifications; the code is as follows:
#define _CRT_SECURE_NO_WARNINGS
#include <iostream>
#include <fstream>
#include <string>
#include <math.h>
#include <vector>
#include <array>
#include <algorithm>
#include <opencv2/imgproc.hpp>
#include <opencv2/highgui.hpp>
//#include <cuda_provider_factory.h>
#include <onnxruntime_cxx_api.h>
//using namespace cv;
//using namespace std;
//using namespace Ort;
typedef struct BoxInfo
{
float x1;
float y1;
float x2;
float y2;
float score;
int label;
} BoxInfo;
class PicoDet
{
public:
PicoDet(std::string model_path, std::string classesFile, float nms_threshold, float objThreshold);
void detect(cv::Mat& cv_image);
private:
float score_threshold = 0.5;
float nms_threshold = 0.5;
std::vector<std::string> class_names;
int num_class;
cv::Mat resize_image(cv::Mat srcimg, int* newh, int* neww, int* top, int* left);
std::vector<float> input_image_;
void normalize_(cv::Mat img);
void softmax_(const float* x, float* y, int length);
void generate_proposal(std::vector<BoxInfo>& generate_boxes, const int stride_, const float* out_score, const float* out_box);
void nms(std::vector<BoxInfo>& input_boxes);
const bool keep_ratio = false;
int inpWidth;
int inpHeight;
int num_outs;
int reg_max;
std::vector<int> stride;
//const float mean[3] = { 103.53, 116.28, 123.675 };
//const float stds[3] = { 57.375, 57.12, 58.395 };
const float mean[3] = { 0.0, 0.0, 0.0 };
const float stds[3] = { 255.0, 255.0, 255.0 };
Ort::Env env = Ort::Env(ORT_LOGGING_LEVEL_ERROR, "picodet");
Ort::Session* ort_session = nullptr;
Ort::SessionOptions sessionOptions = Ort::SessionOptions();
std::vector<char*> input_names;
std::vector<char*> output_names;
std::vector<std::vector<int64_t>> input_node_dims; // >=1 inputs
std::vector<std::vector<int64_t>> output_node_dims; // >=1 outputs
};
PicoDet::PicoDet(std::string model_path, std::string classesFile, float nms_threshold, float objThreshold)
{
std::ifstream ifs(classesFile.c_str());
std::string line;
while (std::getline(ifs, line)) this->class_names.push_back(line);
this->num_class = class_names.size();
this->nms_threshold = nms_threshold;
this->score_threshold = objThreshold;
std::wstring widestr = std::wstring(model_path.begin(), model_path.end());
//OrtStatus* status = OrtSessionOptionsAppendExecutionProvider_CUDA(sessionOptions, 0);
sessionOptions.SetGraphOptimizationLevel(ORT_ENABLE_BASIC);
ort_session = new Ort::Session(env, widestr.c_str(), sessionOptions);
size_t numInputNodes = ort_session->GetInputCount();
size_t numOutputNodes = ort_session->GetOutputCount();
Ort::AllocatorWithDefaultOptions allocator;
for (int i = 0; i < numInputNodes; i++)
{
input_names.push_back(ort_session->GetInputName(i, allocator));
Ort::TypeInfo input_type_info = ort_session->GetInputTypeInfo(i);
auto input_tensor_info = input_type_info.GetTensorTypeAndShapeInfo();
auto input_dims = input_tensor_info.GetShape();
input_node_dims.push_back(input_dims);
}
for (int i = 0; i < numOutputNodes; i++)
{
output_names.push_back(ort_session->GetOutputName(i, allocator));
Ort::TypeInfo output_type_info = ort_session->GetOutputTypeInfo(i);
auto output_tensor_info = output_type_info.GetTensorTypeAndShapeInfo();
auto output_dims = output_tensor_info.GetShape();
output_node_dims.push_back(output_dims);
/*for (int j = 0; j < output_dims.size(); j++)
{
cout << output_dims[j] << ",";
}
cout << endl;*/
}
this->inpHeight = input_node_dims[0][2];
this->inpWidth = input_node_dims[0][3];
this->num_outs = int(numOutputNodes * 0.5);
this->reg_max = output_node_dims[this->num_outs][output_node_dims[this->num_outs].size() - 1] / 4 - 1;
for (int i = 0; i < this->num_outs; i++)
{
stride.push_back(int(8 * pow(2, i)));
}
}
cv::Mat PicoDet::resize_image(cv::Mat srcimg, int* newh, int* neww, int* top, int* left)
{
int srch = srcimg.rows, srcw = srcimg.cols;
*newh = this->inpHeight;
*neww = this->inpWidth;
cv::Mat dstimg;
if (this->keep_ratio && srch != srcw) {
float hw_scale = (float)srch / srcw;
if (hw_scale > 1) {
*newh = this->inpHeight;
*neww = int(this->inpWidth / hw_scale);
cv::resize(srcimg, dstimg, cv::Size(*neww, *newh), 0, 0, cv::INTER_LINEAR);
*left = int((this->inpWidth - *neww) * 0.5);
copyMakeBorder(dstimg, dstimg, 0, 0, *left, this->inpWidth - *neww - *left, cv::BORDER_CONSTANT, 0);
}
else {
*newh = (int)this->inpHeight * hw_scale;
*neww = this->inpWidth;
cv::resize(srcimg, dstimg, cv::Size(*neww, *newh), 0, 0, cv::INTER_LINEAR); // the interpolation method should match the one used in training
*top = (int)(this->inpHeight - *newh) * 0.5;
copyMakeBorder(dstimg, dstimg, *top, this->inpHeight - *newh - *top, 0, 0, cv::BORDER_CONSTANT, 0);
}
}
else {
cv::resize(srcimg, dstimg, cv::Size(*neww, *newh), 0, 0, cv::INTER_LINEAR);
}
return dstimg;
}
void PicoDet::normalize_(cv::Mat img)
{
// img.convertTo(img, CV_32F);
int row = img.rows;
int col = img.cols;
this->input_image_.resize(row * col * img.channels());
for (int c = 0; c < 3; c++)
{
for (int i = 0; i < row; i++)
{
for (int j = 0; j < col; j++)
{
float pix = img.ptr<uchar>(i)[j * 3 + c];
this->input_image_[c * row * col + i * col + j] = (pix / 255.0 - mean[c] / 255.0) / (stds[c] / 255.0);
//this->input_image_[c * row * col + i * col + j] = (pix - mean[c]) / stds[c];
}
}
}
}
void PicoDet::softmax_(const float* x, float* y, int length)
{
float sum = 0;
int i = 0;
for (i = 0; i < length; i++)
{
y[i] = exp(x[i]);
sum += y[i];
}
for (i = 0; i < length; i++)
{
y[i] /= sum;
}
}
void PicoDet::generate_proposal(std::vector<BoxInfo>& generate_boxes, const int stride_, const float* out_score, const float* out_box)
{
const int num_grid_y = (int)ceil((float)this->inpHeight / stride_);
const int num_grid_x = (int)ceil((float)this->inpWidth / stride_);
std::cout << "num_grid_x=" << num_grid_x << ",num_grid_y=" << num_grid_y << std::endl;
const int reg_1max = reg_max + 1;
//std::cout << "score:" << std::endl;
for (int i = 0; i < num_grid_y; i++)
{
for (int j = 0; j < num_grid_x; j++)
{
int max_ind = 0;
float max_score = 0;
for (int k = 0; k < num_class; k++)
{
/* this indexing is for the original (un-pruned) model output */
float score = out_score[i * num_grid_x * num_class + j * num_class + k];
/* the commented-out line below is the pruned-model version (reshape and transpose removed) implemented with plain C indexing; use one of the two lines — index (i,j,k) maps to (k,i,j) */
//float score = std::sqrt(out_score[k*num_grid_y*num_grid_x+i*num_grid_x+j]);
//std::cout <<score << " ";
if (score > max_score)
{
max_score = score;
max_ind = k;
}
}
if (max_score >= score_threshold)
{
std::cout << "box:" << std::endl;
//const float* pbox = out_box + idx * reg_1max * 4;
float dis_pred[4];
float* y = new float[reg_1max];
for (int k = 0; k < 4; k++)
{
/* original model */
const float* tmp = out_box + i * num_grid_x * reg_1max * 4 + j * reg_1max * 4 + k * reg_1max;
//std::cout << "r:" << *tmp << std::endl;
/* alternative for the model without reshape/transpose */
//float* tmp = new float[reg_1max];
//for (int m = 0; m < reg_1max; m++)
//{
//tmp[m] = out_box[k * num_grid_y * num_grid_x * reg_1max + i * num_grid_x + j + m * num_grid_y * num_grid_x];
//}
//std::cout << "r:" << *tmp << std::endl;
softmax_(tmp, y, reg_1max);
float dis = 0.f;
for (int l = 0; l < reg_1max; l++)
{
dis += l * y[l];
}
dis_pred[k] = dis * stride_;
}
delete[] y;
float pb_cx = (j + 0.5f) * stride_ - 0.5;
float pb_cy = (i + 0.5f) * stride_ - 0.5;
float x0 = pb_cx - dis_pred[0];
float y0 = pb_cy - dis_pred[1];
float x1 = pb_cx + dis_pred[2];
float y1 = pb_cy + dis_pred[3];
generate_boxes.push_back(BoxInfo{ x0, y0, x1, y1, max_score, max_ind });
}
}
}
}
void PicoDet::nms(std::vector<BoxInfo>& input_boxes)
{
sort(input_boxes.begin(), input_boxes.end(), [](BoxInfo a, BoxInfo b) { return a.score > b.score; });
std::vector<float> vArea(input_boxes.size());
for (int i = 0; i < int(input_boxes.size()); ++i)
{
vArea[i] = (input_boxes.at(i).x2 - input_boxes.at(i).x1 + 1)
* (input_boxes.at(i).y2 - input_boxes.at(i).y1 + 1);
}
std::vector<bool> isSuppressed(input_boxes.size(), false);
for (int i = 0; i < int(input_boxes.size()); ++i)
{
if (isSuppressed[i]) { continue; }
for (int j = i + 1; j < int(input_boxes.size()); ++j)
{
if (isSuppressed[j]) { continue; }
float xx1 = (std::max)(input_boxes[i].x1, input_boxes[j].x1);
float yy1 = (std::max)(input_boxes[i].y1, input_boxes[j].y1);
float xx2 = (std::min)(input_boxes[i].x2, input_boxes[j].x2);
float yy2 = (std::min)(input_boxes[i].y2, input_boxes[j].y2);
float w = (std::max)(float(0), xx2 - xx1 + 1);
float h = (std::max)(float(0), yy2 - yy1 + 1);
float inter = w * h;
float ovr = inter / (vArea[i] + vArea[j] - inter);
if (ovr >= this->nms_threshold)
{
isSuppressed[j] = true;
}
}
}
// return post_nms;
int idx_t = 0;
input_boxes.erase(remove_if(input_boxes.begin(), input_boxes.end(), [&idx_t, &isSuppressed](const BoxInfo& f) { return isSuppressed[idx_t++]; }), input_boxes.end());
}
void PicoDet::detect(cv::Mat& srcimg)
{
int newh = 0, neww = 0, top = 0, left = 0;
cv::Mat cv_image = srcimg.clone();
cv::Mat dst = this->resize_image(cv_image, &newh, &neww, &top, &left);
this->normalize_(dst);
std::array<int64_t, 4> input_shape_{ 1, 3, this->inpHeight, this->inpWidth };
auto allocator_info = Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU);
Ort::Value input_tensor_ = Ort::Value::CreateTensor<float>(allocator_info, input_image_.data(), input_image_.size(), input_shape_.data(), input_shape_.size());
std::vector<Ort::Value> ort_outputs = ort_session->Run(Ort::RunOptions{ nullptr }, &input_names[0], &input_tensor_, 1, output_names.data(), output_names.size()); // run inference
// generate proposals
std::vector<BoxInfo> generate_boxes;
for (int i = 0; i < this->num_outs; i++)
{
//auto cls_shape = this->output_node_dims[i];
const float* cls_score = ort_outputs[i].GetTensorMutableData<float>();
//std::vector<int64_t> new_cls_shape = { cls_shape[0],cls_shape[1],cls_shape[2] * cls_shape[3] };
const float* bbox_pred = ort_outputs[i + this->num_outs].GetTensorMutableData<float>();
//auto reg_shape = this->output_node_dims[i+this->num_outs];
generate_proposal(generate_boxes, stride[i], cls_score, bbox_pred);
}
// Perform non-maximum suppression to eliminate redundant overlapping boxes with lower confidences
nms(generate_boxes);
float ratioh = (float)cv_image.rows / newh;
float ratiow = (float)cv_image.cols / neww;
for (size_t i = 0; i < generate_boxes.size(); ++i)
{
int xmin = (int)std::max((generate_boxes[i].x1 - left) * ratiow, 0.f);
int ymin = (int)std::max((generate_boxes[i].y1 - top) * ratioh, 0.f);
int xmax = (int)std::min((generate_boxes[i].x2 - left) * ratiow, (float)cv_image.cols);
int ymax = (int)std::min((generate_boxes[i].y2 - top) * ratioh, (float)cv_image.rows);
rectangle(srcimg, cv::Point(xmin, ymin), cv::Point(xmax, ymax), cv::Scalar(0, 0, 255), 2);
std::string label = cv::format("%.2f", generate_boxes[i].score);
label = this->class_names[generate_boxes[i].label] + ":" + label;
putText(srcimg, label, cv::Point(xmin, ymin - 5), cv::FONT_HERSHEY_SIMPLEX, 0.75, cv::Scalar(0, 255, 0), 1);
}
}
int main()
{
PicoDet mynet("picodet_xs_320_voc_256_20230405_shape.onnx", "ball.names", 0.5, 0.5); /// choice = ["picodet_m_320_coco.onnx", "picodet_m_416_coco.onnx", "picodet_s_320_coco.onnx", "picodet_s_416_coco.onnx"]
//PicoDet mynet("Cpicodet_xs_320_voc_256_20230405_shape_sim_prune.onnx", "ball.names", 0.5, 0.5);
std::string imgpath = "test.jpg";
cv::Mat bgrimg = cv::imread(imgpath,cv::IMREAD_COLOR);
cv::Mat rgbimg;
cv::cvtColor(bgrimg,rgbimg,cv::COLOR_BGR2RGB);
mynet.detect(rgbimg);
cv::Mat resultimg;
cv::cvtColor(rgbimg, resultimg, cv::COLOR_RGB2BGR);
cv::imwrite("test_result.jpg", resultimg);
static const std::string kWinName = "Deep learning object detection in ONNXRuntime";
cv::namedWindow(kWinName, cv::WINDOW_NORMAL);
cv::imshow(kWinName, resultimg);
cv::waitKey(0);
cv::destroyAllWindows();
}
2. Inference with the pruned model
Here the reshape and transpose operators have been removed from the model, so their effect has to be re-implemented in code.
For this part we only need to add back what was pruned. Since we pruned directly from the original model, the only thing missing compared with the original is the reshape/transpose section at the end of each head, so adding that part back in code is all that is needed.
Many acceleration chips cannot handle models containing this many reshape and transpose ops, so those operators have to run on the CPU; here we pull them out separately and check the effect.
2.1 Simulating the classification head's reshape and transpose in Python (mimicking C++)
As shown in the figure, there are two heads: classification and box regression. For the classification head, the ONNX Runtime output of shape 1×c×k×k goes through reshape to 1×c×(k·k) and then transpose to 1×(k·k)×c;
for the box-regression head, 1×32×k×k → reshape → 1×32×(k·k) → transpose → 1×(k·k)×32.
k is the final feature-map size of each detection head: with a 256 input and head strides of [8, 16, 32, 64], k is [32, 16, 8, 4]. Implementing this directly in C++ was confusing at first, so I worked it out in Python first.
We use k = 4 for the experiment, with 2 classes and a batch size of 1. For Python to mimic C++, the arrays have to be flattened so that, as in C++ memory, the data is laid out contiguously.
import numpy as np
num_grid_x = 4  # width
num_grid_y = 4  # height
num_cls = 2     # number of classes
a = np.arange(num_cls*num_grid_x*num_grid_y).reshape(num_cls, num_grid_y, num_grid_x)  # pruned-model output: c,k,k
aa = a.flatten()
b = a.transpose(1, 2, 0)  # reshape + transpose
bb = b.flatten()
# a above is the pruned model's output and b is the full model's; we want to access a using
# the normal way of accessing b, thereby reproducing the reshape and transpose
for i in range(num_grid_y):
    for j in range(num_grid_x):
        for k in range(num_cls):
            print(b[i, j, k], " ", a[k, i, j])
0 0
16 16
1 1
17 17
2 2
18 18
3 3
19 19
4 4
20 20
5 5
21 21
6 6
22 22
7 7
23 23
8 8
24 24
9 9
25 25
10 10
26 26
11 11
27 27
12 12
28 28
13 13
29 29
14 14
30 30
15 15
31 31
You can see the results are identical. What happens if the output is not square? See the code below:
num_grid_x = 4  # width
num_grid_y = 3  # height
num_cls = 2     # number of classes
a = np.arange(num_cls*num_grid_x*num_grid_y).reshape(num_cls, num_grid_y, num_grid_x)  # pruned-model output: c,k,k
aa = a.flatten()
b = a.transpose(1, 2, 0)  # reshape + transpose
bb = b.flatten()
# a is the pruned model's output and b is the full model's; access a through b's usual indexing
for i in range(num_grid_y):
    for j in range(num_grid_x):
        for k in range(num_cls):
            print(b[i, j, k], " ", a[k, i, j])
0 0
12 12
1 1
13 13
2 2
14 14
3 3
15 15
4 4
16 16
5 5
17 17
6 6
18 18
7 7
19 19
8 8
20 20
9 9
21 21
10 10
22 22
11 11
23 23
Again everything matches, so we can conclude: between the k×k×c and c×k×k layouts, index (i, j, k) in the former corresponds to (k, i, j) in the latter; equivalently, c×k×k becomes k×k×c by transposing axes (0, 1, 2) to (1, 2, 0), i.e. ijk becomes kij.
Next, implement the same thing with C-style flat indexing:
num_grid_x = 4  # width
num_grid_y = 4  # height
num_cls = 2     # number of classes
a = np.arange(num_cls*num_grid_x*num_grid_y).reshape(num_cls, num_grid_y, num_grid_x)  # pruned-model output: c,k,k
aa = a.flatten()
b = a.transpose(1, 2, 0)  # reshape + transpose
bb = b.flatten()
# a is the pruned model's output and b is the full model's; now index the flat buffers the C way
for i in range(num_grid_y):
    for j in range(num_grid_x):
        for k in range(num_cls):
            tb = i*num_grid_x*num_cls + j*num_cls + k
            ta = k*num_grid_y*num_grid_x + i*num_grid_x + j
            print(bb[tb], " ", aa[ta])
0 0
16 16
1 1
17 17
2 2
18 18
3 3
19 19
4 4
20 20
5 5
21 21
6 6
22 22
7 7
23 23
8 8
24 24
9 9
25 25
10 10
26 26
11 11
27 27
12 12
28 28
13 13
29 29
14 14
30 30
15 15
31 31
For the C++ code, see PicoDet::generate_proposal above; a small standalone sketch of the same mapping follows below.
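To make the mapping concrete in C++ (this is only a minimal standalone sketch, not taken from the project code; the buffer is simply filled with 0..31 like the numpy example), the snippet below reads class scores from a pruned-model buffer laid out as C×K×K using the index arithmetic derived above:

#include <cstdio>
#include <vector>

// Minimal sketch: the buffer layout is assumed to be C x K x K, as produced by the pruned model.
// Reading the score of class c at grid cell (row i, col j) needs no reshape/transpose:
// index (i, j, c) in the K x K x C view equals (c, i, j) in the C x K x K buffer.
static float class_score(const float* out_score, int num_grid_y, int num_grid_x,
                         int num_class, int i, int j, int c)
{
    (void)num_class; // not needed for the C x K x K layout, kept only for symmetry
    return out_score[c * num_grid_y * num_grid_x + i * num_grid_x + j];
}

int main()
{
    const int ky = 4, kx = 4, nc = 2;
    std::vector<float> buf(nc * ky * kx);
    for (size_t t = 0; t < buf.size(); ++t) buf[t] = (float)t; // fill 0..31 like the numpy example
    // prints 0 16, then 1 17, then 2 18, ... — the same values as the Python output above
    for (int i = 0; i < ky; ++i)
        for (int j = 0; j < kx; ++j)
            printf("%g %g\n", class_score(buf.data(), ky, kx, nc, i, j, 0),
                              class_score(buf.data(), ky, kx, nc, i, j, 1));
    return 0;
}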
2.2 Simulating the regression head's reshape and transpose in Python
The leading 32 is fixed (32 = 4 box sides × (reg_max + 1) = 4 × 8).
num_grid_x = 4
num_grid_y = 4
# a is the pruned model's output; 32 x k x k and 4 x 8 x k x k have essentially the same memory layout
a = np.arange(4*8*num_grid_x*num_grid_y).reshape(32, num_grid_y, num_grid_x).reshape(4, 8, num_grid_y, num_grid_x)
b = a.transpose(2, 3, 0, 1)
for i in range(num_grid_y):
    for j in range(num_grid_x):
        for k in range(4):
            print(b[i, j, k], " ", a[k, :, i, j])
[ 0 16 32 48 64 80 96 112] [ 0 16 32 48 64 80 96 112]
[128 144 160 176 192 208 224 240] [128 144 160 176 192 208 224 240]
[256 272 288 304 320 336 352 368] [256 272 288 304 320 336 352 368]
[384 400 416 432 448 464 480 496] [384 400 416 432 448 464 480 496]
[ 1 17 33 49 65 81 97 113] [ 1 17 33 49 65 81 97 113]
[129 145 161 177 193 209 225 241] [129 145 161 177 193 209 225 241]
[257 273 289 305 321 337 353 369] [257 273 289 305 321 337 353 369]
[385 401 417 433 449 465 481 497] [385 401 417 433 449 465 481 497]
[ 2 18 34 50 66 82 98 114] [ 2 18 34 50 66 82 98 114]
[130 146 162 178 194 210 226 242] [130 146 162 178 194 210 226 242]
[258 274 290 306 322 338 354 370] [258 274 290 306 322 338 354 370]
[386 402 418 434 450 466 482 498] [386 402 418 434 450 466 482 498]
[ 3 19 35 51 67 83 99 115] [ 3 19 35 51 67 83 99 115]
[131 147 163 179 195 211 227 243] [131 147 163 179 195 211 227 243]
[259 275 291 307 323 339 355 371] [259 275 291 307 323 339 355 371]
[387 403 419 435 451 467 483 499] [387 403 419 435 451 467 483 499]
[ 4 20 36 52 68 84 100 116] [ 4 20 36 52 68 84 100 116]
[132 148 164 180 196 212 228 244] [132 148 164 180 196 212 228 244]
[260 276 292 308 324 340 356 372] [260 276 292 308 324 340 356 372]
[388 404 420 436 452 468 484 500] [388 404 420 436 452 468 484 500]
[ 5 21 37 53 69 85 101 117] [ 5 21 37 53 69 85 101 117]
[133 149 165 181 197 213 229 245] [133 149 165 181 197 213 229 245]
[261 277 293 309 325 341 357 373] [261 277 293 309 325 341 357 373]
[389 405 421 437 453 469 485 501] [389 405 421 437 453 469 485 501]
[ 6 22 38 54 70 86 102 118] [ 6 22 38 54 70 86 102 118]
[134 150 166 182 198 214 230 246] [134 150 166 182 198 214 230 246]
[262 278 294 310 326 342 358 374] [262 278 294 310 326 342 358 374]
[390 406 422 438 454 470 486 502] [390 406 422 438 454 470 486 502]
[ 7 23 39 55 71 87 103 119] [ 7 23 39 55 71 87 103 119]
[135 151 167 183 199 215 231 247] [135 151 167 183 199 215 231 247]
[263 279 295 311 327 343 359 375] [263 279 295 311 327 343 359 375]
[391 407 423 439 455 471 487 503] [391 407 423 439 455 471 487 503]
[ 8 24 40 56 72 88 104 120] [ 8 24 40 56 72 88 104 120]
[136 152 168 184 200 216 232 248] [136 152 168 184 200 216 232 248]
[264 280 296 312 328 344 360 376] [264 280 296 312 328 344 360 376]
[392 408 424 440 456 472 488 504] [392 408 424 440 456 472 488 504]
[ 9 25 41 57 73 89 105 121] [ 9 25 41 57 73 89 105 121]
[137 153 169 185 201 217 233 249] [137 153 169 185 201 217 233 249]
[265 281 297 313 329 345 361 377] [265 281 297 313 329 345 361 377]
[393 409 425 441 457 473 489 505] [393 409 425 441 457 473 489 505]
[ 10 26 42 58 74 90 106 122] [ 10 26 42 58 74 90 106 122]
[138 154 170 186 202 218 234 250] [138 154 170 186 202 218 234 250]
[266 282 298 314 330 346 362 378] [266 282 298 314 330 346 362 378]
[394 410 426 442 458 474 490 506] [394 410 426 442 458 474 490 506]
[ 11 27 43 59 75 91 107 123] [ 11 27 43 59 75 91 107 123]
[139 155 171 187 203 219 235 251] [139 155 171 187 203 219 235 251]
[267 283 299 315 331 347 363 379] [267 283 299 315 331 347 363 379]
[395 411 427 443 459 475 491 507] [395 411 427 443 459 475 491 507]
[ 12 28 44 60 76 92 108 124] [ 12 28 44 60 76 92 108 124]
[140 156 172 188 204 220 236 252] [140 156 172 188 204 220 236 252]
[268 284 300 316 332 348 364 380] [268 284 300 316 332 348 364 380]
[396 412 428 444 460 476 492 508] [396 412 428 444 460 476 492 508]
[ 13 29 45 61 77 93 109 125] [ 13 29 45 61 77 93 109 125]
[141 157 173 189 205 221 237 253] [141 157 173 189 205 221 237 253]
[269 285 301 317 333 349 365 381] [269 285 301 317 333 349 365 381]
[397 413 429 445 461 477 493 509] [397 413 429 445 461 477 493 509]
[ 14 30 46 62 78 94 110 126] [ 14 30 46 62 78 94 110 126]
[142 158 174 190 206 222 238 254] [142 158 174 190 206 222 238 254]
[270 286 302 318 334 350 366 382] [270 286 302 318 334 350 366 382]
[398 414 430 446 462 478 494 510] [398 414 430 446 462 478 494 510]
[ 15 31 47 63 79 95 111 127] [ 15 31 47 63 79 95 111 127]
[143 159 175 191 207 223 239 255] [143 159 175 191 207 223 239 255]
[271 287 303 319 335 351 367 383] [271 287 303 319 335 351 367 383]
[399 415 431 447 463 479 495 511] [399 415 431 447 463 479 495 511]
In the code above the array is four-dimensional, but we effectively only operate on three dimensions; the extra reshape is there just to split 32 into 4×8, i.e. 4 groups of 8 contiguous numbers (the 8 distribution bins per box side). To move this toward C code, compare the two listings below:
num_grid_x = 4
num_grid_y = 4
a = np.arange(4*8*num_grid_x*num_grid_y).reshape(32, num_grid_y, num_grid_x).reshape(4, 8, num_grid_y, num_grid_x)
aa = a.flatten()
b = a.transpose(2, 3, 0, 1)
bb = b.flatten()
for i in range(num_grid_y):
    for j in range(num_grid_x):
        for k in range(4):
            # print(b[i,j,k], " ", a[k,:,i,j])
            t1 = i*num_grid_x*32 + j*32 + k*8
            t2 = k*8*num_grid_x*num_grid_y + i*num_grid_x + j
            print(f"{bb[t1:t1+8]} {aa[t2:t2+8]}")
[ 0 16 32 48 64 80 96 112] [0 1 2 3 4 5 6 7]
[128 144 160 176 192 208 224 240] [128 129 130 131 132 133 134 135]
[256 272 288 304 320 336 352 368] [256 257 258 259 260 261 262 263]
[384 400 416 432 448 464 480 496] [384 385 386 387 388 389 390 391]
[ 1 17 33 49 65 81 97 113] [1 2 3 4 5 6 7 8]
[129 145 161 177 193 209 225 241] [129 130 131 132 133 134 135 136]
[257 273 289 305 321 337 353 369] [257 258 259 260 261 262 263 264]
[385 401 417 433 449 465 481 497] [385 386 387 388 389 390 391 392]
[ 2 18 34 50 66 82 98 114] [2 3 4 5 6 7 8 9]
[130 146 162 178 194 210 226 242] [130 131 132 133 134 135 136 137]
[258 274 290 306 322 338 354 370] [258 259 260 261 262 263 264 265]
[386 402 418 434 450 466 482 498] [386 387 388 389 390 391 392 393]
[ 3 19 35 51 67 83 99 115] [ 3 4 5 6 7 8 9 10]
[131 147 163 179 195 211 227 243] [131 132 133 134 135 136 137 138]
[259 275 291 307 323 339 355 371] [259 260 261 262 263 264 265 266]
[387 403 419 435 451 467 483 499] [387 388 389 390 391 392 393 394]
[ 4 20 36 52 68 84 100 116] [ 4 5 6 7 8 9 10 11]
[132 148 164 180 196 212 228 244] [132 133 134 135 136 137 138 139]
[260 276 292 308 324 340 356 372] [260 261 262 263 264 265 266 267]
[388 404 420 436 452 468 484 500] [388 389 390 391 392 393 394 395]
[ 5 21 37 53 69 85 101 117] [ 5 6 7 8 9 10 11 12]
[133 149 165 181 197 213 229 245] [133 134 135 136 137 138 139 140]
[261 277 293 309 325 341 357 373] [261 262 263 264 265 266 267 268]
[389 405 421 437 453 469 485 501] [389 390 391 392 393 394 395 396]
[ 6 22 38 54 70 86 102 118] [ 6 7 8 9 10 11 12 13]
[134 150 166 182 198 214 230 246] [134 135 136 137 138 139 140 141]
[262 278 294 310 326 342 358 374] [262 263 264 265 266 267 268 269]
[390 406 422 438 454 470 486 502] [390 391 392 393 394 395 396 397]
[ 7 23 39 55 71 87 103 119] [ 7 8 9 10 11 12 13 14]
[135 151 167 183 199 215 231 247] [135 136 137 138 139 140 141 142]
[263 279 295 311 327 343 359 375] [263 264 265 266 267 268 269 270]
[391 407 423 439 455 471 487 503] [391 392 393 394 395 396 397 398]
[ 8 24 40 56 72 88 104 120] [ 8 9 10 11 12 13 14 15]
[136 152 168 184 200 216 232 248] [136 137 138 139 140 141 142 143]
[264 280 296 312 328 344 360 376] [264 265 266 267 268 269 270 271]
[392 408 424 440 456 472 488 504] [392 393 394 395 396 397 398 399]
[ 9 25 41 57 73 89 105 121] [ 9 10 11 12 13 14 15 16]
[137 153 169 185 201 217 233 249] [137 138 139 140 141 142 143 144]
[265 281 297 313 329 345 361 377] [265 266 267 268 269 270 271 272]
[393 409 425 441 457 473 489 505] [393 394 395 396 397 398 399 400]
[ 10 26 42 58 74 90 106 122] [10 11 12 13 14 15 16 17]
[138 154 170 186 202 218 234 250] [138 139 140 141 142 143 144 145]
[266 282 298 314 330 346 362 378] [266 267 268 269 270 271 272 273]
[394 410 426 442 458 474 490 506] [394 395 396 397 398 399 400 401]
[ 11 27 43 59 75 91 107 123] [11 12 13 14 15 16 17 18]
[139 155 171 187 203 219 235 251] [139 140 141 142 143 144 145 146]
[267 283 299 315 331 347 363 379] [267 268 269 270 271 272 273 274]
[395 411 427 443 459 475 491 507] [395 396 397 398 399 400 401 402]
[ 12 28 44 60 76 92 108 124] [12 13 14 15 16 17 18 19]
[140 156 172 188 204 220 236 252] [140 141 142 143 144 145 146 147]
[268 284 300 316 332 348 364 380] [268 269 270 271 272 273 274 275]
[396 412 428 444 460 476 492 508] [396 397 398 399 400 401 402 403]
[ 13 29 45 61 77 93 109 125] [13 14 15 16 17 18 19 20]
[141 157 173 189 205 221 237 253] [141 142 143 144 145 146 147 148]
[269 285 301 317 333 349 365 381] [269 270 271 272 273 274 275 276]
[397 413 429 445 461 477 493 509] [397 398 399 400 401 402 403 404]
[ 14 30 46 62 78 94 110 126] [14 15 16 17 18 19 20 21]
[142 158 174 190 206 222 238 254] [142 143 144 145 146 147 148 149]
[270 286 302 318 334 350 366 382] [270 271 272 273 274 275 276 277]
[398 414 430 446 462 478 494 510] [398 399 400 401 402 403 404 405]
[ 15 31 47 63 79 95 111 127] [15 16 17 18 19 20 21 22]
[143 159 175 191 207 223 239 255] [143 144 145 146 147 148 149 150]
[271 287 303 319 335 351 367 383] [271 272 273 274 275 276 277 278]
[399 415 431 447 463 479 495 511] [399 400 401 402 403 404 405 406]
num_grid_x = 4
num_grid_y = 4
a = np.arange(4*8*num_grid_x*num_grid_y).reshape(32, num_grid_y, num_grid_x).reshape(4, 8, num_grid_y, num_grid_x)
aa = a.flatten()
b = a.transpose(2, 3, 0, 1)  # -> num_grid_y, num_grid_x, 4, 8
bb = b.flatten()
for i in range(num_grid_y):
    for j in range(num_grid_x):
        for k in range(4):
            # print(b[i,j,k], " ", a[k,:,i,j])
            t1 = i*num_grid_x*32 + j*32 + k*8
            t2 = k*8*num_grid_x*num_grid_y + i*num_grid_x + j
            print(f"{bb[t1:t1+8]} {aa[t2:t2+8*16:16]}")
[ 0 16 32 48 64 80 96 112] [ 0 16 32 48 64 80 96 112]
[128 144 160 176 192 208 224 240] [128 144 160 176 192 208 224 240]
[256 272 288 304 320 336 352 368] [256 272 288 304 320 336 352 368]
[384 400 416 432 448 464 480 496] [384 400 416 432 448 464 480 496]
[ 1 17 33 49 65 81 97 113] [ 1 17 33 49 65 81 97 113]
[129 145 161 177 193 209 225 241] [129 145 161 177 193 209 225 241]
[257 273 289 305 321 337 353 369] [257 273 289 305 321 337 353 369]
[385 401 417 433 449 465 481 497] [385 401 417 433 449 465 481 497]
[ 2 18 34 50 66 82 98 114] [ 2 18 34 50 66 82 98 114]
[130 146 162 178 194 210 226 242] [130 146 162 178 194 210 226 242]
[258 274 290 306 322 338 354 370] [258 274 290 306 322 338 354 370]
[386 402 418 434 450 466 482 498] [386 402 418 434 450 466 482 498]
[ 3 19 35 51 67 83 99 115] [ 3 19 35 51 67 83 99 115]
[131 147 163 179 195 211 227 243] [131 147 163 179 195 211 227 243]
[259 275 291 307 323 339 355 371] [259 275 291 307 323 339 355 371]
[387 403 419 435 451 467 483 499] [387 403 419 435 451 467 483 499]
[ 4 20 36 52 68 84 100 116] [ 4 20 36 52 68 84 100 116]
[132 148 164 180 196 212 228 244] [132 148 164 180 196 212 228 244]
[260 276 292 308 324 340 356 372] [260 276 292 308 324 340 356 372]
[388 404 420 436 452 468 484 500] [388 404 420 436 452 468 484 500]
[ 5 21 37 53 69 85 101 117] [ 5 21 37 53 69 85 101 117]
[133 149 165 181 197 213 229 245] [133 149 165 181 197 213 229 245]
[261 277 293 309 325 341 357 373] [261 277 293 309 325 341 357 373]
[389 405 421 437 453 469 485 501] [389 405 421 437 453 469 485 501]
[ 6 22 38 54 70 86 102 118] [ 6 22 38 54 70 86 102 118]
[134 150 166 182 198 214 230 246] [134 150 166 182 198 214 230 246]
[262 278 294 310 326 342 358 374] [262 278 294 310 326 342 358 374]
[390 406 422 438 454 470 486 502] [390 406 422 438 454 470 486 502]
[ 7 23 39 55 71 87 103 119] [ 7 23 39 55 71 87 103 119]
[135 151 167 183 199 215 231 247] [135 151 167 183 199 215 231 247]
[263 279 295 311 327 343 359 375] [263 279 295 311 327 343 359 375]
[391 407 423 439 455 471 487 503] [391 407 423 439 455 471 487 503]
[ 8 24 40 56 72 88 104 120] [ 8 24 40 56 72 88 104 120]
[136 152 168 184 200 216 232 248] [136 152 168 184 200 216 232 248]
[264 280 296 312 328 344 360 376] [264 280 296 312 328 344 360 376]
[392 408 424 440 456 472 488 504] [392 408 424 440 456 472 488 504]
[ 9 25 41 57 73 89 105 121] [ 9 25 41 57 73 89 105 121]
[137 153 169 185 201 217 233 249] [137 153 169 185 201 217 233 249]
[265 281 297 313 329 345 361 377] [265 281 297 313 329 345 361 377]
[393 409 425 441 457 473 489 505] [393 409 425 441 457 473 489 505]
[ 10 26 42 58 74 90 106 122] [ 10 26 42 58 74 90 106 122]
[138 154 170 186 202 218 234 250] [138 154 170 186 202 218 234 250]
[266 282 298 314 330 346 362 378] [266 282 298 314 330 346 362 378]
[394 410 426 442 458 474 490 506] [394 410 426 442 458 474 490 506]
[ 11 27 43 59 75 91 107 123] [ 11 27 43 59 75 91 107 123]
[139 155 171 187 203 219 235 251] [139 155 171 187 203 219 235 251]
[267 283 299 315 331 347 363 379] [267 283 299 315 331 347 363 379]
[395 411 427 443 459 475 491 507] [395 411 427 443 459 475 491 507]
[ 12 28 44 60 76 92 108 124] [ 12 28 44 60 76 92 108 124]
[140 156 172 188 204 220 236 252] [140 156 172 188 204 220 236 252]
[268 284 300 316 332 348 364 380] [268 284 300 316 332 348 364 380]
[396 412 428 444 460 476 492 508] [396 412 428 444 460 476 492 508]
[ 13 29 45 61 77 93 109 125] [ 13 29 45 61 77 93 109 125]
[141 157 173 189 205 221 237 253] [141 157 173 189 205 221 237 253]
[269 285 301 317 333 349 365 381] [269 285 301 317 333 349 365 381]
[397 413 429 445 461 477 493 509] [397 413 429 445 461 477 493 509]
[ 14 30 46 62 78 94 110 126] [ 14 30 46 62 78 94 110 126]
[142 158 174 190 206 222 238 254] [142 158 174 190 206 222 238 254]
[270 286 302 318 334 350 366 382] [270 286 302 318 334 350 366 382]
[398 414 430 446 462 478 494 510] [398 414 430 446 462 478 494 510]
[ 15 31 47 63 79 95 111 127] [ 15 31 47 63 79 95 111 127]
[143 159 175 191 207 223 239 255] [143 159 175 191 207 223 239 255]
[271 287 303 319 335 351 367 383] [271 287 303 319 335 351 367 383]
[399 415 431 447 463 479 495 511] [399 415 431 447 463 479 495 511]
So in C code it becomes:
// reg_1max = 8; tmp holds the 8 contiguous distribution values for one box side
float* tmp = new float[reg_1max];
for (int m = 0; m < reg_1max; m++)
{
tmp[m] = out_box[k * reg_1max* num_grid_y * num_grid_x + i * num_grid_x + j + m * num_grid_y * num_grid_x];
}
To understand it: in the k×k×4×8 layout, the flat offset i*num_grid_x*4*8 + j*4*8 + k*8 is the start address of a group of 8 numbers, and you simply read the next 8 consecutive values.
In the 4×8×k×k layout the Python indexing is a[k, :, i, j], where the colon takes the whole axis of length 8; the C equivalent is out_box[k*8*num_grid_y*num_grid_x + m*num_grid_y*num_grid_x + i*num_grid_x + j], where m plays the role of the colon — m = 0, 1, ..., 7 walks through the 8 values along that dimension. A standalone check of this mapping is sketched below.
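As a quick self-check of the regression-head mapping (again only a minimal standalone sketch, not from the project code; the buffers are just filled with 0..511 like the numpy examples), the following program reads the 8 distribution values of each box side from both layouts and verifies that they agree:

#include <cstdio>
#include <vector>

int main()
{
    const int kx = 4, ky = 4, reg_1max = 8;
    // full-model layout: k x k x 4 x 8 ; pruned-model layout: 4 x 8 x k x k
    std::vector<float> full(ky * kx * 4 * reg_1max), pruned(4 * reg_1max * ky * kx);
    for (int c = 0; c < 4 * reg_1max; ++c)          // c = k*8 + m in the pruned layout
        for (int i = 0; i < ky; ++i)
            for (int j = 0; j < kx; ++j) {
                float v = (float)(c * ky * kx + i * kx + j);  // values 0..511
                pruned[c * ky * kx + i * kx + j] = v;         // (k, m, i, j)
                full[(i * kx + j) * 4 * reg_1max + c] = v;    // (i, j, k, m)
            }
    // compare the 8 distribution values of side k at cell (i, j) read from both layouts
    int mismatches = 0;
    for (int i = 0; i < ky; ++i)
        for (int j = 0; j < kx; ++j)
            for (int k = 0; k < 4; ++k)
                for (int m = 0; m < reg_1max; ++m) {
                    float a = full[i * kx * 4 * reg_1max + j * 4 * reg_1max + k * reg_1max + m];
                    float b = pruned[k * reg_1max * ky * kx + m * ky * kx + i * kx + j];
                    if (a != b) ++mismatches;
                }
    printf("mismatches: %d\n", mismatches); // expected: 0
    return 0;
}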
3. An improved softmax
I switched to a different softmax implementation (a fast approximate exp plus max-subtraction for numerical stability). The full program for the pruned model follows:
#define _CRT_SECURE_NO_WARNINGS
#include <iostream>
#include <fstream>
#include <string>
#include <cmath>
#include <vector>
#include <array>
#include <algorithm>
#include <opencv2/imgproc.hpp>
#include <opencv2/highgui.hpp>
//#include <cuda_provider_factory.h>
#include <onnxruntime_cxx_api.h>
//using namespace cv;
//using namespace std;
//using namespace Ort;
typedef struct BoxInfo
{
float x1;
float y1;
float x2;
float y2;
float score;
int label;
} BoxInfo;
class PicoDet
{
public:
PicoDet(std::string model_path, std::string classesFile, float nms_threshold, float objThreshold);
void detect(cv::Mat& cv_image);
private:
float score_threshold = 0.5;
float nms_threshold = 0.5;
std::vector<std::string> class_names;
int num_class;
cv::Mat resize_image(cv::Mat srcimg, int* newh, int* neww, int* top, int* left);
std::vector<float> input_image_;
void normalize_(cv::Mat img);
inline float fast_exp(float x);
template <typename _Tp>
int activation_function_softmax(const _Tp* src, _Tp* dst, int length);
//void softmax_(const float* x, float* y, int length);
void generate_proposal(std::vector<BoxInfo>& generate_boxes, const int stride_, const float* out_score, const float* out_box);
void nms(std::vector<BoxInfo>& input_boxes);
const bool keep_ratio = false;
int inpWidth;
int inpHeight;
int num_outs;
int reg_max;
std::vector<int> stride;
//const float mean[3] = { 103.53, 116.28, 123.675 };
//const float stds[3] = { 57.375, 57.12, 58.395 };
const float mean[3] = { 0.0, 0.0, 0.0 };
const float stds[3] = { 255.0, 255.0, 255.0 };
Ort::Env env = Ort::Env(ORT_LOGGING_LEVEL_ERROR, "picodet");
Ort::Session* ort_session = nullptr;
Ort::SessionOptions sessionOptions = Ort::SessionOptions();
std::vector<char*> input_names;
std::vector<char*> output_names;
std::vector<std::vector<int64_t>> input_node_dims; // >=1 inputs
std::vector<std::vector<int64_t>> output_node_dims; // >=1 outputs
};
inline float PicoDet::fast_exp(float x) {
union {
uint32_t i;
float f;
} v{};
v.i = (1 << 23) * (1.4426950409 * x + 126.93490512f);
return v.f;
}
template <typename _Tp>
int PicoDet::activation_function_softmax(const _Tp* src, _Tp* dst, int length) {
const _Tp alpha = *std::max_element(src, src + length);
_Tp denominator{ 0 };
for (int i = 0; i < length; ++i) {
dst[i] = fast_exp(src[i] - alpha);
denominator += dst[i];
}
for (int i = 0; i < length; ++i) {
dst[i] /= denominator;
}
return 0;
}
PicoDet::PicoDet(std::string model_path, std::string classesFile, float nms_threshold, float objThreshold)
{
std::ifstream ifs(classesFile.c_str());
std::string line;
while (std::getline(ifs, line)) this->class_names.push_back(line);
this->num_class = class_names.size();
this->nms_threshold = nms_threshold;
this->score_threshold = objThreshold;
std::wstring widestr = std::wstring(model_path.begin(), model_path.end());
//OrtStatus* status = OrtSessionOptionsAppendExecutionProvider_CUDA(sessionOptions, 0);
sessionOptions.SetGraphOptimizationLevel(ORT_ENABLE_BASIC);
ort_session = new Ort::Session(env, widestr.c_str(), sessionOptions);
size_t numInputNodes = ort_session->GetInputCount();
size_t numOutputNodes = ort_session->GetOutputCount();
Ort::AllocatorWithDefaultOptions allocator;
for (int i = 0; i < numInputNodes; i++)
{
input_names.push_back(ort_session->GetInputName(i, allocator));
Ort::TypeInfo input_type_info = ort_session->GetInputTypeInfo(i);
auto input_tensor_info = input_type_info.GetTensorTypeAndShapeInfo();
auto input_dims = input_tensor_info.GetShape();
input_node_dims.push_back(input_dims);
}
for (int i = 0; i < numOutputNodes; i++)
{
output_names.push_back(ort_session->GetOutputName(i, allocator));
Ort::TypeInfo output_type_info = ort_session->GetOutputTypeInfo(i);
auto output_tensor_info = output_type_info.GetTensorTypeAndShapeInfo();
auto output_dims = output_tensor_info.GetShape();
output_node_dims.push_back(output_dims);
/*for (int j = 0; j < output_dims.size(); j++)
{
cout << output_dims[j] << ",";
}
cout << endl;*/
}
this->inpHeight = input_node_dims[0][2];
this->inpWidth = input_node_dims[0][3];
this->num_outs = int(numOutputNodes * 0.5);
this->reg_max = output_node_dims[this->num_outs][output_node_dims[this->num_outs].size() - 1] / 4 - 1;
for (int i = 0; i < this->num_outs; i++)
{
stride.push_back(int(8 * pow(2, i)));
}
}
cv::Mat PicoDet::resize_image(cv::Mat srcimg, int* newh, int* neww, int* top, int* left)
{
int srch = srcimg.rows, srcw = srcimg.cols;
*newh = this->inpHeight;
*neww = this->inpWidth;
cv::Mat dstimg;
if (this->keep_ratio && srch != srcw) {
float hw_scale = (float)srch / srcw;
if (hw_scale > 1) {
*newh = this->inpHeight;
*neww = int(this->inpWidth / hw_scale);
cv::resize(srcimg, dstimg, cv::Size(*neww, *newh), 0, 0, cv::INTER_AREA);
*left = int((this->inpWidth - *neww) * 0.5);
copyMakeBorder(dstimg, dstimg, 0, 0, *left, this->inpWidth - *neww - *left, cv::BORDER_CONSTANT, 0);
}
else {
*newh = (int)this->inpHeight * hw_scale;
*neww = this->inpWidth;
cv::resize(srcimg, dstimg, cv::Size(*neww, *newh), 0, 0, cv::INTER_AREA);
*top = (int)(this->inpHeight - *newh) * 0.5;
copyMakeBorder(dstimg, dstimg, *top, this->inpHeight - *newh - *top, 0, 0, cv::BORDER_CONSTANT, 0);
}
}
else {
cv::resize(srcimg, dstimg, cv::Size(*neww, *newh), 0, 0, cv::INTER_AREA);
}
return dstimg;
}
void PicoDet::normalize_(cv::Mat img)
{
// img.convertTo(img, CV_32F);
int row = img.rows;
int col = img.cols;
this->input_image_.resize(row * col * img.channels());
for (int c = 0; c < 3; c++)
{
for (int i = 0; i < row; i++)
{
for (int j = 0; j < col; j++)
{
float pix = img.ptr<uchar>(i)[j * 3 + c];
this->input_image_[c * row * col + i * col + j] = (pix / 255.0 - mean[c] / 255.0) / (stds[c] / 255.0);
//this->input_image_[c * row * col + i * col + j] = (pix - mean[c]) / stds[c];
}
}
}
}
/*
void PicoDet::softmax_(const float* x, float* y, int length)
{
float sum = 0;
int i = 0;
for (i = 0; i < length; i++)
{
y[i] = exp(x[i]);
sum += y[i];
}
for (i = 0; i < length; i++)
{
y[i] /= sum;
}
}
*/
void PicoDet::generate_proposal(std::vector<BoxInfo>& generate_boxes, const int stride_, const float* out_score, const float* out_box)
{
const int num_grid_y = (int)ceil((float)this->inpHeight / stride_);
const int num_grid_x = (int)ceil((float)this->inpWidth / stride_);
std::cout << "num_grid_x=" << num_grid_x << ",num_grid_y=" << num_grid_y << std::endl;
const int reg_1max = reg_max + 1;
//std::cout << "score:" << std::endl;
for (int i = 0; i < num_grid_y; i++)
{
for (int j = 0; j < num_grid_x; j++)
{
int max_ind = 0;
float max_score = 0;
for (int k = 0; k < num_class; k++)
{
/* this indexing is for the original (un-pruned) model output */
//float score = out_score[i * num_grid_x * num_class + j * num_class + k];
/* the line below is the pruned-model version (reshape and transpose removed) implemented with plain C indexing; use one of the two lines — index (i,j,k) maps to (k,i,j) */
float score = std::sqrt(out_score[k*num_grid_y*num_grid_x+i*num_grid_x+j]);
//std::cout <<score << " ";
if (score > max_score)
{
max_score = score;
max_ind = k;
}
}
if (max_score >= score_threshold)
{
std::cout << "box:" << std::endl;
//const float* pbox = out_box + idx * reg_1max * 4;
float dis_pred[4];
float* y = new float[reg_1max];
for (int k = 0; k < 4; k++)
{
/* original model */
//const float* tmp = out_box + i * num_grid_x * reg_1max * 4 + j * reg_1max * 4 + k * reg_1max;
//std::cout << "r:" << *tmp << std::endl;
/* using the version without reshape/transpose */
float* tmp = new float[reg_1max];
for (int m = 0; m < reg_1max; m++)
{
tmp[m] = out_box[k * num_grid_y * num_grid_x * reg_1max + i * num_grid_x + j + m * num_grid_y * num_grid_x];
}
//std::cout << "r:" << *tmp << std::endl;
//softmax_(tmp, y, reg_1max);
activation_function_softmax(tmp, y, reg_1max);
delete[] tmp; // release the per-side buffer
float dis = 0.f;
for (int l = 0; l < reg_1max; l++)
{
dis += l * y[l];
}
dis_pred[k] = dis * stride_;
}
delete[] y;
float pb_cx = (j + 0.5f) * stride_ - 0.5;
float pb_cy = (i + 0.5f) * stride_ - 0.5;
float x0 = pb_cx - dis_pred[0];
float y0 = pb_cy - dis_pred[1];
float x1 = pb_cx + dis_pred[2];
float y1 = pb_cy + dis_pred[3];
generate_boxes.push_back(BoxInfo{ x0, y0, x1, y1, max_score, max_ind });
}
}
}
}
void PicoDet::nms(std::vector<BoxInfo>& input_boxes)
{
sort(input_boxes.begin(), input_boxes.end(), [](BoxInfo a, BoxInfo b) { return a.score > b.score; });
std::vector<float> vArea(input_boxes.size());
for (int i = 0; i < int(input_boxes.size()); ++i)
{
vArea[i] = (input_boxes.at(i).x2 - input_boxes.at(i).x1 + 1)
* (input_boxes.at(i).y2 - input_boxes.at(i).y1 + 1);
}
std::vector<bool> isSuppressed(input_boxes.size(), false);
for (int i = 0; i < int(input_boxes.size()); ++i)
{
if (isSuppressed[i]) { continue; }
for (int j = i + 1; j < int(input_boxes.size()); ++j)
{
if (isSuppressed[j]) { continue; }
float xx1 = (std::max)(input_boxes[i].x1, input_boxes[j].x1);
float yy1 = (std::max)(input_boxes[i].y1, input_boxes[j].y1);
float xx2 = (std::min)(input_boxes[i].x2, input_boxes[j].x2);
float yy2 = (std::min)(input_boxes[i].y2, input_boxes[j].y2);
float w = (std::max)(float(0), xx2 - xx1 + 1);
float h = (std::max)(float(0), yy2 - yy1 + 1);
float inter = w * h;
float ovr = inter / (vArea[i] + vArea[j] - inter);
if (ovr >= this->nms_threshold)
{
isSuppressed[j] = true;
}
}
}
// return post_nms;
int idx_t = 0;
input_boxes.erase(remove_if(input_boxes.begin(), input_boxes.end(), [&idx_t, &isSuppressed](const BoxInfo& f) { return isSuppressed[idx_t++]; }), input_boxes.end());
}
void PicoDet::detect(cv::Mat& srcimg)
{
int newh = 0, neww = 0, top = 0, left = 0;
cv::Mat cv_image = srcimg.clone();
cv::Mat dst = this->resize_image(cv_image, &newh, &neww, &top, &left);
this->normalize_(dst);
std::array<int64_t, 4> input_shape_{ 1, 3, this->inpHeight, this->inpWidth };
auto allocator_info = Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU);
Ort::Value input_tensor_ = Ort::Value::CreateTensor<float>(allocator_info, input_image_.data(), input_image_.size(), input_shape_.data(), input_shape_.size());
std::vector<Ort::Value> ort_outputs = ort_session->Run(Ort::RunOptions{ nullptr }, &input_names[0], &input_tensor_, 1, output_names.data(), output_names.size()); // run inference
// generate proposals
std::vector<BoxInfo> generate_boxes;
for (int i = 0; i < this->num_outs; i++)
{
//auto cls_shape = this->output_node_dims[i];
const float* cls_score = ort_outputs[i].GetTensorMutableData<float>();
//std::vector<int64_t> new_cls_shape = { cls_shape[0],cls_shape[1],cls_shape[2] * cls_shape[3] };
const float* bbox_pred = ort_outputs[i + this->num_outs].GetTensorMutableData<float>();
//auto reg_shape = this->output_node_dims[i+this->num_outs];
generate_proposal(generate_boxes, stride[i], cls_score, bbox_pred);
}
// Perform non-maximum suppression to eliminate redundant overlapping boxes with lower confidences
nms(generate_boxes);
float ratioh = (float)cv_image.rows / newh;
float ratiow = (float)cv_image.cols / neww;
for (size_t i = 0; i < generate_boxes.size(); ++i)
{
int xmin = (int)std::max((generate_boxes[i].x1 - left) * ratiow, 0.f);
int ymin = (int)std::max((generate_boxes[i].y1 - top) * ratioh, 0.f);
int xmax = (int)std::min((generate_boxes[i].x2 - left) * ratiow, (float)cv_image.cols);
int ymax = (int)std::min((generate_boxes[i].y2 - top) * ratioh, (float)cv_image.rows);
rectangle(srcimg, cv::Point(xmin, ymin), cv::Point(xmax, ymax), cv::Scalar(0, 0, 255), 2);
std::string label = cv::format("%.2f", generate_boxes[i].score);
label = this->class_names[generate_boxes[i].label] + ":" + label;
putText(srcimg, label, cv::Point(xmin, ymin - 5), cv::FONT_HERSHEY_SIMPLEX, 0.75, cv::Scalar(0, 255, 0), 1);
}
}
int main()
{
//PicoDet mynet("C:/Users/tl/Desktop/demo_ncnn/ncnn_our/picodet_xs_320_voc_256_20230405_shape.onnx", "C:/Users/tl/Desktop/demo_ncnn/ncnn_our/ball.names", 0.5, 0.5); /// choice = ["picodet_m_320_coco.onnx", "picodet_m_416_coco.onnx", "picodet_s_320_coco.onnx", "picodet_s_416_coco.onnx"]
PicoDet mynet("C:/Users/tl/Desktop/demo_ncnn/ncnn_our/picodet_xs_320_voc_256_20230405_shape_sim_prune.onnx", "C:/Users/tl/Desktop/demo_ncnn/ncnn_our/ball.names", 0.5, 0.5);
std::string imgpath = "C:/Users/tl/Desktop/demo_ncnn/ncnn_our/test.jpg";
cv::Mat srcimg = cv::imread(imgpath);
mynet.detect(srcimg);
cv::imwrite("C:/Users/tl/Desktop/demo_ncnn/ncnn_our/test_result.jpg", srcimg);
static const std::string kWinName = "Deep learning object detection in ONNXRuntime";
cv::namedWindow(kWinName, cv::WINDOW_NORMAL);
cv::imshow(kWinName, srcimg);
cv::waitKey(0);
cv::destroyAllWindows();
}
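One thing worth checking after swapping in fast_exp is how much accuracy the bit-trick approximation gives up. The small standalone sketch below (not part of the project code; it only reuses the same constants as PicoDet::fast_exp above) measures the maximum relative error against std::exp over the range that matters after max-subtraction:

#include <cstdio>
#include <cmath>
#include <cstdint>

// same bit-trick approximation as PicoDet::fast_exp above
static inline float fast_exp(float x) {
    union { uint32_t i; float f; } v{};
    v.i = (uint32_t)((1 << 23) * (1.4426950409f * x + 126.93490512f));
    return v.f;
}

int main() {
    float max_rel_err = 0.f;
    // the softmax inputs are max-subtracted, so the relevant range is roughly [-20, 0]
    for (float x = -20.f; x <= 0.f; x += 0.01f) {
        float a = fast_exp(x), b = std::exp(x);
        float rel = std::fabs(a - b) / b;
        if (rel > max_rel_err) max_rel_err = rel;
    }
    printf("max relative error of fast_exp on [-20, 0]: %f\n", max_rel_err);
    return 0;
}

If the reported error is acceptable for your model (typically on the order of a few percent), the approximation is safe for this post-processing step; otherwise fall back to the plain exp-based softmax shown earlier.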