wki*_*ing 0 c++ optimization opencv cpu-usage
所有
我正在编写一个图像处理程序,在一帧中使用cv :: resize(INTER_LINEAR)数百次。但是,我发现cv :: resize()是一个cpu杀手,它是我程序中的热点。是否有更好的方法来以更少的CPU使用率来调整图像大小?
代码有点像这样:
// Produces COUNTS resized copies of `frame`, one per loop iteration.
// The target size of each copy comes from CalculateHeight()/CalculateWidth().
// No interpolation flag is passed, so cv::resize uses its default.
void process(const cv::Mat& frame) {
    for (int i = 0; i < COUNTS; ++i) {
        const int new_rows = CalculateHeight();
        const int new_cols = CalculateWidth();
        cv::Mat new_img;
        // cv::Size takes (width, height), i.e. (cols, rows).
        // The destination must be the Mat declared above; the original
        // snippet misspelled it as `new_mg`, which is not declared anywhere.
        cv::resize(frame, new_img, cv::Size(new_cols, new_rows));
        // ...
    }
    // ...
}
Run Code Online (Sandbox Code Playgroud)
谢谢!
这是我使用OpenCV函数将一幅随机图像调整大小10,000次所做的一些测试的结果。最好的解决方案似乎是:在调整大小之前先转换为灰度图(如果可能)、使用ROI,或者自己编写ASM AVX函数,只取每第3个(或按您所需的缩放比例)行和列。resize函数本身已经相当优化了。
Colour
INTER_LINEAR 7953.89ms
INTER_LINEAR GPU 2252.72ms
INTER_LINEAR GPU MEMIO 23303.7ms
INTER_NEAREST 7297.58ms
INTER_NEAREST GPU 906.336ms
INTER_NEAREST GPU MEMIO 22374.1ms
BORDER_DEFAULT 47488.8ms
BORDER_REFLECT 47515.4ms
BORDER_REPLICATE 47516ms
BORDER_WRAP 47980.7ms
PYR GPU 4126.93ms
Grayscale
INTER_LINEAR 413.789ms
INTER_LINEAR GPU 1027.85ms
INTER_LINEAR GPU MEMIO 9568.99ms
INTER_NEAREST 978.89ms
INTER_NEAREST GPU 747.621ms
INTER_NEAREST GPU MEMIO 9346.28ms
BORDER_DEFAULT 19266.7ms
BORDER_REFLECT 19274.1ms
BORDER_REPLICATE 19300.8ms
BORDER_WRAP 19386.3ms
PYR GPU 2272.7ms
#include "opencv2/opencv.hpp"
#include "opencv2/cudaimgproc.hpp"
#include "opencv2/cudawarping.hpp"
#include <iostream>
#include <string>
#include <chrono>
using namespace std;
using namespace cv;
/// Times `n` CPU half-size resizes of `frame`.
///
/// @param frame       Source image; it is only read.
/// @param n           Number of resize() calls to time.
/// @param resizeFlag  Interpolation argument, forwarded unchanged to resize().
/// @return Elapsed steady_clock time for the whole loop, in milliseconds.
template <typename T>
double resizePerfEval(const Mat& frame, unsigned int n, T resizeFlag) {
    const auto start = chrono::steady_clock::now();
    // The counter has the same type as `n`: the previous `auto i = 0` deduced
    // int, which mixed signed and unsigned in the comparison and would
    // overflow for n > INT_MAX.
    for (unsigned int i = 0; i < n; ++i) {
        // A fresh destination is constructed on every pass, so each call
        // starts from an empty Mat.
        Mat temp;
        // Empty Size() plus fx = fy = 0.5: the output size is derived from
        // the scale factors.
        resize(frame, temp, Size(), 0.5, 0.5, resizeFlag);
    }
    return chrono::duration<double, milli>(chrono::steady_clock::now() - start).count();
}
template <typename T>
double pyramidPerfEval(const Mat& frame, unsigned int n, T border) {
auto start = chrono::steady_clock::now();
Size s(frame.cols / 2, frame.rows / 2);
for (auto i = 0; i < n; i++) {
Mat tmp;
pyrDown(frame, tmp, s, border);
}
return chrono::duration <double, milli>(chrono::steady_clock::now() - start).count();
}
/// Times `n` cuda::resize() calls that halve `frame` in both dimensions.
///
/// The timed region starts before the first upload, so it includes that
/// initial host-to-device copy.
///
/// @param frame           Host-side source image; it is only read.
/// @param n               Number of cuda::resize() calls to time.
/// @param resizeFlag      Interpolation argument, forwarded unchanged.
/// @param uploadDownload  When true, every iteration also downloads the
///                        result to a host Mat and uploads `frame` again.
/// @return Elapsed steady_clock time in milliseconds.
template <typename T>
double resizePerfEvalGPU(const Mat& frame, unsigned int n, T resizeFlag, bool uploadDownload=false) {
    const auto start = chrono::steady_clock::now();
    Mat tmp;                      // host destination, used only when uploadDownload is set
    cuda::GpuMat frame_d, temp;   // device-side source and result
    frame_d.upload(frame);
    // The counter has the same type as `n`: the previous `auto i = 0` deduced
    // int, which mixed signed and unsigned in the comparison and would
    // overflow for n > INT_MAX.
    for (unsigned int i = 0; i < n; ++i) {
        cuda::resize(frame_d, temp, Size(), 0.5, 0.5, resizeFlag);
        if (uploadDownload) {
            // Add a device-to-host and a host-to-device copy to every pass.
            temp.download(tmp);
            frame_d.upload(frame);
        }
    }
    return chrono::duration<double, milli>(chrono::steady_clock::now() - start).count();
}
/// Times `n` cuda::pyrDown() calls on `frame`.
///
/// The timed region starts before the first upload, so it includes that
/// initial host-to-device copy.
///
/// @param frame           Host-side source image; it is only read.
/// @param n               Number of cuda::pyrDown() calls to time.
/// @param uploadDownload  When true, every iteration also downloads the
///                        result to a host Mat and uploads `frame` again.
/// @return Elapsed steady_clock time in milliseconds.
double pyramidPerfEvalGPU(const Mat& frame, unsigned int n, bool uploadDownload = false) {
    const auto start = chrono::steady_clock::now();
    Mat tmp;                      // host destination, used only when uploadDownload is set
    cuda::GpuMat frame_d, temp;   // device-side source and result
    frame_d.upload(frame);
    // The counter has the same type as `n`: the previous `auto i = 0` deduced
    // int, which mixed signed and unsigned in the comparison and would
    // overflow for n > INT_MAX.
    for (unsigned int i = 0; i < n; ++i) {
        cuda::pyrDown(frame_d, temp);
        if (uploadDownload) {
            // Add a device-to-host and a host-to-device copy to every pass.
            temp.download(tmp);
            frame_d.upload(frame);
        }
    }
    return chrono::duration<double, milli>(chrono::steady_clock::now() - start).count();
}
/// Runs every benchmark variant on `frame` with `n` iterations each and prints
/// one "<label> <milliseconds>ms" line per variant to standard output.
void runTest(const Mat& frame, unsigned int n) {
    // Completes a result line: elapsed time, unit suffix, then newline + flush.
    const auto finishLine = [](double elapsedMs) { cout << elapsedMs << "ms" << endl; };

    // resize(): linear interpolation on CPU, GPU, and GPU with per-iteration transfers.
    cout << "INTER_LINEAR ";
    finishLine(resizePerfEval(frame, n, INTER_LINEAR));
    cout << "INTER_LINEAR GPU ";
    finishLine(resizePerfEvalGPU(frame, n, INTER_LINEAR));
    cout << "INTER_LINEAR GPU MEMIO ";
    finishLine(resizePerfEvalGPU(frame, n, INTER_LINEAR, true));

    // resize(): nearest-neighbour interpolation, same three variants.
    cout << "INTER_NEAREST ";
    finishLine(resizePerfEval(frame, n, INTER_NEAREST));
    cout << "INTER_NEAREST GPU ";
    finishLine(resizePerfEvalGPU(frame, n, INTER_NEAREST));
    cout << "INTER_NEAREST GPU MEMIO ";
    finishLine(resizePerfEvalGPU(frame, n, INTER_NEAREST, true));

    // pyrDown() on the CPU, once per border mode.
    cout << "BORDER_DEFAULT ";
    finishLine(pyramidPerfEval(frame, n, BORDER_DEFAULT));
    cout << "BORDER_REFLECT ";
    finishLine(pyramidPerfEval(frame, n, BORDER_REFLECT));
    cout << "BORDER_REPLICATE ";
    finishLine(pyramidPerfEval(frame, n, BORDER_REPLICATE));
    cout << "BORDER_WRAP ";
    finishLine(pyramidPerfEval(frame, n, BORDER_WRAP));

    // pyrDown() on the GPU.
    cout << "PYR GPU ";
    finishLine(pyramidPerfEvalGPU(frame, n));
}
int main(int argc, char* argv[])
{
Mat gsframe, frame = Mat::ones(Size(1920, 1080), CV_8UC3);
randu(frame, Scalar::all(0), Scalar::all(255));
cvtColor(frame, gsframe, CV_BGR2GRAY);
auto n = 10000;
cout << "Colour" << endl;
runTest(frame, n);
cout << endl << "Grayscale" << endl;
runTest(gsframe, n);
return 0;
}
Run Code Online (Sandbox Code Playgroud)
如果算法在PC上运行,则另一种方法是在启用CUDA的GPU上调整大小。您在选择卡时必须小心,因为您需要足够的内存带宽以适应从GPU内存上载和下载图像所花费的时间。
从结果可以看出,对于灰度图像,以及图像尚不在GPU内存中(需要上传和下载)的情况,CPU都胜过GPU。如果图像已经在GPU内存中,那么对于彩色图像,使用GPU可以快约3.5倍(对于非常大的图像尺寸尤其如此)。对于高端应用,可以使用支持GPUDirect的NVIDIA采集卡来实现这一点。
基准测试是在 Xeon E5 v2 @ 3.0GHz 和 GTX 680 显卡上进行的
归档时间: |
|
查看次数: |
4310 次 |
最近记录: |