avo*_*ado 11 c++ parallel-processing tbb openmp c++17
由于 C++17 标准库支持并行算法,我原以为它是我们的首选,但在与 tbb 和 openmp 比较之后,我改变了主意:我发现 std 库要慢得多。
通过这个帖子,我想征求专业建议:我是否应该放弃标准库的并行算法,转而使用 tbb 或 openmp?谢谢!
环境:
基准代码:
#include <algorithm>
#include <chrono>
#include <cmath>
#include <execution>
#include <iostream>
#include <numeric>
#include <string>
#include <vector>
#include <tbb/parallel_for.h>
// Number of elements every benchmark variant fills and then sums.
constexpr size_t N = 1'000'000;
double std_for() {
auto values = std::vector<double>(N);
size_t n_par = 5lu;
auto indices = std::vector<size_t>(n_par);
std::iota(indices.begin(), indices.end(), 0lu);
size_t stride = static_cast<size_t>(N / n_par) + 1;
std::for_each(
std::execution::par,
indices.begin(),
indices.end(),
[&](size_t index) {
int begin = index * stride;
int end = (index+1) * stride;
for (int i = begin; i < end; ++i) {
values[i] = 1.0 / (1 + std::exp(-std::sin(i * 0.001)));
}
});
double total = 0;
for (double value : values)
{
total += value;
}
return total;
}
double tbb_for() {
auto values = std::vector<double>(N);
tbb::parallel_for(
tbb::blocked_range<int>(0, values.size()),
[&](tbb::blocked_range<int> r) {
for (int i=r.begin(); i<r.end(); ++i) {
values[i] = 1.0 / (1 + std::exp(-std::sin(i * 0.001)));
}
});
double total = 0;
for (double value : values) {
total += value;
}
return total;
}
/// Fills an N-element vector with an OpenMP "parallel for" loop and returns
/// the sum of all elements.
///
/// @return Sum of 1 / (1 + exp(-sin(i * 0.001))) for i in [0, N).
double omp_for()
{
    auto values = std::vector<double>(N);

    // The bound is hoisted and the index is size_t so the loop condition no
    // longer compares a signed int against an unsigned size. Unsigned loop
    // variables are accepted by OpenMP 3.0 and later.
    const size_t count = values.size();
#pragma omp parallel for
    for (size_t i = 0; i < count; ++i) {
        values[i] = 1.0 / (1 + std::exp(-std::sin(i * 0.001)));
    }

    double total = 0;
    for (double value : values) {
        total += value;
    }
    return total;
}
/// Sequential baseline: fills an N-element vector in a plain loop and returns
/// the sum of all elements.
///
/// @return Sum of 1 / (1 + exp(-sin(i * 0.001))) for i in [0, N).
double seq_for()
{
    auto values = std::vector<double>(N);

    // size_t index: avoids comparing a signed int against values.size().
    for (size_t i = 0; i < values.size(); ++i) {
        values[i] = 1.0 / (1 + std::exp(-std::sin(i * 0.001)));
    }

    double total = 0;
    for (double value : values) {
        total += value;
    }
    return total;
}
/// Runs one benchmark variant once and prints its result and elapsed time.
///
/// @param fn_ptr  Benchmark to run; its return value is printed as "rez".
/// @param fn_name Label printed at the start of the output line.
///
/// The printed duration is in microseconds and covers the whole call to
/// fn_ptr.
void time_it(double(*fn_ptr)(), const std::string& fn_name) {
    // steady_clock is monotonic. high_resolution_clock may be an alias of the
    // adjustable system clock, which can distort a measured interval.
    using clock = std::chrono::steady_clock;

    const auto start = clock::now();
    const auto rez = fn_ptr();
    const auto stop = clock::now();

    const auto duration =
        std::chrono::duration_cast<std::chrono::microseconds>(stop - start).count();
    std::cout << fn_name << ", rez = " << rez << ", dur = " << duration << std::endl;
}
/// Entry point: runs the single benchmark variant named by the first
/// command-line argument (std_for, omp_for, tbb_for or seq_for).
///
/// @return 0 on success, 1 when the argument is missing or not recognised.
int main(int argc, char** argv) {
    // argv[1] is only valid when an argument was actually passed; building a
    // std::string from a null pointer is undefined behaviour.
    if (argc < 2) {
        std::cerr << "usage: " << (argc > 0 ? argv[0] : "a.out")
                  << " <std_for|omp_for|tbb_for|seq_for>\n";
        return 1;
    }

    const std::string op(argv[1]);
    if (op == "std_for") {
        time_it(&std_for, op);
    } else if (op == "omp_for") {
        time_it(&omp_for, op);
    } else if (op == "tbb_for") {
        time_it(&tbb_for, op);
    } else if (op == "seq_for") {
        time_it(&seq_for, op);
    } else {
        // Report a mistyped name instead of exiting silently.
        std::cerr << "unknown benchmark: " << op << '\n';
        return 1;
    }
    return 0;
}
Run Code Online (Sandbox Code Playgroud)
编译选项:
g++ --std=c++17 -O3 b.cpp -ltbb -I /usr/local/include -L /usr/local/lib -fopenmp
Run Code Online (Sandbox Code Playgroud)
结果:
std_for, rez = 500106, dur = 11119
tbb_for, rez = 500106, dur = 7372
omp_for, rez = 500106, dur = 4781
seq_for, rez = 500106, dur = 27910
Run Code Online (Sandbox Code Playgroud)
我们可以看到 std_for 比 seq_for(顺序 for 循环)快,但它仍然比 tbb 和 openmp 慢得多。
正如人们在评论中建议的那样,为了公平起见,我分别单独运行了每个 for 版本。更新上面的代码后,结果如下:
>>> ./a.out seq_for
seq_for, rez = 500106, dur = 29885
>>> ./a.out tbb_for
tbb_for, rez = 500106, dur = 10619
>>> ./a.out omp_for
omp_for, rez = 500106, dur = 10052
>>> ./a.out std_for
std_for, rez = 500106, dur = 12423
Run Code Online (Sandbox Code Playgroud)
正如大家所说,与之前的结果相比可以看出,连续运行 4 个版本是不公平的。
您已经发现,到底要测量什么以及如何测量很重要。您的最终任务肯定与这个简单的练习有很大不同,并且不完全反映此处找到的结果。
除了受执行任务顺序影响的缓存和预热(您在更新的问题中明确研究了这一点)之外,您的示例中还应该考虑另一个问题。
实际的并行代码才是重要的。如果它不能决定您的性能/运行时间,那么并行化就不是正确的解决方案。但在您的示例中,您还测量了资源分配、初始化和最终的求和计算。如果这些才是最终应用程序的实际成本所在,那么并行化并不是灵丹妙药。因此,为了公平比较并真正测量实际并行代码的执行性能,我建议按照下面的思路修改您的代码(抱歉,我没有安装 openmp)并继续您的研究:
#include <algorithm>
#include <cmath>
#include <chrono>
#include <execution>
#include <iostream>
#include <tbb/parallel_for.h>
#include <vector>
// Number of elements every benchmark variant fills.
constexpr size_t N = 10'000'000; // #1
/// Fills `values` with the C++17 parallel std::for_each, one task per chunk.
///
/// @param values  Output buffer; element i receives
///                1 / (1 + exp(-sin(i * 0.001))).
/// @param indices Chunk numbers to process; chunk k covers the index range
///                [k * stride, (k + 1) * stride), clamped to values.size().
/// @param stride  Number of elements per chunk.
void std_for(std::vector<double>& values,
             std::vector<size_t> const& indices,
             size_t const stride) {
    const size_t count = values.size();
    std::for_each(
        std::execution::par,
        indices.begin(),
        indices.end(),
        [&values, count, stride](size_t index) {
            // The chunks may extend beyond the buffer (callers round stride
            // up), so both bounds are clamped to the vector size; otherwise
            // the trailing chunk writes past the end.
            const size_t begin = std::min(index * stride, count);
            const size_t end = begin + std::min(stride, count - begin);
            for (size_t i = begin; i < end; ++i) {
                values[i] = 1.0 / (1 + std::exp(-std::sin(i * 0.001)));
            }
        });
}
void tbb_for(std::vector<double>& values) {
tbb::parallel_for(
tbb::blocked_range<int>(0, values.size()),
[&](tbb::blocked_range<int> r) {
for (int i=r.begin(); i<r.end(); ++i) {
values[i] = 1.0 / (1 + std::exp(-std::sin(i * 0.001)));
}
});
}
/*
double omp_for()
{
auto values = std::vector<double>(N);
#pragma omp parallel for
for (int i=0; i<values.size(); ++i) {
values[i] = 1.0 / (1 + std::exp(-std::sin(i * 0.001)));
}
double total = 0;
for (double value : values) {
total += value;
}
return total;
}
*/
/// Sequential baseline: fills `values` in a plain loop.
///
/// @param values Output buffer; element i receives
///               1 / (1 + exp(-sin(i * 0.001))).
void seq_for(std::vector<double>& values)
{
    // size_t index: avoids comparing a signed int against values.size().
    const size_t count = values.size();
    for (size_t i = 0; i < count; ++i) {
        values[i] = 1.0 / (1 + std::exp(-std::sin(i * 0.001)));
    }
}
/// Times one fill function on a freshly allocated N-element vector and prints
/// the sum of the filled values together with the elapsed time.
///
/// @param fn_ptr  Function that fills the vector; only this call is timed.
/// @param fn_name Label printed at the start of the output line.
///
/// Allocation of the vector and the final summation happen outside the timed
/// region. The printed duration is in microseconds.
void time_it(void(*fn_ptr)(std::vector<double>&), const std::string& fn_name) {
    std::vector<double> values(N);

    // steady_clock is monotonic. high_resolution_clock may be an alias of the
    // adjustable system clock, which can distort a measured interval.
    using clock = std::chrono::steady_clock;

    const auto start = clock::now();
    fn_ptr(values);
    const auto stop = clock::now();
    const auto duration =
        std::chrono::duration_cast<std::chrono::microseconds>(stop - start).count();

    double total = 0;
    for (double value : values) {
        total += value;
    }
    std::cout << fn_name << ", res = " << total << ", dur = " << duration << std::endl;
}
/// Times a chunked fill function on a freshly allocated N-element vector and
/// prints the sum of the filled values together with the elapsed time.
///
/// @param fn_ptr  Function that fills the vector from a list of chunk numbers
///                and a chunk length; only this call is timed.
/// @param fn_name Label printed at the start of the output line.
///
/// The vector, the chunk list and the stride are prepared before the timed
/// region, and the summation runs after it. The printed duration is in
/// microseconds.
void time_it_std(void(*fn_ptr)(std::vector<double>&, std::vector<size_t> const&, size_t const), const std::string& fn_name) {
    std::vector<double> values(N);

    const size_t n_par = 5lu; // #2
    auto indices = std::vector<size_t>(n_par);
    std::iota(indices.begin(), indices.end(), size_t{0});
    // Chosen so that n_par * stride >= N, i.e. the chunks cover every element.
    const size_t stride = N / n_par + 1;

    // steady_clock is monotonic. high_resolution_clock may be an alias of the
    // adjustable system clock, which can distort a measured interval.
    using clock = std::chrono::steady_clock;

    const auto start = clock::now();
    fn_ptr(values, indices, stride);
    const auto stop = clock::now();
    const auto duration =
        std::chrono::duration_cast<std::chrono::microseconds>(stop - start).count();

    double total = 0;
    for (double value : values) {
        total += value;
    }
    std::cout << fn_name << ", res = " << total << ", dur = " << duration << std::endl;
}
/// Entry point: runs the single benchmark variant named by the first
/// command-line argument (std_for, tbb_for or seq_for).
///
/// @return 0 on success, 1 when the argument is missing or not recognised.
int main(int argc, char** argv) {
    // argv[1] is only valid when an argument was actually passed; building a
    // std::string from a null pointer is undefined behaviour.
    if (argc < 2) {
        std::cerr << "usage: " << (argc > 0 ? argv[0] : "a.out")
                  << " <std_for|tbb_for|seq_for>\n";
        return 1;
    }

    const std::string op(argv[1]);
    if (op == "std_for") {
        time_it_std(&std_for, op);
    // The omp_for variant is disabled in this version:
    // } else if (op == "omp_for") {
    //     time_it(&omp_for, op);
    } else if (op == "tbb_for") {
        time_it(&tbb_for, op);
    } else if (op == "seq_for") {
        time_it(&seq_for, op);
    } else {
        // Report a mistyped name instead of exiting silently.
        std::cerr << "unknown benchmark: " << op << '\n';
        return 1;
    }
    return 0;
}
Run Code Online (Sandbox Code Playgroud)
在我的(慢速)系统上,这会导致:
我这里注意到,从 seq_for 到 tbb_for 的差异进一步增大。现在它是 ~4x,而在您的示例中它看起来更像 ~3x。std_for 仍然比 tbb_for 慢 20..30% 左右。
然而,还有更多参数。将 N(参见#1)增加 10 倍(好吧,这不是很重要)并将 n_par(参见#2)从 5 增加到 100(这很重要)后,结果为
这里 std_for 与 tbb_for 是同等的!
因此,回答你的问题:我显然不会立即放弃 c++17 std 并行化。