ran*_*ser 9 c++ performance iterator eigen
我正在创建一些函数来做一些事情,比如负数和正数的分离和,kahan,成对和其他东西,其中我从矩阵中获取元素的顺序无关紧要,例如:
template <typename T, int R, int C>
inline T sum(const Eigen::Matrix<T,R,C>& xs)
{
T sumP(0);
T sumN(0);
for (size_t i = 0, nRows = xs.rows(), nCols = xs.cols(); i < nRows; ++i)
for (size_t j = 0; j < nCols; ++j)
{
if (xs(i,j)>0)
sumP += xs(i,j);
else if (xs(i,j)<0) //ignore 0 elements: improvement for sparse matrices I think
sumN += xs(i,j);
}
return sumP+sumN;
}
Run Code Online (Sandbox Code Playgroud)
现在,我想尽可能提高效率,所以我的问题是,如上所述循环遍历每一行的每一列会更好,或者像下面这样做相反:
for (size_t i = 0, nRows = xs.rows(), nCols = xs.cols(); i < nCols; ++i)
for (size_t j = 0; j < nRows; ++j)
Run Code Online (Sandbox Code Playgroud)
(我想这取决于矩阵元素在内存中的分配顺序,但我在Eigen的手册中找不到).
还有,还有其他替代方法,比如使用迭代器(它们存在于Eigen中吗?)可能会稍快一点?
ran*_*ser 17
我做了一些基准测试来检查哪种方式更快,我得到了以下结果(以秒为单位):
12
30
3
6
23
3
Run Code Online (Sandbox Code Playgroud)
第一行是按照@jleahy的建议进行迭代.第二行是迭代,就像我在问题中的代码中所做的那样(@jleahy的逆序).第三行是做迭代使用PlainObjectBase::data()这样for (int i = 0; i < matrixObject.size(); i++).其他3行重复与上面相同,但有一个临时的@ lucas92建议
我也做了相同的测试,但使用替换/ if else.*/for/else /(对稀疏矩阵没有特殊处理),我得到以下(在几秒钟内):
10
27
3
6
24
2
Run Code Online (Sandbox Code Playgroud)
再次进行测试给我的结果非常相似.我用g++ 4.7.3用-O3.代码:
#include <ctime>
#include <iostream>
#include <Eigen/Dense>
using namespace std;
template <typename T, int R, int C>
inline T sum_kahan1(const Eigen::Matrix<T,R,C>& xs) {
if (xs.size() == 0) return 0;
T sumP(0);
T sumN(0);
T tP(0);
T tN(0);
T cP(0);
T cN(0);
T yP(0);
T yN(0);
for (size_t i = 0, nRows = xs.rows(), nCols = xs.cols(); i < nCols; ++i)
for (size_t j = 0; j < nRows; ++j)
{
if (xs(j,i)>0)
{
yP = xs(j,i) - cP;
tP = sumP + yP;
cP = (tP - sumP) - yP;
sumP = tP;
}
else if (xs(j,i)<0)
{
yN = xs(j,i) - cN;
tN = sumN + yN;
cN = (tN - sumN) - yN;
sumN = tN;
}
}
return sumP+sumN;
}
template <typename T, int R, int C>
inline T sum_kahan2(const Eigen::Matrix<T,R,C>& xs) {
if (xs.size() == 0) return 0;
T sumP(0);
T sumN(0);
T tP(0);
T tN(0);
T cP(0);
T cN(0);
T yP(0);
T yN(0);
for (size_t i = 0, nRows = xs.rows(), nCols = xs.cols(); i < nRows; ++i)
for (size_t j = 0; j < nCols; ++j)
{
if (xs(i,j)>0)
{
yP = xs(i,j) - cP;
tP = sumP + yP;
cP = (tP - sumP) - yP;
sumP = tP;
}
else if (xs(i,j)<0)
{
yN = xs(i,j) - cN;
tN = sumN + yN;
cN = (tN - sumN) - yN;
sumN = tN;
}
}
return sumP+sumN;
}
template <typename T, int R, int C>
inline T sum_kahan3(const Eigen::Matrix<T,R,C>& xs) {
if (xs.size() == 0) return 0;
T sumP(0);
T sumN(0);
T tP(0);
T tN(0);
T cP(0);
T cN(0);
T yP(0);
T yN(0);
for (size_t i = 0, size = xs.size(); i < size; i++)
{
if ((*(xs.data() + i))>0)
{
yP = (*(xs.data() + i)) - cP;
tP = sumP + yP;
cP = (tP - sumP) - yP;
sumP = tP;
}
else if ((*(xs.data() + i))<0)
{
yN = (*(xs.data() + i)) - cN;
tN = sumN + yN;
cN = (tN - sumN) - yN;
sumN = tN;
}
}
return sumP+sumN;
}
template <typename T, int R, int C>
inline T sum_kahan1t(const Eigen::Matrix<T,R,C>& xs) {
if (xs.size() == 0) return 0;
T sumP(0);
T sumN(0);
T tP(0);
T tN(0);
T cP(0);
T cN(0);
T yP(0);
T yN(0);
for (size_t i = 0, nRows = xs.rows(), nCols = xs.cols(); i < nCols; ++i)
for (size_t j = 0; j < nRows; ++j)
{
T temporary = xs(j,i);
if (temporary>0)
{
yP = temporary - cP;
tP = sumP + yP;
cP = (tP - sumP) - yP;
sumP = tP;
}
else if (temporary<0)
{
yN = temporary - cN;
tN = sumN + yN;
cN = (tN - sumN) - yN;
sumN = tN;
}
}
return sumP+sumN;
}
template <typename T, int R, int C>
inline T sum_kahan2t(const Eigen::Matrix<T,R,C>& xs) {
if (xs.size() == 0) return 0;
T sumP(0);
T sumN(0);
T tP(0);
T tN(0);
T cP(0);
T cN(0);
T yP(0);
T yN(0);
for (size_t i = 0, nRows = xs.rows(), nCols = xs.cols(); i < nRows; ++i)
for (size_t j = 0; j < nCols; ++j)
{
T temporary = xs(i,j);
if (temporary>0)
{
yP = temporary - cP;
tP = sumP + yP;
cP = (tP - sumP) - yP;
sumP = tP;
}
else if (temporary<0)
{
yN = temporary - cN;
tN = sumN + yN;
cN = (tN - sumN) - yN;
sumN = tN;
}
}
return sumP+sumN;
}
template <typename T, int R, int C>
inline T sum_kahan3t(const Eigen::Matrix<T,R,C>& xs) {
if (xs.size() == 0) return 0;
T sumP(0);
T sumN(0);
T tP(0);
T tN(0);
T cP(0);
T cN(0);
T yP(0);
T yN(0);
for (size_t i = 0, size = xs.size(); i < size; i++)
{
T temporary = (*(xs.data() + i));
if (temporary>0)
{
yP = temporary - cP;
tP = sumP + yP;
cP = (tP - sumP) - yP;
sumP = tP;
}
else if (temporary<0)
{
yN = temporary - cN;
tN = sumN + yN;
cN = (tN - sumN) - yN;
sumN = tN;
}
}
return sumP+sumN;
}
template <typename T, int R, int C>
inline T sum_kahan1e(const Eigen::Matrix<T,R,C>& xs) {
if (xs.size() == 0) return 0;
T sumP(0);
T sumN(0);
T tP(0);
T tN(0);
T cP(0);
T cN(0);
T yP(0);
T yN(0);
for (size_t i = 0, nRows = xs.rows(), nCols = xs.cols(); i < nCols; ++i)
for (size_t j = 0; j < nRows; ++j)
{
if (xs(j,i)>0)
{
yP = xs(j,i) - cP;
tP = sumP + yP;
cP = (tP - sumP) - yP;
sumP = tP;
}
else
{
yN = xs(j,i) - cN;
tN = sumN + yN;
cN = (tN - sumN) - yN;
sumN = tN;
}
}
return sumP+sumN;
}
template <typename T, int R, int C>
inline T sum_kahan2e(const Eigen::Matrix<T,R,C>& xs) {
if (xs.size() == 0) return 0;
T sumP(0);
T sumN(0);
T tP(0);
T tN(0);
T cP(0);
T cN(0);
T yP(0);
T yN(0);
for (size_t i = 0, nRows = xs.rows(), nCols = xs.cols(); i < nRows; ++i)
for (size_t j = 0; j < nCols; ++j)
{
if (xs(i,j)>0)
{
yP = xs(i,j) - cP;
tP = sumP + yP;
cP = (tP - sumP) - yP;
sumP = tP;
}
else
{
yN = xs(i,j) - cN;
tN = sumN + yN;
cN = (tN - sumN) - yN;
sumN = tN;
}
}
return sumP+sumN;
}
template <typename T, int R, int C>
inline T sum_kahan3e(const Eigen::Matrix<T,R,C>& xs) {
if (xs.size() == 0) return 0;
T sumP(0);
T sumN(0);
T tP(0);
T tN(0);
T cP(0);
T cN(0);
T yP(0);
T yN(0);
for (size_t i = 0, size = xs.size(); i < size; i++)
{
if ((*(xs.data() + i))>0)
{
yP = (*(xs.data() + i)) - cP;
tP = sumP + yP;
cP = (tP - sumP) - yP;
sumP = tP;
}
else
{
yN = (*(xs.data() + i)) - cN;
tN = sumN + yN;
cN = (tN - sumN) - yN;
sumN = tN;
}
}
return sumP+sumN;
}
template <typename T, int R, int C>
inline T sum_kahan1te(const Eigen::Matrix<T,R,C>& xs) {
if (xs.size() == 0) return 0;
T sumP(0);
T sumN(0);
T tP(0);
T tN(0);
T cP(0);
T cN(0);
T yP(0);
T yN(0);
for (size_t i = 0, nRows = xs.rows(), nCols = xs.cols(); i < nCols; ++i)
for (size_t j = 0; j < nRows; ++j)
{
T temporary = xs(j,i);
if (temporary>0)
{
yP = temporary - cP;
tP = sumP + yP;
cP = (tP - sumP) - yP;
sumP = tP;
}
else
{
yN = temporary - cN;
tN = sumN + yN;
cN = (tN - sumN) - yN;
sumN = tN;
}
}
return sumP+sumN;
}
template <typename T, int R, int C>
inline T sum_kahan2te(const Eigen::Matrix<T,R,C>& xs) {
if (xs.size() == 0) return 0;
T sumP(0);
T sumN(0);
T tP(0);
T tN(0);
T cP(0);
T cN(0);
T yP(0);
T yN(0);
for (size_t i = 0, nRows = xs.rows(), nCols = xs.cols(); i < nRows; ++i)
for (size_t j = 0; j < nCols; ++j)
{
T temporary = xs(i,j);
if (temporary>0)
{
yP = temporary - cP;
tP = sumP + yP;
cP = (tP - sumP) - yP;
sumP = tP;
}
else
{
yN = temporary - cN;
tN = sumN + yN;
cN = (tN - sumN) - yN;
sumN = tN;
}
}
return sumP+sumN;
}
template <typename T, int R, int C>
inline T sum_kahan3te(const Eigen::Matrix<T,R,C>& xs) {
if (xs.size() == 0) return 0;
T sumP(0);
T sumN(0);
T tP(0);
T tN(0);
T cP(0);
T cN(0);
T yP(0);
T yN(0);
for (size_t i = 0, size = xs.size(); i < size; i++)
{
T temporary = (*(xs.data() + i));
if (temporary>0)
{
yP = temporary - cP;
tP = sumP + yP;
cP = (tP - sumP) - yP;
sumP = tP;
}
else
{
yN = temporary - cN;
tN = sumN + yN;
cN = (tN - sumN) - yN;
sumN = tN;
}
}
return sumP+sumN;
}
int main() {
Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic> test = Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic>::Random(10000,10000);
cout << "start" << endl;
int now;
now = time(0);
sum_kahan1(test);
cout << time(0) - now << endl;
now = time(0);
sum_kahan2(test);
cout << time(0) - now << endl;
now = time(0);
sum_kahan3(test);
cout << time(0) - now << endl;
now = time(0);
sum_kahan1t(test);
cout << time(0) - now << endl;
now = time(0);
sum_kahan2t(test);
cout << time(0) - now << endl;
now = time(0);
sum_kahan3t(test);
cout << time(0) - now << endl;
now = time(0);
sum_kahan1e(test);
cout << time(0) - now << endl;
now = time(0);
sum_kahan2e(test);
cout << time(0) - now << endl;
now = time(0);
sum_kahan3e(test);
cout << time(0) - now << endl;
now = time(0);
sum_kahan1te(test);
cout << time(0) - now << endl;
now = time(0);
sum_kahan2te(test);
cout << time(0) - now << endl;
now = time(0);
sum_kahan3te(test);
cout << time(0) - now << endl;
return 0;
}
Run Code Online (Sandbox Code Playgroud)
默认情况下,Eigen以列主(Fortran)顺序分配矩阵(文档).
迭代矩阵的最快方法是按存储顺序执行,以错误的方式执行此操作会增加缓存未命中数(如果矩阵不适合L1,则会占用您的计算时间,因此读取会增加计算时间)通过cacheline/elemsize的因子(可能是64/8 = 8).
如果您的矩阵适合L1缓存,这将没有什么区别,但一个好的编译器应该能够矢量化循环,启用AVX(在一个闪亮的新核心i7上)可以为您提供多达4倍的加速.(256位/ 64位).
最后不要指望任何Eigen的内置函数能给你加速(我不认为有任何迭代器,但我可能会弄错),它们只会给你相同的(非常简单) )代码.
TLDR:交换迭代顺序,您需要最快地改变行索引.