why is valarray so slow?

2019-03-08 15:56发布

Sorry to ask about valarray again. I am trying to use it because it works much like MATLAB for vector and matrix operations. I first ran some performance checks and found that valarray does not achieve the performance claimed in Stroustrup's book The C++ Programming Language.

The test program performs 5M multiplications of doubles. I expected c=a*b to be at least comparable to an element-by-element for loop over doubles, but I was completely wrong. I tried this on several computers, with both VC6.0 and VS2008.

By the way, I tested on matlab using the following code:

len=5*1024*1024;
a=rand(len,1);b=rand(len,1);c=zeros(len,1);
tic;c=a.*b;toc;

and the result is 46 ms. This timing is not high precision; it only serves as a reference.

The code is:

#include <iostream>
#include <valarray>
#include <iostream>
#include "windows.h"

using namespace std ;
SYSTEMTIME stime;
LARGE_INTEGER sys_freq;

double gettime_hp();

int main()
{
    enum { N = 5*1024*1024 };
    valarray<double> a(N), b(N), c(N) ;
    QueryPerformanceFrequency(&sys_freq);   
    int i,j;
    for(  j=0 ; j<8 ; ++j )
    {
        for(  i=0 ; i<N ; ++i ) 
        {
            a[i]=rand();
            b[i]=rand();
        }

        double* a1 = &a[0], *b1 = &b[0], *c1 = &c[0] ;
        double dtime=gettime_hp();
        for(  i=0 ; i<N ; ++i ) c1[i] = a1[i] * b1[i] ;
        dtime=gettime_hp()-dtime;
        cout << "double operator* " << dtime << " ms\n" ;

        dtime=gettime_hp();
        c = a*b ;
        dtime=gettime_hp()-dtime;
        cout << "valarray operator* " << dtime << " ms\n" ;

        dtime=gettime_hp();
        for(  i=0 ; i<N ; ++i ) c[i] = a[i] * b[i] ;
        dtime=gettime_hp()-dtime;
        cout << "valarray[i] operator* " << dtime<< " ms\n" ;

        cout << "------------------------------------------------------\n" ;
    }
}

double gettime_hp()
{
    LARGE_INTEGER tick;
    extern LARGE_INTEGER sys_freq;
    QueryPerformanceCounter(&tick);
    return (double)tick.QuadPart*1000.0/sys_freq.QuadPart;
}

The running results: (release mode with maximal speed optimization)

double operator* 52.3019 ms
valarray operator* 128.338 ms
valarray[i] operator* 43.1801 ms
------------------------------------------------------
double operator* 43.4036 ms
valarray operator* 145.533 ms
valarray[i] operator* 44.9121 ms
------------------------------------------------------
double operator* 43.2619 ms
valarray operator* 158.681 ms
valarray[i] operator* 43.4871 ms
------------------------------------------------------
double operator* 42.7317 ms
valarray operator* 173.164 ms
valarray[i] operator* 80.1004 ms
------------------------------------------------------
double operator* 43.2236 ms
valarray operator* 158.004 ms
valarray[i] operator* 44.3813 ms
------------------------------------------------------

Debug mode with the same optimization:

double operator* 41.8123 ms
valarray operator* 201.484 ms
valarray[i] operator* 41.5452 ms
------------------------------------------------------
double operator* 40.2238 ms
valarray operator* 215.351 ms
valarray[i] operator* 40.2076 ms
------------------------------------------------------
double operator* 40.5859 ms
valarray operator* 232.007 ms
valarray[i] operator* 40.8803 ms
------------------------------------------------------
double operator* 40.9734 ms
valarray operator* 234.325 ms
valarray[i] operator* 40.9711 ms
------------------------------------------------------
double operator* 41.1977 ms
valarray operator* 234.409 ms
valarray[i] operator* 41.1429 ms
------------------------------------------------------
double operator* 39.7754 ms
valarray operator* 234.26 ms
valarray[i] operator* 39.6338 ms
------------------------------------------------------

标签: c++ valarray
7条回答
beautiful°
2楼-- · 2019-03-08 16:28

I finally solved this using delayed evaluation. The code may be ugly, since I am only just starting to learn these advanced C++ concepts. Please correct me if you have a better idea. Thanks a lot for all your assistance. Here is the code:

#include <iostream>
#include <valarray>
#include <iostream>
#include "windows.h"

using namespace std ;
SYSTEMTIME stime;
LARGE_INTEGER sys_freq;

double gettime_hp();
// Goal: speed up c = a*b. That expression first generates a temporary,
// assigns it to c, and then deletes the temporary, which makes the program
// really slow.
// The solution is an expression template: operator* only records its operands
// and the work is decided once the whole expression is known
// (delayed evaluation).
//typedef valarray<double> Vector;
class Vector;

// Unevaluated element-wise product of two Vectors.
//
// Holds only references to its operands, so it must not outlive either of
// them. The multiplication itself is carried out when the object is assigned
// to a Vector (Vector::operator=(const VecMul&)) or converted to one.
class VecMul
{
public:
    const Vector& va;   // left operand
    const Vector& vb;   // right operand
    VecMul(const Vector& v1,const Vector& v2):va(v1),vb(v2){}
    // Evaluates the product into a new Vector; defined after Vector below,
    // because Vector is still an incomplete type at this point.
    operator Vector();
};

// Array of doubles whose elements live in a heap-allocated valarray (*p).
//
// The inherited std::valarray<double> base sub-object is kept for interface
// compatibility, but no member declared here stores elements in it;
// operator[] and size() below hide the base versions and act on *p.
//
// The class owns *p: it is released in the destructor, and copies are deep,
// so two Vectors never share storage.
class Vector:public std::valarray<double>
{
    std::valarray<double> *p;   // owned element storage, never null
public:
    // Creates a Vector of n elements (value-initialised by valarray).
    explicit Vector(int n):p(new std::valarray<double>(n))
    {
    }
    // Deep copy: duplicates the elements instead of aliasing other's storage.
    Vector(const Vector& other)
        :std::valarray<double>(other),p(new std::valarray<double>(*other.p))
    {
    }
    // Deep copy assignment. The replacement storage is allocated before the
    // old one is released, so *this is left untouched if allocation throws.
    Vector& operator=(const Vector& other)
    {
        if(this!=&other)
        {
            std::valarray<double> *fresh=new std::valarray<double>(*other.p);
            std::valarray<double>::operator=(other);
            delete p;
            p=fresh;
        }
        return *this;
    }
    ~Vector()
    {
        delete p;
    }
    // Evaluates a deferred product straight into this Vector's storage, with
    // no temporary array. Iterates over m.va.size() elements; this Vector and
    // m.vb are expected to be at least that long (not checked).
    Vector& operator=(const VecMul &m)
    {
        const int count=m.va.size();   // loop-invariant, read once
        std::valarray<double>& out=*p;
        for(int i=0;i<count;i++) out[i]=(m.va)[i]*(m.vb)[i];
        return *this;
    }
    // Element access. Returns a mutable reference even through a const
    // Vector, because the elements live behind the pointer p.
    double& operator[](int i) const {return (*p)[i];}
    // Number of elements held in *p.
    int size()const {return (*p).size();}
};

// Materialises the product as a new Vector sized like the left operand.
inline VecMul::operator Vector()
{
    Vector product(va.size());
    product=*this;   // selects Vector::operator=(const VecMul&)
    return product;
}



// Element-wise product of two Vectors, returned in deferred form: nothing is
// multiplied here. The returned VecMul only records references to v1 and v2.
inline VecMul operator*(const Vector& v1,const Vector& v2)
{
    VecMul deferred(v1,v2);
    return deferred;
}


int main()
{
    enum { N = 5*1024*1024 };
    Vector a(N), b(N), c(N) ;
    QueryPerformanceFrequency(&sys_freq);   
    int i,j;
    for(  j=0 ; j<8 ; ++j )
    {
        for(  i=0 ; i<N ; ++i ) 
        {
            a[i]=rand();
            b[i]=rand();
        }

        double* a1 = &a[0], *b1 = &b[0], *c1 = &c[0] ;
        double dtime=gettime_hp();
        for(  i=0 ; i<N ; ++i ) c1[i] = a1[i] * b1[i] ;
        dtime=gettime_hp()-dtime;
        cout << "double operator* " << dtime << " ms\n" ;

        dtime=gettime_hp();
        c = a*b ;
        dtime=gettime_hp()-dtime;
        cout << "valarray operator* " << dtime << " ms\n" ;

        dtime=gettime_hp();
        for(  i=0 ; i<N ; ++i ) c[i] = a[i] * b[i] ;
        dtime=gettime_hp()-dtime;
        cout << "valarray[i] operator* " << dtime<< " ms\n" ;

        cout << "------------------------------------------------------\n" ;
    }
}

double gettime_hp()
{
    LARGE_INTEGER tick;
    extern LARGE_INTEGER sys_freq;
    QueryPerformanceCounter(&tick);
    return (double)tick.QuadPart*1000.0/sys_freq.QuadPart;
}

The running result on Visual studio is:

double operator* 41.2031 ms
valarray operator* 43.8407 ms
valarray[i] operator* 42.49 ms
查看更多
登录 后发表回答