Is std::vector so much slower than plain arrays?

2018-12-31 15:34发布

I've always thought it's the general wisdom that std::vector is "implemented as an array," blah blah blah. Today I went down and tested it, and it seems to be not so:

Here's some test results:

UseArray completed in 2.619 seconds
UseVector completed in 9.284 seconds
UseVectorPushBack completed in 14.669 seconds
The whole thing completed in 26.591 seconds

That's about 3 - 4 times slower! That doesn't really justify the "vector may be slower for a few nanosecs" comments.

And the code I used:

#include <cstdlib>
#include <vector>

#include <iostream>
#include <string>

#include <boost/date_time/posix_time/ptime.hpp>
#include <boost/date_time/microsec_time_clock.hpp>

class TestTimer
        TestTimer(const std::string & name) : name(name),

            using namespace std;
            using namespace boost;

            posix_time::ptime now(date_time::microsec_clock<posix_time::ptime>::local_time());
            posix_time::time_duration d = now - start;

            cout << name << " completed in " << d.total_milliseconds() / 1000.0 <<
                " seconds" << endl;

        std::string name;
        boost::posix_time::ptime start;

/// One RGB pixel with 8-bit channels.
struct Pixel
{
    /// Default constructor; intentionally leaves the channels uninitialized.
    /// Required so that std::vector<Pixel>::resize() can create elements.
    Pixel()
    {
    }

    /// Builds a pixel from explicit red, green and blue values.
    Pixel(unsigned char r, unsigned char g, unsigned char b) : r(r), g(g), b(b)
    {
    }

    unsigned char r, g, b;
};

void UseVector()
    TestTimer t("UseVector");

    for(int i = 0; i < 1000; ++i)
        int dimension = 999;

        std::vector<Pixel> pixels;
        pixels.resize(dimension * dimension);

        for(int i = 0; i < dimension * dimension; ++i)
            pixels[i].r = 255;
            pixels[i].g = 0;
            pixels[i].b = 0;

void UseVectorPushBack()
    TestTimer t("UseVectorPushBack");

    for(int i = 0; i < 1000; ++i)
        int dimension = 999;

        std::vector<Pixel> pixels;
            pixels.reserve(dimension * dimension);

        for(int i = 0; i < dimension * dimension; ++i)
            pixels.push_back(Pixel(255, 0, 0));

void UseArray()
    TestTimer t("UseArray");

    for(int i = 0; i < 1000; ++i)
        int dimension = 999;

        Pixel * pixels = (Pixel *)malloc(sizeof(Pixel) * dimension * dimension);

        for(int i = 0 ; i < dimension * dimension; ++i)
            pixels[i].r = 255;
            pixels[i].g = 0;
            pixels[i].b = 0;


int main()
    TestTimer t1("The whole thing");


    return 0;

Am I doing it wrong or something? Or have I just busted this performance myth?

I'm using Release mode in Visual Studio 2005.

In Visual C++, #define _SECURE_SCL 0 reduces UseVector by half (bringing it down to 4 seconds). This is really huge, IMO.

2楼-- · 2018-12-31 15:43

Martin York's answer bothers me because it seems like an attempt to brush the initialisation problem under the carpet. But he is right to identify redundant default construction as the source of performance problems.

[EDIT: Martin's answer no longer suggests changing the default constructor.]

For the immediate problem at hand, you could certainly call the 2-parameter version of the vector<Pixel> ctor instead:

std::vector<Pixel> pixels(dimension * dimension, Pixel(255, 0, 0));

That works if you want to initialise with a constant value, which is a common case. But the more general problem is: How can you efficiently initialise with something more complicated than a constant value?

For this you can use a back_insert_iterator, which is an iterator adaptor. Here's an example with a vector of ints, although the general idea works just as well for Pixels:

#include <iterator>
// Simple functor returning successive squares: 1, 4, 9, 16...
// Each call advances an internal counter, so operator() is non-const
// (a const operator() cannot modify `i` and would not compile).
struct squares {
    squares() : i(0) {}
    int operator()() { ++i; return i * i; }

    int i;  // how many values have been produced so far
};


std::vector<int> v;
v.reserve(someSize);     // To make insertions efficient
std::generate_n(std::back_inserter(v), someSize, squares());

Alternatively you could use copy() or transform() instead of generate_n().

The downside is that the logic to construct the initial values needs to be moved into a separate class, which is less convenient than having it in-place (although lambdas in C++1x make this much nicer). Also I expect this will still not be as fast as a malloc()-based non-STL version, but I expect it will be close, since it only does one construction for each element.

3楼-- · 2018-12-31 15:43

By the way, the slowdown you're seeing in classes using vector also occurs with standard types like int. Here's a multithreaded code example:

#include <iostream>
#include <cstdio>
#include <map>
#include <string>
#include <typeinfo>
#include <vector>
#include <pthread.h>
#include <sstream>
#include <fstream>
using namespace std;

//pthread_mutex_t map_mutex=PTHREAD_MUTEX_INITIALIZER;

long long num=500000000;
int procs=1;

// Per-thread work description handed to the thread entry points.
struct iterate
{
    int id;         // starting index for this thread
    int num;        // stride between the indices this thread visits
    void * member;  // type-erased pointer to the data being walked
    iterate(int a, int b, void *c) : id(a), num(b), member(c) {}
};

//fill out viterate and piterate
void * viterate(void * input)
    printf("am in viterate\n");
    iterate * info=static_cast<iterate *> (input);
    // reproduce member type
    vector<int> test= *static_cast<vector<int>*> (info->member);
    for (int i=info->id; i<test.size(); i+=info->num)
        //printf("am in viterate loop\n");

void * piterate(void * input)
    printf("am in piterate\n");
    iterate * info=static_cast<iterate *> (input);;
    int * test=static_cast<int *> (info->member);
    for (int i=info->id; i<num; i+=info->num) {
        //printf("am in piterate loop\n");

int main()
    cout<<"producing vector of size "<<num<<endl;
    vector<int> vtest(num);
    cout<<"produced  a vector of size "<<vtest.size()<<endl;
    pthread_t thread[procs];

    iterate** it=new iterate*[procs];
    int ans;
    void *status;

    cout<<"begining to thread through the vector\n";
    for (int i=0; i<procs; i++) {
        it[i]=new iterate(i, procs, (void *) &vtest);
    //  ans=pthread_create(&thread[i],NULL,viterate, (void *) it[i]);
    for (int i=0; i<procs; i++) {
        pthread_join(thread[i], &status);
    cout<<"end of threading through the vector";
    //reuse the iterate structures

    cout<<"producing a pointer with size "<<num<<endl;
    int * pint=new int[num];
    cout<<"produced a pointer with size "<<num<<endl;

    cout<<"begining to thread through the pointer\n";
    for (int i=0; i<procs; i++) {
        ans=pthread_create(&thread[i], NULL, piterate, (void*) it[i]);
    for (int i=0; i<procs; i++) {
        pthread_join(thread[i], &status);
    cout<<"end of threading through the pointer\n";

    //delete structure array for iterate
    for (int i=0; i<procs; i++) {
        delete it[i];
    delete [] it;

    //delete pointer
    delete [] pint;

    cout<<"end of the program"<<endl;
    return 0;

The behavior of the code shows that the instantiation of the vector is the longest part of the run. Once you get through that bottleneck, the rest of the code runs extremely fast. This is true no matter how many threads you are running on.

By the way, ignore the absolutely insane number of includes. I have been using this code to test things for a project, so the number of includes keeps growing.

4楼-- · 2018-12-31 15:44

Try disabling checked iterators and building in release mode. You shouldn't see much of a performance difference.

5楼-- · 2018-12-31 15:44

GNU's STL (and others), given vector<T>(n), default constructs a prototypal object T() - the compiler will optimise away the empty constructor - but then a copy of whatever garbage happened to be in the memory addresses now reserved for the object is taken by the STL's __uninitialized_fill_n_aux, which loops populating copies of that object as the default values in the vector. So, "my" STL is not looping constructing, but constructing then loop/copying. It's counter intuitive, but I should have remembered as I commented on a recent stackoverflow question about this very point: the construct/copy can be more efficient for reference counted objects etc..


vector<T> x(n);

or

vector<T> x;
x.resize(n);

is - on many STL implementations - something like:

T temp;
for (int i = 0; i < n; ++i)
    x[i] = temp;

The issue being that the current generation of compiler optimisers don't seem to work from the insight that temp is uninitialised garbage, and fail to optimise out the loop and default copy constructor invocations. You could credibly argue that compilers absolutely shouldn't optimise this away, as a programmer writing the above has a reasonable expectation that all the objects will be identical after the loop, even if garbage (usual caveats about 'identical'/operator== vs memcmp/operator= etc apply). The compiler can't be expected to have any extra insight into the larger context of std::vector<> or the later usage of the data that would suggest this optimisation safe.

This can be contrasted with the more obvious, direct implementation:

for (int i = 0; i < n; ++i)
    x[i] = T();

Which we can expect a compiler to optimise out.

To be a bit more explicit about the justification for this aspect of vector's behaviour, consider:

std::vector<big_reference_counted_object> x(10000);

Clearly it's a major difference if we make 10000 independent objects versus 10000 referencing the same data. There's a reasonable argument that the advantage of protecting casual C++ users from accidentally doing something so expensive outweighs the very small real-world cost of hard-to-optimise copy construction.

ORIGINAL ANSWER (for reference / making sense of the comments): No chance. vector is as fast as an array, at least if you reserve space sensibly. ...

6楼-- · 2018-12-31 15:44

Here's how the push_back method in vector works:

  1. The vector allocates X amount of space when it is initialized.
  2. As stated below it checks if there is room in the current underlying array for the item.
  3. It makes a copy of the item in the push_back call.

After calling push_back X items:

  1. The vector reallocates kX amount of space into a 2nd array.
  2. It Copies the entries of the first array onto the second.
  3. Discards the first array.
  4. Now uses the second array as storage until it reaches kX entries.

Repeat. If you're not reserving space, it's definitely going to be slower. More than that, if it's expensive to copy the item then 'push_back' like that is going to eat you alive.

As to the vector versus array thing, I'm going to have to agree with the other people. Run in release, turn optimizations on, and put in a few more flags so that the friendly people at Microsoft don't #@%$^ it up for ya.

One more thing, if you don't need to resize, use Boost.Array.

7楼-- · 2018-12-31 15:44

A better benchmark (I think...): the compiler's optimizations can change the code, because the results of the allocated vectors/arrays are not used anywhere. Results:

$ g++ test.cpp -o test -O3 -march=native
$ ./test 
UseArray inner completed in 0.652 seconds
UseArray completed in 0.773 seconds
UseVector inner completed in 0.638 seconds
UseVector completed in 0.757 seconds
UseVectorPushBack inner completed in 6.732 seconds
UseVectorPush completed in 6.856 seconds
The whole thing completed in 8.387 seconds


gcc version 6.2.0 20161019 (Debian 6.2.0-9)


model name  : Intel(R) Core(TM) i7-3630QM CPU @ 2.40GHz

And the code:

#include <cstdlib>
#include <vector>

#include <iostream>
#include <string>

#include <boost/date_time/posix_time/ptime.hpp>
#include <boost/date_time/microsec_time_clock.hpp>

class TestTimer
        TestTimer(const std::string & name) : name(name),

            using namespace std;
            using namespace boost;

            posix_time::ptime now(date_time::microsec_clock<posix_time::ptime>::local_time());
            posix_time::time_duration d = now - start;

            cout << name << " completed in " << d.total_milliseconds() / 1000.0 <<
                " seconds" << endl;

        std::string name;
        boost::posix_time::ptime start;

/// One RGB pixel with 8-bit channels.
struct Pixel
{
    /// Default constructor; intentionally leaves the channels uninitialized.
    /// Required so that std::vector<Pixel>::resize() can create elements.
    Pixel()
    {
    }

    /// Builds a pixel from explicit red, green and blue values.
    Pixel(unsigned char r, unsigned char g, unsigned char b) : r(r), g(g), b(b)
    {
    }

    unsigned char r, g, b;
};

void UseVector(std::vector<std::vector<Pixel> >& results)
    TestTimer t("UseVector inner");

    for(int i = 0; i < 1000; ++i)
        int dimension = 999;

        std::vector<Pixel>& pixels =;
        pixels.resize(dimension * dimension);

        for(int i = 0; i < dimension * dimension; ++i)
            pixels[i].r = 255;
            pixels[i].g = 0;
            pixels[i].b = 0;

void UseVectorPushBack(std::vector<std::vector<Pixel> >& results)
    TestTimer t("UseVectorPushBack inner");

    for(int i = 0; i < 1000; ++i)
        int dimension = 999;

        std::vector<Pixel>& pixels =;
            pixels.reserve(dimension * dimension);

        for(int i = 0; i < dimension * dimension; ++i)
            pixels.push_back(Pixel(255, 0, 0));

/// Inner raw-array benchmark. Allocates 1000 malloc() buffers of 999*999
/// pixels, stores each pointer in `results[i]`, and sets every pixel to
/// (255, 0, 0). The buffers are not freed here; the caller receives them
/// through `results`.
/// @param results must have room for at least 1000 pointers.
void UseArray(Pixel** results)
{
    TestTimer t("UseArray inner");

    for(int i = 0; i < 1000; ++i)
    {
        int dimension = 999;

        Pixel * pixels = (Pixel *)malloc(sizeof(Pixel) * dimension * dimension);

        results[i] = pixels;

        // Separate index name so the inner loop does not shadow the outer `i`.
        for(int j = 0 ; j < dimension * dimension; ++j)
        {
            pixels[j].r = 255;
            pixels[j].g = 0;
            pixels[j].b = 0;
        }

        // free(pixels);
    }
}

void UseArray()
    TestTimer t("UseArray");
    Pixel** array = (Pixel**)malloc(sizeof(Pixel*)* 1000);
    for(int i=0;i<1000;++i)

void UseVector()
    TestTimer t("UseVector");
        std::vector<std::vector<Pixel> > vector(1000, std::vector<Pixel>());

void UseVectorPushBack()
    TestTimer t("UseVectorPush");
        std::vector<std::vector<Pixel> > vector(1000, std::vector<Pixel>());

int main()
    TestTimer t1("The whole thing");


    return 0;
登录 后发表回答