The problem
For a long time I had the impression that using a nested std::vector<std::vector...>
for simulating an N-dimensional array is in general bad, since the memory is not guarantee to be contiguous, and one may have cache misses. I thought it's better to use a flat vector and map from multiple dimensions to 1D and vice versa. So, I decided to test it (code listed at the end). It is pretty straightforward, I timed reading/writing to a nested 3D vector vs my own 3D wrapper of an 1D vector. I compiled the code with both g++
and clang++
, with -O3
optimization turned on. For each run I changed the dimensions, so I can get a pretty good idea about the behaviour. To my surprise, these are the results I obtained on my machine MacBook Pro (Retina, 13-inch, Late 2012), 2.5GHz i5, 8GB RAM, OS X 10.10.5:
g++ 5.2
dimensions nested flat
X Y Z (ms) (ms)
100 100 100 -> 16 24
150 150 150 -> 58 98
200 200 200 -> 136 308
250 250 250 -> 264 746
300 300 300 -> 440 1537
clang++ (LLVM 7.0.0)
dimensions nested flat
X Y Z (ms) (ms)
100 100 100 -> 16 18
150 150 150 -> 53 61
200 200 200 -> 135 137
250 250 250 -> 255 271
300 300 300 -> 423 477
As you can see, the "flatten" wrapper is never beating the nested version. Moreover, g++'s libstdc++ implementation performs quite badly compared to libc++ implementation, for example for 300 x 300 x 300
the flatten version is almost 4 times slower than the nested version. libc++ seems to have equal performance.
My questions:
- Why isn't the flatten version faster? Shouldn't it be? Am I missing something in the testing code?
- Moreover, why does g++'s libstdc++ performs so badly when using flatten vectors? Again, shouldn't it perform better?
The code I used:
#include <chrono>
#include <cstddef>
#include <iostream>
#include <memory>
#include <random>
#include <vector>
// Thin wrapper around flatten vector
template<typename T>
class Array3D
{
std::size_t _X, _Y, _Z;
std::vector<T> _vec;
public:
Array3D(std::size_t X, std::size_t Y, std::size_t Z):
_X(X), _Y(Y), _Z(Z), _vec(_X * _Y * _Z) {}
T& operator()(std::size_t x, std::size_t y, std::size_t z)
{
return _vec[z * (_X * _Y) + y * _X + x];
}
const T& operator()(std::size_t x, std::size_t y, std::size_t z) const
{
return _vec[z * (_X * _Y) + y * _X + x];
}
};
int main(int argc, char** argv)
{
std::random_device rd{};
std::mt19937 rng{rd()};
std::uniform_real_distribution<double> urd(-1, 1);
const std::size_t X = std::stol(argv[1]);
const std::size_t Y = std::stol(argv[2]);
const std::size_t Z = std::stol(argv[3]);
// Standard library nested vector
std::vector<std::vector<std::vector<double>>>
vec3D(X, std::vector<std::vector<double>>(Y, std::vector<double>(Z)));
// 3D wrapper around a 1D flat vector
Array3D<double> vec1D(X, Y, Z);
// TIMING nested vectors
std::cout << "Timing nested vectors...\n";
auto start = std::chrono::steady_clock::now();
volatile double tmp1 = 0;
for (std::size_t x = 0 ; x < X; ++x)
{
for (std::size_t y = 0 ; y < Y; ++y)
{
for (std::size_t z = 0 ; z < Z; ++z)
{
vec3D[x][y][z] = urd(rng);
tmp1 += vec3D[x][y][z];
}
}
}
std::cout << "\tSum: " << tmp1 << std::endl; // we make sure the loops are not optimized out
auto end = std::chrono::steady_clock::now();
std::cout << "Took: ";
auto ms = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
std::cout << ms << " milliseconds\n";
// TIMING flatten vector
std::cout << "Timing flatten vector...\n";
start = std::chrono::steady_clock::now();
volatile double tmp2 = 0;
for (std::size_t x = 0 ; x < X; ++x)
{
for (std::size_t y = 0 ; y < Y; ++y)
{
for (std::size_t z = 0 ; z < Z; ++z)
{
vec1D(x, y, z) = urd(rng);
tmp2 += vec1D(x, y, z);
}
}
}
std::cout << "\tSum: " << tmp2 << std::endl; // we make sure the loops are not optimized out
end = std::chrono::steady_clock::now();
std::cout << "Took: ";
ms = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
std::cout << ms << " milliseconds\n";
}
EDIT
Changing the Array3D<T>::operator()
return to
return _vec[(x * _Y + y) * _Z + z];
as per @1201ProgramAlarm's suggestion does indeed get rid of the "weird" behaviour of g++, in the sense that the flat and nested versions take now roughly the same time. However it's still intriguing. I thought the nested one will be much worse due to cache issues. May I just be lucky and have all the memory contiguously allocated?
Why the nested vectors are about the same speed as flat in your microbenchmark, after fixing the indexing order: You'd expect the flat array to be faster (see Tobias's answer about potential locality problems, and my other answer for why nested vectors suck in general, but not too badly for sequential access). But your specific test is doing so many things that let out-of-order execution hide the overhead of using nested vectors, and/or that just slow things down so much that the extra overhead is lost in measurement noise.
I put your performance-bugfixed source code up on Godbolt so we can look at the asm of the inner loop as compiled by g++5.2, with -O3
. (Apple's fork of clang might be similar to clang3.7, but I'll just look at the gcc version.) There's a lot of code from C++ functions, but you can right-click on a source line to scroll the asm windows to the code for that line. Also, mouseover a source line to bold the asm that implements that line, or vice versa.
gcc's inner two loops for the nested version are as follows (with some comments added by hand):
## outer-most loop not shown
.L213: ## middle loop (over `y`)
test rbp, rbp # Z
je .L127 # inner loop runs zero times if Z==0
mov rax, QWORD PTR [rsp+80] # MEM[(struct vector * *)&vec3D], MEM[(struct vector * *)&vec3D]
xor r15d, r15d # z = 0
mov rax, QWORD PTR [rax+r12] # MEM[(struct vector * *)_195], MEM[(struct vector * *)_195]
mov rdx, QWORD PTR [rax+rbx] # D.103857, MEM[(double * *)_38]
## Top of inner-most loop.
.L128:
lea rdi, [rsp+5328] # tmp511, ## function arg: pointer to the RNG object, which is a local on the stack.
lea r14, [rdx+r15*8] # D.103851, ## r14 = &(vec3D[x][y][z])
call double std::generate_canonical<double, 53ul, std::mersenne_twister_engine<unsigned long, 32ul, 624ul, 397ul, 31ul, 2567483615ul, 11ul, 4294967295ul, 7ul, 2636928640ul, 15ul, 4022730752ul, 18ul, 1812433253ul> >(std::mersenne_twister_engine<unsigned long, 32ul, 624ul, 397ul, 31ul, 2567483615ul, 11ul, 4294967295ul, 7ul, 2636928640ul, 15ul, 4022730752ul, 18ul, 1812433253ul>&) #
addsd xmm0, xmm0 # D.103853, D.103853 ## return val *= 2.0: [0.0, 2.0]
mov rdx, QWORD PTR [rsp+80] # MEM[(struct vector * *)&vec3D], MEM[(struct vector * *)&vec3D] ## redo the pointer-chasing from vec3D.data()
mov rdx, QWORD PTR [rdx+r12] # MEM[(struct vector * *)_150], MEM[(struct vector * *)_150]
subsd xmm0, QWORD PTR .LC6[rip] # D.103859, ## and subtract 1.0: [-1.0, 1.0]
mov rdx, QWORD PTR [rdx+rbx] # D.103857, MEM[(double * *)_27]
movsd QWORD PTR [r14], xmm0 # *_155, D.103859 # store into vec3D[x][y][z]
movsd xmm0, QWORD PTR [rsp+64] # D.103853, tmp1 # reload volatile tmp1
addsd xmm0, QWORD PTR [rdx+r15*8] # D.103853, *_62 # add the value just stored into the array (r14 = rdx+r15*8 because nothing else modifies the pointers in the outer vectors)
add r15, 1 # z,
cmp rbp, r15 # Z, z
movsd QWORD PTR [rsp+64], xmm0 # tmp1, D.103853 # spill tmp1
jne .L128 #,
#End of inner-most loop
.L127: ## middle-loop
add r13, 1 # y,
add rbx, 24 # sizeof(std::vector<> == 24) == the size of 3 pointers.
cmp QWORD PTR [rsp+8], r13 # %sfp, y
jne .L213 #,
## outer loop not shown.
And for the flat loop:
## outer not shown.
.L214:
test rbp, rbp # Z
je .L135 #,
mov rax, QWORD PTR [rsp+280] # D.103849, vec1D._Y
mov rdi, QWORD PTR [rsp+288] # D.103849, vec1D._Z
xor r15d, r15d # z
mov rsi, QWORD PTR [rsp+296] # D.103857, MEM[(double * *)&vec1D + 24B]
.L136: ## inner-most loop
imul rax, r12 # D.103849, x
lea rax, [rax+rbx] # D.103849,
imul rax, rdi # D.103849, D.103849
lea rdi, [rsp+5328] # tmp520,
add rax, r15 # D.103849, z
lea r14, [rsi+rax*8] # D.103851, # &vec1D(x,y,z)
call double std::generate_canonical<double, 53ul, std::mersenne_twister_engine<unsigned long, 32ul, 624ul, 397ul, 31ul, 2567483615ul, 11ul, 4294967295ul, 7ul, 2636928640ul, 15ul, 4022730752ul, 18ul, 1812433253ul> >(std::mersenne_twister_engine<unsigned long, 32ul, 624ul, 397ul, 31ul, 2567483615ul, 11ul, 4294967295ul, 7ul, 2636928640ul, 15ul, 4022730752ul, 18ul, 1812433253ul>&) #
mov rax, QWORD PTR [rsp+280] # D.103849, vec1D._Y
addsd xmm0, xmm0 # D.103853, D.103853
mov rdi, QWORD PTR [rsp+288] # D.103849, vec1D._Z
mov rsi, QWORD PTR [rsp+296] # D.103857, MEM[(double * *)&vec1D + 24B]
mov rdx, rax # D.103849, D.103849
imul rdx, r12 # D.103849, x # redo address calculation a 2nd time per iteration
subsd xmm0, QWORD PTR .LC6[rip] # D.103859,
add rdx, rbx # D.103849, y
imul rdx, rdi # D.103849, D.103849
movsd QWORD PTR [r14], xmm0 # MEM[(double &)_181], D.103859 # store into the address calculated earlier
movsd xmm0, QWORD PTR [rsp+72] # D.103853, tmp2
add rdx, r15 # tmp374, z
add r15, 1 # z,
addsd xmm0, QWORD PTR [rsi+rdx*8] # D.103853, MEM[(double &)_170] # tmp2 += vec1D(x,y,z). rsi+rdx*8 == r14, so this is a reload of the store this iteration.
cmp rbp, r15 # Z, z
movsd QWORD PTR [rsp+72], xmm0 # tmp2, D.103853
jne .L136 #,
.L135: ## middle loop: increment y
add rbx, 1 # y,
cmp r13, rbx # Y, y
jne .L214 #,
## outer loop not shown.
Your MacBook Pro (Late 2012) has an Intel IvyBridge CPU, so I'm using numbers for that microarchitecture from Agner Fog's instruction tables and microarch guide. Things should be mostly the same on other Intel/AMD CPUs.
The only 2.5GHz mobile IvB i5 is the i5-3210M, so your CPU has 3MiB of L3 cache. This means even your smallest test case (100^3 * 8B per double
~= 7.63MiB) is larger than your last-level cache, so none of your test cases fit in cache at all. That's probably a good thing, because you allocate and default-initialize both nested and flat before testing either of them. However, you do test in the same order you allocate, so if the nested array is still cache after zeroing the flat array, the flat array may still be hot in L3 cache after the timing loop over the nested array.
If you'd used a repeat-loop to loop over the same array multiple times, you could have got times large enough to measure for smaller array sizes.
You're doing several things here that are super-weird and make this so slow that out-of-order execution can hide the extra latency of changing y
, even if your inner z
vectors are not perfectly contiguous.
You run a slow PRNG inside the timed loop. std::uniform_real_distribution<double> urd(-1, 1);
is extra overhead on top of std::mt19937 rng{rd()};
, which is already slow compared to FP-add latency (3 cycles), or compared to the L1D cache load throughput of 2 per cycle. All this extra time running the PRNG gives out-of-order execution a chance to run the array-indexing instructions so the final address is ready by the time the data is. Unless you have a lot of cache misses, you're mostly just measuring PRNG speed, because it produces results much slower than 1 per clock cycle.
g++5.2 doesn't fully inline the urd(rng)
code, and the x86-64 System V calling convention has no call-preserved XMM registers. So tmp1
/tmp2
have to be spilled/reloaded for every element, even if they weren't volatile
.
It also loses its place in the Z vector, and has to redo the outer 2 levels of indirection before accessing the next z
element. This is because it doesn't know about the internals of the function its calling, and assumes that it might have a pointer to the outer vector<>
's memory. (The flat version does two multiplies for indexing in the inner loop, instead of a simple pointer-add.)
clang (with libc++) does fully inline the PRNG, so moving to the next z
is just add reg, 8
to increment a pointer in both the flat and nested versions. You could get the same behaviour from gcc by getting an iterator outside the inner loop, or getting a reference to the inner vector, instead of redoing operator[]
and hoping the compiler will hoist it for you.
Intel/AMD FP add/sub/mul throughput/latency is not data-dependent, except for denormals. (x87 also slows down for NaN and maybe infinity, but SSE doesn't. 64-bit code uses SSE even for scalar float
/double
.) So you could have just initialized your array with zeros, or with a PRNG outisde the timing loop. (Or left them zeroed, since the vector<double>
constructor does that for you, and it actually takes extra code to get it not to in cases where you're going to write something else.) Division and sqrt performance is data-dependent on some CPUs, and much slower than add/sub/mul.
You write every element right before you read it, inside the inner loop. In the source, this looks like a store/reload. That's what gcc actually does, unfortunately, but clang with libc++ (which inlines the PRNG) transforms the loop body:
// original
vec3D[x][y][z] = urd(rng);
tmp1 += vec3D[x][y][z];
// what clang's asm really does
double xmm7 = urd(rng);
vec3D[x][y][z] = xmm7;
tmp1 += xmm7;
In clang's asm:
# do { ...
addsd xmm7, xmm4 # last instruction of the PRNG
movsd qword ptr [r8], xmm7 # store it into the Z vector
addsd xmm7, qword ptr [rsp + 88]
add r8, 8 # pointer-increment to walk along the Z vector
dec r13 # i--
movsd qword ptr [rsp + 88], xmm7
jne .LBB0_74 # }while(i != 0);
It's allowed to do this because vec3D
isn't volatile
or atomic<>
, so it would be undefined behaviour for any other thread to be writing this memory at the same time. This means it can optimize away store/reloads to objects in memory into just a store (and simply use the value it stored, without reloading). Or optimize the store away entirely if it can prove it's a dead store (a store that nothing can ever read, e.g. to an unused static
variable).
In gcc's version, it does the indexing for the store before the PRNG call, and the indexing for the reload after. So I think gcc isn't sure that the function call doesn't modify a pointer, because pointers to the outer vectors have escaped the function. (And the PRNG doesn't inline).
However, even an actual store/reload in the asm is still less sensitive to cache-misses than a simple load!
Store->load forwarding still works even if the store misses in cache. So a cache-miss in a Z vector doesn't directly delay the critical path. It only slows you down if out-of-order execution can't hide the latency of the cache miss. (A store can retire as soon as the data is written to the store-buffer (and all previous instructions have retired). I'm not sure if a load can retire before the cache-line even makes it to L1D, if it got its data from store-forwarding. It may be able to, because x86 does allow StoreLoad reordering (stores are allowed to become globally visible after loads). In that case, a store/reload only adds 6 cycles of latency for the PRNG result (off the critical path from one PRNG state to next PRNG state). And it's only throughput a bottleneck if it cache-misses so much that the store-buffer fills up and prevents new store uops from executing, which in turn eventually prevents new uops from issuing into the out-of-order core when the Reservation Station or ROB fills up with un-executed or un-retired (respectively) uops.
With reversed indexing (original version of the flat code), probably the main bottleneck was the scattered stores. IDK why clang did so much better than gcc there. Maybe clang managed to invert a loop and traverse memory in sequential order after all. (Since it fully inlined the PRNG, there were no function calls that would require the state of memory to match program-order.)
Traversing each Z vector in-order means that cache misses are relatively far-between (even if each Z vector is not contiguous with the previous one), giving lots of time for for the stores to execute. Or even if a store-forwarded load can't actually retire until the L1D cache actually owns the cache line (in Modified state of the MESI protocol), speculative execution has the correct data and didn't have to wait for the latency of the cache-miss. The out-of-order instruction window is probably big enough to keep the critical path probably from stalling before the load can retire. (Cache miss loads are normally really bad, because dependent instructions can't be executed without data for them to operate on. So they much more easily create bubbles in the pipeline. With a full cache-miss from DRAM having a latency of over 300 cycles, and the out-of-order window being 168 uops on IvB, it can't hide all of the latency for code executing at even 1 uop (approximately 1 instruction) per clock.) For pure stores, the out-of-order window extends beyond the ROB size, because they don't need to commit to L1D to retire. In fact, they can't commit until after they retire, because that's the point at which they're known to be non-speculative. (So making them globally-visible earlier than that would prevent roll-back on detection of an exception or mis-speculation.)
I don't have libc++
installed on my desktop, so I can't benchmark that version against g++. With g++5.4, I'm finding Nested: 225 milliseconds and Flat: 239 milliseconds. I suspect that the extra array-indexing multiplies are a problem, and compete with ALU instructions the PRNG uses. In contrast, the nested version redoing a bunch of pointer-chasing that hits in L1D cache can happen in parallel. My desktop is a Skylake i7-6700k at 4.4GHz. SKL has a ROB (ReOrder Buffer) size of 224 uops, and RS of 97 uops, so the out-of-order window is very large. It also has FP-add latency of 4 cycles (unlike previous uarches where it was 3).
volatile double tmp1 = 0;
Your accumulator is volatile
, which forces the compiler to store/reload it every iteration of the inner loop. The total latency of the loop-carried dependency chain in the inner loop is 9 cycles: 3 for addsd
, and 6 for store-forwarding from movsd
the store to the movsd
reload. (clang folds the reload into a memory operand with addsd xmm7, qword ptr [rsp + 88]
, but same difference. ([rsp+88]
is on the stack, where variables with automatic storage are stored, if they need to be spilled from registers.)
As noted above, the non-inline function call for gcc will also force a spill/reload in the x86-64 System V calling convention (used by everything but Windows). But a smart compiler could have done 4 PRNG calls, for example, and then 4 array stores. (If you'd used an iterator to make sure gcc knew the vectors holding other vectors weren't changing.)
Using -ffast-math
would have let the compiler auto-vectorize (if not for the PRNG and volatile
). This would let you run through the arrays fast enough that lack of locality between different Z vectors could be a real problem. It would also let compilers unroll with multiple accumulators, to hide the FP-add latency. e.g. they could (and clang would) make asm equivalent to:
float t0=0, t1=0, t2=0, t3=0;
for () {
t0 += a[i + 0];
t1 += a[i + 1];
t2 += a[i + 2];
t3 += a[i + 3];
}
t0 = (t0 + t1) + (t2 + t3);
That has 4 separate dependency chains, so it can keep 4 FP adds in flight. Since IvB has 3 cycle latency, one per clock throughput for addsd
, we only need to keep 4 in flight to saturate its throughput. (Skylake has 4c latency, 2 per clock throughput, same as mul or FMA, so you need 8 accumulators to avoid latency bottlenecks. Actually, even more is better. As testing by the asker of that question showed, Haswell did better with even more accumulators when coming close to maxing out load throughput.)
Something like that would be a much better test of how efficient it is to loop over an Array3D. If you want to stop the loop from being optimized away entirely, just use the result. Test your microbenchmark to make sure that increasing the problem size scales the time; if not then something got optimized away, or you're not testing what you think you're testing. Don't make the inner-loop temporary volatile
!!
Writing microbenchmarks is not easy. You have to understand enough to write one that tests what you think you're testing. :P This is a good example of how easy it is to go wrong.
May I just be lucky and have all the memory contiguously allocated?
Yes, that probably happens for many small allocations done in-order, when you haven't allocated and freed anything before doing this. If they were large enough (usually one 4kiB page or larger), glibc malloc
would switch to using mmap(MAP_ANONYMOUS)
and then the kernel would choose randomized virtual addresses (ASLR). So with larger Z, you might expect locality to get worse. But on the other hand, larger Z vectors mean you spend more of your time looping over one contiguous vector so a cache miss when changing y
(and x
) becomes relatively less important.
Looping sequentially over your data with your apparently doesn't expose this, because the extra pointer accesses hit in cache, so the pointer-chasing has low enough latency for OOO execution to hide it with your slow loop.
Prefetch has a really easy time keeping up here.
Different compilers / library can make a big difference with this weird test. On my system (Arch Linux, i7-6700k Skylake with 4.4GHz max turbo), the best of 4 runs at 300 300 300
for g++5.4 -O3 was:
Timing nested vectors...
Sum: 579.78
Took: 225 milliseconds
Timing flatten vector...
Sum: 579.78
Took: 239 milliseconds
Performance counter stats for './array3D-gcc54 300 300 300':
532.066374 task-clock (msec) # 1.000 CPUs utilized
2 context-switches # 0.004 K/sec
0 cpu-migrations # 0.000 K/sec
54,523 page-faults # 0.102 M/sec
2,330,334,633 cycles # 4.380 GHz
7,162,855,480 instructions # 3.07 insn per cycle
632,509,527 branches # 1188.779 M/sec
756,486 branch-misses # 0.12% of all branches
0.532233632 seconds time elapsed
vs. g++7.1 -O3 (which apparently decided to branch on something that g++5.4 didn't)
Timing nested vectors...
Sum: 932.159
Took: 363 milliseconds
Timing flatten vector...
Sum: 932.159
Took: 378 milliseconds
Performance counter stats for './array3D-gcc71 300 300 300':
810.911200 task-clock (msec) # 1.000 CPUs utilized
0 context-switches # 0.000 K/sec
0 cpu-migrations # 0.000 K/sec
54,523 page-faults # 0.067 M/sec
3,546,467,563 cycles # 4.373 GHz
7,107,511,057 instructions # 2.00 insn per cycle
794,124,850 branches # 979.299 M/sec
55,074,134 branch-misses # 6.94% of all branches
0.811067686 seconds time elapsed
vs. clang4.0 -O3 (with gcc's libstdc++, not libc++)
perf stat ./array3D-clang40-libstdc++ 300 300 300
Timing nested vectors...
Sum: -349.786
Took: 1657 milliseconds
Timing flatten vector...
Sum: -349.786
Took: 1631 milliseconds
Performance counter stats for './array3D-clang40-libstdc++ 300 300 300':
3358.297093 task-clock (msec) # 1.000 CPUs utilized
9 context-switches # 0.003 K/sec
0 cpu-migrations # 0.000 K/sec
54,521 page-faults # 0.016 M/sec
14,679,919,916 cycles # 4.371 GHz
12,917,363,173 instructions # 0.88 insn per cycle
1,658,618,144 branches # 493.887 M/sec
916,195 branch-misses # 0.06% of all branches
3.358518335 seconds time elapsed
I didn't dig into what clang did wrong, or try with -ffast-math
and/or -march=native
. (Those won't do much unless you remove volatile
, though.)
perf stat -d
doesn't show more cache misses (L1 or last-level) for clang than gcc. But it does show clang is doing more than twice as many L1D loads.
I did try with a non-square array. It's almost exactly the same time keeping total element count the same but changing the final dimension to 5 or 6.
Even a minor change to the C helps, and makes "flatten" faster than nested with gcc (from 240ms down to 220ms for 300^3, but barely making any difference for nested.):
// vec1D(x, y, z) = urd(rng);
double res = urd(rng);
vec1D(x, y, z) = res; // indexing calculation only done once, after the function call
tmp2 += vec1D(x, y, z);
// using iterators would still avoid redoing it at all.
Reading over the other answers I'm not really satisfied with the accuracy and level of detail of the answers, so I'll attempt an explanation myself:
The man issue here is not indirection but a question of spatial locality:
There are basically two things that make caching especially effective:
Temporal locality, which means that a memory word that has been accessed recently will likely be accessed again in the near future. This might for example happen at nodes near the root of a binary search tree that is accessed frequently.
Spatial locality, which means that if a memory word has been accessed, it is likely that the memory words before or after this word will soon be accessed as well. This happens in our case, for nested and flattened arrays.
To assess the impact that indirection and cache effects might have on this problem, let's just assume that we have X = Y = Z = 1024
Judging from this question, a single cache line (L1, L2 or L3) is 64 bytes long, that means 8 double values. Let's assume the L1 cache has 32 kB (4096 doubles), the L2 cache has 256 kB (32k doubles) and the L3 cache has 8 MB (1M doubles).
This means that - assuming the cache is filled with no other data (which is a bold guess, I know) - in the flattened case only every 4th value of y
leads to a L1 cache miss (the L2 cache latency is probably around 10-20 cycles), only every 32nd value of y
leads to a L2 cache miss (the L3 cache latency is some value lower than 100 cycles) and only in case of a L3 cache miss we actually have to access main memory. I don't want to open up the whole calculation here, since taking the whole cache hierarchy into account makes it a bit more difficult, but let's just say that almost all accesses to memory can be cached in the flattened case.
In the original formulation of this question, the flattened index was computed differently (z * (_X * _Y) + y * _X + x
), an increase of the value that changes in the innermost loop (z) always means a jump of _X * _Y * 64 bit
, thus leading to a much more non-local memory layout, which increased cache faults by a large amount.
In the nested case, the answer depends a lot on the value of Z:
- If Z is rather large, most of the memory accesses are contiguous, since they refer to the entries of a single vector in the nested
vector<vector<vector>>>
, which are laid out contiguously. Only when the y or x value is increased we need to actually use indirection to retrieve the start pointer of the next innermost vector.
- If Z is rather small, we need to change our 'base pointer' memory access quite often, which might lead to cache-misses if the storage areas of the innermost vectors are placed rather randomly in memory. However, if they are more or less contiguous, we might observe minimal to no differences in cache performance.
Since there was a question about the assembly output, let me give a short overview:
If you compare the assembly output of the nested and flattened array, you notice a lot of similarities: There are three equivalent nested loops and the counting variables x, y and z are stored in registers. The only real difference - aside from the fact that the nested version uses two counters for every outer index to avoid multiplying by 24 on every address computation, and the flattened version does the same for the innermost loop and multiplying by 8 - can be found in the y loop where instead of just incrementing y and computing the flattened index, we need to do three interdependent memory loads to determine the base pointer for our inner loop:
mov rax, QWORD PTR [rdi]
mov rax, QWORD PTR [rax+r11]
mov rax, QWORD PTR [rax+r10]
But since these only happen every Zth time and the pointers for the 'middle vector' are most likely cached, the time difference is negligible.
(This doesn't really answer the question. I think I read it backwards initially, assuming that the OP had just found what I was expecting, that nested vectors are slower than flat.)
You should expect the nested-vector version to be slower for anything other than sequential access. After fixing the row/column major indexing order for your flat version, it should be faster for many uses, especially since it's easier for a compiler to auto-vectorize with SIMD over a big flat array than over many short std::vector<>
.
A cache line is only 64B. That's 8 double
s. Locality on a page level matters because of limited TLB entries, and prefetching requires sequential accesses, but you'll get that anyway (close enough) with nested vectors that are allocated all at once with most malloc implementations. (This is a trivial microbenchmark that doesn't do anything before allocating its vector
s. In a real program that allocates and frees some memory before making a lot of small allocations, some of them might be scattered around more.)
Besides locality, the extra levels of indirection are potentially problematic.
A reference / pointer to a std::vector just points to the the fixed-size block that holds the current size, allocated space, and the pointer to the buffer. IDK if any implementations place the buffer right after the control data as part of the same malloced block, but probably that's impossible because sizeof(std::vector<int>)
has to be constant so you can have a vector of vectors. Check out the asm on godbolt: A function that just returns v[10]
takes one load with an array arg, but two loads with a std::vector arg.
In the nested-vector implementation, loading v[x][y][z]
requires 4 steps (assuming a pointer or reference to v
is already in a register).
- load
v.buffer_pointer
or v.bp
or whatever the implementation calls it. (A pointer to an array of std::vector<std::vector<double>>
)
- load
v.bp[x].bp
(A pointer to an array of std::vector<double>
)
- load
v.bp[x].bp[y].bp
(A pointer to an array of double
)
- load
v.bp[x].bp[y].bp[z]
(The double
we want)
A proper 3D array, simulated with a single std::vector
, just does:
- load
v.bp
(A pointer to an array of double
)
- load
v.bp[(x*w + y)*h + z]
(The double
we want)
Multiple accesses to the same simulated 3D array with different x and y require computing a new index, but v.bp
will stay in a register. So instead of 3 cache misses, we only get one.
Traversing the 3D array in order hides the penalty of the nested-vector implementation, because there's a loop over all the values in the inner-most vector hiding the overhead of changing x and y. Prefetch of the contiguous pointers in the outer vectors helps here, and Z
is small enough in your testing that looping over one inner-most vector won't evict the pointer for the next y
value.
What Every Programmer Should Know About Memory is getting somewhat outdated, but it covers the details of caching and locality. Software-prefetching is not nearly as important as it was on P4, so don't pay too much attention to that part of the guide.
May I just be lucky and have all the memory contiguously allocated?
Probably yes. I've modified your sample a little bit, so we have a benchmark which concentrates more on the differences between the two approaches:
- array fill is done in a separate pass, so random generator speed is excluded
- removed volatile
- fixed a little bug (
tmp1
was printed instead of tmp2
)
- added a
#if 1
part, which can be used to randomize vec3D
placement in memory. As it turned out, it has a huge difference on my machine.
Without randomization (I've used parameters: 300 300 300):
Timing nested vectors...
Sum: -131835
Took: 2122 milliseconds
Timing flatten vector...
Sum: -131835
Took: 2085 milliseconds
So there is a little difference, flatten version is a little bit faster. (I've run several tests, and put the minimal time here).
With randomization:
Timing nested vectors...
Sum: -117685
Took: 3014 milliseconds
Timing flatten vector...
Sum: -117685
Took: 2085 milliseconds
So the cache effect can be seen here: nested version is ~50% slower. This is because CPU cannot predict which memory area will be used, so its cache prefetcher is not efficient.
Here's the modified code:
#include <chrono>
#include <cstddef>
#include <iostream>
#include <memory>
#include <random>
#include <vector>
template<typename T>
class Array3D
{
std::size_t _X, _Y, _Z;
std::vector<T> _vec;
public:
Array3D(std::size_t X, std::size_t Y, std::size_t Z):
_X(X), _Y(Y), _Z(Z), _vec(_X * _Y * _Z) {}
T& operator()(std::size_t x, std::size_t y, std::size_t z)
{
return _vec[(x * _Y + y) * _Z + z];
}
const T& operator()(std::size_t x, std::size_t y, std::size_t z) const
{
return _vec[(x * _Y + y) * _Z + z];
}
};
double nested(std::vector<std::vector<std::vector<double>>> &vec3D, std::size_t X, std::size_t Y, std::size_t Z) {
double tmp1 = 0;
for (int iter=0; iter<100; iter++)
for (std::size_t x = 0 ; x < X; ++x)
{
for (std::size_t y = 0 ; y < Y; ++y)
{
for (std::size_t z = 0 ; z < Z; ++z)
{
tmp1 += vec3D[x][y][z];
}
}
}
return tmp1;
}
double flatten(Array3D<double> &vec1D, std::size_t X, std::size_t Y, std::size_t Z) {
double tmp2 = 0;
for (int iter=0; iter<100; iter++)
for (std::size_t x = 0 ; x < X; ++x)
{
for (std::size_t y = 0 ; y < Y; ++y)
{
for (std::size_t z = 0 ; z < Z; ++z)
{
tmp2 += vec1D(x, y, z);
}
}
}
return tmp2;
}
int main(int argc, char** argv)
{
std::random_device rd{};
std::mt19937 rng{rd()};
std::uniform_real_distribution<double> urd(-1, 1);
const std::size_t X = std::stol(argv[1]);
const std::size_t Y = std::stol(argv[2]);
const std::size_t Z = std::stol(argv[3]);
std::vector<std::vector<std::vector<double>>>
vec3D(X, std::vector<std::vector<double>>(Y, std::vector<double>(Z)));
#if 1
for (std::size_t i = 0 ; i < X*Y; i++)
{
std::size_t xa = rand()%X;
std::size_t ya = rand()%Y;
std::size_t xb = rand()%X;
std::size_t yb = rand()%Y;
std::swap(vec3D[xa][ya], vec3D[xb][yb]);
}
#endif
// 3D wrapper around a 1D flat vector
Array3D<double> vec1D(X, Y, Z);
for (std::size_t x = 0 ; x < X; ++x)
{
for (std::size_t y = 0 ; y < Y; ++y)
{
for (std::size_t z = 0 ; z < Z; ++z)
{
vec3D[x][y][z] = vec1D(x, y, z) = urd(rng);
}
}
}
std::cout << "Timing nested vectors...\n";
auto start = std::chrono::steady_clock::now();
double tmp1 = nested(vec3D, X, Y, Z);
auto end = std::chrono::steady_clock::now();
std::cout << "\tSum: " << tmp1 << std::endl; // we make sure the loops are not optimized out
std::cout << "Took: ";
auto ms = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
std::cout << ms << " milliseconds\n";
std::cout << "Timing flatten vector...\n";
start = std::chrono::steady_clock::now();
double tmp2 = flatten(vec1D, X, Y, Z);
end = std::chrono::steady_clock::now();
std::cout << "\tSum: " << tmp2 << std::endl; // we make sure the loops are not optimized out
std::cout << "Took: ";
ms = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
std::cout << ms << " milliseconds\n";
}