In an answer, I've stated that unaligned access has had almost the same speed as aligned access for a long time now (on x86/x86_64). I didn't have any numbers to back up this statement, so I've created a benchmark for it.
Do you see any flaws in this benchmark? Can you improve on it (i.e., increase the GB/sec numbers so that they reflect the truth better)?
#include <sys/time.h>
#include <stdio.h>
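// loop32 / loop64 / loop128a / loop128u each touch N bytes with independent
// 32-, 64- or 128-bit loads (aligned movaps vs. unaligned movups for SSE),
// issued as inline asm so the compiler can't optimize the loads away.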
template <int N>
__attribute__((noinline))
void loop32(const char *v) {
for (int i=0; i<N; i+=160) {
__asm__ ("mov (%0), %%eax" : : "r"(v) :"eax");
__asm__ ("mov 0x04(%0), %%eax" : : "r"(v) :"eax");
__asm__ ("mov 0x08(%0), %%eax" : : "r"(v) :"eax");
__asm__ ("mov 0x0c(%0), %%eax" : : "r"(v) :"eax");
__asm__ ("mov 0x10(%0), %%eax" : : "r"(v) :"eax");
__asm__ ("mov 0x14(%0), %%eax" : : "r"(v) :"eax");
__asm__ ("mov 0x18(%0), %%eax" : : "r"(v) :"eax");
__asm__ ("mov 0x1c(%0), %%eax" : : "r"(v) :"eax");
__asm__ ("mov 0x20(%0), %%eax" : : "r"(v) :"eax");
__asm__ ("mov 0x24(%0), %%eax" : : "r"(v) :"eax");
__asm__ ("mov 0x28(%0), %%eax" : : "r"(v) :"eax");
__asm__ ("mov 0x2c(%0), %%eax" : : "r"(v) :"eax");
__asm__ ("mov 0x30(%0), %%eax" : : "r"(v) :"eax");
__asm__ ("mov 0x34(%0), %%eax" : : "r"(v) :"eax");
__asm__ ("mov 0x38(%0), %%eax" : : "r"(v) :"eax");
__asm__ ("mov 0x3c(%0), %%eax" : : "r"(v) :"eax");
__asm__ ("mov 0x40(%0), %%eax" : : "r"(v) :"eax");
__asm__ ("mov 0x44(%0), %%eax" : : "r"(v) :"eax");
__asm__ ("mov 0x48(%0), %%eax" : : "r"(v) :"eax");
__asm__ ("mov 0x4c(%0), %%eax" : : "r"(v) :"eax");
__asm__ ("mov 0x50(%0), %%eax" : : "r"(v) :"eax");
__asm__ ("mov 0x54(%0), %%eax" : : "r"(v) :"eax");
__asm__ ("mov 0x58(%0), %%eax" : : "r"(v) :"eax");
__asm__ ("mov 0x5c(%0), %%eax" : : "r"(v) :"eax");
__asm__ ("mov 0x60(%0), %%eax" : : "r"(v) :"eax");
__asm__ ("mov 0x64(%0), %%eax" : : "r"(v) :"eax");
__asm__ ("mov 0x68(%0), %%eax" : : "r"(v) :"eax");
__asm__ ("mov 0x6c(%0), %%eax" : : "r"(v) :"eax");
__asm__ ("mov 0x70(%0), %%eax" : : "r"(v) :"eax");
__asm__ ("mov 0x74(%0), %%eax" : : "r"(v) :"eax");
__asm__ ("mov 0x78(%0), %%eax" : : "r"(v) :"eax");
__asm__ ("mov 0x7c(%0), %%eax" : : "r"(v) :"eax");
__asm__ ("mov 0x80(%0), %%eax" : : "r"(v) :"eax");
__asm__ ("mov 0x84(%0), %%eax" : : "r"(v) :"eax");
__asm__ ("mov 0x88(%0), %%eax" : : "r"(v) :"eax");
__asm__ ("mov 0x8c(%0), %%eax" : : "r"(v) :"eax");
__asm__ ("mov 0x90(%0), %%eax" : : "r"(v) :"eax");
__asm__ ("mov 0x94(%0), %%eax" : : "r"(v) :"eax");
__asm__ ("mov 0x98(%0), %%eax" : : "r"(v) :"eax");
__asm__ ("mov 0x9c(%0), %%eax" : : "r"(v) :"eax");
v += 160;
}
}
template <int N>
__attribute__((noinline))
void loop64(const char *v) {
for (int i=0; i<N; i+=160) {
__asm__ ("mov (%0), %%rax" : : "r"(v) :"rax");
__asm__ ("mov 0x08(%0), %%rax" : : "r"(v) :"rax");
__asm__ ("mov 0x10(%0), %%rax" : : "r"(v) :"rax");
__asm__ ("mov 0x18(%0), %%rax" : : "r"(v) :"rax");
__asm__ ("mov 0x20(%0), %%rax" : : "r"(v) :"rax");
__asm__ ("mov 0x28(%0), %%rax" : : "r"(v) :"rax");
__asm__ ("mov 0x30(%0), %%rax" : : "r"(v) :"rax");
__asm__ ("mov 0x38(%0), %%rax" : : "r"(v) :"rax");
__asm__ ("mov 0x40(%0), %%rax" : : "r"(v) :"rax");
__asm__ ("mov 0x48(%0), %%rax" : : "r"(v) :"rax");
__asm__ ("mov 0x50(%0), %%rax" : : "r"(v) :"rax");
__asm__ ("mov 0x58(%0), %%rax" : : "r"(v) :"rax");
__asm__ ("mov 0x60(%0), %%rax" : : "r"(v) :"rax");
__asm__ ("mov 0x68(%0), %%rax" : : "r"(v) :"rax");
__asm__ ("mov 0x70(%0), %%rax" : : "r"(v) :"rax");
__asm__ ("mov 0x78(%0), %%rax" : : "r"(v) :"rax");
__asm__ ("mov 0x80(%0), %%rax" : : "r"(v) :"rax");
__asm__ ("mov 0x88(%0), %%rax" : : "r"(v) :"rax");
__asm__ ("mov 0x90(%0), %%rax" : : "r"(v) :"rax");
__asm__ ("mov 0x98(%0), %%rax" : : "r"(v) :"rax");
v += 160;
}
}
template <int N>
__attribute__((noinline))
void loop128a(const char *v) {
for (int i=0; i<N; i+=160) {
__asm__ ("movaps (%0), %%xmm0" : : "r"(v) :"xmm0");
__asm__ ("movaps 0x10(%0), %%xmm0" : : "r"(v) :"xmm0");
__asm__ ("movaps 0x20(%0), %%xmm0" : : "r"(v) :"xmm0");
__asm__ ("movaps 0x30(%0), %%xmm0" : : "r"(v) :"xmm0");
__asm__ ("movaps 0x40(%0), %%xmm0" : : "r"(v) :"xmm0");
__asm__ ("movaps 0x50(%0), %%xmm0" : : "r"(v) :"xmm0");
__asm__ ("movaps 0x60(%0), %%xmm0" : : "r"(v) :"xmm0");
__asm__ ("movaps 0x70(%0), %%xmm0" : : "r"(v) :"xmm0");
__asm__ ("movaps 0x80(%0), %%xmm0" : : "r"(v) :"xmm0");
__asm__ ("movaps 0x90(%0), %%xmm0" : : "r"(v) :"xmm0");
v += 160;
}
}
template <int N>
__attribute__((noinline))
void loop128u(const char *v) {
for (int i=0; i<N; i+=160) {
__asm__ ("movups (%0), %%xmm0" : : "r"(v) :"xmm0");
__asm__ ("movups 0x10(%0), %%xmm0" : : "r"(v) :"xmm0");
__asm__ ("movups 0x20(%0), %%xmm0" : : "r"(v) :"xmm0");
__asm__ ("movups 0x30(%0), %%xmm0" : : "r"(v) :"xmm0");
__asm__ ("movups 0x40(%0), %%xmm0" : : "r"(v) :"xmm0");
__asm__ ("movups 0x50(%0), %%xmm0" : : "r"(v) :"xmm0");
__asm__ ("movups 0x60(%0), %%xmm0" : : "r"(v) :"xmm0");
__asm__ ("movups 0x70(%0), %%xmm0" : : "r"(v) :"xmm0");
__asm__ ("movups 0x80(%0), %%xmm0" : : "r"(v) :"xmm0");
__asm__ ("movups 0x90(%0), %%xmm0" : : "r"(v) :"xmm0");
v += 160;
}
}
long long int t() {
    struct timeval tv;
    gettimeofday(&tv, 0);
    return (long long int)tv.tv_sec*1000000 + tv.tv_usec;
}
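// Each block below times the aligned and the +1-offset (unaligned) variant,
// first over a small N/100000 = 16000-byte buffer (fits in L1 cache), then
// over the full 1.6 GB buffer (memory-bound).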
int main() {
    const int ITER = 10;
    const int N = 1600000000;
    char *data = reinterpret_cast<char *>(((reinterpret_cast<unsigned long long>(new char[N+32])+15)&~15));
    for (int i=0; i<N+16; i++) data[i] = 0;
    {
        long long int t0 = t();
        for (int i=0; i<ITER*100000; i++) {
            loop32<N/100000>(data);
        }
        long long int t1 = t();
        for (int i=0; i<ITER*100000; i++) {
            loop32<N/100000>(data+1);
        }
        long long int t2 = t();
        for (int i=0; i<ITER; i++) {
            loop32<N>(data);
        }
        long long int t3 = t();
        for (int i=0; i<ITER; i++) {
            loop32<N>(data+1);
        }
        long long int t4 = t();
        printf(" 32-bit, cache: aligned: %8.4f GB/sec unaligned: %8.4f GB/sec, difference: %0.3f%%\n", (double)N*ITER/(t1-t0)/1000, (double)N*ITER/(t2-t1)/1000, 100.0*(t2-t1)/(t1-t0)-100.0f);
        printf(" 32-bit, mem: aligned: %8.4f GB/sec unaligned: %8.4f GB/sec, difference: %0.3f%%\n", (double)N*ITER/(t3-t2)/1000, (double)N*ITER/(t4-t3)/1000, 100.0*(t4-t3)/(t3-t2)-100.0f);
    }
    {
        long long int t0 = t();
        for (int i=0; i<ITER*100000; i++) {
            loop64<N/100000>(data);
        }
        long long int t1 = t();
        for (int i=0; i<ITER*100000; i++) {
            loop64<N/100000>(data+1);
        }
        long long int t2 = t();
        for (int i=0; i<ITER; i++) {
            loop64<N>(data);
        }
        long long int t3 = t();
        for (int i=0; i<ITER; i++) {
            loop64<N>(data+1);
        }
        long long int t4 = t();
        printf(" 64-bit, cache: aligned: %8.4f GB/sec unaligned: %8.4f GB/sec, difference: %0.3f%%\n", (double)N*ITER/(t1-t0)/1000, (double)N*ITER/(t2-t1)/1000, 100.0*(t2-t1)/(t1-t0)-100.0f);
        printf(" 64-bit, mem: aligned: %8.4f GB/sec unaligned: %8.4f GB/sec, difference: %0.3f%%\n", (double)N*ITER/(t3-t2)/1000, (double)N*ITER/(t4-t3)/1000, 100.0*(t4-t3)/(t3-t2)-100.0f);
    }
    {
        long long int t0 = t();
        for (int i=0; i<ITER*100000; i++) {
            loop128a<N/100000>(data);
        }
        long long int t1 = t();
        for (int i=0; i<ITER*100000; i++) {
            loop128u<N/100000>(data+1);
        }
        long long int t2 = t();
        for (int i=0; i<ITER; i++) {
            loop128a<N>(data);
        }
        long long int t3 = t();
        for (int i=0; i<ITER; i++) {
            loop128u<N>(data+1);
        }
        long long int t4 = t();
        printf("128-bit, cache: aligned: %8.4f GB/sec unaligned: %8.4f GB/sec, difference: %0.3f%%\n", (double)N*ITER/(t1-t0)/1000, (double)N*ITER/(t2-t1)/1000, 100.0*(t2-t1)/(t1-t0)-100.0f);
        printf("128-bit, mem: aligned: %8.4f GB/sec unaligned: %8.4f GB/sec, difference: %0.3f%%\n", (double)N*ITER/(t3-t2)/1000, (double)N*ITER/(t4-t3)/1000, 100.0*(t4-t3)/(t3-t2)-100.0f);
    }
}
Timing method. I probably would have set it up so the test was selected by a command-line arg, so I could time it with perf stat ./unaligned-test, and get perf counter results instead of just wall-clock times for each test. That way, I wouldn't have to care about turbo / power-saving, since I could measure in core clock cycles. (Not the same thing as gettimeofday / rdtsc reference cycles unless you disable turbo and other frequency variation.)
You're only testing throughput, not latency, because none of the loads are dependent.
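A latency test needs each load's address to depend on the previous load's result, e.g. by chasing a chain of pointers stored in the buffer. A minimal sketch in the style of the question's code (make_chain and chase are my names, not something from the question):

#include <cstring>
#include <cstddef>

// Fill the buffer with a chain of pointers: every 'stride' bytes, the slot at
// offset 'misalign' holds the address of the next slot (wrapping around).
// size should be a multiple of stride; with stride = 64 and misalign in 0..56,
// every store stays inside the buffer.
static void make_chain(char *buf, std::size_t size, std::size_t stride, std::size_t misalign) {
    for (std::size_t i = 0; i < size; i += stride) {
        char *next = buf + (i + stride) % size + misalign;
        std::memcpy(buf + i + misalign, &next, sizeof(next));   // misalignment-safe store
    }
}

__attribute__((noinline))
static void chase(const char *v, long iters) {
    // Each iteration loads the pointer to the next element from the current one,
    // so the loop is limited by load-use latency instead of load throughput.
    __asm__ volatile("1: mov (%0), %0; dec %1; jnz 1b"
                     : "+r"(v), "+r"(iters));
}

After make_chain(buf, size, 64, misalign), calling chase(buf + misalign, iters) gives roughly elapsed core cycles / iters = load-use latency at that misalignment (as long as the buffer fits in L1d).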
Your cache numbers (the misalignment penalty in the cached case) will be worse than your memory numbers, but you might not realize why: the cached case may be bottlenecked on the number of split-load registers that handle loads/stores which cross a cache-line boundary. For a sequential read, the outer levels of cache are still always just going to see a sequence of requests for whole cache lines; it's only the execution units getting data from L1D that have to care about alignment. To test misalignment for the non-cached case, you could do scattered loads, so cache-line splits would need to bring two cache lines into L1.
Cache lines are 64B wide[1], so you're always testing a mix of cache-line splits and within-a-cache-line accesses. Testing always-split loads would bottleneck harder on the split-load microarchitectural resources. (Actually, depending on your CPU, the cache-fetch width might be narrower than the line size. Recent Intel CPUs can fetch any unaligned chunk from inside a cache line, but that's because they have special hardware to make that fast. Other CPUs may only be at their fastest when fetching within a naturally-aligned 16B chunk or something. @BeeOnRope says that AMD CPUs may care about 16B and 32B boundaries.)
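An always-split variant of the question's 32-bit loop could look like this sketch (offsets chosen by me; it assumes the buffer is 64-byte aligned and has a few bytes of slack past the end):

// Every 4-byte load starts 3 bytes before a 64-byte line boundary, so each one
// is a cache-line split.
template <int N>
__attribute__((noinline))
void loop32_always_split(const char *v) {
    for (int i = 0; i < N; i += 256) {
        __asm__("mov 0x3d(%0), %%eax" : : "r"(v) : "eax");   // crosses the boundary at 0x40
        __asm__("mov 0x7d(%0), %%eax" : : "r"(v) : "eax");   // crosses 0x80
        __asm__("mov 0xbd(%0), %%eax" : : "r"(v) : "eax");   // crosses 0xc0
        __asm__("mov 0xfd(%0), %%eax" : : "r"(v) : "eax");   // crosses 0x100
        v += 256;
    }
}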
You're not testing store->load forwarding at all. For existing tests, and a nice way to visualize results for different alignments, see this stuffedcow.net blog post: Store-to-Load Forwarding and Memory Disambiguation in x86 Processors.
Passing data through memory is an important use-case, and misalignment + cache-line splits can interfere with store-forwarding on some CPUs. To properly test this, make sure you test different misalignments, not just 1:15 (vector) or 1:3 (integer). (You currently only test a +1 offset relative to 16B-alignment).
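For the pure-load side, one easy improvement is to sweep all the offsets instead of hard-coding +1; a sketch that drops into the question's main() and reuses its t(), loop64, data, N and ITER:

// Time the 64-bit loop at every offset 0..15 relative to the 16B-aligned buffer.
for (int off = 0; off < 16; ++off) {
    long long int start = t();
    for (int i = 0; i < ITER; i++) {
        loop64<N>(data + off);
    }
    long long int stop = t();
    printf("64-bit, mem, offset %2d: %8.4f GB/sec\n", off, (double)N*ITER/(stop-start)/1000);
}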
I forget if it's just for store-forwarding, or for regular loads, but there may be less penalty when a load is split evenly across a cache-line boundary (an 8:8 split for a 16B vector, and maybe also 4:4 or 2:2 integer splits). You should test this. (I might be thinking of P4 lddqu, or Core 2 movdqu.)
Intel's optimization manual has big tables of misalignment vs. store-forwarding from a wide store to narrow reloads that are fully contained in it. On some CPUs, this works in more cases when the wide store was naturally-aligned, even if it doesn't cross any cache-line boundaries. (Maybe on SnB/IvB, since they use a banked L1 cache with 16B banks, and splits across those can affect store forwarding. I didn't re-check the manual, but if you really want to test this experimentally, that's something you should be looking for.) A sketch of such a probe follows.
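A sketch of that kind of probe (my own, with an arbitrary function name): do a wide store, immediately reload a narrower value at a variable offset inside (or past the end of) it, and feed the reload back into the next store so the iterations serialize on the store-to-load forwarding latency. Run it with differently-aligned buf pointers and offsets 0..7 and compare the times.

#include <cstring>

__attribute__((noinline))
unsigned long long store_forward(char *buf, int offset, long iters) {
    unsigned long long x = 0;
    for (long i = 0; i < iters; i++) {
        std::memcpy(buf, &x, 8);                // 8-byte store
        unsigned int reload;
        std::memcpy(&reload, buf + offset, 4);  // 4-byte reload from inside the store
        x = reload + 1;                         // keep the dependency chain going
    }
    return x;                                   // return so the work isn't optimized out
}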
Which reminds me, misaligned loads are more likely to provoke cache-bank conflicts on SnB/IvB (because one load can touch two banks). But you won't see this loading from a single stream, because accessing the same bank in the same line twice in one cycle is fine. It's only accessing the same bank in different lines that can't happen in the same cycle. (e.g. when two memory accesses are a multiple of 128B apart.)
You don't make any attempt to test 4k page-splits. They're slower than regular cache-line splits, because they also need two TLB checks. (Skylake improved them from a ~100 cycle penalty to ~5 cycles beyond the normal load-use latency, though.)
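For example, a throughput loop where every load straddles a page boundary (offset chosen by me; it assumes the buffer is page-aligned, e.g. from aligned_alloc(4096, N + 16), with a few bytes of slack at the end):

// Every 8-byte load starts 4 bytes before a 4096-byte boundary, so each one is
// a page split (two TLB lookups) as well as a cache-line split.
template <int N>
__attribute__((noinline))
void loop64_page_split(const char *v) {
    for (int i = 0; i < N; i += 4096) {
        __asm__("mov 0xffc(%0), %%rax" : : "r"(v) : "rax");   // bytes 4092..4099 cross the page at 0x1000
        v += 4096;
    }
}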
You fail to test movups on aligned addresses, so you wouldn't detect that movups is slower than movaps on Core 2 and earlier even when the memory is aligned at runtime. (I think unaligned mov loads of up to 8 bytes were fine even on Core 2, as long as they didn't cross a cache-line boundary. IDK how old a CPU you'd have to look at to find a problem with non-vector loads within a cache line. It would be a 32-bit-only CPU, but you could still test 8B loads with MMX or SSE, or even x87. P5 Pentium and later guarantee that aligned 8B loads/stores are atomic, but P6 and newer guarantee that cached 8B loads/stores are atomic as long as no cache-line boundary is crossed. Unlike AMD, where 8B boundaries matter for atomicity guarantees even in cacheable memory. See Why is integer assignment on a naturally aligned variable atomic on x86?)
Go look at Agner Fog's stuff to learn more about how unaligned loads can be slower, and cook up tests to exercise those cases. Actually, Agner may not be the best resource for that, since his microarch guide mostly focuses on getting uops through the pipeline; there's just a brief mention of the cost of cache-line splits, nothing in-depth about throughput vs. latency.
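Covering the movups-on-aligned-memory case needs nothing new; a fourth timing run in the question's main() is enough (a sketch reusing its helpers):

// Unaligned instruction (movups) on data that is aligned at runtime, to compare
// against the existing movaps-aligned and movups-unaligned cases.
long long int u0 = t();
for (int i = 0; i < ITER; i++) {
    loop128u<N>(data);
}
long long int u1 = t();
printf("128-bit movups on aligned data: %8.4f GB/sec\n", (double)N*ITER/(u1-u0)/1000);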
See also: Cacheline splits, take two, from Dark Shikari's blog (x264 lead developer), talking about unaligned load strategies on Core2: it was worth it to check for alignment and use a different strategy for the block.
Footnotes:
1: See also uarch-bench results for Skylake. Apparently someone has already written a tester that checks every possible misalignment relative to a cache-line boundary.
My testing on Skylake desktop (i7-6700k):
Addressing mode affects load-use latency, exactly as Intel documents in their optimization manual. I tested with integer mov rax, [rax+...], and with movzx/sx (in that case using the loaded value as an index, since it's too narrow to be a pointer), running the test loop under perf stat so I could count core clock cycles.
In this case, I was testing mov rax, [rax], naturally-aligned, so cycles = 4 * L1-dcache-loads: 4c latency. I didn't disable turbo or anything like that; since nothing is going off the core, core clock cycles are the best way to measure.
[base + 0..2047]: 4c load-use latency, 11c cache-line split, 11c 4k-page split (even when inside the same hugepage). See Is there a penalty when base+offset is in a different page than the base? for more details: if base+disp turns out to be in a different page than base, the load uop has to be replayed.
Any other addressing mode, including [rax - 16]: 1c extra latency for normal loads and 4k-page splits, with cache-line splits unaffected. It's not disp8 vs. disp32 that makes the difference.
So: hugepages don't help avoid page-split penalties (at least not when both pages are hot in the TLB). A cache-line split makes addressing mode irrelevant, but "fast" addressing modes have 1c lower latency for normal and page-split loads.
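For reference, an invocation along these lines reports core clock cycles and L1D load counts directly (the binary name here is just a placeholder):

perf stat -e task-clock,cycles,instructions,L1-dcache-loads ./load-use-latency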
4k-split handling is fantastically better than before; see @harold's numbers, where Haswell has ~32c latency for a 4k-split. (And older CPUs may be even worse than that. I thought pre-SKL it was supposed to be a ~100 cycle penalty.)
Throughput (regardless of addressing mode) was measured by using a destination other than rax, so the loads are independent.
Same throughput/latency for movzx/movsx (including WORD splits), as expected, because they're handled in the load port (unlike some AMD CPUs, where there's also an ALU uop).
Cache-line split loads get replayed from the RS (Reservation Station). Counters for uops_dispatched_port.port_2 + port_3 = 2x the number of mov rdi, [rdi] loads, in another test using basically the same loop. (This was a dependent-load case, not throughput-limited.) You can't detect a split load until after AGU.
Presumably when a load uop finds out that it needs data from a 2nd line, it looks for a split register (the buffer that Intel CPUs use to handle split loads) and puts the needed part of the data from the first line into that split reg. It also signals back to the RS that it needs to be replayed. (This is guesswork.)
See also Weird performance effects from nearby dependent stores in a pointer-chasing loop on IvyBridge. Adding an extra load speeds it up? for more about uop replays. (But note that's for uops dependent on a load, not the load uop itself. I think a cache miss load sets up everything to use the data when it does arrive, without having to be replayed itself. The issue is that the scheduler aggressively schedules uops consuming the data to dispatch in the cycle when load data might arrive from L2 cache, instead of waiting one extra cycle to see if it did or not. In that Q&A, the dependent uops are also mostly loads.)
So I think even if neither cache line is present, the split-load replay should happen within a few cycles, so demand-load requests for both sides of the split can be in flight at once.
SKL has two hardware page-walk units, which is probably related to the massive improvement in 4k-split performance. Even when there are no TLB misses, presumably older CPUs had to account for the fact that there might be.
It's interesting that the 4k-split throughput is non-integer. I think my measurements had enough precision and repeatability to say this. Remember this is with every load being a 4k-split, and no other work going on (except for being inside a small dec/jnz loop). If you ever have this in real code, you're doing something really wrong.
I don't have any solid guesses at why it might be non-integer, but clearly there's a lot that has to happen microarchitecturally for a 4k-split. It's still a cache-line split, and it has to check the TLB twice.
I'm putting my slightly improved benchmark here. It still measures throughput only (and only at unaligned offset 1). Based on the other answers, I've added measurement of 64-byte and 4096-byte splits.
For 4k splits, there's a huge difference! But if the data doesn't cross a 64-byte boundary, there's no speed loss at all (at least for the two processors I've tested).
Looking at these numbers (and the numbers in the other answers), my conclusion is that unaligned access is fast on average (both throughput and latency), but there are cases when it can be much slower. But this doesn't mean that its use is discouraged.
The raw numbers produced by my benchmark should be taken with a grain of salt (it is highly likely that properly written asm code would outperform it), but these results mostly agree with harold's answer for Haswell (difference column).
Here's the code:
Testing 64-bit loads for various offsets (code below), my raw results on Haswell are:
Apply rounding as you see fit; most of them should obviously be rounded down, but .3 and .2 (from the page-boundary crossing) are perhaps too significant to be noise. This only tested loads with simple addresses, and only "pure loads", no forwarding.
I conclude that alignment within a cache line is not relevant for scalar loads; only crossing cache-line boundaries and (especially, and for obvious reasons) crossing page boundaries matter. There seems to be no difference between crossing a cache-line boundary exactly in the middle and crossing it somewhere else, in this case.
AMD occasionally has some funny effects with 16-byte boundaries but I cannot test that.
And here are the raw(!) xmm vector results, which include the effects of pextrq, so subtract 2 cycles of latency.
The testing code for vectors was largely similar, but with pextrq in the latency test, and with the data prepared at various offsets.
To focus a bit more on the new title, I'll describe what this is trying to do and why.
First off, there is a latency test. Loading a million things into eax from some pointer that isn't in eax (as the code in the question does) tests throughput, which is only half of the picture. For scalar loads that is trivial; for vector loads I used pairs of a vector load and a pextrq (a sketch of one such pair follows below). The latency of pextrq is 2, which is why the latency figures for vector loads are all 2 too high, as noted.
In order to make it easy to do this latency test, the data is a self-referential pointer. That's a fairly atypical scenario, but it shouldn't affect the timing characteristics of the loads.
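In the inline-asm style used elsewhere on this page, one link of that vector latency chain presumably looks something like this (my reconstruction from the description above; the exact registers and instructions in the original may differ):

__attribute__((noinline))
const char *chase_xmm(const char *p, long iters) {
    for (long i = 0; i < iters; i++) {
        // Load 16 bytes through the current pointer, then extract the next
        // pointer from the low 8 bytes.  pextrq adds 2 cycles of latency,
        // which is why 2 is subtracted from the vector latency figures.
        __asm__ volatile("movdqu (%0), %%xmm0\n\t"
                         "pextrq $0, %%xmm0, %0"
                         : "+r"(p) : : "xmm0");
    }
    return p;
}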
The throughput test uses two loads per loop instead of one, to avoid being bottlenecked by the loop overhead. More loads could be used, but that isn't necessary on Haswell (or anything I can think of, though in theory a µarch with a lower branch throughput or a higher load throughput could exist).
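A sketch of that shape of throughput loop (my own illustration; pass p at whatever offset is being tested):

__attribute__((noinline))
void load_throughput(const char *p, long iters) {
    for (long i = 0; i < iters; i++) {
        // Two independent loads from the same (possibly misaligned) address per
        // iteration, so the loop's one taken branch isn't the bottleneck.
        __asm__ volatile("mov (%0), %%rax\n\t"
                         "mov (%0), %%rcx"
                         : : "r"(p) : "rax", "rcx");
    }
}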
I'm not super careful about fencing in the TSC read or compensating for its overhead (or other overhead). I also didn't disable Turbo; I just let it run at turbo frequency and divided by the ratio between the TSC rate and the turbo frequency, which could affect timings a bit. All of these effects are tiny compared to a benchmark on the order of 1E7, and the results can be rounded anyway.
All times were best-of-30. Things such as average and variance are pointless on these micro-benchmarks, since the ground truth is not a random process with parameters that we want to estimate, but some fixed integer[1] (or an integer multiple of a fraction, for throughput). Almost all noise is positive, except for the (relatively theoretical) case of instructions from the benchmark "leaking" in front of the first TSC read (this could even be avoided if necessary), so taking the minimum is appropriate.
Note 1: except when crossing a 4k boundary, apparently; something strange is happening there.
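A minimal sketch of that kind of best-of-N TSC harness (my own illustration; as described above, it doesn't fence the TSC reads, and the TSC-to-core-clock ratio still has to be applied separately):

#include <x86intrin.h>

// Run 'fn' repeatedly and keep the smallest TSC delta.  Since almost all noise
// is positive, the minimum is the best estimate of the true cost.
template <class F>
unsigned long long best_of(F fn, int reps = 30) {
    unsigned long long best = ~0ULL;
    for (int r = 0; r < reps; r++) {
        unsigned long long start = __rdtsc();
        fn();
        unsigned long long stop = __rdtsc();
        if (stop - start < best) best = stop - start;
    }
    return best;   // reference (TSC) cycles; divide by the TSC/turbo ratio for core cycles
}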