Let's say I have the following main loop
; Inner loop: one 8-float (32-byte) vector per iteration; rax is a byte offset
; applied to all three base pointers and the loop ends when it reaches zero.
.L2:
vmulps ymm1, ymm2, [rdi+rax] ; ymm1 = ymm2 * the 8 floats at [rdi+rax]
vaddps ymm1, ymm1, [rsi+rax] ; ymm1 += the 8 floats at [rsi+rax]
vmovaps [rdx+rax], ymm1 ; aligned 32-byte store of the result to [rdx+rax]
add rax, 32 ; advance the offset by one ymm vector; sets ZF when it hits 0
jne .L2 ; repeat until the offset reaches zero
The way I would time this is to put it in another long loop like this
; Timing wrapper: the outer loop (.L1) re-runs the inner loop (.L2) r8d times.
;align 32
.L1:
mov rax, rcx ; reload the offset from rcx for each pass
neg rax ; rax = -rcx, so the inner loop counts up towards zero
align 32
.L2:
vmulps ymm1, ymm2, [rdi+rax] ; ymm1 = ymm2 * the 8 floats at [rdi+rax]
vaddps ymm1, ymm1, [rsi+rax] ; ymm1 += the 8 floats at [rsi+rax]
vmovaps [rdx+rax], ymm1 ; aligned 32-byte store of the result to [rdx+rax]
add rax, 32 ; advance the offset by one ymm vector; sets ZF when it hits 0
jne .L2 ; inner loop ends when the offset reaches zero
sub r8d, 1 ; r8 contains a large integer
jnz .L1 ; run the inner loop again until the repeat count is exhausted
What I'm finding is that the alignment I choose can have a significant effect on the timing (up to ±10%). It's not clear to me how to choose the code alignment. There are three places I can think of where I might want to align the code:
- At the entry to the function (see e.g. `triad_fma_asm_repeat` in the code below).
- At the start of the outer loop (`.L1` above), which repeats my main loop.
- At the start of my main loop (`.L2` above).
Another thing I have found is that if I put another routine in my source file, changing one instruction in it (e.g. removing an instruction) can have a significant effect on the timing of the next function, even when they are independent functions. I have even seen this in the past affect a routine in another object file.
I have read section 11.5 "Alignment of code" in Agner Fog's optimizing assembly manual, but it's still not clear to me what the best way is to align my code for testing performance. He gives an example (example 11.5) of timing an inner loop, which I don't really follow.
Currently getting the highest performance from my code is a game of guessing different values and locations of alignment.
I would like to know if there is an intelligent method to choose the alignment. Should I align the inner and outer loop? Just the inner loop? The entry to the function as well? Does using short or long NOPs matter?
I'm mostly interested in Haswell, followed by SNB/IVB, and then Core2.
I have tried both NASM and YASM and have discovered that this is one area where they differ significantly. NASM only inserts one-byte NOP instructions, whereas YASM inserts multi-byte NOPs. For example, when aligning both the inner and outer loop above to 32 bytes, NASM inserted 20 NOP (0x90) instructions, whereas YASM inserted the following (from objdump):
2c: 66 66 66 66 66 66 2e data16 data16 data16 data16 data16 nopw %cs:0x0(%rax,%rax,1)
33: 0f 1f 84 00 00 00 00
3a: 00
3b: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1)
So far I have not observed a significant difference in performance with this. It appears that it's alignment that matters not the instruction length. But Agner writes in the aligning code section:
It is more efficient to use longer instructions that do nothing than to use a lot of single-byte NOP's.
If you want to play with the alignment and see the effects yourself, below you can find both the assembly and C code I use. Replace `double frequency = 3.6` with the effective frequency of your CPU. You may want to disable turbo.
; Build:  nasm -f elf64 align_asm.asm    (or: yasm -f elf64 align_asm.asm)
; Syntax: NASM/YASM, Intel operand order.  ABI: System V AMD64.

global triad_fma_asm_repeat

; Read-only constants live in .rodata rather than in front of the code in
; .text, so they are not mixed into the instruction stream being measured.
section .rodata
align 4, db 0
pi: dd 3.14159                          ; scale factor, broadcast into ymm2 by both routines

section .text
;RDI x, RSI y, RDX z, RCX n, R8 repeat
;z[i] = y[i] + 3.14159*x[i]
align 16
;-----------------------------------------------------------------------
; triad_fma_asm_repeat(float *x, float *y, float *z, const int n, int repeat)
;   Runs  z[i] = y[i] + 3.14159*x[i]  over i in [0, n), `repeat` times.
; ABI:   System V AMD64
; In:    rdi = x, rsi = y, rdx = z, ecx = n, r8d = repeat
;        n must be a multiple of 8 (one ymm vector per iteration);
;        z must be 32-byte aligned (vmovaps store).
; Out:   none - xmm0 is never written, so the value seen through a C
;        prototype that returns float is meaningless.
; Clobb: rax, rcx, rdx, rsi, rdi, r8, ymm1, ymm2, flags
; Returns immediately when n <= 0 or repeat <= 0.
;-----------------------------------------------------------------------
triad_fma_asm_repeat:
        test    r8d, r8d
        jle     .done                   ; repeat <= 0: nothing to do
        movsxd  rcx, ecx                ; n is a 32-bit int: bits 63:32 of rcx are undefined on entry
        test    rcx, rcx
        jle     .done                   ; n <= 0: the count-up-to-zero loop would run off the arrays
        shl     rcx, 2                  ; rcx = n * sizeof(float)
        add     rdi, rcx                ; move all three pointers to one-past-the-end so that a
        add     rsi, rcx                ; single negative offset (rax) indexes every array and the
        add     rdx, rcx                ; loop can terminate on the zero flag of `add`
        vbroadcastss ymm2, [rel pi]     ; ymm2 = 8 copies of the scale factor
        ; (optional experiment: `align 32` here aligns the outer loop as well)
.L1:
        mov     rax, rcx
        neg     rax                     ; rax = -4n, counts up to 0
align 32
.L2:
        vmulps  ymm1, ymm2, [rdi+rax]   ; ymm1 = pi * x[i..i+7]
        vaddps  ymm1, ymm1, [rsi+rax]   ; ymm1 += y[i..i+7]
        vmovaps [rdx+rax], ymm1         ; z[i..i+7] = ymm1
        add     rax, 32
        jne     .L2
        sub     r8d, 1
        jnz     .L1
        vzeroupper                      ; avoid AVX/SSE transition penalties in the caller
.done:
        ret
global triad_fma_store_asm_repeat
;-----------------------------------------------------------------------
; triad_fma_store_asm_repeat(float *x, float *y, float *z, const int n, int repeat)
;   Runs  z[i] = y[i] + 3.14159*x[i]  over i in [0, n), `repeat` times.
;   Variant that walks an absolute store pointer (r9) and addresses the
;   loads relative to it, ending the loop with an explicit cmp/jne.
; ABI:   System V AMD64
; In:    rdi = x, rsi = y, rdx = z, ecx = n, r8d = repeat
;        n must be a multiple of 8 (one ymm vector per iteration);
;        z must be 32-byte aligned (vmovaps store).
; Out:   none - xmm0 is never written, so the value seen through a C
;        prototype that returns float is meaningless.
; Clobb: rcx, rsi, rdi, r8, r9, ymm1, ymm2, flags
; Returns immediately when n <= 0 or repeat <= 0.
;-----------------------------------------------------------------------
align 16
triad_fma_store_asm_repeat:
        test    r8d, r8d
        jle     .done                   ; repeat <= 0: nothing to do
        movsxd  rcx, ecx                ; n is a 32-bit int: bits 63:32 of rcx are undefined on entry
        test    rcx, rcx
        jle     .done                   ; n <= 0: r9 would start at the end pointer and never match it again
        shl     rcx, 2                  ; rcx = n * sizeof(float)
        add     rcx, rdx                ; rcx = z + 4n  (end-of-z sentinel)
        sub     rdi, rdx                ; rdi = x - z, so [rdi+r9] addresses x when r9 walks z
        sub     rsi, rdx                ; rsi = y - z, likewise for y
        vbroadcastss ymm2, [rel pi]     ; ymm2 = 8 copies of the scale factor
        ; (optional experiment: `align 32` here aligns the outer loop as well)
.L1:
        mov     r9, rdx                 ; r9 = current store address, restarts at z
align 32
.L2:
        vmulps  ymm1, ymm2, [rdi+r9]    ; ymm1 = pi * x[i..i+7]
        vaddps  ymm1, ymm1, [rsi+r9]    ; ymm1 += y[i..i+7]
        vmovaps [r9], ymm1              ; z[i..i+7] = ymm1
        add     r9, 32
        cmp     r9, rcx
        jne     .L2
        sub     r8d, 1
        jnz     .L1
        vzeroupper                      ; avoid AVX/SSE transition penalties in the caller
.done:
        ret
Here is the C code I use to call the assembly routines and time them
//gcc -std=gnu99 -O3 -mavx align.c -lgomp align_asm.o -o align_avx
//gcc -std=gnu99 -O3 -mfma -mavx2 align.c -lgomp align_asm.o -o align_fma
#include <stdio.h>
#include <string.h>
#include <immintrin.h>
#include <omp.h>
float triad_fma_asm_repeat(float *x, float *y, float *z, const int n, int repeat);
float triad_fma_store_asm_repeat(float *x, float *y, float *z, const int n, int repeat);
float triad_fma_repeat(float *x, float *y, float *z, const int n, int repeat)
{
float k = 3.14159f;
int r;
for(r=0; r<repeat; r++) {
int i;
__m256 k4 = _mm256_set1_ps(k);
for(i=0; i<n; i+=8) {
_mm256_store_ps(&z[i], _mm256_add_ps(_mm256_load_ps(&x[i]), _mm256_mul_ps(k4, _mm256_load_ps(&y[i]))));
}
}
}
/*
 * Timing harness: fills x and y, computes a reference result into z2 with
 * triad_fma_repeat(), then loops forever timing the two assembly routines,
 * printing for each the achieved rate in GB/s, that rate as a percentage of
 * `peak`, and the memcmp() result of z against the reference (0 = identical).
 */
int main (void )
{
int bytes_per_cycle = 0;
/* Clock used for the peak estimate; GHz, since peak is compared to a GB/s rate. */
double frequency = 3.6;
/* Bytes-per-cycle figure chosen by the instruction set enabled at compile time. */
/* NOTE(review): presumably the peak L1 load+store bandwidth per cycle of the target CPU - confirm. */
#if (defined(__FMA__))
bytes_per_cycle = 96;
#elif (defined(__AVX__))
bytes_per_cycle = 48;
#else
bytes_per_cycle = 24;
#endif
/* Peak rate that the measured rate is reported against. */
double peak = frequency*bytes_per_cycle;
const int n =2048;
/* Separate 64-byte-aligned buffer holding the reference output. */
float* z2 = (float*)_mm_malloc(sizeof(float)*n, 64);
/* One 4096-byte-aligned allocation of 1<<18 bytes, carved into x, y, z back to back. */
char *mem = (char*)_mm_malloc(1<<18,4096);
char *a = mem;
char *b = a+n*sizeof(float);
char *c = b+n*sizeof(float);
float *x = (float*)a;
float *y = (float*)b;
float *z = (float*)c;
/* x and y receive identical contents; z starts zeroed. */
for(int i=0; i<n; i++) {
x[i] = 1.0f*i;
y[i] = 1.0f*i;
z[i] = 0;
}
int repeat = 1000000;
/* Reference result, computed once before the timing loop. */
triad_fma_repeat(x,y,z2,n,repeat);
/* Runs until the process is interrupted. */
while(1) {
double dtime, rate;
/* Clear z so a routine that stores nothing shows up as a memcmp mismatch. */
memset(z, 0, n*sizeof(float));
/* Elapsed wall time = end - start, accumulated as (-start) + end. */
dtime = -omp_get_wtime();
triad_fma_asm_repeat(x,y,z,n,repeat);
dtime += omp_get_wtime();
/* 3.0 = three float arrays touched per element; 1E-9 converts bytes/s to GB/s. */
rate = 3.0*1E-9*sizeof(float)*n*repeat/dtime;
printf("t1 rate %6.2f GB/s, efficency %6.2f%%, error %d\n", rate, 100*rate/peak, memcmp(z,z2, sizeof(float)*n));
/* Same measurement for the second assembly variant. */
memset(z, 0, n*sizeof(float));
dtime = -omp_get_wtime();
triad_fma_store_asm_repeat(x,y,z,n,repeat);
dtime += omp_get_wtime();
rate = 3.0*1E-9*sizeof(float)*n*repeat/dtime;
printf("t2 rate %6.2f GB/s, efficency %6.2f%%, error %d\n", rate, 100*rate/peak, memcmp(z,z2, sizeof(float)*n));
puts("");
}
}
I'm bothered by the following statement in the NASM manual
A final caveat: ALIGN and ALIGNB work relative to the beginning of the section, not the beginning of the address space in the final executable. Aligning to a 16-byte boundary when the section you're in is only guaranteed to be aligned to a 4-byte boundary, for example, is a waste of effort. Again, NASM does not check that the section's alignment characteristics are sensible for the use of ALIGN or ALIGNB.
I'm not sure the code segment is getting an absolute 32-byte aligned address or only a relative one.
Your loop should ideally (just about) execute in one iteration per clock cycle, having four µops (add/jne being one). A critical question is the predictability of the inner-loop branch. Up to 16 iterations it should be predicted in the timing code, being always the same, but after that you might be struggling. Firstly, to answer your question, the key alignments for timing are to ensure that neither the code after the jne .L2 nor the first instruction after .L2 crosses a 32-byte boundary. I presume that the real question is how to make it run faster, and if my guess of > 16 iterations is correct, the key objective is to make the branch prediction work. Making your measured times shorter should be easy - it is sufficient to have several branches that are all predictable. Making the final code run faster, however, depends on how the real-world values of rax vary, and this will depend also on the routine that calls the loop.
Regarding your last question about relative (within-section) alignment versus absolute (in memory at runtime) alignment - you don't have to worry too much. Just below the section of the manual you quoted, which warns about `ALIGN` not checking the section alignment, the manual adds that both `ALIGN` and `ALIGNB` call the `SECTALIGN` macro implicitly.

So basically `ALIGN` doesn't check that the alignment is sensible, but it does call the `SECTALIGN` macro so that the alignment will be sensible. In particular, all the implicit `SECTALIGN` calls should ensure that the section is aligned to the largest alignment specified by any align call.

The warning about `ALIGN` not checking then probably only applies to more obscure cases, e.g., when assembling into formats that don't support section alignment, when specifying an alignment larger than that supported by a section, or when `SECTALIGN OFF` has been called to disable `SECTALIGN`.