gcc -O0 outperforming -O3 on matrix sizes that are powers of 2

Posted 2019-03-09 18:55

Question:

(For testing purposes) I have written a simple method to calculate the transpose of an n×n matrix:

#include <cstddef>  // for size_t

void transpose(const size_t _n, double* _A) {
    // swap the upper and lower triangles of the row-major n x n matrix in place
    for(unsigned int i=0; i < _n; ++i) {
        for(unsigned int j=i+1; j < _n; ++j) {
            double tmp  = _A[i*_n+j];
            _A[i*_n+j] = _A[j*_n+i];
            _A[j*_n+i] = tmp;
        }
    }
}

When using optimization level -O3 or -Ofast I expected the compiler to unroll some loops, which would lead to higher performance, especially when the matrix size is a multiple of 2 (i.e., an unrolled-by-two loop body can execute every iteration without a remainder) or similar. Instead, what I measured was the exact opposite: powers of 2 show a significant spike in execution time.

These spikes also occur at regular intervals of 64, are more pronounced at intervals of 128, and so on. Each spike extends to the neighboring matrix sizes, as in the following table:

size n  time(us)
1020    2649
1021    2815
1022    3100
1023    5428
1024    15791
1025    6778
1026    3106
1027    2847
1028    2660
1029    3038
1030    2613
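
For reference, a minimal timing harness in this spirit (a reconstruction for illustration, not the exact benchmark that produced the table above) could look like:

#include <chrono>
#include <cstddef>
#include <cstdio>
#include <vector>

void transpose(const size_t _n, double* _A); // the function above

int main() {
    for (size_t n = 1020; n <= 1030; ++n) {
        std::vector<double> A(n * n, 1.0);   // touch the memory before timing
        auto t0 = std::chrono::steady_clock::now();
        transpose(n, A.data());
        auto t1 = std::chrono::steady_clock::now();
        long long us = std::chrono::duration_cast<std::chrono::microseconds>(t1 - t0).count();
        std::printf("%zu\t%lld\n", n, us);
    }
}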

I compiled with gcc version 4.8.2, but the same thing happens with clang 3.5, so this might be some generic thing?

So my question basically is: why is there this periodic increase in execution time? Is it something generic that comes with any of the optimization options (it happens with clang and gcc alike)? If so, which optimization option is causing it?

And how can this be so significant that even the -O0 version of the program outperforms the -O3 version at multiples of 512?


EDIT: Note the magnitude of the spikes in this (logarithmic) plot. Transposing a 1024x1024 matrix with optimization actually takes as much time as transposing a 1300x1300 matrix without optimization. If this is a cache-fault/page-fault problem, then someone needs to explain to me why the memory layout is so significantly different for the optimized version of the program that it fails for powers of two, only to recover high performance for slightly larger matrices. Shouldn't cache faults create more of a step-like pattern? Why do the execution times go down again at all? (And why should optimization create cache faults that weren't there before?)


EDIT: The following should be the assembly code that gcc produced.

no optimization (-O0):

_Z9transposemRPd:
.LFB0:
    .cfi_startproc
    push    rbp
    .cfi_def_cfa_offset 16
    .cfi_offset 6, -16
    mov rbp, rsp
    .cfi_def_cfa_register 6
    mov QWORD PTR [rbp-24], rdi
    mov QWORD PTR [rbp-32], rsi
    mov DWORD PTR [rbp-4], 0
    jmp .L2
.L5:
    mov eax, DWORD PTR [rbp-4]
    add eax, 1
    mov DWORD PTR [rbp-8], eax
    jmp .L3
.L4:
    mov rax, QWORD PTR [rbp-32]
    mov rdx, QWORD PTR [rax]
    mov eax, DWORD PTR [rbp-4]
    imul    rax, QWORD PTR [rbp-24]
    mov rcx, rax
    mov eax, DWORD PTR [rbp-8]
    add rax, rcx
    sal rax, 3
    add rax, rdx
    mov rax, QWORD PTR [rax]
    mov QWORD PTR [rbp-16], rax
    mov rax, QWORD PTR [rbp-32]
    mov rdx, QWORD PTR [rax]
    mov eax, DWORD PTR [rbp-4]
    imul    rax, QWORD PTR [rbp-24]
    mov rcx, rax
    mov eax, DWORD PTR [rbp-8]
    add rax, rcx
    sal rax, 3
    add rdx, rax
    mov rax, QWORD PTR [rbp-32]
    mov rcx, QWORD PTR [rax]
    mov eax, DWORD PTR [rbp-8]
    imul    rax, QWORD PTR [rbp-24]
    mov rsi, rax
    mov eax, DWORD PTR [rbp-4]
    add rax, rsi
    sal rax, 3
    add rax, rcx
    mov rax, QWORD PTR [rax]
    mov QWORD PTR [rdx], rax
    mov rax, QWORD PTR [rbp-32]
    mov rdx, QWORD PTR [rax]
    mov eax, DWORD PTR [rbp-8]
    imul    rax, QWORD PTR [rbp-24]
    mov rcx, rax
    mov eax, DWORD PTR [rbp-4]
    add rax, rcx
    sal rax, 3
    add rdx, rax
    mov rax, QWORD PTR [rbp-16]
    mov QWORD PTR [rdx], rax
    add DWORD PTR [rbp-8], 1
.L3:
    mov eax, DWORD PTR [rbp-8]
    cmp rax, QWORD PTR [rbp-24]
    jb  .L4
    add DWORD PTR [rbp-4], 1
.L2:
    mov eax, DWORD PTR [rbp-4]
    cmp rax, QWORD PTR [rbp-24]
    jb  .L5
    pop rbp
    .cfi_def_cfa 7, 8
    ret
    .cfi_endproc
.LFE0:
    .size   _Z9transposemRPd, .-_Z9transposemRPd
    .ident  "GCC: (Debian 4.8.2-15) 4.8.2"
    .section    .note.GNU-stack,"",@progbits

with optimization (-O3):

_Z9transposemRPd:
.LFB0:
    .cfi_startproc
    push    rbx
    .cfi_def_cfa_offset 16
    .cfi_offset 3, -16
    xor r11d, r11d
    xor ebx, ebx
.L2:
    cmp r11, rdi
    mov r9, r11
    jae .L10
    .p2align 4,,10
    .p2align 3
.L7:
    add ebx, 1
    mov r11d, ebx
    cmp rdi, r11
    mov rax, r11
    jbe .L2
    mov r10, r9
    mov r8, QWORD PTR [rsi]
    mov edx, ebx
    imul    r10, rdi
    .p2align 4,,10
    .p2align 3
.L6:
    lea rcx, [rax+r10]
    add edx, 1
    imul    rax, rdi
    lea rcx, [r8+rcx*8]
    movsd   xmm0, QWORD PTR [rcx]
    add rax, r9
    lea rax, [r8+rax*8]
    movsd   xmm1, QWORD PTR [rax]
    movsd   QWORD PTR [rcx], xmm1
    movsd   QWORD PTR [rax], xmm0
    mov eax, edx
    cmp rdi, rax
    ja  .L6
    cmp r11, rdi
    mov r9, r11
    jb  .L7
.L10:
    pop rbx
    .cfi_def_cfa_offset 8
    ret
    .cfi_endproc
.LFE0:
    .size   _Z9transposemRPd, .-_Z9transposemRPd
    .ident  "GCC: (Debian 4.8.2-15) 4.8.2"
    .section    .note.GNU-stack,"",@progbits

Answer 1:

The periodic increase in execution time must be due to the cache being only N-way set-associative instead of fully associative. You are witnessing hash collisions related to the cache line selection algorithm.

The fastest cache, L1, has a smaller number of cache lines than the next level, L2. In each level, each cache line can be filled only from a limited set of sources.

Typical hardware implementations of the cache line selection algorithm just use a few bits of the memory address to determine in which cache slot the data should be written -- in hardware, bit shifts are free.
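
As a minimal sketch of that index selection (the geometry here -- 64-byte lines and 64 sets, i.e. a typical 32 KiB, 8-way L1 -- is an assumption for illustration):

#include <cstdint>
#include <cstdio>

// Hypothetical L1 geometry: 64-byte lines (6 offset bits), 64 sets.
unsigned cache_set(std::uintptr_t addr) {
    return (addr >> 6) & 63;   // a few middle address bits select the set
}

int main() {
    // The two example addresses below both land in set 0, so under this
    // geometry they compete for the same ways.
    std::printf("%u %u\n", cache_set(0x300010), cache_set(0x341010));
}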

This causes competition between memory ranges, e.g. between addresses 0x300010 and 0x341010. In a fully sequential algorithm this doesn't matter -- N is large enough for practically all algorithms of the form:

 for (i=0;i<1000;i++) a[i] += b[i] * c[i] + d[i];

But when the number of inputs (or outputs) gets larger, which happens internally when the algorithm is optimized, having one input in the cache forces another input out of the cache.

 // one possible method of optimization with 2 outputs and 6 inputs
 // with two unrelated execution paths -- should be faster, but maybe it isn't
 for (i=0;i<500;i++) { 
       a[i]     += b[i]     * c[i]     + d[i];
       a[i+500] += b[i+500] * c[i+500] + d[i+500];
 }

A graph in Example 5: Cache Associativity illustrates a 512-byte offset between matrix rows being a global worst-case dimension for the particular system. When this is known, a working mitigation is to over-allocate the matrix horizontally to some other dimension, e.g. char matrix[512][512 + 64].
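
In C++ terms, a sketch of that mitigation (the pad of 8 doubles, i.e. 64 bytes, is an assumed value; the right pad is system-specific):

#include <cstddef>
#include <utility>
#include <vector>

// Transpose an n x n matrix stored with a padded row stride lda > n,
// so consecutive rows no longer sit a power-of-two apart in memory.
void transpose_padded(std::size_t n, double* a, std::size_t lda) {
    for (std::size_t i = 0; i < n; ++i)
        for (std::size_t j = i + 1; j < n; ++j)
            std::swap(a[i * lda + j], a[j * lda + i]);
}

int main() {
    const std::size_t n = 1024, pad = 8;   // pad = 64 bytes worth of doubles
    std::vector<double> m(n * (n + pad));  // over-allocated matrix
    transpose_padded(n, m.data(), n + pad);
}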



Answer 2:

The improvement in performance is likely related to CPU/RAM caching.

When the data size is not a power of 2, a cache line load (of, say, 16, 32, or 64 words) transfers more than the data that is required, tying up the bus -- uselessly, as it turns out. For a data set whose size is a power of 2, all of the pre-fetched data is used.

I bet if you were to disable L1 and L2 caching, the performance would be completely smooth and predictable. But it would be much slower. Caching really helps performance!



Answer 3:

Comment with code: In the -O3 case, with

#include <cstdlib>

extern void transpose(const size_t n, double* a)
{
    for (size_t i = 0; i < n; ++i) {
        for (size_t j = i + 1; j < n; ++j) {
            std::swap(a[i * n + j], a[j * n + i]); // or your expanded version.
        }
    }
}

compiling with

$ g++ --version
g++ (Ubuntu/Linaro 4.8.1-10ubuntu9) 4.8.1
...
$ g++ -g1 -std=c++11 -Wall -o test.S -S test.cpp -O3

I get

_Z9transposemPd:
.LFB68:
    .cfi_startproc
.LBB2:
    testq   %rdi, %rdi
    je  .L1
    leaq    8(,%rdi,8), %r10
    xorl    %r8d, %r8d
.LBB3:
    addq    $1, %r8
    leaq    -8(%r10), %rcx
    cmpq    %rdi, %r8
    leaq    (%rsi,%rcx), %r9
    je  .L1
    .p2align 4,,10
    .p2align 3
.L10:
    movq    %r9, %rdx
    movq    %r8, %rax
    .p2align 4,,10
    .p2align 3
.L5:
.LBB4:
    movsd   (%rdx), %xmm1
    movsd   (%rsi,%rax,8), %xmm0
    movsd   %xmm1, (%rsi,%rax,8)
.LBE4:
    addq    $1, %rax
.LBB5:
    movsd   %xmm0, (%rdx)
    addq    %rcx, %rdx
.LBE5:
    cmpq    %rdi, %rax
    jne .L5
    addq    $1, %r8
    addq    %r10, %r9
    addq    %rcx, %rsi
    cmpq    %rdi, %r8
    jne .L10
.L1:
    rep ret
.LBE3:
.LBE2:
    .cfi_endproc

And something quite different if I add -m32.

(Note: it makes no difference to the assembly whether I use std::swap or your variant)

In order to understand what is causing the spikes, though, you probably want to visualize the memory operations going on.
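
One cheap way to do that without hardware counters is to simulate which cache set each column access would land in. A sketch (again assuming a hypothetical 64-set, 64-byte-line L1):

#include <cstddef>
#include <cstdint>
#include <cstdio>

int main() {
    const std::size_t n = 1024;        // one of the "bad" sizes
    int hits[64] = {0};
    // The column walk of the transpose touches a[j*n + i] with a stride of
    // n*8 bytes; only the stride matters for the set index, so use base 0.
    for (std::size_t j = 1; j < n; ++j) {
        std::uintptr_t addr = j * n * sizeof(double);
        ++hits[(addr >> 6) & 63];
    }
    for (int s = 0; s < 64; ++s)
        if (hits[s]) std::printf("set %2d: %d accesses\n", s, hits[s]);
    // For n = 1024 every access maps to set 0; for n = 1023 they spread out.
}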



Answer 4:

To add to the other answers, with g++ -std=c++11 -march=core2 -O3 -c -S, gcc version 4.8.2 (MacPorts gcc48 4.8.2_0), x86_64-apple-darwin13.0.0:

__Z9transposemPd:
LFB0:
        testq   %rdi, %rdi
        je      L1
        leaq    8(,%rdi,8), %r10
        xorl    %r8d, %r8d
        leaq    -8(%r10), %rcx
        addq    $1, %r8
        leaq    (%rsi,%rcx), %r9
        cmpq    %rdi, %r8
        je      L1
        .align 4,0x90
L10:
        movq    %r9, %rdx
        movq    %r8, %rax
        .align 4,0x90
L5:
        movsd   (%rdx), %xmm0
        movsd   (%rsi,%rax,8), %xmm1
        movsd   %xmm0, (%rsi,%rax,8)
        addq    $1, %rax
        movsd   %xmm1, (%rdx)
        addq    %rcx, %rdx
        cmpq    %rdi, %rax
        jne     L5
        addq    $1, %r8
        addq    %r10, %r9
        addq    %rcx, %rsi
        cmpq    %rdi, %r8
        jne     L10
L1:
        rep; ret

Basically the same as @ksfone's code, for:

#include <cstddef>

void transpose(const size_t _n, double* _A) {
    for(size_t i=0; i < _n; ++i) {
        for(size_t j=i+1; j < _n; ++j) {
            double tmp  = _A[i*_n+j];
            _A[i*_n+j] = _A[j*_n+i];
            _A[j*_n+i] = tmp;
        }
    }
}

Apart from the Mach-O 'as' differences (the extra underscore, alignment directives, and DWARF locations), it's the same. But it is very different from the OP's assembly output -- a much 'tighter' inner loop.