sum of overlapping arrays, auto-vectorization, and

Arstechnia recently had an article Why are some programming languages faster than others. It compares Fortran and C and mentions summing arrays. In Fortran it's assumed that arrays don't overlap so that allows further optimization. In C/C++ pointers to the same type may overlap so this optimization can't be used in general. However, in C/C++ one can use the restrict or __restrict keyword to tell the compiler not to assume the pointers overlap. So I started looking into this in regards to auto-vectorization.

The following code vectorizes in GCC and MSVC

void dot_int(int *a, int *b, int *c, int n) {
    for(int i=0; i<n; i++) {
        c[i] = a[i] + b[i];
    }
}

I tested this with and without overlapping arrays and it gets the correct result. However, the way I would vectorize this loop manually with SSE does not handle overlapping arrays.

int i=0;    
for(; i<n-3; i+=4) {
    __m128i a4 = _mm_loadu_si128((__m128i*)&a[i]);
    __m128i b4 = _mm_loadu_si128((__m128i*)&b[i]);
    __m128i c4 = _mm_add_epi32(a4,b4);
    _mm_storeu_si128((__m128i*)c, c4);
}
for(; i<n; i++) {
    c[i] = a[i] + b[i];
}

Next I tried using __restrict. I assumed that since the compiler could assume the arrays don't overlap it would not handle overlapping arrays but both GCC and MSVC still get the correct result for overlapping arrays even with __restrict.

void dot_int_restrict(int * __restrict a, int * __restrict b, int * __restrict c, int n) {
    for(int i=0; i<n; i++) {
        c[i] = a[i] + b[i];
    }
}

Why does the auto-vectorized code with and without __restrict get the correct result for overlapping arrays?.

Here is the full code I used to test this:

#include <stdio.h>
#include <immintrin.h>
void dot_int(int *a, int *b, int *c, int n) {
    for(int i=0; i<n; i++) {
        c[i] = a[i] + b[i];
    }
    for(int i=0; i<8; i++) printf("%d ", c[i]); printf("\n"); 
}

void dot_int_restrict(int * __restrict a, int * __restrict b, int * __restrict c, int n) {
    for(int i=0; i<n; i++) {
        c[i] = a[i] + b[i];
    }
    for(int i=0; i<8; i++) printf("%d ", c[i]); printf("\n"); 
}

void dot_int_SSE(int *a, int *b, int *c, int n) {
    int i=0;    
    for(; i<n-3; i+=4) {
        __m128i a4 = _mm_loadu_si128((__m128i*)&a[i]);
        __m128i b4 = _mm_loadu_si128((__m128i*)&b[i]);
        __m128i c4 = _mm_add_epi32(a4,b4);
        _mm_storeu_si128((__m128i*)c, c4);
    }
    for(; i<n; i++) {
        c[i] = a[i] + b[i];
    }
    for(int i=0; i<8; i++) printf("%d ", c[i]); printf("\n"); 
}

int main() {
    const int n = 100;
    int a[] = {1,1,1,1,1,1,1,1};
    int b1[] = {1,1,1,1,1,1,1,1,1};
    int b2[] = {1,1,1,1,1,1,1,1,1};
    int b3[] = {1,1,1,1,1,1,1,1,1};

    int c[8];
    int *c1 = &b1[1];
    int *c2 = &b2[1];
    int *c3 = &b3[1];

    dot_int(a,b1,c, 8);
    dot_int_SSE(a,b1,c,8);

    dot_int(a,b1,c1, 8);
    dot_int_restrict(a,b2,c2,8);
    dot_int_SSE(a,b3,c3,8);

}

The output (from MSVC)

2 2 2 2 2 2 2 2 //no overlap default
2 2 2 2 2 2 2 2 //no overlap with manual SSE vector code
2 3 4 5 6 7 8 9 //overlap default
2 3 4 5 6 7 8 9 //overlap with restrict
3 2 2 2 1 1 1 1 //manual SSE vector code

Edit:

Here is another inserting version which produces much simpler code

void dot_int(int * __restrict a, int * __restrict b, int * __restrict c, int n) {
    a = (int*)__builtin_assume_aligned (a, 16);
    b = (int*)__builtin_assume_aligned (b, 16);
    c = (int*)__builtin_assume_aligned (c, 16);
    for(int i=0; i<n; i++) {
        c[i] = a[i] + b[i];
    }
}

标签： c++ c optimization sse auto-vectorization

3条回答

甜甜的少女心

2楼-- · 2019-01-15 19:01

If you use "restrict" with overlapping arrays, you will get undefined behaviour. That's what you get in the "overlap restrict" case. Undefined behaviour means anything can happen. And it did. Just by coincidence, the behaviour was the same as without "restrict". Absolutely correct. It falls straight under the definition of "anything can happen". Nothing to complain about.

0人赞添加讨论(0) 举报

手持菜刀，她持情操

3楼-- · 2019-01-15 19:01

I think I understand what is going on now. It turns out that MSVC and GCC give different results with __restrict. MSVC gets the correct answer with the overlap and GCC does not. I think it's fair to conclude then that MSVC is ignoring the __restrict keyword and GCC is using it to optimize further.

The output from GCC

2 2 2 2 2 2 2 2 //no overlap default
2 2 2 2 2 2 2 2 //no overlap with manual SSE vector code
2 3 4 5 6 7 8 9 //overlap without __restrict
2 2 2 2 3 2 2 2 //overlap with __restrict
3 2 2 2 1 1 1 1 //manual SSE vector code

We can produce a purely vectorized function which gives almost as many assembly lines as C doing (all the code is generated with -O3 in GCC 4.9.0) this:

void dot_int(int * __ restrict a, int * __restrict b, int * __restrict c) {
    a = (int*)__builtin_assume_aligned (a, 16);
    b = (int*)__builtin_assume_aligned (b, 16);
    c = (int*)__builtin_assume_aligned (c, 16);
    for(int i=0; i<1024; i++) {
        c[i] = a[i] + b[i];
    }
}

Produces

dot_int(int*, int*, int*):
    xorl    %eax, %eax
.L2:
    movdqa  (%rdi,%rax), %xmm0
    paffffd   (%rsi,%rax), %xmm0
    movaps  %xmm0, (%rdx,%rax)
    addq    $16, %rax
    cmpq    $4096, %rax
    jne .L2
    rep ret

However, if we remove the __restrict on a which allows a to overlap with c I have determined by looking at the assembly that

void dot_int(int * a, int * __restrict b, int * __restrict c) {
        a = (int*)__builtin_assume_aligned (a, 16);
        b = (int*)__builtin_assume_aligned (b, 16);
        c = (int*)__builtin_assume_aligned (c, 16);
        for(int i=0; i<1024; i++) {
            c[i] = a[i] + b[i];
        }
    }

Is identical to

__attribute__((optimize("no-tree-vectorize")))
inline void dot_SSE(int * __restrict a, int * __restrict b, int * __restrict c) {
    for(int i=0; i<1024; i+=4) {    
        __m128i a4 = _mm_load_si128((__m128i*)&a[i]);
        __m128i b4 = _mm_load_si128((__m128i*)&a[i]);
        __m128i c4 = _mm_add_epi32(a4,b4);
        _mm_store_si128((__m128i*)&c[i],c4);
    }
}
__attribute__((optimize("no-tree-vectorize")))
void dot_int(int * __restrict a, int * __restrict b, int * __restrict c) {
    a = (int*)__builtin_assume_aligned (a, 16);
    b = (int*)__builtin_assume_aligned (b, 16);
    c = (int*)__builtin_assume_aligned (c, 16);
    int pass = 1;
    if((c+4)<a || (a+4)<c) pass = 0;
    if(pass) {
        for(int i=0; i<1024; i++) {
            c[i] = a[i] + b[i];
        }   
    }
    else {
        dot_SSE(a,b,c);
    }
}

In other words if the a and c pointers are within 16 bytes of each other (|a-c|<4) then the code branches to a non-vectorized form. This confirms my guess that the auto-vectorized code includes both a vectorized and non-vectorized version to handle overlap.

On the overlapping arrays this gets the correct result: 2 3 4 5 6 7 8 9

0人赞添加讨论(0) 举报

来，给爷笑一个

4楼-- · 2019-01-15 19:04

I don't get what the problem is. Testing on Linux/64 bit, GCC 4.6, -O3, -mtune=native, -msse4.1 (i.e. a very old compiler/system), this code

void dot_int(int *a, int *b, int *c, int n) {
    for(int i=0; i<n; ++i) {
        c[i] = a[i] + b[i];
    }
}

compiles to this inner loop:

.L4:
    movdqu  (%rdi,%rax), %xmm1
    addl    $1, %r8d
    movdqu  (%rsi,%rax), %xmm0
    paffffd   %xmm1, %xmm0
    movdqu  %xmm0, (%rdx,%rax)
    addq    $16, %rax
    cmpl    %r8d, %r10d
    ja      .L4
    cmpl    %r9d, %ecx
    je      .L1

While this code

void dot_int_restrict(int * __restrict a, int * __restrict b, int * __restrict c, int n) {
    for(int i=0; i<n; ++i) {
        c[i] = a[i] + b[i];
    }
}

compiles to this:

.L15:
    movdqu  (%rbx,%rax), %xmm0
    addl    $1, %r8d
    paffffd   0(%rbp,%rax), %xmm0
    movdqu  %xmm0, (%r11,%rax)
    addq    $16, %rax
    cmpl    %r10d, %r8d
    jb      .L15
    addl    %r12d, %r9d
    cmpl    %r12d, %r13d
    je      .L10

As you can clearly see there's one less load. I guess it correclty estimated that there's no need to explicitely load memory before performing the sum, as the result won't overwrite anythng.

There's also room for way more optimizations -- GCC doesn't know that the parameters are f.i. 128 bit aligned, hence it must generate a huge preamble to check that there are no alignment issues (YMMV), and a postable to deal with extra unaligned parts (or less wide than 128 bits). This actually happens with both versions above. This is the complete code generated for dot_int:

dot_int:
.LFB626:
        .cfi_startproc
        testl   %ecx, %ecx
        pushq   %rbx
        .cfi_def_cfa_offset 16
        .cfi_offset 3, -16
        jle     .L1
        leaq    16(%rdx), %r11
        movl    %ecx, %r10d
        shrl    $2, %r10d
        leal    0(,%r10,4), %r9d
        testl   %r9d, %r9d
        je      .L6
        leaq    16(%rdi), %rax
        cmpl    $6, %ecx
        seta    %r8b
        cmpq    %rax, %rdx
        seta    %al
        cmpq    %r11, %rdi
        seta    %bl
        orl     %ebx, %eax
        andl    %eax, %r8d
        leaq    16(%rsi), %rax
        cmpq    %rax, %rdx
        seta    %al
        cmpq    %r11, %rsi
        seta    %r11b
        orl     %r11d, %eax
        testb   %al, %r8b
        je      .L6
        xorl    %eax, %eax
        xorl    %r8d, %r8d
        .p2align 4,,10
        .p2align 3
.L4:
        movdqu  (%rdi,%rax), %xmm1
        addl    $1, %r8d
        movdqu  (%rsi,%rax), %xmm0
        paffffd   %xmm1, %xmm0
        movdqu  %xmm0, (%rdx,%rax)
        addq    $16, %rax
        cmpl    %r8d, %r10d
        ja      .L4
        cmpl    %r9d, %ecx
        je      .L1
.L3:
        movslq  %r9d, %r8
        xorl    %eax, %eax
        salq    $2, %r8
        addq    %r8, %rdx
        addq    %r8, %rdi
        addq    %r8, %rsi
        .p2align 4,,10
        .p2align 3
.L5:
        movl    (%rdi,%rax,4), %r8d
        addl    (%rsi,%rax,4), %r8d
        movl    %r8d, (%rdx,%rax,4)
        addq    $1, %rax
        leal    (%r9,%rax), %r8d
        cmpl    %r8d, %ecx
        jg      .L5
.L1:
        popq    %rbx
        .cfi_remember_state
        .cfi_def_cfa_offset 8
        ret
.L6:
        .cfi_restore_state
        xorl    %r9d, %r9d
        jmp     .L3
        .cfi_endproc

Now in your case the ints effectively not aligned (as they're on the stack), but if you can make them aligned and tell GCC so, then you can improve code generation:

typedef int intvec __attribute__((vector_size(16)));

void dot_int_restrict_alig(intvec * restrict a, 
                           intvec * restrict b, 
                           intvec * restrict c, 
                           unsigned int n) {
    for(unsigned int i=0; i<n; ++i) {
        c[i] = a[i] + b[i];
    }
}

This generates this code, with no preamble:

dot_int_restrict_alig:
.LFB628:
        .cfi_startproc
        testl   %ecx, %ecx
        je      .L23
        subl    $1, %ecx
        xorl    %eax, %eax
        addq    $1, %rcx
        salq    $4, %rcx
        .p2align 4,,10
        .p2align 3
.L25:
        movdqa  (%rdi,%rax), %xmm0
        paffffd   (%rsi,%rax), %xmm0
        movdqa  %xmm0, (%rdx,%rax)
        addq    $16, %rax
        cmpq    %rcx, %rax
        jne     .L25
.L23:
        rep
        ret
        .cfi_endproc

Note the usage of the aligned 128 bit load instructions (movdqa, a as aligned, vs movdqu, unaligned).

0人赞添加讨论(0) 举报

sum of overlapping arrays, auto-vectorization, and

采纳回答

编辑标签

举报内容

检举类型

检举原因

检举说明(必填)

打开微信“扫一扫”，打开网页后点击屏幕右上角分享按钮

付费偷看金额在0.1-10元之间