How can I improve performance compiling for SSE and AVX?

2019-07-24 07:19发布

My new PC has a Core i7 CPU and I am running my benchmarks, including newer versions that use AVX instructions. I have installed Visual Studio 2013 to use a newer compiler, as my last one could not fully compile for full SSE SIMD operation. Below is some code used in one of my benchmarks (MPMFLOPS), and the compile and link commands used. Tests were run with the first command to use SSE instructions. When xtra is 16 or less, the benchmark produces 24.4 GFLOPS. The CPU runs at 3.9 GHz, so the result is good at 6.25 calculations per cycle, compared with a maximum of four multiplies and four adds. Increasing xtra to greater than 16 produces 2.6 GFLOPS. Reducing words to much lower values makes no difference to speed.

 /*  
 Visual Studio 2013
 C/C++ Optimizing Compiler Version 18.00.21005.1 for x64

 cl /O2 /Oi /MD /W4 /TP /EHsc /Zi /Fa /c mflops.c
 cl /O2 /Oi /MD /W4 /TP /EHsc /Zi /Fa /arch:AVX /c mflops.c

 link /LARGEADDRESSAWARE mflops.obj CPUasm.obj asmtimeavx.obj 
       BUFFEROVERFLOWU.LIB
 link Includes CPUID information with identification of AVX and timer
 */

 #include <stdio.h>
 #include <stdlib.h>
 #include "asmtimeavx.h"
 #include <windows.h>
 #include <time.h>
 #include <malloc.h>



/*
 * Single-precision MFLOPS micro-benchmark.
 *
 * Allocates `words` floats on a 16-byte boundary, fills them with 0.999999f,
 * then applies the expression
 *     x[i] = (x[i] + a)*b - (x[i] + c)*d + (x[i] + e)*f
 * to every element `xtra` times between start_time() and end_time().
 * Prints the first and last element, the elapsed seconds and the MFLOPS rate.
 *
 * Returns 0 on success, 1 if the buffer cannot be allocated.
 *
 * NOTE(review): start_time(), end_time() and `secs` are not declared in this
 * function; presumably they come from "asmtimeavx.h", with end_time() storing
 * the elapsed time in `secs` -- confirm against that header.
 */
int main()
{
    float   *x;
    float   a = 0.000020f;
    float   b = 0.999980f;
    float   c = 0.000011f;
    float   d = 1.000011f;
    float   e = 0.000012f;
    float   f = 0.999992f;
    float   mflops;
    int     i, j;
    int     xtra = 16; //  24447 MFLOPS, > 16 around 2600 MFLOPS
    int     words = 1000000;

    /* Size the buffer from the element type rather than a literal 4, and do
       the multiplication in size_t so it cannot overflow int. */
    x = (float *)_aligned_malloc((size_t)words * sizeof(float), 16);
    if (x == NULL)
    {
        /* Without this check the fill loop below would write through NULL. */
        fprintf(stderr, "Failed to allocate %d floats\n", words);
        return 1;
    }

    for (i = 0; i < words; i++) x[i] = 0.999999f;

    start_time();

    /* Timed region: xtra passes over the whole array. */
    for (j = 0; j < xtra; j++)
    {
        for (i = 0; i < words; i++)
        {
            x[i] = (x[i] + a)*b - (x[i] + c)*d + (x[i] + e)*f;
        }
    }

    end_time();

    /* The expression performs 8 floating-point operations per element
       (3 additions inside the parentheses, 3 multiplications, 1 subtraction,
       1 addition), hence the factor 8.0f. */
    mflops = (float)words * (float)xtra * 8.0f / 1000000.0f / (float)secs;

    printf("%18.8f, %18.8f, %10.7f secs, %8.2f mflops\n\n", x[0], 
            x[words-1], secs, mflops);

    _aligned_free(x);

    return 0;
}

The following shows the assembly code generated, where instructions such as mulps are full SIMD, operating on four values in 128-bit registers, whereas mulss operates on a single floating-point number (SISD).

Windows SSE
words = 1000000                  
xtra  = 16                             xtra > 16

call    start_time
npad    10
$LL6@main:
mov     rcx, rsi
mov     edx, 125000
npad    8
$LL3@main:
movups  xmm1, XMMWORD PTR [rcx]
add     rcx, 32                         movaps  xmm1, xmm2
movaps  xmm2, xmm1                      movaps  xmm0, xmm2
movaps  xmm0, xmm1                      addss   xmm2, xmm8
addps   xmm1, xmm10                     addss   xmm0, xmm6
addps   xmm2, xmm6                      addss   xmm1, xmm4
addps   xmm0, xmm8                      dec     rax
mulps   xmm1, xmm11                     mulss   xmm2, xmm9
mulps   xmm2, xmm7                      mulss   xmm0, xmm7
mulps   xmm0, xmm9                      mulss   xmm1, xmm5
subps   xmm2, xmm0                      subss   xmm1, xmm0
addps   xmm2, xmm1                      addss   xmm1, xmm2
movups  XMMWORD PTR [rcx-32], xmm2      movaps  xmm2, xmm1
movups  xmm1, XMMWORD PTR [rcx-16]      movaps  xmm0, xmm1
movaps  xmm2, xmm1                      addss   xmm1, xmm8
movaps  xmm0, xmm1                      addss   xmm2, xmm4
addps   xmm2, xmm6                      addss   xmm0, xmm6
addps   xmm0, xmm8                      mulss   xmm1, xmm9
addps   xmm1, xmm10                     mulss   xmm0, xmm7
mulps   xmm2, xmm7                      mulss   xmm2, xmm5
mulps   xmm0, xmm9                      subss   xmm2, xmm0
mulps   xmm1, xmm11                     addss   xmm2, xmm1
subps   xmm2, xmm0                      movaps  xmm3, xmm2
addps   xmm2, xmm1                      movaps  xmm0, xmm2
movups  XMMWORD PTR [rcx-16], xmm2      addss   xmm2, xmm8
dec     rdx
jne     SHORT $LL3@main                 More of the same
dec     rbx                             Loop 82 lines
jne     SHORT $LL6@main
call    end_time

Next I compiled the program to use AVX instructions, but this produced the same speed as using SSE. Following is assembly code generated. Also shown is much faster code included in that generated via Linux (GCC with Ubuntu 14.04). Linux speeds were reduced somewhat with a greater number of words but the parameters shown produced SISD type results via Windows.

Note that the Windows code is using 128-bit xmm registers but Linux employs 256-bit ymm registers. Does anyone have an explanation of what is going on, or suggestions to improve performance using this sample program?

Windows AVX                             Linux AVX
Only uses xmm registers                 Part can use ymm registers
                                        Other parts include xmm
                                        xtra = 1000000; words = 10000; 44271 MFLOPS
call    start_time                      xtra = 10000000; words = 1000; 45653 MFLOPS
npad    6                               SSE
$LL6@main:                               xtra = 1000000; words = 10000; 24492 MFLOPS
mov     rcx, rsi
mov     edx, 125000
npad    8
$LL3@main:                              .L24:
vmovups xmm0, XMMWORD PTR [rcx]         vmovaps (%rcx,%rax), %ymm6
lea     rcx, QWORD PTR [rcx+32]         addl    $1, %edx
vmovups xmm5, xmm0                      vaddps  %ymm5, %ymm6, %ymm13
vaddps  xmm1, xmm0, xmm6                vaddps  %ymm3, %ymm6, %ymm7
vaddps  xmm0, xmm0, xmm8                vaddps  %ymm1, %ymm6, %ymm6
vmulps  xmm2, xmm0, xmm9                vmulps  %ymm4, %ymm13, %ymm13
vmulps  xmm3, xmm1, xmm7                vmulps  %ymm2, %ymm7, %ymm7
vsubps  xmm4, xmm3, xmm2                vmulps  %ymm0, %ymm6, %ymm6
vaddps  xmm1, xmm5, xmm10               vsubps  %ymm7, %ymm13, %ymm7
vmulps  xmm0, xmm1, xmm11               vaddps  %ymm6, %ymm7, %ymm6
vaddps  xmm2, xmm4, xmm0                vmovaps %ymm6, (%rcx,%rax)
vmovups XMMWORD PTR [rcx-32], xmm2      addq    $32, %rax
vmovups xmm0, XMMWORD PTR [rcx-16]      cmpl    %edx, %esi
vaddps  xmm1, xmm0, xmm6                ja      .L24
vmovups xmm5, xmm0
vaddps  xmm0, xmm0, xmm8
vmulps  xmm2, xmm0, xmm9
vmulps  xmm3, xmm1, xmm7
vaddps  xmm1, xmm5, xmm10
vsubps  xmm4, xmm3, xmm2
vmulps  xmm0, xmm1, xmm11
vaddps  xmm2, xmm4, xmm0
vmovups XMMWORD PTR [rcx-16], xmm2
dec     rdx
jne     SHORT $LL3@main
dec     rbx
jne     $LL6@main
call    end_time

0条回答
登录 后发表回答