Using ARM NEON intrinsics to add alpha and permute

Posted 2019-04-09 06:20

Question:

I'm developing an iOS app that needs to convert images from RGB -> BGRA fairly quickly. I would like to use NEON intrinsics if possible. Is there a faster way than simply assigning the components?

#include <arm_neon.h>

void neonPermuteRGBtoBGRA(unsigned char* src, unsigned char* dst, int numPix)
{
    numPix /= 8; //process 8 pixels at a time

    uint8x8_t alpha = vdup_n_u8 (0xff);

    for (int i=0; i<numPix; i++)
    {
        uint8x8x3_t rgb  = vld3_u8 (src);
        uint8x8x4_t bgra;

        bgra.val[0] = rgb.val[2]; //these lines are slow
        bgra.val[1] = rgb.val[1]; //these lines are slow 
        bgra.val[2] = rgb.val[0]; //these lines are slow

        bgra.val[3] = alpha;

        vst4_u8(dst, bgra);

        src += 8*3;
        dst += 8*4;
    }
}
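
Note that since the loop processes 8 pixels per iteration, numPix is effectively rounded down to a multiple of 8 and the last few pixels are dropped. One way to handle that is a small wrapper with a scalar tail, roughly like this (the wrapper name and the tail loop are just illustrative):

void permuteRGBtoBGRA(unsigned char* src, unsigned char* dst, int numPix)
{
    int vecPix = numPix & ~7;                 // largest multiple of 8
    neonPermuteRGBtoBGRA(src, dst, vecPix);   // NEON path, 8 pixels at a time

    for (int i = vecPix; i < numPix; i++)     // scalar tail for the remaining 0-7 pixels
    {
        dst[4*i + 0] = src[3*i + 2];          // B
        dst[4*i + 1] = src[3*i + 1];          // G
        dst[4*i + 2] = src[3*i + 0];          // R
        dst[4*i + 3] = 0xff;                  // A
    }
}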

Answer 1:

The ARMCC disassembly isn't that fast either:

  • It isn't using the most appropriate instructions

  • It mixes VFP instructions with NEON ones, which causes huge hiccups every time

Try this:

  mov r2, r2, lsr #3        @ numPix / 8 iterations
  vmov.u8 d3, #0xff         @ constant alpha in d3
loop:
  vld3.8 {d0-d2}, [r0]!     @ deinterleave 8 pixels: d0=R, d1=G, d2=B
  subs r2, r2, #1
  vswp d0, d2               @ swap R and B
  vst4.8 {d0-d3}, [r1]!     @ store interleaved as BGRA
  bgt loop

  bx lr

My suggested code isn't fully optimized either, but further "real" optimizations would seriously harm readability, so I'll stop here.
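
If you want to keep this callable from C without a separate assembly file, the same loop can also be wrapped in gcc extended inline assembly, roughly like the sketch below (the wrapper name and operand constraints are illustrative, and it assumes numPix >= 8 since the loop body always runs at least once):

  void neonPermuteRGBtoBGRA_wrap(unsigned char* src, unsigned char* dst, int numPix)
  {
      asm volatile(
          "  lsr      %[n], %[n], #3        \n"  /* numPix / 8 iterations */
          "  vmov.i8  d3, #0xff             \n"  /* constant alpha in d3 */
          "1:                               \n"
          "  vld3.8   {d0-d2}, [%[src]]!    \n"  /* d0=R, d1=G, d2=B */
          "  subs     %[n], %[n], #1        \n"
          "  vswp     d0, d2                \n"  /* swap R and B */
          "  vst4.8   {d0-d3}, [%[dst]]!    \n"  /* store interleaved BGRA */
          "  bgt      1b                    \n"
          : [src] "+r" (src), [dst] "+r" (dst), [n] "+r" (numPix)
          :
          : "d0", "d1", "d2", "d3", "cc", "memory");
  }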



Answer 2:

This depends on the compiler. For example, when I compile the code above with armcc (5.01) and disassemble it, I get the following (I'm only showing the loop, and I moved the alpha assignment outside of it):

  18:   f420440d    vld3.8  {d4-d6}, [r0]!
  1c:   e2822001    add r2, r2, #1  ; 0x1
  20:   eeb01b45    fcpyd   d1, d5
  24:   eeb00b46    fcpyd   d0, d6
  28:   eeb02b44    fcpyd   d2, d4
  2c:   f401000d    vst4.8  {d0-d3}, [r1]!
  30:   e1520003    cmp r2, r3
  34:   bafffff7    blt 18 <neonPermuteRGBtoBGRA_armcc+0x18>

If I compile the code with gcc (4.4.3) and disassemble it again, I get:

  40:   f967 040f   vld3.8  {d16-d18}, [r7]
  44:   46d6        mov lr, sl
  46:   ecca 0b06   vstmia  sl, {d16-d18}
  4a:   9d02        ldr r5, [sp, #8]
  4c:   ed8d 8b1a   vstr    d8, [sp, #104]
  50:   3718        adds    r7, #24
  52:   e8be 000f   ldmia.w lr!, {r0, r1, r2, r3}
  56:   f108 0801   add.w   r8, r8, #1  ; 0x1
  5a:   c50f        stmia   r5!, {r0, r1, r2, r3}
  5c:   eddd 0b24   vldr    d16, [sp, #144]
  60:   e89e 0003   ldmia.w lr, {r0, r1}
  64:   edcd 0b16   vstr    d16, [sp, #88]
  68:   eddd 0b22   vldr    d16, [sp, #136]
  6c:   edcd 0b18   vstr    d16, [sp, #96]
  70:   e885 0003   stmia.w r5, {r0, r1}
  74:   ed9d 0b26   vldr    d0, [sp, #152]
  78:   9d03        ldr r5, [sp, #12]
  7a:   ed8d 0b14   vstr    d0, [sp, #80]
  7e:   cd0f        ldmia   r5!, {r0, r1, r2, r3}
  80:   46ae        mov lr, r5
  82:   465d        mov r5, fp
  84:   c50f        stmia   r5!, {r0, r1, r2, r3}
  86:   e89e 000f   ldmia.w lr, {r0, r1, r2, r3}
  8a:   e885 000f   stmia.w r5, {r0, r1, r2, r3}
  8e:   9501        str r5, [sp, #4]
  90:   465d        mov r5, fp
  92:   2100        movs    r1, #0
  94:   2220        movs    r2, #32
  96:   4620        mov r0, r4
  98:   f7ff fffe   bl  0 <memset>
  9c:   cd0f        ldmia   r5!, {r0, r1, r2, r3}
  9e:   4625        mov r5, r4
  a0:   c50f        stmia   r5!, {r0, r1, r2, r3}
  a2:   f8dd c004   ldr.w   ip, [sp, #4]
  a6:   e89c 000f   ldmia.w ip, {r0, r1, r2, r3}
  aa:   e885 000f   stmia.w r5, {r0, r1, r2, r3}
  ae:   ecd4 0b08   vldmia  r4, {d16-d19}
  b2:   f946 000f   vst4.8  {d16-d19}, [r6]
  b6:   3620        adds    r6, #32
  b8:   45c8        cmp r8, r9
  ba:   dbc1        blt.n   40 <memset+0x40>

And the execution time was 10 times faster with armcc.

If I compile the armcc-produced assembly code for the function with gcc, as inline assembly (it looks like the alpha assignment is now back in the loop :)),

  // Note: this is basic (operand-less) asm; it relies on the AAPCS placing
  // src, dst and numPix in r0, r1 and r2, and it declares no clobbers, so it
  // only works as a standalone, non-inlined function.
  void neonPermuteRGBtoBGRA_gas(unsigned char* src, unsigned char* dst,
    int numPix) {
    asm(
            "        ASR      r3,r2,#31\n"
            "        VMOV.I8  d1,#0xff\n"
            "        ADD      r2,r2,r3,LSR #29\n"
            "        ASR      r3,r2,#3\n"
            "        MOV      r2,#0\n"
            "        CMP      r3,#0\n"
            "        BLE      end\n"
            "loop:\n"
            "        VLD3.8   {d4,d5,d6},[r0]!\n"
            "        ADD      r2,r2,#1\n"
            "        CMP      r3,r2\n"
            "        VMOV.F64 d3,d5\n"
            "        VMOV.F64 d2,d6\n"
            "        VMOV.F64 d5,d1\n"
            "        VMOV.F64 d0,d4\n"
            "        VST4.8   {d2,d3,d4,d5},[r1]!\n"
            "        BGT      loop\n"
            "end:\n"
            );
  }

I get the same execution time with gcc as well.

In the end, what I suggest is to either disassemble your binary and check whether the compiler produces what you want, or to write the assembly yourself.

Btw, if you want to improve the execution time of this function even further, I suggest you look into:

  1. ARM's PLD (preload data) instruction
  2. utilizing the NEON unit in the loop as much as possible, e.g. by loop unrolling (you'll notice that the real bottleneck is the time it takes to load the data from memory); see the sketch after this list
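
For example, a rough sketch combining both ideas on top of the original intrinsics; the prefetch distance, the unroll factor and the function name are just guesses to be tuned on real hardware, and __builtin_prefetch is used here as gcc/clang's portable way to emit PLD:

  #include <arm_neon.h>

  void neonPermuteRGBtoBGRA_unrolled(unsigned char* src, unsigned char* dst, int numPix)
  {
      const uint8x8_t alpha = vdup_n_u8(0xff);
      int blocks = numPix / 16;                  // 2 x 8 pixels per iteration

      for (int i = 0; i < blocks; i++)
      {
          __builtin_prefetch(src + 192);         // hint a preload a few cache lines ahead (tune the distance)

          uint8x8x3_t rgb0 = vld3_u8(src);       // pixels 0..7
          uint8x8x3_t rgb1 = vld3_u8(src + 24);  // pixels 8..15

          uint8x8x4_t bgra0, bgra1;
          bgra0.val[0] = rgb0.val[2];            // B
          bgra0.val[1] = rgb0.val[1];            // G
          bgra0.val[2] = rgb0.val[0];            // R
          bgra0.val[3] = alpha;                  // A
          bgra1.val[0] = rgb1.val[2];
          bgra1.val[1] = rgb1.val[1];
          bgra1.val[2] = rgb1.val[0];
          bgra1.val[3] = alpha;

          vst4_u8(dst,      bgra0);
          vst4_u8(dst + 32, bgra1);

          src += 2 * 8 * 3;
          dst += 2 * 8 * 4;
      }
  }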