I'm developing an iOS app that needs to convert images from RGB -> BGRA fairly quickly. I would like to use NEON intrinsics if possible. Is there a faster way than simply assigning the components?
void neonPermuteRGBtoBGRA(unsigned char* src, unsigned char* dst, int numPix)
{
numPix /= 8; //process 8 pixels at a time
uint8x8_t alpha = vdup_n_u8 (0xff);
for (int i=0; i<numPix; i++)
{
uint8x8x3_t rgb = vld3_u8 (src);
uint8x8x4_t bgra;
bgra.val[0] = rgb.val[2]; //these lines are slow
bgra.val[1] = rgb.val[1]; //these lines are slow
bgra.val[2] = rgb.val[0]; //these lines are slow
bgra.val[3] = alpha;
vst4_u8(dst, bgra);
src += 8*3;
dst += 8*4;
}
}
The ARMCC disassembly isn't that fast either :
Try this :
mov r2, r2, lsr #3
vmov.u8, d3, #0xff
loop:
vld3.8 {d0-d2}, [r0]!
subs r2, r2, #1
vswp d0, d2
vst4.8 {d0-d3}, [r1]!
bgt loop
bx lr
My suggested code isn't fully optimized either, but further "real" optimizations would harm the readability seriously. So I stop here.
This depends on the compiler. For example when I compile the code above with armcc (5.01) and disassemble it, what I get looks like (I'm just putting the loop and I moved alpha assignment outside of the loop)
18: f420440d vld3.8 {d4-d6}, [r0]!
1c: e2822001 add r2, r2, #1 ; 0x1
20: eeb01b45 fcpyd d1, d5
24: eeb00b46 fcpyd d0, d6
28: eeb02b44 fcpyd d2, d4
2c: f401000d vst4.8 {d0-d3}, [r1]!
30: e1520003 cmp r2, r3
34: bafffff7 blt 18 <neonPermuteRGBtoBGRA_armcc+0x18>
If I compile the code with gcc (4.4.3) and disassemble again I get,
40: f967 040f vld3.8 {d16-d18}, [r7]
44: 46d6 mov lr, sl
46: ecca 0b06 vstmia sl, {d16-d18}
4a: 9d02 ldr r5, [sp, #8]
4c: ed8d 8b1a vstr d8, [sp, #104]
50: 3718 adds r7, #24
52: e8be 000f ldmia.w lr!, {r0, r1, r2, r3}
56: f108 0801 add.w r8, r8, #1 ; 0x1
5a: c50f stmia r5!, {r0, r1, r2, r3}
5c: eddd 0b24 vldr d16, [sp, #144]
60: e89e 0003 ldmia.w lr, {r0, r1}
64: edcd 0b16 vstr d16, [sp, #88]
68: eddd 0b22 vldr d16, [sp, #136]
6c: edcd 0b18 vstr d16, [sp, #96]
70: e885 0003 stmia.w r5, {r0, r1}
74: ed9d 0b26 vldr d0, [sp, #152]
78: 9d03 ldr r5, [sp, #12]
7a: ed8d 0b14 vstr d0, [sp, #80]
7e: cd0f ldmia r5!, {r0, r1, r2, r3}
80: 46ae mov lr, r5
82: 465d mov r5, fp
84: c50f stmia r5!, {r0, r1, r2, r3}
86: e89e 000f ldmia.w lr, {r0, r1, r2, r3}
8a: e885 000f stmia.w r5, {r0, r1, r2, r3}
8e: 9501 str r5, [sp, #4]
90: 465d mov r5, fp
92: 2100 movs r1, #0
94: 2220 movs r2, #32
96: 4620 mov r0, r4
98: f7ff fffe bl 0 <memset>
9c: cd0f ldmia r5!, {r0, r1, r2, r3}
9e: 4625 mov r5, r4
a0: c50f stmia r5!, {r0, r1, r2, r3}
a2: f8dd c004 ldr.w ip, [sp, #4]
a6: e89c 000f ldmia.w ip, {r0, r1, r2, r3}
aa: e885 000f stmia.w r5, {r0, r1, r2, r3}
ae: ecd4 0b08 vldmia r4, {d16-d19}
b2: f946 000f vst4.8 {d16-d19}, [r6]
b6: 3620 adds r6, #32
b8: 45c8 cmp r8, r9
ba: dbc1 blt.n 40 <memset+0x40>
And the execution time was 10 times faster with armcc.
If I compile armcc produced assembly code for the function (it looks like now alpha is back in loop :)) with gcc (inline assembly)
void neonPermuteRGBtoBGRA_gas(unsigned char* src, unsigned char* dst,
int numPix) {
asm(
" ASR r3,r2,#31\n"
" VMOV.I8 d1,#0xff\n"
" ADD r2,r2,r3,LSR #29\n"
" ASR r3,r2,#3\n"
" MOV r2,#0\n"
" CMP r3,#0\n"
" BLE end\n"
"loop:\n"
" VLD3.8 {d4,d5,d6},[r0]!\n"
" ADD r2,r2,#1\n"
" CMP r3,r2\n"
" VMOV.F64 d3,d5\n"
" VMOV.F64 d2,d6\n"
" VMOV.F64 d5,d1\n"
" VMOV.F64 d0,d4\n"
" VST4.8 {d2,d3,d4,d5},[r1]!\n"
" BGT loop\n"
"end:\n"
);
}
I get the same execution time with gcc as well.
At the end what I suggest you is either disassemble your binary and check if the compiler produces what you want or use assembly.
Btw if you want to improve the execution time of this function even further, I suggest you to look into
- arm's PLD (preload data) instruction
- utilize all the possible neon instructions in the loop, like loop unrolling (you'll notice that actually bandwidth will be the data load time from memory)