In a follow-up to some previous questions on converting RGB to RGBA and ARGB to BGR, I would like to speed up an RGB to BGRA conversion with SSE. Assume a 32-bit machine; I would like to use intrinsics. I'm having difficulty aligning both the source and destination buffers to work with 128-bit registers, and I'm also open to other savvy vectorization solutions.
The routine to be vectorized is as follows...
void RGB8ToBGRX8(int w, const void *in, void *out)
{
    int i;
    int width = w;
    const unsigned char *src = (const unsigned char *) in;
    unsigned int *dst = (unsigned int *) out;
    unsigned int invalue, outvalue;

    for (i = 0; i < width; i++, src += 3, dst++)
    {
        invalue = src[0];
        outvalue = (invalue << 16);
        invalue = src[1];
        outvalue |= (invalue << 8);
        invalue = src[2];
        outvalue |= (invalue);
        *dst = outvalue | 0xff000000;
    }
}
This routine gets used primarily for large textures (512 KB), so if I can parallelize some of the operations, it may be beneficial to process more pixels at a time. Of course, I'll need to profile. :)
Edit:
My compilation arguments...
gcc -O2 main.c
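As for the alignment difficulty: here is a minimal sketch of how the buffers could be allocated 16-byte aligned with posix_memalign (the helper name alloc16 and the sizes are my own illustration, not part of the original code):

#include <stdlib.h>

/* Hypothetical helper: return a 16-byte-aligned buffer, or NULL on failure. */
static void *alloc16(size_t bytes)
{
    void *p = NULL;
    if (posix_memalign(&p, 16, bytes) != 0)
        return NULL;
    return p;
}

/* Example: buffers for n pixels of RGB in and BGRA out.
 * unsigned char *src = alloc16(n * 3);
 * unsigned int  *dst = alloc16(n * 4);
 */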
This is an example of using SSSE3 intrinsics to perform the requested operation. The input and output pointers must be 16-byte aligned, and it operates on a block of 16 pixels at a time.
#include <tmmintrin.h>

/* in and out must be 16-byte aligned */
void rgb_to_bgrx_sse(unsigned w, const void *in, void *out)
{
    const __m128i *in_vec = in;
    __m128i *out_vec = out;

    w /= 16;

    while (w-- > 0) {
        /*             0  1  2  3  4  5  6  7  8  9  10 11 12 13 14 15
         * in_vec[0]   Ra Ga Ba Rb Gb Bb Rc Gc Bc Rd Gd Bd Re Ge Be Rf
         * in_vec[1]   Gf Bf Rg Gg Bg Rh Gh Bh Ri Gi Bi Rj Gj Bj Rk Gk
         * in_vec[2]   Bk Rl Gl Bl Rm Gm Bm Rn Gn Bn Ro Go Bo Rp Gp Bp
         */
        __m128i in1, in2, in3;
        __m128i out;

        /* Pixels a-d live entirely in in_vec[0]: shuffle each triple
         * into B G R x order, then OR 0xff into the alpha bytes. */
        in1 = in_vec[0];

        out = _mm_shuffle_epi8(in1,
            _mm_set_epi8(0xff, 9, 10, 11, 0xff, 6, 7, 8, 0xff, 3, 4, 5, 0xff, 0, 1, 2));
        out = _mm_or_si128(out,
            _mm_set_epi8(0xff, 0, 0, 0, 0xff, 0, 0, 0, 0xff, 0, 0, 0, 0xff, 0, 0, 0));
        out_vec[0] = out;

        /* Pixels e-h straddle in_vec[0] and in_vec[1]: mask out the high
         * half of in1 and the low half of in2, merge, then shuffle. */
        in2 = in_vec[1];

        in1 = _mm_and_si128(in1,
            _mm_set_epi8(0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0));
        out = _mm_and_si128(in2,
            _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff));
        out = _mm_or_si128(out, in1);
        out = _mm_shuffle_epi8(out,
            _mm_set_epi8(0xff, 5, 6, 7, 0xff, 2, 3, 4, 0xff, 15, 0, 1, 0xff, 12, 13, 14));
        out = _mm_or_si128(out,
            _mm_set_epi8(0xff, 0, 0, 0, 0xff, 0, 0, 0, 0xff, 0, 0, 0, 0xff, 0, 0, 0));
        out_vec[1] = out;

        /* Pixels i-l straddle in_vec[1] and in_vec[2]: same merge trick. */
        in3 = in_vec[2];
        in_vec += 3;

        in2 = _mm_and_si128(in2,
            _mm_set_epi8(0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0));
        out = _mm_and_si128(in3,
            _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff));
        out = _mm_or_si128(out, in2);
        out = _mm_shuffle_epi8(out,
            _mm_set_epi8(0xff, 1, 2, 3, 0xff, 14, 15, 0, 0xff, 11, 12, 13, 0xff, 8, 9, 10));
        out = _mm_or_si128(out,
            _mm_set_epi8(0xff, 0, 0, 0, 0xff, 0, 0, 0, 0xff, 0, 0, 0, 0xff, 0, 0, 0));
        out_vec[2] = out;

        /* Pixels m-p live entirely in in_vec[2]. */
        out = _mm_shuffle_epi8(in3,
            _mm_set_epi8(0xff, 13, 14, 15, 0xff, 10, 11, 12, 0xff, 7, 8, 9, 0xff, 4, 5, 6));
        out = _mm_or_si128(out,
            _mm_set_epi8(0xff, 0, 0, 0, 0xff, 0, 0, 0, 0xff, 0, 0, 0, 0xff, 0, 0, 0));
        out_vec[3] = out;
        out_vec += 4;
    }
}
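Note that since w /= 16 discards any remainder, a width that isn't a multiple of 16 leaves trailing pixels unconverted. A minimal sketch of a wrapper (the name rgb_to_bgrx is my own) that runs the SSE path on the bulk of the image and falls back to the scalar RGB8ToBGRX8 from the question for the tail:

void rgb_to_bgrx(unsigned w, const void *in, void *out)
{
    unsigned bulk = w & ~15u;   /* largest multiple of 16 <= w */

    rgb_to_bgrx_sse(bulk, in, out);

    /* Convert the 0-15 leftover pixels with the scalar routine. */
    RGB8ToBGRX8((int)(w - bulk),
                (const unsigned char *)in + bulk * 3,
                (unsigned char *)out + bulk * 4);
}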
I don't have a complete understanding of what you're asking for, and I'm eagerly awaiting a proper answer to your question. In the meantime, I've come up with an implementation that is roughly 8 to 10% faster on average. I'm running Win7 64-bit, using VS2010, compiling as C++ in a release build with the "fast" optimization option.
#pragma pack(push, 1)
struct RGB {
    unsigned char r, g, b;
};

struct BGRA {
    unsigned char b, g, r, a;
};
#pragma pack(pop)

void RGB8ToBGRX8(int width, const void* in, void* out)
{
    const RGB* src = (const RGB*)in;
    BGRA* dst = (BGRA*)out;

    while (width-- > 0) {   /* guards against width == 0, unlike do/while */
        dst->r = src->r;
        dst->g = src->g;
        dst->b = src->b;
        dst->a = 0xFF;
        src++;
        dst++;
    }
}
This may or may not help, but I hope it does. Please don't downvote me if it doesn't; I'm just trying to move this along.
My motivation for using structs is to allow the compiler to advance the src and dst pointers as efficiently as possible. Another motivation is to limit the number of arithmetic operations.
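Since the byte-for-byte copy relies on #pragma pack actually producing 3- and 4-byte structs, a compile-time check can guard that assumption (these static_assert lines are my addition, assuming a C++11-capable compiler such as VS2010):

static_assert(sizeof(RGB)  == 3, "RGB must be tightly packed");
static_assert(sizeof(BGRA) == 4, "BGRA must be tightly packed");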
I personally found that implementing the following gave me the best results for converting BGR-24 to ARGB-32.
This code runs at about 8.8 ms per image, whereas the 128-bit vectorization code presented above came in at 14.5 ms per image.
#include <sys/types.h>

/* Convert packed BGR-24 from diskmem into 32-bit ARGB pixels in buff. */
void PixelFix(u_int32_t *buff, unsigned char *diskmem)
{
    int i, j;
    int picptr = 0, srcptr = 0;   /* must start at 0; these were uninitialized */
    int w = 1920;
    int h = 1080;

    for (j = 0; j < h; j++) {
        for (i = 0; i < w; i++) {
            /* cast avoids shifting a byte >= 0x80 into the sign bit */
            buff[picptr++] = ((u_int32_t)diskmem[srcptr] << 24) |
                             (diskmem[srcptr+1] << 16) |
                             (diskmem[srcptr+2] << 8) |
                             0xff;
            srcptr += 3;
        }
    }
}
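As a quick sanity check on the byte order (a standalone snippet of my own): the value (B<<24)|(G<<16)|(R<<8)|0xff, stored on a little-endian machine, lays out in memory as FF R G B, i.e. alpha first:

#include <stdio.h>
#include <sys/types.h>

int main(void)
{
    unsigned char b = 0x10, g = 0x20, r = 0x30;
    u_int32_t px = ((u_int32_t)b << 24) | (g << 16) | (r << 8) | 0xff;
    const unsigned char *p = (const unsigned char *)&px;

    /* On a little-endian machine this prints: ff 30 20 10 (A R G B) */
    printf("%02x %02x %02x %02x\n", p[0], p[1], p[2], p[3]);
    return 0;
}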
Previously, I had been using this routine (about 13.2 ms per image). Here, buff is an unsigned char*.
int i, j, picptr = 0;   /* picptr must start at 0 */

for (j = 0; j < h; j++) {
    int srcptr = (h-j-1)*w*3; // remove if you don't want vertical flipping
    for (i = 0; i < w; i++) {
        buff[picptr+3] = diskmem[srcptr++]; // b
        buff[picptr+2] = diskmem[srcptr++]; // g
        buff[picptr+1] = diskmem[srcptr++]; // r
        buff[picptr+0] = 255;               // a
        picptr += 4;
    }
}
Running on a 2012 Mac Mini with a 2.6 GHz Core i7.
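For anyone wanting to reproduce these per-image numbers, here is a minimal timing sketch. The frame size, iteration count, and use of clock_gettime (available on Linux and macOS 10.12+) are my assumptions, not the original benchmark:

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <sys/types.h>

void PixelFix(u_int32_t *buff, unsigned char *diskmem);

int main(void)
{
    enum { W = 1920, H = 1080, RUNS = 100 };
    unsigned char *bgr  = malloc((size_t)W * H * 3);           /* contents don't matter for timing */
    u_int32_t     *argb = malloc((size_t)W * H * sizeof(u_int32_t));
    struct timespec t0, t1;
    int r;
    double ms;

    clock_gettime(CLOCK_MONOTONIC, &t0);
    for (r = 0; r < RUNS; r++)
        PixelFix(argb, bgr);
    clock_gettime(CLOCK_MONOTONIC, &t1);

    ms = ((t1.tv_sec - t0.tv_sec) * 1e9 + (t1.tv_nsec - t0.tv_nsec)) / 1e6;
    printf("%.2f ms per image\n", ms / RUNS);

    free(bgr);
    free(argb);
    return 0;
}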