Fast vectorized conversion from RGB to BGRA

Published 2019-01-24 13:49

Question:

In a follow-up to some previous questions on converting RGB to RGBA, and ARGB to BGR, I would like to speed up an RGB to BGRA conversion with SSE. Assume a 32-bit machine, and that I would like to use intrinsics. I'm having difficulty aligning both the source and destination buffers to work with 128-bit registers, and I'm looking for other savvy vectorization solutions.

The routine to be vectorized is as follows...

    void RGB8ToBGRX8(int w, const void *in, void *out)
    {
        int i;
        int width = w;
        const unsigned char *src = (const unsigned char*) in;
        unsigned int *dst = (unsigned int*) out;
        unsigned int invalue, outvalue;

        for (i=0; i<width; i++, src+=3, dst++)
        {
            /* Pack one pixel as 0xAARRGGBB; on a little-endian machine the
               bytes land in memory as B, G, R, A. */
            invalue = src[0];
            outvalue = (invalue<<16);
            invalue = src[1];
            outvalue |= (invalue<<8);
            invalue = src[2];
            outvalue |= (invalue);
            *dst = outvalue | 0xff000000;
        }
    }

This routine gets used primarily for large textures (512KB), so if I can parallelize some of the operations, it may be beneficial to process more pixels at a go. Of course, I'll need to profile. :)

Edit:

My compilation arguments...

gcc -O2 main.c

Answer 1:

This is an example of using SSSE3 intrinsics to perform the requested operation. The input and output pointers must be 16-byte aligned, and it operates on a block of 16 pixels at a time.

#include <tmmintrin.h>

/* in and out must be 16-byte aligned */
void rgb_to_bgrx_sse(unsigned w, const void *in, void *out)
{
    const __m128i *in_vec = in;
    __m128i *out_vec = out;

    w /= 16;

    while (w-- > 0) {
        /*             0  1  2  3  4  5  6  7  8  9  10 11 12 13 14 15
         * in_vec[0]   Ra Ga Ba Rb Gb Bb Rc Gc Bc Rd Gd Bd Re Ge Be Rf
         * in_vec[1]   Gf Bf Rg Gg Bg Rh Gh Bh Ri Gi Bi Rj Gj Bj Rk Gk
         * in_vec[2]   Bk Rl Gl Bl Rm Gm Bm Rn Gn Bn Ro Go Bo Rp Gp Bp
         */
        __m128i in1, in2, in3;
        __m128i out;

        in1 = in_vec[0];

        out = _mm_shuffle_epi8(in1,
            _mm_set_epi8(0xff, 9, 10, 11, 0xff, 6, 7, 8, 0xff, 3, 4, 5, 0xff, 0, 1, 2));
        out = _mm_or_si128(out,
            _mm_set_epi8(0xff, 0, 0, 0, 0xff, 0, 0, 0, 0xff, 0, 0, 0, 0xff, 0, 0, 0));
        out_vec[0] = out;

        in2 = in_vec[1];

        in1 = _mm_and_si128(in1,
            _mm_set_epi8(0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0));
        out = _mm_and_si128(in2,
            _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff));
        out = _mm_or_si128(out, in1);
        out = _mm_shuffle_epi8(out,
            _mm_set_epi8(0xff, 5, 6, 7, 0xff, 2, 3, 4, 0xff, 15, 0, 1, 0xff, 12, 13, 14));
        out = _mm_or_si128(out,
            _mm_set_epi8(0xff, 0, 0, 0, 0xff, 0, 0, 0, 0xff, 0, 0, 0, 0xff, 0, 0, 0));
        out_vec[1] = out;

        in3 = in_vec[2];
        in_vec += 3;

        in2 = _mm_and_si128(in2,
            _mm_set_epi8(0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0));
        out = _mm_and_si128(in3,
            _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff));
        out = _mm_or_si128(out, in2);
        out = _mm_shuffle_epi8(out,
            _mm_set_epi8(0xff, 1, 2, 3, 0xff, 14, 15, 0, 0xff, 11, 12, 13, 0xff, 8, 9, 10));
        out = _mm_or_si128(out,
            _mm_set_epi8(0xff, 0, 0, 0, 0xff, 0, 0, 0, 0xff, 0, 0, 0, 0xff, 0, 0, 0));
        out_vec[2] = out;

        out = _mm_shuffle_epi8(in3,
            _mm_set_epi8(0xff, 13, 14, 15, 0xff, 10, 11, 12, 0xff, 7, 8, 9, 0xff, 4, 5, 6));
        out = _mm_or_si128(out,
            _mm_set_epi8(0xff, 0, 0, 0, 0xff, 0, 0, 0, 0xff, 0, 0, 0, 0xff, 0, 0, 0));
        out_vec[3] = out;

        out_vec += 4;
    }
}
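
As a side note, building the intrinsics above with GCC requires SSSE3 to be enabled (for example `gcc -O2 -mssse3 main.c`). And since the routine only handles multiples of 16 pixels, something like the wrapper below could mop up the remainder with the scalar routine from the question. This is only a sketch of mine, not part of the answer; the wrapper name is made up.

/* prototypes of the routines shown earlier in this thread */
void RGB8ToBGRX8(int w, const void *in, void *out);
void rgb_to_bgrx_sse(unsigned w, const void *in, void *out);

/* Sketch: vectorize the bulk, fall back to the scalar loop for the last
 * 0..15 pixels. Assumes in/out are 16-byte aligned. */
void rgb_to_bgrx(unsigned w, const void *in, void *out)
{
    unsigned vec_pixels = w & ~15u;   /* largest multiple of 16 <= w */

    rgb_to_bgrx_sse(vec_pixels, in, out);

    RGB8ToBGRX8((int)(w - vec_pixels),
                (const unsigned char *)in + vec_pixels * 3,
                (unsigned char *)out + vec_pixels * 4);
}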


Answer 2:

I don't have a complete understanding of what you're asking for, and I'm eagerly awaiting a proper response to your question. In the meantime, I've come up with an implementation that is roughly 8 to 10% faster on average. I'm running Win7 64-bit, using VS2010, compiling it as C++ in a release build with the fast-code optimization option.

    #pragma pack(push, 1)
    struct RGB {
        unsigned char r, g, b;
    };

    struct BGRA {
        unsigned char b, g, r, a;
    };
    #pragma pack(pop)

    void RGB8ToBGRX8(int width, const void* in, void* out)
    {
        const RGB* src = (const RGB*)in;
        BGRA* dst = (BGRA*)out;
        do {                    /* assumes width > 0 */
            dst->r = src->r;
            dst->g = src->g;
            dst->b = src->b;
            dst->a = 0xFF;
            src++;
            dst++;
        } while (--width);
    }
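
For reference, here is a minimal calling sketch of my own (the texture size and pixel values are placeholders) showing the expected allocation sizes: 3 bytes per input pixel, 4 bytes per output pixel.

    #include <stdlib.h>
    #include <string.h>

    /* prototype of the struct-based routine above */
    void RGB8ToBGRX8(int width, const void* in, void* out);

    int main(void)
    {
        const int npixels = 512 * 512;                                  /* placeholder texture size */
        unsigned char *rgb  = (unsigned char *)malloc(npixels * 3);    /* RGB8 input   */
        unsigned char *bgra = (unsigned char *)malloc(npixels * 4);    /* BGRA8 output */

        memset(rgb, 0x80, npixels * 3);     /* dummy pixel data */
        RGB8ToBGRX8(npixels, rgb, bgra);    /* width must be > 0 */

        free(rgb);
        free(bgra);
        return 0;
    }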

This may or may not help, but I hope it does. Please don't downvote me if it doesn't; I'm just trying to move this along.

My motivation for using structs is to let the compiler advance the src and dst pointers as efficiently as possible. Another motivation is to limit the number of arithmetic operations.



Answer 3:

I personally found that implementing the following gave me the best result for converting BGR-24 to ARGB-32.

This code runs at about 8.8 ms per image, whereas the 128-bit vectorized code presented above came in at 14.5 ms per image.

void PixelFix(u_int32_t *buff, unsigned char *diskmem)
{
    int i, j;
    int picptr = 0, srcptr = 0;
    int w = 1920;
    int h = 1080;

    for (j=0; j<h; j++) {
        for (i=0; i<w; i++) {
            buff[picptr++] = (diskmem[srcptr]<<24) | (diskmem[srcptr+1]<<16) | (diskmem[srcptr+2]<<8) | 0xff;
            srcptr += 3;
        }
    }
}

Previously, I had been using this routine (about 13.2 ms per image). Here, buff is an unsigned char*.

int picptr = 0;
for (j=0; j<h; j++) {
    int srcptr = (h-j-1)*w*3;  // remove if you don't want vertical flipping
    for (i=0; i<w; i++) {
        buff[picptr+3] = diskmem[srcptr++]; // b
        buff[picptr+2] = diskmem[srcptr++]; // g
        buff[picptr+1] = diskmem[srcptr++]; // r
        buff[picptr+0] = 255;               // a
        picptr += 4;
    }
}

Running on a 2012 Mac mini, 2.6 GHz Core i7.



Answer 4:

Ummm... using vImageConvert_RGB888toARGB8888 is VERY, VERY fast (a 15x speedup).

The PixelFix code above (≈6 ms per image, now on newer hardware):


  1. 6.373520 ms
  2. 6.383363 ms
  3. 6.413560 ms
  4. 6.278606 ms
  5. 6.293607 ms
  6. 6.368118 ms
  7. 6.338904 ms
  8. 6.389385 ms
  9. 6.365495 ms

Using vImageConvert_RGB888toARGB8888, threaded (on newer hardware):


  1. 0.563649 ms
  2. 0.400387 ms
  3. 0.375198 ms
  4. 0.360898 ms
  5. 0.391278 ms
  6. 0.396797 ms
  7. 0.405534 ms
  8. 0.386495 ms
  9. 0.367621 ms

Need I say more?
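
For anyone who hasn't used vImage: the call looks roughly like the sketch below. This is my own illustration rather than the poster's code; the buffer dimensions are placeholders, and it assumes the Accelerate framework's C interface for vImageConvert_RGB888toARGB8888 (source RGB888 buffer, optional per-pixel alpha buffer, constant alpha value, destination ARGB8888 buffer, premultiply flag, flags).

#include <Accelerate/Accelerate.h>
#include <stdbool.h>

/* Rough sketch: convert a w x h, tightly packed RGB888 buffer to ARGB8888
 * with a constant alpha of 0xFF. */
int rgb888_to_argb8888(const void *rgb, void *argb, unsigned w, unsigned h)
{
    vImage_Buffer src  = { (void *)rgb, h, w, (size_t)w * 3 };  /* data, height, width, rowBytes */
    vImage_Buffer dest = { argb,        h, w, (size_t)w * 4 };

    /* NULL alpha buffer -> use the constant alpha (0xFF); no premultiplication */
    vImage_Error err = vImageConvert_RGB888toARGB8888(&src, NULL, 0xFF,
                                                      &dest, false, kvImageNoFlags);
    return err == kvImageNoError ? 0 : -1;
}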