SSE: How to reduce a _m128i._i32[4] to _m128i._i8

2019-05-12 01:04发布

I'm very new to SSE coding, and I want to store the results of four __m128i vectors holding int32 values (__m128i[4]) in a single __m128i holding int8 values. (The values of __m128i[j]._i32[k] are all between -127 and +127.)

I think in pseudo-code this is the following:

 result._i8 = { 
           vec1._i8[0], vec1._i8[4], vec1._i8[8], vec1._i8[12],
           vec2._i8[0], vec2._i8[4], vec2._i8[8], vec2._i8[12],
           vec3._i8[0], vec3._i8[4], vec3._i8[8], vec3._i8[12],          
           vec4._i8[0], vec4._i8[4], vec4._i8[8], vec4._i8[12]};

The only way I found is this messy shuffling:

    // mmResultA_B is the int32 vector whose four low bytes should end up in
    // bytes A..B of the final 16-byte mmResult.
    __m128i mmResult, mmResult0_3, mmResult4_7, mmResult8_11, mmResult12_15;
    //some calculation ...

    // Shuffle controls. _mm_set_epi8 lists bytes from the highest (15) down to
    // the lowest (0). Indices 0, 4, 8, 12 select the low byte of each int32
    // lane; -1 has its top bit set, which makes _mm_shuffle_epi8 write zero to
    // that destination byte. Each mask targets a different 4-byte slot.
    __m128i mmShuffler0_3 = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,-1,12, 8, 4, 0);
    __m128i mmShuffler4_7 = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, 12, 8, 4, 0, -1, -1, -1, -1);
    __m128i mmShuffler8_11 = _mm_set_epi8(-1, -1, -1, -1, 12, 8, 4, 0, -1, -1, -1, -1, -1, -1, -1, -1);
    __m128i mmShuffler12_15 = _mm_set_epi8(12, 8, 4, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);

    // Move each vector's four low bytes into its own slot; every other byte
    // of the shuffled vector becomes zero.
    mmResult0_3 = _mm_shuffle_epi8(mmResult0_3, mmShuffler0_3);
    mmResult4_7 = _mm_shuffle_epi8(mmResult4_7, mmShuffler4_7);
    mmResult8_11 = _mm_shuffle_epi8(mmResult8_11, mmShuffler8_11);
    mmResult12_15 = _mm_shuffle_epi8(mmResult12_15, mmShuffler12_15);

    // The four shuffled vectors occupy disjoint byte slots, so OR-ing them
    // merges them without any interference.
    mmResult = _mm_or_si128(_mm_or_si128(mmResult0_3, mmResult4_7), _mm_or_si128(mmResult8_11, mmResult12_15));

Is there a more elegant way of doing this? Is there a faster way?

标签: c++ x86 sse simd
1条回答
霸刀☆藐视天下
2楼-- · 2019-05-12 01:16

You can just use packing operations to reduce your 32 bit values to 8 bits, e.g. assuming you have 4 vectors of 32 bit int elements, v0...v3:

// Both pack steps use signed saturation: values already within [-128, 127]
// pass through unchanged, anything outside is clamped rather than truncated.
// Resulting byte order is v0[0..3], v1[0..3], v2[0..3], v3[0..3].
__m128i v01 = _mm_packs_epi32(v0, v1);        // pack v0, v1 to 16 bits
__m128i v23 = _mm_packs_epi32(v2, v3);        // pack v2, v3 to 16 bits
__m128i v0123 = _mm_packs_epi16(v01, v23);    // pack v0...v3 to 8 bits

Example:

#include <emmintrin.h>
#include <xmmintrin.h>
#include <stdio.h>

/* Print `label`, then the four signed 32-bit lanes of `v`, lowest lane first. */
static void print_epi32(const char *label, __m128i v)
{
    int lanes[4];   /* int is 32 bits wide on the x86 ABIs that provide SSE2 */

    /* Spill to memory: ISO C printf has no vector conversion specifier. */
    _mm_storeu_si128((__m128i *)lanes, v);

    printf("%s =", label);
    for (int i = 0; i < 4; i++) {
        printf(" %d", lanes[i]);
    }
    printf("\n");
}

/* Print `label`, then the sixteen signed 8-bit lanes of `v`, lowest lane first. */
static void print_epi8(const char *label, __m128i v)
{
    signed char lanes[16];

    _mm_storeu_si128((__m128i *)lanes, v);

    printf("%s =", label);
    for (int i = 0; i < 16; i++) {
        printf(" %d", lanes[i]);
    }
    printf("\n");
}

/*
 * Demonstrates narrowing four vectors of 32-bit ints to a single vector of
 * 8-bit ints with two rounds of signed-saturating packs.
 *
 * Returns 0.
 */
int main(void)
{
    __m128i v0 = _mm_setr_epi32(-8, -7, -6, -5);
    __m128i v1 = _mm_setr_epi32(-4, -3, -2, -1);
    __m128i v2 = _mm_setr_epi32( 0,  1,  2,  3);
    __m128i v3 = _mm_setr_epi32( 4,  5,  6,  7);

    print_epi32("v0", v0);
    print_epi32("v1", v1);
    print_epi32("v2", v2);
    print_epi32("v3", v3);

    __m128i v01 = _mm_packs_epi32(v0, v1);        // pack v0, v1 to 16 bits
    __m128i v23 = _mm_packs_epi32(v2, v3);        // pack v2, v3 to 16 bits
    __m128i v0123 = _mm_packs_epi16(v01, v23);    // pack v0...v3 to 8 bits

    /* The original passed the undeclared name `v01234` here. */
    print_epi8("v0123", v0123);

    return 0;
}

Compile and test:

$ gcc -Wall pack_32_8.c && ./a.out
v0 = -8 -7 -6 -5
v1 = -4 -3 -2 -1
v2 = 0 1 2 3
v3 = 4 5 6 7
v0123 = -8 -7 -6 -5 -4 -3 -2 -1 0 1 2 3 4 5 6 7
查看更多
登录 后发表回答