I tried a few different ways of implementing this and benchmarked them with a couple of different compilers on an early Core i7 @ 2.67 GHz and a recent Haswell @ 3.6 GHz:
//
// mask_shift_0
//
// use PSHUFB (note: SSSE3 required)
//
inline __m128i mask_shift_0(uint32_t n)
{
const __m128i vmask = _mm_set1_epi8(255);
const __m128i vperm = _mm_set_epi8(112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127);
__m128i vp = _mm_add_epi8(vperm, _mm_set1_epi8(n));
return _mm_shuffle_epi8(vmask, vp);
}
//
// mask_shift_1
//
// use 16 element LUT
//
// Return a 16-byte mask whose low n bytes are 0x00 and whose remaining
// (16 - n) bytes are 0xFF, via a 16-entry lookup table.
// Precondition: n in [0, 15] (no bounds check — n == 16 reads past the LUT).
//
// static inline: plain `inline` under C99/C11 semantics requires an
// external definition in some TU, risking link errors.
static inline __m128i mask_shift_1(uint32_t n)
{
    // Row n: n leading zero bytes followed by (16 - n) bytes of -1.
    // 16-byte alignment allows the aligned load below.
    static const int8_t mask_lut[16][16] __attribute__ ((aligned(16))) = {
        { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 },
        {  0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 },
        {  0,  0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 },
        {  0,  0,  0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 },
        {  0,  0,  0,  0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 },
        {  0,  0,  0,  0,  0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 },
        {  0,  0,  0,  0,  0,  0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 },
        {  0,  0,  0,  0,  0,  0,  0, -1, -1, -1, -1, -1, -1, -1, -1, -1 },
        {  0,  0,  0,  0,  0,  0,  0,  0, -1, -1, -1, -1, -1, -1, -1, -1 },
        {  0,  0,  0,  0,  0,  0,  0,  0,  0, -1, -1, -1, -1, -1, -1, -1 },
        {  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1, -1, -1, -1, -1, -1 },
        {  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1, -1, -1, -1, -1 },
        {  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1, -1, -1, -1 },
        {  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1, -1, -1 },
        {  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1, -1 },
        {  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1 }
    };
    // const-correct cast: _mm_load_si128 takes __m128i const *.
    return _mm_load_si128((const __m128i *)&mask_lut[n]);
}
//
// mask_shift_2
//
// use misaligned load from 2 vector LUT
//
// Return a 16-byte mask whose low n bytes are 0x00 and whose remaining
// (16 - n) bytes are 0xFF, via an unaligned load sliding over a
// 32-byte two-vector table. Precondition: n in [0, 15] (n > 16 would
// read before the start of the table).
//
// static inline: plain `inline` under C99/C11 semantics requires an
// external definition in some TU, risking link errors.
static inline __m128i mask_shift_2(uint32_t n)
{
    // 16 zero bytes followed by 16 bytes of -1; loading at offset
    // (16 - n) picks up exactly n zeros then (16 - n) ones.
    static const int8_t mask_lut[32] = {
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
    };
    // const-correct cast: _mm_loadu_si128 takes __m128i const *.
    return _mm_loadu_si128((const __m128i *)(mask_lut + 16 - n));
}
//
// mask_shift_3
//
// use compare and single vector LUT
//
// Return a 16-byte mask whose low n bytes are 0x00 and whose remaining
// (16 - n) bytes are 0xFF. Works for n in [0, 16] (n == 16 yields all
// zeros), since byte i compares (i + 1) > n.
//
// static inline: plain `inline` under C99/C11 semantics requires an
// external definition in some TU, risking link errors.
static inline __m128i mask_shift_3(uint32_t n)
{
    // Byte i holds i + 1, so the signed compare below sets byte i to -1
    // exactly when i >= n.
    const __m128i vm = _mm_setr_epi8(1, 2, 3, 4, 5, 6, 7, 8,
                                     9, 10, 11, 12, 13, 14, 15, 16);
    __m128i vn = _mm_set1_epi8((int8_t)n);   // explicit narrowing, n <= 16
    return _mm_cmpgt_epi8(vm, vn);
}
//
// mask_shift_4
//
// use jump table and immediate shifts
//
// Return a 16-byte mask whose low n bytes are 0x00 and whose remaining
// (16 - n) bytes are 0xFF, via a jump table of immediate byte shifts
// (_mm_slli_si128 requires a compile-time shift count, hence the switch).
//
// Fix: the original switch had no default, so n > 15 fell off the end of
// a non-void function — undefined behavior. n >= 16 now returns all
// zeros, consistent with mask_shift_3.
//
// static inline: plain `inline` under C99/C11 semantics requires an
// external definition in some TU, risking link errors.
static inline __m128i mask_shift_4(uint32_t n)
{
    const __m128i vmask = _mm_set1_epi8(-1);
    switch (n)
    {
    case 0:
        return vmask;
    case 1:
        return _mm_slli_si128(vmask, 1);
    case 2:
        return _mm_slli_si128(vmask, 2);
    case 3:
        return _mm_slli_si128(vmask, 3);
    case 4:
        return _mm_slli_si128(vmask, 4);
    case 5:
        return _mm_slli_si128(vmask, 5);
    case 6:
        return _mm_slli_si128(vmask, 6);
    case 7:
        return _mm_slli_si128(vmask, 7);
    case 8:
        return _mm_slli_si128(vmask, 8);
    case 9:
        return _mm_slli_si128(vmask, 9);
    case 10:
        return _mm_slli_si128(vmask, 10);
    case 11:
        return _mm_slli_si128(vmask, 11);
    case 12:
        return _mm_slli_si128(vmask, 12);
    case 13:
        return _mm_slli_si128(vmask, 13);
    case 14:
        return _mm_slli_si128(vmask, 14);
    case 15:
        return _mm_slli_si128(vmask, 15);
    default:
        // n >= 16: every byte is shifted out of the vector.
        return _mm_setzero_si128();
    }
}
//
// lsb_mask_0
//
// Contributed by @Leeor/@dtb
//
// uses _mm_set_epi64x
//
// Return a 16-byte mask whose low n bytes are 0xFF and whose remaining
// (16 - n) bytes are 0x00 (note: the OPPOSITE polarity of the
// mask_shift_* functions). Precondition: n in [0, 15] — n == 16 would
// require a 64-bit shift by 64, which is undefined behavior.
//
// Fix: the original computed ~(-1LL << k); left-shifting a negative
// value is undefined behavior in C (C11 6.5.7). Rewritten with
// unsigned arithmetic, which is fully defined and yields the same bits.
//
// static inline: plain `inline` under C99/C11 semantics requires an
// external definition in some TU, risking link errors.
static inline __m128i lsb_mask_0(int n)
{
    if (n >= 8)
        // Low half fully set; high half gets the low (n - 8) bytes set.
        return _mm_set_epi64x((long long)~(~0ULL << (n - 8) * 8), -1);
    else
        // High half clear; low half gets the low n bytes set.
        return _mm_set_epi64x(0, (long long)~(~0ULL << n * 8));
}
//
// lsb_mask_1
//
// Contributed by @Leeor/@dtb
//
// same as lsb_mask_0 but uses conditional operator instead of if/else
//
// Same contract as lsb_mask_0 (low n bytes 0xFF, rest 0x00, n in
// [0, 15]) but expressed with conditional operators instead of if/else.
//
// Fix: the original computed ~(-1LL << k); left-shifting a negative
// value is undefined behavior in C (C11 6.5.7). Rewritten with
// unsigned arithmetic, which is fully defined and yields the same bits.
//
// static inline: plain `inline` under C99/C11 semantics requires an
// external definition in some TU, risking link errors.
static inline __m128i lsb_mask_1(int n)
{
    return _mm_set_epi64x(n >= 8 ? (long long)~(~0ULL << (n - 8) * 8) : 0,
                          n >= 8 ? -1 : (long long)~(~0ULL << n * 8));
}
Results were interesting:
Core i7 @ 2.67 GHz, Apple LLVM gcc 4.2.1 (gcc -O3)
mask_shift_0: 2.23377 ns
mask_shift_1: 2.14724 ns
mask_shift_2: 2.14270 ns
mask_shift_3: 2.15063 ns
mask_shift_4: 2.98304 ns
lsb_mask_0: 2.15782 ns
lsb_mask_1: 2.96628 ns
Core i7 @ 2.67 GHz, Apple clang 4.2 (clang -Os)
mask_shift_0: 1.35014 ns
mask_shift_1: 1.12789 ns
mask_shift_2: 1.04329 ns
mask_shift_3: 1.09258 ns
mask_shift_4: 2.01478 ns
lsb_mask_0: 1.70573 ns
lsb_mask_1: 1.84337 ns
Haswell E3-1285 @ 3.6 GHz, gcc 4.7.2 (gcc -O2)
mask_shift_0: 0.851416 ns
mask_shift_1: 0.575245 ns
mask_shift_2: 0.577746 ns
mask_shift_3: 0.850086 ns
mask_shift_4: 1.398270 ns
lsb_mask_0: 1.359660 ns
lsb_mask_1: 1.709720 ns
So mask_shift_4
(switch/case) seems to be the slowest method in all cases, whereas the others are pretty similar. The LUT-based methods seem to be consistently the fastest overall.
NB: I get some suspiciously fast numbers with clang -O3
and gcc -O3
(gcc 4.7.2 only) - I need to look at the generated assembly for these cases to see what the compiler is doing, and make sure it is not doing anything "clever", such as optimising away some part of the timing test harness.
If anyone else has any further ideas on this or has another mask_shift implementation they'd like to try I would be happy to add it to the test suite and update the results.