I've been trying to implement shift by vector in SSE2 intrinsics, but from experimentation and the intel intrinsic guide, it appears to only use the least-significant part of the vector.
To reword my question, given a vector {v1, v2, ..., vn} and a set of shifts {s1, s2, ..., sn}, how do I calculate a result {r1, r2, ..., rn} such that:
r1 = v1 << s1
r2 = v2 << s2
...
rn = vn << sn
since it appears that _mm_sll_epi* performs this:
r1 = v1 << s1
r2 = v2 << s1
...
rn = vn << s1
Thanks in advance.
EDIT:
Here's the code I have:
#include <iostream>
#include <cstdint>
#include <mmintrin.h>
#include <emmintrin.h>
namespace SIMD {
using namespace std;
class SSE2 {
public:
// flipped operands due to function arguments
SSE2(uint64_t a, uint64_t b, uint64_t c, uint64_t d) { low = _mm_set_epi64x(b, a); high = _mm_set_epi64x(d, c); }
uint64_t& operator[](int idx)
{
switch (idx) {
case 0:
_mm_storel_epi64((__m128i*)result, low);
return result[0];
case 1:
_mm_store_si128((__m128i*)result, low);
return result[1];
case 2:
_mm_storel_epi64((__m128i*)result, high);
return result[0];
case 3:
_mm_store_si128((__m128i*)result, high);
return result[1];
}
/* Undefined behaviour */
return 0;
}
SSE2& operator<<=(const SSE2& rhs)
{
low = _mm_sll_epi64(low, rhs.getlow());
high = _mm_sll_epi64(high, rhs.gethigh());
return *this;
}
void print()
{
uint64_t a[2];
_mm_store_si128((__m128i*)a, low);
cout << hex;
cout << a[0] << ' ' << a[1] << ' ';
_mm_store_si128((__m128i*)a, high);
cout << a[0] << ' ' << a[1] << ' ';
cout << dec;
}
__m128i getlow() const
{
return low;
}
__m128i gethigh() const
{
return high;
}
private:
__m128i low, high;
uint64_t result[2];
};
}
int main()
{
cout << "operator<<= test: vector << vector: ";
{
auto x = SIMD::SSE2(7, 8, 15, 10);
auto y = SIMD::SSE2(4, 5, 6, 7);
x.print();
y.print();
x <<= y;
if (x[0] != 112 || x[1] != 256 || x[2] != 960 || x[3] != 1280) {
cout << "FAILED: ";
x.print();
cout << endl;
} else {
cout << "PASSED" << endl;
}
}
return 0;
}
What should be happening gets results of {7 << 4 = 112, 8 << 5 = 256, 15 << 6 = 960, 10 << 7 = 1280}. The results seem to be {7 << 4 = 112, 8 << 4 = 128, 15 << 6 = 960, 15 << 6 = 640}, which isn't what I want.
Hope this helps, Jens.