I'm looping over some data, computing double-precision values, and after every two __m128d
operations I want to store the results in a single __m128
of floats.
So 64+64 + 64+64 bits (two __m128d
registers) are stored into one 32+32+32+32-bit __m128
register.
I do something like this:
// Two double-precision lanes (2 x 64 bit).
__m128d v_result;
// Four single-precision lanes (4 x 32 bit).
__m128 v_result_float;
...
// (first batch of operations on v_result elided)
// Goal: place these two converted values in float lanes 0 and 1.
v_result_float = _mm_cvtpd_ps(v_result);
// (second batch of operations on v_result elided)
// Goal: place these two converted values in float lanes 2 and 3.
// NOTE(review): this assignment replaces the whole register, so the
// two values converted above are lost.
v_result_float = _mm_cvtpd_ps(v_result); ?!?
But it (obviously) overwrites the first two float "slots" every time.
How can I offset the result of _mm_cvtpd_ps
so that the second call inserts its values into the 3rd and 4th "slots"?
Here's the complete code:
// Each iteration consumes 8 doubles from pA and pB and produces 8 floats at
// pTest. Every pair of doubles is summed, clamped to
// [v_boundLower, v_boundUpper], scaled by v_rangeLn2per12 and offset by
// v_minLn2per12. Two such pairs are then narrowed to float and packed into
// one __m128 before someFunction is applied and the result is stored.
//
// _mm_cvtpd_ps writes only the two low float lanes (the two high lanes are
// zeroed), so each half is converted into its own register and the halves
// are merged with _mm_movelh_ps, which yields { lo[0], lo[1], hi[0], hi[1] }.
__m128d v_pA;
__m128d v_pB;
__m128d v_result;
__m128 v_result_lo;    // floats destined for lanes 0 and 1
__m128 v_result_hi;    // floats destined for lanes 2 and 3
__m128 v_result_float; // merged four-float vector
// NOTE(review): roundintup8 presumably rounds blockSize up to a multiple of 8,
// matching the stride of 8 elements per iteration -- confirm.
float *pCEnd = pTest + roundintup8(blockSize);
for (; pTest < pCEnd; pA += 8, pB += 8, pTest += 8) {
    // ---- elements 0..1 -> float lanes 0 and 1 of the first output vector ----
    v_pA = _mm_load_pd(pA); // aligned load: requires 16-byte aligned pA/pB
    v_pB = _mm_load_pd(pB);
    v_result = _mm_add_pd(v_pA, v_pB);
    v_result = _mm_max_pd(v_boundLower, v_result);
    v_result = _mm_min_pd(v_boundUpper, v_result);
    v_result = _mm_mul_pd(v_rangeLn2per12, v_result);
    v_result = _mm_add_pd(v_minLn2per12, v_result);
    v_result_lo = _mm_cvtpd_ps(v_result);

    // ---- elements 2..3 -> float lanes 2 and 3 of the first output vector ----
    v_pA = _mm_load_pd(pA + 2);
    v_pB = _mm_load_pd(pB + 2);
    v_result = _mm_add_pd(v_pA, v_pB);
    v_result = _mm_max_pd(v_boundLower, v_result);
    v_result = _mm_min_pd(v_boundUpper, v_result);
    v_result = _mm_mul_pd(v_rangeLn2per12, v_result);
    v_result = _mm_add_pd(v_minLn2per12, v_result);
    v_result_hi = _mm_cvtpd_ps(v_result);

    // Merge both halves instead of overwriting the first conversion.
    v_result_float = _mm_movelh_ps(v_result_lo, v_result_hi);
    v_result_float = someFunction(v_result_float);
    _mm_store_ps(pTest, v_result_float);

    // ---- elements 4..5 -> float lanes 0 and 1 of the second output vector ----
    v_pA = _mm_load_pd(pA + 4);
    v_pB = _mm_load_pd(pB + 4);
    v_result = _mm_add_pd(v_pA, v_pB);
    v_result = _mm_max_pd(v_boundLower, v_result);
    v_result = _mm_min_pd(v_boundUpper, v_result);
    v_result = _mm_mul_pd(v_rangeLn2per12, v_result);
    v_result = _mm_add_pd(v_minLn2per12, v_result);
    v_result_lo = _mm_cvtpd_ps(v_result);

    // ---- elements 6..7 -> float lanes 2 and 3 of the second output vector ----
    v_pA = _mm_load_pd(pA + 6);
    v_pB = _mm_load_pd(pB + 6);
    v_result = _mm_add_pd(v_pA, v_pB);
    v_result = _mm_max_pd(v_boundLower, v_result);
    v_result = _mm_min_pd(v_boundUpper, v_result);
    v_result = _mm_mul_pd(v_rangeLn2per12, v_result);
    v_result = _mm_add_pd(v_minLn2per12, v_result);
    v_result_hi = _mm_cvtpd_ps(v_result);

    // Merge both halves instead of overwriting the first conversion.
    v_result_float = _mm_movelh_ps(v_result_lo, v_result_hi);
    v_result_float = someFunction(v_result_float);
    _mm_store_ps(pTest + 4, v_result_float);
}