I am trying to improve the performance of an algorithm. For easy comparison, I wrote two versions of the code: one is a plain scalar implementation, and the other uses SSE. However, the SSE version is 8x slower than the normal version, and I can't work out why. Could anyone point out the reason?
Normal Version (takes 2 seconds):
#include <stdio.h>
#include <pthread.h>
#include <stdlib.h>
#include <malloc.h>
// Bit-sliced counter state used by add(): bit position j of L, M and H
// together forms one small counter for bit j of the values added.
typedef struct {
    unsigned L;  // low bit of each per-position count
    unsigned M;  // middle bit of each per-position count
    unsigned H;  // sticky bit: set once a carry leaves M, never cleared by add()
} ResultCounter;
// Forward declarations; both functions are defined further down in this file.
// add: folds the bits of N into the three words of *counter.
void add(unsigned int N, ResultCounter *counter);
// resetCounter: sets every word of *counter to zero.
void resetCounter(ResultCounter *counter);
// Adds the bits of N into the bit-sliced counter *counter.
//
// For every bit position, L and M behave as two chained half adders and H
// records (permanently) any carry that leaves M.
//
// N        value whose set bits are counted
// counter  state to update in place; must not be NULL
void add(unsigned int N, ResultCounter *counter)
{
    // Snapshot the old words so both carries are derived from the
    // pre-update state.
    const unsigned int low = counter->L;
    const unsigned int mid = counter->M;

    const unsigned int carry_from_low = low & N;
    const unsigned int carry_from_mid = mid & carry_from_low;

    counter->L = low ^ N;               // low bit half adder (sum)
    counter->M = mid ^ carry_from_low;  // middle bit half adder (sum)
    counter->H |= carry_from_mid;       // last bit saturates
}
// Clears all three words (L, M, H) of *counter.
// counter  state to reset in place; must not be NULL
void resetCounter(ResultCounter *counter)
{
    *counter = (ResultCounter){ .L = 0, .M = 0, .H = 0 };
}
// Benchmark driver for the scalar version: feeds four values per iteration
// into a single heap-allocated counter and prints the final state.
// Returns 0 on success, EXIT_FAILURE if the counter cannot be allocated.
int main(void)
{
    const int last = 500000000;  // loop bound is inclusive

    ResultCounter *counter = malloc(sizeof *counter);
    if (counter == NULL) {
        perror("malloc");
        return EXIT_FAILURE;
    }
    resetCounter(counter);

    for (int i = 0; i <= last; i++) {
        // i-2 and i-3 are negative for small i; converting them to the
        // unsigned parameter wraps modulo 2^width, which is well defined.
        add(i, counter);
        add(i + 1, counter);
        add(i - 2, counter);
        add(i - 3, counter);
    }

    // The fields are unsigned int, so they must be printed with %u.
    printf("counter.L:%u,counter.M:%u,counter.H:%u\n",
           counter->L, counter->M, counter->H);

    free(counter);
    return 0;
}
SSE version (takes 16 seconds):
#include <stdio.h>
#include <pthread.h>
#include <stdlib.h>
#include <malloc.h>
#include <x86intrin.h>
// Bit-sliced counter state for the SSE version; add_sse() operates on an
// array of four of these.
typedef struct {
    unsigned L;       // low bit of each per-position count
    unsigned M;       // middle bit of each per-position count
    unsigned H;       // sticky bit: receives carries out of M via OR
    unsigned unused;  // never read by add_sse(); presumably padding to four words - confirm
} ResultCounter;
// Forward declarations; both functions are defined further down in this file.
// add_sse: updates the four counters c[0..3] with first..fourth respectively.
void add_sse (unsigned int first, unsigned int second, unsigned int third, unsigned int fourth, ResultCounter *c);
// resetCounter: zeroes every field of the four counters c[0..3].
void resetCounter(ResultCounter *c);
// Updates four bit-sliced counters at once: lane i adds its input value
// (first, second, third, fourth for i = 0..3) into c[i], using the same
// half-adder / sticky-OR scheme on the L, M and H words.
//
// first..fourth  one input value per counter
// c              array of at least four counters, updated in place;
//                the `unused` field is neither read nor written
//
// NOTE(review): every call gathers L/M/H from four separate structs into
// temporaries and scatters them back afterwards. That memory round trip is
// likely a large part of why this is slower than the scalar loop; storing the
// state as three 4-lane vectors (struct of arrays) would avoid it - confirm
// by profiling.
void add_sse(unsigned int first, unsigned int second, unsigned int third, unsigned int fourth, ResultCounter *c)
{
    // _mm_load_si128 / _mm_store_si128 require 16-byte aligned addresses.
    // The GCC attribute is spelled "aligned"; the original "align" is not a
    // known attribute and is ignored, leaving the alignment unspecified.
    __attribute__((aligned(16))) unsigned int in_lanes[4] = { first, second, third, fourth };
    __attribute__((aligned(16))) unsigned int l_lanes[4];
    __attribute__((aligned(16))) unsigned int m_lanes[4];
    __attribute__((aligned(16))) unsigned int h_lanes[4];

    // Gather: one lane per counter.
    for (int i = 0; i < 4; i++) {
        l_lanes[i] = c[i].L;
        m_lanes[i] = c[i].M;
        h_lanes[i] = c[i].H;
    }

    // The intrinsics take __m128i pointers, so the casts are required.
    __m128i N = _mm_load_si128((const __m128i *)in_lanes);
    __m128i L = _mm_load_si128((const __m128i *)l_lanes);
    __m128i M = _mm_load_si128((const __m128i *)m_lanes);
    __m128i H = _mm_load_si128((const __m128i *)h_lanes);

    // low bit half adder
    __m128i Lcarry = _mm_and_si128(L, N);
    L = _mm_xor_si128(L, N);
    // middle bit half adder
    __m128i Mcarry = _mm_and_si128(M, Lcarry);
    M = _mm_xor_si128(M, Lcarry);
    // last bit saturates
    H = _mm_or_si128(H, Mcarry);

    _mm_store_si128((__m128i *)l_lanes, L);
    _mm_store_si128((__m128i *)m_lanes, M);
    _mm_store_si128((__m128i *)h_lanes, H);

    // Scatter the updated lanes back into the caller's structs.
    for (int i = 0; i < 4; i++) {
        c[i].L = l_lanes[i];
        c[i].M = m_lanes[i];
        c[i].H = h_lanes[i];
    }
}
// Zeroes every field (L, M, H, unused) of the four counters c[0..3].
// c  array of at least four counters; must not be NULL
void resetCounter(ResultCounter *c)
{
    ResultCounter *const end = c + 4;

    for (ResultCounter *slot = c; slot != end; ++slot) {
        slot->L = 0;
        slot->M = 0;
        slot->H = 0;
        slot->unused = 0;
    }
}
// Prints the L, M and H words of each of the four counters c[0..3] to
// stdout, one counter per "***"-prefixed record.
// c  array of at least four counters; must not be NULL
void printCounter(ResultCounter *c){
    for (int i = 0; i < 4; i++) {
        // L, M and H are unsigned int, so they are printed with %u;
        // the index i is an int and keeps %d.
        printf("***\n%d::L=%u,M=%u,H=%u\n", i, c[i].L, c[i].M, c[i].H);
    }
}
// Benchmark driver for the SSE version: allocates four 16-byte-aligned
// counters, feeds them four values per iteration via add_sse(), and prints
// the final state of the first counter.
// Returns 0 on success, EXIT_FAILURE if the counters cannot be allocated.
int main(void)
{
    const int last = 500000000;  // loop bound is inclusive

    ResultCounter *c_sse = memalign(16, sizeof *c_sse * 4);
    if (c_sse == NULL) {
        perror("memalign");
        return EXIT_FAILURE;
    }
    resetCounter(c_sse);

    for (int i = 0; i <= last; i++) {
        // i-2 and i-3 are negative for small i; converting them to the
        // unsigned parameters wraps modulo 2^width, which is well defined.
        add_sse(i, i + 1, i - 2, i - 3, c_sse);
    }

    // The fields are unsigned int, so they must be printed with %u.
    printf("c_sse[0].L:%u,c_sse[0].M:%u,c_sse[0].H:%u\n",
           c_sse[0].L, c_sse[0].M, c_sse[0].H);

    free(c_sse);
    return 0;
}