SSE performance vs normal code

I am trying to improve the performance of an algorithm. For easy comparison, I wrote two versions of the code: one is plain scalar code, the other uses SSE. However, the SSE version is 8x slower than the normal version, and I could not find the reason. Could anyone point it out for me?

Normal Version (takes 2 seconds):

#include <stdio.h>
#include <pthread.h>
#include <stdlib.h>
#include <malloc.h>

/*
 * State of the saturating counter updated by add(): three words that are
 * combined bit by bit. L is the low bit, M the middle bit, and H the top
 * bit, which saturates.
 */
typedef struct {
    unsigned L; /* low bit */
    unsigned M; /* middle bit */
    unsigned H; /* top bit (saturating) */
} ResultCounter;


/* Forward declarations of the counter helpers defined below. */
void add(unsigned int N, ResultCounter *counter);
void resetCounter(ResultCounter *counter);

/*
 * Adds N into the counter with bitwise operations: every bit position of
 * N is added to the matching bit position of the L/M/H words.
 *
 * N       - word whose set bits are added to the counter.
 * counter - counter state, updated in place.
 */
void add(unsigned int N, ResultCounter *counter)
{
    const unsigned int low = counter->L;
    const unsigned int mid = counter->M;

    /* Half adder on the low word: sum is XOR, carry is AND. */
    const unsigned int carry_low = low & N;
    counter->L = low ^ N;

    /* Half adder on the middle word, fed by the carry out of the low word. */
    const unsigned int carry_mid = mid & carry_low;
    counter->M = mid ^ carry_low;

    /* The top word only ever gains bits, so it saturates instead of wrapping. */
    counter->H |= carry_mid;
}


/* Sets all three words of the counter to zero. */
void resetCounter(ResultCounter *counter)
{
    counter->L = counter->M = counter->H = 0;
}


/*
 * Scalar benchmark driver: feeds four values per iteration into a single
 * counter and prints the final L/M/H words.
 *
 * Returns 0 on success, EXIT_FAILURE if the counter cannot be allocated.
 */
int main(void)
{
    int i;

    ResultCounter *counter = malloc(sizeof *counter);
    if (counter == NULL) {
        perror("malloc");
        return EXIT_FAILURE;
    }
    resetCounter(counter);

    for (i = 0; i <= 500000000; i++) {
        /* Negative values (i - 2, i - 3 for small i) wrap to unsigned on purpose. */
        add((unsigned int)i, counter);
        add((unsigned int)(i + 1), counter);
        add((unsigned int)(i - 2), counter);
        add((unsigned int)(i - 3), counter);
    }

    /* The fields are unsigned int, so they must be printed with %u, not %d. */
    printf("counter.L:%u,counter.M:%u,counter.H:%u\n", counter->L, counter->M, counter->H);
    free(counter);

    return 0;
}

SSE version (takes 16 seconds):

#include <stdio.h>
#include <pthread.h>
#include <stdlib.h>
#include <malloc.h>
#include <x86intrin.h>


/*
 * State of one saturating counter, as updated by add_sse(): L is the low
 * bit, M the middle bit, H the top bit. The fourth word carries no counter
 * state; it rounds the struct up to four unsigned ints.
 */
typedef struct {
    unsigned L;      /* low bit */
    unsigned M;      /* middle bit */
    unsigned H;      /* top bit */
    unsigned unused; /* padding word, only ever zeroed */
} ResultCounter;

 /* Forward declarations of the SSE counter helpers defined below. */
 void add_sse (unsigned int first, unsigned int second, unsigned int third, unsigned int fourth, ResultCounter *c);
void resetCounter(ResultCounter *c);



 /*
  * Adds four inputs to four counters in one pass. Lane k of each SSE
  * register holds the state of c[k]; first, second, third and fourth feed
  * lanes 0, 1, 2 and 3 respectively.
  *
  * c must point to at least four ResultCounter elements, which are updated
  * in place. No particular alignment of c is required.
  */
 void add_sse(unsigned int first, unsigned int second, unsigned int third, unsigned int fourth, ResultCounter *c)
 {
     unsigned int low[4];
     unsigned int mid[4];
     unsigned int high[4];
     int k;

     /*
      * Build the vectors directly from the scalar values instead of staging
      * them in stack arrays and reloading them with aligned loads, so
      * nothing here depends on the alignment of a local array.
      */
     const __m128i N = _mm_setr_epi32((int)first, (int)second, (int)third, (int)fourth);
     __m128i L = _mm_setr_epi32((int)c[0].L, (int)c[1].L, (int)c[2].L, (int)c[3].L);
     __m128i M = _mm_setr_epi32((int)c[0].M, (int)c[1].M, (int)c[2].M, (int)c[3].M);
     __m128i H = _mm_setr_epi32((int)c[0].H, (int)c[1].H, (int)c[2].H, (int)c[3].H);

     /* Low half adder, middle half adder, then the saturating top bit. */
     const __m128i Lcarry = _mm_and_si128(L, N);
     L = _mm_xor_si128(L, N);
     const __m128i Mcarry = _mm_and_si128(M, Lcarry);
     M = _mm_xor_si128(M, Lcarry);
     H = _mm_or_si128(H, Mcarry);

     /* Unaligned stores: the scratch arrays only have unsigned int alignment. */
     _mm_storeu_si128((__m128i *)low, L);
     _mm_storeu_si128((__m128i *)mid, M);
     _mm_storeu_si128((__m128i *)high, H);

     /* Scatter each lane back to its counter. */
     for (k = 0; k < 4; k++) {
         c[k].L = low[k];
         c[k].M = mid[k];
         c[k].H = high[k];
     }
 }


/* Zeroes every field, padding word included, of the four counters at c. */
void resetCounter(ResultCounter *c)
{
    ResultCounter *const end = c + 4;
    ResultCounter *p;

    for (p = c; p != end; ++p) {
        p->unused = 0;
        p->H = 0;
        p->M = 0;
        p->L = 0;
    }
}
/*
 * Prints the L, M and H words of the four counters at c, one counter per
 * record, each preceded by a "***" separator line.
 */
void printCounter(ResultCounter *c)
{
    int i;

    for (i = 0; i < 4; i++) {
        /* The fields are unsigned int, so they take %u; the index stays %d. */
        printf("***\n%d::L=%u,M=%u,H=%u\n", i, c[i].L, c[i].M, c[i].H);
    }
}

/*
 * SSE benchmark driver: feeds four values per iteration into four counters
 * through add_sse() and prints the final words of counter 0.
 *
 * Returns 0 on success, EXIT_FAILURE if the counters cannot be allocated.
 */
int main(void)
{
    int i;

    ResultCounter *c_sse = memalign(16, sizeof *c_sse * 4);
    if (c_sse == NULL) {
        perror("memalign");
        return EXIT_FAILURE;
    }
    resetCounter(c_sse);

    for (i = 0; i <= 500000000; i++) {
        /* Negative values (i - 2, i - 3 for small i) wrap to unsigned on purpose. */
        add_sse((unsigned int)i, (unsigned int)(i + 1),
                (unsigned int)(i - 2), (unsigned int)(i - 3), c_sse);
    }

    /* The fields are unsigned int, so they must be printed with %u, not %d. */
    printf("c_sse[0].L:%u,c_sse[0].M:%u,c_sse[0].H:%u\n", c_sse[0].L, c_sse[0].M, c_sse[0].H);
    free(c_sse);

    return 0;
}