To compare the speed of Cython with SIMD intrinsics (AVX) against NumPy methods (which, as far as I know, are also vectorized), I have built this simple sum function:
import time
import numpy as np
cimport numpy as np
cimport cython

cdef extern from 'immintrin.h':
    # AVX intrinsics for 256-bit vectors of 4 doubles
    ctypedef double __m256d
    __m256d __cdecl _mm256_load_pd(const double *to_load) nogil
    void __cdecl _mm256_store_pd(double *to_store, __m256d __M) nogil
    __m256d __cdecl _mm256_add_pd(__m256d __M1, __m256d __M2) nogil

@cython.boundscheck(False)  # Deactivate bounds checking
@cython.wraparound(False)   # Deactivate negative indexing
def sum(len, n_sums):
    cdef __m256d __madd1, __madd2, __msum
    cdef double sum4[4]
    len = len - (len % 4)     # truncate to a multiple of 4 (one AVX register holds 4 doubles)
    a = np.random.rand(len)
    cdef double [::1] a_view = a
    cdef np.ndarray[np.float64_t, ndim=1] ca = np.copy(a)
    cdef long N = n_sums, L = len, i
    cdef double s = 0.0
    cdef Py_ssize_t j

    t1 = time.clock()
    for i in range(N):
        s = 0.0
        with nogil:
            __madd1 = _mm256_load_pd(&a_view[0])        # running 4-lane accumulator
            for j in range(4, L, 4):
                __madd2 = _mm256_load_pd(&a_view[j])
                __msum = _mm256_add_pd(__madd1, __madd2)
                __madd1 = __msum
            _mm256_store_pd(&sum4[0], __msum)           # spill the accumulator
            s = sum4[0] + sum4[1] + sum4[2] + sum4[3]   # horizontal sum of the 4 lanes
    t2 = time.clock()
    print(s, sum4)
    print("Cython sum", t2 - t1)

    t1 = time.clock()
    for i in range(N):
        s = np.sum(ca)
    t2 = time.clock()
    print(s)
    print("np sum", t2 - t1)
This gives a nice speed-up for the Cython sum, especially for small arrays.
Now, to go further, I would like to parallelize the nogil block with OpenMP. Unfortunately, I can't find a way to express this with cython.parallel.prange/parallel.
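To make the intent concrete, this is roughly the chunked layout I have in mind: one contiguous block per thread, each thread keeping its own __m256d accumulator and writing its 4-lane partial sum into a disjoint slice of a shared scratch array. All names below (sum_parallel, n_threads, partials, ...) are placeholders of mine, it reuses the imports and the cdef extern block from above, and it keeps the same 32-byte alignment assumptions as the serial version:

from cython.parallel import prange

@cython.boundscheck(False)
@cython.wraparound(False)
def sum_parallel(long length, int n_threads):
    # trim so every thread gets a chunk that is a multiple of 4 doubles
    cdef long L = length - (length % (4 * n_threads))
    a = np.random.rand(L)
    cdef double [::1] a_view = a
    # one 4-lane partial per thread, written to disjoint slices (no race)
    cdef double [::1] partials = np.zeros(4 * n_threads)
    cdef long chunk = L // n_threads
    cdef long b, j
    cdef __m256d __macc, __mx
    cdef double s = 0.0

    for b in prange(n_threads, nogil=True, num_threads=n_threads, schedule='static'):
        __macc = _mm256_load_pd(&a_view[b * chunk])         # per-thread accumulator (lastprivate)
        for j in range(b * chunk + 4, (b + 1) * chunk, 4):
            __mx = _mm256_load_pd(&a_view[j])
            __macc = _mm256_add_pd(__macc, __mx)
        _mm256_store_pd(&partials[4 * b], __macc)           # each thread writes its own slice

    # serial horizontal reduction of the per-thread partials
    for j in range(4 * n_threads):
        s += partials[j]
    return s

Is this the right direction, or is there a more idiomatic way in cython.parallel to keep a vector register as a per-thread reduction variable inside prange?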