What numpy optimizations does cython do?

2019-05-29 04:15发布

问题:

I was a little surprised to find that:

# fast_ops_c.pyx
cimport cython
cimport numpy as np

@cython.boundscheck(False) # turn off bounds-checking for entire function
@cython.wraparound(False)  # turn off negative index wrapping for entire function
@cython.nonecheck(False)
def c_iseq_f1(np.ndarray[np.double_t, ndim=1, cast=False] x, double val):
    # Test (x==val) except gives NaN where x is NaN
    # (_x - _x) is 0.0 for any finite _x but NaN when _x is NaN, so adding
    # it propagates NaN into the result while leaving the comparison intact.
    cdef np.ndarray[np.double_t, ndim=1] result = np.empty_like(x)
    cdef size_t i = 0
    cdef double _x = 0
    # Indexed loop over a buffer-typed argument: Cython compiles
    # range(len(x)) + x[i] down to a plain C loop with direct memory access.
    for i in range(len(x)):
        _x = x[i]
        result[i] = (_x-_x) + (_x==val)
    return result

is orders of magnitude faster than:

@cython.boundscheck(False) # turn off bounds-checking for entire function
@cython.wraparound(False)  # turn off negative index wrapping for entire function
@cython.nonecheck(False)
def c_iseq_f2(np.ndarray[np.double_t, ndim=1, cast=False] x, double val):
    # Same NaN-propagating equality test as c_iseq_f1, but iterated with
    # ``for _x in x`` — Cython lowers this to the generic Python iterator
    # protocol (boxing each element), which is why it is much slower.
    cdef np.ndarray[np.double_t, ndim=1] result = np.empty_like(x)
    cdef size_t i = 0
    cdef double _x = 0
    for _x in x:        # Iterate over elements
        result[i] = (_x-_x) + (_x==val)
        # FIX: the original never advanced i, so every value was written
        # to result[0] and the rest of result kept empty_like() garbage.
        i += 1
    return result

(for large arrays). I'm using the following to test the performance:

# fast_ops.py
# FIX: the original read "import nump as np" (typo) and placed the import
# *after* np.get_include() was already used inside the try block, which
# would raise NameError before the except could even matter.
import numpy as np

try:
    import pyximport
    # numpy headers are needed to compile the .pyx; np must be in scope here.
    pyximport.install(setup_args={"include_dirs": np.get_include()}, reload_support=True)
except Exception:
    # Best effort: fall back to a pre-built fast_ops_c if pyximport fails.
    pass

from fast_ops_c import *
import math

NAN = float("nan")

import unittest
class FastOpsTest(unittest.TestCase):
    """Times the Cython variants against each other and sanity-checks results."""

    def test_eq_speed(self):
        from timeit import timeit
        a = np.random.random(500000)
        a[1] = 2.
        a[2] = NAN

        a2 = c_iseq_f(a, 2.)
        def f1(): c_iseq_f2(a, 2.)
        def f2(): c_iseq_f1(a, 2.)

        # warm up
        [f1() for x in range(20)]
        [f2() for x in range(20)]

        n = 1000
        dur = timeit(f1, number=n)
        # FIX: the original used Python-2 ``print x, y`` statements, which
        # are syntax errors on Python 3; the rest of the file is Py3-clean.
        print(dur, "DUR1 s/iter", dur / n)

        dur = timeit(f2, number=n)
        print(dur, "DUR2 s/iter", dur / n)

        dur = timeit(f1, number=n)
        print(dur, "DUR1 s/iter", dur / n)
        # NOTE(review): a wall-clock threshold makes this machine-dependent;
        # kept as-is to preserve the original test's intent.
        assert dur / n <= 0.005

        dur = timeit(f2, number=n)
        print(dur, "DUR2 s/iter", dur / n)

        print(a2[:10])
        assert a2[0] == 0.
        assert a2[1] == 1.
        assert math.isnan(a2[2])

I'm guessing that for _x in x is interpreted as execute the python iterator for x, and for i in range(n): is interpreted as a C for loop, and x[i] interpreted as C's x[i] array indexing.

However, I'm mostly guessing and trying to follow by example. In its working-with-numpy docs (and here), Cython is a little quiet about what is optimized with respect to numpy and what is not. Is there a guide to what is optimized?


Similarly, the following, which assumes contiguous array memory, is considerably faster than either of the above.

@cython.boundscheck(False) # turn off bounds-checking for entire function
@cython.wraparound(False)  # turn off negative index wrapping for entire function
def c_iseq_f(np.ndarray[np.double_t, ndim=1, cast=False, mode="c"] x not None, double val):
    # NaN-propagating equality test via raw pointers into the contiguous
    # (mode="c") buffers — the fastest of the three variants.
    cdef np.ndarray[np.double_t, ndim=1] result = np.empty_like(x)
    cdef size_t i = 0
    # FIX: _x was used below but never declared; without an explicit
    # ``cdef double`` it may fall back to a Python object depending on
    # type-inference settings. Declared for consistency with c_iseq_f1/f2.
    cdef double _x = 0

    # NOTE(review): &x[0] assumes len(x) > 0; an empty input is not handled.
    cdef double* _xp = &x[0]
    cdef double* _resultp = &result[0]
    for i in range(len(x)):
        _x = _xp[i]
        _resultp[i] = (_x-_x) + (_x==val)
    return result

回答1:

The reason for this surprise is that x[i] is more subtle than it looks. Let's take a look at the following cython function:

%%cython
# Deliberately slow baseline: x is untyped, so every x[i] goes through the
# Python __getitem__ protocol and boxes the C double into a Python float.
def cy_sum(x):
   cdef double res=0.0
   cdef int i
   for i in range(len(x)):
         res+=x[i]
   return res

And measure its performance:

import numpy as np
a=np.random.random((2000,))
%timeit cy_sum(a)

>>>1000 loops, best of 3: 542 µs per loop

This is pretty slow! If you look into the produced C-code, you will see that x[i] uses the __getitem__() functionality, which takes a C-double, creates a Python float object, registers it in the garbage collector, casts it back to a C-double and destroys the temporary Python float. That is a lot of overhead for a single double-addition!

Let's make it clear to cython, that x is a typed memory view:

%%cython
# Typed memory-view signature: x[i] now compiles to a direct C-level
# memory access, with no Python object round-trip per element.
def cy_sum_memview(double[::1] x):
   cdef double res=0.0
   cdef int i
   for i in range(len(x)):
         res+=x[i]
   return res

with a much better performance:

%timeit cy_sum_memview(a)   
>>> 100000 loops, best of 3: 4.21 µs per loop

So what happened? Because cython knows that x is a typed memory view (I would rather use a typed memory view than a numpy array in the signature of a cython function), it no longer must use the Python functionality __getitem__ but can access the C-double value directly, without the need to create an intermediate Python float.

But back to the numpy-arrays. Numpy arrays can be interpreted by cython as typed memory views and thus x[i] can be translated into a direct/fast access to the underlying memory.

So what about for-range?

%%cython
cimport array
# Same memory view, but iterated with ``for x_ in x``: Cython falls back
# to the generic Python iterator protocol here, boxing each element —
# which is the point this snippet demonstrates.
def cy_sum_memview_for(double[::1] x):
    cdef double res=0.0
    cdef double x_
    for x_ in x:
          res+=x_
    return res

%timeit cy_sum_memview_for(a)
>>> 1000 loops, best of 3: 736 µs per loop

It is slow again. So cython seems not to be clever enough to replace the for-in loop with direct/fast access, and once again uses Python functionality with the resulting overhead.

I must confess I'm as surprised as you are, because at first sight there is no good reason why cython should not be able to use fast access in the case of the for-in loop. But this is how it is...


I'm not sure, that this is the reason but the situation is not that simple with two dimensional arrays. Consider the following code:

import numpy as np
a=np.zeros((5,1), dtype=int)
# Iterating over axis 0 of a (5, 1) array yields 1-element sub-arrays,
# so int(d) succeeds on each one.
for d in a:
    print(int(d)+1)

This code works, because d is a 1-length array and thus can be converted to a Python scalar via int(d).

However,

# a.T has shape (1, 5): iterating yields a single length-5 row, and
# int(d) on a size-5 array raises — this snippet deliberately fails.
for d in a.T:
    print(int(d)+1)

throws, because now d's length is 5 and thus it cannot be converted to a Python scalar.

Because we want this code to have the same behavior as pure Python when cythonized, and it can be determined only at runtime whether the conversion to int is OK or not, we have to use a Python object for d first, and only then can we access the content of this array.



回答2:

Cython can translate range(len(x)) loops into nearly pure C code:

for i in range(len(x)):

Generated code:

  __pyx_t_6 = PyObject_Length(((PyObject *)__pyx_v_x)); if (unlikely(__pyx_t_6 == -1)) __PYX_ERR(0, 17, __pyx_L1_error)
  for (__pyx_t_7 = 0; __pyx_t_7 < __pyx_t_6; __pyx_t_7+=1) {
    __pyx_v_i = __pyx_t_7;

But this remains Python:

 for _x in x:        # Iterate over elements

Generated code:

  if (likely(PyList_CheckExact(((PyObject *)__pyx_v_x))) || PyTuple_CheckExact(((PyObject *)__pyx_v_x))) {
    __pyx_t_1 = ((PyObject *)__pyx_v_x); __Pyx_INCREF(__pyx_t_1); __pyx_t_6 = 0;
    __pyx_t_7 = NULL;
  } else {
    __pyx_t_6 = -1; __pyx_t_1 = PyObject_GetIter(((PyObject *)__pyx_v_x)); if (unlikely(!__pyx_t_1)) __PYX_ERR(0, 12, __pyx_L1_error)
    __Pyx_GOTREF(__pyx_t_1);
    __pyx_t_7 = Py_TYPE(__pyx_t_1)->tp_iternext; if (unlikely(!__pyx_t_7)) __PYX_ERR(0, 12, __pyx_L1_error)
  }
  for (;;) {
    if (likely(!__pyx_t_7)) {
      if (likely(PyList_CheckExact(__pyx_t_1))) {
        if (__pyx_t_6 >= PyList_GET_SIZE(__pyx_t_1)) break;
        #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS
        __pyx_t_3 = PyList_GET_ITEM(__pyx_t_1, __pyx_t_6); __Pyx_INCREF(__pyx_t_3); __pyx_t_6++; if (unlikely(0 < 0)) __PYX_ERR(0, 12, __pyx_L1_error)
        #else
        __pyx_t_3 = PySequence_ITEM(__pyx_t_1, __pyx_t_6); __pyx_t_6++; if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 12, __pyx_L1_error)
        __Pyx_GOTREF(__pyx_t_3);
        #endif
      } else {
        if (__pyx_t_6 >= PyTuple_GET_SIZE(__pyx_t_1)) break;
        #if CYTHON_ASSUME_SAFE_MACROS && !CYTHON_AVOID_BORROWED_REFS
        __pyx_t_3 = PyTuple_GET_ITEM(__pyx_t_1, __pyx_t_6); __Pyx_INCREF(__pyx_t_3); __pyx_t_6++; if (unlikely(0 < 0)) __PYX_ERR(0, 12, __pyx_L1_error)
        #else
        __pyx_t_3 = PySequence_ITEM(__pyx_t_1, __pyx_t_6); __pyx_t_6++; if (unlikely(!__pyx_t_3)) __PYX_ERR(0, 12, __pyx_L1_error)
        __Pyx_GOTREF(__pyx_t_3);
        #endif
      }
    } else {
      __pyx_t_3 = __pyx_t_7(__pyx_t_1);
      if (unlikely(!__pyx_t_3)) {
        PyObject* exc_type = PyErr_Occurred();
        if (exc_type) {
          if (likely(exc_type == PyExc_StopIteration || PyErr_GivenExceptionMatches(exc_type, PyExc_StopIteration))) PyErr_Clear();
          else __PYX_ERR(0, 12, __pyx_L1_error)
        }
        break;
      }
      __Pyx_GOTREF(__pyx_t_3);
    }
    __pyx_t_8 = __pyx_PyFloat_AsDouble(__pyx_t_3); if (unlikely((__pyx_t_8 == (double)-1) && PyErr_Occurred())) __PYX_ERR(0, 12, __pyx_L1_error)
    __Pyx_DECREF(__pyx_t_3); __pyx_t_3 = 0;
    __pyx_v__x = __pyx_t_8;
/* … */
  }
  __Pyx_DECREF(__pyx_t_1); __pyx_t_1 = 0;

Generating this output is typically the best way to find out.