Is there a way to make numpy.argmin() as fast as m

I'm trying to find the minimum array indices along one dimension of a very large 2D numpy array. I'm finding that this is very slow (already tried speeding it up with bottleneck, which was only a minimal improvement). However, taking the straight minimum appears to be an order of magnitude faster:

import numpy as np
import time

randvals = np.random.rand(3000,160000)
start = time.time()
minval = randvals.min(axis=0)
print "Took {0:.2f} seconds to compute min".format(time.time()-start)
start = time.time()
minindex = np.argmin(randvals,axis=0)
print "Took {0:.2f} seconds to compute argmin".format(time.time()-start)

On my machine this outputs:

Took 0.83 seconds to compute min
Took 9.58 seconds to compute argmin

Is there any reason why argmin is so much slower? Is there any way to speed it up to comparable to min?

回答1:

In [1]: import numpy as np

In [2]: a = np.random.rand(3000, 16000)

In [3]: %timeit a.min(axis=0)
1 loops, best of 3: 421 ms per loop

In [4]: %timeit a.argmin(axis=0)
1 loops, best of 3: 1.95 s per loop

In [5]: %timeit a.min(axis=1)
1 loops, best of 3: 302 ms per loop

In [6]: %timeit a.argmin(axis=1)
1 loops, best of 3: 303 ms per loop

In [7]: %timeit a.T.argmin(axis=1)
1 loops, best of 3: 1.78 s per loop

In [8]: %timeit np.asfortranarray(a).argmin(axis=0)
1 loops, best of 3: 1.97 s per loop

In [9]: b = np.asfortranarray(a)

In [10]: %timeit b.argmin(axis=0)
1 loops, best of 3: 329 ms per loop

Maybe min is smart enough to do its job sequentially over the array (hence with cache locality), and argmin is jumping around the array (causing a lot of cache misses)?

Anyway, if you're willing to keep randvals as a Fortran-ordered array from the start, it'll be faster, though copying into Fortran-ordered doesn't help.

回答2:

I just took a look at the source code, and while I don't fully understand why things are being done the way they are, this is what happens:

np.min is basically a call to np.minimum.reduce.
np.argmin first moves the axis you want to operate on to the end of the shape tuple, then makes it a contiguous array, which of course triggers a copy of the full array unless the axis was the last one to begin with.

Since a copy is being made, you can get creative and try to instantiate cheaper arrays:

a = np.random.rand(1000, 2000)

def fast_argmin_axis_0(a):
    matches = np.nonzero((a == np.min(a, axis=0)).ravel())[0]
    rows, cols = np.unravel_index(matches, a.shape)
    argmin_array = np.empty(a.shape[1], dtype=np.intp)
    argmin_array[cols] = rows
    return argmin_array

In [8]: np.argmin(a, axis=0)
Out[8]: array([230, 532, 815, ..., 670, 702, 989], dtype=int64)

In [9]: fast_argmin_axis_0(a)
Out[9]: array([230, 532, 815, ..., 670, 702, 989], dtype=int64)

In [10]: %timeit np.argmin(a, axis=0)
10 loops, best of 3: 27.3 ms per loop

In [11]: %timeit fast_argmin_axis_0(a)
100 loops, best of 3: 15 ms per loop

I wouldn't go as far as calling the current implementation a bug, since there may be good reasons for numpy doing what it does the way it does it, but that this kind of trickery can speed up what should be a highly optimized function, strongly suggests that things could be done better.

回答3:

I was just hitting the same problem, and found the large difference in performance when axis 0 is selected for finding the minimum. As suggested, the problem seems to be related to internally copying the array.

I devised a rather simple-minded workaround using Cython that simultaneously establishes the minimum values and their position in a 2D numpy array along a given axis. Note that for axis = 0, the algorithm works on an array of columns (maximum number specified by the constant blocksize - here set equivalent to 8 kByte of data) simultaneously to make good use of the cache:

%%cython -c=-O2 -c=-march=native

import numpy as np
cimport numpy as np 
cimport cython
from libc.stdint cimport uint32_t

@cython.boundscheck(False)
@cython.wraparound(False)
cdef void _minargmin_2d_64_axis_0(uint32_t[:] minloc, double[:] minimum, double[:, :] arr) nogil:
    """
    Find the minimum and argmin for a 2D array of 64-bit floats along axis 0
    Parameters:
    -----------
    arr: numpy_array, dtype=np.float64, shape=(m, n)
       The array for which the minima should be computed.
    minloc: numpy_array, dtype=np.uint32, shape=(n)
       Stores the rows where the minima occur for each column.
    minimum: numpy_array, dtype=np.float64, shape=(n)
       The minima along each column.
    """

    # Columns of the matrix are accessed in blocks to increase cache performance.
    # Specify the number of columns here:

    DEF blocksize = 1024

    cdef int i, j, k
    cdef double minim[blocksize]
    cdef uint32_t minimloc[blocksize]

    # Read blocks of data to make good use of the cache

    cdef int blocks
    blocks  = arr.shape[1] / blocksize
    remains = arr.shape[1] % blocksize

    for i in xrange(0, blocksize * blocks, blocksize):

        for k in xrange(blocksize):
            minim[k]    = arr[0, i + k]
            minimloc[k] = 0

        for j in xrange(1, arr.shape[0]):

            for k in xrange(blocksize):

                if arr[j, i + k] < minim[k]:
                    minim[k] = arr[j, i + k]
                    minimloc[k] = j

        for k in xrange(blocksize):
            minimum[i + k] = minim[k]
            minloc[i + k]  = minimloc[k]

    # Work on the final 'incomplete' block

    i = blocksize * blocks

    for k in xrange(remains):
        minim[k]    = arr[0, i + k]
        minimloc[k] = 0

    for j in xrange(1, arr.shape[0]):

        for k in xrange(remains):

            if arr[j, i + k] < minim[k]:
                minim[k] = arr[j, i + k]
                minimloc[k] = j

    for k in xrange(remains):
        minimum[i + k] = minim[k]
        minloc[i + k]  = minimloc[k]

    # Done!


@cython.boundscheck(False)
@cython.wraparound(False)
cdef void _minargmin_2d_64_axis_1(uint32_t[:] minloc, double[:] minimum, double[:, :] arr) nogil:
    """
    Find the minimum and argmin for a 2D array of 64-bit floats along axis 1
    Parameters:
    -----------
    arr: numpy_array, dtype=np.float64, shape=(m, n)
       The array for which the minima should be computed.
    minloc: numpy_array, dtype=np.uint32, shape=(m)
       Stores the rows where the minima occur for each row.
    minimum: numpy_array, dtype=np.float64, shape=(m)
       The minima along each row.
    """
    cdef int i
    cdef int j
    cdef double minim
    cdef uint32_t minimloc

    for i in xrange(arr.shape[0]):
        minim = arr[i, 0]
        minimloc = 0

        for j in xrange(1, arr.shape[1]):
            if arr[i, j] < minim:
                minim = arr[i, j]
                minimloc = j

        minimum[i] = minim
        minloc[i]  = minimloc


@cython.boundscheck(False)
@cython.wraparound(False)
cdef void _minargmin_2d_32_axis_0(uint32_t[:] minloc, float[:] minimum, float[:, :] arr) nogil:
    """
    Find the minimum and argmin for a 2D array of 32-bit floats along axis 0
    Parameters:
    -----------
    arr: numpy_array, dtype=np.float32, shape=(m, n)
       The array for which the minima should be computed.
    minloc: numpy_array, dtype=np.uint32, shape=(n)
       Stores the rows where the minima occur for each column.
    minimum: numpy_array, dtype=np.float32, shape=(n)
       The minima along each column.
    """

    # Columns of the matrix are accessed in blocks to increase cache performance.
    # Specify the number of columns here:  

    DEF blocksize = 2048

    cdef int i, j, k
    cdef float minim[blocksize]
    cdef uint32_t minimloc[blocksize]

    # Read blocks of data to make good use of the cache

    cdef int blocks
    blocks  = arr.shape[1] / blocksize
    remains = arr.shape[1] % blocksize

    for i in xrange(0, blocksize * blocks, blocksize):

        for k in xrange(blocksize):
            minim[k]    = arr[0, i + k]
            minimloc[k] = 0

        for j in xrange(1, arr.shape[0]):

            for k in xrange(blocksize):

                if arr[j, i + k] < minim[k]:
                    minim[k] = arr[j, i + k]
                    minimloc[k] = j

        for k in xrange(blocksize):
            minimum[i + k] = minim[k]
            minloc[i + k]  = minimloc[k]

    # Work on the final 'incomplete' block

    i = blocksize * blocks

    for k in xrange(remains):
        minim[k]    = arr[0, i + k]
        minimloc[k] = 0

    for j in xrange(1, arr.shape[0]):

        for k in xrange(remains):

            if arr[j, i + k] < minim[k]:
                minim[k] = arr[j, i + k]
                minimloc[k] = j

    for k in xrange(remains):
        minimum[i + k] = minim[k]
        minloc[i + k]  = minimloc[k]

    # Done!

@cython.boundscheck(False)
@cython.wraparound(False)
cdef void _minargmin_2d_32_axis_1(uint32_t[:] minloc, float[:] minimum, float[:, :] arr) nogil:
    """
    Find the minimum and argmin for a 2D array of 32-bit floats along axis 1
    Parameters:
    -----------
    arr: numpy_array, dtype=np.float32, shape=(m, n)
       The array for which the minima should be computed.
    minloc: numpy_array, dtype=np.uint32, shape=(m)
       Stores the rows where the minima occur for each row.
    minimum: numpy_array, dtype=np.float32, shape=(m)
       The minima along each row.
    """
    cdef int i
    cdef int j
    cdef float minim
    cdef uint32_t minimloc

    for i in xrange(arr.shape[0]):
        minim = arr[i, 0]
        minimloc = 0

        for j in xrange(1, arr.shape[1]):
            if arr[i, j] < minim:
                minim = arr[i, j]
                minimloc = j

        minimum[i] = minim
        minloc[i]  = minimloc

def Min_Argmin(array_2d, axis = 1):
    """
    Find the minima and corresponding locations (argmin) of a two-dimensional
    numpy array of floats along a given axis simultaneously, and returns them
    as a tuple of arrays (min_2d, argmin_2d).

    (Note: float16 arrays will be internally transformed to float32 arrays.)
    ----------
    array_2d: numpy_array, dtype=np.float32 or np.float64, shape=(m, n)
       The array for which the minima should be computed.
    axis : int, 0 or 1 (default) : 
        The axis along which minima are computed.
    min_2d: numpy_array, dtype=np.uint8, shape=(m) or shape=(n):
       The array where the minima along the given axis are stored.
    argmin_2d:
       The array storing the row/column where the minimum occurs.
    """

    # Sanity checks:
    if len(array_2d.shape) != 2:
        raise IndexError('Min_Argmin: Number of dimensions of array must be 2')

    if not (axis == 0 or axis == 1):
        raise ValueError('Min_Argmin: Invalid axis specified')

    arr_type = array_2d.dtype

    if not arr_type in ('float16', 'float32', 'float64'):
        raise ValueError('Min_Argmin: Not a float array')

    # Transform float16 arrays
    if arr_type == 'float16':
        array_2d = array_2d.astype(np.float32)

    # Run analysis

    if arr_type == 'float64':

        # Double accuracy

        min_array    = np.zeros(array_2d.shape[1 - axis], dtype = np.float64)
        argmin_array = np.zeros(array_2d.shape[1 - axis], dtype = np.uint32)

        if (axis == 0):
            _minargmin_2d_64_axis_0(argmin_array, min_array, array_2d)

        else:
            _minargmin_2d_64_axis_1(argmin_array, min_array, array_2d)

    else:

        # Single accuracy

        min_array    = np.zeros(array_2d.shape[1 - axis], dtype = np.float32)
        argmin_array = np.zeros(array_2d.shape[1 - axis], dtype = np.uint32)

        if (axis == 0):
            _minargmin_2d_32_axis_0(argmin_array, min_array, array_2d)

        else:
            _minargmin_2d_32_axis_1(argmin_array, min_array, array_2d)

        # Transform back to float16 type as necessary

        if arr_type == 'float16':
            min_array = min_array.astype(np.float16)


    # Return results
    return min_array, argmin_array

The code can be placed and compiled in an IPython notebook cell after loading Cython support:

%load_ext Cython

and then called in the form:

min_array, argmin_array = Min_Argmin(two_dim_array, axis = 0 or 1)

Timing example:

random_array = np.random.rand(20000, 20000).astype(np.float32)

%timeit min_array, argmin_array = Min_Argmin(random_array, axis = 0)
%timeit min_array, argmin_array = Min_Argmin(random_array, axis = 1)

1 loops, best of 3: 405 ms per loop
1 loops, best of 3: 307 ms per loop

For comparison:

%%timeit 
min_array    = random_array.min(axis = 0)
argmin_array = random_array.argmin(axis = 0)

1 loops, best of 3: 10.3 s per loop

%%timeit 
min_array    = random_array.min(axis = 1)
argmin_array = random_array.argmin(axis = 1)

1 loops, best of 3: 423 ms per loop

So, there is a significant speedup for axis = 0 (and still a small advantage for axis = 1, if one is interested in both minimum and location).