可以将文章内容翻译成中文,广告屏蔽插件可能会导致该功能失效(如失效,请关闭广告屏蔽插件后再试):
问题:
I have a 3D numpy array like a = np.zeros((100,100, 20))
. I want to perform an operation over every x,y
position that involves all the elements over the z
axis and the result is stored in an array like b = np.zeros((100,100))
on the same corresponding x,y
position.
Now I'm doing it using a for loop:
d_n = np.array([...]) # a parameter with the same shape as b
for (x,y), v in np.ndenumerate(b):
C = a[x,y,:]
### calculate some_value using C
minv = sys.maxint
depth = -1
C = a[x,y,:]
for d in range(len(C)):
e = 2.5 * float(math.pow(d_n[x,y] - d, 2)) + C[d] * 0.05
if e < minv:
minv = e
depth = d
some_value = depth
if depth == -1:
some_value = len(C) - 1
###
b[x,y] = some_value
The problem now is that this operation is much slower than others done the pythonic way, e.g. c = b * b
(I actually profiled this function and it's around 2 orders of magnitude slower than others using numpy built in functions and vectorized functions, over a similar number of elements)
How can I improve the performance of such kind of functions mapping a 3D array to a 2D one?
回答1:
What is usually done in 3D images is to swap the Z
axis to the first index:
>>> a = a.transpose((2,0,1))
>>> a.shape
(20, 100, 100)
And now you can easily iterate over the Z axis:
>>> for slice in a:
do something
The slice
here will be each of your 100x100
fractions of your 3D matrix. Additionally, by transpossing allows you to access each of the 2D slices directly by indexing the first axis. For example a[10]
will give you the 11th 2D 100x100
slice.
Bonus: If you store the data contiguously after transposing (or convert to a contiguous array using a = np.ascontiguousarray(a.transpose((2,0,1)))),
the access to your 2D slices will be faster since they are mapped contiguously in memory.
回答2:
Obviously you want to get rid of the explicit for
loop, but I think whether this is possible depends on what calculation you are doing with C. As a simple example,
a = np.zeros((100,100, 20))
a[:,:] = np.linspace(1,20,20) # example data: 1,2,3,.., 20 as "z" for every "x","y"
b = np.sum(a[:,:]**2, axis=2)
will fill the 100
by 100
array b
with the sum of the squared "z" values of a
, that is 1+4+9+...+400 = 2870.
回答3:
If your inner calculation is sufficiently complex, and not amenable to vectorization, then your iteration structure is good, and does not contribute significantly to the calculation time
for (x,y), v in np.ndenumerate(b):
C = a[x,y,:]
...
for d in range(len(C)):
... # complex, not vectorizable calc
...
b[x,y] = some_value
There doesn't appear to be a special structure in the 1st 2 dimensions, so you could just as well think of it as 2D mapping on to 1D, e.g. mapping a (N,20)
array onto a (N,)
array. That doesn't speed up anything, but may help highlight the essential structure of the problem.
One step is to focus on speeding up that C
to some_value
calculation. There are functions like cumsum
and cumprod
that help you do sequential calculations on a vector. cython
is also a good tool.
A different approach is to see if you can perform that internal calculation over the N
values all at once. In other words, if you must iterate, it is better to do so over the smallest dimension.
In a sense this a non-answer. But without full knowledge of how you get some_value
from C
and d_n
I don't think we can do more.
It looks like e
can be calculated for all points at once:
e = 2.5 * float(math.pow(d_n[x,y] - d, 2)) + C[d] * 0.05
E = 2.5 * (d_n[...,None] - np.arange(a.shape[-1]))**2 + a * 0.05 # (100,100,20)
E.min(axis=-1) # smallest value along the last dimension
E.argmin(axis=-1) # index of where that min occurs
On first glance it looks like this E.argmin
is the b
value that you want (tweaked for some boundary conditions if needed).
I don't have realistic a
and d_n
arrays, but with simple test ones, this E.argmin(-1)
matches your b
, with a 66x speedup.
回答4:
How can I improve the performance of such kind of functions mapping a 3D array to a 2D one?
Many functions in Numpy are "reduction" functions*, for example sum
, any
, std
, etc. If you supply an axis
argument other than None
to such a function it will reduce the dimension of the array over that axis. For your code you can use the argmin
function, if you first calculate e
in a vectorized way:
d = np.arange(a.shape[2])
e = 2.5 * (d_n[...,None] - d)**2 + a*0.05
b = np.argmin(e, axis=2)
The indexing with [...,None]
is used to engage broadcasting. The values in e
are floating point values, so it's a bit strange to compare to sys.maxint
but there you go:
I, J = np.indices(b.shape)
b[e[I,J,b] >= sys.maxint] = a.shape[2] - 1
* Strictly speaking a reduction function is of the form reduce(operator, sequence)
so technically not std
and argmin