可以将文章内容翻译成中文,广告屏蔽插件可能会导致该功能失效(如失效,请关闭广告屏蔽插件后再试):
问题:
I want to find and replace multiple values in an 1D array / list with new ones.
In example for a list
a=[2, 3, 2, 5, 4, 4, 1, 2]
I would like to replace
val_old=[1, 2, 3, 4, 5]
with
val_new=[2, 3, 4, 5, 1]
Therefore the new array is:
a_new=[3, 4, 3, 1, 5, 5, 2, 3]
What is the fastest way to do this (for very large lists, i.e. with 50000 values to find and replace)?
Comment of the anwsers
Thank you to all for a quick response! I checked the proposed solutions with the following:
N = 10**4
N_val = 0.5*N
a = np.random.randint(0, N_val, size=N)
val_old = np.arange(N_val, dtype=np.int)
val_new = np.arange(N_val, dtype=np.int)
np.random.shuffle(val_new)
a1 = list(a)
val_old1 = list(val_old)
val_new1 = list(val_new)
def Ashwini_Chaudhary(a, val_old, val_new):
arr = np.empty(a.max()+1, dtype=val_new.dtype)
arr[val_old] = val_new
return arr[a]
def EdChum(a, val_old, val_new):
df = pd.Series(a, dtype=val_new.dtype)
d = dict(zip(val_old, val_new))
return df.map(d).values
def xxyzzy(a, val_old, val_new):
return [val_new[val_old.index(x)] for x in a]
def Shashank_and_Hackaholic(a, val_old, val_new):
d = dict(zip(val_old, val_new))
return [d.get(e, e) for e in a]
def itzmeontv(a, val_old, val_new):
return [val_new[val_old.index(i)] if i in val_old else i for i in a]
def swenzel(a, val_old, val_new):
return val_new[np.searchsorted(val_old,a)]
def Divakar(a, val_old, val_new):
C,R = np.where(a[:,np.newaxis] == val_old[np.newaxis,:])
a[C] = val_new[R]
return a
The results:
%timeit -n100 Ashwini_Chaudhary(a, val_old, val_new)
100 loops, best of 3: 77.6 µs per loop
%timeit -n100 swenzel(a, val_old, val_new)
100 loops, best of 3: 703 µs per loop
%timeit -n100 Shashank_and_Hackaholic(a1, val_old1, val_new1)
100 loops, best of 3: 1.7 ms per loop
%timeit -n100 EdChum(a, val_old, val_new)
100 loops, best of 3: 17.6 ms per loop
%timeit -n10 Divakar(a, val_old, val_new)
10 loops, best of 3: 209 ms per loop
%timeit -n10 xxyzzy(a1, val_old1, val_new1)
10 loops, best of 3: 429 ms per loop
%timeit -n10 itzmeontv(a1, val_old1, val_new1)
10 loops, best of 3: 847 ms per loop
The relative difference in performance increases with biger N
, i.e. if N=10**7
, then the result by Ashwini_Chaudhary takes 207 ms
and the result by swenzel 6.89 s
.
回答1:
>>> arr = np.empty(a.max() + 1, dtype=val_new.dtype)
>>> arr[val_old] = val_new
>>> arr[a]
array([3, 4, 3, 1, 5, 5, 2, 3])
回答2:
In vanilla Python, without the speed of numpy
or pandas
, this is one way:
a = [2, 3, 2, 5, 4, 4, 1, 2]
val_old = [1, 2, 3, 4, 5]
val_new = [2, 3, 4, 5, 1]
expected_a_new = [3, 4, 3, 1, 5, 5, 2, 3]
d = dict(zip(val_old, val_new))
a_new = [d.get(e, e) for e in a]
print a_new # [3, 4, 3, 1, 5, 5, 2, 3]
print a_new == expected_a_new # True
The average time complexity for this algorithm is O(M + N)
where M
is the length of your "translation list" and N
is the length of list a
.
回答3:
Assuming that your val_old
array is sorted (which is the case here, but if later on it's not, then don't forget to sort val_new
along with it!), you can use numpy.searchsorted
and then access val_new
with the results.
This does not work if a number has no mapping, you will have to provide 1to1 mappings in that case.
In [1]: import numpy as np
In [2]: a = np.array([2, 3, 2, 5, 4, 4, 1, 2])
In [3]: old_val = np.array([1, 2, 3, 4, 5])
In [4]: new_val = np.array([2, 3, 4, 5, 1])
In [5]: a_new = np.array([3, 4, 3, 1, 5, 5, 2, 3])
In [6]: i = np.searchsorted(old_val,a)
In [7]: a_replaced = new_val[i]
In [8]: all(a_replaced == a_new)
Out[8]: True
50k numbers? No problem!
In [23]: def timed():
t0 = time.time()
i = np.searchsorted(old_val, a)
a_replaced = new_val[i]
t1 = time.time()
print('%s Seconds'%(t1-t0))
....:
In [24]: a = np.random.choice(old_val, 50000)
In [25]: timed()
0.00288081169128 Seconds
500k? You won't notice the difference!
In [26]: a = np.random.choice(old_val, 500000)
In [27]: timed()
0.019248008728 Seconds
回答4:
Try this for your expected output, works even if elements
not in value_old
.
>>>[val_new[val_old.index(i)] if i in val_old else i for i in a]
[3, 4, 3, 1, 5, 5, 2, 3]
回答5:
The numpy_indexed package (disclaimer: I am its author) provides an elegant and efficient vectorized solution to this type of problem:
import numpy_indexed as npi
remapped_a = npi.remap(a, val_old, val_new)
The method implemented is based on searchsorted like that of swenzel and should have similar good performance, but more general. For instance, the items of the array do not need to be ints, but can be any type, even nd-subarrays themselves.
If all values in 'a' are expected to be present in 'val_old', you can set the optional 'missing' kwarg to 'raise' (default is 'ignore'). Performance will be slightly better, and you will get a KeyError if that assumption is not satisfied.
回答6:
To replace values in a list using two other lists as key:value pairs there are several approaches. All of them use "list compression".
Using list.index():
a=[2, 3, 2, 5, 4, 4, 1, 2]
val_old=[1, 2, 3, 4, 5]
val_new=[2, 3, 4, 5, 1]
a_new=[val_new[val_old.index(x)] for x in a]
Using your special case:
a=[2, 3, 2, 5, 4, 4, 1, 2]
a_new=[x % 5 + 1 for x in a]
回答7:
I tried like this:
>>> val_old=[1, 2, 3, 4, 5]
>>> val_new=[2, 3, 4, 5, 1]
>>> a=[2, 3, 2, 5, 4, 4, 1, 2]
>>> my_dict = dict(zip(val_old, val_new))
>>> [my_dict.get(x,x) for x in a]
[3, 4, 3, 1, 5, 5, 2, 3]
回答8:
In pandas I would create a dict from the 2 lists and then call map
which will perform a lookup and replace the values:
In [6]:
df = pd.Series([2, 3, 2, 5, 4, 4, 1, 2])
df
Out[6]:
0 2
1 3
2 2
3 5
4 4
5 4
6 1
7 2
dtype: int64
In [7]:
val_old=[1, 2, 3, 4, 5]
val_new=[2, 3, 4, 5, 1]
d = dict(zip(val_old,val_new ))
d
Out[7]:
{1: 2, 2: 3, 3: 4, 4: 5, 5: 1}
In [9]:
df.map(d)
Out[9]:
0 3
1 4
2 3
3 1
4 5
5 5
6 2
7 3
dtype: int64
For a 80000 element series this takes 3.4ms:
In [14]:
%timeit df.map(d)
100 loops, best of 3: 3.4 ms per loop
This is a vectorised approach and will scale much better than any iteration based method
回答9:
For numpy arrays
, this could be one approach -
%// Find row and column IDs for matches between "a" and "val_old"
C,R = np.where(a[:,np.newaxis] == val_old[np.newaxis,:])
%// Index into "a" with the column indices and
%// set those to "val_new" elements indexed by "R"
a[C] = val_new[R]
Sample run and timing
For inputs:
a = np.random.randint(10000,size=(100000))
val_old = np.random.randint(10000,size=(1000))
val_new = np.random.randint(10000,size=(1000))
Runtimes at each code line were -
%timeit C,R = np.where(a[:,np.newaxis] == val_old[np.newaxis,:])
1 loops, best of 3: 292 ms per loop
%timeit a[C] = val_new[R]
10000 loops, best of 3: 43 µs per loop
回答10:
list(map(lambda x:val_new[val_old.index(x)], a))