可以将文章内容翻译成中文,广告屏蔽插件可能会导致该功能失效(如失效,请关闭广告屏蔽插件后再试):
问题:
I have a list of tuples, each containing two elements. The first element of few sublists is common. I want to compare the first element of these sublists and append the second element in one lists. Here is my list:
myList=[(1,2),(1,3),(1,4),(1,5),(2,6),(2,7),(2,8),(3,9),(3,10)]
I would like to make a list of lists out of it which looks something like this:`
NewList=[(2,3,4,5),(6,7,8),(9,10)]
I hope if there is any efficient way.
回答1:
You can use an OrderedDict to group the elements by the first subelement of each tuple:
myList=[(1,2),(1,3),(1,4),(1,5),(2,6),(2,7),(2,8),(3,9),(3,10)]
from collections import OrderedDict
od = OrderedDict()
for a,b in myList:
od.setdefault(a,[]).append(b)
print(list(od.values()))
[[2, 3, 4, 5], [6, 7, 8], [9, 10]]
If you really want tuples:
print(list(map(tuple,od.values())))
[(2, 3, 4, 5), (6, 7, 8), (9, 10)]
If you did not care about the order the elements appeared and just wanted the most efficient way to group you could use a collections.defaultdict:
from collections import defaultdict
od = defaultdict(list)
for a,b in myList:
od[a].append(b)
print(list(od.values()))
Lastly, if your data is in order as per your input example i.e sorted you could simply use itertools.groupby to group by the first subelement from each tuple and extract the second element from the grouped tuples:
from itertools import groupby
from operator import itemgetter
print([tuple(t[1] for t in v) for k,v in groupby(myList,key=itemgetter(0))])
Output:
[(2, 3, 4, 5), (6, 7, 8), (9, 10)]
Again the groupby will only work if your data is sorted by at least the first element.
Some timings on a reasonable sized list:
In [33]: myList = [(randint(1,10000),randint(1,10000)) for _ in range(100000)]
In [34]: myList.sort()
In [35]: timeit ([tuple(t[1] for t in v) for k,v in groupby(myList,key=itemgetter(0))])
10 loops, best of 3: 44.5 ms per loop
In [36]: %%timeit od = defaultdict(list)
for a,b in myList:
od[a].append(b)
....:
10 loops, best of 3: 33.8 ms per loop
In [37]: %%timeit
dictionary = OrderedDict()
for x, y in myList:
if x not in dictionary:
dictionary[x] = [] # new empty list
dictionary[x].append(y)
....:
10 loops, best of 3: 63.3 ms per loop
In [38]: %%timeit
od = OrderedDict()
for a,b in myList:
od.setdefault(a,[]).append(b)
....:
10 loops, best of 3: 80.3 ms per loop
If order matters and the data is sorted, go with the groupby, it will get even closer to the defaultdict approach if it is necessary to map all the elements to tuple in the defaultdict.
If the data is not sorted or you don't care about any order, you won't find a faster way to group than using the defaultdict approach.
回答2:
This feels like a task for a dictionary (if you don't know dictionaries yet, look them up on python.org). This is a very verbose example, so it's not what I'd write in everyday coding, but it's better to be verbose than unclear:
dictionary = collections.OrderedDict()
for x, y in myList:
if not dictionary.has_key(x):
dictionary[x] = [] # new empty list
# append y to that list
dictionary[x].append(y)
回答3:
Having thought about this, the most efficient approach is probably this one-liner (assuming dictionary
is an empty dict
, i.e. dictionary = {}
or dictionary = OrderedDict()
like in Padraic' excellent answer):
for x,y in myList: dictionary.setdefault(x,[]).append(y)
I'm not saying this is the easiest to read approach, but I like it :)
EDIT Ha! Benchmarking proved me wrong; the setdefault
approach is slower than the if not dictionary.has_key(x): dictionary[x]=[]
approach:
>>> timeit.timeit("for x,y in myList:\n if not dictionary.has_key(x):\n dictionary[x]=[]\n dictionary[x].append(y)", "from collections import OrderedDict\nmyList=[(1,2),(1,3),(
1,4),(1,5),(2,6),(2,7),(2,8),(3,9),(3,10)]\ndictionary=OrderedDict()")
2.2573769092559814
>>> timeit.timeit("for x,y in myList: dictionary.setdefault(x,[]).append(y)", "from collections import OrderedDict\nmyList=[(1,2),(1,3),(1,4),(1,5),(2,6),(2,7),(2,8),(3,9),(3,10)]\ndictiona
ry=OrderedDict()")
3.3534231185913086
Of course, Padraic was still right: his defaultdict
approach uses but 0.82s on my machine, so it's faster by a factor of 3.
Also, as Padraic pointed out: dict.has_key(x)
has been deprecated, and one should use x in dict
instead; however, I couldn't measure a speed difference.
回答4:
The following should work:
import itertools
myList = [(1,2),(1,3),(1,4),(1,5),(2,6),(2,7),(2,8),(3,9),(3,10)]
print [tuple(x[1] for x in g) for k, g in itertools.groupby(myList, key=lambda x: x[0])]
Which displays:
[(2, 3, 4, 5), (6, 7, 8), (9, 10)]