Let's say I have a1 and a2:

a1 = [1,2,3]
a2 = [4,2,5]

To see whether a1 shares any elements with a2, I can loop over both and compare each pair of elements:
def intersect?(a1, a2)
  a1.each do |x|
    a2.each do |y|
      return true if x == y
    end
  end
  false
end
But even easier, (a1.to_set & a2.to_set).present? gives me the same answer.
I'm assuming that the set operation is quicker and more efficient? If that's true, is it still true once you take into account the overhead (if any) of the .to_set operation on each array?
tia
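For reference, .present? comes from ActiveSupport (Rails), not core Ruby. A plain-Ruby sketch of the same check (the Array#intersect? call assumes Ruby 3.1 or newer, hence the guard):

```ruby
require 'set'

a1 = [1, 2, 3]
a2 = [4, 2, 5]

# Plain Ruby: Array#& computes the intersection; #any? checks it is non-empty
puts (a1 & a2).any?               # prints "true"

# Ruby 3.1+ adds Array#intersect?, which avoids building the intersection
puts a1.intersect?(a2) if a1.respond_to?(:intersect?)
```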
Surprisingly, the & method of Array is faster than that of Set, even for quite large collections:
require 'set'
require 'benchmark'
f = 10_000
ar1 = (1..(10*f)).to_a # 100_000 elements
ar2 = ((5*f)..(15*f)).to_a # also 100_000 elements
set1 = ar1.to_set
set2 = ar2.to_set
n = 10
Benchmark.bm(10) do |testcase|
testcase.report('Array'){ n.times{ ar1 & ar2 } }
testcase.report('Set'){ n.times{ set1 & set2 } }
end
Result:
user system total real
Array 1.380000 0.030000 1.410000 ( 1.414634)
Set 2.310000 0.020000 2.330000 ( 2.359317)
steenslag's answer makes the interesting observation that array & array is faster than set & set. Most of that penalty appears to be the expense of obtaining the keys from the underlying hash of the first set in order to enumerate it. A hybrid approach, using an array on the left side of the operation and a set on the right, is faster still. If you only want to know whether there is any intersection, the same approach with #any? is even quicker:
#!/usr/bin/env ruby
require 'set'
require 'benchmark'
f = 10_000
ar1 = (1..(10*f)).to_a # 100_000 elements
ar2 = ((5*f)..(15*f)).to_a # also 100_000 elements
set1 = ar1.to_set
set2 = ar2.to_set
n = 10
Benchmark.bm(10) do |testcase|
testcase.report('Array'){ n.times{ ar1 & ar2 } }
testcase.report('Set'){ n.times{ set1 & set2 } }
testcase.report('Set2'){ n.times{ ar1.select{ |element| set2.include? element } } }
testcase.report('Set2present'){ n.times{ ar1.any?{ |element| set2.include? element } } }
end
$ ruby -v => ruby 1.9.2p290 (2011-07-09 revision 32553) [x86_64-darwin10.8.0]
user system total real
Array 0.680000 0.030000 0.710000 ( 0.720882)
Set 1.130000 0.020000 1.150000 ( 1.150571)
Set2 0.430000 0.000000 0.430000 ( 0.434957)
Set2present 0.210000 0.010000 0.220000 ( 0.220990)
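The hybrid trick above can be packaged as a small helper — a hypothetical sketch (the name intersect_any? is mine), which enumerates the smaller collection and probes a Set built from the larger one, so #any? can return on the first hit:

```ruby
require 'set'

# Enumerate the smaller collection and probe a Set built from the larger,
# so each membership test is a hash lookup and we stop at the first match.
def intersect_any?(a, b)
  small, large = a.size <= b.size ? [a, b] : [b, a]
  large_set = large.to_set
  small.any? { |element| large_set.include?(element) }
end

intersect_any?([1, 2, 3], [4, 2, 5])  # => true
intersect_any?([1, 3], [4, 5])        # => false
```

Building the Set is still O(n), so this pays off when the collections are large or the set can be reused across calls.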
It should be faster for large arrays. Your method runs in O(m*n) time because it has to loop over both arrays. For arrays of 3 elements each this is basically negligible, but for larger arrays it can be very expensive.
The second method uses hash lookups, which are much faster, but first the arrays have to be converted to sets.
What you should do is try both methods using arrays of sizes you expect to see in your application and see which is faster. If they're about the same size you can just pick whichever one you think is clearer.
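A minimal harness for that experiment — a sketch, with sizes you would replace by ones representative of your application:

```ruby
require 'set'
require 'benchmark'

[10, 1_000, 100_000].each do |size|
  a1 = Array.new(size) { rand(size * 2) }
  a2 = Array.new(size) { rand(size * 2) }
  s2 = a2.to_set

  Benchmark.bm(14) do |bm|
    bm.report("loop #{size}") { a1.any? { |x| a2.include?(x) } }  # O(m*n) scan
    bm.report("set  #{size}") { a1.any? { |x| s2.include?(x) } }  # hash lookups
  end
end
```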
I just want to elaborate upon the excellent answers by steenslag and dbenhur. Specifically, I wanted to know whether SortedSet would perform any better. It initially surprised me that the Ruby Set type is not implemented as a sorted set, since I come from C++; the STL uses an ordered set by default, and you generally have to specify unordered_set if you don't want ordering.
I also wanted to know if the size of the set made a difference, as suggested in some other answers.
require 'set'
require 'benchmark'
f = 20 # or 10_000 for the large-collection run
ar1 = (1..(10*f)).to_a # 10*f elements
ar2 = ((5*f)..(15*f)).to_a # also about 10*f elements
set1 = ar1.to_set
set2 = ar2.to_set
sset1 = SortedSet.new(ar1)
sset2 = SortedSet.new(ar2)
n = 20000 # or 10 for the large-collection run
Benchmark.bm(10) do |testcase|
testcase.report('Array'){ n.times{ ar1 & ar2 } }
testcase.report('Set'){ n.times{ set1 & set2 } }
testcase.report('SortedSet') { n.times{ sset1 & sset2 } }
testcase.report('Set2'){ n.times{ ar1.select{ |element| set2.include? element } } }
testcase.report('Set2present'){ n.times{ ar1.any?{ |element| set2.include? element } } }
testcase.report('SortedSet2'){ n.times{ ar1.select{ |element| sset2.include? element } } }
testcase.report('SortedSet2present'){ n.times{ ar1.any?{ |element| sset2.include? element } } }
end
Here are the results for f=20; n=20000:
$ ruby set.rb
user system total real
Array 1.950000 0.010000 1.960000 ( 1.963030)
Set 3.330000 0.040000 3.370000 ( 3.374105)
SortedSet 3.810000 0.040000 3.850000 ( 3.860340)
Set2 1.410000 0.010000 1.420000 ( 1.427221)
Set2present 0.760000 0.000000 0.760000 ( 0.759447)
SortedSet2 1.420000 0.000000 1.420000 ( 1.446559)
SortedSet2present 0.770000 0.010000 0.780000 ( 0.770939)
And here are the results for f=10000; n=10:
$ ruby set.rb
user system total real
Array 0.910000 0.020000 0.930000 ( 0.939325)
Set 1.270000 0.010000 1.280000 ( 1.293581)
SortedSet 1.220000 0.010000 1.230000 ( 1.229650)
Set2 0.550000 0.000000 0.550000 ( 0.552708)
Set2present 0.290000 0.010000 0.300000 ( 0.291845)
SortedSet2 0.550000 0.000000 0.550000 ( 0.561049)
SortedSet2present 0.330000 0.000000 0.330000 ( 0.339950)
So, going by the numbers above: with the & operator, Set does better than SortedSet for small collections, while SortedSet edges slightly ahead of Set for large ones — and Array beats both in either case. The #any?-based variants are the quickest overall: Set2present and SortedSet2present are essentially tied on the small run, and Set2present is somewhat faster on the large one.
Whereas Set is implemented on top of Hash, SortedSet is backed by an RBTree (a red-black tree implemented in C) when that library is available. In both cases, & is implemented in Ruby rather than C.
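The Hash backing is easy to see in membership tests — a small sketch (timings will vary by machine, but Set#include? is an average O(1) hash lookup while Array#include? scans linearly):

```ruby
require 'set'
require 'benchmark'

ar = (1..100_000).to_a
st = ar.to_set  # backed by a Hash, so #include? is a hash lookup

Benchmark.bm(6) do |bm|
  bm.report('Array') { 1_000.times { ar.include?(99_999) } }  # linear scan
  bm.report('Set')   { 1_000.times { st.include?(99_999) } }  # hash lookup
end
```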
The truth is that with arrays this small, the two approaches will either perform essentially the same, or the arrays will be faster than the sets.
A decent set implementation will do set operations faster than the equivalent list operations, but there is some overhead. If you want to know what your implementation will do, build big sets/lists and test.