vectorized upper bound for segmented data in CUDA

2019-06-06 00:09发布

站内文章 / C++

24 0

姐就是有狂的资本

女 | 书童

私信

可以将文章内容翻译成中文,广告屏蔽插件可能会导致该功能失效(如失效，请关闭广告屏蔽插件后再试):

问题:

I have the following input data:

e = 0 0 0 0 0 0 | 1 1 1
t = 1 1 4 4 4 5 | 1 6 7
i = 0 1 2 3 4 5 | 6 7 8 // indices from [0,n-1]

The data is first sorted by e, then by t. e is the key which identifies segments in the data. In this case:

segment_0 = [0,5]
segment_1 = [6,8]

Each segment is again segmented by t. In this case:

sub_segment_0_0 = [0,1] // t==1
sub_segment_0_1 = [2,4] // t==4
sub_segment_0_2 = [5,5] // t==5

sub_segment_1_0 = [6,6] // t==1
sub_segment_1_1 = [7,7] // t==6
sub_segment_1_2 = [8,8] // t==7

I want to create the following output sequences:

f = 2 2 5 5 5 6 | 7 8 9
l = 6 6 6 6 6 6 | 9 9 9

f contains the start index of the next sub_segment within the current segment.

l contains (the end index of the last sub_segment within the current segment) + 1.

For the last sub_segment of each segment both values should point to its end index.

In order to calculate f, I tried using thrust::upper_bound, but this only works if I have just one sub_segment:

#include <thrust/host_vector.h>
#include <thrust/copy.h>
#include <thrust/binary_search.h>
#include <thrust/device_vector.h>  
#include <stdint.h>
#include <iostream>

#define PRINTER(name) print(#name, (name))
template <template <typename...> class V, typename T, typename ...Args>
void print(const char* name, const V<T,Args...> & v)
{
    std::cout << name << ":\t";
    thrust::copy(v.begin(), v.end(), std::ostream_iterator<T>(std::cout, "\t"));
    std::cout << std::endl;
}

int main()
{
    uint32_t e[] = {0,0,0,0,0,0};
    uint32_t t[] = {1,1,4,4,4,5};
    uint32_t i[] = {0,1,2,3,4,5};

    int size = sizeof(i)/sizeof(i[0]);
    typedef thrust::host_vector<uint32_t> HVec;
    typedef thrust::device_vector<uint32_t> DVec;
    HVec h_i(i,i+size);
    HVec h_e(e,e+size);
    HVec h_t(t,t+size);
    DVec d_i = h_i;
    DVec d_e = h_e;
    DVec d_t = h_t;
    PRINTER(d_e);
    PRINTER(d_t);
    PRINTER(d_i);

    DVec upper(size);
    thrust::upper_bound(d_t.begin(), d_t.end(), d_t.begin(), d_t.end(), upper.begin());
    PRINTER(upper);

    return 0;
}

output:

d_e:    0   0   0   0   0   0   
d_t:    1   1   4   4   4   5   
d_i:    0   1   2   3   4   5   
upper:  2   2   5   5   5   6

If I use the input data containing two sub_segments, it won't work anymore, since there is no thrust::upper_bound_by_key:

// replace in the code above
uint32_t e[] = {0,0,0,0,0,0,1,1,1};
uint32_t t[] = {1,1,4,4,4,5,1,6,7};
uint32_t i[] = {0,1,2,3,4,5,6,7,8};

output

d_e:    0   0   0   0   0   0   1   1   1   
d_t:    1   1   4   4   4   5   1   6   7   
d_i:    0   1   2   3   4   5   6   7   8   
upper:  2   2   7   7   7   7   2   8   9

How would a upper_bound_by_key be implemented for my data?

And how can I efficiently calculate l?

I am open to any solution, thrust is not a necessity.

回答1:

Here is one possible approach:

Mark the end of your (t-)segments. I assume that it's possible for an e-segment to have a single t-segment. If that's the case, then adjacent e-segments could have t-segments of the same numerical value (1 presumably). Therefore marking the end of segments needs to consider both e and t. I use a method basically like adjacent difference, except it considers both e and t using thrust::transform and shifted representations of e and t.
Determine the value that f will hold for each segment. Now that we know the end of each (t-)segment, we can simply pick the next value out of i (using copy_if, and the segment end markers as our stencil) as the f value for the preceding segment. To facilitate this, and since your i is just an index sequence, I create an i vector that is one element longer than what you have shown.
Create a numerically increasing index for each segment. This is just an exclusive scan on the vector created in step 1.
Use the index sequence created in step 3, to "scatter" the f segment values created int step 2, into our f result ("scatter" is done with thrust::copy and a permuation iterator).

Here's a worked example, borrowing from your code:

$ cat t835.cu
#include <thrust/host_vector.h>
#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <stdint.h>
#include <iostream>
#include <thrust/transform.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/sequence.h>
#include <thrust/scan.h>


using namespace thrust::placeholders;

struct my_semarker_func
{
template <typename T>
  __host__ __device__
  uint32_t operator()(const T &d1, const T &d2){
    if (thrust::get<0>(d1) != thrust::get<0>(d2)) return 1;
    if (thrust::get<1>(d1) != thrust::get<1>(d2)) return 1;
    return 0;}
};


#define PRINTER(name) print(#name, (name))
template <template <typename...> class V, typename T, typename ...Args>
void print(const char* name, const V<T,Args...> & v)
{
    std::cout << name << ":\t";
    thrust::copy(v.begin(), v.end(), std::ostream_iterator<T>(std::cout, "\t"));
    std::cout << std::endl;
}

int main()
{
    uint32_t e[] = {0,0,0,0,0,0,1,1,1};
    uint32_t t[] = {1,1,4,4,4,5,1,6,7};

    int size = sizeof(t)/sizeof(t[0]);
    typedef thrust::host_vector<uint32_t> HVec;
    typedef thrust::device_vector<uint32_t> DVec;
    HVec h_e(e,e+size);
    HVec h_t(t,t+size);
    DVec d_i(size+1);
    DVec d_e = h_e;
    DVec d_t = h_t;
    thrust::sequence(d_i.begin(), d_i.end());
    PRINTER(d_e);
    PRINTER(d_t);
    PRINTER(d_i);

// create segment end markers
    DVec d_s(size,1);
    thrust::transform(thrust::make_zip_iterator(thrust::make_tuple(d_e.begin(), d_t.begin())), thrust::make_zip_iterator(thrust::make_tuple(d_e.end()-1, d_t.end()-1)), thrust::make_zip_iterator(thrust::make_tuple(d_e.begin()+1, d_t.begin()+1)), d_s.begin(), my_semarker_func());
// create segment f values
    DVec d_g(size);
    thrust::copy_if(d_i.begin()+1, d_i.end(), d_s.begin(), d_g.begin(), _1 == 1);
// create segment indices
    DVec d_h(size);
    thrust::exclusive_scan(d_s.begin(), d_s.end(), d_h.begin());
// create f
    DVec d_f(size);
    thrust::copy_n(thrust::make_permutation_iterator(d_g.begin(), d_h.begin()), size, d_f.begin());
    PRINTER(d_f);

    return 0;
}
$ nvcc -std=c++11 -o t835 t835.cu
$ ./t835
d_e:    0       0       0       0       0       0       1       1       1
d_t:    1       1       4       4       4       5       1       6       7
d_i:    0       1       2       3       4       5       6       7       8       9
d_f:    2       2       5       5       5       6       7       8       9
$

A very similar sequence could be used to create the l vector.

回答2:

I found another way to do this.

In order to be able to use lower_bound, I needed to make sure that t is globally sorted. In order to do that, I first find out the starting points of each sub_segment using adjacent_difference. After that, scatter_if copies increasing numbers from a counting_iterator for each starting point of a subsegment. Finally, inclusive_scan spreads same values for each subsegment. I combined the two steps before the inclusive_scan into the custom functor my_scatter to achieve better kernel fusing.

Now upper_bound is applied to these globally increasing values to calculate f.

l can be calculated by applying upper_bound on e.

I am not sure how the efficiency of my approach compares to the approach presented by @RobertCrovella.

output:

d_e:    0   0   0   0   0   0   1   1   1   
d_t:    1   1   4   4   4   5   1   6   7   
d_i:    0   1   2   3   4   5   6   7   8   
norm_t: 0   0   2   2   2   7   13  20  28  
d_f:    2   2   5   5   5   6   7   8   9   
d_l:    6   6   6   6   6   6   9   9   9

#include <thrust/host_vector.h>
#include <thrust/copy.h>
#include <thrust/binary_search.h>
#include <thrust/device_vector.h>  
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/adjacent_difference.h>
#include <thrust/functional.h>
#include <stdint.h>
#include <iostream>
#include <thrust/scatter.h>
#include <thrust/scan.h>
#include <thrust/transform.h>

#define PRINTER(name) print(#name, (name))
template <template <typename...> class V, typename T, typename ...Args>
void print(const char* name, const V<T,Args...> & v)
{
    std::cout << name << ":\t";
    thrust::copy(v.begin(), v.end(), std::ostream_iterator<T>(std::cout, "\t"));
    std::cout << std::endl;
}

template <typename IteratorType, typename IndexType = uint32_t>
struct my_scatter : public thrust::unary_function<IndexType,IndexType>
{
    my_scatter(IteratorType first) : first(first)
    {
    }

   __host__ __device__
   IndexType operator()(const IndexType& i)
   {
      IndexType result = i;
      if (i > static_cast<IndexType>(0) && *(first+i) == *(first+i-static_cast<IndexType>(1)))
      { 
          result = static_cast<IndexType>(0);
      }
      return result;
   }

   IteratorType first;
};

template <typename IteratorType>
my_scatter<IteratorType> make_my_scatter(IteratorType first)
{
  return my_scatter<IteratorType>(first);
}

int main()
{
    uint32_t e[] = {0,0,0,0,0,0,1,1,1};
    uint32_t t[] = {1,1,4,4,4,5,1,6,7};
    uint32_t i[] = {0,1,2,3,4,5,6,7,8};

    int size = sizeof(i)/sizeof(i[0]);
    typedef thrust::host_vector<uint32_t> HVec;
    typedef thrust::device_vector<uint32_t> DVec;
    HVec h_i(i,i+size);
    HVec h_e(e,e+size);
    HVec h_t(t,t+size);
    DVec d_i = h_i;
    DVec d_e = h_e;
    DVec d_t = h_t;    
    PRINTER(d_e);
    PRINTER(d_t);
    PRINTER(d_i);

    DVec norm_t(size);

    auto my_scatter_op =  make_my_scatter(zip(d_e.begin(), d_t.begin()));
    auto ti_begin = thrust::make_transform_iterator(thrust::make_counting_iterator(0), my_scatter_op);
    auto ti_end = thrust::make_transform_iterator(thrust::make_counting_iterator(size), my_scatter_op);
    thrust::inclusive_scan(ti_begin, ti_end, norm_t.begin());
    PRINTER(norm_t);

    DVec d_f(size);
    thrust::upper_bound(norm_t.begin(), norm_t.end(), norm_t.begin(), norm_t.end(), d_f.begin());    
    PRINTER(d_f);

    DVec d_l(size);
    thrust::upper_bound(d_e.begin(), d_e.end(), d_e.begin(), d_e.end(), d_l.begin());    
    PRINTER(d_l);

    return 0;
}