vectorized upper bound for segmented data in CUDA

2019-06-06 00:09发布

问题:

I have the following input data:

e = 0 0 0 0 0 0 | 1 1 1
t = 1 1 4 4 4 5 | 1 6 7
i = 0 1 2 3 4 5 | 6 7 8 // indices from [0,n-1]

The data is first sorted by e, then by t. e is the key which identifies segments in the data. In this case:

segment_0 = [0,5]
segment_1 = [6,8]

Each segment is again segmented by t. In this case:

sub_segment_0_0 = [0,1] // t==1
sub_segment_0_1 = [2,4] // t==4
sub_segment_0_2 = [5,5] // t==5

sub_segment_1_0 = [6,6] // t==1
sub_segment_1_1 = [7,7] // t==6
sub_segment_1_2 = [8,8] // t==7

I want to create the following output sequences:

f = 2 2 5 5 5 6 | 7 8 9
l = 6 6 6 6 6 6 | 9 9 9

f contains the start index of the next sub_segment within the current segment.

l contains (the end index of the last sub_segment within the current segment) + 1.

For the last sub_segment of each segment both values should point to its end index.

In order to calculate f, I tried using thrust::upper_bound, but this only works if I have just one sub_segment:

#include <thrust/host_vector.h>
#include <thrust/copy.h>
#include <thrust/binary_search.h>
#include <thrust/device_vector.h>  
#include <stdint.h>
#include <iostream>

#define PRINTER(name) print(#name, (name))
template <template <typename...> class V, typename T, typename ...Args>
void print(const char* name, const V<T,Args...> & v)
{
    std::cout << name << ":\t";
    thrust::copy(v.begin(), v.end(), std::ostream_iterator<T>(std::cout, "\t"));
    std::cout << std::endl;
}

int main()
{
    uint32_t e[] = {0,0,0,0,0,0};
    uint32_t t[] = {1,1,4,4,4,5};
    uint32_t i[] = {0,1,2,3,4,5};

    int size = sizeof(i)/sizeof(i[0]);
    typedef thrust::host_vector<uint32_t> HVec;
    typedef thrust::device_vector<uint32_t> DVec;
    HVec h_i(i,i+size);
    HVec h_e(e,e+size);
    HVec h_t(t,t+size);
    DVec d_i = h_i;
    DVec d_e = h_e;
    DVec d_t = h_t;
    PRINTER(d_e);
    PRINTER(d_t);
    PRINTER(d_i);

    DVec upper(size);
    thrust::upper_bound(d_t.begin(), d_t.end(), d_t.begin(), d_t.end(), upper.begin());
    PRINTER(upper);

    return 0;
}

output:

d_e:    0   0   0   0   0   0   
d_t:    1   1   4   4   4   5   
d_i:    0   1   2   3   4   5   
upper:  2   2   5   5   5   6

If I use the input data containing two sub_segments, it won't work anymore, since there is no thrust::upper_bound_by_key:

// replace in the code above
uint32_t e[] = {0,0,0,0,0,0,1,1,1};
uint32_t t[] = {1,1,4,4,4,5,1,6,7};
uint32_t i[] = {0,1,2,3,4,5,6,7,8};

output

d_e:    0   0   0   0   0   0   1   1   1   
d_t:    1   1   4   4   4   5   1   6   7   
d_i:    0   1   2   3   4   5   6   7   8   
upper:  2   2   7   7   7   7   2   8   9   

How would a upper_bound_by_key be implemented for my data?

And how can I efficiently calculate l?

I am open to any solution, thrust is not a necessity.

回答1:

Here is one possible approach:

  1. Mark the end of your (t-)segments. I assume that it's possible for an e-segment to have a single t-segment. If that's the case, then adjacent e-segments could have t-segments of the same numerical value (1 presumably). Therefore marking the end of segments needs to consider both e and t. I use a method basically like adjacent difference, except it considers both e and t using thrust::transform and shifted representations of e and t.

  2. Determine the value that f will hold for each segment. Now that we know the end of each (t-)segment, we can simply pick the next value out of i (using copy_if, and the segment end markers as our stencil) as the f value for the preceding segment. To facilitate this, and since your i is just an index sequence, I create an i vector that is one element longer than what you have shown.

  3. Create a numerically increasing index for each segment. This is just an exclusive scan on the vector created in step 1.

  4. Use the index sequence created in step 3, to "scatter" the f segment values created int step 2, into our f result ("scatter" is done with thrust::copy and a permuation iterator).

Here's a worked example, borrowing from your code:

$ cat t835.cu
#include <thrust/host_vector.h>
#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <stdint.h>
#include <iostream>
#include <thrust/transform.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/sequence.h>
#include <thrust/scan.h>


using namespace thrust::placeholders;

struct my_semarker_func
{
template <typename T>
  __host__ __device__
  uint32_t operator()(const T &d1, const T &d2){
    if (thrust::get<0>(d1) != thrust::get<0>(d2)) return 1;
    if (thrust::get<1>(d1) != thrust::get<1>(d2)) return 1;
    return 0;}
};


#define PRINTER(name) print(#name, (name))
template <template <typename...> class V, typename T, typename ...Args>
void print(const char* name, const V<T,Args...> & v)
{
    std::cout << name << ":\t";
    thrust::copy(v.begin(), v.end(), std::ostream_iterator<T>(std::cout, "\t"));
    std::cout << std::endl;
}

int main()
{
    uint32_t e[] = {0,0,0,0,0,0,1,1,1};
    uint32_t t[] = {1,1,4,4,4,5,1,6,7};

    int size = sizeof(t)/sizeof(t[0]);
    typedef thrust::host_vector<uint32_t> HVec;
    typedef thrust::device_vector<uint32_t> DVec;
    HVec h_e(e,e+size);
    HVec h_t(t,t+size);
    DVec d_i(size+1);
    DVec d_e = h_e;
    DVec d_t = h_t;
    thrust::sequence(d_i.begin(), d_i.end());
    PRINTER(d_e);
    PRINTER(d_t);
    PRINTER(d_i);

// create segment end markers
    DVec d_s(size,1);
    thrust::transform(thrust::make_zip_iterator(thrust::make_tuple(d_e.begin(), d_t.begin())), thrust::make_zip_iterator(thrust::make_tuple(d_e.end()-1, d_t.end()-1)), thrust::make_zip_iterator(thrust::make_tuple(d_e.begin()+1, d_t.begin()+1)), d_s.begin(), my_semarker_func());
// create segment f values
    DVec d_g(size);
    thrust::copy_if(d_i.begin()+1, d_i.end(), d_s.begin(), d_g.begin(), _1 == 1);
// create segment indices
    DVec d_h(size);
    thrust::exclusive_scan(d_s.begin(), d_s.end(), d_h.begin());
// create f
    DVec d_f(size);
    thrust::copy_n(thrust::make_permutation_iterator(d_g.begin(), d_h.begin()), size, d_f.begin());
    PRINTER(d_f);

    return 0;
}
$ nvcc -std=c++11 -o t835 t835.cu
$ ./t835
d_e:    0       0       0       0       0       0       1       1       1
d_t:    1       1       4       4       4       5       1       6       7
d_i:    0       1       2       3       4       5       6       7       8       9
d_f:    2       2       5       5       5       6       7       8       9
$

A very similar sequence could be used to create the l vector.



回答2:

I found another way to do this.

In order to be able to use lower_bound, I needed to make sure that t is globally sorted. In order to do that, I first find out the starting points of each sub_segment using adjacent_difference. After that, scatter_if copies increasing numbers from a counting_iterator for each starting point of a subsegment. Finally, inclusive_scan spreads same values for each subsegment. I combined the two steps before the inclusive_scan into the custom functor my_scatter to achieve better kernel fusing.

Now upper_bound is applied to these globally increasing values to calculate f.

l can be calculated by applying upper_bound on e.

I am not sure how the efficiency of my approach compares to the approach presented by @RobertCrovella.


output:

d_e:    0   0   0   0   0   0   1   1   1   
d_t:    1   1   4   4   4   5   1   6   7   
d_i:    0   1   2   3   4   5   6   7   8   
norm_t: 0   0   2   2   2   7   13  20  28  
d_f:    2   2   5   5   5   6   7   8   9   
d_l:    6   6   6   6   6   6   9   9   9

#include <thrust/host_vector.h>
#include <thrust/copy.h>
#include <thrust/binary_search.h>
#include <thrust/device_vector.h>  
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/adjacent_difference.h>
#include <thrust/functional.h>
#include <stdint.h>
#include <iostream>
#include <thrust/scatter.h>
#include <thrust/scan.h>
#include <thrust/transform.h>

#define PRINTER(name) print(#name, (name))
template <template <typename...> class V, typename T, typename ...Args>
void print(const char* name, const V<T,Args...> & v)
{
    std::cout << name << ":\t";
    thrust::copy(v.begin(), v.end(), std::ostream_iterator<T>(std::cout, "\t"));
    std::cout << std::endl;
}

template <typename IteratorType, typename IndexType = uint32_t>
struct my_scatter : public thrust::unary_function<IndexType,IndexType>
{
    my_scatter(IteratorType first) : first(first)
    {
    }

   __host__ __device__
   IndexType operator()(const IndexType& i)
   {
      IndexType result = i;
      if (i > static_cast<IndexType>(0) && *(first+i) == *(first+i-static_cast<IndexType>(1)))
      { 
          result = static_cast<IndexType>(0);
      }
      return result;
   }

   IteratorType first;
};

template <typename IteratorType>
my_scatter<IteratorType> make_my_scatter(IteratorType first)
{
  return my_scatter<IteratorType>(first);
}

int main()
{
    uint32_t e[] = {0,0,0,0,0,0,1,1,1};
    uint32_t t[] = {1,1,4,4,4,5,1,6,7};
    uint32_t i[] = {0,1,2,3,4,5,6,7,8};

    int size = sizeof(i)/sizeof(i[0]);
    typedef thrust::host_vector<uint32_t> HVec;
    typedef thrust::device_vector<uint32_t> DVec;
    HVec h_i(i,i+size);
    HVec h_e(e,e+size);
    HVec h_t(t,t+size);
    DVec d_i = h_i;
    DVec d_e = h_e;
    DVec d_t = h_t;    
    PRINTER(d_e);
    PRINTER(d_t);
    PRINTER(d_i);

    DVec norm_t(size);

    auto my_scatter_op =  make_my_scatter(zip(d_e.begin(), d_t.begin()));
    auto ti_begin = thrust::make_transform_iterator(thrust::make_counting_iterator(0), my_scatter_op);
    auto ti_end = thrust::make_transform_iterator(thrust::make_counting_iterator(size), my_scatter_op);
    thrust::inclusive_scan(ti_begin, ti_end, norm_t.begin());
    PRINTER(norm_t);

    DVec d_f(size);
    thrust::upper_bound(norm_t.begin(), norm_t.end(), norm_t.begin(), norm_t.end(), d_f.begin());    
    PRINTER(d_f);

    DVec d_l(size);
    thrust::upper_bound(d_e.begin(), d_e.end(), d_e.begin(), d_e.end(), d_l.begin());    
    PRINTER(d_l);

    return 0;
}


标签: c++ cuda thrust