Pairwise operation on segmented data in CUDA/thrus

2020-02-29 11:15发布

Suppose I have

  • a data array,
  • an array containing keys referencing entries in the data array and
  • a third array which contains an id for every key array entry

e.g.

DataType dataArray[5];
int keyArray[10] = {1, 2, 3, 1, 2, 2, 1, 1, 1, 1};
int ids[10]      = {0, 0, 0, 1, 2, 2, 2, 3, 3, 3};

How can I execute a custom operator ResultDataType fun(int key1, int key2, int id) pairwise for each segment of ids ignoring the case key1 == key2 using thrust?

In this example I'd like to execute and store the result of:

fun(1,2,0)
fun(1,3,0)
fun(2,3,0)
fun(2,1,2)

2条回答
混吃等死
2楼-- · 2020-02-29 11:27

I've found this solution to your question. I used a mix of cuda kernels and thrust primitives. I suppose that your operator has the commutative property on the keys arguments, that is

fun(key1, key2, id)  == fun(key2, key1, id)

My solution produce three output arrays (keys1, keys2 and ids) in two steps. In the first step it computes only the number of elements in the output arrays and in the second step it fill the arrays. Basically the algorithm runs two times: in the first time it "simulates" the writing and the second time it actually writes the output. This is a common pattern that I use when the output size depends on the input data.

Here are the steps:

  1. Sort the keys by segment. This step is the same of the algorithm of @Robert Crovella.
  2. count the number of pairs produced by each key.Each thread is associated a key. Each thread starts counting all the valid pairs from its base index to the end of the segment. The output of this step is in the vector d_cpairs
  3. compute the size of keys1, keys2 and ids and the offset of each pair int in those arrays.A sum-reduction on d_cpairs computes the size of the output arrays. An exlusive-scan operation, performed on d_cpairs, produces the positions of the pairs in the output arrays. The output of this step is in the vector d_addrs
  4. fill the keys1, keys2 and ids with the data. This step is exactly the same of step 3) but it actually writes the data in keys1, keys2 and ids.

Here is the output of my algorithm:

    d_keys: 1 2 3 1 1 2 2 1 1 1 
    d_segs: 0 0 0 1 2 2 2 3 3 3 
    d_cpairs: 2 1 0 0 1 0 0 0 0 0 
    num_pairs: 4
    d_addrs: 0 2 3 3 3 4 4 4 4 4 
    keys1: 1 1 2 1 
    keys2: 2 3 3 2 
    ids: 0 0 0 2

Note that the last column in the output is not exactly like you want but if the commutative property hold it's fine.

Here is my complete code. Probably this is not the fastest solution but I think it's simple.

#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/scan.h>
#include <thrust/sort.h>

#define SHOW_VECTOR(V,size)  std::cout << #V << ": "; for(int i =0 ; i < size; i++ ){ std::cout << V[i] << " "; } std::cout << std::endl;
#define RAW_CAST(V) thrust::raw_pointer_cast(V.data())

__global__ 
void count_kernel(const int *d_keys,const int *d_segs,int *d_cpairs, int siz){
    int tidx = threadIdx.x+ blockIdx.x*blockDim.x;

    if(tidx < siz){
        int sum = 0;
        int i = tidx+1; 
        while(d_segs[i] == d_segs[tidx]){        
            if(d_keys[i] != d_keys[tidx] &&
               d_keys[i] != d_keys[i-1]){
                sum++;
            }
            i++;
        }
        d_cpairs[tidx] = sum;
    }
}

__global__ 
void scatter_kernel(const int *d_keys,
                    const int *d_segs,
                    const int *d_addrs, 
                    int *d_keys1,
                    int *d_keys2,
                    int *d_ids,                           
                    int siz){
    int tidx = threadIdx.x+ blockIdx.x*blockDim.x;

    if(tidx < siz){
        int base_address = d_addrs[tidx];
        int j =0;
        int i = tidx+1; 
        while(d_segs[i] == d_segs[tidx]){        
            if(d_keys[i] != d_keys[tidx] &&
               d_keys[i] != d_keys[i-1]){

               d_keys1[base_address+j] = d_keys[tidx];
               d_keys2[base_address+j] = d_keys[i];
               d_ids[base_address+j] = d_segs[i];                      
               j++;
            }
            i++;
        }
    }
}

int main(){

    int keyArray[] = {1, 2, 3, 1, 2, 2, 1, 1, 1, 1};
    int idsArray[]      = {0, 0, 0, 1, 2, 2, 2, 3, 3, 3};


    int sz1 = sizeof(keyArray)/sizeof(keyArray[0]);

    thrust::host_vector<int> h_keys(keyArray, keyArray+sz1);
    thrust::host_vector<int> h_segs(idsArray, idsArray+sz1);
    thrust::device_vector<int> d_keys = h_keys;
    thrust::device_vector<int> d_segs = h_segs;
    thrust::device_vector<int> d_cpairs(sz1);
    thrust::device_vector<int> d_addrs(sz1);

    //sort each segment to group like keys together
    thrust::stable_sort_by_key(d_keys.begin(), d_keys.end(), d_segs.begin());
    thrust::stable_sort_by_key(d_segs.begin(), d_segs.end(), d_keys.begin());

    SHOW_VECTOR(d_keys,sz1);
    SHOW_VECTOR(d_segs,sz1);

    //count the number of pairs produced by each key
    count_kernel<<<1,sz1>>>(RAW_CAST(d_keys),RAW_CAST(d_segs),RAW_CAST(d_cpairs),sz1);

    SHOW_VECTOR(d_cpairs,sz1);

    //determine the total number of pairs
    int num_pairs  = thrust::reduce(d_cpairs.begin(),d_cpairs.end());

    std::cout << "num_pairs: " << num_pairs << std::endl;
    //compute the addresses     
    thrust::exclusive_scan(d_cpairs.begin(),d_cpairs.end(),d_addrs.begin());


    thrust::device_vector<int> keys1(num_pairs);
    thrust::device_vector<int> keys2(num_pairs);
    thrust::device_vector<int> ids(num_pairs);

    SHOW_VECTOR(d_addrs,sz1);   

    //fill the vector with the keys and ids
    scatter_kernel<<<1,sz1>>>(RAW_CAST(d_keys),
                              RAW_CAST(d_segs),
                              RAW_CAST(d_addrs),
                              RAW_CAST(keys1),
                              RAW_CAST(keys2),
                              RAW_CAST(ids),                              
                              sz1);

    SHOW_VECTOR(keys1,num_pairs);
    SHOW_VECTOR(keys2,num_pairs);
    SHOW_VECTOR(ids,num_pairs);


    return 0;
}
查看更多
来,给爷笑一个
3楼-- · 2020-02-29 11:47

This turned out to be more involved than I initially thought. My interpretation is that you essentially want to compute all combinations of N things taken two at a time, without repetition, per segment (N is different for each segment). I'll present an answer that I think covers most of the concepts you might want to consider. It's just one possible solution method, of course.

This may not arrive at precisely the solution you want (although I think it's pretty close). My purpose is not to give you a finished code, but to demonstrate a possible algorithmic approach.

Here's a walkthrough of the algorithm:

  1. Sort the keys by segment. This step isn't necessary if you can guarantee that like keys (within a segment) are grouped together. (Your data as presented wouldn't require this step.) We can use back-to-back stable_sort_by_key to accomplish this. Output:

    d_keys:
    1,2,3,1,1,2,2,1,1,1,
    d_segs:
    0,0,0,1,2,2,2,3,3,3,
    
  2. Reduce the data set to just the unique keys, per segment (remove duplicated keys.) We can do this on the keys and segments together with thrust::unique_copy, and an appropriate functor to define equality of keys within a segment only. Output:

    d_ukeys:
    1,2,3,1,1,2,1,
    d_usegs:
    0,0,0,1,2,2,3,
    
  3. Compute the length of each segment now that duplicate keys are removed. We can do this with thrust::reduce_by_key and a constant_iterator. Output:

    d_seg_lens:
    3,1,2,1,
    
  4. Now that we know each segment length, we want to remove any segment (plus it's associate keys) for which the segment length is 1. These segments don't have any key pairs possible. To facilitate this, we also need to compute the starting index of each segment and create a stencil that identifies the segments whose length is one, then we can use remove_if and scatter_if to remove the associated segment and key data. Output:

    d_seg_idxs:
    0,3,4,6,
    d_stencil:
    0,0,0,1,0,0,1,
    

    and the reduced data:

    d_ukeys:
    1,2,3,1,2,
    d_usegs:
    0,0,0,2,2,
    d_seg_lens:
    3,2,
    
  5. Our goal at this point will be to create a vector of appropriate length so that it can hold one entry for each combination of N things taken two at a time, where N is the length of each segment. So for the first segment above, it has 3 items, so the combination of 3 things taken 2 at a time requires storage for 3 combinations total. Likewise the second segment above of length 2 has only one unique combination between the 2 keys, therefore storage is needed for only one combination for that segment. We can compute this storage length using the standard combinations formula reduced to combinations of N things taken 2 at a time:

      C = (N)*(N-1)/2
    

    we'll use this formula, passed to thrust::transform_reduce along with our segment lengths, to compute the overall storage length needed. (4 combinations total, for this example).

  6. Once we have the overall length determined, and the necessary vectors of that length allocated, we need to generate the actual combinations that belong to each segment. This again is a multi-step sequence that begins with generating flags to mark (delineate) each "segment" within these vectors. With the flag array, we can use exclusive_scan_by_key to generate an ordinal sequence identifying the combination that belongs in each position, per segment. Output:

    d_flags:
    1,0,0,1,
    example ordinal sequence:
    0,1,2,0
    
  7. With the ordinal sequence per segment, we now need to generate the actual unique combinations, per segment. This step required some consideration to try and come up with an algorithm to accomplish this in a fixed time (ie. without iteration). The algorithm I came up with is to map each ordinal sequence to a matrix, so for a segment with 4 combinations (which would have 4 things taken 2 at a time, so 6 total combinations, larger than any in this example):

        1   2   3 
    
     0  C1  C2  C3 
     1  C4  C5  C6
     2
    

    We then "re-map" any combinations (Cx above) that are below the main diagonal to alternate locations in the matrix, like this:

        1   2   3 
    
     0  C1  C2  C3  
     1      C5  C6
     2          C4
    

    we can then read off unique combination pairs as the row and column indices of the above specially-crafted matrix. (effectively a linear-to-triangular mapping) So C1 corresponds to the combination of logical key 0 and logical key 1. C6 corresponds to the combination of logical key 1 and logical key 3. This matrix assembly and mapping to row and column "indices" to produce unique combinations is handled by the comb_n functor passed to thrust::for_each_n, which is also receiving the segment lengths and the input segment ordinal sequences, and is generating "logical" keys 1 and 2 as output:

    d_lkey1:
    1,2,2,1,
    d_lkey2:
    0,0,1,0,
    

    (added note: I believe the approach I am describing here is comparable to what is discussed in this question/answer, which I stumbled upon recently, although they appear to be different at first glance.)

  8. The last transformation step is to convert our logical keys and segments into "actual" keys and segments. Output:

    d_key1:
    2,3,3,2,
    d_key2:
    1,1,2,1,
    d_aseg:
    0,0,0,2,
    

Here's a complete code that implements the above. It may not do precisely what you want, but I think it's pretty close to what you described, and hopefully will be useful for ideas if you want to do this with thrust:

#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/copy.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/reduce.h>
#include <thrust/sort.h>
#include <thrust/unique.h>
#include <thrust/transform_reduce.h>
#include <thrust/functional.h>
#include <thrust/scan.h>
#include <thrust/scatter.h>
#include <thrust/for_each.h>
#include <thrust/remove.h>

#include <iostream>

#define MY_DISPLAY


typedef float DataType;

// user supplied functor

struct fun
{

  DataType *dptr;
  fun(DataType *_dptr) : dptr(_dptr) {};

  template <typename T>
  __host__ __device__
  void operator()(const T d1)
  {
    int key1 = thrust::get<0>(d1);
    int key2 = thrust::get<1>(d1);
    int id   = thrust::get<2>(d1);
    DataType my_val = dptr[key1]*100+dptr[key2]*10+id;
    printf("result: %f\n", my_val);
  }
};

// for determining "equality" of keys
struct my_unique_eq
{

  template <typename T>
  __host__ __device__
  bool operator()(const T d1, const T d2) const
  {
    return ((thrust::get<0>(d1) == thrust::get<0>(d2))&&(thrust::get<1>(d1) == thrust::get<1>(d2)));
  }
};

// BinaryPredicate for the head flag segment representation 
// equivalent to thrust::not2(thrust::project2nd<int,int>())); 
// from: https://github.com/thrust/thrust/blob/master/examples/scan_by_key.cu

template <typename HeadFlagType> 
struct head_flag_predicate  
    : public thrust::binary_function<HeadFlagType,HeadFlagType,bool> 
{ 
    __host__ __device__ 
    bool operator()(HeadFlagType left, HeadFlagType right) const 
    { 
        return !right; 
    } 
};

// find nth combination of c things taken 2 at a time
struct comb_n
{

  template <typename T>
  __host__ __device__
  void operator()(const T &d1)
  {  // item c from n choose 2
    int c = thrust::get<0>(d1);
    int n = thrust::get<1>(d1);
    int row = c/(n-1);
    int col = c%(n-1);

    if (col < row) {
      col = (n-2)-col;
      row = (n-1)-row;
    }

    thrust::get<2>(d1) = col+1; // lkey1
    thrust::get<3>(d1) = row;   // lkey2
  }
};


using namespace thrust::placeholders;

typedef thrust::pair<thrust::device_vector<int>::iterator, thrust::device_vector<int>::iterator > mip;

typedef thrust::zip_iterator< thrust::tuple<thrust::device_vector<int>::iterator, thrust::device_vector<int>::iterator> > mzi;


int main(){

  // set up sample data 

  DataType dataArray[] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f};
  int keyArray[] = {1, 2, 3, 1, 2, 2, 1, 1, 1, 1};
  int ids[]      = {0, 0, 0, 1, 2, 2, 2, 3, 3, 3};

  int sz0 = sizeof(dataArray)/sizeof(dataArray[0]);
  int sz1 = sizeof(keyArray)/sizeof(keyArray[0]);

  thrust::host_vector<int> h_keys(keyArray, keyArray+sz1);
  thrust::host_vector<int> h_segs(ids, ids+sz1);
  thrust::device_vector<int> d_keys = h_keys;
  thrust::device_vector<int> d_segs = h_segs;
  thrust::host_vector<DataType> h_data(dataArray, dataArray+sz0);
  thrust::device_vector<DataType> d_data = h_data;

  //sort each segment to group like keys together
  thrust::stable_sort_by_key(d_keys.begin(), d_keys.end(), d_segs.begin());
  thrust::stable_sort_by_key(d_segs.begin(), d_segs.end(), d_keys.begin());

#ifdef MY_DISPLAY
  std::cout << "d_keys:" << std::endl;
  thrust::copy(d_keys.begin(), d_keys.end(), std::ostream_iterator<int>(std::cout, ","));
  std::cout << std::endl << "d_segs:" << std::endl;
  thrust::copy(d_segs.begin(), d_segs.end(), std::ostream_iterator<int>(std::cout, ","));
  std::cout << std::endl;
#endif

  //now reduce the data set to just unique keys
  thrust::device_vector<int> d_ukeys(sz1);
  thrust::device_vector<int> d_usegs(sz1);
  mzi ip1 = thrust::unique_copy(thrust::make_zip_iterator(thrust::make_tuple(d_keys.begin(), d_segs.begin())), thrust::make_zip_iterator(thrust::make_tuple(d_keys.end(), d_segs.end())), thrust::make_zip_iterator(thrust::make_tuple(d_ukeys.begin(), d_usegs.begin())), my_unique_eq());
  int sz2 = ip1  - thrust::make_zip_iterator(thrust::make_tuple(d_ukeys.begin(), d_usegs.begin()));
  thrust::device_vector<int> d_seg_lens(sz2);
  d_ukeys.resize(sz2);
  d_usegs.resize(sz2);
#ifdef MY_DISPLAY
  std::cout << "d_ukeys:" << std::endl;
  thrust::copy(d_ukeys.begin(), d_ukeys.end(), std::ostream_iterator<int>(std::cout, ","));
  std::cout << std::endl << "d_usegs:" << std::endl;
  thrust::copy(d_usegs.begin(), d_usegs.end(), std::ostream_iterator<int>(std::cout, ","));
  std::cout << std::endl;
#endif

  thrust::device_vector<int> d_seg_nums(sz2);
  // compute the length of each segment
  mip ip2 = thrust::reduce_by_key(d_usegs.begin(), d_usegs.end(), thrust::make_constant_iterator(1), d_seg_nums.begin(), d_seg_lens.begin());
  int sz3 = thrust::get<1>(ip2) - d_seg_lens.begin();
  d_seg_nums.resize(sz3);
  d_seg_lens.resize(sz3);
  thrust::device_vector<int> d_seg_idxs(sz3);
  // remove segments that have only one unique key
    // compute start index of each segment
  thrust::exclusive_scan(d_seg_lens.begin(), d_seg_lens.end(), d_seg_idxs.begin());
#ifdef MY_DISPLAY
  std::cout << "d_seg_lens:" << std::endl;
  thrust::copy(d_seg_lens.begin(), d_seg_lens.end(), std::ostream_iterator<int>(std::cout, ","));
  std::cout << std::endl;
  std::cout << "d_seg_idxs:" << std::endl;
  thrust::copy(d_seg_idxs.begin(), d_seg_idxs.end(), std::ostream_iterator<int>(std::cout, ","));
  std::cout << std::endl;
#endif
    // remove if the length of segment is 1
  thrust::device_vector<int> d_stencil(d_usegs.size());
  thrust::scatter_if(thrust::make_constant_iterator(1), thrust::make_constant_iterator(1)+d_seg_lens.size(), d_seg_idxs.begin(), d_seg_lens.begin(), d_stencil.begin(), (_1 == 1));
#ifdef MY_DISPLAY
  std::cout << "d_stencil:" << std::endl;
  thrust::copy(d_stencil.begin(), d_stencil.end(), std::ostream_iterator<int>(std::cout, ","));
  std::cout << std::endl;
#endif 
  thrust::remove_if(d_usegs.begin(), d_usegs.end(),  d_stencil.begin(), (_1 == 1));
  thrust::remove_if(d_ukeys.begin(), d_ukeys.end(),  d_stencil.begin(), (_1 == 1));
  int removals = thrust::remove_if(d_seg_lens.begin(), d_seg_lens.end(),(_1 == 1)) - d_seg_lens.begin();
  d_usegs.resize(d_usegs.size()-removals);
  d_ukeys.resize(d_ukeys.size()-removals);
  d_seg_lens.resize(d_seg_lens.size()-removals);
  // compute storage length needed (sum of combinations in each segment)
  int sz4 = thrust::transform_reduce(d_seg_lens.begin(), d_seg_lens.end(), ((_1)*(_1 - 1))/2, 0, thrust::plus<int>());

#ifdef MY_DISPLAY
  std::cout << "d_ukeys:" << std::endl;
  thrust::copy(d_ukeys.begin(), d_ukeys.end(), std::ostream_iterator<int>(std::cout, ","));
  std::cout << std::endl << "d_usegs:" << std::endl;
  thrust::copy(d_usegs.begin(), d_usegs.end(), std::ostream_iterator<int>(std::cout, ","));
  std::cout << std::endl;
  std::cout << "d_seg_lens:" << std::endl;
  thrust::copy(d_seg_lens.begin(), d_seg_lens.end(), std::ostream_iterator<int>(std::cout, ","));
  std::cout << std::endl;
#endif

  // now create intra-segment index sequence
    // first create flag array to mark segments
  thrust::device_vector<int> d_flags(sz4);
    // compute flag indexes
  thrust::device_vector<int> d_flag_idxs(sz3);
  thrust::exclusive_scan(d_seg_lens.begin(), d_seg_lens.end(), d_flag_idxs.begin());
    // scatter flags
  thrust::scatter(thrust::make_constant_iterator(1), thrust::make_constant_iterator(1)+sz3, d_flag_idxs.begin(), d_flags.begin());
#ifdef MY_DISPLAY
  std::cout << "d_flags:" << std::endl;
  thrust::copy(d_flags.begin(), d_flags.end(), std::ostream_iterator<int>(std::cout, ","));
  std::cout << std::endl;
#endif

    // use flag array to create intra-segment index sequence
  thrust::device_vector<int> d_seg_i_idxs(d_flags.size());
  thrust::exclusive_scan_by_key(d_flags.begin(), d_flags.end(), thrust::make_constant_iterator(1), d_seg_i_idxs.begin(), 0, head_flag_predicate<int>());

  // convert flags to segment references
  thrust::inclusive_scan(d_flags.begin(), d_flags.end(), d_flags.begin());
  thrust::transform(d_flags.begin(), d_flags.end(), d_flags.begin(), (_1 - 1));

  // for_each - run functor that uses intra-segment index sequence plus segment
  // index to create logical combinations
  // input required:
  //  -- logical segment number
  //  -- intra-segment index
  //  -- segment size
  // output:
  //  -- logical key1
  //  -- logical key2

  thrust::device_vector<int> d_lkey1(sz4);
  thrust::device_vector<int> d_lkey2(sz4);
  thrust::for_each_n(thrust::make_zip_iterator(thrust::make_tuple(d_seg_i_idxs.begin(), thrust::make_permutation_iterator(d_seg_lens.begin(), d_flags.begin()), d_lkey1.begin(), d_lkey2.begin())), sz4, comb_n());  
#ifdef MY_DISPLAY
  std::cout << "d_lkey1:" << std::endl;
  thrust::copy(d_lkey1.begin(), d_lkey1.end(), std::ostream_iterator<int>(std::cout, ","));
  std::cout << std::endl << "d_lkey2:" << std::endl;
  thrust::copy(d_lkey2.begin(), d_lkey2.end(), std::ostream_iterator<int>(std::cout, ","));
  std::cout << std::endl;
#endif

  // convert logical keys and logical segments to actual keys/segments
  thrust::device_vector<int> d_key1(sz4);
  thrust::device_vector<int> d_key2(sz4);
  thrust::device_vector<int> d_aseg(sz4);
  thrust::copy_n(thrust::make_permutation_iterator(d_seg_idxs.begin(), d_flags.begin()), sz4, d_aseg.begin());
  thrust::transform(d_lkey1.begin(), d_lkey1.end(), thrust::make_permutation_iterator(d_seg_idxs.begin(), d_flags.begin()), d_lkey1.begin(), thrust::plus<int>());
  thrust::transform(d_lkey2.begin(), d_lkey2.end(), thrust::make_permutation_iterator(d_seg_idxs.begin(), d_flags.begin()), d_lkey2.begin(), thrust::plus<int>());
  thrust::copy_n(thrust::make_permutation_iterator(d_ukeys.begin(), d_lkey1.begin()), sz4, d_key1.begin());
  thrust::copy_n(thrust::make_permutation_iterator(d_ukeys.begin(), d_lkey2.begin()), sz4, d_key2.begin());
#ifdef MY_DISPLAY
  std::cout << "d_lkey1:" << std::endl;
  thrust::copy(d_lkey1.begin(), d_lkey1.end(), std::ostream_iterator<int>(std::cout, ","));
  std::cout << std::endl << "d_lkey2:" << std::endl;
  thrust::copy(d_lkey2.begin(), d_lkey2.end(), std::ostream_iterator<int>(std::cout, ","));
  std::cout << std::endl;
#endif
#ifdef MY_DISPLAY
  std::cout << "d_key1:" << std::endl;
  thrust::copy(d_key1.begin(), d_key1.end(), std::ostream_iterator<int>(std::cout, ","));
  std::cout << std::endl << "d_key2:" << std::endl;
  thrust::copy(d_key2.begin(), d_key2.end(), std::ostream_iterator<int>(std::cout, ","));
  std::cout << std::endl;
#endif
  // d_flags has already been converted to logical segment sequence
  // just need to convert these via lookup to the actual segments
  thrust::unique(d_usegs.begin(), d_usegs.end());
  thrust::copy_n(thrust::make_permutation_iterator(d_usegs.begin(), d_flags.begin()), sz4, d_aseg.begin());
#ifdef MY_DISPLAY
  std::cout << "d_aseg:" << std::endl;
  thrust::copy(d_aseg.begin(), d_aseg.end(), std::ostream_iterator<int>(std::cout, ","));
  std::cout << std::endl;
#endif
  // run desired user function
  thrust::for_each_n(thrust::make_zip_iterator(thrust::make_tuple(d_key1.begin(), d_key2.begin(), d_aseg.begin())), sz4, fun(thrust::raw_pointer_cast(d_data.data())));
  std::cout << "Finished." << std::endl;
  return 0;
}  
查看更多
登录 后发表回答