How to force a functor to see an entire thrust::vector so that sorting is possible?

2019-06-04 04:01发布

问题:

I'm new to CUDA and having a little trouble with functors. I am trying to input a thrust::vector of thrust::vectors into a functor. Currently I can enter a vector and do something to each element and return the modified vector using thrust::for_each, but if I were to want to sort a vector in a functor I would need to be able to input the whole vector at once so the functor can act on it as a whole. Is there a way to do this?

The code below compiles, but does not return the vector sorted.

#include <iostream>
#include <fstream>
#include <sstream>
#include <string>
#include <vector>
#include <iterator>
#include <stdlib.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <thrust/functional.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/reduce.h>
#include <thrust/transform_reduce.h>
#include <thrust/transform.h>
#include <thrust/sort.h>
#include <thrust/execution_policy.h>
#include <thrust/system/cuda/execution_policy.h>
#include <thrust/tuple.h>
#include <thrust/count.h>
#include <thrust/sequence.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/for_each.h>
#include <ctime>
#include <cstdio>
#include <cassert> 

using namespace std;
// Functor from the question: intended to sort an "input" vector and store the
// sorted copy into an "output" vector, with both passed together in a tuple.
//
// The template parameter T is not referenced anywhere in the body.
//
// NOTE(review): the caller zips two iterators over device_vector<float>, so
// thrust::for_each hands this functor one (float, float) element pair per
// call rather than two whole vectors -- confirm against the call site.
template<typename T>
struct sort_vector
{
    // The tuple (and therefore both vectors inside it) is received BY VALUE,
    // so everything below operates on copies that are local to this call.
    __host__ __device__ thrust::device_vector<float> operator()    (thrust::tuple<thrust::device_vector<float>, thrust::device_vector<float>>     x)
    {
    // Copy the first tuple element into a local vector and sort that copy.
    thrust::device_vector<float> y = thrust::get<0>(x);
    thrust::sort(y.begin(), y.end());
    // Assigns into the second element of the local tuple copy and returns it;
    // nothing is written through to storage owned by the caller here.
    return thrust::get<1>(x) = y;
    }
};

// Demo driver from the question: fills a 5-element device vector, prints it,
// runs thrust::for_each with sort_vector over the zipped (input, output)
// ranges on a user-created stream, and then prints the output vector.
int main() {
    thrust::device_vector<float> d_fraction(5);
    d_fraction[0] = 1;
    d_fraction[1] = 5;
    d_fraction[2] = 3;
    d_fraction[3] = 2;
    d_fraction[4] = 4;

    cout << "original" << endl;
    int f = 0;
    while (f < 5){
        cout << d_fraction[f] << endl;
        f++;
    }

    cudaStream_t s1;
    cudaStreamCreate(&s1);
    thrust::device_vector<float> result1(5);

    // The zip iterator pairs element i of d_fraction with element i of
    // result1; for_each invokes the functor once per such pair.
    thrust::for_each(thrust::cuda::par.on(s1), 
    thrust::make_zip_iterator(thrust::make_tuple(d_fraction.begin(), result1.begin())),
    thrust::make_zip_iterator(thrust::make_tuple(d_fraction.end(), result1.end())), sort_vector<thrust::device_vector<float>>());

    // Wait for the work queued on s1 before reading results on the host.
    cudaStreamSynchronize(s1);

    cout << "sorted" << endl;
    int d = 0;
    while (d < 5){
        // Read from the output vector declared above (the listing previously
        // referenced an undeclared identifier here).
        cout << result1[d] << endl;
        d++;
    }

    cudaStreamDestroy(s1);
    return 0;
}

However, when I try to use a reference such as

 __host__ __device__ thrust::device_vector<float> operator()    (thrust::tuple<thrust::device_vector<float> &, thrust::device_vector<float> &>     x)

The code no longer compiles.

Is it possible I need to cast a reference pointer for the vector so the functor can see the whole vector? Or is it possible that the issue is that I am passing the vector by value and there is a different way I am unaware of to pass a vector into a functor?

回答1:

A functor normally operates from the context of a single thread. If using the CUDA backend, we're talking about a CUDA thread.

The typical approach to sort a vector would be to use thrust::sort directly on the vector. In the most trivial usages, no functor definition is needed at all.

If you wanted to sort a vector "within a functor", then it would be necessary to pass a pointer to that vector to the functor, and let the functor work on that.

Thrust device code (what is executed in the context of a functor) generally can't handle constructs like thrust::device_vector directly. It's also not possible currently in thrust to construct a device vector of device vectors.

Therefore I've modified your code to something that is workable and sorts "within a functor". I've chosen to concatenate the vectors to be sorted into a single vector. We pass the address of this vector to the sorting functor, and then each thread calculates its range to sort, and passes that to thrust::sort for sequential sorting in the thread:

$ cat t1211.cu
#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/sort.h>
#include <thrust/execution_policy.h>
#include <thrust/for_each.h>
#include <thrust/sequence.h>
#include <cstdlib>

// Problem dimensions for the segmented-sort demo. The data is stored as one
// flat array of num_segs * num_vals elements.
const int num_segs = 3;  // number of independent segments to sort
const int num_vals = 5;  // number of consecutive values in each segment
const int range = 100;   // modulus applied to rand(): values fall in [0, range)

using namespace std;

// Functor that sorts one fixed-length segment of a flat array in place.
//
// The array is treated as consecutive segments of num_vals elements each.
// Calling the functor with segment index idx sorts the half-open element
// range [idx * num_vals, (idx + 1) * num_vals) of `data`.
//
// thrust::seq makes the sort run sequentially inside the calling thread, so
// when the functor is driven by a parallel thrust::for_each each thread sorts
// its own segment independently of the others.
template <typename T>
struct sort_vector
{
    // Base address of the concatenated segments. The functor does not own
    // this memory; it must be dereferenceable wherever operator() executes.
    T *data;

    // _data: pointer to the first element of the first segment.
    sort_vector(T *_data) : data(_data) {}

    // idx: zero-based index of the segment to sort.
    __host__ __device__ void operator()(int idx)
    {
        // Widen before multiplying so that the element offset cannot overflow
        // int when idx * num_vals exceeds INT_MAX on large inputs.
        const long long first = static_cast<long long>(idx) * num_vals;
        thrust::sort(thrust::seq, data + first, data + first + num_vals);
    }
};

// Demo driver: builds num_segs segments of num_vals pseudo-random values,
// prints them, sorts every segment independently with one functor call per
// segment on a user-created stream, and prints the result.
//
// Returns 0 on success and EXIT_FAILURE if a CUDA stream call reports an
// error.
int main() {
    const int total = num_segs * num_vals;

    // Generate the input on the host and move it to the device with a single
    // bulk copy; indexing a device_vector from host code transfers one
    // element per access.
    thrust::host_vector<float> h_data(total);
    for (int i = 0; i < total; i++)
        h_data[i] = rand()%range;
    thrust::device_vector<float> d_data(h_data);

    cout << "original" << endl;
    for (int i = 0; i < total; i++)
        cout << h_data[i] << endl;

    // One index per segment: 0, 1, ..., num_segs - 1.
    thrust::device_vector<int> d_idxs(num_segs);
    thrust::sequence(d_idxs.begin(), d_idxs.end());

    cudaStream_t s1;
    cudaError_t err = cudaStreamCreate(&s1);
    if (err != cudaSuccess) {
        cerr << "cudaStreamCreate failed: " << cudaGetErrorString(err) << endl;
        return EXIT_FAILURE;
    }

    // Each functor invocation receives one segment index and sorts that
    // segment in place through the raw device pointer.
    thrust::for_each(thrust::cuda::par.on(s1),
                     d_idxs.begin(),
                     d_idxs.end(),
                     sort_vector<float>(thrust::raw_pointer_cast(d_data.data())));

    // Wait for the work queued on s1; errors raised during execution are
    // reported here.
    err = cudaStreamSynchronize(s1);
    if (err != cudaSuccess) {
        cerr << "cudaStreamSynchronize failed: " << cudaGetErrorString(err) << endl;
        cudaStreamDestroy(s1);
        return EXIT_FAILURE;
    }

    // Bring the sorted data back with one bulk copy before printing.
    h_data = d_data;

    cout << "sorted" << endl;
    for (int i = 0; i < total; i++)
        cout << h_data[i] << endl;

    err = cudaStreamDestroy(s1);
    if (err != cudaSuccess) {
        cerr << "cudaStreamDestroy failed: " << cudaGetErrorString(err) << endl;
        return EXIT_FAILURE;
    }
    return 0;
}
$ nvcc -o t1211 t1211.cu
$ ./t1211
original
83
86
77
15
93
35
86
92
49
21
62
27
90
59
63
sorted
15
77
83
86
93
21
35
49
86
92
27
59
62
63
90
$

In this case, as is evident from the use of thrust::seq, the work being done in each thread is being done sequentially. (Together, the threads here are operating in parallel, but they are not cooperating -- each thread is working on an independent problem).

This is not the only possible solution. You may be interested in this question/answer which has a variety of other related ideas.

To be clear, what I think you're discussing here is a "vectorized" (or segmented) sort. This is not the fastest approach, but I'm trying to demonstrate some workable concepts for you as a straightforward extension to what you have shown, in order to answer your question ("How to force a functor to see an entire thrust::vector so that sorting is possible?"). A faster approach for vectorized sorting is discussed in the above linked question/answer.