I have following simple CUDA-Thrust code which adds 10 to device vector but the function is getting called on host side instead of device.
#include <algorithm>
#include <iostream>
#include <numeric>
#include <vector>
#include <stdio.h>
#include <thrust/device_vector.h>
__host__ __device__ int add(int x){
#if defined(__CUDA_ARCH__)
printf("In device\n");
#else
printf("In host\n");
#endif
return x+10;
}
int main(void)
{
thrust::host_vector<int> H(4);
H[0] = H[1] = H[2] = H[3] = 10;
thrust::device_vector<int> data=H;
std::transform(data.begin(), data.end(), data.begin(),add);
return 0;
}
What am I doing wrong here?
The thrust quick start guide has good examples to follow.
It looks like you have several issues, some already pointed out.
If you want to use thrust, you should use thrust::transform
, not std::transform
. std::transform
has no knowledge of the GPU or CUDA or thrust, and will dispatch the host version of your add
function. I'm not sure what that would do exactly when you pass a thrust::device_vector
to it.
Thrust algorithms need to use function objects (functors) rather than bare CUDA __device__
functions, for the reason indicated by Jared (the thrust algorithm in your source code is actually host code. That host code cannot discover the address of a bare __device__
function). With this fix, you can be pretty certain that thrust will dispatch the device code path when working on device vectors.
Here's a modification of your code:
$ cat t856.cu
#include <stdio.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/transform.h>
struct my_func {
__host__ __device__
int operator()(int x){
#if defined(__CUDA_ARCH__)
printf("In device, x is %d\n", x);
#else
printf("In host, x is %d\n", x);
#endif
return x+10;
}
};
int main(void)
{
thrust::host_vector<int> H(4);
H[0] = H[1] = H[2] = H[3] = 10;
thrust::device_vector<int> data=H;
thrust::transform(data.begin(), data.end(), data.begin(),my_func());
return 0;
}
$ nvcc -o t856 t856.cu
$ ./t856
In device, x is 10
In device, x is 10
In device, x is 10
In device, x is 10
$