I want to do some thrust operations but I am not sure how exactly.
Right now , I am receiving am array full of zeros ( the h_a array)
I have :
#include <cstdio>
#include <cstdlib>
#include <cmath>
#include <iostream>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <thrust/device_ptr.h>
#include <thrust/fill.h>
#include <thrust/transform.h>
#include <thrust/functional.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/copy.h>
#include <thrust/generate.h>
template <typename T>
struct square
{
__host__ __device__
T operator()( const T& x ) const
{
return x * x;
}
};
int
main(
int argc,
const char * argv[] )
{
const size_t NbOfPoints = 256;
int BlocksPerGridX = 16;
int BlocksPerGridY = 16;
int ThreadsPerBlockX = 16;
int ThreadsPerBlockY = 16;
// generate random data on the host
thrust::host_vector<float> h_Kx ( NbOfPoints );
thrust::generate( h_Kx.begin(), h_Kx.end(), rand );
thrust::host_vector<float> h_Ky ( NbOfPoints );
thrust::generate( h_Ky.begin(), h_Ky.end(), rand );
// transfer to device
thrust::device_vector<float> dev_Kx = h_Kx;
thrust::device_vector<float> dev_Ky = h_Ky;
// create arrays for holding the number of threads per block in each dimension
int * X , * Y;
cudaMalloc((void **) &X, ThreadsPerBlockX * BlocksPerGridX * sizeof(*X) );
cudaMalloc((void **) &Y, ThreadsPerBlockY * BlocksPerGridY * sizeof(*Y) );
// wrap raw pointer with a device_ptr
thrust::device_ptr<int> dev_X ( X );
thrust::device_ptr<int> dev_Y ( Y );
// use device_ptr in Thrust algorithms
thrust::fill( dev_X, dev_X + ( ThreadsPerBlockX * BlocksPerGridX ) , (int) 0 );
thrust::fill( dev_Y, dev_Y + ( ThreadsPerBlockY * BlocksPerGridY ) , (int) 0 );
// setup arguments
square<float> square_op;
// create various vectors
thrust::device_vector<int> distX ( NbOfPoints );
thrust::device_vector<int> distY ( NbOfPoints );
thrust::device_vector<unsigned int> Tmp ( NbOfPoints );
thrust::host_vector<unsigned int> h_a ( NbOfPoints );
thrust::device_vector<unsigned int> distXSquared ( NbOfPoints );
thrust::device_vector<unsigned int> distYSquared ( NbOfPoints );
// compute distX = dev_Kx - dev_X and distY = dev_Ky - dev_Y
thrust::transform( dev_Kx.begin(), dev_Kx.begin(), dev_X , distX.begin() , thrust::minus<float>() );
thrust::transform( dev_Ky.begin(), dev_Ky.begin(), dev_Y , distY.begin() , thrust::minus<float>() );
//square distances
thrust::transform( distX.begin(), distX.end(), distXSquared.begin(), square_op );
thrust::transform( distY.begin(), distY.end(), distYSquared.begin(), square_op );
// compute Tmp = distX + distY
thrust::transform( distXSquared.begin() ,distXSquared.begin() , distYSquared.begin() , Tmp.begin() , thrust::plus<unsigned int>() );
thrust::copy( Tmp.begin(), Tmp.end(), h_a.begin() );
for ( int i = 0; i < 5; i ++ )
printf("\n temp = %u",h_a[ i ] );
return 0;
}
UPDATE:
Apart the edits from Robert Crovella , you must edit to integers:
square<int> square_op;
thrust::transform( dev_Kx.begin(), dev_Kx.end(), dev_X , distX.begin() , thrust::minus<int>() );
thrust::transform( dev_Ky.begin(), dev_Ky.end(), dev_Y , distY.begin() , thrust::minus<int>() );
You've got several instances of doing zero-length transforms:
and:
Since the first two parameters to each of the above transforms is the same, the work being done is zero. Presumably you want the corresponding
.end()
iterators in the second position rather than.begin()
When I make those changes, I got non-zero values printed out. They are quite large, but you appear to be squaring large values, so I'm not sure what your intent is.