更明确地说，我想把指针以及它们所指向的全部数据一起传递到设备端。为了测试如何实现这一点，我写了一个简单的类：
// Fixed-size bundle of N integer arrays: vecptr[i] points at the i-th array
// and dim[i] holds its element count. N must already be defined as a
// compile-time constant when this class is seen.
class vecarray{
public:
    int * vecptr[N];                 // array of pointers, one per array
    int dim[N];                      // length of the array each pointer refers to
    __device__ __host__ vecarray();  // constructor (defined out of class)
    __device__ __host__ int sum();   // sum of all elements of all pointed-to arrays
};  // a class definition must be terminated by ';'
// Default constructor: starts with no arrays attached -- every pointer is
// NULL and every length is 0.
// The execution-space qualifiers repeat those of the in-class declaration so
// that declaration and definition agree.
__device__ __host__ vecarray::vecarray(){
    for(int i = 0; i<N; i++)
    {
        vecptr[i] = NULL;
        dim[i] = 0;
    }
}
// Returns the sum of the dim[i] elements of every array vecptr[i], i in [0, N).
// The arrays are dereferenced here, so each vecptr[i] must be readable from
// the side (host or device) that makes the call.
// The execution-space qualifiers repeat those of the in-class declaration;
// the kernel addvecarray calls this function from device code.
__device__ __host__ int vecarray::sum(){
    int s = 0;
    for (int i = 0; i < N; i++)
        for (int j = 0; j < dim[i]; j++)
            s += vecptr[i][j];
    return s;
}
然后，我在下面的代码中使用这个类：
#define N 2
// Kernel: computes v->sum() and stores the value in *s.
// No thread or block index is consulted, so every launched thread performs
// the same computation and the same store.
__global__ void addvecarray( vecarray * v, int *s){
    const int total = v->sum();
    *s = total;
}
// Test driver from the question, kept exactly as posted: fills a host
// vecarray from two local arrays, attempts to mirror it on the device,
// launches addvecarray<<<1,1>>> and prints the value copied back.
// NOTE(review): this is the failing program under discussion (reported to
// crash with a segmentation fault at run time). The inline notes mark the
// faulty steps; none of the CUDA calls has its return status checked.
int main(){ //copy *v to the device, run sum() there and copy the result back
vecarray *v, *dev_v; // v: host object; dev_v: intended to refer to the device-side copy
v = new vecarray;
dev_v = new vecarray; // NOTE(review): this host object is orphaned once cudaMalloc overwrites dev_v below
int a[3] = {1,2,3}; //initialize v manually
int b[4] = {4,5,6,7};
int result = 0;
int * dev_result; // NOTE(review): never allocated before being passed to the kernel and to cudaMemcpy
v->vecptr[0] = a;
v->vecptr[1] = b;
v->dim[0] = 3; v->dim[1] = 4;
cudaMalloc((void**)&dev_v, sizeof(vecarray));
cudaMemcpy(dev_v, v, sizeof(vecarray),cudaMemcpyHostToDevice); //copy class object (its vecptr entries still hold the host addresses of a and b)
for(int i = 0; i < N; i++){
// NOTE(review): dev_v now holds a device address, so &(dev_v->vecptr[i]) is a
// location inside device memory; cudaMalloc is asked to store its result
// through that pointer from host code.
cudaMalloc((void**)&(dev_v->vecptr[i]), v->dim[i]*sizeof(int));
}
for(int i = 0; i<N; i++ ){ //copy arrays
// NOTE(review): dev_v->vecptr[i] reads through the device pointer on the
// host; the statement also ends with one ')' too many.
cudaMemcpy(dev_v->vecptr[i], v->vecptr[i], v->dim[i]*sizeof(int), cudaMemcpyHostToDevice));
}
addvecarray<<<1,1>>>(dev_v, dev_result);
cudaMemcpy(&result, dev_result, sizeof(int), cudaMemcpyDeviceToHost);
printf("the result is %d\n", result);
}
这段代码可以通过 NVCC 编译，但运行时因段错误（segmentation fault）而失败。我检查后发现，问题出在 for 循环中的那两个 cudaMalloc 和 cudaMemcpy 操作上。所以我的问题是：我应该如何把这个对象传递给 CUDA？提前致谢。
您的代码中有几处错误。正如我在评论中提到的，其中一个关键错误在于您为类内指针所引用的数据区域分配内存的方式：您传给 cudaMalloc 的指针地址本身已经位于设备内存中，主机代码无法向该地址写入。我们可以另外创建一组位于主机端的指针来解决这个问题：先用它们分配设备存储，以存放类中指针所指向的数组，再把这些指针的值拷贝到设备端的对象里。此外还有一些其他错误，例如您没有为 dev_result 正确分配设备存储。下面的代码修复了我能找到的所有错误，我相信它能给出正确的结果。我还加入了一种 CUDA 错误检查的参考写法，您会发现在自己的项目中使用它很有帮助：
#include <stdio.h>
#include <stdlib.h>
#define N 2
// cudaCheckErrors(msg): place directly after a CUDA runtime call or a kernel
// launch. It fetches (and thereby clears) the runtime's last error with
// cudaGetLastError(); for anything other than cudaSuccess it prints msg, the
// CUDA error string and the file/line of the check to stderr, then terminates
// the process with a failure status.
// The macro-local variable deliberately avoids a double-underscore name,
// because such identifiers are reserved for the C++ implementation.
#define cudaCheckErrors(msg) \
    do { \
        cudaError_t cudaCheckErrorsStatus_ = cudaGetLastError(); \
        if (cudaCheckErrorsStatus_ != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                    msg, cudaGetErrorString(cudaCheckErrorsStatus_), \
                    __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(EXIT_FAILURE); \
        } \
    } while (0)
using namespace std;
// Bundle of N integer arrays addressed through raw pointers. The object
// stores only the pointers and the lengths, not the elements themselves.
class vecarray{
public:
    int *vecptr[N];  // vecptr[i]: start of the i-th array
    int dim[N];      // dim[i]: number of ints reachable through vecptr[i]

    // Constructor; declared for both host and device code.
    __host__ __device__ vecarray();

    // Adds up every element of every pointed-to array; declared for both
    // host and device code.
    __host__ __device__ int sum();
};
// Default constructor: leaves the object empty -- every pointer is NULL and
// every length is 0.
// The execution-space qualifiers repeat those of the in-class declaration,
// matching the way the definition of sum() is written.
__device__ __host__ vecarray::vecarray(){
    for(int i = 0; i<N; i++)
    {
        vecptr[i] = NULL;
        dim[i] = 0;
    }
}
// Returns the total of the dim[i] elements of every array vecptr[i].
// The arrays are dereferenced here, so each pointer with a non-zero length
// must be readable from the side (host or device) that makes the call.
__device__ __host__ int vecarray::sum(){
    int total = 0;
    for (int row = 0; row < N; ++row) {
        const int *values = vecptr[row];
        const int count = dim[row];
        for (int col = 0; col < count; ++col) {
            total += values[col];
        }
    }
    return total;
}
// Kernel: evaluates v->sum() on the device and writes the value to *s.
// Thread and block indices are not used, so each launched thread repeats the
// same computation and the same store.
__global__ void addvecarray( vecarray * v, int *s){
    const int total = v->sum();
    *s = total;
}
// Host driver: builds a vecarray over two small host arrays, mirrors the
// object and its arrays on the device, runs addvecarray on a single thread
// and prints the sum computed on the device.
//
// The device array pointers are first obtained in a host-side table (vptr)
// and then copied into the device object, because host code cannot let
// cudaMalloc write directly into dev_v->vecptr[i], which lives in device memory.
// Returns 0 on success; any CUDA failure terminates via cudaCheckErrors.
int main(){
    vecarray *v, *dev_v;   // v: host object; dev_v: its copy in device memory
    v = new vecarray;
    int a[3] = {1,2,3};    // initialize v manually
    int b[4] = {4,5,6,7};
    int result = 0;
    int *dev_result;       // device slot that receives the kernel's output
    v->vecptr[0] = a;
    v->vecptr[1] = b;
    v->dim[0] = 3; v->dim[1] = 4;
    int *vptr[N];          // host-resident copies of the device array pointers

    cudaMalloc((void**)&dev_v, sizeof(vecarray));
    cudaCheckErrors("cudaMalloc1 fail");
    // Copy the class object; its vecptr entries still hold host addresses
    // at this point and are patched in the loop below.
    cudaMemcpy(dev_v, v, sizeof(vecarray), cudaMemcpyHostToDevice);
    cudaCheckErrors("cudaMemcpy1 fail");

    for(int i = 0; i < N; i++){
        cudaMalloc((void**)&(vptr[i]), v->dim[i]*sizeof(int));
        cudaCheckErrors("cudaMalloc2 fail");
        // &(dev_v->vecptr[i]) is only an address computation on the host;
        // the pointer value itself is written by this device-bound copy.
        cudaMemcpy(&(dev_v->vecptr[i]), &vptr[i], sizeof(int*), cudaMemcpyHostToDevice);
        cudaCheckErrors("cudaMemcpy2 fail");
    }
    for(int i = 0; i < N; i++){   // copy the array contents
        cudaMemcpy(vptr[i], v->vecptr[i], v->dim[i]*sizeof(int), cudaMemcpyHostToDevice);
        cudaCheckErrors("cudaMemcpy3 fail");
    }

    cudaMalloc((void **)&dev_result, sizeof(int));
    cudaCheckErrors("cudaMalloc3 fail");

    addvecarray<<<1,1>>>(dev_v, dev_result);
    // Report launch problems here instead of attributing them to the copy below.
    cudaCheckErrors("kernel launch fail");
    cudaMemcpy(&result, dev_result, sizeof(int), cudaMemcpyDeviceToHost);
    cudaCheckErrors("cudaMemcpy4 fail");
    printf("the result is %d\n", result);

    // Release everything that was allocated above.
    for(int i = 0; i < N; i++){
        cudaFree(vptr[i]);
        cudaCheckErrors("cudaFree1 fail");
    }
    cudaFree(dev_result);
    cudaCheckErrors("cudaFree2 fail");
    cudaFree(dev_v);
    cudaCheckErrors("cudaFree3 fail");
    delete v;
    return 0;
}