I am having problems copying data from my device back to the host. My data are arranged in a struct:
// Descriptor for a 2D array of doubles stored in a single flat buffer.
typedef struct Array2D {
// Flat element buffer; the copy-back code treats it as rows * cols doubles.
double* arr;
// Number of rows.
int rows;
// Number of columns.
int cols;
} Array2D;
arr
is a 'flat' array. rows
and cols
describe the dimensions.
The code below shows how I am trying to copy the data back to the host:
h_output = (Array2D*) malloc(sizeof(Array2D)); // host-side copy of the descriptor
cudaMemcpy(h_output, d_output, sizeof(Array2D), cudaMemcpyDeviceToHost); // fetch rows, cols and arr (arr still holds the address written on the device)
double* h_arr = (double*) malloc(h_output->cols*h_output->rows*sizeof(double)); // host buffer for the array payload
cudaMemcpy(h_arr, h_output->arr, h_output->cols*h_output->rows*sizeof(double), cudaMemcpyDeviceToHost); // copy the payload; this is the call reported to fail with "invalid argument"
h_output->arr = h_arr; // repoint the host descriptor at the host copy
However, in the fourth line the execution fails with cuda error 11 (invalid argument). I cannot see why this is happening. The size of the array is correct, and I can access both h_output
and h_arr
from the host and both have 'real' addresses.
EDIT
Sorry for the late response to the request for more information (= more code).
I have tested that the pointer d_output->arr
is a device pointer by trying to dereference it on the host. As expected, I was not allowed to do that, which leads me to believe that d_output->arr
is in fact a valid device pointer.
The code's objective is to solve Thiele's differential equation using the fourth order Runge-Kutta method.
class CalculationSpecification
{
    /* FUNCTIONS OMITTED */
public:
    /**
     * Runs the RK4 computation and publishes the result through d_output.
     *
     * cs       - calculation specification (not used by the visible code).
     * data     - calculation input; pdata->startYear, pdata->endYear and
     *            pdata->states determine the size of the result.
     * d_output - descriptor filled in with the result buffer and dimensions.
     *
     * The result buffer is obtained from the in-kernel malloc(), i.e. from
     * the device heap. NOTE(review): memory from the device heap cannot be
     * used as a cudaMemcpy source from the host; pre-allocating the buffer
     * with cudaMalloc on the host avoids that limitation.
     *
     * If the allocation fails, d_output is set to an empty array
     * (arr == NULL, rows == cols == 0) instead of using a null pointer.
     */
    __device__ void RK4_n(CalculationSpecification* cs, CalcData data, Array2D* d_output)
    {
        const int rows = data.pdata->endYear - data.pdata->startYear + 1;
        const int cols = data.pdata->states;

        // Widen before multiplying so the byte count is not computed in int.
        double* rk4data = (double*)malloc((size_t)rows * (size_t)cols * sizeof(double));
        if (rk4data == NULL) {
            // In-kernel malloc returns NULL when the device heap is exhausted.
            d_output->arr = NULL;
            d_output->rows = 0;
            d_output->cols = 0;
            return;
        }

        /* CALCULATION STUFF HAPPENS HERE */
        // We know that rows = 51, cols = 1 and that rk4data contains 51 values as it should.
        // This was confirmed by using printf directly in this function.
        d_output->arr = rk4data;
        d_output->rows = rows;
        d_output->cols = cols;
    }
};
class PureEndowment : CalculationSpecification
{
    /* FUNCTIONS OMITTED */
public:
    /**
     * Computes the result for this product by delegating to the inherited
     * RK4_n solver.
     *
     * result - descriptor that receives the output buffer and dimensions.
     * data   - calculation input forwarded unchanged to RK4_n.
     */
    __device__ void Compute(Array2D *result, CalcData data)
    {
        // View this object through its base class and forward the call.
        CalculationSpecification* self = this;
        self->RK4_n(self, data, result);
    }
};
// Kernel entry point: runs a PureEndowment computation on the device and
// stores the resulting descriptor in *d_output.
//
// d_output - device-resident Array2D that Compute() fills in; its arr field
//            ends up pointing at memory obtained from in-kernel malloc
//            (see RK4_n).
//
// NOTE(review): every thread that executes this kernel writes the same
// *d_output; the visible launch uses <<<1,1>>>.
__global__ void kernel2(Array2D *d_output)
{
/* Other code that initializes 'cd'. */
PureEndowment pe;
pe.Compute(d_output,cd);
}
/**
 * Copies an Array2D descriptor and its payload from device to host.
 *
 * h_output - overwritten locally with a freshly malloc'ed host descriptor.
 * d_output - device pointer to the descriptor produced by the kernel.
 * count    - unused by the visible code.
 *
 * Every allocation and CUDA call is checked; on failure a message is written
 * to stderr and everything allocated here is released before returning.
 *
 * NOTE(review): h_output is a by-value pointer parameter, so the descriptor
 * built here is not visible to the caller after return. Handing it back
 * needs an interface change (return value or Array2D**), which is outside
 * the scope of this function body.
 */
void prepareOutputSet(Array2D* h_output, Array2D* d_output, int count)
{
    (void)count;

    h_output = (Array2D*) malloc(sizeof(Array2D));
    if (h_output == NULL) {
        fprintf(stderr, "prepareOutputSet: out of host memory for descriptor\n");
        return;
    }

    // After this call rows, cols and the address stored in arr are readable on the host.
    cudaError_t err = cudaMemcpy(h_output, d_output, sizeof(Array2D), cudaMemcpyDeviceToHost);
    if (err != cudaSuccess) {
        fprintf(stderr, "prepareOutputSet: descriptor copy failed: %s\n", cudaGetErrorString(err));
        free(h_output);
        return;
    }

    // Widen before multiplying so the byte count is not computed in int.
    const size_t bytes = (size_t)h_output->cols * (size_t)h_output->rows * sizeof(double);
    double* h_arr = (double*) malloc(bytes);
    if (h_arr == NULL && bytes != 0) {
        fprintf(stderr, "prepareOutputSet: out of host memory for %lu payload bytes\n",
                (unsigned long)bytes);
        free(h_output);
        return;
    }

    // This copy is rejected when arr was allocated with in-kernel malloc;
    // the check makes that failure visible instead of silently ignoring it.
    err = cudaMemcpy(h_arr, h_output->arr, bytes, cudaMemcpyDeviceToHost);
    if (err != cudaSuccess) {
        fprintf(stderr, "prepareOutputSet: payload copy failed: %s\n", cudaGetErrorString(err));
        free(h_arr);
        free(h_output);
        return;
    }

    // Repoint the host descriptor at the host copy of the payload.
    h_output->arr = h_arr;
}
/**
 * Allocates the device-side result descriptor, runs kernel2 with a single
 * thread, and copies the result back via prepareOutputSet.
 *
 * Returns 0 on success and 1 if any CUDA call or the kernel fails.
 */
int main()
{
    // Initialise both pointers: h_output is passed by value below and must
    // not be read while indeterminate.
    Array2D *h_output = NULL, *d_output = NULL;
    int status = 0;

    cudaError_t err = cudaMalloc((void**)&d_output, sizeof(Array2D));
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed: %s\n", cudaGetErrorString(err));
        status = 1;
    }

    if (status == 0) {
        kernel2<<<1,1>>>(d_output);
        // Launch-configuration errors are only reported through the last-error state.
        err = cudaGetLastError();
        if (err == cudaSuccess) {
            // Faults inside the kernel surface at the next synchronizing call.
            err = cudaDeviceSynchronize();
        }
        if (err != cudaSuccess) {
            fprintf(stderr, "kernel2 failed: %s\n", cudaGetErrorString(err));
            status = 1;
        }
    }

    if (status == 0) {
        prepareOutputSet(h_output, d_output, 1);
    }

    if (d_output != NULL) {
        cudaFree(d_output);
    }

    getchar();
    return status;
}
EDIT2
Additionally, I have now tested that the value of d_output->arr
when running on the device is identical to the value of h_output->arr
after the first cudaMemcpy
-call in prepareOutputSet
.
This (copying memory that was allocated on the device with in-kernel malloc() back to the host using cudaMemcpy) is a known limitation in CUDA 4.1. A fix is in the works and will be released in a future version of the CUDA runtime.
The error you are seeing is almost certainly caused by h_output->arr
not being a valid device pointer, or by h_output->rows
or h_output->cols
having incorrect values somehow. You have chosen not to show any code explaining how the contents of the source memory d_output
have been set, so it is not possible to say for sure what is the root cause of your problem.
To illustrate the point, here is a complete, runnable demo showing the posted code in action:
#include <cstdlib>
#include <cstdio>
/**
 * Reports a failed CUDA runtime call and optionally terminates the program.
 *
 * code  - status returned by the CUDA call being checked.
 * file  - source file of the call site (typically __FILE__).
 * line  - source line of the call site (typically __LINE__).
 * Abort - when true (the default), exit with the error code as exit status.
 *
 * Does nothing when code is cudaSuccess.
 */
inline void GPUassert(cudaError_t code, const char *file, int line, bool Abort=true)
{
    // file is const char* because __FILE__ is a string literal.
    if (code != cudaSuccess) {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (Abort) exit(code);
    }
}
// Checks the result of a CUDA runtime call, reporting the call site on failure.
// Wrapped in do { } while (0) so that `GPUerrchk(x);` is a single statement and
// stays correct inside an unbraced if/else.
#define GPUerrchk(ans) do { GPUassert((ans), __FILE__, __LINE__); } while (0)
// Element type used by the demo; single precision because the author's
// device has no double-precision support.
typedef float Real;
// Descriptor for a 2D array stored in a single flat buffer.
typedef struct Array2D {
// Flat element buffer; the demo treats it as rows * cols values of type Real.
Real* arr;
// Number of rows.
int rows;
// Number of columns.
int cols;
} Array2D;
/**
 * Fills lval with 1, 2, 3, ... and publishes it through *output.
 *
 * m, n   - dimensions recorded in the descriptor (rows = m, cols = n).
 * lval   - buffer with room for at least m * n elements.
 * output - descriptor that thread 0 fills with lval, m and n.
 *
 * Indexes by threadIdx.x only, so it is meant to be launched as a single
 * one-dimensional block. Threads with an index at or beyond m * n do not
 * write, so a launch with more threads than elements cannot overrun lval.
 */
__global__ void kernel(const int m, const int n, Real *lval, Array2D *output)
{
    const int tid = threadIdx.x;

    // Guard the store against a launch with more threads than elements.
    if (tid < m * n) {
        lval[tid] = 1.0f + tid;
    }

    // A single thread publishes the descriptor.
    if (tid == 0) {
        output->arr = lval;
        output->rows = m;
        output->cols = n;
    }
}
/**
 * Demo driver: fills an m x n array on the device, publishes it through an
 * Array2D descriptor, copies the descriptor and payload back to the host and
 * prints every element as "(row col) value".
 */
int main(void)
{
    const int m=8, n=8, mn=m*n;
    Array2D *d_output;
    Real *d_arr;

    GPUerrchk( cudaMalloc((void **)&d_arr,sizeof(Real)*size_t(mn)) );
    GPUerrchk( cudaMalloc((void **)&d_output, sizeof(Array2D)) );

    kernel<<<1,mn>>>(m,n,d_arr,d_output);
    GPUerrchk( cudaPeekAtLastError() );

    // This section of code is the same as the original question
    Array2D *h_output = (Array2D*)malloc(sizeof(Array2D));
    if (h_output == NULL) {
        fprintf(stderr, "out of host memory for descriptor\n");
        return EXIT_FAILURE;
    }
    GPUerrchk( cudaMemcpy(h_output, d_output, sizeof(Array2D), cudaMemcpyDeviceToHost) );

    // Widen each factor before multiplying so the byte count is not computed in int.
    size_t sz = size_t(h_output->rows)*size_t(h_output->cols)*sizeof(Real);
    Real *h_arr = (Real*)malloc(sz);
    if (h_arr == NULL && sz != 0) {
        fprintf(stderr, "out of host memory for payload\n");
        free(h_output);
        return EXIT_FAILURE;
    }
    GPUerrchk( cudaMemcpy(h_arr, h_output->arr, sz, cudaMemcpyDeviceToHost) );

    // The row stride is the number of columns; using rows here only worked
    // because the demo array is square.
    for(int i=0; i<h_output->rows; i++)
        for(int j=0; j<h_output->cols; j++)
            fprintf(stdout,"(%d %d) %f\n", i, j, h_arr[j + i*h_output->cols]);

    free(h_arr);
    free(h_output);
    GPUerrchk( cudaFree(d_arr) );
    GPUerrchk( cudaFree(d_output) );
    return 0;
}
I have had to take a few liberties here, because I only have a compute capability 1.2 device at my disposal, so no device-side malloc
and no double precision. But the host side API calls which retrieve a valid Array2D
structure from device memory and use its contents are effectively the same. Running the program works as expected:
$ nvcc -Xptxas="-v" -arch=sm_12 Array2D.cu
ptxas info : Compiling entry function '_Z6kerneliiPfP7Array2D' for 'sm_12'
ptxas info : Used 2 registers, 16+16 bytes smem
$ cuda-memcheck ./a.out
========= CUDA-MEMCHECK
(0 0) 1.000000
(0 1) 2.000000
(0 2) 3.000000
(0 3) 4.000000
(0 4) 5.000000
(0 5) 6.000000
(0 6) 7.000000
(0 7) 8.000000
(1 0) 9.000000
(1 1) 10.000000
(1 2) 11.000000
(1 3) 12.000000
(1 4) 13.000000
(1 5) 14.000000
(1 6) 15.000000
(1 7) 16.000000
(2 0) 17.000000
(2 1) 18.000000
(2 2) 19.000000
(2 3) 20.000000
(2 4) 21.000000
(2 5) 22.000000
(2 6) 23.000000
(2 7) 24.000000
(3 0) 25.000000
(3 1) 26.000000
(3 2) 27.000000
(3 3) 28.000000
(3 4) 29.000000
(3 5) 30.000000
(3 6) 31.000000
(3 7) 32.000000
(4 0) 33.000000
(4 1) 34.000000
(4 2) 35.000000
(4 3) 36.000000
(4 4) 37.000000
(4 5) 38.000000
(4 6) 39.000000
(4 7) 40.000000
(5 0) 41.000000
(5 1) 42.000000
(5 2) 43.000000
(5 3) 44.000000
(5 4) 45.000000
(5 5) 46.000000
(5 6) 47.000000
(5 7) 48.000000
(6 0) 49.000000
(6 1) 50.000000
(6 2) 51.000000
(6 3) 52.000000
(6 4) 53.000000
(6 5) 54.000000
(6 6) 55.000000
(6 7) 56.000000
(7 0) 57.000000
(7 1) 58.000000
(7 2) 59.000000
(7 3) 60.000000
(7 4) 61.000000
(7 5) 62.000000
(7 6) 63.000000
(7 7) 64.000000
========= ERROR SUMMARY: 0 errors
I tried allocating the pointer Array2D->arr
on the host using cudaMalloc instead of allocating it on the device using malloc. After that, the code works as intended.
It looks very much like the problem described in the thread (http://forums.nvidia.com/index.php?showtopic=222659) on nVidia's forum that Pavan referred to in the comments to the question.
I think that probably closes the question for now, as the code works fine. However, if anyone has a proposal for a solution which utilizes malloc on the device, feel free to post.
It looks like h_output
is allocated with a call to malloc()
. In the first call to cudaMemcpy()
(line 2), h_output
is being used as a host pointer (which seems right). In the second call to cudaMemcpy()
(line 4), h_output->arr
is being used as a device pointer (which does not seem right). In that 4th line, it looks like you are copying from host memory to host memory. So, you will probably want to use just a straight memcpy()
instead of cudaMemcpy()
.
At least that is what it looks like from the code you have provided.