Having finally gotten Dynamic Parallelism up and running, I'm now trying to implement my model with it. It took me a while to figure out that some of the strange output came from not calling cudaDeviceSynchronize() to make the parent kernel wait for each child kernel to finish.
Now it seems there is something wrong with the arrAdd kernel I defined. Here's a table of the outputs before and after each child kernel launched in the parent kernel k2.
Initially    : k1   = { -1    0    0    0    0   }
Post arrInit : temp = {  .25  .25  .25  .25  .25 }
Post arrMult : temp = { -.25  0    0    0    0   }
Post arrAdd  : temp = { -8    0    0    0    0   }
Expected     : temp = { -.50  0    0    0    0   }
__global__ void k2(double* concs, int* maxlength, double* k1s, double* k2s, double* temp, double* tempsum)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    double a21 = .25;

    arrInit<<< 1, *maxlength >>>(temp, a21);        // temp = a21
    cudaDeviceSynchronize();

    arrMult<<< 1, *maxlength >>>(k1s, temp, temp);  // temp = a21*k1
    cudaDeviceSynchronize();

    arrAdd<<< 1, *maxlength >>>(temp, temp, temp);  // temp = 2*a21*k1
    cudaDeviceSynchronize();
}
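In case it's relevant, this is roughly how I could instrument one of the child launches with the device-runtime error checks. It's only a sketch: the error checks, the printf, and the name k2_checked are additions for illustration and are not in my actual k2 above.

// Sketch: the first child launch from k2, with device-runtime error checks added.
// Assumes arrInit is declared earlier in the file.
__global__ void k2_checked(double* concs, int* maxlength, double* k1s,
                           double* k2s, double* temp, double* tempsum)
{
    double a21 = .25;

    arrInit<<< 1, *maxlength >>>(temp, a21);          // temp = a21
    cudaError_t launchErr = cudaGetLastError();       // did the child launch at all?
    cudaError_t syncErr   = cudaDeviceSynchronize();  // did the child finish cleanly?
    if (launchErr != cudaSuccess || syncErr != cudaSuccess)
        printf("arrInit: launch err %d, sync err %d\n", (int)launchErr, (int)syncErr);
}

The child kernels themselves are just elementwise operations: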
__global__ void arrAdd(double* a, double* b, double* c)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    c[idx] = a[idx] + b[idx];
}

__global__ void arrMult(double* a, double* b, double* c)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    c[idx] = a[idx] * b[idx];
}

__global__ void arrInit(double* a, double b)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    a[idx] = b;
}