This question is about heap size limitation in cuda. Having visited some questions concerning this topic, including this one: new operator in kernel .. strange behaviour I've made some tests. Given a kernel as follow:
#include <cuda.h>
#include <cuda_runtime.h>
#define CUDA_CHECK( err ) __cudaSafeCall( err, __FILE__, __LINE__ )
#define CUDA_CHECK_ERROR() __cudaCheckError( __FILE__, __LINE__ )
inline void __cudaSafeCall( cudaError err, const char *file, const int line )
{
if ( cudaSuccess != err )
{
fprintf( stderr, "cudaSafeCall() failed at %s:%i : %s\n",
file, line, cudaGetErrorString( err ) );
exit( -1 );
}
return;
}
inline void __cudaCheckError( const char *file, const int line )
{
cudaError err = cudaGetLastError();
if ( cudaSuccess != err )
{
fprintf( stderr, "cudaCheckError() failed at %s:%i : %s\n",
file, line, cudaGetErrorString( err ) );
exit( -1 );
}
return;
}
#include <stdio>
#define NP 900000
__device__ double *temp;
__device__ double *temp2;
__global__
void test(){
int i = blockDim.x*blockIdx.x + threadIdx.x;
if(i==0){
temp = new double[NP];
//temp2 = new double[NP];
}
if(i==0){
for(int k=0;k<NP;k++){
temp[i] = 1.;
if(k%1000 == 0){
printf("%d : %g\n", k, temp[i]);
}
}
}
if(i==0){
delete(temp);
//delete(temp2);
}
}
int main(){
//cudaDeviceSetLimit(cudaLimitMallocHeapSize, 32*1024*1024);
//for(int k=0;k<2;k++){
test<<<ceil((float)NP/512), 512>>>();
CUDA_CHECK_ERROR();
//}
return 0;
}
I want to test the heap size limitation.
- Dynamically allocating one array (temp) with one thread which size is roughly over 960,000*sizeof(double) (close to 8MB, which is the default limit of the heap size) gives an error : ok. 900,000 works. (does someone know how to calculate the true limit?)
- Rising the heap size limit allows to allocate more memory : normal, ok.
- Back to a 8MB heap size, allocating one array per thread with TWO threads (so, replacing if (i==0) by if(i==0 || i==1), each one 900,000 * sizeof(double) fails. But 450,000*sizeof(double) each, works. Still ok.
- Here comes my problem : allocating TWO arrays with ONE thread (so, temp and temp2 for thread 0), each 900,000 * sizeof(double) works too, but it should not? Indeed when I try to write in both arrays, it fails. But anyone has an idea why this different behaviour in allocation when using two arrays with one thread instead of two arrays with two threads?
EDIT : another test, which I find interesting for those who, like me, would be learning the usage of heap : 5. Executing the kernel two times, with one array of size 900,000 * sizeof(double) allocated by the single thread 0, works if there is the delete. If delete is omitted, it will fail the second time, but the first call will be executed.
EDIT 2 : how to allocate a device-wide variable once but writable by all threads (not from host, using dynamic allocation in device code)?