I run the following (simplified) code, which runs a simplified kernel for a few seconds and then checks the results. The first 400,000 or so results are correct, and all the results after that are zero. The kernel should put the same value (4228) into each element of the output array of 4.5 million elements. It looks as if something, somewhere, is timing out or not being synchronized, but I'm a bit puzzled, since I:
- even called clFinish, just to make sure
- am checking all errors, and no errors returned
The results look like:
user@pear:~/git/machinelearning/prototyping/build$ ./testcltimeout
out[442496] != 4228: 0
What I expect to happen is: code should just run to completion, with no errors.
Context: running on:
- beignet, OpenCL 1.2
- Intel HD 4000 integrated graphics
Kernel is:
// Every work-item iterates the same recurrence 100000 times, starting from 0,
// and stores the final value in out[] at its own global id.
kernel void test_read( const int one, const int two, global int *out) {
    int acc = 0;
    for( int step = 0; step < 100000; step++ ) {
        acc = (acc + one ) % 1357 * two;
    }
    const int slot = get_global_id(0);
    out[slot] = acc;
}
Test code (I've simplified this as much as possible...)
#include <iostream>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>
#include "CL/cl.hpp"
using namespace std;
// Renders `val` as text by inserting it into a string stream with operator<<.
template<class T>
std::string toString(T val ) {
    std::ostringstream formatted;
    formatted << val;
    return formatted.str();
}
// Throws std::runtime_error carrying the numeric code whenever `error`
// is anything other than CL_SUCCESS; returns normally otherwise.
void checkError( cl_int error ) {
    if (error == CL_SUCCESS) {
        return;
    }
    const std::string message = "Error: " + toString(error);
    throw std::runtime_error( message );
}
int main( int argc, char *argv[] ) {
cl_int error;
cl_device_id *device_ids;
cl_uint num_platforms;
cl_uint num_devices;
cl_platform_id platform_id;
cl_device_id device;
cl_context context;
cl_command_queue queue;
cl_program program;
checkError( clGetPlatformIDs(1, &platform_id, &num_platforms) );
checkError( clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device, &num_devices) );
device_ids = new cl_device_id[num_devices];
checkError( clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, num_devices, device_ids, &num_devices) );
device = device_ids[0];
context = clCreateContext(0, 1, &device, NULL, NULL, &error);
checkError(error);
queue = clCreateCommandQueue(context, device, 0, &error);
checkError(error);
string kernel_source = string( "kernel void test_read( const int one, const int two, global int *out) {\n" ) +
" const int globalid = get_global_id(0);\n" +
" int sum = 0;\n" +
" int n = 0;\n" +
" while( n < 100000 ) {\n" +
" sum = (sum + one ) % 1357 * two;\n" +
" n++;\n" +
" }\n" +
" out[globalid] = sum;\n" +
"}\n";
const char *source_char = kernel_source.c_str();
size_t src_size = strlen( source_char );
program = clCreateProgramWithSource(context, 1, &source_char, &src_size, &error);
checkError(error);
checkError( clBuildProgram(program, 1, &device, 0, NULL, NULL) );
cl_kernel kernel = clCreateKernel(program, "test_read", &error);
checkError(error);
const int N = 4500000;
int *out = new int[N];
if( out == 0 ) throw runtime_error("couldnt allocate array");
int c1 = 3;
int c2 = 7;
checkError( clSetKernelArg(kernel, 0, sizeof(int), &c1 ) );
checkError( clSetKernelArg(kernel, 1, sizeof(int), &c2 ) );
cl_mem outbuffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(int) * N, 0, &error);
checkError(error);
checkError( clSetKernelArg(kernel, 2, sizeof(cl_mem), &outbuffer) );
size_t globalSize = N;
size_t workgroupsize = 512;
globalSize = ( ( globalSize + workgroupsize - 1 ) / workgroupsize ) * workgroupsize;
checkError( clEnqueueNDRangeKernel( queue, kernel, 1, NULL, &globalSize, &workgroupsize, 0, NULL, NULL) );
checkError( clFinish( queue ) );
checkError( clEnqueueReadBuffer( queue, outbuffer, CL_TRUE, 0, sizeof(int) * N, out, 0, NULL, NULL) );
checkError( clFinish( queue ) );
for( int i = 0; i < N; i++ ) {
if( out[i] != 4228 ) {
cout << "out[" << i << "] != 4228: " << out[i] << endl;
exit(-1);
}
}
return 0;
}
Your kernel seems to run for a pretty long time. I suspect you are hitting a TDR (timeout detection and recovery, i.e. the GPU watchdog timing out), and that Linux (Beignet) handles this more silently than Windows. Hence, I have a couple of ideas.
Check `dmesg` for a TDR message. I haven't used Beignet, or any Linux OpenCL implementation for that matter, but the Beignet documentation page (under "known issues") indicates you can check this via `dmesg`. The documentation goes on to say that you can disable the timeout check if you really know the kernel is just taking longer to finish, but warns that you risk a machine hang.
Try the kernel on Intel HD 4000 Graphics on Windows. If the kernel takes longer than a few seconds, it will time out and the driver will actually crash (but restart automatically).
Try the kernel with the Intel OpenCL CPU implementation (or any other implementation without a TDR limit). Check the results for correctness and see how long it takes to run (10 seconds? 10 minutes?). I don't think the CPU implementation will time out.