I have been working on an c/c++ OpenCL solution for the past few weeks now. For my solution, I need to pass a class from my CPU(Host) to GPU(Device). When I try to pass the class as an argument it gives an error "Unknown Type-Identifier Class". My doubt whether OpenCL on Intel Platform does it allow us to pass a class to kernel or any work around is available for it. In CUDA I have seen some examples and it works perfectly fine for the platform. However, with respect to OpenCL I am not able to find any references and No examples related to this query. I would be really thankful to any help with regards to this issue. I have posted the same question on Intel website but to no avail. If someone would be kind enough to help me understand where I am going wrong or how I should proceed with this I would be really thankful to you.
//HOST SIDE CODE
#include<stdio.h>
#include<iostream>
#include"CL/cl.h"
class test
{
public:
cl_int a;
cl_char b;
};
int main()
{
test *tempstruct = new test;
cl_platform_id platfrom_id;
cl_device_id device_id; // compute device id
cl_context context; // compute context
cl_command_queue commands; // compute command queue
cl_program program; // compute program
cl_kernel kernel; // compute kernel
int err;
err = clGetPlatformIDs(1, &platfrom_id, NULL);
if (err != CL_SUCCESS)
{
printf("Error: Failed to create a platfrom group!\n");
return -1;
}
err = clGetDeviceIDs(platfrom_id, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL);
if (err != CL_SUCCESS)
{
printf("Error: Failed to create a device group!\n");
return -1;
}
context = clCreateContext(0, 1, &device_id, NULL, NULL, NULL);
if (!context)
{
printf("Error: Failed to create a compute context!\n");
return -1;
}
commands = clCreateCommandQueue(context, device_id, 0, NULL);
if (!commands)
{
printf("Error: Failed to create a command commands!\n");
return -1;
}
#define MAX_SOURCE_SIZE (0x100000)
FILE *fp, *fp1;
char filename[] = "Template.cl";
fp = fopen(filename, "r");
if (fp == NULL)
{
printf("\n file not found \n");
return -1;
}
char * source_str = (char*)malloc(MAX_SOURCE_SIZE);
size_t size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
fclose(fp);
cl_mem classobj = clCreateBuffer(context, CL_MEM_USE_HOST_PTR, sizeof(tempstruct), &tempstruct, &err);
if (err != CL_SUCCESS)
{
printf("Error: Failed to allocate device memory!\n");
return -1;
}
program = clCreateProgramWithSource(context, 1, (const char **)& source_str, (const size_t *)&size, &err);
if (!program)
{
printf("Error: Failed to create program with source!\n");
return -1;
}
err = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
if (err != CL_SUCCESS)
{
printf("Error: Failed to build program executable!\n");
return -1;
}
test *resptr = (test *)clEnqueueMapBuffer(commands, classobj, CL_TRUE, CL_MAP_WRITE, NULL, sizeof(test), NULL, NULL, NULL, &err);
// INITIALISATION OF CLASS
tempstruct->a = 10;
if (!resptr)
{
printf("Error: Failed to create enqueuemapbuffer!\n");
return -1;
}
err = clEnqueueUnmapMemObject(commands, classobj, resptr, 0, NULL, NULL);
if (err != CL_SUCCESS)
{
printf("Error: Failed to write to source array!\n");
return -1;
}
kernel = clCreateKernel(program, "CLASS", &err);
if (err != CL_SUCCESS)
{
printf("Error: Failed to create compute kernel!\n");
return -1;
}
err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &classobj);
if (err != CL_SUCCESS)
{
printf("Error: Failed to set kernel arguments! %d\n", err);
return -1;
}
size_t globalsize = 1;
size_t local = 1;
err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &globalsize, &local, 0, NULL, NULL);
if (err)
{
printf("Error: Failed to execute nd range!\n");
return -1;
}
test *resptr1 = (test *)clEnqueueMapBuffer(commands, classobj, CL_TRUE, CL_MAP_READ, NULL, sizeof(test), NULL, NULL, NULL, &err);
err = clEnqueueUnmapMemObject(commands, classobj, resptr1, 0, NULL, NULL);
if (err != CL_SUCCESS)
{
printf("Error: Failed to read output array! %d\n", err);
return -1;
}
// again i am printing the class value
printf("\n in cpu side = %d\n", tempstruct->a);
}
//HOST END
//DEVICE SIDE(KERNEL CODE)
// filename : Template.cl
class test
{
public:
cl_int a;
cl_char b;
};
__kernel void CLASS(__global test *inclass )
{
inclass->a = 10;
printf("\n in kernel side = %d \n",inclass->a);
}
//KERNEL END
ERRORS:
I am facing these all errors at kernel side only
1) Error Tempalte.CL unknown type name 'test'
2) Error Tempalte.CL expected ';' after top level declarator
3) Error Tempalte.CL program scope variables are required to be declared in constant address space
4) Error Tempalte.CL unknown type name 'class'
QUERY:
Q)My main Question is How to Pass a CLASS to kernel in Intel architecture
* I have successfully passed class to kernel in AMD. whenever I tried with same code in Intel side it shows the above four errors.
* Is there any alternative method to pass class to kernel in Intel or is it possible to pass class to kernel in Intel architecture ?
OpenCL uses C99. So you can pass structs, but not classes, to the kernel.
As huseyin tugrul buyukisik says, you can use SYCL, which supports c++14 (or thereabouts).
Alternatively, if you want to support both NVIDIA® CUDA™ and OpenCL, you could write it only in NVIDIA® CUDA™, and then use https://github.com/hughperkins/cuda-on-cl to run the NVIDIA® CUDA™ code on OpenCL 1.2 GPU devices. Full disclosure: I'm the author of cuda-on-cl, and it's a bit of a work-in-progress for now. It does work though, with some caveats/limitations. It can handle full-blown C++11, templates, classes etc. For example, it can be used to compile and run Eigen on OpenCL 1.2 GPUs https://bitbucket.org/hughperkins/eigen/src/eigen-cl/unsupported/test/cuda-on-cl/?at=eigen-cl
If sycl(and Hugh Perkins's nice solution) is not option for you and if your class doesn't have any methods, you can use structs instead(serialize to byte array when copying to device):
and then you are responsible for handling of alignment and sizes of structs. For example,
Warrior
struct has fields totally 20 bytes but it is likely 32 bytes in device side(because of some rules forcing it being power of 2 in memory) and you should acknowledge it from host side and put data accordingly in tune with alignment and variable sizes. Not even mentioning endianness which is a pain to handle for "write once, run everywhere". So you should run it only in your computer which is optimized for.Pack biggest fields on top of struct, add smaller ones in bottom. Calculate their in-struct alignment as powers of 2 too!. Keep an eye of float3, int3 and similar not-so-documented-well implementations as they may or may not use float4,int4 in background. If performance of global memory access is not important for you, you can simply select a big number like N for every struct smaller than that for simplicity and put relative byte addressings to a structs beginning byte. Such as byte address of
hp
field in aWarrior
struct at top(in a packed 4-bytes into single int). Then device side it can be queried to which byte does a field start. (endianness can make it more trickier so don't use buffercopies for pure structs)If alignment of struct fields in host side is not an option:
Warrior
is build from arrays of hp,id,...)