I wrote an OpenCL function to increase 64-bits float point value in an array. But the results is different between CPU and GPU.
import numpy as np
import pyopencl as cl
CL_INC = '''
__kernel void inc_f64(__global const double *a_g, __global double *res_g)
{
int gid = get_global_id(0);
res_g[gid] = a_g[gid] + 1.0;
}
'''
def test(dev_type):
ctx = cl.Context(dev_type=dev_type)
queue = cl.CommandQueue(ctx)
mf = cl.mem_flags
prg = cl.Program(ctx, CL_INC).build()
in_py = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
out_py = np.empty_like(in_py)
in_cl = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=in_py)
out_cl = cl.Buffer(ctx, mf.WRITE_ONLY, in_py.nbytes)
prg.inc_f64(queue, in_py.shape, None, in_cl, out_cl)
cl.enqueue_copy(queue, out_py, out_cl)
queue.finish()
return out_py
print('Run inc_f64() on CPU: ', end='')
print(test(cl.device_type.CPU))
print('Run inc_f64() on GPU: ', end='')
print(test(cl.device_type.GPU))
Output:
Run inc_f64() on CPU: [2. 3. 4. 5. 6.]
Run inc_f64() on GPU: [2.40000038e+001 3.20000076e+001 5.26354425e-315 0.00000000e+000
0.00000000e+000]
Hardware informations:
[0] Apple / OpenCL 1.2 (Oct 31 2017 18:30:00)
|- [0:0] CPU / OpenCL 1.2 / Intel(R) Core(TM) i7-3667U CPU @ 2.00GHz
|- [0:1] GPU / OpenCL 1.2 / HD Graphics 4000
Is it a hardware limitation or just a bug in the source code?