I want to run a function on a large 2D complex array (eventually 2**12 x 2**12 data points). However, PyCUDA does not behave as I expect: the ElementwiseKernel doesn't work with 2D arrays, so I used SourceModule with explicit block sizes instead.
The problem is that the kernel running on the GPU does not give the same result as the numpy calculation on the CPU: I get very large and strange numbers.
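For reference, the numpy calculation on the CPU that I compare against is essentially the following (d_ref is just an illustrative name; the kernel only rebuilds x from its magnitude and phase, so d/x should be 1 everywhere):

import numpy as np

A = 2**3
B = 2**3
x = np.fromfunction(lambda a, b: (1. + a) * np.exp(1.j * b * np.pi / 10), (A, B))

# Same operation as the kernel: magnitude times exp(i * phase)
d_ref = np.abs(x) * np.exp(1.j * np.angle(x))

assert np.allclose(d_ref, x)       # holds on the CPU
assert np.allclose(d_ref / x, 1.)  # the ratio printed and plotted below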
I'm using the following code. What's going wrong?
#!/usr/bin/env python
#https://github.com/lebedov/scikits.cuda/blob/master/demos/indexing_2d_demo.py
"""
Demonstrates how to access 2D arrays within a PyCUDA kernel in a
numpy-consistent manner.
"""
from string import Template
import pycuda
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
from pycuda.compiler import SourceModule
import numpy as np
from matplotlib import pyplot as plt
# Set size
A = 2**3
B = 2**3
N = A*B
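# Test data: x is a complex 2D array, y is a real ramp cast to x's (complex)
# dtype, and d will hold the result computed on the GPU.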
x_gpu = gpuarray.to_gpu(np.fromfunction(lambda x,y: (1.+x)*np.exp(1.j*y*np.pi/10), (A,B)))
y_gpu = gpuarray.to_gpu(np.fromfunction(lambda x,y: 1.*x, (A,B)).astype(x_gpu.dtype))
d_gpu = gpuarray.to_gpu(np.zeros_like(x_gpu.get()))#.astype(np.float32))
func_mod_template = Template("""
// Macro for converting subscripts to linear index:
#define INDEX(a, b) a*${B}+b
#include <pycuda-complex.hpp>
//__global__ void func(double *d,double *x,double *y, unsigned int N) {
__global__ void func(pycuda::complex<float> *d, pycuda::complex<float> *x,
                     pycuda::complex<float> *y)
{
    // Obtain the linear index corresponding to the current thread:
    // unsigned int idx = blockIdx.y*blockDim.y*gridDim.x +
    //                    blockIdx.x*blockDim.x*gridDim.y + threadIdx.x + threadIdx.y;
    unsigned int block_num = blockIdx.x + blockIdx.y * gridDim.x;
    unsigned int thread_num = threadIdx.y * blockDim.x + threadIdx.x;
    unsigned int threads_in_block = blockDim.x * blockDim.y;
    unsigned int idx = threads_in_block * block_num + thread_num;
    // Convert the linear index to subscripts:
    unsigned int a = idx / ${B};
    unsigned int b = idx % ${B};
    // Use the subscripts to access the array:
    // d[INDEX(a,b)] = x[INDEX(a,b)] + y[INDEX(a,b)];
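    // Rebuild x from its magnitude and phase: d = |x| * exp(i*arg(x)),
    // so d is expected to equal x element-wise.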
    pycuda::complex<float> j(0, arg(x[idx]));
    pycuda::complex<float> i(abs(x[idx]), 0);
    d[idx] = i * exp(j);
}
""")
max_threads_per_block = pycuda.autoinit.device.get_attribute(pycuda._driver.device_attribute.MAX_THREADS_PER_BLOCK)
max_block_dim = (pycuda.autoinit.device.get_attribute(pycuda._driver.device_attribute.MAX_BLOCK_DIM_X),
                 pycuda.autoinit.device.get_attribute(pycuda._driver.device_attribute.MAX_BLOCK_DIM_Y),
                 pycuda.autoinit.device.get_attribute(pycuda._driver.device_attribute.MAX_BLOCK_DIM_Z))
max_grid_dim = (pycuda.autoinit.device.get_attribute(pycuda._driver.device_attribute.MAX_GRID_DIM_X),
                pycuda.autoinit.device.get_attribute(pycuda._driver.device_attribute.MAX_GRID_DIM_Y),
                pycuda.autoinit.device.get_attribute(pycuda._driver.device_attribute.MAX_GRID_DIM_Z))
max_blocks_per_grid = max(max_grid_dim)
block_dim = (max_block_dim[0], 1, 1)
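# Grid size: enough blocks along each dimension to cover the whole 2D array
# with the chosen block shape.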
grid_dim = (int(np.ceil(1.*x_gpu.shape[0]/block_dim[0])),
            int(np.ceil(1.*x_gpu.shape[1]/block_dim[1])))
print block_dim, grid_dim, N
func_mod = \
    SourceModule(func_mod_template.substitute(max_threads_per_block=max_threads_per_block,
                                              max_blocks_per_grid=max_blocks_per_grid,
                                              A=A, B=B))
func = func_mod.get_function('func')
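# Launch the kernel; d_gpu should come back equal to x_gpu.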
func(d_gpu, x_gpu, y_gpu,
     block=block_dim,
     grid=grid_dim)
print d_gpu.get()/x_gpu.get()
#print 'Success status: ', np.allclose(x_np, x_gpu.get())
plt.imshow((d_gpu.get()/x_gpu.get()).real)
plt.colorbar()
plt.show()