I want to run a function on a large 2D complex array (eventually 2**12 x 2**12 data points). However, PyCUDA does not behave as I expect: the ElementwiseKernel doesn't work with 2D arrays, so I used SourceModule with explicit block sizes instead.
The problem is that the kernel running on the GPU does not give the same result as the numpy calculation on the CPU: I get very large and strange numbers.
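For reference, the numpy calculation on the CPU that I compare against is essentially the following (d_ref is just an illustrative name; the kernel only rebuilds x from its magnitude and phase, so d/x should be 1 everywhere):

import numpy as np

A = 2**3
B = 2**3
x = np.fromfunction(lambda a, b: (1. + a) * np.exp(1.j * b * np.pi / 10), (A, B))

# Same operation as the kernel: magnitude times exp(i * phase)
d_ref = np.abs(x) * np.exp(1.j * np.angle(x))

assert np.allclose(d_ref, x)       # holds on the CPU
assert np.allclose(d_ref / x, 1.)  # the ratio printed and plotted below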
I'm using the following code. What's going wrong?
#!/usr/bin/env python
#https://github.com/lebedov/scikits.cuda/blob/master/demos/indexing_2d_demo.py
"""
Demonstrates how to access 2D arrays within a PyCUDA kernel in a
numpy-consistent manner.
"""
from string import Template
import pycuda
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
from pycuda.compiler import SourceModule
import numpy as np
from matplotlib import pyplot as plt
# Set size
A = 2**3
B = 2**3
N = A*B
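# Test data: x is a complex 2D array, y is a real ramp cast to x's (complex)
# dtype, and d will hold the result computed on the GPU.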
x_gpu = gpuarray.to_gpu(np.fromfunction(lambda x,y: (1.+x)*np.exp(1.j*y*np.pi/10), (A,B)))
y_gpu = gpuarray.to_gpu(np.fromfunction(lambda x,y: 1.*x, (A,B)).astype(x_gpu.dtype))
d_gpu = gpuarray.to_gpu(np.zeros_like(x_gpu.get()))#.astype(np.float32))
func_mod_template = Template("""
// Macro for converting subscripts to linear index:
#define INDEX(a, b) a*${B}+b
#include <pycuda-complex.hpp>
//__global__ void func(double *d,double *x,double *y, unsigned int N) {
__global__ void func(pycuda::complex<float> *d, pycuda::complex<float> *x,
                     pycuda::complex<float> *y)
{
    // Obtain the linear index corresponding to the current thread:
    // unsigned int idx = blockIdx.y*blockDim.y*gridDim.x +
    //                    blockIdx.x*blockDim.x*gridDim.y + threadIdx.x + threadIdx.y;
    unsigned int block_num = blockIdx.x + blockIdx.y * gridDim.x;
    unsigned int thread_num = threadIdx.y * blockDim.x + threadIdx.x;
    unsigned int threads_in_block = blockDim.x * blockDim.y;
    unsigned int idx = threads_in_block * block_num + thread_num;
    // Convert the linear index to subscripts:
    unsigned int a = idx / ${B};
    unsigned int b = idx % ${B};
    // Use the subscripts to access the array:
    // d[INDEX(a,b)] = x[INDEX(a,b)] + y[INDEX(a,b)];
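    // Rebuild x from its magnitude and phase: d = |x| * exp(i*arg(x)),
    // so d is expected to equal x element-wise.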
    pycuda::complex<float> j(0, arg(x[idx]));
    pycuda::complex<float> i(abs(x[idx]), 0);
    d[idx] = i * exp(j);
}
""")
max_threads_per_block = pycuda.autoinit.device.get_attribute(pycuda._driver.device_attribute.MAX_THREADS_PER_BLOCK)
max_block_dim = (pycuda.autoinit.device.get_attribute(pycuda._driver.device_attribute.MAX_BLOCK_DIM_X),
                 pycuda.autoinit.device.get_attribute(pycuda._driver.device_attribute.MAX_BLOCK_DIM_Y),
                 pycuda.autoinit.device.get_attribute(pycuda._driver.device_attribute.MAX_BLOCK_DIM_Z))
max_grid_dim = (pycuda.autoinit.device.get_attribute(pycuda._driver.device_attribute.MAX_GRID_DIM_X),
                pycuda.autoinit.device.get_attribute(pycuda._driver.device_attribute.MAX_GRID_DIM_Y),
                pycuda.autoinit.device.get_attribute(pycuda._driver.device_attribute.MAX_GRID_DIM_Z))
max_blocks_per_grid = max(max_grid_dim)
block_dim = (max_block_dim[0], 1, 1)
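# Grid size: enough blocks along each dimension to cover the whole 2D array
# with the chosen block shape.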
grid_dim = (int(np.ceil(1.*x_gpu.shape[0]/block_dim[0])),
            int(np.ceil(1.*x_gpu.shape[1]/block_dim[1])))
print block_dim, grid_dim, N
func_mod = \
    SourceModule(func_mod_template.substitute(max_threads_per_block=max_threads_per_block,
                                              max_blocks_per_grid=max_blocks_per_grid,
                                              A=A, B=B))
func = func_mod.get_function('func')
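# Launch the kernel; d_gpu should come back equal to x_gpu.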
func(d_gpu, x_gpu, y_gpu,
     block=block_dim,
     grid=grid_dim)
print d_gpu.get()/x_gpu.get()
#print 'Success status: ', np.allclose(x_np, x_gpu.get())
plt.imshow((d_gpu.get()/x_gpu.get()).real)
plt.colorbar()
plt.show()