I'm trying to wrap the cublasXt*gemm
functions in CUDA 9.0 with ctypess in Python 2.7.14 on Ubuntu Linux 16.04. These functions accept arrays in host memory as some of their arguments. I have been able to use them successfully in C++ as follows:
#include <iostream>
#include <cstdlib>
#include "cublasXt.h"
#include "cuda_runtime_api.h"
void rand_mat(float* &x, int m, int n) {
x = new float[m*n];
for (int i=0; i<m; ++i) {
for (int j=0; j<n; ++j) {
x[i*n+j] = ((float)rand())/RAND_MAX;
}
}
}
int main(void) {
cublasXtHandle_t handle;
cublasXtCreate(&handle);
int devices[1] = {0};
if (cublasXtDeviceSelect(handle, 1, devices) !=
CUBLAS_STATUS_SUCCESS) {
std::cout << "initialization failed" << std::endl;
return 1;
}
float *a, *b, *c;
int m = 4, n = 4, k = 4;
rand_mat(a, m, k);
rand_mat(b, k, n);
rand_mat(c, m, n);
float alpha = 1.0;
float beta = 0.0;
if (cublasXtSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
m, n, k, &alpha, a, m, b, k, &beta, c, m) !=
CUBLAS_STATUS_SUCCESS) {
std::cout << "matrix multiply failed" << std::endl;
return 1;
}
delete a; delete b; delete c;
cublasXtDestroy(handle);
}
However, when I try to wrap them in Python as follows, I encounter a segfault at the cublasXt*gemm
call:
import ctypes
import numpy as np
_libcublas = ctypes.cdll.LoadLibrary('libcublas.so')
_libcublas.cublasXtCreate.restype = int
_libcublas.cublasXtCreate.argtypes = [ctypes.c_void_p]
_libcublas.cublasXtDestroy.restype = int
_libcublas.cublasXtDestroy.argtypes = [ctypes.c_void_p]
_libcublas.cublasXtDeviceSelect.restype = int
_libcublas.cublasXtDeviceSelect.argtypes = [ctypes.c_void_p,
ctypes.c_int,
ctypes.c_void_p]
_libcublas.cublasXtSgemm.restype = int
_libcublas.cublasXtSgemm.argtypes = [ctypes.c_void_p,
ctypes.c_int,
ctypes.c_int,
ctypes.c_int,
ctypes.c_int,
ctypes.c_int,
ctypes.c_void_p,
ctypes.c_void_p,
ctypes.c_int,
ctypes.c_void_p,
ctypes.c_int,
ctypes.c_void_p,
ctypes.c_void_p,
ctypes.c_int]
handle = ctypes.c_void_p()
_libcublas.cublasXtCreate(ctypes.byref(handle))
deviceId = np.array([0], np.int32)
status = _libcublas.cublasXtDeviceSelect(handle, 1,
deviceId.ctypes.data)
if status:
raise RuntimeError
a = np.random.rand(4, 4).astype(np.float32)
b = np.random.rand(4, 4).astype(np.float32)
c = np.zeros((4, 4), np.float32)
status = _libcublas.cublasXtSgemm(handle, 0, 0, 4, 4, 4,
ctypes.byref(ctypes.c_float(1.0)),
a.ctypes.data, 4, b.ctypes.data, 4,
ctypes.byref(ctypes.c_float(0.0)),
c.ctypes.data, 4)
if status:
raise RuntimeError
print 'success? ', np.allclose(np.dot(a.T, b.T).T, c_gpu.get())
_libcublas.cublasXtDestroy(handle)
Curiously, the Python wrappers above work if I slightly modify them to accept pycuda.gpuarray.GPUArray
matrices that I have transferred to the GPU. Any thoughts as to why I am encountering a segfault only in Python when passing host memory to the function?