Following a previous question ( Pycuda - Best way to perform a high number of small matrix inversion - CUBLAS or MAGMA ) about inverting a high number of 4x4 matrices, I would like to do the same with 3x3 matrices. As @Robert Crovella said, this change implies a complete rewrite.
Given the code shown below, I tried a few things, such as putting zeros in place of some values, but this approach doesn't seem to work.
Here is the working code for a high number of 4x4 matrix inversions:
$ cat t10.py
import numpy as np
import pycuda.driver as cuda
from pycuda.compiler import SourceModule
import pycuda.autoinit
# kernel
kernel = SourceModule("""
// extract the next 4-bit index from the packed pattern word
__device__ unsigned getoff(unsigned &off){
  unsigned ret = off & 0x0F;
  off = off >> 4;
  return ret;
}

const int block_size = 256;
const unsigned tmsk = 0xFFFFFFFF;
// in-place is acceptable (i.e. out == in)
// T = float or double only
typedef float T;
__global__ void inv4x4(const T * __restrict__ in, T * __restrict__ out, const size_t n, const unsigned * __restrict__ pat){
  __shared__ T si[block_size];
  size_t idx = threadIdx.x+blockDim.x*blockIdx.x;
  if (idx < n*16){
    si[threadIdx.x] = in[idx];
    unsigned lane = threadIdx.x & 15;       // one thread per matrix element, 16 threads per matrix
    unsigned sibase = threadIdx.x & 0x03F0; // base of this matrix in shared memory
    __syncwarp();
    unsigned off = pat[lane];
    T a,b;
    a  = si[sibase + getoff(off)];
    a *= si[sibase + getoff(off)];
    a *= si[sibase + getoff(off)];
    if (!getoff(off)) a = -a;
    b  = si[sibase + getoff(off)];
    b *= si[sibase + getoff(off)];
    b *= si[sibase + getoff(off)];
    if (getoff(off)) a += b;
    else             a -= b;
    off = pat[lane+16];
    b  = si[sibase + getoff(off)];
    b *= si[sibase + getoff(off)];
    b *= si[sibase + getoff(off)];
    if (getoff(off)) a += b;
    else             a -= b;
    b  = si[sibase + getoff(off)];
    b *= si[sibase + getoff(off)];
    b *= si[sibase + getoff(off)];
    if (getoff(off)) a += b;
    else             a -= b;
    off = pat[lane+32];
    b  = si[sibase + getoff(off)];
    b *= si[sibase + getoff(off)];
    b *= si[sibase + getoff(off)];
    if (getoff(off)) a += b;
    else             a -= b;
    b  = si[sibase + getoff(off)];
    b *= si[sibase + getoff(off)];
    b *= si[sibase + getoff(off)];
    if (getoff(off)) a += b;
    else             a -= b;
    T det = si[sibase + (lane>>2)]*a;
    det += __shfl_down_sync(tmsk, det, 4, 16); // first add
    det += __shfl_down_sync(tmsk, det, 8, 16); // second add
    det  = __shfl_sync(tmsk, det, 0, 16);      // broadcast
    out[idx] = a / det;
  }
}
""")
# python function for inverting 4x4 matrices
# n should be an even number
def gpuinv4x4(inp, n):
    # internal constants not to be modified
    hpat = ( 0x0EB51FA5, 0x1EB10FA1, 0x0E711F61, 0x1A710B61, 0x1EB40FA4, 0x0EB01FA0, 0x1E700F60, 0x0A701B60, 0x0DB41F94, 0x1DB00F90, 0x0D701F50, 0x19700B50, 0x1DA40E94, 0x0DA01E90, 0x1D600E50, 0x09601A50, 0x1E790F69, 0x0E391F29, 0x1E350F25, 0x0A351B25, 0x0E781F68, 0x1E380F28, 0x0E341F24, 0x1A340B24, 0x1D780F58, 0x0D381F18, 0x1D340F14, 0x09341B14, 0x0D681E58, 0x1D280E18, 0x0D241E14, 0x19240A14, 0x0A7D1B6D, 0x1A3D0B2D, 0x063D172D, 0x16390729, 0x1A7C0B6C, 0x0A3C1B2C, 0x163C072C, 0x06381728, 0x097C1B5C, 0x193C0B1C, 0x053C171C, 0x15380718, 0x196C0A5C, 0x092C1A1C, 0x152C061C, 0x05281618)
    # Convert parameters into numpy array
    inpd = np.array(inp, dtype=np.float32)
    hpatd = np.array(hpat, dtype=np.uint32)
    output = np.empty((n*16), dtype=np.float32)
    # Get kernel function
    matinv4x4 = kernel.get_function("inv4x4")
    # Define block, grid and compute
    blockDim = (256,1,1) # do not change
    gridDim = (n//16+1,1,1) # integer division: grid dimensions must be ints (n/16 is a float under Python 3)
    # Kernel function
    matinv4x4(
        cuda.In(inpd), cuda.Out(output), np.uint64(n), cuda.In(hpatd),
        block=blockDim, grid=gridDim)
    return output
#example/test case
inp = (1.0, 1.0, 1.0, 0.0, 0.0, 3.0, 1.0, 2.0, 2.0, 3.0, 1.0, 0.0, 1.0, 0.0, 2.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0)
n = 2
result = gpuinv4x4(inp, n)
print(result.reshape(2,4,4))
$ python t10.py
[[[-3.   -0.5   1.5   1.  ]
  [ 1.    0.25 -0.25 -0.5 ]
  [ 3.    0.25 -1.25 -0.5 ]
  [-3.   -0.    1.    1.  ]]

 [[ 1.    0.    0.    0.  ]
  [ 0.    1.    0.    0.  ]
  [ 0.    0.    1.    0.  ]
  [ 0.    0.    0.    1.  ]]]
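For reference, I check the GPU result on the host against numpy.linalg.inv (a quick sanity check I added, not part of the GPU code; it assumes a NumPy version with batched inv, i.e. >= 1.8):

ref = np.linalg.inv(np.array(inp, dtype=np.float32).reshape(n,4,4))
print(np.allclose(result.reshape(n,4,4), ref, atol=1e-5))  # prints True when the kernel output matches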
I expect the same behavior, except that I am no longer working with 4x4 matrices but with 3x3 matrices.
If someone could help me adapt the code above to 3x3 matrix inversion, that would be nice.
UPDATE 1: Here are the modifications I made.
I changed the dimensions and used the direct formula from the link given by @Robert Crovella (https://ardoris.wordpress.com/2008/07/18/general-formula-for-the-inverse-of-a-3x3-matrix/). Below is the modified code:
import numpy as np
import pycuda.driver as cuda
from pycuda.compiler import SourceModule
import pycuda.autoinit
# kernel of 3x3 inversion
kernel_3x3 = SourceModule("""
// in-place is acceptable (i.e. out == in)
// T = float or double only
typedef float T;
__global__ void inv3x3(const T * __restrict__ in, T * __restrict__ out, const size_t n, const unsigned * __restrict__ pat){
  size_t ix = threadIdx.x;
  size_t idx = ix + blockDim.x*blockIdx.x;
  if (ix < n*9){
    T det = in[0+idx]*(in[4+idx]*in[8+idx]-in[7+idx]*in[5+idx])
          - in[1+idx]*(in[3+idx]*in[8+idx]-in[6+idx]*in[5+idx])
          + in[2+idx]*(in[3+idx]*in[7+idx]-in[6+idx]*in[4+idx]);
    out[0+idx] = (in[4+idx]*in[8+idx]-in[7+idx]*in[5+idx])/det;
    out[1+idx] = (in[2+idx]*in[7+idx]-in[1+idx]*in[8+idx])/det;
    out[2+idx] = (in[1+idx]*in[5+idx]-in[2+idx]*in[4+idx])/det;
    out[3+idx] = (in[6+idx]*in[5+idx]-in[3+idx]*in[8+idx])/det;
    out[4+idx] = (in[0+idx]*in[8+idx]-in[2+idx]*in[6+idx])/det;
    out[5+idx] = (in[2+idx]*in[3+idx]-in[0+idx]*in[5+idx])/det;
    out[6+idx] = (in[3+idx]*in[7+idx]-in[4+idx]*in[6+idx])/det;
    out[7+idx] = (in[1+idx]*in[6+idx]-in[0+idx]*in[7+idx])/det;
    out[8+idx] = (in[0+idx]*in[4+idx]-in[1+idx]*in[3+idx])/det;
    __syncwarp();
  }
}
""")
def gpuinv3x3(inp, n):
    # internal constants not to be modified (leftover from the 4x4 version; the inv3x3 kernel does not use pat)
    hpat = ( 0x0EB51FA5, 0x1EB10FA1, 0x0E711F61, 0x1A710B61, 0x1EB40FA4, 0x0EB01FA0, 0x1E700F60, 0x0A701B60, 0x0DB41F94, 0x1DB00F90, 0x0D701F50, 0x19700B50, 0x1DA40E94, 0x0DA01E90, 0x1D600E50, 0x09601A50, 0x1E790F69, 0x0E391F29, 0x1E350F25, 0x0A351B25, 0x0E781F68, 0x1E380F28, 0x0E341F24, 0x1A340B24, 0x1D780F58, 0x0D381F18, 0x1D340F14, 0x09341B14, 0x0D681E58, 0x1D280E18, 0x0D241E14, 0x19240A14, 0x0A7D1B6D, 0x1A3D0B2D, 0x063D172D, 0x16390729, 0x1A7C0B6C, 0x0A3C1B2C, 0x163C072C, 0x06381728, 0x097C1B5C, 0x193C0B1C, 0x053C171C, 0x15380718, 0x196C0A5C, 0x092C1A1C, 0x152C061C, 0x05281618)
    # Convert parameters into numpy array
    inpd = np.array(inp, dtype=np.float32)
    hpatd = np.array(hpat, dtype=np.uint32)
    output = np.empty((n*9), dtype=np.float32)
    # Get kernel function
    matinv3x3 = kernel_3x3.get_function("inv3x3")
    # Define block, grid and compute
    blockDim = (81,1,1) # do not change
    gridDim = (n//9+1,1,1) # integer division: grid dimensions must be ints
    # Kernel function
    matinv3x3(
        cuda.In(inpd), cuda.Out(output), np.uint64(n), cuda.In(hpatd),
        block=blockDim, grid=gridDim)
    return output
#example/test case
inp = (1.0, 1.0, 1.0, 0.0, 0.0, 3.0, 1.0, 2.0, 2.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0)
n = 2
result = gpuinv3x3(inp, n)
print(result.reshape(2,3,3))
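To check the formula itself, independently of the kernel, I also wrote a small host-side version of the same cofactor/determinant computation (a sketch for verification only; inv3x3_host is a helper name I made up, not part of the GPU path):

def inv3x3_host(m):
    # same adjugate/determinant formula as in the kernel, for one row-major 3x3 matrix m[0..8]
    det = m[0]*(m[4]*m[8]-m[7]*m[5]) - m[1]*(m[3]*m[8]-m[6]*m[5]) + m[2]*(m[3]*m[7]-m[6]*m[4])
    return [(m[4]*m[8]-m[7]*m[5])/det, (m[2]*m[7]-m[1]*m[8])/det, (m[1]*m[5]-m[2]*m[4])/det,
            (m[6]*m[5]-m[3]*m[8])/det, (m[0]*m[8]-m[2]*m[6])/det, (m[2]*m[3]-m[0]*m[5])/det,
            (m[3]*m[7]-m[4]*m[6])/det, (m[1]*m[6]-m[0]*m[7])/det, (m[0]*m[4]-m[1]*m[3])/det]

for k in range(n):
    print(np.array(inv3x3_host(inp[9*k:9*k+9])).reshape(3,3))

Run on the host, both matrices come out correctly (the identity comes back as the identity), so the math of the formula seems fine.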
On the GPU, however, the first matrix is correctly inverted but not the second one (the identity matrix, whose inverse is itself):
[[[ 2.         -0.         -1.        ]
  [-1.         -0.33333334  1.        ]
  [-0.          0.33333334 -0.        ]]

 [[ 1.         -0.5        -0.        ]
  [      -inf   1.         -1.        ]
  [       nan         nan   1.        ]]]
So the issue doesn't seem to come from the kernel code itself but rather from the batch size, or from something related to the dimensions of the global 1D array (in my code, you can see the two 3x3 matrices formatted as a 1D array of 18 elements: inp = (1.0, 1.0, 1.0, 0.0, 0.0, 3.0, 1.0, 2.0, 2.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0)).
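To be explicit about the layout I have in mind, matrix k is stored row-major at elements 9*k through 9*k+8, which a reshape makes visible (just an illustration):

print(np.array(inp, dtype=np.float32).reshape(n,3,3))  # matrix k occupies inp[9*k : 9*k+9]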
Can anyone see what's wrong in this code, especially the bad inversion of the second matrix? One last point: does an odd group size cause issues for GPU processing?