Before doing anything else, try running more blocks. A given block only ever runs on a single SM, so by using only 16 blocks you are guaranteeing that roughly 40% of the capacity of your GTX260-216, which has 27 SMs, will sit idle; some multiple of 27 should be the optimal number of blocks. You might also find that reducing the number of threads per block won't hurt performance, so you can keep about the same amount of work per thread but do it with enough blocks to cover all the SMs in the GPU.
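For example, rather than hard-coding the block count, you can query the SM count at runtime and launch some multiple of it. A minimal sketch (blocksPerSM is just an illustrative tuning knob):

cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, 0);    // device 0 assumed
int nSM = prop.multiProcessorCount;   // 27 on a GTX260-216
int blocksPerSM = 4;                  // tune empirically
dim3 gridDim(nSM * blocksPerSM, 1, 1);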
EDIT:
Just to illustrate the point, consider this little test harness for your kernel:
#include <stdio.h>
#include <stdlib.h>

// errchk/rterrchk are minimal error-checking macros; any equivalent implementation will do
#define errchk(ans) { cudaError_t err_ = (ans); if (err_ != cudaSuccess) { fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(err_), __FILE__, __LINE__); exit((int)err_); } }
#define rterrchk errchk( cudaPeekAtLastError() )

// KernelcomputeForces is assumed to be your kernel, declared elsewhere
template<int blocksize, int gridsize>
float GPUcomputeForces( unsigned int n, float* gm, float* gpx, float* gpy, float* gpz, float* gfx, float* gfy, float* gfz ){
float time;
dim3 gridDim( gridsize, 1, 1 ); // specifies how many blocks in three possible dimensions
dim3 blockDim( blocksize, 1, 1 ); // threads per block
cudaEvent_t start, stop;
errchk( cudaEventCreate(&start) );
errchk( cudaEventCreate(&stop) );
errchk( cudaEventRecord(start, 0) );
KernelcomputeForces<<<gridDim, blockDim>>>( n, gm, gpx, gpy, gpz, gfx, gfy, gfz );
rterrchk;
errchk( cudaEventRecord(stop, 0) );
errchk( cudaEventSynchronize(stop) );
errchk( cudaEventElapsedTime(&time, start, stop) );
return time;
}
int main(void)
{
const int n = 16384;
size_t gsize = sizeof(float) * size_t(n);
float * g[4], * _g[7];
errchk( cudaSetDevice(1) ); // GTX 275
for(int i=0; i<7; i++)
errchk( cudaMalloc((void **)&_g[i], gsize) );
for(int i=0; i<4; i++)
g[i] = (float *)malloc(gsize);
for(int i=0; i<n; i++)
for(int j=0; j<4; j++)
*(g[j]+i) = (float)drand48();
for(int i=0; i<4; i++)
errchk( cudaMemcpy(_g[i], g[i], gsize, cudaMemcpyHostToDevice) );
// Warm up to take context establishment time out.
GPUcomputeForces<16,512>(n,_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6]);
// Bench runs
printf("(1,1)@(512,1,1): %f\n", GPUcomputeForces<1,512>(n,_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6]) );
printf("(8,1)@(512,1,1): %f\n", GPUcomputeForces<8,512>(n,_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6]) );
printf("(16,1)@(512,1,1): %f\n", GPUcomputeForces<16,512>(n,_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6]) );
printf("(30,1)@(256,1,1): %f\n", GPUcomputeForces<30,256>(n,_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6]) );
printf("(60,1)@(128,1,1): %f\n", GPUcomputeForces<60,128>(n,_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6]) );
printf("(120,1)@(64,1,1): %f\n", GPUcomputeForces<120,64>(n,_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6]) );
printf("(240,1)@(32,1,1): %f\n", GPUcomputeForces<240,32>(n,_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6]) );
cudaThreadExit();
return 0;
}
When I run that, I get the following execution times on a stock GTX 275 (all times are in milliseconds):
(1,1)@(512,1,1): 1087.107910
(8,1)@(512,1,1): 135.582458
(16,1)@(512,1,1): 67.876671
(30,1)@(256,1,1): 54.881279
(60,1)@(128,1,1): 35.261280
(120,1)@(64,1,1): 36.316288
(240,1)@(32,1,1): 39.870495
I.e., running at least as many blocks as there are MPs on the card is critical to improving performance, even when using very small block sizes.
@talonmies has already answered this question, showing how shared memory helps improve performance, so there is not much more to add on that front. I just want to provide full code with different optimization steps, including tiling, shared memory, and shuffle operations, and to show some results of tests performed on a Kepler K20c.
The code below, moulded around the one presented by @talonmies, implements five kernel functions:

- KernelcomputeForces: no optimization;
- KernelcomputeForces_Shared: tiling over the source masses plus shared memory;
- KernelcomputeForces_Tiling: tiling over the destination masses;
- KernelcomputeForces_Tiling_Shared: tiling over both the source and destination masses plus shared memory;
- KernelcomputeForces_Tiling_Shuffle: same as the previous one, but using shuffle operations instead of shared memory.
Compared to what has already been published in the literature (Fast N-Body Simulation with CUDA) and to what is already available as code (see the above answers and Mark Harris' GitHub N-body page), the last kernel is perhaps the only new thing. But I have played a bit with N-body and found it useful to post this answer, as it is potentially useful to future users.

Here is the code:
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#define GRAVITATIONAL_CONST (6.67*1e-11)
#define SOFTENING 1e-9f
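// --- For reference: every kernel below accumulates, for each target mass i, the softened force
// ---     F_i = G * m_i * sum_j [ m_j * (r_j - r_i) / (|r_j - r_i|^2 + SOFTENING)^(3/2) ]
// --- the invDist3 and magnitude lines compute this sum term by term.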
/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
/**********/
/* iDivUp */
/**********/
int iDivUp(int a, int b) { return ((a % b) != 0) ? (a / b + 1) : (a / b); } // --- Round-up integer division, used to size the grid
/*******************/
/* KERNEL FUNCTION */
/*******************/
template<int BLOCKSIZE>
__global__
void KernelcomputeForces(float* m_d, float* x_d, float* y_d, float* z_d, float* fx_d, float* fy_d, float* fz_d, unsigned int N)
{
int tid = blockDim.x * blockIdx.x + threadIdx.x;
if (tid < N) {
// --- Initialize register accumulators for forces
float fx_temp = 0.0f, fy_temp = 0.0f, fz_temp = 0.0f;
// --- Move target particle data to registers
float x_reg = x_d[tid], y_reg = y_d[tid], z_reg = z_d[tid], m_reg = m_d[tid];
// --- Interact all with all
for(unsigned int ib=0; ib<N; ib++) {
// --- Compute relative distances
float dx = (x_d[ib] - x_reg);
float dy = (y_d[ib] - y_reg);
float dz = (z_d[ib] - z_reg);
float distanceSquared = dx*dx + dy*dy + dz*dz;
// --- Prevent slingshots and division by zero
distanceSquared += SOFTENING;
float invDist = rsqrtf(distanceSquared);
float invDist3 = invDist * invDist * invDist;
// --- Calculate gravitational magnitude between the bodies
float magnitude = GRAVITATIONAL_CONST * (m_reg * m_d[ib]) * invDist3;
// --- Calculate forces for the bodies: magnitude times direction
fx_temp += magnitude*dx;
fy_temp += magnitude*dy;
fz_temp += magnitude*dz;
}
// --- Stores local memory to global memory
fx_d[tid] = fx_temp;
fy_d[tid] = fy_temp;
fz_d[tid] = fz_temp;
}
}
/**********************************/
/* KERNEL FUNCTION: SHARED MEMORY */
/**********************************/
template<int BLOCKSIZE>
__global__
void KernelcomputeForces_Shared(float* m_d, float* x_d, float* y_d, float* z_d, float* fx_d, float* fy_d, float* fz_d, unsigned int N)
{
int tid = blockDim.x * blockIdx.x + threadIdx.x;
if (tid < N) {
__shared__ float x_sh[BLOCKSIZE], y_sh[BLOCKSIZE], z_sh[BLOCKSIZE], m_sh[BLOCKSIZE];
// --- Initialize register accumulators for forces
float fx_temp = 0.0f, fy_temp = 0.0f, fz_temp = 0.0f;
// --- Move target particle data to registers
float x_reg = x_d[tid], y_reg = y_d[tid], z_reg = z_d[tid], m_reg = m_d[tid];
// --- Interact all with all
for(unsigned int ib=0; ib<N; ib+=BLOCKSIZE) {
// --- Load a tile of source data into shared memory (N is assumed to be a multiple of BLOCKSIZE)
x_sh[threadIdx.x] = x_d[ib + threadIdx.x];
y_sh[threadIdx.x] = y_d[ib + threadIdx.x];
z_sh[threadIdx.x] = z_d[ib + threadIdx.x];
m_sh[threadIdx.x] = m_d[ib + threadIdx.x];
__syncthreads();
#pragma unroll
for(unsigned int ic=0; ic<BLOCKSIZE; ic++) {
// --- Compute relative distances
float dx = (x_sh[ic] - x_reg);
float dy = (y_sh[ic] - y_reg);
float dz = (z_sh[ic] - z_reg);
float distanceSquared = dx*dx + dy*dy + dz*dz;
// --- Prevent slingshots and division by zero
distanceSquared += SOFTENING;
float invDist = rsqrtf(distanceSquared);
float invDist3 = invDist * invDist * invDist;
// --- Calculate gravitational magnitude between the bodies
float magnitude = GRAVITATIONAL_CONST * (m_reg * m_sh[ic]) * invDist3;
// --- Calculate forces for the bodies: magnitude times direction
fx_temp += magnitude*dx;
fy_temp += magnitude*dy;
fz_temp += magnitude*dz;
}
__syncthreads();
}
// --- Stores local memory to global memory
fx_d[tid] = fx_temp;
fy_d[tid] = fy_temp;
fz_d[tid] = fz_temp;
}
}
/***************************/
/* KERNEL FUNCTION: TILING */
/***************************/
template<int BLOCKSIZE>
__global__
void KernelcomputeForces_Tiling(float* m_d, float* x_d, float* y_d, float* z_d, float* fx_d, float* fy_d, float* fz_d, unsigned int N)
{
for (unsigned int tid = blockIdx.x*blockDim.x + threadIdx.x;
tid < N;
tid += blockDim.x*gridDim.x) {
// --- Initialize register accumulators for forces
float fx_temp = 0.0f, fy_temp = 0.0f, fz_temp = 0.0f;
// --- Move target particle data to registers
float x_reg = x_d[tid], y_reg = y_d[tid], z_reg = z_d[tid], m_reg = m_d[tid];
// --- Interact all with all
for(unsigned int ib=0; ib<N; ib++) {
// --- Compute relative distances
float dx = (x_d[ib] - x_reg);
float dy = (y_d[ib] - y_reg);
float dz = (z_d[ib] - z_reg);
float distanceSquared = dx*dx + dy*dy + dz*dz;
// --- Prevent slingshots and division by zero
distanceSquared += SOFTENING;
float invDist = rsqrtf(distanceSquared);
float invDist3 = invDist * invDist * invDist;
// --- Calculate gravitational magnitude between the bodies
float magnitude = GRAVITATIONAL_CONST * (m_reg * m_d[ib]) * invDist3;
// --- Calculate forces for the bodies: magnitude times direction
fx_temp += magnitude*dx;
fy_temp += magnitude*dy;
fz_temp += magnitude*dz;
}
// --- Stores local memory to global memory
fx_d[tid] = fx_temp;
fy_d[tid] = fy_temp;
fz_d[tid] = fz_temp;
}
}
/*******************************************/
/* KERNEL FUNCTION: TILING + SHARED MEMORY */
/*******************************************/
template<int BLOCKSIZE>
__global__
void KernelcomputeForces_Tiling_Shared(float* m_d, float* x_d, float* y_d, float* z_d, float* fx_d, float* fy_d, float* fz_d, unsigned int N)
{
for (unsigned int tid = blockIdx.x*blockDim.x + threadIdx.x;
tid < N;
tid += blockDim.x*gridDim.x) {
__shared__ float x_sh[BLOCKSIZE], y_sh[BLOCKSIZE], z_sh[BLOCKSIZE], m_sh[BLOCKSIZE];
// --- Initialize register accumulators for forces
float fx_temp = 0.0f, fy_temp = 0.0f, fz_temp = 0.0f;
// --- Move target particle data to registers
float x_reg = x_d[tid], y_reg = y_d[tid], z_reg = z_d[tid], m_reg = m_d[tid];
// --- Interact all with all
for(unsigned int ib=0; ib<N; ib+=BLOCKSIZE) {
// --- Load a tile of source data into shared memory (N is assumed to be a multiple of BLOCKSIZE)
x_sh[threadIdx.x] = x_d[ib + threadIdx.x];
y_sh[threadIdx.x] = y_d[ib + threadIdx.x];
z_sh[threadIdx.x] = z_d[ib + threadIdx.x];
m_sh[threadIdx.x] = m_d[ib + threadIdx.x];
__syncthreads();
#pragma unroll
for(unsigned int ic=0; ic<BLOCKSIZE; ic++) {
// --- Compute relative distances
float dx = (x_sh[ic] - x_reg);
float dy = (y_sh[ic] - y_reg);
float dz = (z_sh[ic] - z_reg);
float distanceSquared = dx*dx + dy*dy + dz*dz;
// --- Prevent slingshots and division by zero
distanceSquared += SOFTENING;
float invDist = rsqrtf(distanceSquared);
float invDist3 = invDist * invDist * invDist;
// --- Calculate gravitational magnitude between the bodies
float magnitude = GRAVITATIONAL_CONST * (m_reg * m_sh[ic]) * invDist3;
// --- Calculate forces for the bodies: magnitude times direction
fx_temp += magnitude*dx;
fy_temp += magnitude*dy;
fz_temp += magnitude*dz;
}
__syncthreads();
}
// --- Stores local memory to global memory
fx_d[tid] = fx_temp;
fy_d[tid] = fy_temp;
fz_d[tid] = fz_temp;
}
}
/************************************************/
/* KERNEL FUNCTION: TILING + SHUFFLE OPERATIONS */
/************************************************/
__global__
void KernelcomputeForces_Tiling_Shuffle(float* m_d, float* x_d, float* y_d, float* z_d, float* fx_d, float* fy_d, float* fz_d, unsigned int N)
{
const int laneid = threadIdx.x & 31;
for (unsigned int tid = blockIdx.x*blockDim.x + threadIdx.x;
tid < N;
tid += blockDim.x*gridDim.x) {
// --- Initialize register accumulators for forces
float fx_temp = 0.0f, fy_temp = 0.0f, fz_temp = 0.0f;
// --- Move target particle data to registers
float x_reg = x_d[tid], y_reg = y_d[tid], z_reg = z_d[tid], m_reg = m_d[tid];
// --- Interact all with all
for(unsigned int ib=0; ib<N; ib+=32) {
// --- Each lane loads one source particle into registers; the warp then shares them via shuffles
float x_src = x_d[ib + laneid];
float y_src = y_d[ib + laneid];
float z_src = z_d[ib + laneid];
float m_src = m_d[ib + laneid];
#pragma unroll 32
for(unsigned int ic=0; ic<32; ic++) {
// --- Compute relative distances
float dx = (__shfl(x_src, ic) - x_reg);
float dy = (__shfl(y_src, ic) - y_reg);
float dz = (__shfl(z_src, ic) - z_reg);
float distanceSquared = dx*dx + dy*dy + dz*dz;
// --- Prevent slingshots and division by zero
distanceSquared += SOFTENING;
float invDist = rsqrtf(distanceSquared);
float invDist3 = invDist * invDist * invDist;
// --- Calculate gravitational magnitude between the bodies
float magnitude = GRAVITATIONAL_CONST * (m_reg * __shfl(m_src, ic)) * invDist3;
// --- Calculate forces for the bodies: magnitude times direction
fx_temp += magnitude*dx;
fy_temp += magnitude*dy;
fz_temp += magnitude*dz;
}
}
// --- Stores local memory to global memory
fx_d[tid] = fx_temp;
fy_d[tid] = fy_temp;
fz_d[tid] = fz_temp;
}
}
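// --- Note: __shfl is the Kepler-era intrinsic used here; CUDA 9.0 deprecated it in favour
// --- of the _sync variants, so on newer toolkits the same exchange would read, e.g.,
// ---     float dx = (__shfl_sync(0xffffffff, x_src, ic) - x_reg);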
/*****************************************/
/* WRAPPER FUNCTION FOR GPU CALCULATIONS */
/*****************************************/
template<int BLOCKSIZE>
float GPUcomputeForces(float* m_d, float* x_d, float* y_d, float* z_d, float* fx_d, float* fy_d, float* fz_d, unsigned int N) {
float time;
dim3 grid(iDivUp(N,BLOCKSIZE), 1, 1); // --- Specifies how many blocks in three possible dimensions
dim3 block(BLOCKSIZE, 1, 1); // --- Threads per block
cudaEvent_t start, stop;
gpuErrchk(cudaEventCreate(&start));
gpuErrchk(cudaEventCreate(&stop));
gpuErrchk(cudaEventRecord(start, 0));
KernelcomputeForces_Shared<BLOCKSIZE><<<grid, block>>>(m_d, x_d, y_d, z_d, fx_d, fy_d, fz_d, N);
//KernelcomputeForces_Tiling<BLOCKSIZE><<<grid, block>>>(m_d, x_d, y_d, z_d, fx_d, fy_d, fz_d, N);
//KernelcomputeForces<BLOCKSIZE><<<grid, block>>>(m_d, x_d, y_d, z_d, fx_d, fy_d, fz_d, N);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
gpuErrchk(cudaEventRecord(stop, 0));
gpuErrchk(cudaEventSynchronize(stop));
gpuErrchk(cudaEventElapsedTime(&time, start, stop));
return time;
}
/*****************************************/
/* WRAPPER FUNCTION FOR GPU CALCULATIONS */
/*****************************************/
template<int GRIDSIZE, int BLOCKSIZE>
float GPUcomputeForces_Tiling(float* m_d, float* x_d, float* y_d, float* z_d, float* fx_d, float* fy_d, float* fz_d, unsigned int N) {
float time;
dim3 grid(GRIDSIZE, 1, 1); // --- Specifies how many blocks in three possible dimensions
dim3 block(BLOCKSIZE, 1, 1); // --- Threads per block
cudaEvent_t start, stop;
gpuErrchk(cudaEventCreate(&start));
gpuErrchk(cudaEventCreate(&stop));
gpuErrchk(cudaEventRecord(start, 0));
//KernelcomputeForces_Tiling<BLOCKSIZE><<<grid, block>>>(m_d, x_d, y_d, z_d, fx_d, fy_d, fz_d, N);
KernelcomputeForces_Tiling_Shuffle<<<grid, block>>>(m_d, x_d, y_d, z_d, fx_d, fy_d, fz_d, N);
gpuErrchk(cudaPeekAtLastError());
gpuErrchk(cudaDeviceSynchronize());
gpuErrchk(cudaEventRecord(stop, 0));
gpuErrchk(cudaEventSynchronize(stop));
gpuErrchk(cudaEventElapsedTime(&time, start, stop));
return time;
}
/********************/
/* CPU CALCULATIONS */
/********************/
void CPUcomputeForces(float* m_h, float* x_h, float* y_h, float* z_h, float* fx_h, float* fy_h, float* fz_h, unsigned int N) {
for (unsigned int i=0; i<N; i++) {
float fx_temp = 0.0f, fy_temp = 0.0f, fz_temp = 0.0f;
// --- Interact all with all
for(unsigned int ib=0; ib<N; ib++) {
// --- Compute relative distances
float dx = (x_h[ib] - x_h[i]);
float dy = (y_h[ib] - y_h[i]);
float dz = (z_h[ib] - z_h[i]);
float distanceSquared = dx*dx + dy*dy + dz*dz;
// --- Prevent slingshots and division by zero
distanceSquared += SOFTENING;
float invDist = 1.f / sqrtf(distanceSquared);
float invDist3 = invDist * invDist * invDist;
// --- Calculate gravitational magnitude between the bodies
float magnitude = GRAVITATIONAL_CONST * (m_h[i] * m_h[ib]) * invDist3;
// --- Calculate forces for the bodies: magnitude times direction
fx_temp += magnitude*dx;
fy_temp += magnitude*dy;
fz_temp += magnitude*dz;
}
// --- Stores local memory to global memory
fx_h[i] = fx_temp;
fy_h[i] = fy_temp;
fz_h[i] = fz_temp;
}
}
/********/
/* MAIN */
/********/
int main(void)
{
const int N = 16384;
size_t gsize = sizeof(float) * size_t(N);
float * g[10], * _g[7];
for(int i=0; i<7; i++) gpuErrchk( cudaMalloc((void **)&_g[i], gsize));
for(int i=0; i<10; i++) g[i] = (float *)malloc(gsize);
for(int i=0; i<N; i++)
for(int j=0; j<4; j++)
*(g[j]+i) = (float)rand();
for(int i=0; i<4; i++) gpuErrchk(cudaMemcpy(_g[i], g[i], gsize, cudaMemcpyHostToDevice));
// --- Warm up to take context establishment time out.
GPUcomputeForces<512>(_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N);
//GPUcomputeForces_Tiling<32,512>(_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N);
// --- Bench runs
printf("1024: %f\n", GPUcomputeForces<512> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
printf("512: %f\n", GPUcomputeForces<512> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
printf("256: %f\n", GPUcomputeForces<256> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
printf("128: %f\n", GPUcomputeForces<128> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
printf("64: %f\n", GPUcomputeForces<64> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
printf("32: %f\n", GPUcomputeForces<32> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
printf("16, 1024: %f\n", GPUcomputeForces_Tiling<16,512> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
printf("8, 1024: %f\n", GPUcomputeForces_Tiling<8,512> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
printf("4, 1024: %f\n", GPUcomputeForces_Tiling<4,512> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
printf("32, 512: %f\n", GPUcomputeForces_Tiling<32,512> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
printf("16, 512: %f\n", GPUcomputeForces_Tiling<16,512> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
printf("8, 512: %f\n", GPUcomputeForces_Tiling<8,512> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
printf("64, 256: %f\n", GPUcomputeForces_Tiling<64,256> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
printf("32, 256: %f\n", GPUcomputeForces_Tiling<32,256> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
printf("16, 256: %f\n", GPUcomputeForces_Tiling<16,256> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
printf("128,128: %f\n", GPUcomputeForces_Tiling<128,128>(_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
printf("64, 128: %f\n", GPUcomputeForces_Tiling<64,128> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
printf("32, 128: %f\n", GPUcomputeForces_Tiling<32,128> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
printf("256,64: %f\n", GPUcomputeForces_Tiling<256,64> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
printf("128,64: %f\n", GPUcomputeForces_Tiling<128,64> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
printf("64, 64: %f\n", GPUcomputeForces_Tiling<64,64> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
printf("512,32: %f\n", GPUcomputeForces_Tiling<512,32> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
printf("256,32: %f\n", GPUcomputeForces_Tiling<512,32> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
printf("128,32: %f\n", GPUcomputeForces_Tiling<512,32> (_g[0],_g[1],_g[2],_g[3],_g[4],_g[5],_g[6],N));
// --- Copy the three GPU force components back and compare them with the CPU reference
for(int i=7; i<10; i++) gpuErrchk(cudaMemcpy(g[i], _g[i-3], gsize, cudaMemcpyDeviceToHost));
CPUcomputeForces(g[0],g[1],g[2],g[3],g[4],g[5],g[6],N);
for(int i=0; i<N; i++)
for(int j=7; j<10; j++) {
if (fabsf(*(g[j]+i) - *(g[j-3]+i)) > 0.001f) {
printf("Error at %i, %i; GPU = %f; CPU = %f\n", i, (j-7), *(g[j]+i), *(g[j-3]+i));
return 1;
}
}
printf("Test passed!\n");
cudaDeviceReset();
return 0;
}
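For reference, the tests can be built with something along the lines of the following (the file name nbody.cu is just an assumption; the K20c is compute capability 3.5, and the shuffle intrinsics require at least sm_30):

nvcc -arch=sm_35 -o nbody nbody.cu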
Here are some results for N = 16384:

- KernelcomputeForces: optimal BLOCKSIZE = 128; t = 10.07ms;
- KernelcomputeForces_Shared: optimal BLOCKSIZE = 128; t = 7.04ms;
- KernelcomputeForces_Tiling: optimal BLOCKSIZE = 128; t = 11.14ms;
- KernelcomputeForces_Tiling_Shared: optimal GRIDSIZE = 64, optimal BLOCKSIZE = 256; t = 7.20ms;
- KernelcomputeForces_Tiling_Shuffle: optimal GRIDSIZE = 128, optimal BLOCKSIZE = 128; t = 6.84ms.
Warp shuffle operations seem to improve performance slightly over shared memory, presumably because they avoid the shared-memory round trip and the associated __syncthreads() calls.