I have a simulation that calculates the 3D vectors of charged particles moving in an electric and a magnetic field. I attempted to speed this up in CUDA using the `__align__` specifier, thinking that the limiting factor might be the global memory reads and writes, but using `__align__` actually slowed things down (perhaps because it increased the total memory requirement). I also tried `float3` and `float4`, but their performance was similar.
I've created a simplified version of this code and pasted it below to show my problem. The code below should be compilable, and by changing the definition of `CASE` on the fourth line to `0`, `1`, or `2`, the different options described above can be tried. Two functions, `ParticleMoverCPU` and `ParticleMoverGPU`, are defined to compare CPU vs. GPU performance.
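The `float4` variant I mentioned isn't one of the `CASE` options below, but it was essentially just the following (a hypothetical CASE 3; the fourth component goes unused):

// Hypothetical CASE 3: float4 forces the same 16-byte alignment as
// __align__(16), at the cost of an unused padding component .w
typedef float4 VecGPU;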
- Is there a reason why my attempts at memory coalescing slow my code down rather than speed it up? (My current layout is an array of structures; see the structure-of-arrays sketch after this list.)
- Is there anything else immediately obvious that I am not doing that I could do to get better than a 60x speedup for an "embarrassingly parallel" code such as this?
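For context on the coalescing question: with an array of structures, consecutive threads read floats that sit 12 (or 16) bytes apart. A structure-of-arrays layout, which I have not tried yet, would instead look roughly like this sketch (the `VecArraysGPU` name and layout are just for illustration):

// Hypothetical structure-of-arrays layout (untested): each component lives
// in its own contiguous array, so consecutive threads touch consecutive floats.
typedef struct {
    float *x; // np x-components
    float *y; // np y-components
    float *z; // np z-components
} VecArraysGPU;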
Thank you!
CPU - Intel Xeon E5620 @ 2.40 GHz
GPU - NVIDIA Tesla C2070
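(I'm compiling for the C2070's compute capability 2.0; the build command is roughly `nvcc -arch=sm_20 particles.cu`, with the file name here just a placeholder.)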
// CASE 0: Regular struct with 3 floats
// CASE 1: Aligned struct using __align__(16) with 3 floats
// CASE 2: float3
#define CASE 0 // define to either 0, 1 or 2 as described above
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <Windows.h>
#include <stdio.h>
#include <math.h>
#include <time.h>
#include <malloc.h>
#include <sys/stat.h>
#define CEX 10 // x-value of electric field (dimensionless and arbitrary)
#define CEY 0.1 // y-value of electric field (dimensionless and arbitrary)
#define CEZ 0.1 // z-value of electric field (dimensionless and arbitrary)
#define CBX 0.1 // x-value of magnetic field (dimensionless and arbitrary)
#define CBY 0.1 // y-value of magnetic field (dimensionless and arbitrary)
#define CBZ 10 // z-value of magnetic field (dimensionless and arbitrary)
#define FACTOR 15 // blocks-per-SM multiplier; I played around with this until I got the best speedup
#define THREADS 256 // threads per block; I played around with this until I got the best speedup
typedef struct{
    float x;
    float y;
    float z;
} VecCPU; // struct for vectors for CPU calculation
// Fastest method seems to be a regular unaligned struct with 3 floats
#if CASE==0
typedef struct {
    float x;
    float y;
    float z;
} VecGPU;
#endif
#if CASE==1
// This version seems slower. It is an attempt to align for memory coalescing
typedef struct __align__(16){
    float x;
    float y;
    float z;
} VecGPU;
#endif
// Using float3 seems to be about the same as defining our own vector3 structure
#if CASE==2
typedef float3 VecGPU;
#endif
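// Note: sizeof(VecGPU) is 12 bytes for CASE 0 and CASE 2, but 16 bytes for
// CASE 1, so the aligned version moves ~33% more data per particle.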
VecCPU *pos_c, *vel_c; // global position and velocity vectors for CPU calculation
__constant__ VecGPU *pos_d, *vel_d; // pointers stored in constant memory; they point to data in global memory
void ParticleMoverCPU(int np, int ts, float dt){
    int n = 0;
    while (n < np){
        VecCPU vminus, tvec, vprime, vplus;
        float tvec_fact;
        int it = 0;
        while (it < ts){
            // ----- Update velocities by the Boris method ------ //
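            // For reference, the Boris scheme implemented here is:
            //   v-    = v_n + E*(dt/2)            (first electric half-push)
            //   t     = B*(dt/2)                  (rotation vector)
            //   v'    = v- + v- x t
            //   v+    = v- + (v' x t)*2/(1+|t|^2) (magnetic rotation)
            //   v_n+1 = v+ + E*(dt/2)             (second electric half-push)
            // with q/m absorbed into the dimensionless field constants.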
            vminus.x = vel_c[n].x + CEX*0.5*dt;
            vminus.y = vel_c[n].y + CEY*0.5*dt;
            vminus.z = vel_c[n].z + CEZ*0.5*dt;
            tvec.x = CBX*0.5*dt;
            tvec.y = CBY*0.5*dt;
            tvec.z = CBZ*0.5*dt;
            tvec_fact = 2 / (1 + tvec.x*tvec.x + tvec.y*tvec.y + tvec.z*tvec.z);
            vprime.x = vminus.x + vminus.y*tvec.z - vminus.z*tvec.y;
            vprime.y = vminus.y + vminus.z*tvec.x - vminus.x*tvec.z;
            vprime.z = vminus.z + vminus.x*tvec.y - vminus.y*tvec.x;
            vplus.x = vminus.x + (vprime.y*tvec.z - vprime.z*tvec.y)*tvec_fact;
            vplus.y = vminus.y + (vprime.z*tvec.x - vprime.x*tvec.z)*tvec_fact;
            vplus.z = vminus.z + (vprime.x*tvec.y - vprime.y*tvec.x)*tvec_fact;
            vel_c[n].x = vplus.x + CEX*0.5*dt;
            vel_c[n].y = vplus.y + CEY*0.5*dt;
            vel_c[n].z = vplus.z + CEZ*0.5*dt;
            // ------ Update Particle positions -------------- //
            pos_c[n].x += vel_c[n].x*dt;
            pos_c[n].y += vel_c[n].y*dt;
            pos_c[n].z += vel_c[n].z*dt;
            it++;
        }
        n++;
    }
}
__global__ void ParticleMoverGPU(register int np, register int ts, register float dt){
    register int n = threadIdx.x + blockDim.x * blockIdx.x;
    while (n < np){
        register VecGPU vminus, tvec, vprime, vplus;
        register float tvec_fact;
        register int it = 0;
        while (it < ts){
            // ----- Update velocities by the Boris method ------ //
            vminus.x = vel_d[n].x + CEX*0.5*dt;
            vminus.y = vel_d[n].y + CEY*0.5*dt;
            vminus.z = vel_d[n].z + CEZ*0.5*dt;
            tvec.x = CBX*0.5*dt;
            tvec.y = CBY*0.5*dt;
            tvec.z = CBZ*0.5*dt;
            tvec_fact = 2 / (1 + tvec.x*tvec.x + tvec.y*tvec.y + tvec.z*tvec.z);
            vprime.x = vminus.x + vminus.y*tvec.z - vminus.z*tvec.y;
            vprime.y = vminus.y + vminus.z*tvec.x - vminus.x*tvec.z;
            vprime.z = vminus.z + vminus.x*tvec.y - vminus.y*tvec.x;
            vplus.x = vminus.x + (vprime.y*tvec.z - vprime.z*tvec.y)*tvec_fact;
            vplus.y = vminus.y + (vprime.z*tvec.x - vprime.x*tvec.z)*tvec_fact;
            vplus.z = vminus.z + (vprime.x*tvec.y - vprime.y*tvec.x)*tvec_fact;
            vel_d[n].x = vplus.x + CEX*0.5*dt;
            vel_d[n].y = vplus.y + CEY*0.5*dt;
            vel_d[n].z = vplus.z + CEZ*0.5*dt;
            // ------ Update Particle positions -------------- //
            pos_d[n].x += vel_d[n].x*dt;
            pos_d[n].y += vel_d[n].y*dt;
            pos_d[n].z += vel_d[n].z*dt;
            it++;
        }
        n += blockDim.x*gridDim.x; // grid-stride loop over particles
    }
}
int main(void){
    int np = 50000; // Number of Particles
    const int ts = 1000; // Number of Time-steps
    const float dt = 1E-3; // Time-step value
    // ----------- CPU ----------- //
    pos_c = (VecCPU*)malloc(sizeof(VecCPU)*np); // allocate memory for position
    vel_c = (VecCPU*)malloc(sizeof(VecCPU)*np); // allocate memory for velocity
    for (int n = 0; n < np; n++){
        pos_c[n].x = 0; pos_c[n].y = 0; pos_c[n].z = 0; // zero out position for CPU variables
        vel_c[n].x = 0; vel_c[n].y = 0; vel_c[n].z = 0; // zero out velocity for CPU variables
    }
    printf("Starting CPU kernel\n");
    clock_t startCPU;
    float CPUtime;
    startCPU = clock();
    ParticleMoverCPU(np, ts, dt); // Launch CPU kernel
    CPUtime = ((float)(clock() - startCPU)) / CLOCKS_PER_SEC;
    printf("CPU kernel finished\n");
    // Output final CPU computation time
    printf("CPUtime = %6.1f ms\n", ((float)CPUtime)*1E3);
    // ------------ GPU ----------- //
    cudaFuncSetCacheConfig(ParticleMoverGPU, cudaFuncCachePreferL1); // Prefer L1 over shared memory (doesn't have much effect)
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, 0);
    int blocks = deviceProp.multiProcessorCount;
    VecGPU *pos_g, *vel_g, *pos_l, *vel_l;
    pos_g = (VecGPU*)malloc(sizeof(VecGPU)*np); // allocate memory for positions on the CPU
    vel_g = (VecGPU*)malloc(sizeof(VecGPU)*np); // allocate memory for velocities on the CPU
    cudaMalloc((void**)&pos_l, sizeof(VecGPU)*np); // allocate memory for positions on the GPU
    cudaMalloc((void**)&vel_l, sizeof(VecGPU)*np); // allocate memory for velocities on the GPU
    cudaMemcpyToSymbol(pos_d, &pos_l, sizeof(void*)); // copy memory address of position to the constant memory pointer pos_d
    cudaMemcpyToSymbol(vel_d, &vel_l, sizeof(void*)); // copy memory address of velocity to the constant memory pointer vel_d
    for (int n = 0; n < np; n++){
        pos_g[n].x = 0; pos_g[n].y = 0; pos_g[n].z = 0; // zero out position for GPU variables (before copying to GPU)
        vel_g[n].x = 0; vel_g[n].y = 0; vel_g[n].z = 0; // zero out velocity for GPU variables (before copying to GPU)
    }
    cudaMemcpy(pos_l, pos_g, sizeof(VecGPU)*np, cudaMemcpyHostToDevice); // Copy positions to GPU global memory
    cudaMemcpy(vel_l, vel_g, sizeof(VecGPU)*np, cudaMemcpyHostToDevice); // Copy velocities to GPU global memory
    printf("Starting GPU kernel\n");
    // start cuda timer
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);
    ParticleMoverGPU<<<blocks*FACTOR, THREADS>>>(np, ts, dt); // Launch GPU kernel
    // stop cuda timer
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    float elapsedTime;
    cudaEventElapsedTime(&elapsedTime, start, stop);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    printf("GPU kernel finished\n");
    cudaMemcpy(pos_g, pos_l, sizeof(VecGPU)*np, cudaMemcpyDeviceToHost); // Copy positions from GPU memory back to CPU
    cudaMemcpy(vel_g, vel_l, sizeof(VecGPU)*np, cudaMemcpyDeviceToHost); // Copy velocities from GPU memory back to CPU
    // Output GPU computation time
    printf("GPUtime = %6.1f ms\n", elapsedTime);
    // Output speedup factor
    printf("CASE=%i, Speedup = %4.2f\n", CASE, CPUtime*1E3 / elapsedTime);
    // free allocated memory
    cudaFree(pos_l);
    cudaFree(vel_l);
    free(pos_g);
    free(vel_g);
    free(pos_c);
    free(vel_c);
    return 0;
}
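For what it's worth, here is a minimal agreement check I can drop in just before the frees, to confirm that both paths produce the same trajectories (a sketch; the 1e-4 tolerance is an arbitrary choice):

int mismatches = 0;
for (int n = 0; n < np; n++){
    // count particles whose CPU and GPU final positions disagree
    if (fabsf(pos_c[n].x - pos_g[n].x) > 1e-4f ||
        fabsf(pos_c[n].y - pos_g[n].y) > 1e-4f ||
        fabsf(pos_c[n].z - pos_g[n].z) > 1e-4f){
        mismatches++;
    }
}
printf("Mismatched particles: %i of %i\n", mismatches, np);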
For `CASE 0` (regular vector struct) I get:
CPUtime = 1302.0 ms
GPUtime = 21.8 ms
Speedup = 59.79
For `CASE 1` (`__align__(16)` vector struct) I get:
CPUtime = 1298.0 ms
GPUtime = 24.5 ms
Speedup = 53.08
For `CASE 2` (using `float3`) I get:
CPUtime = 1305.0 ms
GPUtime = 21.8 ms
Speedup = 59.80
If I use `float4` instead of `float3`, I get results similar to the `__align__(16)` method.
Thanks!!