CUDA structure alignment is slowing down my code

Posted 2019-01-29 11:52

Question:

I have a simulation that calculates the 3D vectors of charged particles moving in an electric and magnetic field. I attempted to speed this up in CUDA using the __align__ specifier, thinking that perhaps the limiting factor was global memory reads and writes, but using __align__ ended up slowing it down (maybe because it increased the total memory requirement). I also tried using float3 and float4, but their performance was similar.

I've created a simplified version of this code and pasted it below to show my problem. The code below should compile as-is; changing the definition of CASE on its fourth line to 0, 1, or 2 selects the different options described above. Two functions, ParticleMoverCPU and ParticleMoverGPU, are defined to compare CPU and GPU performance.

  1. Is there a reason that my attempts at memory coalescence are slowing down rather than speeding up my code?
  2. Is there anything else immediately obvious that I am not doing that I could do to get better than a 60x speedup for an "embarrassingly parallel" code such as this?

Thank you!

CPU - Intel Xeon E5620 @2.40GHz

GPU - NVIDIA Tesla C2070

// CASE 0: Regular struct with 3 floats
// CASE 1: Aligned struct using __align__(16) with 3 floats
// CASE 2: float3
#define CASE        0   // define to either 0, 1 or 2 as described above

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <Windows.h>

#include <stdio.h>
#include <math.h>
#include <time.h>
#include <malloc.h>
#include <sys/stat.h>

#define CEX         10  // x-value of electric field (dimensionless and arbitrary)
#define CEY         0.1 // y-value of electric field (dimensionless and arbitrary)
#define CEZ         0.1 // z-value of electric field (dimensionless and arbitrary)
#define CBX         0.1 // x-value of magnetic field (dimensionless and arbitrary)
#define CBY         0.1 // y-value of magnetic field (dimensionless and arbitrary)
#define CBZ         10  // z-value of magnetic field (dimensionless and arbitrary)

#define FACTOR      15  // I played around with these numbers until I got the best speedup
#define THREADS     256 // I played around with these numbers until I got the best speedup

typedef struct{
    float x;
    float y;
    float z;
} VecCPU;           //Struct for vectors for CPU calculation

// Fastest method seems to be a regular unaligned struct with 3 floats
#if CASE==0
typedef struct {
    float x;
    float y;
    float z;
} VecGPU;
#endif

#if CASE==1
// This method seems to be slower.  It is an attempt to align for memory coalescence
typedef struct __align__(16){
    float x;
    float y;
    float z;
} VecGPU;
#endif

// Using float3 seems to be about the same as defining our own vector3 structure
#if CASE==2
typedef float3 VecGPU;
#endif

VecCPU *pos_c, *vel_c;                  // global position and velocity vectors for CPU calculation
__constant__ VecGPU *pos_d, *vel_d;     // pointers in constant memory which will point to data in global memory

void ParticleMoverCPU(int np, int ts, float dt){

    int n = 0;
    while (n < np){

        VecCPU vminus, tvec, vprime, vplus;
        float tvec_fact;
        int it = 0;
        while (it < ts){
            // ----- Update velocities by the Boris method ------ //
            vminus.x = vel_c[n].x + CEX*0.5*dt;
            vminus.y = vel_c[n].y + CEY*0.5*dt;
            vminus.z = vel_c[n].z + CEZ*0.5*dt;
            tvec.x = CBX*0.5*dt;
            tvec.y = CBY*0.5*dt;
            tvec.z = CBZ*0.5*dt;
            tvec_fact = 2 / (1 + tvec.x*tvec.x + tvec.y*tvec.y + tvec.z*tvec.z);
            vprime.x = vminus.x + vminus.y*tvec.z - vminus.z*tvec.y;
            vprime.y = vminus.y + vminus.z*tvec.x - vminus.x*tvec.z;
            vprime.z = vminus.z + vminus.x*tvec.y - vminus.y*tvec.x;
            vplus.x = vminus.x + (vprime.y*tvec.z - vprime.z*tvec.y)*tvec_fact;
            vplus.y = vminus.y + (vprime.z*tvec.x - vprime.x*tvec.z)*tvec_fact;
            vplus.z = vminus.z + (vprime.x*tvec.y - vprime.y*tvec.x)*tvec_fact;
            vel_c[n].x = vplus.x + CEX*0.5*dt;
            vel_c[n].y = vplus.y + CEY*0.5*dt;
            vel_c[n].z = vplus.z + CEZ*0.5*dt;

            // ------ Update Particle positions -------------- //
            pos_c[n].x += vel_c[n].x*dt;
            pos_c[n].y += vel_c[n].y*dt;
            pos_c[n].z += vel_c[n].z*dt;
            it++;
        }
        n++;
    }
}

__global__ void ParticleMoverGPU(register int np,register int ts, register float dt){

    register int n = threadIdx.x + blockDim.x * blockIdx.x;
    while (n < np){

        register VecGPU vminus, tvec, vprime, vplus;// , vtemp;
        register float tvec_fact;
        register int it = 0;
        while (it < ts){
            // ----- Update velocities by the Boris method ------ //
            vminus.x = vel_d[n].x + CEX*0.5*dt;
            vminus.y = vel_d[n].y + CEY*0.5*dt;
            vminus.z = vel_d[n].z + CEZ*0.5*dt;
            tvec.x = CBX*0.5*dt;
            tvec.y = CBY*0.5*dt;
            tvec.z = CBZ*0.5*dt;
            tvec_fact = 2 / (1 + tvec.x*tvec.x + tvec.y*tvec.y + tvec.z*tvec.z);
            vprime.x = vminus.x + vminus.y*tvec.z - vminus.z*tvec.y;
            vprime.y = vminus.y + vminus.z*tvec.x - vminus.x*tvec.z;
            vprime.z = vminus.z + vminus.x*tvec.y - vminus.y*tvec.x;
            vplus.x = vminus.x + (vprime.y*tvec.z - vprime.z*tvec.y)*tvec_fact;
            vplus.y = vminus.y + (vprime.z*tvec.x - vprime.x*tvec.z)*tvec_fact;
            vplus.z = vminus.z + (vprime.x*tvec.y - vprime.y*tvec.x)*tvec_fact;
            vel_d[n].x = vplus.x + CEX*0.5*dt;
            vel_d[n].y = vplus.y + CEY*0.5*dt;
            vel_d[n].z = vplus.z + CEZ*0.5*dt;
            // ------ Update Particle positions -------------- //
            pos_d[n].x += vel_d[n].x*dt;
            pos_d[n].y += vel_d[n].y*dt;
            pos_d[n].z += vel_d[n].z*dt;
            it++;
        }
        n += blockDim.x*gridDim.x;
    }
}

int main(void){

    int np = 50000;                                         // Number of Particles
    const int ts = 1000;                                    // Number of Time-steps
    const float dt = 1E-3;                                  // Time-step value


    // ----------- CPU ----------- //

    pos_c = (VecCPU*)malloc(sizeof(VecCPU)*np);             // allocate memory for position
    vel_c = (VecCPU*)malloc(sizeof(VecCPU)*np);             // allocate memory for velocity

    for (int n = 0; n < np; n++){
        pos_c[n].x = 0; pos_c[n].y = 0; pos_c[n].z = 0;     // zero out position for CPU variables
        vel_c[n].x = 0; vel_c[n].y = 0; vel_c[n].z = 0;     // zero out velocity for CPU variables
    }

    printf("Starting CPU kernel\n");
    clock_t startCPU;
    float CPUtime;
    startCPU = clock();
    ParticleMoverCPU(np, ts, dt);                           // Launch CPU kernel
    CPUtime = ((float)(clock() - startCPU)) / CLOCKS_PER_SEC;
    printf("CPU kernel finished\n");
    // Output final CPU computation time
    printf("CPUtime = %6.1f ms\n", ((float)CPUtime)*1E3);

    // ------------ GPU ----------- //

    cudaFuncSetCacheConfig(ParticleMoverGPU, cudaFuncCachePreferL1);    //Set memory preference to L1 (doesn't have much effect)
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, 0);
    int blocks = deviceProp.multiProcessorCount;

    VecGPU *pos_g, *vel_g, *pos_l, *vel_l;

    pos_g = (VecGPU*)malloc(sizeof(VecGPU)*np);         // allocate memory for positions on the CPU
    vel_g = (VecGPU*)malloc(sizeof(VecGPU)*np);         // allocate memory for velocities on the CPU

    cudaMalloc((void**)&pos_l, sizeof(VecGPU)*np);      // allocate memory for positions on the GPU
    cudaMalloc((void**)&vel_l, sizeof(VecGPU)*np);      // allocate memory for velocities on the GPU

    cudaMemcpyToSymbol(pos_d, &pos_l, sizeof(void*));   // copy memory address of position to the constant memory pointer pos_d
    cudaMemcpyToSymbol(vel_d, &vel_l, sizeof(void*));   // copy memory address of velocity to the constant memory pointer vel_d

    for (int n = 0; n < np; n++){
        pos_g[n].x = 0; pos_g[n].y = 0; pos_g[n].z = 0; // zero out position for GPU variables (before copying to GPU)
        vel_g[n].x = 0; vel_g[n].y = 0; vel_g[n].z = 0; // zero out velocity for GPU variables (before copying to GPU)
    }

    cudaMemcpy(pos_l, pos_g, sizeof(VecGPU)*np, cudaMemcpyHostToDevice);    // Copy positions to GPU global memory
    cudaMemcpy(vel_l, vel_g, sizeof(VecGPU)*np, cudaMemcpyHostToDevice);    // Copy velocities to GPU global memory

    printf("Starting GPU kernel\n");
    // start cuda timer
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);

    ParticleMoverGPU <<<blocks*FACTOR, THREADS >>>(np, ts, dt);             // Launch GPU kernel

    //stop cuda timer
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    float elapsedTime;
    cudaEventElapsedTime(&elapsedTime, start, stop);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    printf("GPU kernel finished\n");

    cudaMemcpy(pos_g, pos_l, sizeof(VecGPU)*np, cudaMemcpyDeviceToHost);    // Copy positions from GPU memory back to CPU
    cudaMemcpy(vel_g, vel_l, sizeof(VecGPU)*np, cudaMemcpyDeviceToHost);    // Copy velocities from GPU memory back to CPU

    // Output GPU computation time
    printf("GPUtime = %6.1f ms\n", elapsedTime);

    // Output speedup factor
    printf("CASE=%i, Speedup = %4.2f\n",CASE, CPUtime*1E3 / elapsedTime);

    // free allocated memory
    cudaFree(pos_l);
    cudaFree(vel_l);
    free(pos_g);
    free(vel_g);
    free(pos_c);
    free(vel_c);
}

For CASE 0 (regular vector struct) I get:

CPUtime = 1302.0 ms
GPUtime =   21.8 ms
Speedup = 59.79

For CASE 1 (__align__(16) vector struct) I get:

CPUtime = 1298.0 ms
GPUtime =   24.5 ms
Speedup = 53.08

For CASE 2 (using float3) I get:

CPUtime = 1305.0 ms
GPUtime =   21.8 ms
Speedup = 59.80

If I use float4 instead of float3 I get something similar to the __align__(16) method.
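For reference, the per-element footprints explain that grouping: float4 and the __align__(16) struct both occupy 16 bytes, while the plain struct and float3 occupy 12. A quick host-side check (a minimal sketch, compiled with nvcc; VecPlain and VecAligned simply mirror the CASE 0 and CASE 1 structs above):

#include <cstdio>
#include <cuda_runtime.h>   // float3, float4, __align__

typedef struct { float x, y, z; } VecPlain;                   // CASE 0 layout
typedef struct __align__(16) { float x, y, z; } VecAligned;   // CASE 1 layout

int main(void){
    // per-element footprint each layout pays in global memory
    printf("plain struct : %d bytes\n", (int)sizeof(VecPlain));    // 12
    printf("__align__(16): %d bytes\n", (int)sizeof(VecAligned));  // 16 (12 + 4 padding)
    printf("float3       : %d bytes\n", (int)sizeof(float3));      // 12
    printf("float4       : %d bytes\n", (int)sizeof(float4));      // 16
    return 0;
}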

Thanks!!

Answer 1:

  1. The pointers in __constant__ memory are a waste of your time. I'm not sure why you would jump through all those hoops.
  2. Dropping register everywhere is also a waste of your time. You're not smarter than the compiler; it already uses registers wherever possible.
  3. In case you're not already, you should use proper CUDA error checking. That's just a boilerplate statement I make; I don't think there are any API-level errors in this code. (A minimal sketch of what I mean appears after this list.)
  4. It's not clear you understand what "coalescence" is. Alignment of data only tangentially affects the ability of memory transactions to coalesce. What matters more are the actual addresses generated by adjacent threads in the warp for a given memory transaction -- do they refer to adjacent memory locations? If so, things are probably coalescing nicely. If not, probably not. So you have a data structure which "naturally" occupies 12 bytes, and in one case (the slower one) you are telling it to occupy 16 bytes instead. What does this do exactly? To answer that we have to look at a given transaction:

        vminus.x = vel_d[n].x + CEX*0.5*dt;
    

    The above transaction is requesting the x-component of the vel_d vector. In the "non-align" case, that data will be stored like this, and the above transaction will "ask" for the starred quantities (32 per warp):

    mem idx: 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 ...
    vel_d:  x0 y0 z0 x1 y1 z1 x2 y2 z2 x3 y3 z3 x4 y4 z4 x5 y5 z5 ...
             *        *        *        *        *        *       ...
    

    In the "align" case, the above pattern would look like:

    mem idx: 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17    ...
    vel_d:  x0 y0 z0 ?? x1 y1 z1 ?? x2 y2 z2 ?? x3 y3 z3 ?? x4 y4 z4 ...
             *           *           *           *           *       ...
    

    So we see that when you specify the align directive, the packing is less dense, and a given 128-byte cacheline supplies fewer of the necessary items for the given transaction. Therefore more cachelines must be retrieved from global memory to satisfy this one read request in the align case. This likely accounts for the ~10-20% difference you are seeing.

  5. But we can do better than the above. You have a classic AoS (array of structures) data storage scheme, and that is canonically bad for GPU programming. A standard performance enhancement is to convert from AoS to SoA storage. This means breaking the x, y, z components of your pos and vel vectors out into separate arrays and then accessing those. (Alternatively, since you are processing all components in a single thread, you could try doing a vector load; that is a separate discussion, but a sketch appears after this list.) The desired storage and load pattern then becomes:

    mem idx:  0  1  2  3  4  5  6  7  8  9  ...
    vel_d_x: x0 x1 x2 x3 x4 x5 x6 x7 x8 x9  ...
              *  *  *  *  *  *  *  *  *  *  ...
    

    and the code might look like this:

        vminus.x = vel_d_x[n] + CEX*0.5*dt;
        vminus.y = vel_d_y[n] + CEY*0.5*dt;
        vminus.z = vel_d_z[n] + CEZ*0.5*dt;
    
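Regarding point 3, one common form of that boilerplate is a macro wrapped around each API call and kernel launch; a minimal sketch (the macro name and message text are arbitrary):

// requires <cstdio> and <cstdlib>
#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), __FILE__, __LINE__); \
            exit(1); \
        } \
    } while (0)

// usage, e.g. immediately after the kernel launch:
//   ParticleMoverGPU<<<blocks*FACTOR, THREADS>>>(np, ts, dt);
//   cudaDeviceSynchronize();
//   cudaCheckErrors("kernel launch or execution failed");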

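And regarding the vector-load alternative mentioned in point 5: if each particle's data were stored 16-byte aligned, e.g. as a float4 with the .w component as unused padding, each thread could fetch and store its whole element with single 128-bit transactions and keep it in registers across the time-step loop. A minimal sketch under that assumption (vel4_d and pos4_d are hypothetical float4 arrays, not anything in the code above; the field constants are the CEX/CEY/CEZ/CBX/CBY/CBZ macros from the question):

__global__ void ParticleMoverGPU_vec(float4 *vel4_d, float4 *pos4_d, int np, int ts, float dt){

    int n = threadIdx.x + blockDim.x * blockIdx.x;
    while (n < np){
        float4 v = vel4_d[n];   // one 16-byte vectorized load per thread
        float4 p = pos4_d[n];
        float3 tvec;            // tvec and tvec_fact are loop-invariant
        tvec.x = CBX*0.5f*dt;
        tvec.y = CBY*0.5f*dt;
        tvec.z = CBZ*0.5f*dt;
        float tvec_fact = 2.0f / (1.0f + tvec.x*tvec.x + tvec.y*tvec.y + tvec.z*tvec.z);
        for (int it = 0; it < ts; it++){
            // ----- Boris velocity update, unchanged except v stays in registers ----- //
            float3 vminus, vprime, vplus;
            vminus.x = v.x + CEX*0.5f*dt;
            vminus.y = v.y + CEY*0.5f*dt;
            vminus.z = v.z + CEZ*0.5f*dt;
            vprime.x = vminus.x + vminus.y*tvec.z - vminus.z*tvec.y;
            vprime.y = vminus.y + vminus.z*tvec.x - vminus.x*tvec.z;
            vprime.z = vminus.z + vminus.x*tvec.y - vminus.y*tvec.x;
            vplus.x = vminus.x + (vprime.y*tvec.z - vprime.z*tvec.y)*tvec_fact;
            vplus.y = vminus.y + (vprime.z*tvec.x - vprime.x*tvec.z)*tvec_fact;
            vplus.z = vminus.z + (vprime.x*tvec.y - vprime.y*tvec.x)*tvec_fact;
            v.x = vplus.x + CEX*0.5f*dt;
            v.y = vplus.y + CEY*0.5f*dt;
            v.z = vplus.z + CEZ*0.5f*dt;
            // ----- position update ----- //
            p.x += v.x*dt; p.y += v.y*dt; p.z += v.z*dt;
        }
        vel4_d[n] = v;          // one 16-byte vectorized store per thread
        pos4_d[n] = p;
        n += blockDim.x*gridDim.x;
    }
}

Deferring the global stores to the end is valid here because each thread exclusively owns index n. Whether this beats the SoA version below would have to be measured; it trades a one-third larger footprint (16 vs. 12 bytes per particle) for fully coalesced 128-bit accesses.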
The following code implements some of the above, including the AoS -> SoA transformation for the GPU side, and should be faster than any of your cases.

$ cat t895.cu
// CASE 0: Regular struct with 3 floats
// CASE 1: Aligned struct using __align__(16) with 3 floats
// CASE 2: float3
#define CASE        0   // define to either 0, 1 or 2 as described above

#include <stdio.h>
#include <math.h>
#include <time.h>
#include <malloc.h>
#include <sys/stat.h>

#define CEX         10  // x-value of electric field (dimensionless and arbitrary)
#define CEY         0.1 // y-value of electric field (dimensionless and arbitrary)
#define CEZ         0.1 // z-value of electric field (dimensionless and arbitrary)
#define CBX         0.1 // x-value of magnetic field (dimensionless and arbitrary)
#define CBY         0.1 // y-value of magnetic field (dimensionless and arbitrary)
#define CBZ         10  // z-value of magnetic field (dimensionless and arbitrary)

#define FACTOR      15  // I played around with these numbers until I got the best speedup
#define THREADS     256 // I played around with these numbers until I got the best speedup

typedef struct{
    float x;
    float y;
    float z;
} VecCPU;           //Struct for vectors for CPU calculation

// Fastest method seems to be a regular unaligned struct with 3 floats
#if CASE==0
typedef struct {
    float x;
    float y;
    float z;
} VecGPU;
#endif

#if CASE==1
// This method seems to be slower.  It is an attempt to align for memory coalescence
typedef struct __align__(16){
    float x;
    float y;
    float z;
} VecGPU;
#endif

// Using float3 seems to be about the same as defining our own vector3 structure
#if CASE==2
typedef float3 VecGPU;
#endif

VecCPU *pos_c, *vel_c;                  // global position and velocity vectors for CPU calculation

void ParticleMoverCPU(int np, int ts, float dt){

    int n = 0;
    while (n < np){

        VecCPU vminus, tvec, vprime, vplus;
        float tvec_fact;
        int it = 0;
        while (it < ts){
            // ----- Update velocities by the Boris method ------ //
            vminus.x = vel_c[n].x + CEX*0.5*dt;
            vminus.y = vel_c[n].y + CEY*0.5*dt;
            vminus.z = vel_c[n].z + CEZ*0.5*dt;
            tvec.x = CBX*0.5*dt;
            tvec.y = CBY*0.5*dt;
            tvec.z = CBZ*0.5*dt;
            tvec_fact = 2 / (1 + tvec.x*tvec.x + tvec.y*tvec.y + tvec.z*tvec.z);
            vprime.x = vminus.x + vminus.y*tvec.z - vminus.z*tvec.y;
            vprime.y = vminus.y + vminus.z*tvec.x - vminus.x*tvec.z;
            vprime.z = vminus.z + vminus.x*tvec.y - vminus.y*tvec.x;
            vplus.x = vminus.x + (vprime.y*tvec.z - vprime.z*tvec.y)*tvec_fact;
            vplus.y = vminus.y + (vprime.z*tvec.x - vprime.x*tvec.z)*tvec_fact;
            vplus.z = vminus.z + (vprime.x*tvec.y - vprime.y*tvec.x)*tvec_fact;
            vel_c[n].x = vplus.x + CEX*0.5*dt;
            vel_c[n].y = vplus.y + CEY*0.5*dt;
            vel_c[n].z = vplus.z + CEZ*0.5*dt;

            // ------ Update Particle positions -------------- //
            pos_c[n].x += vel_c[n].x*dt;
            pos_c[n].y += vel_c[n].y*dt;
            pos_c[n].z += vel_c[n].z*dt;
            it++;
        }
        n++;
    }
}

__global__ void ParticleMoverGPU(float *vel_d_x, float *vel_d_y, float *vel_d_z, float *pos_d_x, float *pos_d_y, float *pos_d_z, int np,int ts, float dt){

    int n = threadIdx.x + blockDim.x * blockIdx.x;
    while (n < np){

        VecGPU vminus, tvec, vprime, vplus;// , vtemp;
        register float tvec_fact;
        register int it = 0;
        while (it < ts){
            // ----- Update velocities by the Boris method ------ //
            vminus.x = vel_d_x[n] + CEX*0.5*dt;
            vminus.y = vel_d_y[n] + CEY*0.5*dt;
            vminus.z = vel_d_z[n] + CEZ*0.5*dt;
            tvec.x = CBX*0.5*dt;
            tvec.y = CBY*0.5*dt;
            tvec.z = CBZ*0.5*dt;
            tvec_fact = 2 / (1 + tvec.x*tvec.x + tvec.y*tvec.y + tvec.z*tvec.z);
            vprime.x = vminus.x + vminus.y*tvec.z - vminus.z*tvec.y;
            vprime.y = vminus.y + vminus.z*tvec.x - vminus.x*tvec.z;
            vprime.z = vminus.z + vminus.x*tvec.y - vminus.y*tvec.x;
            vplus.x = vminus.x + (vprime.y*tvec.z - vprime.z*tvec.y)*tvec_fact;
            vplus.y = vminus.y + (vprime.z*tvec.x - vprime.x*tvec.z)*tvec_fact;
            vplus.z = vminus.z + (vprime.x*tvec.y - vprime.y*tvec.x)*tvec_fact;
            vel_d_x[n] = vplus.x + CEX*0.5*dt;
            vel_d_y[n] = vplus.y + CEY*0.5*dt;
            vel_d_z[n] = vplus.z + CEZ*0.5*dt;
            // ------ Update Particle positions -------------- //
            pos_d_x[n] += vel_d_x[n]*dt;
            pos_d_y[n] += vel_d_y[n]*dt;
            pos_d_z[n] += vel_d_z[n]*dt;
            it++;
        }
        n += blockDim.x*gridDim.x;
    }
}

int main(void){

    int np = 50000;                                         // Number of Particles
    const int ts = 1000;                                    // Number of Time-steps
    const float dt = 1E-3;                                  // Time-step value


    // ----------- CPU ----------- //

    pos_c = (VecCPU*)malloc(sizeof(VecCPU)*np);             // allocate memory for position
    vel_c = (VecCPU*)malloc(sizeof(VecCPU)*np);             // allocate memory for velocity

    for (int n = 0; n < np; n++){
        pos_c[n].x = 0; pos_c[n].y = 0; pos_c[n].z = 0;     // zero out position for CPU variables
        vel_c[n].x = 0; vel_c[n].y = 0; vel_c[n].z = 0;     // zero out velocity for CPU variables
    }

    printf("Starting CPU kernel\n");
    clock_t startCPU;
    float CPUtime;
    startCPU = clock();
    ParticleMoverCPU(np, ts, dt);                           // Launch CPU kernel
    CPUtime = ((float)(clock() - startCPU)) / CLOCKS_PER_SEC;
    printf("CPU kernel finished\n");
    // Output final CPU computation time
    printf("CPUtime = %6.1f ms\n", ((float)CPUtime)*1E3);

    // ------------ GPU ----------- //

    cudaFuncSetCacheConfig(ParticleMoverGPU, cudaFuncCachePreferL1);    //Set memory preference to L1 (doesn't have much effect)
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, 0);
    int blocks = deviceProp.multiProcessorCount;

    float *pos_g_x, *pos_g_y, *pos_g_z, *vel_g_x, *vel_g_y, *vel_g_z, *pos_l_x, *pos_l_y, *pos_l_z, *vel_l_x, *vel_l_y, *vel_l_z;

    pos_g_x = (float*)malloc(sizeof(float)*np);         // allocate memory for positions on the CPU
    vel_g_x = (float*)malloc(sizeof(float)*np);         // allocate memory for velocities on the CPU
    pos_g_y = (float*)malloc(sizeof(float)*np);         // allocate memory for positions on the CPU
    vel_g_y = (float*)malloc(sizeof(float)*np);         // allocate memory for velocities on the CPU
    pos_g_z = (float*)malloc(sizeof(float)*np);         // allocate memory for positions on the CPU
    vel_g_z = (float*)malloc(sizeof(float)*np);         // allocate memory for velocities on the CPU

    cudaMalloc((void**)&pos_l_x, sizeof(float)*np);      // allocate memory for positions on the GPU
    cudaMalloc((void**)&vel_l_x, sizeof(float)*np);      // allocate memory for velocities on the GPU
    cudaMalloc((void**)&pos_l_y, sizeof(float)*np);      // allocate memory for positions on the GPU
    cudaMalloc((void**)&vel_l_y, sizeof(float)*np);      // allocate memory for velocities on the GPU
    cudaMalloc((void**)&pos_l_z, sizeof(float)*np);      // allocate memory for positions on the GPU
    cudaMalloc((void**)&vel_l_z, sizeof(float)*np);      // allocate memory for velocities on the GPU

    for (int n = 0; n < np; n++){
        pos_g_x[n] = 0; pos_g_y[n] = 0; pos_g_z[n] = 0; // zero out position for GPU variables (before copying to GPU)
        vel_g_x[n] = 0; vel_g_y[n] = 0; vel_g_z[n] = 0; // zero out velocity for GPU variables (before copying to GPU)
    }

    cudaMemcpy(pos_l_x, pos_g_x, sizeof(float)*np, cudaMemcpyHostToDevice);    // Copy positions to GPU global memory
    cudaMemcpy(vel_l_x, vel_g_x, sizeof(float)*np, cudaMemcpyHostToDevice);    // Copy velocities to GPU global memory
    cudaMemcpy(pos_l_y, pos_g_y, sizeof(float)*np, cudaMemcpyHostToDevice);    // Copy positions to GPU global memory
    cudaMemcpy(vel_l_y, vel_g_y, sizeof(float)*np, cudaMemcpyHostToDevice);    // Copy velocities to GPU global memory
    cudaMemcpy(pos_l_z, pos_g_z, sizeof(float)*np, cudaMemcpyHostToDevice);    // Copy positions to GPU global memory
    cudaMemcpy(vel_l_z, vel_g_z, sizeof(float)*np, cudaMemcpyHostToDevice);    // Copy velocities to GPU global memory

    printf("Starting GPU kernel\n");
    // start cuda timer
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);

    ParticleMoverGPU <<<blocks*FACTOR, THREADS >>>(vel_l_x, vel_l_y, vel_l_z, pos_l_x, pos_l_y, pos_l_z, np, ts, dt);             // Launch GPU kernel

    //stop cuda timer
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    float elapsedTime;
    cudaEventElapsedTime(&elapsedTime, start, stop);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    printf("GPU kernel finished\n");


    // Output GPU computation time
    printf("GPUtime = %6.1f ms\n", elapsedTime);

    // Output speedup factor
    printf("CASE=%i, Speedup = %4.2f\n",CASE, CPUtime*1E3 / elapsedTime);

}

$ nvcc -O3 -o t895 t895.cu
$ ./t895
Starting CPU kernel
CPU kernel finished
CPUtime =  923.6 ms
Starting GPU kernel
GPU kernel finished
GPUtime =   12.3 ms
CASE=0, Speedup = 74.95
$