I am using the following two functions to time different parts (cudaMemcpyHtoD, kernel execution, cudaMemcpyDtoH) of my code (which includes multi-gpus, concurrent kernels on same GPU, sequential execution of kernels, et al). As I understand, these functions would record the time elapsed between the events, but I guess inserting events along the lifetime of the code may result in overheads and inaccuracies. I would like to hear criticisms, general advice to improve these functions and caveat emptors regarding them.
//Create event and start recording
cudaEvent_t *start_event(int device, cudaEvent_t *events, cudaStream_t streamid=0)
{
cutilSafeCall( cudaSetDevice(device) );
cutilSafeCall( cudaEventCreate(&events[0]) );
cutilSafeCall( cudaEventCreate(&events[1]) );
cudaEventRecord(events[0], streamid);
return events;
}
//Return elapsed time and destroy events
float end_event(int device, cudaEvent_t *events, cudaStream_t streamid=0)
{
float elapsed = 0.0;
cutilSafeCall( cudaSetDevice(device) );
cutilSafeCall( cudaEventRecord(events[1], streamid) );
cutilSafeCall( cudaEventSynchronize(events[1]) );
cutilSafeCall( cudaEventElapsedTime(&elapsed, events[0], events[1]) );
cutilSafeCall( cudaEventDestroy( events[0] ) );
cutilSafeCall( cudaEventDestroy( events[1] ) );
return elapsed;
}
Usage:
cudaEvent_t *events;
cudaEvent_t event[2]; //0 for start and 1 for end
...
events = start_event( cuda_device, event, 0 );
<Code to time>
printf("Time taken for the above code... - %f secs\n\n", (end_event(cuda_device, events, 0) / 1000) );