I would like to overlap a thrust::sort_by_key operation with a host-to-device copy. Despite taking a cudaStream_t as an argument, my experiments suggest that thrust::sort_by_key is a blocking operation. Below is a full code example in which I first measure the time to copy the data (from pinned memory), then measure the time to do the sort_by_key, and finally try to overlap the two operations. I would expect to see the copy time hidden by the sort_by_key operation. Instead, I find that the overlapped operations take more than the sum of the two standalone operations.
Can anyone see a problem with the code? Or am I misunderstanding Thrust's support for CUDA streams?
#include <cuda_runtime.h>
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <random>
#include <iostream>
#include <sys/time.h>

int main() {
    // size of arrays
    const int n = 300000000;
    // random number generator
    std::mt19937 rng;
    // key/val on host
    uint32_t * key = new uint32_t[n];
    uint32_t * val = new uint32_t[n];
    // fill key/val
    for(int i = 0; i < n; i++) {
        key[i] = rng();
        val[i] = i;
    }
    // key/val on device
    uint32_t * dev_key;
    uint32_t * dev_val;
    // allocate memory on GPU for key/val
    cudaMalloc((void**)&dev_key, n*sizeof(uint32_t));
    cudaMalloc((void**)&dev_val, n*sizeof(uint32_t));
    // copy key/val onto the device
    cudaMemcpy(dev_key, key, n*sizeof(uint32_t), cudaMemcpyHostToDevice);
    cudaMemcpy(dev_val, val, n*sizeof(uint32_t), cudaMemcpyHostToDevice);
    // get thrust device pointers to key/val on device
    thrust::device_ptr<uint32_t> dev_key_ptr = thrust::device_pointer_cast(dev_key);
    thrust::device_ptr<uint32_t> dev_val_ptr = thrust::device_pointer_cast(dev_val);
    // data on host
    uint32_t * data;
    // allocate pinned memory for data on host
    cudaMallocHost((void**)&data, n*sizeof(uint32_t));
    // fill data with random numbers
    for(int i = 0; i < n; i++) {
        data[i] = rng();
    }
    // data on device
    uint32_t * dev_data;
    // allocate memory for data on the device
    cudaMalloc((void**)&dev_data, n*sizeof(uint32_t));
    // for timing
    struct timeval t1, t2;
    // two streams
    cudaStream_t stream1;
    cudaStream_t stream2;
    // create streams
    cudaStreamCreate(&stream1);
    cudaStreamCreate(&stream2);
    for(int i = 0; i < 10; i++) {
        // Copy data into dev_data on stream 1 (nothing happening on stream 2 for now)
        gettimeofday(&t1, NULL);
        cudaMemcpyAsync(dev_data, data, n*sizeof(uint32_t), cudaMemcpyHostToDevice, stream1);
        cudaDeviceSynchronize();
        gettimeofday(&t2, NULL);
        double t_copy = double(t2.tv_sec-t1.tv_sec)*1000.0 + double(t2.tv_usec-t1.tv_usec)/1000.0;
        // Sort_by_key on stream 2 (nothing happening on stream 1 for now)
        gettimeofday(&t1, NULL);
        thrust::sort_by_key(thrust::cuda::par.on(stream2), dev_key_ptr, dev_key_ptr + n, dev_val_ptr);
        cudaDeviceSynchronize();
        gettimeofday(&t2, NULL);
        double t_sort = double(t2.tv_sec-t1.tv_sec)*1000.0 + double(t2.tv_usec-t1.tv_usec)/1000.0;
        // Overlap both operations
        gettimeofday(&t1, NULL);
        thrust::sort_by_key(thrust::cuda::par.on(stream2), dev_key_ptr, dev_key_ptr + n, dev_val_ptr);
        cudaMemcpyAsync(dev_data, data, n*sizeof(uint32_t), cudaMemcpyHostToDevice, stream1);
        cudaDeviceSynchronize();
        gettimeofday(&t2, NULL);
        double t_both = double(t2.tv_sec-t1.tv_sec)*1000.0 + double(t2.tv_usec-t1.tv_usec)/1000.0;
        std::cout << "t_copy: " << t_copy << ", t_sort: " << t_sort << ", t_both: " << t_both << std::endl;
    }
    // clean up
    cudaStreamDestroy(stream1);
    cudaStreamDestroy(stream2);
    cudaFreeHost(data);
    cudaFree(dev_data);
    cudaFree(dev_key);
    cudaFree(dev_val);
    delete [] key;
    delete [] val;
}
Here are the results obtained when running on a GTX 1080 Ti and compiling with the CUDA toolkit (V9.0.176):
t_copy: 99.972, t_sort: 215.597, t_both: 393.861
t_copy: 100.769, t_sort: 225.234, t_both: 394.839
t_copy: 100.489, t_sort: 221.44, t_both: 397.042
t_copy: 100.047, t_sort: 214.231, t_both: 403.371
t_copy: 100.167, t_sort: 222.031, t_both: 393.143
t_copy: 100.255, t_sort: 209.191, t_both: 374.633
t_copy: 100.179, t_sort: 208.452, t_both: 374.122
t_copy: 100.038, t_sort: 208.39, t_both: 375.454
t_copy: 100.072, t_sort: 208.468, t_both: 376.02
t_copy: 100.069, t_sort: 208.426, t_both: 377.759
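For the first iteration above, the two standalone operations sum to roughly 100 + 216 = 316 ms, yet the combined run takes about 394 ms, so the "overlapped" version is slower than simply running the two operations back to back.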
Furthermore, profiling using nvprof reveals that all operations are being carried out in two separate, non-default streams.
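For reference, this per-operation stream information can be obtained with something like the following (the executable name is just a placeholder):

nvprof --print-gpu-trace ./a.out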
I would be extremely grateful if anyone can reproduce this, or suggest a fix.
Thrust sort operations do a memory allocation "under the hood". This should be discoverable using

nvprof --print-api-trace ...

- you should see a cudaMalloc operation associated with each sort. This device memory allocation is synchronizing and may prevent the expected overlap. If you want to work around this, you could explore using a Thrust custom allocator. Here is a worked example, borrowing heavily from the above link:
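A minimal sketch of that idea, closely following Thrust's custom_temporary_allocation example (the cached_allocator below and the toy main are illustrative, not necessarily the exact worked example; requires -std=c++11):

#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <thrust/system/cuda/execution_policy.h>
#include <thrust/system/cuda/memory.h>
#include <cuda_runtime.h>
#include <map>
#include <cstddef>
#include <cstdint>

// Caching allocator handed to thrust: temporary buffers are kept in a pool
// instead of being freed, so after a warm-up sort no cudaMalloc is issued
// inside the timed/overlapped region.
struct cached_allocator {
    typedef char value_type;

    std::multimap<std::ptrdiff_t, char *> free_blocks;   // size -> cached block
    std::map<char *, std::ptrdiff_t> allocated_blocks;   // live block -> size

    char *allocate(std::ptrdiff_t num_bytes) {
        char *result = 0;
        auto hit = free_blocks.find(num_bytes);
        if (hit != free_blocks.end()) {
            // reuse a cached block of exactly this size
            result = hit->second;
            free_blocks.erase(hit);
        } else {
            // cache miss: this is the only place a device allocation happens
            result = thrust::cuda::malloc<char>(num_bytes).get();
        }
        allocated_blocks.insert(std::make_pair(result, num_bytes));
        return result;
    }

    void deallocate(char *ptr, size_t) {
        // return the block to the pool instead of freeing it
        auto iter = allocated_blocks.find(ptr);
        std::ptrdiff_t num_bytes = iter->second;
        allocated_blocks.erase(iter);
        free_blocks.insert(std::make_pair(num_bytes, ptr));
    }

    ~cached_allocator() {
        for (auto &b : free_blocks)      thrust::cuda::free(thrust::cuda::pointer<char>(b.second));
        for (auto &b : allocated_blocks) thrust::cuda::free(thrust::cuda::pointer<char>(b.first));
    }
};

int main() {
    const int n = 1 << 20;
    thrust::device_vector<uint32_t> key(n), val(n);

    cudaStream_t stream;
    cudaStreamCreate(&stream);

    cached_allocator alloc;

    // warm-up sort: populates the allocator's pool (one cudaMalloc)
    thrust::sort_by_key(thrust::cuda::par(alloc).on(stream),
                        key.begin(), key.end(), val.begin());

    // subsequent sorts reuse the pooled buffer, issue no cudaMalloc, and can
    // therefore overlap with cudaMemcpyAsync traffic on another stream
    thrust::sort_by_key(thrust::cuda::par(alloc).on(stream),
                        key.begin(), key.end(), val.begin());

    cudaStreamSynchronize(stream);
    cudaStreamDestroy(stream);
    return 0;
}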
CentOS 7.4, CUDA 9.1, Tesla P100

Thank you very much. I tested your code and it indeed solves the problem. I also found an alternative solution using CUB (see below). The main difference is that you have to allocate all of the memory needed by the radix sort up front (rather than using a cached allocator). You need to include the CUB header library at compile time. This code gives me the following (CUDA 9.0, Tesla P100, NVLINK, RHEL7):
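A rough sketch of what such an up-front-allocated CUB sort can look like, assuming the usual cub::DeviceRadixSort::SortPairs query-then-allocate pattern with placeholder buffer names (the CUB headers just need to be on the include path, e.g. nvcc -I<path-to-cub> ...):

#include <cub/cub.cuh>
#include <cuda_runtime.h>
#include <cstdint>

int main() {
    const int n = 1 << 20;

    // device key/value arrays; CUB's radix sort writes to separate output
    // buffers, so both input and output arrays are allocated here
    uint32_t *d_key_in, *d_key_out, *d_val_in, *d_val_out;
    cudaMalloc((void**)&d_key_in,  n * sizeof(uint32_t));
    cudaMalloc((void**)&d_key_out, n * sizeof(uint32_t));
    cudaMalloc((void**)&d_val_in,  n * sizeof(uint32_t));
    cudaMalloc((void**)&d_val_out, n * sizeof(uint32_t));

    cudaStream_t stream;
    cudaStreamCreate(&stream);

    // first call with a null temp pointer only queries the required size
    void  *d_temp = NULL;
    size_t temp_bytes = 0;
    cub::DeviceRadixSort::SortPairs(d_temp, temp_bytes,
                                    d_key_in, d_key_out,
                                    d_val_in, d_val_out, n,
                                    0, 32, stream);

    // allocate the temporary storage once, up front
    cudaMalloc(&d_temp, temp_bytes);

    // the actual sort: no allocation happens inside this call, so it can be
    // overlapped with a cudaMemcpyAsync issued on a different stream
    cub::DeviceRadixSort::SortPairs(d_temp, temp_bytes,
                                    d_key_in, d_key_out,
                                    d_val_in, d_val_out, n,
                                    0, 32, stream);

    cudaStreamSynchronize(stream);

    cudaFree(d_temp);
    cudaFree(d_key_in);  cudaFree(d_key_out);
    cudaFree(d_val_in);  cudaFree(d_val_out);
    cudaStreamDestroy(stream);
    return 0;
}

Note that, unlike thrust::sort_by_key, cub::DeviceRadixSort::SortPairs is not in-place: it writes its result to separate output arrays (or a cub::DoubleBuffer), which is why the extra device buffers are allocated above.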