可以将文章内容翻译成中文,广告屏蔽插件可能会导致该功能失效(如失效,请关闭广告屏蔽插件后再试):
问题:
I would like to overlap a thrust::sort_by_key operation with a host-to-device copy. Despite taking a cudaStream_t as an argument, my experiments seem to show that thrust::sort_by_key is a blocking operation. Below I attach a full code example in which first I measure the time to copy the data (from pinned memory), then I measure the time to do the sort_by_key. Finally, I try to overlap the two operations. I would expect to the see the copy time hidden by the sort_by_key operation. Instead, I find that the overlayed operation take more than the sum of the two standalone operations.
Can anyone see a problem with the code? Or am I misunderstanding the support in thrust for cuda streams?
#include <cuda_runtime.h>
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <random>
#include <iostream>
#include <sys/time.h>
int main() {
// size of arrays
const int n = 300000000;
// random number generator
std::mt19937 rng;
// key/val on host
uint32_t * key = new uint32_t[n];
uint32_t * val = new uint32_t[n];
// fill key val
for(int i = 0; i < n; i++) {
key[i] = rng();
val[i] = i;
}
// key/val on device
uint32_t * dev_key;
uint32_t * dev_val;
// allocate memory on GPU for key/val
cudaMalloc((void**)&dev_key, n*sizeof(uint32_t));
cudaMalloc((void**)&dev_val, n*sizeof(uint32_t));
// copy key/val onto the device
cudaMemcpy(dev_key, key, n*sizeof(uint32_t), cudaMemcpyHostToDevice);
cudaMemcpy(dev_val, val, n*sizeof(uint32_t), cudaMemcpyHostToDevice);
// get thrust device pointers to key/val on device
thrust::device_ptr<uint32_t> dev_key_ptr = thrust::device_pointer_cast(dev_key);
thrust::device_ptr<uint32_t> dev_val_ptr = thrust::device_pointer_cast(dev_val);
// data on host
uint32_t * data;
// allocate pinned memory for data on host
cudaMallocHost((void**)&data, n*sizeof(uint32_t));
// fill data with random numbers
for(int i = 0; i < n; i++) {
data[i] = rng();
}
// data on device
uint32_t * dev_data;
// allocate memory for data on the device
cudaMalloc((void**)&dev_data, n*sizeof(uint32_t));
// for timing
struct timeval t1, t2;
// two streams
cudaStream_t stream1;
cudaStream_t stream2;
// create streams
cudaStreamCreate(&stream1);
cudaStreamCreate(&stream2);
for(int i = 0; i < 10; i++) {
// Copy data into dev_data on stream 1 (nothing happening on stream 2 for now)
gettimeofday(&t1, NULL);
cudaMemcpyAsync(dev_data, data, n*sizeof(uint32_t), cudaMemcpyHostToDevice, stream1);
cudaDeviceSynchronize();
gettimeofday(&t2, NULL);
double t_copy = double(t2.tv_sec-t1.tv_sec)*1000.0 + double(t2.tv_usec-t1.tv_usec)/1000.0;
// Sort_by_key on stream 2 (nothing hapenning on stream 1 for now)
gettimeofday(&t1, NULL);
thrust::sort_by_key(thrust::cuda::par.on(stream2), dev_key_ptr, dev_key_ptr + n, dev_val_ptr);
cudaDeviceSynchronize();
gettimeofday(&t2, NULL);
double t_sort = double(t2.tv_sec-t1.tv_sec)*1000.0 + double(t2.tv_usec-t1.tv_usec)/1000.0;
// Overlap both operations
gettimeofday(&t1, NULL);
thrust::sort_by_key(thrust::cuda::par.on(stream2), dev_key_ptr, dev_key_ptr + n, dev_val_ptr);
cudaMemcpyAsync(dev_data, data, n*sizeof(uint32_t), cudaMemcpyHostToDevice, stream1);
cudaDeviceSynchronize();
gettimeofday(&t2, NULL);
double t_both = double(t2.tv_sec-t1.tv_sec)*1000.0 + double(t2.tv_usec-t1.tv_usec)/1000.0;
std::cout << "t_copy: " << t_copy << ", t_sort: " << t_sort << ", t_both1: " << t_both << std::endl;
}
// clean up
cudaStreamDestroy(stream1);
cudaStreamDestroy(stream2);
cudaFreeHost(data);
cudaFree(dev_data);
cudaFree(dev_key);
cudaFree(dev_val);
delete [] key;
delete [] val;
}
Here is the results obtained when running on a GTX 1080 TI and compiling using CUDA toolkit (V9.0.176):
t_copy: 99.972, t_sort: 215.597, t_both: 393.861
t_copy: 100.769, t_sort: 225.234, t_both: 394.839
t_copy: 100.489, t_sort: 221.44, t_both: 397.042
t_copy: 100.047, t_sort: 214.231, t_both: 403.371
t_copy: 100.167, t_sort: 222.031, t_both: 393.143
t_copy: 100.255, t_sort: 209.191, t_both: 374.633
t_copy: 100.179, t_sort: 208.452, t_both: 374.122
t_copy: 100.038, t_sort: 208.39, t_both: 375.454
t_copy: 100.072, t_sort: 208.468, t_both: 376.02
t_copy: 100.069, t_sort: 208.426, t_both: 377.759
Furthermore, profiling using nvprof reveals that all operations are being carried out in two separate, non-default streams.
I would be extremely grateful if anyone can reproduce this, or suggest a fix.
回答1:
thrust sort operations do a memory allocation "under the hood". This should be discoverable using nvprof --print-api-trace ...
- you should see a cudaMalloc
operation associated with each sort. This device memory allocation is synchronizing and may prevent expected overlap. If you want to work around this, you could explore using a thrust custom allocator.
Here is a worked example, borrowing heavily from the above link:
$ cat t44.cu
#include <cuda_runtime.h>
#include <thrust/device_vector.h>
#include <thrust/sort.h>
#include <random>
#include <iostream>
#include <sys/time.h>
#include <thrust/system/cuda/vector.h>
#include <thrust/system/cuda/execution_policy.h>
#include <thrust/host_vector.h>
#include <thrust/generate.h>
#include <thrust/pair.h>
#include <cstdlib>
#include <iostream>
#include <map>
#include <cassert>
// This example demonstrates how to intercept calls to get_temporary_buffer
// and return_temporary_buffer to control how Thrust allocates temporary storage
// during algorithms such as thrust::sort. The idea will be to create a simple
// cache of allocations to search when temporary storage is requested. If a hit
// is found in the cache, we quickly return the cached allocation instead of
// resorting to the more expensive thrust::cuda::malloc.
//
// Note: this implementation cached_allocator is not thread-safe. If multiple
// (host) threads use the same cached_allocator then they should gain exclusive
// access to the allocator before accessing its methods.
// cached_allocator: a simple allocator for caching allocation requests
class cached_allocator
{
public:
// just allocate bytes
typedef char value_type;
cached_allocator() {}
~cached_allocator()
{
// free all allocations when cached_allocator goes out of scope
free_all();
}
char *allocate(std::ptrdiff_t num_bytes)
{
char *result = 0;
// search the cache for a free block
free_blocks_type::iterator free_block = free_blocks.find(num_bytes);
if(free_block != free_blocks.end())
{
std::cout << "cached_allocator::allocator(): found a hit" << std::endl;
// get the pointer
result = free_block->second;
// erase from the free_blocks map
free_blocks.erase(free_block);
}
else
{
// no allocation of the right size exists
// create a new one with cuda::malloc
// throw if cuda::malloc can't satisfy the request
try
{
std::cout << "cached_allocator::allocator(): no free block found; calling cuda::malloc" << std::endl;
// allocate memory and convert cuda::pointer to raw pointer
result = thrust::cuda::malloc<char>(num_bytes).get();
}
catch(std::runtime_error &e)
{
throw;
}
}
// insert the allocated pointer into the allocated_blocks map
allocated_blocks.insert(std::make_pair(result, num_bytes));
return result;
}
void deallocate(char *ptr, size_t n)
{
// erase the allocated block from the allocated blocks map
allocated_blocks_type::iterator iter = allocated_blocks.find(ptr);
std::ptrdiff_t num_bytes = iter->second;
allocated_blocks.erase(iter);
// insert the block into the free blocks map
free_blocks.insert(std::make_pair(num_bytes, ptr));
}
private:
typedef std::multimap<std::ptrdiff_t, char*> free_blocks_type;
typedef std::map<char *, std::ptrdiff_t> allocated_blocks_type;
free_blocks_type free_blocks;
allocated_blocks_type allocated_blocks;
void free_all()
{
std::cout << "cached_allocator::free_all(): cleaning up after ourselves..." << std::endl;
// deallocate all outstanding blocks in both lists
for(free_blocks_type::iterator i = free_blocks.begin();
i != free_blocks.end();
++i)
{
// transform the pointer to cuda::pointer before calling cuda::free
thrust::cuda::free(thrust::cuda::pointer<char>(i->second));
}
for(allocated_blocks_type::iterator i = allocated_blocks.begin();
i != allocated_blocks.end();
++i)
{
// transform the pointer to cuda::pointer before calling cuda::free
thrust::cuda::free(thrust::cuda::pointer<char>(i->first));
}
}
};
int main() {
cached_allocator alloc;
// size of arrays
const int n = 300000000;
// random number generator
std::mt19937 rng;
// key/val on host
uint32_t * key = new uint32_t[n];
uint32_t * val = new uint32_t[n];
// fill key val
for(int i = 0; i < n; i++) {
key[i] = rng();
val[i] = i;
}
// key/val on device
uint32_t * dev_key;
uint32_t * dev_val;
// allocate memory on GPU for key/val
cudaMalloc((void**)&dev_key, n*sizeof(uint32_t));
cudaMalloc((void**)&dev_val, n*sizeof(uint32_t));
// copy key/val onto the device
cudaMemcpy(dev_key, key, n*sizeof(uint32_t), cudaMemcpyHostToDevice);
cudaMemcpy(dev_val, val, n*sizeof(uint32_t), cudaMemcpyHostToDevice);
// get thrust device pointers to key/val on device
thrust::device_ptr<uint32_t> dev_key_ptr = thrust::device_pointer_cast(dev_key);
thrust::device_ptr<uint32_t> dev_val_ptr = thrust::device_pointer_cast(dev_val);
// data on host
uint32_t * data;
// allocate pinned memory for data on host
cudaMallocHost((void**)&data, n*sizeof(uint32_t));
// fill data with random numbers
for(int i = 0; i < n; i++) {
data[i] = rng();
}
// data on device
uint32_t * dev_data;
// allocate memory for data on the device
cudaMalloc((void**)&dev_data, n*sizeof(uint32_t));
// for timing
struct timeval t1, t2;
// two streams
cudaStream_t stream1;
cudaStream_t stream2;
// create streams
cudaStreamCreate(&stream1);
cudaStreamCreate(&stream2);
for(int i = 0; i < 10; i++) {
// Copy data into dev_data on stream 1 (nothing happening on stream 2 for now)
gettimeofday(&t1, NULL);
cudaMemcpyAsync(dev_data, data, n*sizeof(uint32_t), cudaMemcpyHostToDevice, stream1);
cudaDeviceSynchronize();
gettimeofday(&t2, NULL);
double t_copy = double(t2.tv_sec-t1.tv_sec)*1000.0 + double(t2.tv_usec-t1.tv_usec)/1000.0;
// Sort_by_key on stream 2 (nothing hapenning on stream 1 for now)
gettimeofday(&t1, NULL);
thrust::sort_by_key(thrust::cuda::par(alloc).on(stream2), dev_key_ptr, dev_key_ptr + n, dev_val_ptr);
cudaDeviceSynchronize();
gettimeofday(&t2, NULL);
double t_sort = double(t2.tv_sec-t1.tv_sec)*1000.0 + double(t2.tv_usec-t1.tv_usec)/1000.0;
// Overlap both operations
gettimeofday(&t1, NULL);
thrust::sort_by_key(thrust::cuda::par(alloc).on(stream2), dev_key_ptr, dev_key_ptr + n, dev_val_ptr);
cudaMemcpyAsync(dev_data, data, n*sizeof(uint32_t), cudaMemcpyHostToDevice, stream1);
cudaDeviceSynchronize();
gettimeofday(&t2, NULL);
double t_both = double(t2.tv_sec-t1.tv_sec)*1000.0 + double(t2.tv_usec-t1.tv_usec)/1000.0;
std::cout << "t_copy: " << t_copy << ", t_sort: " << t_sort << ", t_both1: " << t_both << std::endl;
}
// clean up
cudaStreamDestroy(stream1);
cudaStreamDestroy(stream2);
cudaFreeHost(data);
cudaFree(dev_data);
cudaFree(dev_key);
cudaFree(dev_val);
delete [] key;
delete [] val;
}
$ nvcc -arch=sm_60 -std=c++11 -o t44 t44.cu
$ ./t44
cached_allocator::allocator(): no free block found; calling cuda::malloc
cached_allocator::allocator(): found a hit
t_copy: 100.329, t_sort: 110.122, t_both1: 109.585
cached_allocator::allocator(): found a hit
cached_allocator::allocator(): found a hit
t_copy: 100.441, t_sort: 106.454, t_both1: 109.692
cached_allocator::allocator(): found a hit
cached_allocator::allocator(): found a hit
t_copy: 100.595, t_sort: 106.507, t_both1: 109.436
cached_allocator::allocator(): found a hit
cached_allocator::allocator(): found a hit
t_copy: 100.35, t_sort: 106.463, t_both1: 109.517
cached_allocator::allocator(): found a hit
cached_allocator::allocator(): found a hit
t_copy: 100.486, t_sort: 106.473, t_both1: 109.6
cached_allocator::allocator(): found a hit
cached_allocator::allocator(): found a hit
t_copy: 100.324, t_sort: 106.385, t_both1: 109.551
cached_allocator::allocator(): found a hit
cached_allocator::allocator(): found a hit
t_copy: 100.4, t_sort: 106.549, t_both1: 109.692
cached_allocator::allocator(): found a hit
cached_allocator::allocator(): found a hit
t_copy: 100.521, t_sort: 106.445, t_both1: 109.719
cached_allocator::allocator(): found a hit
cached_allocator::allocator(): found a hit
t_copy: 100.362, t_sort: 106.413, t_both1: 109.762
cached_allocator::allocator(): found a hit
cached_allocator::allocator(): found a hit
t_copy: 100.349, t_sort: 106.37, t_both1: 109.52
cached_allocator::free_all(): cleaning up after ourselves...
$
CentOS 7.4, CUDA 9.1, Tesla P100
回答2:
Thank you very much. I tested your code and it indeed solves the problem. I also found an alternate solution using CUB (see below). The main difference is you have to allocate all the necessary memory for the radix sort upfront (rather than using a cached allocator).
$ cat main_cub.cu
#include <cuda_runtime.h>
#include <random>
#include <iostream>
#include <sys/time.h>
#include <cub/device/device_radix_sort.cuh>
int main() {
// size of arrays
const int n = 300000000;
// random number generator
std::mt19937 rng;
// key/val on host
uint32_t * key = new uint32_t[n];
uint32_t * val = new uint32_t[n];
// fill key val
for(int i = 0; i < n; i++) {
key[i] = rng();
val[i] = i;
}
// key/val on device
uint32_t * dev_key_in;
uint32_t * dev_val_in;
// allocate memory on GPU for key/val
cudaMalloc((void**)&dev_key_in, n*sizeof(uint32_t));
cudaMalloc((void**)&dev_val_in, n*sizeof(uint32_t));
// copy key/val onto the device
cudaMemcpy(dev_key_in, key, n*sizeof(uint32_t), cudaMemcpyHostToDevice);
cudaMemcpy(dev_val_in, val, n*sizeof(uint32_t), cudaMemcpyHostToDevice);
// sorted key/val on device
uint32_t * dev_key_out;
uint32_t * dev_val_out;
// allocate memory on device for sorted key/val
cudaMalloc((void**)&dev_key_out, n*sizeof(uint32_t));
cudaMalloc((void**)&dev_val_out, n*sizeof(uint32_t));
// determine how much temp storage cub needs
void * dev_temp_storage = NULL;
size_t temp_storage_bytes = 0;
cub::DeviceRadixSort::SortPairs(dev_temp_storage, temp_storage_bytes, dev_key_in, dev_key_out, dev_val_in, dev_val_out, n);
// allocate the temp storage for cub
cudaMalloc((void**)&dev_temp_storage, temp_storage_bytes);
// data on host
uint32_t * data;
// allocate pinned memory for data on host
cudaMallocHost((void**)&data, n*sizeof(uint32_t));
// fill data with random numbers
for(int i = 0; i < n; i++) {
data[i] = rng();
}
// data on device
uint32_t * dev_data;
// allocate memory for data on the device
cudaMalloc((void**)&dev_data, n*sizeof(uint32_t));
// for timing
struct timeval t1, t2;
// two streams
cudaStream_t stream1;
cudaStream_t stream2;
// create streams
cudaStreamCreate(&stream1);
cudaStreamCreate(&stream2);
for(int i = 0; i < 10; i++) {
// Copy data into dev_data on stream 1 (nothing happening on stream 2 for now)
gettimeofday(&t1, NULL);
cudaMemcpyAsync(dev_data, data, n*sizeof(uint32_t), cudaMemcpyHostToDevice, stream1);
cudaDeviceSynchronize();
gettimeofday(&t2, NULL);
double t_copy = double(t2.tv_sec-t1.tv_sec)*1000.0 + double(t2.tv_usec-t1.tv_usec)/1000.0;
// Sort_by_key on stream 2 (nothing hapenning on stream 1 for now)
gettimeofday(&t1, NULL);
cub::DeviceRadixSort::SortPairs(dev_temp_storage, temp_storage_bytes, dev_key_in, dev_key_out, dev_val_in, dev_val_out, n, 0, sizeof(uint32_t)*8, stream2);
cudaDeviceSynchronize();
gettimeofday(&t2, NULL);
double t_sort = double(t2.tv_sec-t1.tv_sec)*1000.0 + double(t2.tv_usec-t1.tv_usec)/1000.0;
// Overlap both operations
gettimeofday(&t1, NULL);
cub::DeviceRadixSort::SortPairs(dev_temp_storage, temp_storage_bytes, dev_key_in, dev_key_out, dev_val_in, dev_val_out, n, 0, sizeof(uint32_t)*8, stream2);
cudaMemcpyAsync(dev_data, data, n*sizeof(uint32_t), cudaMemcpyHostToDevice, stream1);
cudaDeviceSynchronize();
gettimeofday(&t2, NULL);
double t_both = double(t2.tv_sec-t1.tv_sec)*1000.0 + double(t2.tv_usec-t1.tv_usec)/1000.0;
std::cout << "t_copy: " << t_copy << ", t_sort: " << t_sort << ", t_both: " << t_both << std::endl;
}
// clean up
cudaStreamDestroy(stream1);
cudaStreamDestroy(stream2);
cudaFreeHost(data);
cudaFree(dev_data);
cudaFree(dev_key_in);
cudaFree(dev_val_in);
cudaFree(dev_key_out);
cudaFree(dev_val_out);
delete [] key;
delete [] val;
}
You need to include the CUB header library at compile time:
$ nvcc -arch=sm_60 -std=c++11 main_cub.cu -I./cub-1.7.4 -o cub-test
This code gives me the following (CUDA 9.0, Tesla P100, NVLINK, RHEL7):
$ ./cub-test
t_copy: 40.79, t_sort: 97.305, t_both: 99.603
t_copy: 40.809, t_sort: 96.363, t_both: 99.378
t_copy: 40.816, t_sort: 96.46, t_both: 99.347
t_copy: 40.747, t_sort: 96.473, t_both: 99.429
t_copy: 40.766, t_sort: 96.33, t_both: 99.398
t_copy: 40.947, t_sort: 96.426, t_both: 99.394
t_copy: 40.848, t_sort: 96.445, t_both: 99.406
t_copy: 40.843, t_sort: 96.395, t_both: 99.484
t_copy: 40.833, t_sort: 96.303, t_both: 99.381
t_copy: 40.831, t_sort: 96.356, t_both: 99.292