I would like to copy memory from the host to the device using thrust as in
thrust::host_vector<float> h_vec(1 << 28);
thrust::device_vector<float> d_vec(1 << 28);
thrust::copy(h_vec.begin(), h_vec.end(), d_vec.begin());
using CUDA streams analogously to how you would copy memory from the device to the device using streams:
cudaStream_t s;
cudaStreamCreate(&s);
thrust::device_vector<float> d_vec1(1 << 28), d_vec2(1 << 28);
thrust::copy(thrust::cuda::par.on(s), d_vec1.begin(), d_vec1.end(), d_vec2.begin());
cudaStreamSynchronize(s);
cudaStreamDestroy(s);
The problem is that I can't use the CUDA execution policy (thrust::cuda::par.on(s)) to specify the stream when copying from the host to the device, because in that case thrust assumes that both vectors are stored on the device. Is there a way to get around this problem? I'm using the latest thrust version from GitHub (it says 1.8 in the version.h file).
Here's a worked example using thrust::cuda::experimental::pinned_allocator<T>:
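A minimal sketch along those lines, assuming a pinned host vector, an asynchronous host-to-device copy on a stream, and a device-to-host read-back to make the asynchrony visible (the vector size and the round trip are illustrative choices, not the original listing):

#include <iostream>
#include <cuda_runtime.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/system/cuda/experimental/pinned_allocator.h>

int main()
{
    const size_t N = 1 << 24;

    // Pinned (page-locked) host storage so cudaMemcpyAsync can be truly asynchronous.
    thrust::host_vector<float, thrust::cuda::experimental::pinned_allocator<float> > h_in(N, 1.0f);
    thrust::host_vector<float, thrust::cuda::experimental::pinned_allocator<float> > h_out(N, 0.0f);
    thrust::device_vector<float> d_vec(N);

    cudaStream_t s;
    cudaStreamCreate(&s);

    // Asynchronous host-to-device copy, then device-to-host copy, both issued into stream s.
    cudaMemcpyAsync(thrust::raw_pointer_cast(d_vec.data()), thrust::raw_pointer_cast(h_in.data()),
                    N * sizeof(float), cudaMemcpyHostToDevice, s);
    cudaMemcpyAsync(thrust::raw_pointer_cast(h_out.data()), thrust::raw_pointer_cast(d_vec.data()),
                    N * sizeof(float), cudaMemcpyDeviceToHost, s);

    // The synchronize step: without it, the host read below races with the in-flight copies.
    cudaStreamSynchronize(s);

    std::cout << h_out[N - 1] << std::endl;  // prints 1 after the sync; typically 0 if the sync is commented out

    cudaStreamDestroy(s);
    return 0;
}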
Comment out the synchronize step and you should get 0 printed to the console due to the async memory transfer.

As indicated in the comments, I don't think this will be possible directly with thrust::copy. However, we can use cudaMemcpyAsync in a thrust application to achieve the goal of asynchronous copies and overlap of copy with compute. Here is a worked example:
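A sketch in the spirit of that example, assuming pinned host vectors, two streams, and a dummy busy-work functor to keep each thrust kernel resident long enough to observe overlap (KSIZE, the loop count, and the two-stream structure are illustrative assumptions):

#include <iostream>
#include <cuda_runtime.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/for_each.h>
#include <thrust/system/cuda/execution_policy.h>
#include <thrust/system/cuda/experimental/pinned_allocator.h>

#define KSIZE 64   // keep this small so each thrust kernel is a single threadblock

typedef thrust::host_vector<int, thrust::cuda::experimental::pinned_allocator<int> > pinnedVector;

struct busy_work
{
    // Busy-work with a loop-carried dependence so each kernel stays resident
    // long enough to observe overlap in the profiler.
    __host__ __device__ void operator()(int &x) const
    {
        for (int i = 0; i < 100000; i++)
            x = (x * 75 + 74) % 65537;
    }
};

int main()
{
    pinnedVector h1(KSIZE, 1), h2(KSIZE, 2);
    thrust::device_vector<int> d1(KSIZE), d2(KSIZE);

    cudaStream_t s1, s2;
    cudaStreamCreate(&s1);
    cudaStreamCreate(&s2);

    // Asynchronous host-to-device copies, one per stream.
    cudaMemcpyAsync(thrust::raw_pointer_cast(d1.data()), thrust::raw_pointer_cast(h1.data()),
                    KSIZE * sizeof(int), cudaMemcpyHostToDevice, s1);
    cudaMemcpyAsync(thrust::raw_pointer_cast(d2.data()), thrust::raw_pointer_cast(h2.data()),
                    KSIZE * sizeof(int), cudaMemcpyHostToDevice, s2);

    // Thrust kernels launched into the same streams; each waits only on the copy
    // in its own stream, so the two streams can overlap with each other.
    thrust::for_each(thrust::cuda::par.on(s1), d1.begin(), d1.end(), busy_work());
    thrust::for_each(thrust::cuda::par.on(s2), d2.begin(), d2.end(), busy_work());

    // Asynchronous device-to-host copies of the results.
    cudaMemcpyAsync(thrust::raw_pointer_cast(h1.data()), thrust::raw_pointer_cast(d1.data()),
                    KSIZE * sizeof(int), cudaMemcpyDeviceToHost, s1);
    cudaMemcpyAsync(thrust::raw_pointer_cast(h2.data()), thrust::raw_pointer_cast(d2.data()),
                    KSIZE * sizeof(int), cudaMemcpyDeviceToHost, s2);

    // Everything above was issued asynchronously; control returns to the host
    // immediately, and this call waits for all of it to finish.
    cudaDeviceSynchronize();

    std::cout << "h1[0] = " << h1[0] << ", h2[0] = " << h2[0] << std::endl;

    cudaStreamDestroy(s1);
    cudaStreamDestroy(s2);
    return 0;
}

Compile with nvcc and run it under a profiler such as nvprof or nvvp to inspect the timeline.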
For my test case, I used RHEL 5.5, a Quadro 5000, and CUDA 6.5RC. This example is designed to have thrust create very small kernels (only a single threadblock, as long as KSIZE is small, say 32 or 64), so that the kernels thrust creates from thrust::for_each are able to run concurrently. When I profile this code, I see proper overlap both between thrust kernels and between copy operations and thrust kernels, as well as asynchronous data copying at the completion of the kernels. Note that the cudaDeviceSynchronize() operation "fills" the timeline, indicating that all the async operations (data copying, thrust functions) were issued asynchronously and control returned to the host thread before any of the operations were underway. All of this is expected, proper behavior for full concurrency between host, GPU, and data copying operations.