In my current project I use GPUs for signal processing and visualization. I'm already using streams to allow for asynchronous operation. The signal is processed in frames, and for each frame the processing steps in a stream are as follows (a rough code sketch follows the list):
1. memcpy to device
2. signal conditioning
3. image processing
4. visualization
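In code, the per-frame loop looks roughly like this sketch (the kernel names, per-stream buffer sets, and `NSTREAMS` are placeholders, not my actual code):

```cuda
#include <cuda_runtime.h>
#include <cstddef>

#define NSTREAMS 2

// Placeholder kernels standing in for the real processing steps
__global__ void condition(const float *raw, float *sig) { /* step 2 body */ }
__global__ void process  (const float *sig, float *img) { /* step 3 body */ }
__global__ void visualize(const float *img, float *fb)  { /* step 4 body */ }

void runFrames(float *hFrame[], int nFrames, size_t frameBytes,
               float *dRaw[NSTREAMS], float *dSig[NSTREAMS],
               float *dImg[NSTREAMS], float *dFb[NSTREAMS],
               cudaStream_t stream[NSTREAMS], dim3 grid, dim3 block)
{
    for (int f = 0; f < nFrames; f++) {
        int b = f % NSTREAMS;  // per-stream buffer set, so in-flight frames don't race
        cudaMemcpyAsync(dRaw[b], hFrame[f], frameBytes,
                        cudaMemcpyHostToDevice, stream[b]);         // 1. memcpy to device
        condition<<<grid, block, 0, stream[b]>>>(dRaw[b], dSig[b]); // 2. signal conditioning
        process  <<<grid, block, 0, stream[b]>>>(dSig[b], dImg[b]); // 3. image processing
        visualize<<<grid, block, 0, stream[b]>>>(dImg[b], dFb[b]);  // 4. visualization
    }
}
```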
Right now the steps all run on a single GPU. However, my machine has a multi-GPU card (a GeForce GTX 690), and I'd like to distribute the work between its two devices. Basically, I'd like to perform steps 1 and 2 on device A and steps 3 and 4 on device B, while steps 1 through 4 still execute as a single asynchronous stream. The desired outcome is a streaming layout that looks like this:
```
Device A  Stream a  1 2     1 2     ...
          Stream b      1 2     1 2 ...
Device B  Stream a      3 4     3 4 ...
          Stream b          3 4     ...
```
How can I do this?
My previous attempt was not correct: a stream is associated with the device that was current when the stream was created. So I think the most direct answer to your question as posed in the title is "it can't be done"; you cannot create a single stream and issue commands to more than one GPU from it (this is spelled out in the multi-device section of the CUDA C Programming Guide).
However, while researching it, I noted that events are a suggested way to synchronize two streams on two different devices: `cudaStreamWaitEvent()` succeeds even when the input stream and input event are associated with different devices, and can therefore be used to synchronize devices with each other.
So in that vein, I created the following code to illustrate this:
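A minimal sketch along those lines (the kernels `k_set`/`k_add2` and the `cudaMemcpyPeerAsync()` transfer are illustrative stand-ins, and error checking is omitted for brevity):

```cuda
#include <cstdio>

#define N (1 << 20)
#define nTPB 256

// Illustrative kernels: device A writes 1s, device B adds 2, so we expect 3s
__global__ void k_set(int *d, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) d[i] = 1;
}

__global__ void k_add2(int *d, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) d[i] += 2;
}

int main()
{
    int *dA, *dB, *h;
    cudaStream_t streamA, streamB;
    cudaEvent_t ev;

    cudaSetDevice(0);                                // Device A
    cudaMalloc(&dA, N * sizeof(int));
    cudaStreamCreate(&streamA);
    cudaEventCreateWithFlags(&ev, cudaEventDisableTiming);

    cudaSetDevice(1);                                // Device B
    cudaMalloc(&dB, N * sizeof(int));
    cudaStreamCreate(&streamB);
    cudaMallocHost(&h, N * sizeof(int));             // pinned result buffer

    cudaSetDevice(0);                                // issue work on Device A ...
    k_set<<<(N + nTPB - 1) / nTPB, nTPB, 0, streamA>>>(dA, N);
    cudaEventRecord(ev, streamA);                    // ... then record the event

    cudaSetDevice(1);
    cudaStreamWaitEvent(streamB, ev, 0);             // force stream b to wait on stream a
    cudaMemcpyPeerAsync(dB, 1, dA, 0, N * sizeof(int), streamB);
    k_add2<<<(N + nTPB - 1) / nTPB, nTPB, 0, streamB>>>(dB, N);
    cudaMemcpyAsync(h, dB, N * sizeof(int), cudaMemcpyDeviceToHost, streamB);
    cudaStreamSynchronize(streamB);

    for (int i = 0; i < N; i++)
        if (h[i] != 3) { printf("results error\n"); return 1; }
    printf("success\n");
    return 0;
}
```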
If you run the code as-is, you should get a `success` message. If you comment out the line that forces stream b (on Device B) to wait on stream a (on Device A), then you'll see a `results error` message. So this demonstrates how to sync a stream on one device to a stream on another. Hope it helps; sorry for the confusion on the first go-round.
`cudaStreamWaitEvent()` enables inter-GPU synchronization, since you can insert a wait on a CUDA event belonging to another device. So what you need for inter-GPU synchronization between the producer and the consumer is to allocate a few events (at least 2) for each of the 2 GPUs, then have the producer call `cudaEventRecord()` and the consumer call `cudaStreamWaitEvent()` on the same event. `cudaStreamWaitEvent()` inserts a command into the current device's command buffer that causes it to suspend execution until the given event has been recorded.

Below is a code fragment where a peer-to-peer memcpy is implemented using `cudaStreamWaitEvent()` in this way. Once the pump is primed, the producer and consumer should both be doing PCIe transfers concurrently, each to one of the two staging buffers (which are allocated in portable pinned memory). Full source code is at https://github.com/ArchaeaSoftware/cudahandbook/blob/master/memory/peer2peerMemcpy.cu
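The sketch below shows just the double-buffered handshake; the staging-buffer layout, chunk size, and the `chunkedPeerCopy()` wrapper are illustrative rather than the Handbook's exact code:

```cuda
#include <cuda_runtime.h>
#include <cstddef>

#define STAGING_CHUNK (1 << 20)   // 1 MiB per staging buffer (illustrative)

// Device-to-device copy staged through portable pinned host memory,
// synchronized across the two GPUs with events.
static void chunkedPeerCopy(void *dDst, int devDst,
                            const void *dSrc, int devSrc, size_t bytes)
{
    char *staging[2];                    // two host staging buffers
    cudaStream_t sProd, sCons;           // one stream per GPU
    cudaEvent_t evFilled[2], evDrained[2];

    // Portable pinned memory is visible to both devices
    for (int i = 0; i < 2; i++)
        cudaHostAlloc((void **)&staging[i], STAGING_CHUNK, cudaHostAllocPortable);

    cudaSetDevice(devSrc);               // producer GPU
    cudaStreamCreate(&sProd);
    for (int i = 0; i < 2; i++)
        cudaEventCreateWithFlags(&evFilled[i], cudaEventDisableTiming);

    cudaSetDevice(devDst);               // consumer GPU
    cudaStreamCreate(&sCons);
    for (int i = 0; i < 2; i++) {
        cudaEventCreateWithFlags(&evDrained[i], cudaEventDisableTiming);
        cudaEventRecord(evDrained[i], sCons);  // prime the pump: buffers start "empty"
    }

    size_t done = 0;
    for (int buf = 0; done < bytes; buf ^= 1) {
        size_t n = (bytes - done < STAGING_CHUNK) ? bytes - done : STAGING_CHUNK;

        // Producer: wait until the consumer has drained this staging buffer,
        // refill it from the source device, then publish "filled".
        cudaSetDevice(devSrc);
        cudaStreamWaitEvent(sProd, evDrained[buf], 0);
        cudaMemcpyAsync(staging[buf], (const char *)dSrc + done, n,
                        cudaMemcpyDeviceToHost, sProd);
        cudaEventRecord(evFilled[buf], sProd);

        // Consumer: wait until the producer has filled the buffer, drain it
        // into the destination device, then publish "drained".
        cudaSetDevice(devDst);
        cudaStreamWaitEvent(sCons, evFilled[buf], 0);
        cudaMemcpyAsync((char *)dDst + done, staging[buf], n,
                        cudaMemcpyHostToDevice, sCons);
        cudaEventRecord(evDrained[buf], sCons);

        done += n;
    }
    cudaStreamSynchronize(sCons);        // all chunks have landed on the destination

    // (stream, event, and staging-buffer cleanup omitted for brevity)
}
```

Two staging buffers are the minimum for the pump: while the consumer drains one over PCIe, the producer can be filling the other, which is what keeps both transfers in flight once the pipeline is started.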