In my current project I use GPUs for signal processing and visualization. I'm already using streams to allow for asynchronous operation. The signal is processed in frames, and for each frame the processing steps in a stream are as follows:
1. memcpy to device
2. signal conditioning
3. image processing
4. visualization
Right now the steps all happen on a single GPU; however, my machine has a multi-GPU card (GeForce GTX 690) and I'd like to distribute the work between the two devices. Basically, I'd like to perform steps 1 and 2 on device A and steps 3 and 4 on device B, while operations 1, 2, 3 and 4 are still executed as a single asynchronous stream. The desired outcome is a streaming layout that looks like this:
Device A:  stream a:  1 2 1 2 ...
           stream b:  1 2 ...
Device B:  stream a:  3 4 3 4 ...
           stream b:  3 4 ...
How can I do this?
My previous attempt was not correct, because a stream is associated with the device it was created on. So I think the most direct answer to your question, as posed in the title, is "it can't be done": you cannot create a single stream and issue commands to more than one GPU from it. From the CUDA documentation on stream and event behavior:
"A kernel launch or memory copy will fail if it is issued to a stream that is not associated to the current device."
However, while researching it, I noted that events are a suggested way to synchronize two streams on two different devices:
"cudaStreamWaitEvent() will succeed even if the input stream and input event are associated to different devices. cudaStreamWaitEvent() can therefore be used to synchronize multiple devices with each other."
So in that vein, I created the following code to illustrate this:
#include <stdio.h>

#define SIZE 32
#define K1VAL 5
#define K3VAL 3

#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)

__global__ void kernel1(int *frame, int size){
    int idx = threadIdx.x + (blockDim.x * blockIdx.x);
    if (idx == 0){
        int *a = new int[10000]; // just to make this kernel take a while
        for (int i = 0; i < 10000; i++)
            a[i] = 0;
        for (int i = 0; i < size; i++)
            frame[i] += K1VAL;
        delete[] a; // free the scratch allocation
    }
}

__global__ void kernel3(int *frame, int size){
    int idx = threadIdx.x + (blockDim.x * blockIdx.x);
    if (idx == 0)
        for (int i = 0; i < size; i++)
            frame[i] -= K3VAL;
}

void set_device(int dev){
    int ldev;
    cudaSetDevice(dev);
    cudaGetDevice(&ldev);
    cudaCheckErrors("set device error");
    if (ldev != dev){
        printf("set device mismatch error\n");
        exit(1);
    }
}

int main(){

    int A = 0;
    int B = 1;
    int framesize = SIZE*sizeof(int);
    int *h_frame;
    int *d_frame_aA, *d_frame_bB;
    int numdev = 0;
    cudaGetDeviceCount(&numdev);
    cudaCheckErrors("can't determine number of devices");
    if (numdev < 2){
        printf("not enough devices!\n");
        return 1;
    }

    set_device(A);
    cudaMalloc((void **) &d_frame_aA, framesize); // stream_a
    cudaMemset(d_frame_aA, 0, framesize);
    set_device(B);
    cudaMalloc((void **) &d_frame_bB, framesize); // stream_b
    cudaMemset(d_frame_bB, 0, framesize);
    cudaHostAlloc((void **) &h_frame, framesize, cudaHostAllocDefault);
    cudaCheckErrors("allocations failure");

    set_device(A);
    cudaStream_t stream_a, stream_b;
    cudaStreamCreate(&stream_a);
    cudaEvent_t absync;
    cudaEventCreate(&absync);
    set_device(B);
    cudaStreamCreate(&stream_b);
    cudaCheckErrors("stream creation failure");

    for (int i = 0; i < SIZE; i++)
        h_frame[i] = 0;

    set_device(A);
    cudaDeviceEnablePeerAccess(B, 0);
    set_device(B);
    cudaDeviceEnablePeerAccess(A, 0);
    cudaCheckErrors("enable peer access fail");

    set_device(A);
    cudaMemcpyAsync(d_frame_aA, h_frame, framesize, cudaMemcpyHostToDevice, stream_a);
    kernel1<<<1,1,0, stream_a>>>(d_frame_aA, SIZE);
    cudaCheckErrors("kernel1 fail");
    // copy the partially processed frame from device A to device B, still in stream_a
    cudaMemcpyPeerAsync(d_frame_bB, B, d_frame_aA, A, framesize, stream_a);
    cudaCheckErrors("memcpypeer fail");
    // record an event on device A's stream once the above work is queued
    cudaEventRecord(absync, stream_a);
    set_device(B);
    // comment out the next line to see the failure
    cudaStreamWaitEvent(stream_b, absync, 0); // device B's stream waits on device A's event
    kernel3<<<1,1,0, stream_b>>>(d_frame_bB, SIZE);
    cudaCheckErrors("main sequence fail");

    cudaMemcpy(h_frame, d_frame_bB, framesize, cudaMemcpyDeviceToHost);
    cudaCheckErrors("results_a memcpy fail");
    for (int i = 0; i < SIZE; i++)
        if (h_frame[i] != (K1VAL - K3VAL)) {
            printf("results error\n");
            return 1;
        }
    printf("success\n");
    return 0;
}
If you run the code as-is, you should get a "success" message. If you comment out the line that forces stream_b (on device B) to wait on stream_a (on device A), you'll see a "results error" message instead. So this demonstrates how to sync a stream on one device to a stream on another device; see the sketch below for how it maps onto your pipeline. Hope it helps. Sorry for the confusion on the first go-round.
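To map this back onto the four processing steps in your question, one iteration of the per-frame loop could look roughly like the following sketch. It reuses the variables from the example above; signal_conditioning, image_processing and visualization are placeholder kernel names standing in for steps 2, 3 and 4, not functions defined anywhere in this code.

set_device(A);
// step 1: copy the frame to device A
cudaMemcpyAsync(d_frame_aA, h_frame, framesize, cudaMemcpyHostToDevice, stream_a);
// step 2: signal conditioning on device A
signal_conditioning<<<1,1,0, stream_a>>>(d_frame_aA, SIZE);
// hand the conditioned frame over to device B, still in stream_a
cudaMemcpyPeerAsync(d_frame_bB, B, d_frame_aA, A, framesize, stream_a);
cudaEventRecord(absync, stream_a);
set_device(B);
// device B does not start until device A has produced this frame
cudaStreamWaitEvent(stream_b, absync, 0);
// steps 3 and 4: image processing and visualization on device B
image_processing<<<1,1,0, stream_b>>>(d_frame_bB, SIZE);
visualization<<<1,1,0, stream_b>>>(d_frame_bB, SIZE);

To get the two-streams-per-device layout shown in your question, you would duplicate this with a second pair of streams, a second pair of device buffers, and a second event, so that consecutive frames can overlap rather than sharing a single event and buffer.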
cudaStreamWaitEvent() enables inter-GPU synchronization, since you can insert a wait on a CUDA event belonging to another device.

So what you need for inter-GPU synchronization between the producer and the consumer is to allocate a few events (at least 2) for each of the 2 GPUs, then have the producer call cudaEventRecord() and the consumer call cudaStreamWaitEvent() on the same event. cudaStreamWaitEvent() inserts a command into the current device's command buffer that causes it to suspend execution until the given event has been recorded.

Below is a code fragment where a peer-to-peer memcpy is implemented using cudaStreamWaitEvent() in this way. Once the pump is primed, the producer and consumer should both be doing PCIe transfers concurrently, each using one of the two staging buffers (which are allocated in portable pinned memory).
cudaError_t
chMemcpyPeerToPeer(
    void *_dst, int dstDevice,
    const void *_src, int srcDevice,
    size_t N )
{
    cudaError_t status;
    char *dst = (char *) _dst;
    const char *src = (const char *) _src;
    int stagingIndex = 0;
    while ( N ) {
        size_t thisCopySize = min( N, STAGING_BUFFER_SIZE );

        // producer: wait until this staging buffer is free,
        // then copy the next chunk device->host into it
        CUDART_CHECK( cudaSetDevice( srcDevice ) );
        CUDART_CHECK( cudaStreamWaitEvent( NULL, g_events[dstDevice][stagingIndex], 0 ) );
        CUDART_CHECK( cudaMemcpyAsync( g_hostBuffers[stagingIndex], src, thisCopySize,
            cudaMemcpyDeviceToHost, NULL ) );
        CUDART_CHECK( cudaEventRecord( g_events[srcDevice][stagingIndex] ) );

        // consumer: wait until the producer's copy is done,
        // then copy the chunk host->device out of the staging buffer
        CUDART_CHECK( cudaSetDevice( dstDevice ) );
        CUDART_CHECK( cudaStreamWaitEvent( NULL, g_events[srcDevice][stagingIndex], 0 ) );
        CUDART_CHECK( cudaMemcpyAsync( dst, g_hostBuffers[stagingIndex], thisCopySize,
            cudaMemcpyHostToDevice, NULL ) );
        CUDART_CHECK( cudaEventRecord( g_events[dstDevice][stagingIndex] ) );

        dst += thisCopySize;
        src += thisCopySize;
        N -= thisCopySize;
        stagingIndex = 1 - stagingIndex;
    }
    // Wait until both devices are done
    CUDART_CHECK( cudaSetDevice( srcDevice ) );
    CUDART_CHECK( cudaDeviceSynchronize() );
    CUDART_CHECK( cudaSetDevice( dstDevice ) );
    CUDART_CHECK( cudaDeviceSynchronize() );
Error:
    return status;
}
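The fragment relies on a few globals (g_hostBuffers, g_events, STAGING_BUFFER_SIZE) and on the CUDART_CHECK goto-on-error macro, all of which are defined in the full source linked below. Purely as an illustrative sketch of my own (the buffer size, flags, and the chInitPeerToPeer name are assumptions, not the handbook's actual code), the setup for two devices could look roughly like this:

#define STAGING_BUFFER_SIZE (1<<20)   // assumed staging-buffer size
void *g_hostBuffers[2];               // two staging buffers so copies can ping-pong
cudaEvent_t g_events[2][2];           // [device][stagingIndex], two devices assumed

cudaError_t
chInitPeerToPeer()
{
    cudaError_t status;
    // staging buffers in portable pinned memory, visible to both devices
    for ( int i = 0; i < 2; i++ ) {
        CUDART_CHECK( cudaHostAlloc( &g_hostBuffers[i], STAGING_BUFFER_SIZE,
            cudaHostAllocPortable ) );
    }
    for ( int dev = 0; dev < 2; dev++ ) {
        CUDART_CHECK( cudaSetDevice( dev ) );
        for ( int i = 0; i < 2; i++ ) {
            CUDART_CHECK( cudaEventCreate( &g_events[dev][i] ) );
            // record each event once so the first wait in the copy loop
            // finds both staging buffers available ("priming the pump")
            CUDART_CHECK( cudaEventRecord( g_events[dev][i] ) );
        }
    }
Error:
    return status;
}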
Full source code is at https://github.com/ArchaeaSoftware/cudahandbook/blob/master/memory/peer2peerMemcpy.cu