Can consecutive CUDA atomic operations on global memory benefit from L2 cache?

Posted 2019-05-30 09:43

Question:

On a cache-enabled CUDA device, does locality of reference across consecutive atomic operations on global memory addresses, issued by a single thread, benefit from the L2 cache?
For example, I have an atomic operation in a CUDA kernel that uses the returned value.

uint a = atomicAnd( &(GM_addr[index]), b );

I'm wondering: if the same thread issues another atomic operation later in the same kernel, and I confine the new operation's address to the 32-byte interval [ &(GM_addr[index & 0xFFFFFFF8]), &(GM_addr[index | 7]) ], will I get a hit in the L2 cache (which has 32-byte cache lines)? Is this speculation correct, or are there exceptions associated with global atomics?
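For illustration, the follow-up access I have in mind would look roughly like this (the way newIndex is chosen here is only a placeholder for "some index in the same 32-byte segment"):

uint a = atomicAnd( &(GM_addr[index]), b );

// Hypothetical second atomic: any index in [index & 0xFFFFFFF8, index | 7]
// falls within the same 8 consecutive uints (one 32-byte segment), so it
// should reuse whatever the first atomic left in L2 -- if the speculation
// above is correct.
uint newIndex = (index & 0xFFFFFFF8) + (a & 7);  // placeholder choice of offset
uint c = atomicAnd( &(GM_addr[newIndex]), b );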

Answer 1:

I'm answering here to share my approach for measuring the impact of L2 cache utilization on global atomics. I am not accepting this answer because I do not yet consider myself aware of what happens with atomics in the L2 cache from an architectural point of view.

I created a simple CUDA program.

#include <stdio.h>

static void HandleError( cudaError_t err, const char *file, int line ) {
    if (err != cudaSuccess) {
        fprintf( stderr, "%s in %s at line %d\n", cudaGetErrorString( err ), file, line );
        exit( EXIT_FAILURE );
    }
}
#define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))

// Non-atomic read-modify-write on consecutive uints: successive iterations
// stay within the same 32-byte segment, so L2 line reuse is possible.
__global__ void address_confined(uint* data, uint nElems) {
    uint tmp, a = 1;
    for(    uint index = 0;
            index < nElems;
            ++index ) {
        tmp = data[index];
        data[index] += a;
        a = tmp;
    }
}

// Non-atomic read-modify-write with 8-uint (32-byte) hops: every iteration
// touches a different 32-byte segment, defeating L2 line reuse.
__global__ void address_not_confined(uint* data, uint nElems) {
    uint tmp, a = 1;
    for(    uint index = 0;
            index < nElems;
            index += 8  ) {
        tmp = data[index];
        data[index] += a;
        a = tmp;
    }
}

// Atomic read-modify-write (returned value used) on consecutive uints.
__global__ void address_confined_atomics(uint* data, uint nElems) {
    uint a = 1;
    for(    uint index = 0;
            index < nElems;
            ++index ) {
        a = atomicAdd ( &(data[index]), a);
    }
}

// Atomic read-modify-write (returned value used) with 8-uint (32-byte) hops.
__global__ void address_not_confined_atomics(uint* data, uint nElems) {
    uint a = 1;
    for(    uint index = 0;
            index < nElems;
            index += 8  ) {
        a = atomicAdd ( &(data[index]), a);
    }
}

int main ( ){

    const unsigned int nElems = 1 << 23;

    unsigned int* dev_data;
    HANDLE_ERROR( cudaMalloc((void**) &(dev_data), (nElems) * sizeof(unsigned int)) );
    HANDLE_ERROR( cudaMemset(dev_data, 0, (nElems) * sizeof(unsigned int)) );

    cudaEvent_t start, stop;
    HANDLE_ERROR( cudaEventCreate(&start) );
    HANDLE_ERROR( cudaEventCreate(&stop) );
    float dt_ms;

    HANDLE_ERROR( cudaEventRecord(start) );
    address_confined<<<1,1>>>(dev_data, nElems>>3);
    HANDLE_ERROR( cudaPeekAtLastError() );
    HANDLE_ERROR( cudaEventRecord(stop) );
    HANDLE_ERROR( cudaDeviceSynchronize() );
    HANDLE_ERROR( cudaEventElapsedTime(&dt_ms, start, stop) );
    fprintf( stdout, "Address-confined global access took %f (ms).\n", dt_ms);

    HANDLE_ERROR( cudaEventRecord(start) );
    address_not_confined<<<1,1>>>(dev_data, nElems);
    HANDLE_ERROR( cudaPeekAtLastError() );
    HANDLE_ERROR( cudaEventRecord(stop) );
    HANDLE_ERROR( cudaDeviceSynchronize() );
    HANDLE_ERROR( cudaEventElapsedTime(&dt_ms, start, stop) );
    fprintf( stdout, "Address-NOT-confined global access took %f (ms).\n", dt_ms);

    HANDLE_ERROR( cudaEventRecord(start) );
    address_confined_atomics<<<1,1>>>(dev_data, nElems>>3);
    HANDLE_ERROR( cudaPeekAtLastError() );
    HANDLE_ERROR( cudaEventRecord(stop) );
    HANDLE_ERROR( cudaDeviceSynchronize() );
    HANDLE_ERROR( cudaEventElapsedTime(&dt_ms, start, stop) );
    fprintf( stdout, "Address-confined atomics took %f (ms).\n", dt_ms);

    HANDLE_ERROR( cudaEventRecord(start) );
    address_not_confined_atomics<<<1,1>>>(dev_data, nElems);
    HANDLE_ERROR( cudaPeekAtLastError() );
    HANDLE_ERROR( cudaEventRecord(stop) );
    HANDLE_ERROR( cudaDeviceSynchronize() );
    HANDLE_ERROR( cudaEventElapsedTime(&dt_ms, start, stop) );
    fprintf( stdout, "Address-NOT-confined atomics took %f (ms).\n", dt_ms);

    HANDLE_ERROR( cudaFree(dev_data) );
    return(EXIT_SUCCESS);

}

In the above four kernels, only one active thread performs a read-modify-write on integers in global memory. I chose a single thread in order to eliminate any effects from other threads. Two of the kernels hop 32 bytes at a time so as to skip whatever has already been cached in L2, while the other two access consecutive integers. Two kernels use atomics and two do not.
I compiled the program for Compute Capability 3.5 with the -O3 flag on Ubuntu 12.04 using CUDA 6.0, and ran it on a GeForce GTX 780 (Kepler GK110).
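For reference, the build command was along these lines (atomics_l2.cu is just a placeholder for whatever the source file is called):

nvcc -O3 -arch=sm_35 -o atomics_l2 atomics_l2.cu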

I got the results below:

Address-confined global access took 286.206207 (ms).
Address-NOT-confined global access took 398.450348 (ms).
Address-confined atomics took 231.808640 (ms).
Address-NOT-confined atomics took 349.534637 (ms).

You can see from the above results that L2 utilization has an equal or even greater effect on atomics than it has on ordinary global memory accesses.

Profiling the atomic kernels gave the results below:

-- address_not_confined_atomics --
L2 Write Transactions: 1048582
L2 Read Transactions: 1069849
Device Memory Write Transactions: 1048578
Device Memory Read Transactions: 1877877
L2 Throughput (Writes): 96.753 (MB/s)
L2 Throughput (Reads): 98.716 (MB/s)

-- address_confined_atomics --
L2 Write Transactions: 1048581
L2 Read Transactions: 1061095
Device Memory Write Transactions: 1046652
Device Memory Read Transactions: 672616
L2 Throughput (Writes): 147.380 (MB/s)
L2 Throughput (Reads): 149.139 (MB/s)
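Per-kernel counters like these can be gathered with nvprof; a command along the following lines should do it (the metric names are nvprof's, so check them against your toolkit version):

nvprof --metrics l2_read_transactions,l2_write_transactions,dram_read_transactions,dram_write_transactions,l2_read_throughput,l2_write_throughput ./atomics_l2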

I do not include the non-atomic profiling results here because they are more or less similar to their corresponding versions above. It seems to me that the performance gain comes from the improved L2 cache throughput, especially since the reduction in kernel execution time is roughly proportional to the increase in L2 cache throughput. In both the atomic and non-atomic versions, the L2 cache reduces the number of read transactions required from device global memory, and hence the overall read latency. To recap, locality in global memory references appears to matter as much for atomic operations (those that use the returned value) as it does for non-atomic accesses. Beware that atomics whose returned value is not used produce a different device instruction, so the evaluation above cannot be relied on for them.
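To make that last caveat concrete, here is a minimal sketch of the two forms (nothing more than an illustration of the difference):

// Returned value consumed: compiles to an atomic (ATOM) instruction that
// brings the old value back to the thread -- the case measured above.
uint old = atomicAnd( &(GM_addr[index]), b );

// Returned value discarded: the compiler is typically free to emit a
// reduction (RED) instruction instead, so the measurements above may not
// carry over to this form.
atomicAnd( &(GM_addr[index]), b );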