Thrust inside user-written kernels

Posted 2020-01-23 07:23

I am a newbie to Thrust. I see that all Thrust presentations and examples only show host code.

I would like to know if I can pass a device_vector to my own kernel? How? If yes, what are the operations permitted on it inside kernel/device code?

Tags: cuda thrust
4 answers
够拽才男人
#2 · 2020-01-23 07:44

If you mean to use data allocated/processed by Thrust, yes you can: just get the raw pointer to the allocated data.

int * raw_ptr = thrust::raw_pointer_cast(dev_ptr);
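For example, a minimal sketch of the whole pattern might look like this (the kernel myKernel and the sizes are made up for illustration):

#include <thrust/device_vector.h>

// Hypothetical kernel that operates on the raw device pointer
__global__ void myKernel(int *data, int n) {
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    if (i < n) data[i] *= 2;
}

int main() {

    thrust::device_vector<int> d_vec(256, 1);                  // allocated and filled by Thrust
    int *raw_ptr = thrust::raw_pointer_cast(d_vec.data());     // plain device pointer into the vector

    myKernel<<<(256 + 127) / 128, 128>>>(raw_ptr, 256);
    cudaDeviceSynchronize();

    return 0;
}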

If you want to allocate Thrust vectors inside a kernel, I have never tried it and I don't think it will work; even if it did, I don't think it would provide any benefit.

够拽才男人
#3 · 2020-01-23 07:47

As it was originally written, Thrust is purely a host-side abstraction. It cannot be used inside kernels. You can pass the device memory encapsulated inside a thrust::device_vector to your own kernel like this:

thrust::device_vector< Foo > fooVector;
// Do something thrust-y with fooVector

Foo* fooArray = thrust::raw_pointer_cast( &fooVector[0] );

// Pass raw array and its size to kernel
someKernelCall<<< x, y >>>( fooArray, fooVector.size() );

You can also use device memory not allocated by Thrust within Thrust algorithms by constructing a thrust::device_ptr from the bare CUDA device memory pointer.
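A minimal sketch of that reverse direction, using thrust::fill as a stand-in algorithm (the buffer size is arbitrary):

#include <thrust/device_ptr.h>
#include <thrust/fill.h>

int main() {

    // Raw CUDA allocation, not owned by Thrust
    float *d_raw;
    cudaMalloc(&d_raw, 1024 * sizeof(float));

    // Wrap the bare pointer so host-side Thrust algorithms can consume it
    thrust::device_ptr<float> d_wrapped(d_raw);
    thrust::fill(d_wrapped, d_wrapped + 1024, 1.f);

    cudaFree(d_raw);

    return 0;
}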

Edited four and a half years later to add that, as per @JackOLantern's answer, Thrust 1.8 adds a sequential execution policy, which means you can run single-threaded versions of Thrust's algorithms on the device. Note that it still isn't possible to directly pass a thrust::device_vector to a kernel, and device vectors can't be directly used in device code.

Note that it is also possible to use the thrust::device execution policy in some cases to have parallel Thrust execution launched from a kernel as a child grid. This requires separate compilation/device linkage and hardware which supports dynamic parallelism; a typical compile line is shown below. I am not certain whether this is actually supported in all Thrust algorithms, but it certainly works with some.
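For reference, a separate-compilation build enabling dynamic parallelism usually looks something like the following (the architecture flag and file names are placeholders, not taken from the answer):

nvcc -arch=sm_35 -rdc=true my_kernels.cu -o my_app -lcudadevrt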

做个烂人
#4 · 2020-01-23 07:53

I would like to provide an updated answer to this question.

Starting from Thrust 1.8, CUDA Thrust primitives can be combined with the thrust::seq execution policy to run sequentially within a single CUDA thread (or sequentially within a single CPU thread). Below, an example is reported.

If, instead of running sequentially within a single thread, you want parallel execution, then you may consider using CUB, which provides reduction routines that can be called from within a threadblock, provided that your card supports dynamic parallelism; a block-wide sketch is given below.
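For illustration, a block-wide sum with cub::BlockReduce might look roughly like the following sketch (the kernel name and the block size of 256 are assumptions, not part of the original answer):

#include <cub/cub.cuh>

// Hypothetical kernel: each block of 256 threads cooperatively sums its chunk of d_in
__global__ void blockSumKernel(const float * __restrict__ d_in, float * __restrict__ d_out, const int N) {

    typedef cub::BlockReduce<float, 256> BlockReduce;
    __shared__ typename BlockReduce::TempStorage temp_storage;

    const int tid = threadIdx.x + blockIdx.x * blockDim.x;
    float val = (tid < N) ? d_in[tid] : 0.f;

    // Cooperative reduction across the threadblock; only thread 0 receives the valid result
    float block_sum = BlockReduce(temp_storage).Sum(val);

    if (threadIdx.x == 0) d_out[blockIdx.x] = block_sum;
}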

Here is the example with Thrust:

#include <stdio.h>

#include <thrust/reduce.h>
#include <thrust/execution_policy.h>

/********************/
/* CUDA ERROR CHECK */
/********************/
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess) 
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}

__global__ void test(float *d_A, int N) {

    float sum = thrust::reduce(thrust::seq, d_A, d_A + N);

    printf("Device side result = %f\n", sum);

}

int main() {

    const int N = 16;

    float *h_A = (float*)malloc(N * sizeof(float));
    float sum = 0.f;
    for (int i=0; i<N; i++) {
        h_A[i] = i;
        sum = sum + h_A[i];
    }
    printf("Host side result = %f\n", sum);

    float *d_A; gpuErrchk(cudaMalloc((void**)&d_A, N * sizeof(float)));
    gpuErrchk(cudaMemcpy(d_A, h_A, N * sizeof(float), cudaMemcpyHostToDevice));

    test<<<1,1>>>(d_A, N);
    gpuErrchk(cudaPeekAtLastError());
    // Synchronize so the device-side printf output is flushed before the program exits
    gpuErrchk(cudaDeviceSynchronize());

    return 0;

}
地球回转人心会变
#5 · 2020-01-23 08:06

This is an update to my previous answer.

Starting from Thrust 1.8.1, CUDA Thrust primitives can be combined with the thrust::device execution policy to run in parallel within a single CUDA thread exploiting CUDA dynamic parallelism. Below, an example is reported.

#include <stdio.h>

#include <thrust/reduce.h>
#include <thrust/execution_policy.h>

#include "TimingGPU.cuh"
#include "Utilities.cuh"

#define BLOCKSIZE_1D    256
#define BLOCKSIZE_2D_X  32
#define BLOCKSIZE_2D_Y  32

/*************************/
/* TEST KERNEL FUNCTIONS */
/*************************/
__global__ void test1(const float * __restrict__ d_data, float * __restrict__ d_results, const int Nrows, const int Ncols) {

    const unsigned int tid = threadIdx.x + blockDim.x * blockIdx.x;

    if (tid < Nrows) d_results[tid] = thrust::reduce(thrust::seq, d_data + tid * Ncols, d_data + (tid + 1) * Ncols);

}

__global__ void test2(const float * __restrict__ d_data, float * __restrict__ d_results, const int Nrows, const int Ncols) {

    const unsigned int tid = threadIdx.x + blockDim.x * blockIdx.x;

    if (tid < Nrows) d_results[tid] = thrust::reduce(thrust::device, d_data + tid * Ncols, d_data + (tid + 1) * Ncols);

}

/********/
/* MAIN */
/********/
int main() {

    const int Nrows = 64;
    const int Ncols = 2048;

    gpuErrchk(cudaFree(0));

//    size_t DevQueue;
//    gpuErrchk(cudaDeviceGetLimit(&DevQueue, cudaLimitDevRuntimePendingLaunchCount));
//    DevQueue *= 128;
//    gpuErrchk(cudaDeviceSetLimit(cudaLimitDevRuntimePendingLaunchCount, DevQueue));

    float *h_data       = (float *)malloc(Nrows * Ncols * sizeof(float));
    float *h_results    = (float *)malloc(Nrows *         sizeof(float));
    float *h_results1   = (float *)malloc(Nrows *         sizeof(float));
    float *h_results2   = (float *)malloc(Nrows *         sizeof(float));
    float sum = 0.f;
    for (int i=0; i<Nrows; i++) {
        h_results[i] = 0.f;
        for (int j=0; j<Ncols; j++) {
            h_data[i*Ncols+j] = i;
            h_results[i] = h_results[i] + h_data[i*Ncols+j];
        }
    }

    TimingGPU timerGPU;

    float *d_data;          gpuErrchk(cudaMalloc((void**)&d_data,     Nrows * Ncols * sizeof(float)));
    float *d_results1;      gpuErrchk(cudaMalloc((void**)&d_results1, Nrows         * sizeof(float)));
    float *d_results2;      gpuErrchk(cudaMalloc((void**)&d_results2, Nrows         * sizeof(float)));
    gpuErrchk(cudaMemcpy(d_data, h_data, Nrows * Ncols * sizeof(float), cudaMemcpyHostToDevice));

    timerGPU.StartCounter();
    test1<<<iDivUp(Nrows, BLOCKSIZE_1D), BLOCKSIZE_1D>>>(d_data, d_results1, Nrows, Ncols);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
    printf("Timing approach nr. 1 = %f\n", timerGPU.GetCounter());

    gpuErrchk(cudaMemcpy(h_results1, d_results1, Nrows * sizeof(float), cudaMemcpyDeviceToHost));

    for (int i=0; i<Nrows; i++) {
        if (h_results1[i] != h_results[i]) {
            printf("Approach nr. 1; Error at i = %i; h_results1 = %f; h_results = %f", i, h_results1[i], h_results[i]);
            return 0;
        }
    }

    timerGPU.StartCounter();
    test2<<<iDivUp(Nrows, BLOCKSIZE_1D), BLOCKSIZE_1D>>>(d_data, d_results2, Nrows, Ncols);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
    printf("Timing approach nr. 2 = %f\n", timerGPU.GetCounter());

    gpuErrchk(cudaMemcpy(h_results2, d_results2, Nrows * sizeof(float), cudaMemcpyDeviceToHost));

    for (int i=0; i<Nrows; i++) {
        if (h_results2[i] != h_results[i]) {
            printf("Approach nr. 2; Error at i = %i; h_results2 = %f; h_results = %f", i, h_results2[i], h_results[i]);
            return 0;
        }
    }

    printf("Test passed!\n");

}

The above example performs row-wise reductions of a matrix in the same sense as Reduce matrix rows with CUDA, but it is done differently from that post, namely, by calling CUDA Thrust primitives directly from user-written kernels. The example also compares the performance of the same operation under two execution policies, thrust::seq and thrust::device. The graphs below show the difference in performance.

[Timings graph]

[Speedups graph]

The performance has been evaluated on a Kepler K20c and on a Maxwell GeForce GTX 850M.
