FIR filter in CUDA (as a 1D convolution)

2019-07-23 11:29发布

问题:

I'm trying to implement a FIR (Finite Impulse Response) filter in CUDA. My approach is quite simple and looks somewhat like this:

#include <cuda.h>

__global__ void filterData(const float *d_data,
                           const float *d_numerator, 
                           float *d_filteredData, 
                           const int numeratorLength,
                           const int filteredDataLength)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;

    float sum = 0.0f;

    if (i < filteredDataLength)
    {
        for (int j = 0; j < numeratorLength; j++)
        {
            // The first (numeratorLength-1) elements contain the filter state
            sum += d_numerator[j] * d_data[i + numeratorLength - j - 1];
        }
    }

    d_filteredData[i] = sum;
}

int main(void)
{
    // (Skipping error checks to make code more readable)

    int dataLength = 18042;
    int filteredDataLength = 16384;
    int numeratorLength= 1659;

    // Pointers to data, filtered data and filter coefficients
    // (Skipping how these are read into the arrays)
    float *h_data = new float[dataLength];
    float *h_filteredData = new float[filteredDataLength];
    float *h_filter = new float[numeratorLength];


    // Create device pointers
    float *d_data = nullptr;
    cudaMalloc((void **)&d_data, dataLength * sizeof(float));

    float *d_numerator = nullptr;
    cudaMalloc((void **)&d_numerator, numeratorLength * sizeof(float));

    float *d_filteredData = nullptr;
    cudaMalloc((void **)&d_filteredData, filteredDataLength * sizeof(float));


    // Copy data to device
    cudaMemcpy(d_data, h_data, dataLength * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_numerator, h_numerator, numeratorLength * sizeof(float), cudaMemcpyHostToDevice);  

    // Launch the kernel
    int threadsPerBlock = 256;
    int blocksPerGrid = (filteredDataLength + threadsPerBlock - 1) / threadsPerBlock;
    filterData<<<blocksPerGrid,threadsPerBlock>>>(d_data, d_numerator, d_filteredData, numeratorLength, filteredDataLength);

    // Copy results to host
    cudaMemcpy(h_filteredData, d_filteredData, filteredDataLength * sizeof(float), cudaMemcpyDeviceToHost);

    // Clean up
    cudaFree(d_data);
    cudaFree(d_numerator);
    cudaFree(d_filteredData);

    // Do stuff with h_filteredData...

    // Clean up some more
    delete [] h_data;
    delete [] h_filteredData;
    delete [] h_filter;
}

The filter works, but as I'm new to CUDA programming and I'm not sure how to optimize it.

A slight problem that I see is that dataLength, filteredDataLength, and numeratorLength are not known before hand in the application I intend to use the filter in. Also, even though dataLength is a multiple of 32 in the above code, it is not guaranteed to be that in the final application.

When I compare my code above to ArrayFire, my code takes about three times longer to execute.

Does anyone have any ideas on how to speed things up?

EDIT: Have changed all filterLength to numeratorLength.

回答1:

I can suggest the following to speed up your code:

  1. Use the shared memory: it is a tiny cache-like memory but extremely faster than the global card memory. You can find more about it by looking for __shared__ keyword in CUDA documentation. For example, you can pre-fetch the filter numerators and big chunks of data in shared memory, this will significantly enhance your performance. You need to pay extra attention to the data alignment in this case as it really matters and it can slow down your code.
  2. Think about unrolling the for-loop of the numerator sum. You can check the reduce-vector example in CUDA documentation.
  3. You can also think about parallelizing the numerator loop itself by itself. This can be done by adding an extra dimension (say 'y') to your thread-block. You will need to make sum a shared vector as well that has the dimension of numeratorLength. You can also check the reduce vector example on how to quickly take the sum of this vector at the end.


回答2:

You are attempting at calculating the filter output by directly evaluating the 1D convolution through a CUDA kernel.

In the case when the filter impulse response duration is long, one thing you can do to evaluate the filtered input is performing the calculations directly in the conjugate domain using FFTs. Below I'm reporting a sample code using CUDA Thrust and the cuFFT library. It is a direct translation of the Matlab-based example reported at

Low-Pass Filtering by FFT Convolution

Let me disclaim that some optimizations are possible with this code, but I preferred to leave it as it is so that it could be more easily compared to its Matlab's counterpart.

#include <stdio.h>
#include <math.h>

#include <cufft.h>

#include <thrust\device_vector.h>
#include <thrust\sequence.h>

#define pi_f  3.14159265358979f                 // Greek pi in single precision

/****************/
/* SIN OPERATOR */
/****************/
class sin_op {

    float fk_, Fs_;

    public:

        sin_op(float fk, float Fs) { fk_ = fk; Fs_ = Fs; }

        __host__ __device__ float operator()(float x) const { return sin(2.f*pi_f*x*fk_/Fs_); }
};

/*****************/
/* SINC OPERATOR */
/*****************/
class sinc_op {

    float fc_, Fs_;

    public:

        sinc_op(float fc, float Fs) { fc_ = fc; Fs_ = Fs; }

        __host__ __device__ float operator()(float x) const 
        {
            if (x==0)   return (2.f*fc_/Fs_);
            else            return (2.f*fc_/Fs_)*sin(2.f*pi_f*fc_*x/Fs_)/(2.f*pi_f*fc_*x/Fs_);
        }
};

/********************/
/* HAMMING OPERATOR */
/********************/
class hamming_op {

    int L_;

    public:

        hamming_op(int L) { L_ = L; }

        __host__ __device__ float operator()(int x) const 
        {
            return 0.54-0.46*cos(2.f*pi_f*x/(L_-1));
        }
};


/*********************************/
/* MULTIPLY CUFFTCOMPLEX NUMBERS */
/*********************************/
struct multiply_cufftComplex {
    __device__ cufftComplex operator()(const cufftComplex& a, const cufftComplex& b) const {
        cufftComplex r;
        r.x = a.x * b.x - a.y * b.y;
        r.y = a.x * b.y + a.y * b.x;
        return r;
    }
};

/********/
/* MAIN */
/********/
void main(){

    // Signal parameters:
    int M = 256;                            // signal length
    const int N = 4;
    float f[N] = { 440, 880, 1000, 2000 };              // frequencies
    float Fs = 5000.;                       // sampling rate

    // Generate a signal by adding up sinusoids:
    thrust::device_vector<float> d_x(M,0.f);            // pre-allocate 'accumulator'
    thrust::device_vector<float> d_n(M);                // discrete-time grid
    thrust::sequence(d_n.begin(), d_n.end(), 0, 1);

    thrust::device_vector<float> d_temp(M);
    for (int i=0; i<N; i++) { 
        float fk = f[i];
        thrust::transform(d_n.begin(), d_n.end(), d_temp.begin(), sin_op(fk,Fs));
        thrust::transform(d_temp.begin(), d_temp.end(), d_x.begin(), d_x.begin(), thrust::plus<float>()); 
    }

    // Filter parameters:
    int L = 257;                        // filter length
    float fc = 600.f;                   // cutoff frequency

    // Design the filter using the window method:
    thrust::device_vector<float> d_hsupp(L);            
    thrust::sequence(d_hsupp.begin(), d_hsupp.end(), -(L-1)/2, 1);
    thrust::device_vector<float> d_hideal(L);           
    thrust::transform(d_hsupp.begin(), d_hsupp.end(), d_hideal.begin(), sinc_op(fc,Fs));
    thrust::device_vector<float> d_l(L);                
    thrust::sequence(d_l.begin(), d_l.end(), 0, 1);
    thrust::device_vector<float> d_h(L);                
    thrust::transform(d_l.begin(), d_l.end(), d_h.begin(), hamming_op(L));
    // h is our filter
    thrust::transform(d_hideal.begin(), d_hideal.end(), d_h.begin(), d_h.begin(), thrust::multiplies<float>());  

    // --- Choose the next power of 2 greater than L+M-1
    int Nfft = pow(2,(ceil(log2((float)(L+M-1))))); // or 2^nextpow2(L+M-1)

    // Zero pad the signal and impulse response:
    thrust::device_vector<float> d_xzp(Nfft,0.f);
    thrust::device_vector<float> d_hzp(Nfft,0.f);
    thrust::copy(d_x.begin(), d_x.end(), d_xzp.begin());
    thrust::copy(d_h.begin(), d_h.end(), d_hzp.begin());

    // Transform the signal and the filter:
    cufftHandle plan;
    cufftPlan1d(&plan, Nfft, CUFFT_R2C, 1);
    thrust::device_vector<cufftComplex> d_X(Nfft/2+1);
    thrust::device_vector<cufftComplex> d_H(Nfft/2+1);
    cufftExecR2C(plan, (cufftReal*)thrust::raw_pointer_cast(d_xzp.data()), (cufftComplex*)thrust::raw_pointer_cast(d_X.data()));
    cufftExecR2C(plan, (cufftReal*)thrust::raw_pointer_cast(d_hzp.data()), (cufftComplex*)thrust::raw_pointer_cast(d_H.data()));

    thrust::device_vector<cufftComplex> d_Y(Nfft/2+1);
    thrust::transform(d_X.begin(), d_X.end(), d_H.begin(), d_Y.begin(), multiply_cufftComplex());  

    cufftPlan1d(&plan, Nfft, CUFFT_C2R, 1);
    thrust::device_vector<float> d_y(Nfft);
    cufftExecC2R(plan, (cufftComplex*)thrust::raw_pointer_cast(d_Y.data()), (cufftReal*)thrust::raw_pointer_cast(d_y.data()));

    getchar();

}


回答3:

Besides my other answer which I expect will be more convenient for convolution kernels with long duration, below I'm reporting a different implementation, which is more compliant with the OP's initial attempt and I expect will be more convenient for convolution kernels with short duration. Such an implementation is based on a hand-written kernel exploiting caching in shared memory. More details can be found in the book by D.B. Kirk and W.-m. W. Hwu

Programming Massively Parallel Processors, Second Edition: A Hands-on Approach

#include <stdio.h>
#include <stdlib.h>

#include "TimingGPU.cuh"
#include "Utilities.cuh"

#define RG          10
#define BLOCKSIZE   8

/****************/
/* CPU FUNCTION */
/****************/
void h_convolution_1D(const float * __restrict__ h_Signal, const float * __restrict__ h_ConvKernel, float * __restrict__ h_Result_CPU, 
                      const int N, const int K) {

    for (int i = 0; i < N; i++) {

        float temp = 0.f;

        int N_start_point = i - (K / 2);

        for (int j = 0; j < K; j++) if (N_start_point + j >= 0 && N_start_point + j < N) {
            temp += h_Signal[N_start_point+ j] * h_ConvKernel[j];
        }

        h_Result_CPU[i] = temp;
    }
}

/********************/
/* BASIC GPU KERNEL */
/********************/
__global__ void d_convolution_1D_basic(const float * __restrict__ d_Signal, const float * __restrict__ d_ConvKernel, float * __restrict__ d_Result_GPU, 
                                       const int N, const int K) {

    int i = blockIdx.x * blockDim.x + threadIdx.x;

    float temp = 0.f;

    int N_start_point = i - (K / 2);

    for (int j = 0; j < K; j++) if (N_start_point + j >= 0 && N_start_point + j < N) {
        temp += d_Signal[N_start_point+ j] * d_ConvKernel[j];
    }

    d_Result_GPU[i] = temp;
}

/***************************/
/* GPU KERNEL WITH CACHING */
/***************************/
__global__ void d_convolution_1D_caching(const float * __restrict__ d_Signal, const float * __restrict__ d_ConvKernel, float * __restrict__ d_Result_GPU, 
                                         const int N, const int K) {

    int i = blockIdx.x * blockDim.x + threadIdx.x;

    __shared__ float d_Tile[BLOCKSIZE];

    d_Tile[threadIdx.x] = d_Signal[i];
    __syncthreads();

    float temp = 0.f;

    int N_start_point = i - (K / 2);

    for (int j = 0; j < K; j++) if (N_start_point + j >= 0 && N_start_point + j < N) {

            if ((N_start_point + j >= blockIdx.x * blockDim.x) && (N_start_point + j < (blockIdx.x + 1) * blockDim.x))

                // --- The signal element is in the tile loaded in the shared memory
                temp += d_Tile[threadIdx.x + j - (K / 2)] * d_ConvKernel[j]; 

            else

                // --- The signal element is not in the tile loaded in the shared memory
                temp += d_Signal[N_start_point + j] * d_ConvKernel[j];

    }

    d_Result_GPU[i] = temp;
}

/********/
/* MAIN */
/********/
int main(){

    const int N = 15;           // --- Signal length
    const int K = 5;            // --- Convolution kernel length

    float *h_Signal         = (float *)malloc(N * sizeof(float));
    float *h_Result_CPU     = (float *)malloc(N * sizeof(float));
    float *h_Result_GPU     = (float *)malloc(N * sizeof(float));
    float *h_ConvKernel     = (float *)malloc(K * sizeof(float));

    float *d_Signal;        gpuErrchk(cudaMalloc(&d_Signal,     N * sizeof(float)));
    float *d_Result_GPU;    gpuErrchk(cudaMalloc(&d_Result_GPU, N * sizeof(float)));
    float *d_ConvKernel;    gpuErrchk(cudaMalloc(&d_ConvKernel, K * sizeof(float)));

    for (int i=0; i < N; i++) { h_Signal[i] = (float)(rand() % RG); }

    for (int i=0; i < K; i++) { h_ConvKernel[i] = (float)(rand() % RG); }

    gpuErrchk(cudaMemcpy(d_Signal,      h_Signal,       N * sizeof(float), cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_ConvKernel,  h_ConvKernel,   K * sizeof(float), cudaMemcpyHostToDevice));

    h_convolution_1D(h_Signal, h_ConvKernel, h_Result_CPU, N, K);

    d_convolution_1D_basic<<<iDivUp(N, BLOCKSIZE), BLOCKSIZE>>>(d_Signal, d_ConvKernel, d_Result_GPU, N, K);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());

    gpuErrchk(cudaMemcpy(h_Result_GPU, d_Result_GPU, N * sizeof(float), cudaMemcpyDeviceToHost));

    for (int i = 0; i < N; i++) if (h_Result_CPU[i] != h_Result_GPU[i]) {printf("mismatch2 at %d, cpu: %d, gpu %d\n", i, h_Result_CPU[i], h_Result_GPU[i]); return 1;}

    printf("Test basic passed\n");

    d_convolution_1D_caching<<<iDivUp(N, BLOCKSIZE), BLOCKSIZE>>>(d_Signal, d_ConvKernel, d_Result_GPU, N, K);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());

    gpuErrchk(cudaMemcpy(h_Result_GPU, d_Result_GPU, N * sizeof(float), cudaMemcpyDeviceToHost));

    for (int i = 0; i < N; i++) if (h_Result_CPU[i] != h_Result_GPU[i]) {printf("mismatch2 at %d, cpu: %d, gpu %d\n", i, h_Result_CPU[i], h_Result_GPU[i]); return 1;}

    printf("Test caching passed\n");

    return 0;
}