I have the following code that has bug when doing the inverse FFT. The forward FFT works as I printed the output and verified it. But the inverse does not seem to. Any ideas? Does it look like I have missed a concept?
Code - http://pastebin.com/iZYtdcqR
EDIT - I have essentially rewritten the code that comes with the CUDA toolkit samples. I am trying to perform a convolution using FFT but with a modified algorithm (DIF actually.)
EDIT2 - dding code to the question.
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda_runtime.h>
#include <cufft.h>
typedef enum signaltype {REAL, COMPLEX} signal;
typedef float2 Complex;
void
printData(Complex *a, int size, char *msg) {
if (msg == "") printf("\n");
else printf("%s\n", msg);
for (int i = 0; i < size; i++)
printf("%f %f\n", a[i].x, a[i].y);
}
void
normData(Complex *a, int size, float norm) {
for (int i = 0; i < size; i++) {
a[i].x /= norm;
a[i].y /= norm;
}
}
void
randomFill(Complex *h_signal, int size, int flag) {
// Real signal.
if (flag == REAL) {
for (int i = 0; i < size; i++) {
h_signal[i].x = rand() / (float) RAND_MAX;
h_signal[i].y = 0;
}
}
}
// FFT a signal that's on the _DEVICE_.
void
signalFFT(Complex *d_signal, int signal_size) {
cufftHandle plan;
if (cufftPlan1d(&plan, signal_size, CUFFT_C2C, 1) != CUFFT_SUCCESS) {
printf("Failed to plan FFT\n");
exit(0);
}
// Execute the plan.
if (cufftExecC2C(plan, (cufftComplex *) d_signal, (cufftComplex *) d_signal, CUFFT_FORWARD) != CUFFT_SUCCESS) {
printf ("Failed Executing FFT\n");
exit(0);
}
}
void
signalIFFT(Complex *d_signal, int signal_size) {
cufftHandle plan;
if (cufftPlan1d(&plan, signal_size, CUFFT_C2C, 1) != CUFFT_SUCCESS) {
printf("Failed to plan IFFT\n");
exit(0);
}
// Execute the plan.
if (cufftExecC2C(plan, (cufftComplex *) d_signal, (cufftComplex *) d_signal, CUFFT_INVERSE) != CUFFT_SUCCESS) {
printf ("Failed Executing IFFT\n");
exit(0);
}
}
int main()
{
Complex *h_signal, *d_signal1;
int alloc_size, i;
alloc_size = 16;
// Kernel Block and Grid Size.
const dim3 blockSize(16, 16, 1);
const dim3 gridSize(alloc_size / 16 + 1, alloc_size / 16 + 1, 1);
h_signal = (Complex *) malloc(sizeof(Complex) * alloc_size);
cudaMalloc(&d_signal1, sizeof(Complex) * alloc_size);
if (cudaGetLastError() != cudaSuccess){
printf("Cuda error: Failed to allocate\n");
exit(0);
}
//cudaMalloc(&d_signal2, sizeof(Complex) * alloc_size);
// Add random data to signal.
randomFill(h_signal, alloc_size, REAL);
printData(h_signal, alloc_size, "Random H1");
cudaMemcpy(d_signal1, h_signal, sizeof(Complex) * alloc_size, cudaMemcpyHostToDevice);
signalFFT(d_signal1, alloc_size);
signalIFFT(d_signal1, alloc_size);
cudaDeviceSynchronize();
cudaMemcpy(h_signal, d_signal1, sizeof(Complex) * alloc_size, cudaMemcpyDeviceToHost);
printData(h_signal, alloc_size, "IFFT");
return 0;
}