Output of cuda program is not what was expected

2020-02-07 12:44发布

#include<cuda_runtime.h>
#include<stdio.h>
#include<cuda.h>
#include<stdlib.h>


/*
 * Launch geometry and buffer sizes shared by the kernel launch and main().
 * One single-thread block is launched per string, GRID_X * GRID_Y in total.
 * WORD_BYTES is the size of each per-string device buffer (terminator included).
 */
enum { GRID_X = 3, GRID_Y = 2, NUM_WORDS = GRID_X * GRID_Y, WORD_BYTES = 10 };

/*
 * Writes the NUL-terminated text "hello" into one string buffer per block.
 *
 * c : device array of pointers; entry (blockIdx.y * gridDim.x + blockIdx.x)
 *     must already point to a writable device buffer of at least 6 bytes.
 *
 * The characters are copied INTO the buffer instead of storing the address of
 * a string literal in c[]: the host later reads the text back with cudaMemcpy,
 * so it must live in memory the host allocated and whose size it knows.
 */
__global__ void setVal(char **c){
    const char text[] = "hello";   /* 6 bytes: five letters plus the NUL */
    char *dst = c[(blockIdx.y * gridDim.x) + blockIdx.x];

    for (int k = 0; k < (int)sizeof(text); ++k) {
        dst[k] = text[k];
    }
}

/* Prints a diagnostic and terminates the program when a CUDA call failed. */
static void checkCuda(cudaError_t err, const char *what){
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

/*
 * Allocates NUM_WORDS device string buffers, lets setVal fill each of them,
 * copies every string back to the host and prints it on its own line.
 * Returns 0 on success; any CUDA failure ends the program with EXIT_FAILURE.
 */
int main(){
    char *x[NUM_WORDS];   /* host array holding the device address of each string buffer */
    char **gpu = NULL;    /* device copy of x: the pointer table the kernel indexes */
    char p[WORD_BYTES];   /* host staging buffer for one string */
    int i;

    /*
     * cudaMalloc stores the new address through the pointer it is handed, and it
     * does so on the host. The second-level buffers therefore have to be allocated
     * into the host array x; &gpu[i] is a device address the host cannot write to.
     */
    for (i = 0; i < NUM_WORDS; i++) {
        x[i] = NULL;
        checkCuda(cudaMalloc((void**)&x[i], WORD_BYTES * sizeof(char)), "cudaMalloc (string buffer)");
        /* Zero the buffer so the bytes after "hello" are defined when copied back. */
        checkCuda(cudaMemset(x[i], 0, WORD_BYTES * sizeof(char)), "cudaMemset (string buffer)");
    }

    /* Upload the table of buffer addresses so the kernel can find its buffer. */
    checkCuda(cudaMalloc((void**)&gpu, NUM_WORDS * sizeof(char *)), "cudaMalloc (pointer table)");
    checkCuda(cudaMemcpy(gpu, x, NUM_WORDS * sizeof(char *), cudaMemcpyHostToDevice),
              "cudaMemcpy (pointer table to device)");

    dim3 grid(GRID_X, GRID_Y);
    setVal<<<grid, 1>>>(gpu);
    checkCuda(cudaGetLastError(), "setVal launch");          /* bad launch configuration */
    checkCuda(cudaDeviceSynchronize(), "setVal execution");  /* faults raised inside the kernel */

    for (i = 0; i < NUM_WORDS; i++) {
        checkCuda(cudaMemcpy(p, x[i], WORD_BYTES * sizeof(char), cudaMemcpyDeviceToHost),
                  "cudaMemcpy (string to host)");
        p[WORD_BYTES - 1] = '\0';   /* never print past the staging buffer */
        printf("%s\n", p);
    }

    for (i = 0; i < NUM_WORDS; i++) {
        checkCuda(cudaFree(x[i]), "cudaFree (string buffer)");
    }
    checkCuda(cudaFree(gpu), "cudaFree (pointer table)");

    getchar();
    return 0;
}

Based on all the suggestions, I revised my code to get the concept right, but it is still not working :(. Any help will be appreciated.

标签: c++ cuda
3条回答
The star\"
2楼-- · 2020-02-07 13:16

There are several problems I'm seeing here. Here are some of the most obvious ones:

First, my guess is that the character string constant "4" is stored in host (CPU) memory, so you would have to copy it explicitly to device (global) memory. Once the string "4" is in device memory, then you can store a pointer to "4" in a device memory value, such as an element of array arr.

Second, the array x you pass to the setValues kernel is also in host memory. Remember that you need to use cudaMalloc to allocate a (global) device memory region, which an on-device kernel can then point to.

查看更多
家丑人穷心不美
3楼-- · 2020-02-07 13:21

Try this -- I tested it on a GTX 285 under CUDA 3.2 -- so it's a bit more restrictive than the current version, but it works.

#include<stdio.h>
#include<string.h>

// Stores the NUL-terminated text "Hola" in the buffer selected by blockIdx.x.
//
// word : device array of pointers; word[blockIdx.x] must reference a writable
//        device buffer of at least 5 bytes. Only blockIdx.x is used for the
//        lookup, so the launch needs one block per entry of word.
__global__ void setValues(char** word)
{
    // Characters to store, terminator included.
    const char greeting[5] = { 'H', 'o', 'l', 'a', '\0' };

    // Destination buffer for this block, fetched from the pointer table.
    volatile char* dst = word[blockIdx.x];

    for ( int k = 0; k < 5; ++k )
    {
        dst[k] = greeting[k];
    }
}

// Number of strings and the size in bytes of each device string buffer.
enum { kNumObjects = 10, kBufferSize = 32 };

// Returns true when err is cudaSuccess; otherwise prints the error to stderr
// (with the file and line of the failing call) and returns false.
static bool holaCudaOk(cudaError_t err, const char* file, int line)
{
    if ( cudaSuccess == err )
    {
        return true;
    }

    fprintf( stderr, "Cuda error in file '%s' in line %i : %s.\n",
            file, line, cudaGetErrorString( err) );
    return false;
}

// Wraps a CUDA runtime call so the report carries the call site.
#define HOLA_CHECK(call) holaCudaOk((call), __FILE__, __LINE__)

// Allocates the device buffers, runs setValues over them and prints every
// resulting string.
//
// h_x : host array of kNumObjects entries, all NULL on entry; receives the
//       device address of each string buffer.
// d_x : NULL on entry; receives the device copy of the pointer table.
//
// Returns false as soon as any CUDA call fails. Whatever was allocated up to
// that point is left in h_x / d_x for the caller to release.
static bool runHolaDemo(char* h_x[], char**& d_x)
{
    if ( !HOLA_CHECK( cudaMalloc( (void**)(&d_x), kNumObjects * sizeof(char*) ) ) )
    {
        return false;
    }

    for ( int i=0; i < kNumObjects; i++ )
    {
        if ( !HOLA_CHECK( cudaMalloc( (void**)(&h_x[i]), kBufferSize * sizeof(char) ) ) )
        {
            return false;
        }
        // %p instead of casting to unsigned long, which is narrower than a
        // pointer on some 64-bit platforms.
        printf("h_x[%d] = %p\n",i,(void*)h_x[i]);
    }

    if ( !HOLA_CHECK( cudaMemcpy( d_x, h_x, kNumObjects*sizeof(char*), cudaMemcpyHostToDevice) ) )
    {
        return false;
    }
    printf("Copied h_x[] to d_x[]\n");

    // Pre-fill the first buffer; the kernel overwrites its first five bytes.
    char msg[] = "Hello World!";
    if ( !HOLA_CHECK( cudaMemcpy( h_x[0], msg, sizeof(msg), cudaMemcpyHostToDevice ) ) )
    {
        return false;
    }

    setValues<<<kNumObjects,1>>>(d_x);

    // cudaGetLastError reports an invalid launch; the synchronize reports
    // faults raised while the kernel ran. cudaDeviceSynchronize replaces the
    // deprecated cudaThreadSynchronize.
    if ( !HOLA_CHECK( cudaGetLastError() ) || !HOLA_CHECK( cudaDeviceSynchronize() ) )
    {
        return false;
    }

    printf("Kernel Completed Successfully.  Woot.\n\n");

    char p[kBufferSize];

    printf("d_x = %p\n", (void*)d_x );
    printf("h_x = %p\n", (void*)h_x );

    // Read the pointer table back from the device.
    if ( !HOLA_CHECK( cudaMemcpy( h_x, d_x, kNumObjects*sizeof(char*), cudaMemcpyDeviceToHost) ) )
    {
        return false;
    }

    printf("d_x = %p\n", (void*)d_x );
    printf("h_x = %p\n", (void*)h_x );

    for ( int i=0; i < kNumObjects; i++ )
    {
        if ( !HOLA_CHECK( cudaMemcpy( p, h_x[i], kBufferSize*sizeof(char), cudaMemcpyDeviceToHost) ) )
        {
            return false;
        }
        p[kBufferSize - 1] = '\0';   // never print past the staging buffer
        printf("%d p[] = %s\n",i,p);
    }

    return true;
}

// Runs the demo, releases every device allocation, waits for a key press and
// returns 0 on success or 1 when a CUDA call failed.
int main()
{
    char*  h_x[kNumObjects] = { 0 };
    char** d_x = 0;

    const bool ok = runHolaDemo( h_x, d_x );

    // cudaFree accepts a null pointer, so this is safe after a partial setup.
    for ( int i=0; i < kNumObjects; i++ )
    {
        cudaFree( h_x[i] );
    }
    cudaFree( d_x );

    getchar();

    return ok ? 0 : 1;
}

As @Jon notes, you can't pass x (as you had declared it) to the GPU, because it's an address that lives on the CPU. In the code above, I create a host array of char* values, each pointing to a buffer allocated on the GPU, and copy that array into a char** table that I also allocated on the GPU. Hope this helps!

查看更多
再贱就再见
4楼-- · 2020-02-07 13:33

The main problem with your code is that you're not allocating any device memory for the setValues call. You can't pass it a pointer to host memory (char *x[6]) and expect that to work; the CUDA kernels have to operate on CUDA memory. You create that memory, then operate on it, then copy it back:

#include <stdio.h>
#include <string.h>
#include <cuda.h>
#include <cuda_runtime.h>

// Stores the character '4' in the element of arr that belongs to this block.
//
// arr : device array with at least gridDim.x * gridDim.y elements; the element
//       index is the block's row-major position in the 2D grid.
__global__ void setValues(char *arr){
    const unsigned int slot = blockIdx.y * gridDim.x + blockIdx.x;

    arr[slot] = '4';
}

// Returns true when err is cudaSuccess; otherwise prints which step failed
// and the CUDA error text to stderr, then returns false.
static bool cudaStepOk(cudaError_t err, const char *what) {
    if (err == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(err));
    return false;
}

// Fills a 6-character device array with '4' (one character per block of a
// 3x2 grid), copies it back and prints it between angle brackets.
// Returns 0 on success and 1 when any CUDA call failed.
int main() {
    const int NCHARS=6;
    char *xd = NULL;
    // Stack buffer with room for the terminator: nothing to free, nothing to
    // null-check, and it starts out as an empty string.
    char p[NCHARS + 1] = { 0 };

    bool ok = cudaStepOk(cudaMalloc((void**)&xd, NCHARS), "cudaMalloc");

    if (ok) {
        dim3 grid(3,2);   // 3 * 2 blocks == NCHARS elements
        setValues<<<grid,1>>>(xd);

        // cudaGetLastError catches an invalid launch; the blocking cudaMemcpy
        // waits for the kernel and reports any fault raised while it ran.
        ok = cudaStepOk(cudaGetLastError(), "setValues launch")
          && cudaStepOk(cudaMemcpy(p, xd, NCHARS, cudaMemcpyDeviceToHost), "cudaMemcpy");
    }

    if (ok) {
        p[NCHARS]='\0';
        printf("<%s>\n", p);
    }

    getchar();

    cudaFree(xd);   // accepts NULL, so this is safe when the allocation failed

    return ok ? 0 : 1;
}
查看更多
登录 后发表回答