nvlink : error : Undefined reference when trying to write to a file


Question:

I'm getting this really bizarre (from my perspective) error in VS2012 when trying to compile some code that had previously worked. I use CUDA to generate a 2D array of data, and my goal is to write it to a text file... but when I append this snippet of code from the example to the end of my main function

// basic file operations
#include <iostream>
#include <fstream>
using namespace std;

int main () {
  ofstream myfile;
  myfile.open ("example.txt");
  myfile << "Writing this to a file.\n";
  myfile.close();
  return 0;
}

I get

1>  C:\Users\Karsten Chu\New Google Drive\Research\Visual Studio 2012\Projects\Dynamic Parallelism Test\Dynamic Parallelism Test>"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\bin\nvcc.exe" -dlink -o "x64\Debug\Dynamic Parallelism Test.device-link.obj" -Xcompiler "/EHsc /W3 /nologo /Od /Zi /RTC1 /MDd  " -L"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\lib\x64" cuda.lib cudart.lib cudadevrt.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib  -gencode=arch=compute_35,code=sm_35 -G --machine 64 "x64\Debug\CUDA Test 2.cu.obj" "x64\Debug\CUDA Test.cu.obj" "x64\Debug\RKF5 Prototype 2.cu.obj" x64\Debug\version.cu.obj 
1>nvlink : error : Undefined reference to '_ZTVSo__St14basic_ofstreamIcSt11char_traitsIcEE' in 'x64/Debug/RKF5 Prototype 2.cu.obj'
1>nvlink : error : Undefined reference to '_ZTVSt9basic_iosIcSt11char_traitsIcEE__So__St14basic_ofstreamIcS1_E' in 'x64/Debug/RKF5 Prototype 2.cu.obj'
1>C:\Program Files (x86)\MSBuild\Microsoft.Cpp\v4.0\V110\BuildCustomizations\CUDA 5.5.targets(668,9): error MSB3721: The command ""C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\bin\nvcc.exe" -dlink -o "x64\Debug\Dynamic Parallelism Test.device-link.obj" -Xcompiler "/EHsc /W3 /nologo /Od /Zi /RTC1 /MDd  " -L"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v5.5\lib\x64" cuda.lib cudart.lib cudadevrt.lib kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib  -gencode=arch=compute_35,code=sm_35 -G --machine 64 "x64\Debug\CUDA Test 2.cu.obj" "x64\Debug\CUDA Test.cu.obj" "x64\Debug\RKF5 Prototype 2.cu.obj" x64\Debug\version.cu.obj" exited with code -1.
========== Build: 0 succeeded, 1 failed, 0 up-to-date, 0 skipped ==========

Now, my understanding is that nvlink handles the CUDA linking for that part of my code... so why are these two aspects of my code interfering? I thought these errors meant either that a library my project settings needs is missing, or that the parameters of a function definition and its prototype don't match up.

EDIT

Here are the #includes and main() of my code... all the CUDA stuff is posted in my earlier question. I'm not sure how to get my compiler options other than from the error output above. The project is just a Win32 Console Application and I've got only one source file, this RKF5 Prototype 2.cu file. I also tried a separate, new project, and the code compiled fine for me there.

#include <cuda.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
//#include <stdio.h>
#include <iostream>
#include <fstream>
//#include <iomanip>                        //display 2 decimal places
#include <math.h>
using namespace std;

__global__ void rkf5(double*, double*, double*, double*, double*, double*, double*, double*, double*, double*, double*, int*, int*, size_t, double*, double*, double*);
__global__ void calcK(double*, double*, double*);
__global__ void k1(double*, double*, double*);
__global__ void k2(double*, double*, double*);
__global__ void k3(double*, double*, double*);
__global__ void k4(double*, double*, double*);
__global__ void k5(double*, double*, double*);
__global__ void k6(double*, double*, double*);
__global__ void arrAdd(double*, double*, double*);
__global__ void arrSub(double*, double*, double*);
__global__ void arrMult(double*, double*, double*);
__global__ void arrInit(double*, double);
__global__ void arrCopy(double*, double*);
__device__ void setup(double , double*, double*, double*, double*, int*);
__device__ double flux(int, double*) ;
__device__ double knowles_flux(int, double*);
__device__ void calcStepSize(double*, double*, double*, double*, double*, double*, double*, double*, double*, double*, double*, int*);
__global__ void storeConcs(double*, size_t, double*, int);
__global__ void takeFourthOrderStep(double*, double*, double*, double*, double*, double*, double*);
__global__ void takeFifthOrderStep(double*, double*, double*, double*, double*, double*, double*, double*);

//Error checking that I don't understand yet.
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
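//gpuErrchk wraps a CUDA runtime call: it forwards the returned cudaError_t, along with
//the call site's file name and line number, to gpuAssert below, which prints the error
//string and (by default) aborts the program.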
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess) 
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}

//Main program.
int main(int argc, char** argv)
{
    //std::cout << std::fixed;          //display 2 decimal places
    //std::cout << std::setprecision(12);   //display 2 decimal places
    const int maxlength = 1;            //Number of discrete concentrations we are tracking.
    double concs[maxlength];            //Meant to store the current concentrations 
    double temp1[maxlength];                //Used as a bin to store products of Butcher's tableau and k values.
    double temp2[maxlength];                //Used as a bin to store products of Butcher's tableau and k values.
    double tempsum[maxlength];          //Used as a bin to store cumulative sum of tableau and k values
    double k1s[maxlength];
    double k2s[maxlength];
    double k3s[maxlength];
    double k4s[maxlength];
    double k5s[maxlength];
    double k6s[maxlength];
    const int numpoints = 40;       
    double to = 0;
    double tf = 1;
    //double dt = static_cast<double>(.5)/static_cast<double>(64);
    double dt = (tf-to)/static_cast<double>(numpoints);
    double mo = 1;
    double concStorage[maxlength][numpoints];   //Stores concs vs. time                     

    //Initialize all the arrays on the host to ensure arrays of 0's are sent to the device.
    //Also, here is where we can seed the system.
    std::cout<<dt;
    std::cout<<"\n";
    concs[0]=mo;
    std::cout<<concs[0];
    std::cout<<" ";
    for (int i=0; i<maxlength; i++)
    {
        for (int j=0; j<numpoints; j++)
            concStorage[i][j]=0;
        concs[i]=0;
        temp1[i]=0;
        temp2[i]=0;
        tempsum[i]=0;
        k1s[i]=0;
        k2s[i]=0;
        k3s[i]=0;
        k4s[i]=0;
        k5s[i]=0;
        k6s[i]=0;
        std::cout<<concs[i];
        std::cout<<" ";
    }
    concs[0]=mo;
    std::cout<<"\n";

    //Define all the pointers to device array memory addresses. These contain the on-GPU
    //addresses of all the data we're generating/using.
    double *d_concs;
    double *d_temp1;
    double *d_temp2;
    double *d_tempsum;
    double *d_k1s;
    double *d_k2s;
    double *d_k3s;
    double *d_k4s;
    double *d_k5s;
    double *d_k6s;
    double *d_dt;
    int *d_maxlength;
    int *d_numpoints;
    double *d_to;
    double *d_tf;
    double *d_concStorage;

    //Calculate all the sizes of the arrays in order to allocate the proper amount of memory on the GPU.
    size_t size_concs = sizeof(concs);
    size_t size_temp1 = sizeof(temp1);
    size_t size_temp2 = sizeof(temp2);
    size_t size_tempsum = sizeof(tempsum);
    size_t size_ks = sizeof(k1s);
    size_t size_maxlength = sizeof(maxlength);
    size_t size_numpoints = sizeof(numpoints);
    size_t size_dt = sizeof(dt);
    size_t size_to = sizeof(to);
    size_t size_tf = sizeof(tf);
    size_t h_pitch = numpoints*sizeof(double);
    size_t d_pitch;

    //Calculate the "pitch" of the 2D array.  The pitch is basically the length of a 2D array's row.  It's larger
    //than the actual row full of data due to hardware alignment.  We therefore use the pitch instead of the data
    //size to traverse the array.
    gpuErrchk(cudaMallocPitch( (void**)&d_concStorage, &d_pitch, numpoints * sizeof(double), maxlength)); 
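    //(Illustrative aside, not from the original code: inside a kernel, row i of a pitched
    //allocation is reached by offsetting the base pointer by i*d_pitch bytes, e.g.
    //  double* row = (double*)((char*)d_concStorage + i * d_pitch);
    //and row[j] then reads element j of that row.)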

    //Allocate memory on the GPU for all the arrrays we're going to use in the integrator.
    gpuErrchk(cudaMalloc((void**)&d_concs, size_concs));
    gpuErrchk(cudaMalloc((void**)&d_temp1, size_temp1));
    gpuErrchk(cudaMalloc((void**)&d_temp2, size_temp2));
    gpuErrchk(cudaMalloc((void**)&d_tempsum, size_tempsum));
    gpuErrchk(cudaMalloc((void**)&d_k1s, size_ks));
    gpuErrchk(cudaMalloc((void**)&d_k2s, size_ks));
    gpuErrchk(cudaMalloc((void**)&d_k3s, size_ks));
    gpuErrchk(cudaMalloc((void**)&d_k4s, size_ks));
    gpuErrchk(cudaMalloc((void**)&d_k5s, size_ks));
    gpuErrchk(cudaMalloc((void**)&d_k6s, size_ks));
    gpuErrchk(cudaMalloc((void**)&d_maxlength, size_maxlength));
    gpuErrchk(cudaMalloc((void**)&d_numpoints, size_numpoints));
    gpuErrchk(cudaMalloc((void**)&d_dt, size_dt));
    gpuErrchk(cudaMalloc((void**)&d_to, size_to));
    gpuErrchk(cudaMalloc((void**)&d_tf, size_tf));

    //Copy all initial values of arrays to GPU.
    gpuErrchk(cudaMemcpy2D(d_concStorage, d_pitch, concStorage, h_pitch, numpoints*sizeof(double), maxlength, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_concs, &concs, size_concs, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_temp1, &temp1, size_temp1, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_temp2, &temp2, size_temp2, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_tempsum, &tempsum, size_tempsum, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_k1s, &k1s, size_ks, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_k2s, &k2s, size_ks, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_k3s, &k3s, size_ks, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_k4s, &k4s, size_ks, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_k5s, &k5s, size_ks, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_k6s, &k6s, size_ks, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_maxlength, &maxlength, size_maxlength, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_numpoints, &numpoints, size_numpoints, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_dt, &dt, size_dt, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_to, &to, size_to, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_tf, &tf, size_tf, cudaMemcpyHostToDevice));

    //Run the integrator.
    rkf5<<<1,1>>>(d_concs, d_concStorage, d_temp1, d_temp2, d_tempsum, d_k1s, d_k2s, d_k3s, d_k4s, d_k5s, d_k6s, d_maxlength, d_numpoints, d_pitch, d_dt, d_to, d_tf);
    gpuErrchk( cudaPeekAtLastError() );
    gpuErrchk( cudaDeviceSynchronize() );
    cudaDeviceSynchronize();
    /*
    //Sets all of concStorage to 1 after the kernel  runs. Used to make sure that 2D array copied over the array.
    std::cout << "\n";
    for (int i=0; i<maxlength; i++)
        for(int j=0; j<numpoints; j++)
            concStorage[i][j]=1;
    */

    //Copy concentrations from GPU to Host.  Almost defunct now that transferring the 2D array works.
    cudaMemcpy(concs, d_concs, size_concs, cudaMemcpyDeviceToHost);
    //Copy 2D array of concentrations vs. time from GPU to Host.
    gpuErrchk( cudaMemcpy2D(concStorage, h_pitch, d_concStorage, d_pitch, numpoints*sizeof(double), maxlength, cudaMemcpyDeviceToHost) );   

    //Print concentrations after the integrator kernel runs.  Used to test that data was transferring to and from GPU correctly.
    std::cout << "\n";
    for (int i=0; i<maxlength; i++)
    {
        std::cout<<concs[i];
        std::cout<<" ";
    }

    double a[10];
    double b[10];
    double c[10];
    for(int i = 0; i< 10; i++)
    {
        a[i]=0;
        b[i]=0;
        c[i]=0;
    }

    //Print out the concStorage array after the kernel runs.  Used to test that the 2D array transferred correctly from host to GPU and back.
    std::cout << "\n\n";
    std::cout << "Calculated Array";
    std::cout << "\n\n";
    for (int i=0; i<maxlength; i++)
    {
        for(int j=0; j<numpoints; j++)
        {
            if (j%(numpoints/10)==0)
            {
                a[j/(numpoints/10)]=concStorage[i][j];
                std::cout<<concStorage[i][j];
                std::cout<<"   ";
            }
        }
        std::cout << "\n";
    }
    std::cout << "\n";
    std::cout << "Exponential";
    std::cout << "\n\n";
    for (int i=0; i<10; i++)
    {
        b[i]=exp(-i*(tf-to)/10);
        std::cout<<exp(-i*(tf-to)/10);
        std::cout<<"   ";
    }
    std::cout << "\n\n";
    std::cout << "Error Array";
    std::cout << "\n\n";
    for (int i=0; i<10; i++)
    {
        c[i]=a[i]-b[i];
        std::cout<<c[i];
        std::cout<<"   ";
    }
    std::cout << "\n\n";

    cudaDeviceReset();  //Clean up all memory.
    ///*
    ofstream myfile;
    myfile.open ("example.txt");
    myfile << "Writing.";
    myfile.close();
    //*/

    return 0;
}

Answer 1:

Your simple code

#include <iostream>
#include <fstream>
using namespace std;

int main () {
    ofstream myfile;
    myfile.open ("example.txt");
    myfile << "Writing this to a file.\n";
    myfile.close();
    return 0;
}

compiled with the -rdc=true option reproduces the nvlink error you reported. This can happen when the statically linked C runtime is not used to match the CUDA runtime. It can be fixed by going to

Project -> Properties -> Configuration Properties -> CUDA C/C++ -> Host -> Runtime Library 

and then choosing

Multi-Threaded Debug (/MTd)

if you are in Debug mode or

Multi-Threaded (/MT)

if you are in Release mode.
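If you invoke nvcc from the command line instead of through the property pages, the equivalent change is to swap the host runtime switch passed via -Xcompiler from /MDd to /MTd. A rough sketch, assuming the source file name and architecture from your question and otherwise keeping the flags visible in your build log:

nvcc -rdc=true -gencode=arch=compute_35,code=sm_35 -G --machine 64 -Xcompiler "/EHsc /W3 /nologo /Od /Zi /RTC1 /MTd" -lcudadevrt "RKF5 Prototype 2.cu" -o "RKF5 Prototype 2.exe"

The Runtime Library setting in the CUDA property pages is what ends up in that -Xcompiler string; you can see the /MDd it currently injects in the device-link command from your build output.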

I hope that this fixes your problem.