Unable to create a working Makefile for CUDA C pro

I have a simple script formed by 3 CUDA files and 2 headers: main.cu, kernel.cu func.cu, kernel.h and func.h. Their goal is to calculate the sum of 2 vectors.

// main.cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cuda.h>

#include "kernel.h"

int main(){
/* Error code to check return values for CUDA calls */
cudaError_t err = cudaSuccess;   

srand(time(NULL));
int count = 100;
int A[count], B[count];
int *h_A, *h_B;
h_A = A; h_B = B;

int i;
for(i=0;i<count;i++){
    *(h_A+i) = rand() % count; /* Oppure: h_A[i] = rand() % count; */
    *(h_B+i) = rand() % count; /* Oppure: h_B[i] = rand() % count; */
}
/* Display dei vettori A e B. */
printf("\nPrimi cinque valori di A = ");
for(i=0;i<4;i++){printf("%d ", A[i]);}
printf("\nPrimi cinque valori di B = ");
for(i=0;i<4;i++){printf("%d ", B[i]);}


int *d_A, *d_B;

err = cudaMalloc((void**)&d_A, count*sizeof(int));
if (err != cudaSuccess){fprintf(stderr, "Failed to allocate device vector A (error code %s)! \n", cudaGetErrorString(err));exit(EXIT_FAILURE);}
err = cudaMalloc((void**)&d_B, count*sizeof(int));
if (err != cudaSuccess){fprintf(stderr, "Failed to allocate device vector A (error code %s)! \n", cudaGetErrorString(err));exit(EXIT_FAILURE);}

err = cudaMemcpy(d_A, A, count*sizeof(int), cudaMemcpyHostToDevice);
if (err != cudaSuccess){fprintf(stderr, "Failed to copy vector A from host to device (error code %s)!\n", cudaGetErrorString(err));exit(EXIT_FAILURE);}
err = cudaMemcpy(d_B, B, count*sizeof(int), cudaMemcpyHostToDevice);
if (err != cudaSuccess){fprintf(stderr, "Failed to copy vector A from host to device (error code %s)!\n", cudaGetErrorString(err));exit(EXIT_FAILURE);}


int numThreads = 256;
int numBlocks = count/numThreads + 1;
AddInts<<<numBlocks,numThreads>>>(d_A,d_B); err = cudaGetLastError(); 

err = cudaMemcpy(A, d_A, count*sizeof(int), cudaMemcpyDeviceToHost);
if (err != cudaSuccess){fprintf(stderr, "Failed to copy vector C from device to host (error code %s)!\n", cudaGetErrorString(err));exit(EXIT_FAILURE);}

err = cudaFree(d_A); 
if (err != cudaSuccess){fprintf(stderr, "Failed to free device vector A (error code %s)!\n", cudaGetErrorString(err));exit(EXIT_FAILURE);}
err = cudaFree(d_B); 
if (err != cudaSuccess){fprintf(stderr, "Failed to free device vector A (error code %s)!\n", cudaGetErrorString(err));exit(EXIT_FAILURE);}

printf("\nPrimi cinque valori di A = ");
for(i=0;i<4;i++){printf("%d ", A[i]);}

printf("\n");
return 0;}

Here there's the kernel.cu file:

// kernel.cu
#include "func.h"
#include "kernel.h"

__global__ void AddInts(int *a, int *b){
           int ID = get_global_index();
           *(a+ID) += *(b+ID);

}

Here is func.cu

// func.cu
#include "func.h"
__device__ int get_global_index(){
return (blockIdx.x * blockDim.x) + threadIdx.x;
}

Here is kernel.h

// kernel.h
__global__ void AddInts(int *a, int *b);

Here is func.h

// func.h
__device__ int get_global_index();

I am 100 % sure that the main.cu script is correct; I also know that i could just add the kernel directly in the the main script but that is not the intention of my test; I also know that I could just get rid of the __device__ function and put it directly inside of the __global__ but it's not my intention either.

Now here is the problem: i wrote a very simple makefile that should be able to compile the program but somehow it doesn't work; here is the makefile:

# Location of the CUDA Toolkit
CUDA_PATH       ?= /usr/local/cuda-6.5

OSUPPER = $(shell uname -s 2>/dev/null | tr "[:lower:]" "[:upper:]")
OSLOWER = $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]")

OS_SIZE    = $(shell uname -m | sed -e "s/x86_64/64/" -e "s/armv7l/32/" -e "s/aarch64/64/")
OS_ARCH    = $(shell uname -m)
ARCH_FLAGS =

DARWIN = $(strip $(findstring DARWIN, $(OSUPPER)))
ifneq ($(DARWIN),)
XCODE_GE_5 = $(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5)
endif

# Take command line flags that override any of these settings
ifeq ($(x86_64),1)
OS_SIZE = 64
OS_ARCH = x86_64
endif
ifeq ($(ARMv7),1)
OS_SIZE    = 32
OS_ARCH    = armv7l
ARCH_FLAGS = -target-cpu-arch ARM
endif
ifeq ($(aarch64),1)
OS_SIZE    = 64
OS_ARCH    = aarch64
ARCH_FLAGS = -target-cpu-arch ARM
endif

# Common binaries
ifneq ($(DARWIN),)
ifeq ($(XCODE_GE_5),1)
GCC ?= clang
else
GCC ?= g++
endif
else
ifeq ($(ARMv7),1)
GCC ?= arm-linux-gnueabihf-g++
else
GCC ?= g++
endif
endif
NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(GCC)

# internal flags
NVCCFLAGS   := -m${OS_SIZE} ${ARCH_FLAGS} 
CCFLAGS     :=
LDFLAGS     :=

# Extra user flags
EXTRA_NVCCFLAGS   ?=
EXTRA_LDFLAGS     ?=
EXTRA_CCFLAGS     ?=

# OS-specific build flags
ifneq ($(DARWIN),)
LDFLAGS += -rpath $(CUDA_PATH)/lib
CCFLAGS += -arch $(OS_ARCH)
else
ifeq ($(OS_ARCH),armv7l)
ifeq ($(abi),androideabi)
  NVCCFLAGS += -target-os-variant Android
else
  ifeq ($(abi),gnueabi)
    CCFLAGS += -mfloat-abi=softfp
  else
    # default to gnueabihf
    override abi := gnueabihf
    LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3
    CCFLAGS += -mfloat-abi=hard
  endif
endif
endif
endif

ifeq ($(ARMv7),1)
ifneq ($(TARGET_FS),)
GCCVERSIONLTEQ46 := $(shell expr `$(GCC) -dumpversion` \<= 4.6)
ifeq ($(GCCVERSIONLTEQ46),1)
CCFLAGS += --sysroot=$(TARGET_FS)
endif
LDFLAGS += --sysroot=$(TARGET_FS)
LDFLAGS += -rpath-link=$(TARGET_FS)/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-$(abi)
endif
endif

# Debug build flags
ifeq ($(dbg),1)
  NVCCFLAGS += -g -G 
  TARGET := debug
else
  TARGET := release
endif

ALL_CCFLAGS :=
ALL_CCFLAGS += $(NVCCFLAGS)
ALL_CCFLAGS += $(EXTRA_NVCCFLAGS)
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS))
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))

ALL_LDFLAGS :=
ALL_LDFLAGS += $(ALL_CCFLAGS)
ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))

# Common includes and paths for CUDA
INCLUDES  := -I common/inc
LIBRARIES :=

################################################################################

SAMPLE_ENABLED := 1

# Gencode arguments
ifeq ($(OS_ARCH),armv7l)
SMS ?= 20 30 32 35 37 50
else
SMS ?= 11 20 30 35 37 50
endif

ifeq ($(SMS),)
$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)
SAMPLE_ENABLED := 0
endif

ifeq ($(GENCODE_FLAGS),)
# Generate SASS code for each SM architecture listed in $(SMS)
$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm)))

# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility
HIGHEST_SM := $(lastword $(sort $(SMS)))
ifneq ($(HIGHEST_SM),)
GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM)
endif
endif

LIBRARIES += -lcufft

ifeq ($(SAMPLE_ENABLED),0)
EXEC ?= @echo "[@]"
endif

################################################################################

OBJS = main.o kernel.o func.o
CFLAGS = -rdc=true

# Target rules
all: build

build: eseguibile

check.deps:
ifeq ($(SAMPLE_ENABLED),0)
@echo "Sample will be waived due to the above missing dependencies"
else
@echo "Sample is ready - all dependencies have been met"
endif

main.o:main.cu 
    $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<

kernel.o:kernel.cu kernel.h
    $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<

func.o:func.cu func.h
    $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<

eseguibile: $(OBJS) 
    $(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)
    $(EXEC) mkdir -p ../../bin/$(OS_ARCH)/$(OSLOWER)/$(TARGET)$(if $(abi),/$(abi))
    $(EXEC) cp $@ ../../bin/$(OS_ARCH)/$(OSLOWER)/$(TARGET)$(if $(abi),/$(abi))

run: build
    $(EXEC) ./eseguibile

clean:
    rm -f eseguibile $(OBJS)  
    rm -rf ../../bin/$(OS_ARCH)/$(OSLOWER)/$(TARGET)$(if $(abi),/$(abi))/eseguibile

clobber: clean

Where the common/inc folder is the one containing the header files given by Nvidia necessary to make Cuda perform correctly; for what concerns the tabs, they are 100% correct in my original file, but i just couldn't replicate them in stackoverflo; the error that I get is this:

./kernel.cu(6): Error: External calls are not supported (found non-inlined call to _Z16get_global_indexv)
 make: *** [kernel.o] Error 2

The makefile is based on the one provided by Nvidia in the samples; I don't really know where the mistake may be; is it just the makefile or am i not supposed to nest the functions like i just did?

You've got a situation here that is going to require relocatable device code linking (aka separate compilation/linking), but your Makefile is not set up properly for that.

There are a number of situations that may require separate compilation and linking. One example, which is present in your project, is when a device code in one module calls a __device__ function in another module. In this case, the AddInts kernel in kernel.cu is calling the get_global_index __device__ function, which is defined in func.cu. This will require separate compilation and linking of device code.

In this case, the solution is fairly simple. We only need to change the corresponding -c compile option to -dc, whereever it is used in your Makefile. These 3 lines had to be changed:

main.o:main.cu 
    $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -dc $<

kernel.o:kernel.cu kernel.h
    $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -dc $<

func.o:func.cu func.h
    $(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -dc $<

                                                                       ^
                                                                       |
                                                        make changes here

In addition, you will need to change your selection of architectures to target from this:

SMS ?= 11 20 30 35 37 50

to this:

SMS ?= 20 30 35 37 50

as sm_11 is not a valid architecture for separate compilation and linking. (If you must run this code on a pre-cc2.0 device, then you will have to restructure your code to include all the device functions in the same file; which you explicitly said you did not want to do in your question.)

Note that this is not the only way to modify the Makefile. You have a definition like this:

CFLAGS = -rdc=true

but you're not using it anywhere. In lieu of making the above changes from -c to -dc, we could just add $(CFLAGS) to each of those 3 lines instead. The resultant syntax would be equivalent. (i.e. -dc is equivalent to -rdc=true -c)

An unrelated comment is that your code as posted has no dependencies on the cufft library. Therefore you could change this line:

LIBRARIES += -lcufft

to this:

LIBRARIES +=

in your Makefile. However, this change is not necessary to build correct code, according to what you have depicted here. If your project will ultimately use the cufft library, then you should probably just leave this line as-is.