I have a simple script formed by 3 CUDA files and 2 headers: main.cu, kernel.cu func.cu, kernel.h and func.h. Their goal is to calculate the sum of 2 vectors.
// main.cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cuda.h>
#include "kernel.h"
int main(){
/* Error code to check return values for CUDA calls */
cudaError_t err = cudaSuccess;
srand(time(NULL));
int count = 100;
int A[count], B[count];
int *h_A, *h_B;
h_A = A; h_B = B;
int i;
for(i=0;i<count;i++){
*(h_A+i) = rand() % count; /* Oppure: h_A[i] = rand() % count; */
*(h_B+i) = rand() % count; /* Oppure: h_B[i] = rand() % count; */
}
/* Display dei vettori A e B. */
printf("\nPrimi cinque valori di A = ");
for(i=0;i<4;i++){printf("%d ", A[i]);}
printf("\nPrimi cinque valori di B = ");
for(i=0;i<4;i++){printf("%d ", B[i]);}
int *d_A, *d_B;
err = cudaMalloc((void**)&d_A, count*sizeof(int));
if (err != cudaSuccess){fprintf(stderr, "Failed to allocate device vector A (error code %s)! \n", cudaGetErrorString(err));exit(EXIT_FAILURE);}
err = cudaMalloc((void**)&d_B, count*sizeof(int));
if (err != cudaSuccess){fprintf(stderr, "Failed to allocate device vector A (error code %s)! \n", cudaGetErrorString(err));exit(EXIT_FAILURE);}
err = cudaMemcpy(d_A, A, count*sizeof(int), cudaMemcpyHostToDevice);
if (err != cudaSuccess){fprintf(stderr, "Failed to copy vector A from host to device (error code %s)!\n", cudaGetErrorString(err));exit(EXIT_FAILURE);}
err = cudaMemcpy(d_B, B, count*sizeof(int), cudaMemcpyHostToDevice);
if (err != cudaSuccess){fprintf(stderr, "Failed to copy vector A from host to device (error code %s)!\n", cudaGetErrorString(err));exit(EXIT_FAILURE);}
int numThreads = 256;
int numBlocks = count/numThreads + 1;
AddInts<<<numBlocks,numThreads>>>(d_A,d_B); err = cudaGetLastError();
err = cudaMemcpy(A, d_A, count*sizeof(int), cudaMemcpyDeviceToHost);
if (err != cudaSuccess){fprintf(stderr, "Failed to copy vector C from device to host (error code %s)!\n", cudaGetErrorString(err));exit(EXIT_FAILURE);}
err = cudaFree(d_A);
if (err != cudaSuccess){fprintf(stderr, "Failed to free device vector A (error code %s)!\n", cudaGetErrorString(err));exit(EXIT_FAILURE);}
err = cudaFree(d_B);
if (err != cudaSuccess){fprintf(stderr, "Failed to free device vector A (error code %s)!\n", cudaGetErrorString(err));exit(EXIT_FAILURE);}
printf("\nPrimi cinque valori di A = ");
for(i=0;i<4;i++){printf("%d ", A[i]);}
printf("\n");
return 0;}
Here there's the kernel.cu file:
// kernel.cu
#include "func.h"
#include "kernel.h"
__global__ void AddInts(int *a, int *b){
int ID = get_global_index();
*(a+ID) += *(b+ID);
}
Here is func.cu
// func.cu
#include "func.h"
__device__ int get_global_index(){
return (blockIdx.x * blockDim.x) + threadIdx.x;
}
Here is kernel.h
// kernel.h
__global__ void AddInts(int *a, int *b);
Here is func.h
// func.h
__device__ int get_global_index();
I am 100 % sure that the main.cu script is correct; I also know that i could just add the kernel directly in the the main script but that is not the intention of my test; I also know that I could just get rid of the __device__
function and put it directly inside of the __global__
but it's not my intention either.
Now here is the problem: i wrote a very simple makefile that should be able to compile the program but somehow it doesn't work; here is the makefile:
# Location of the CUDA Toolkit
CUDA_PATH ?= /usr/local/cuda-6.5
OSUPPER = $(shell uname -s 2>/dev/null | tr "[:lower:]" "[:upper:]")
OSLOWER = $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]")
OS_SIZE = $(shell uname -m | sed -e "s/x86_64/64/" -e "s/armv7l/32/" -e "s/aarch64/64/")
OS_ARCH = $(shell uname -m)
ARCH_FLAGS =
DARWIN = $(strip $(findstring DARWIN, $(OSUPPER)))
ifneq ($(DARWIN),)
XCODE_GE_5 = $(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5)
endif
# Take command line flags that override any of these settings
ifeq ($(x86_64),1)
OS_SIZE = 64
OS_ARCH = x86_64
endif
ifeq ($(ARMv7),1)
OS_SIZE = 32
OS_ARCH = armv7l
ARCH_FLAGS = -target-cpu-arch ARM
endif
ifeq ($(aarch64),1)
OS_SIZE = 64
OS_ARCH = aarch64
ARCH_FLAGS = -target-cpu-arch ARM
endif
# Common binaries
ifneq ($(DARWIN),)
ifeq ($(XCODE_GE_5),1)
GCC ?= clang
else
GCC ?= g++
endif
else
ifeq ($(ARMv7),1)
GCC ?= arm-linux-gnueabihf-g++
else
GCC ?= g++
endif
endif
NVCC := $(CUDA_PATH)/bin/nvcc -ccbin $(GCC)
# internal flags
NVCCFLAGS := -m${OS_SIZE} ${ARCH_FLAGS}
CCFLAGS :=
LDFLAGS :=
# Extra user flags
EXTRA_NVCCFLAGS ?=
EXTRA_LDFLAGS ?=
EXTRA_CCFLAGS ?=
# OS-specific build flags
ifneq ($(DARWIN),)
LDFLAGS += -rpath $(CUDA_PATH)/lib
CCFLAGS += -arch $(OS_ARCH)
else
ifeq ($(OS_ARCH),armv7l)
ifeq ($(abi),androideabi)
NVCCFLAGS += -target-os-variant Android
else
ifeq ($(abi),gnueabi)
CCFLAGS += -mfloat-abi=softfp
else
# default to gnueabihf
override abi := gnueabihf
LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3
CCFLAGS += -mfloat-abi=hard
endif
endif
endif
endif
ifeq ($(ARMv7),1)
ifneq ($(TARGET_FS),)
GCCVERSIONLTEQ46 := $(shell expr `$(GCC) -dumpversion` \<= 4.6)
ifeq ($(GCCVERSIONLTEQ46),1)
CCFLAGS += --sysroot=$(TARGET_FS)
endif
LDFLAGS += --sysroot=$(TARGET_FS)
LDFLAGS += -rpath-link=$(TARGET_FS)/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib
LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-$(abi)
endif
endif
# Debug build flags
ifeq ($(dbg),1)
NVCCFLAGS += -g -G
TARGET := debug
else
TARGET := release
endif
ALL_CCFLAGS :=
ALL_CCFLAGS += $(NVCCFLAGS)
ALL_CCFLAGS += $(EXTRA_NVCCFLAGS)
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS))
ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))
ALL_LDFLAGS :=
ALL_LDFLAGS += $(ALL_CCFLAGS)
ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))
# Common includes and paths for CUDA
INCLUDES := -I common/inc
LIBRARIES :=
################################################################################
SAMPLE_ENABLED := 1
# Gencode arguments
ifeq ($(OS_ARCH),armv7l)
SMS ?= 20 30 32 35 37 50
else
SMS ?= 11 20 30 35 37 50
endif
ifeq ($(SMS),)
$(info >>> WARNING - no SM architectures have been specified - waiving sample <<<)
SAMPLE_ENABLED := 0
endif
ifeq ($(GENCODE_FLAGS),)
# Generate SASS code for each SM architecture listed in $(SMS)
$(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm)))
# Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility
HIGHEST_SM := $(lastword $(sort $(SMS)))
ifneq ($(HIGHEST_SM),)
GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM)
endif
endif
LIBRARIES += -lcufft
ifeq ($(SAMPLE_ENABLED),0)
EXEC ?= @echo "[@]"
endif
################################################################################
OBJS = main.o kernel.o func.o
CFLAGS = -rdc=true
# Target rules
all: build
build: eseguibile
check.deps:
ifeq ($(SAMPLE_ENABLED),0)
@echo "Sample will be waived due to the above missing dependencies"
else
@echo "Sample is ready - all dependencies have been met"
endif
main.o:main.cu
$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
kernel.o:kernel.cu kernel.h
$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
func.o:func.cu func.h
$(EXEC) $(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
eseguibile: $(OBJS)
$(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)
$(EXEC) mkdir -p ../../bin/$(OS_ARCH)/$(OSLOWER)/$(TARGET)$(if $(abi),/$(abi))
$(EXEC) cp $@ ../../bin/$(OS_ARCH)/$(OSLOWER)/$(TARGET)$(if $(abi),/$(abi))
run: build
$(EXEC) ./eseguibile
clean:
rm -f eseguibile $(OBJS)
rm -rf ../../bin/$(OS_ARCH)/$(OSLOWER)/$(TARGET)$(if $(abi),/$(abi))/eseguibile
clobber: clean
Where the common/inc folder is the one containing the header files given by Nvidia necessary to make Cuda perform correctly; for what concerns the tabs, they are 100% correct in my original file, but i just couldn't replicate them in stackoverflo; the error that I get is this:
./kernel.cu(6): Error: External calls are not supported (found non-inlined call to _Z16get_global_indexv)
make: *** [kernel.o] Error 2
The makefile is based on the one provided by Nvidia in the samples; I don't really know where the mistake may be; is it just the makefile or am i not supposed to nest the functions like i just did?
You've got a situation here that is going to require relocatable device code linking (aka separate compilation/linking), but your Makefile is not set up properly for that.
There are a number of situations that may require separate compilation and linking. One example, which is present in your project, is when a device code in one module calls a
__device__
function in another module. In this case, theAddInts
kernel in kernel.cu is calling theget_global_index
__device__
function, which is defined in func.cu. This will require separate compilation and linking of device code.In this case, the solution is fairly simple. We only need to change the corresponding
-c
compile option to-dc
, whereever it is used in your Makefile. These 3 lines had to be changed:In addition, you will need to change your selection of architectures to target from this:
to this:
as
sm_11
is not a valid architecture for separate compilation and linking. (If you must run this code on a pre-cc2.0 device, then you will have to restructure your code to include all the device functions in the same file; which you explicitly said you did not want to do in your question.)Note that this is not the only way to modify the Makefile. You have a definition like this:
but you're not using it anywhere. In lieu of making the above changes from
-c
to-dc
, we could just add$(CFLAGS)
to each of those 3 lines instead. The resultant syntax would be equivalent. (i.e.-dc
is equivalent to-rdc=true -c
)An unrelated comment is that your code as posted has no dependencies on the cufft library. Therefore you could change this line:
to this:
in your Makefile. However, this change is not necessary to build correct code, according to what you have depicted here. If your project will ultimately use the cufft library, then you should probably just leave this line as-is.