I'm using CMake as a build system for my code, which involves CUDA. I was thinking of automating the task of deciding which compute_XX
and arch_XX
I need to to pass to my nvcc in order to compile for the GPU(s) on my current machine.
My strategy has been to compile and run a bash script that probes the card and returns the gencode for cmake. Inspiration came from University of Chicago's SLURM. To handle errors or multiple gpus or other circumstances, modify as necessary.
In your project folder create a file cudaComputeVersion.bash and ensure it is executable from the shell. Into this file put:
#!/bin/bash
# create a 'here document' that is code we compile and use to probe the card
cat << EOF > /tmp/cudaComputeVersion.cu
#include <stdio.h>
int main()
{
cudaDeviceProp prop;
cudaGetDeviceProperties(&prop,0);
int v = prop.major * 10 + prop.minor;
printf("-gencode arch=compute_%d,code=sm_%d\n",v,v);
}
EOF
# probe the card and cleanup
/usr/local/cuda/bin/nvcc /tmp/cudaComputeVersion.cu -o /tmp/cudaComputeVersion
/tmp/cudaComputeVersion
rm /tmp/cudaComputeVersion.cu
rm /tmp/cudaComputeVersion
And in your CMakeLists.txt put:
# at cmake-build-time, probe the card and set a cmake variable
execute_process(COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/cudaComputeVersion.bash OUTPUT_VARIABLE GENCODE)
# at project-compile-time, include the gencode into the compile options
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS}; "${GENCODE}")
# this makes CMake all chatty and allows you to see that GENCODE was set correctly
set(CMAKE_VERBOSE_MAKEFILE TRUE)
cheers
A slight improvement over @orthopteroid's answer, which pretty much ensures a unique temporary file is generated, and only requires one instead of two temporary files.
The following goes into scripts/get_cuda_sm.sh
:
#!/bin/bash
#
# Prints the compute capability of the first CUDA device installed
# on the system, or alternatively the device whose index is the
# first command-line argument
device_index=${1:-0}
timestamp=$(date +%s.%N)
gcc_binary=$(which g++)
gcc_binary=${gcc_binary:-g++}
cuda_root=${CUDA_DIR:-/usr/local/cuda}
CUDA_INCLUDE_DIRS=${CUDA_INCLUDE_DIRS:-${cuda_root}/include}
CUDA_CUDART_LIBRARY=${CUDA_CUDART_LIBRARY:-${cuda_root}/lib64/libcudart.so}
generated_binary="/tmp/cuda-compute-version-helper-$$-$timestamp"
# create a 'here document' that is code we compile and use to probe the card
source_code="$(cat << EOF
#include <stdio.h>
#include <cuda_runtime_api.h>
int main()
{
cudaDeviceProp prop;
cudaError_t status;
int device_count;
status = cudaGetDeviceCount(&device_count);
if (status != cudaSuccess) {
fprintf(stderr,"cudaGetDeviceCount() failed: %s\n", cudaGetErrorString(status));
return -1;
}
if (${device_index} >= device_count) {
fprintf(stderr, "Specified device index %d exceeds the maximum (the device count on this system is %d)\n", ${device_index}, device_count);
return -1;
}
status = cudaGetDeviceProperties(&prop, ${device_index});
if (status != cudaSuccess) {
fprintf(stderr,"cudaGetDeviceProperties() for device ${device_index} failed: %s\n", cudaGetErrorString(status));
return -1;
}
int v = prop.major * 10 + prop.minor;
printf("%d\\n", v);
}
EOF
)"
echo "$source_code" | $gcc_binary -x c++ -I"$CUDA_INCLUDE_DIRS" -o "$generated_binary" - -x none "$CUDA_CUDART_LIBRARY"
# probe the card and cleanup
$generated_binary
rm $generated_binary
and the following goes into CMakeLists.txt
or a CMake module:
if (NOT CUDA_TARGET_COMPUTE_CAPABILITY)
if("$ENV{CUDA_SM}" STREQUAL "")
set(ENV{CUDA_INCLUDE_DIRS} "${CUDA_INCLUDE_DIRS}")
set(ENV{CUDA_CUDART_LIBRARY} "${CUDA_CUDART_LIBRARY}")
set(ENV{CMAKE_CXX_COMPILER} "${CMAKE_CXX_COMPILER}")
execute_process(COMMAND
bash -c "${CMAKE_CURRENT_SOURCE_DIR}/scripts/get_cuda_sm.sh"
OUTPUT_VARIABLE CUDA_TARGET_COMPUTE_CAPABILITY_)
else()
set(CUDA_TARGET_COMPUTE_CAPABILITY_ $ENV{CUDA_SM})
endif()
set(CUDA_TARGET_COMPUTE_CAPABILITY "${CUDA_TARGET_COMPUTE_CAPABILITY_}"
CACHE STRING "CUDA compute capability of the (first) CUDA device on \
the system, in XY format (like the X.Y format but no dot); see table \
of features and capabilities by capability X.Y value at \
https://en.wikipedia.org/wiki/CUDA#Version_features_and_specifications")
execute_process(COMMAND
bash -c "echo -n $(echo ${CUDA_TARGET_COMPUTE_CAPABILITY})"
OUTPUT_VARIABLE CUDA_TARGET_COMPUTE_CAPABILITY)
execute_process(COMMAND
bash -c "echo ${CUDA_TARGET_COMPUTE_CAPABILITY} | sed 's/^\\([0-9]\\)\\([0-9]\\)/\\1.\\2/;' | xargs echo -n"
OUTPUT_VARIABLE FORMATTED_COMPUTE_CAPABILITY)
message(STATUS
"CUDA device-side code will assume compute capability \
${FORMATTED_COMPUTE_CAPABILITY}")
endif()
set(CUDA_GENCODE
"arch=compute_${CUDA_TARGET_COMPUTE_CAPABILITY}, code=compute_${CUDA_TARGET_COMPUTE_CAPABILITY}")
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -gencode ${CUDA_GENCODE} )