I'm looking for the values of epsilon (the smallest step between two numbers), min (the smallest magnitude) and max (the greatest magnitude) for CUDA devices.
I.E the equivalents to FLT_EPSILON (DBL_EPSILON), FLT_MIN (DBL_MIN) and FLT_MAX (DBL_MAX) defined in <float.h>
in gcc compilers.
Are there constants in some CUDA include file?
Any manual explaining them? Any way to write a kernel to compute them?
Thanks in advance.
Yes, you could certainly calculate these yourself if you wanted. A couple examples for how to calculate machine epsilon are given in C on the wikipedia page; similarly you can find min/max by dividing/multiplying by two until you under/overflow. (you should then search between the last valid value and the next factor of two to find the "true" min/max value, but this gives you a good starting point).
If you have a device with a compute capability of 2.0 or greater, though, then the mathematics is mostly IEEE 754 with some small deviations (eg, not all rounding modes supported) but those deviations aren't enough to effect fundamental numerical constants like these; so you'll get the standard emach for single of 5.96e-08 and double of 1.11e-16; FLT_MIN/MAX of 1.175494351e-38/3.402823466e+38, and DBL_MIN/MAX of 2.2250738585072014e-308/1.7976931348623158e+308.
On compute capability 1.3 machines, denormalized numbers weren't supported in single precision, so your FLT_MIN would be significantly greater than on the CPU.
A quick test on a compute capability 2.0 machine, with quick and dirty calculations for min/max:
#include <stdio.h>
#include <stdlib.h>
#include <getopt.h>
#include <cuda.h>
#include <sys/time.h>
#include <math.h>
#include <assert.h>
#include <float.h>
#define CHK_CUDA(e) {if (e != cudaSuccess) {fprintf(stderr,"Error: %s\n", cudaGetErrorString(e)); exit(-1);}}
/* from wikipedia page, for machine epsilon calculation */
/* assumes mantissa in final bits */
__device__ double machine_eps_dbl() {
typedef union {
long long i64;
double d64;
} dbl_64;
dbl_64 s;
s.d64 = 1.;
s.i64++;
return (s.d64 - 1.);
}
__device__ float machine_eps_flt() {
typedef union {
int i32;
float f32;
} flt_32;
flt_32 s;
s.f32 = 1.;
s.i32++;
return (s.f32 - 1.);
}
#define EPS 0
#define MIN 1
#define MAX 2
__global__ void calc_consts(float *fvals, double *dvals) {
int i = threadIdx.x + blockIdx.x*blockDim.x;
if (i==0) {
fvals[EPS] = machine_eps_flt();
dvals[EPS]= machine_eps_dbl();
float xf, oldxf;
double xd, oldxd;
xf = 2.; oldxf = 1.;
xd = 2.; oldxd = 1.;
/* double until overflow */
/* Note that real fmax is somewhere between xf and oldxf */
while (!isinf(xf)) {
oldxf *= 2.;
xf *= 2.;
}
while (!isinf(xd)) {
oldxd *= 2.;
xd *= 2.;
}
dvals[MAX] = oldxd;
fvals[MAX] = oldxf;
/* half until overflow */
/* Note that real fmin is somewhere between xf and oldxf */
xf = 1.; oldxf = 2.;
xd = 1.; oldxd = 2.;
while (xf != 0.) {
oldxf /= 2.;
xf /= 2.;
}
while (xd != 0.) {
oldxd /= 2.;
xd /= 2.;
}
dvals[MIN] = oldxd;
fvals[MIN] = oldxf;
}
return;
}
int main(int argc, char **argv) {
float fvals[3];
double dvals[3];
float *fvals_d;
double *dvals_d;
CHK_CUDA( cudaMalloc(&fvals_d, 3*sizeof(float)) );
CHK_CUDA( cudaMalloc(&dvals_d, 3*sizeof(double)) );
calc_consts<<<1,32>>>(fvals_d, dvals_d);
CHK_CUDA( cudaMemcpy(fvals, fvals_d, 3*sizeof(float), cudaMemcpyDeviceToHost) );
CHK_CUDA( cudaMemcpy(dvals, dvals_d, 3*sizeof(double), cudaMemcpyDeviceToHost) );
CHK_CUDA( cudaFree(fvals_d) );
CHK_CUDA( cudaFree(dvals_d) );
printf("Single machine epsilon:\n");
printf("CUDA = %g, CPU = %g\n", fvals[EPS], FLT_EPSILON);
printf("Single min value (CUDA - approx):\n");
printf("CUDA = %g, CPU = %g\n", fvals[MIN], FLT_MIN);
printf("Single max value (CUDA - approx):\n");
printf("CUDA = %g, CPU = %g\n", fvals[MAX], FLT_MAX);
printf("\nDouble machine epsilon:\n");
printf("CUDA = %lg, CPU = %lg\n", dvals[EPS], DBL_EPSILON);
printf("Double min value (CUDA - approx):\n");
printf("CUDA = %lg, CPU = %lg\n", dvals[MIN], DBL_MIN);
printf("Double max value (CUDA - approx):\n");
printf("CUDA = %lg, CPU = %lg\n", dvals[MAX], DBL_MAX);
return 0;
}
Compiling/running shows that the answers are consistent with the CPU version (except the min values; is FLT_MIN giving the min normal value rather than denormed on the CPU?)
$ nvcc -o foo foo.cu -arch=sm_20
$ ./foo
Single machine epsilon:
CUDA = 1.19209e-07, CPU = 1.19209e-07
Single min value (CUDA - approx):
CUDA = 1.4013e-45, CPU = 1.17549e-38
Single max value (CUDA - approx):
CUDA = 1.70141e+38, CPU = 3.40282e+38
Double machine epsilon:
CUDA = 2.22045e-16, CPU = 2.22045e-16
Double min value (CUDA - approx):
CUDA = 4.94066e-324, CPU = 2.22507e-308
Double max value (CUDA - approx):
CUDA = 8.98847e+307, CPU = 1.79769e+308