Libmvec is a x86_64 glibc library with SSE4, AVX, AVX2, and AVX-512 vectorized functions for
cos
, exp
, log
, sin
, pow
, and sincos
, in single precision and double precision.
The accuracy of these functions is 4-ulp maximum relative error.
Usually gcc inserts calls to Libmvec functions, such as _ZGVdN4v_cos
,
while it is auto-vectorizing scalar code, for example with -ffast-math
or
#pragma omp simd
. However, these vector functions are also suitable for
manually vectorizing C code.
Unfortunately the Libmvec document
does not give many examples that explain how to use these functions for
manually vectorizing C code.
To use the Libmvec functions one has to include the function declarations
and link the code with the -lm
(link with math library) option.
The linker doesn't need -lmvec
, linking the standard math library -lm
is
sufficient for not static builds, see here.
The -ffast-math
compiler option is not necesary.
The following example shows how to call the AVX2 vectorized functions.
This example runs successful on a standard Ubuntu 18.04 system (glibc version 2.27).
/* gcc -Wall -O3 -m64 -march=skylake libmvec_ex.c -lm */
#include <immintrin.h>
#include <stdio.h>
#include <stdint.h>
#define _GNU_SOURCE /* These two lines are optional. They are only needed to use the scalar */
#include <math.h> /* functions sin, exp, sincos,... */
__m256d _ZGVdN4v_cos(__m256d x);
__m256d _ZGVdN4v_exp(__m256d x);
__m256d _ZGVdN4v_log(__m256d x);
__m256d _ZGVdN4v_sin(__m256d x);
__m256d _ZGVdN4vv_pow(__m256d x, __m256d y);
void _ZGVdN4vvv_sincos(__m256d x, __m256i ptrs, __m256i ptrc);
__m256 _ZGVdN8v_cosf(__m256 x);
__m256 _ZGVdN8v_expf(__m256 x);
__m256 _ZGVdN8v_logf(__m256 x);
__m256 _ZGVdN8v_sinf(__m256 x);
__m256 _ZGVdN8vv_powf(__m256 x, __m256 y);
void _ZGVdN8vvv_sincosf(__m256 x, __m256i ptrs_lo, __m256i ptrs_hi, __m256i ptrc_lo, __m256i ptrc_hi);
int mm256_print_pd(__m256d x);
int mm256_print_ps(__m256 x);
int main()
{
__m256d x, y, z;
__m256 xf, yf, zf;
double z_s[4], z_c[4]; /* sincos output */
float zf_s[8], zf_c[8]; /* sincosf output */
__m256i ptrs, ptrc; /* Pointers to the elements of z_s and z_c */
__m256i ptrs_lo, ptrs_hi, ptrc_lo, ptrc_hi;
x = _mm256_set_pd (0.04, 0.03, 0.02, 0.01);
y = _mm256_set_pd (2.0, 1.5, 1.0, 0.5);
xf = _mm256_set_ps (0.08f, 0.07f, 0.06f, 0.05f, 0.04f, 0.03f, 0.02f, 0.01f);
yf = _mm256_set_ps (4.0f, 3.5f, 3.0f, 2.5f, 2.0f, 1.5f, 1.0f, 0.5f);
printf("AVX2 Double precision examples\n");
printf("x "); mm256_print_pd(x);
printf("y "); mm256_print_pd(y);
z =_ZGVdN4v_cos(x); printf("cos(x) "); mm256_print_pd(z);
z =_ZGVdN4v_exp(x); printf("exp(x) "); mm256_print_pd(z);
z =_ZGVdN4v_log(x); printf("log(x) "); mm256_print_pd(z);
z =_ZGVdN4v_sin(x); printf("sin(x) "); mm256_print_pd(z);
z =_ZGVdN4vv_pow(x, y); printf("pow(x,y) "); mm256_print_pd(z);
ptrs = _mm256_set_epi64x((uint64_t)&z_s[3],(uint64_t)&z_s[2],(uint64_t)&z_s[1],(uint64_t)&z_s[0]);
ptrc = _mm256_set_epi64x((uint64_t)&z_c[3],(uint64_t)&z_c[2],(uint64_t)&z_c[1],(uint64_t)&z_c[0]);
/* Alternative: ptrs = _mm256_add_epi64(_mm256_set1_epi64x((uint64_t)&z_s[0]),_mm256_set_epi64x(24,16,8,0)); */
/* This might be more efficient if the destination addresses are contiguous in memory. */
_ZGVdN4vvv_sincos(x, ptrs, ptrc); /* The results of _ZGVdN4vvv_sincos are scattered into the adresses in ptrs and ptrc */
printf("sincos cos(x) %12.8f %12.8f %12.8f %12.8f \n", z_c[3], z_c[2], z_c[1], z_c[0]);
printf("sincos sin(x) %12.8f %12.8f %12.8f %12.8f\n\n", z_s[3], z_s[2], z_s[1], z_s[0]);
printf("AVX2 Single precision examples\n");
printf("x "); mm256_print_ps(xf);
printf("y "); mm256_print_ps(yf);
zf =_ZGVdN8v_cosf(xf); printf("cosf(x) "); mm256_print_ps(zf);
zf =_ZGVdN8v_expf(xf); printf("expf(x) "); mm256_print_ps(zf);
zf =_ZGVdN8v_logf(xf); printf("logf(x) "); mm256_print_ps(zf);
zf =_ZGVdN8v_sinf(xf); printf("sinf(x) "); mm256_print_ps(zf);
zf =_ZGVdN8vv_powf(xf, yf);printf("powf(x,y) "); mm256_print_ps(zf);
ptrs_lo = _mm256_set_epi64x((uint64_t)&zf_s[3],(uint64_t)&zf_s[2],(uint64_t)&zf_s[1],(uint64_t)&zf_s[0]);
ptrs_hi = _mm256_set_epi64x((uint64_t)&zf_s[7],(uint64_t)&zf_s[6],(uint64_t)&zf_s[5],(uint64_t)&zf_s[4]);
ptrc_lo = _mm256_set_epi64x((uint64_t)&zf_c[3],(uint64_t)&zf_c[2],(uint64_t)&zf_c[1],(uint64_t)&zf_c[0]);
ptrc_hi = _mm256_set_epi64x((uint64_t)&zf_c[7],(uint64_t)&zf_c[6],(uint64_t)&zf_c[5],(uint64_t)&zf_c[4]);
_ZGVdN8vvv_sincosf(xf, ptrs_lo, ptrs_hi, ptrc_lo, ptrc_hi); /* The results of _ZGVdN8vvv_sincosf are scattered to the adresses in ptrs_lo, ptrs_hi, ptrc_lo, and ptrc_hi */
printf("sincosf cos(x)%12.8f %12.8f %12.8f %12.8f %12.8f %12.8f %12.8f %12.8f \n",
zf_c[7], zf_c[6], zf_c[5], zf_c[4], zf_c[3], zf_c[2], zf_c[1], zf_c[0]);
printf("sincosf sin(x)%12.8f %12.8f %12.8f %12.8f %12.8f %12.8f %12.8f %12.8f \n",
zf_s[7], zf_s[6], zf_s[5], zf_s[4], zf_s[3], zf_s[2], zf_s[1], zf_s[0]);
return 0;
}
__attribute__ ((noinline)) int mm256_print_pd(__m256d x){
double vec_x[4];
_mm256_storeu_pd(vec_x,x);
printf("%12.8f %12.8f %12.8f %12.8f \n", vec_x[3], vec_x[2], vec_x[1], vec_x[0]);
return 0;
}
__attribute__ ((noinline)) int mm256_print_ps(__m256 x){
float vec_x[8];
_mm256_storeu_ps(vec_x,x);
printf("%12.8f %12.8f %12.8f %12.8f %12.8f %12.8f %12.8f %12.8f \n", vec_x[7], vec_x[6], vec_x[5], vec_x[4],
vec_x[3], vec_x[2], vec_x[1], vec_x[0]);
return 0;
}
/*
The output is as expected:
$ ./a.out
AVX2 Double precision examples
x 0.04000000 0.03000000 0.02000000 0.01000000
y 2.00000000 1.50000000 1.00000000 0.50000000
cos(x) 0.99920011 0.99955003 0.99980001 0.99995000
exp(x) 1.04081077 1.03045453 1.02020134 1.01005017
log(x) -3.21887582 -3.50655790 -3.91202301 -4.60517019
sin(x) 0.03998933 0.02999550 0.01999867 0.00999983
pow(x,y) 0.00160000 0.00519615 0.02000000 0.10000000
sincos cos(x) 0.99920011 0.99955003 0.99980001 0.99995000
sincos sin(x) 0.03998933 0.02999550 0.01999867 0.00999983
AVX2 Single precision examples
x 0.08000000 0.07000000 0.06000000 0.05000000 0.04000000 0.03000000 0.02000000 0.01000000
y 4.00000000 3.50000000 3.00000000 2.50000000 2.00000000 1.50000000 1.00000000 0.50000000
cosf(x) 0.99680173 0.99755102 0.99820060 0.99875027 0.99920011 0.99955004 0.99979997 0.99995005
expf(x) 1.08328700 1.07250810 1.06183648 1.05127108 1.04081070 1.03045452 1.02020133 1.01005018
logf(x) -2.52572870 -2.65926003 -2.81341076 -2.99573231 -3.21887589 -3.50655794 -3.91202307 -4.60517025
sinf(x) 0.07991469 0.06994285 0.05996400 0.04997917 0.03998933 0.02999550 0.01999867 0.00999983
powf(x,y) 0.00004096 0.00009075 0.00021600 0.00055902 0.00160000 0.00519615 0.02000000 0.10000000
sincosf cos(x) 0.99680173 0.99755102 0.99820060 0.99875027 0.99920011 0.99955004 0.99979997 0.99995005
sincosf sin(x) 0.07991469 0.06994285 0.05996400 0.04997917 0.03998933 0.02999550 0.01999867 0.00999983
Note that the vectorized sincos
functions scatters the results to 4 (double) or
8 (float), not necessarily contiguous, memory locations.
Function declarations for SSE4, AVX, and AVX2, with Intel SVML alternative.
The code block below shows the function declaration for many different cases,
and the corresponding SVML function from Intel's SVML library.
Somehow I couldn't find these declarations, with the right parameter lists,
on the internet.
/* Double precision Libmvec Intel SVML function */
/* SSE4 */
__m128d _ZGVbN2v_cos(__m128d x); /* _mm_cos_pd(x) */
__m128d _ZGVbN2v_exp(__m128d x); /* _mm_exp_pd(x) */
__m128d _ZGVbN2v_log(__m128d x); /* _mm_log_pd(x) */
__m128d _ZGVbN2v_sin(__m128d x); /* _mm_sin_pd(x) */
__m128d _ZGVbN2vv_pow(__m128d x, __m128d y); /* _mm_pow_pd(x, y) */
void _ZGVbN2vvv_sincos(__m128d x, __m128i ptrs, __m128i ptrc); /* _mm_sincos_pd */
/* AVX */
__m256d _ZGVcN4v_cos(__m256d x); /* _mm256_cos_pd(x) */
__m256d _ZGVcN4v_exp(__m256d x); /* _mm256_exp_pd(x) */
__m256d _ZGVcN4v_log(__m256d x); /* _mm256_log_pd(x) */
__m256d _ZGVcN4v_sin(__m256d x); /* _mm256_sin_pd(x) */
__m256d _ZGVcN4vv_pow(__m256d x, __m256d y); /* _mm256_pow_pd(x, y) */
void _ZGVcN4vvv_sincos(__m256d x, __m256i ptrs, __m256i ptrc); /* _mm256_sincos_pd */
/* AVX2 */
__m256d _ZGVdN4v_cos(__m256d x); /* _mm256_cos_pd(x) */
__m256d _ZGVdN4v_exp(__m256d x); /* _mm256_exp_pd(x) */
__m256d _ZGVdN4v_log(__m256d x); /* _mm256_log_pd(x) */
__m256d _ZGVdN4v_sin(__m256d x); /* _mm256_sin_pd(x) */
__m256d _ZGVdN4vv_pow(__m256d x, __m256d y); /* _mm256_pow_pd(x, y) */
void _ZGVdN4vvv_sincos(__m256d x, __m256i ptrs, __m256i ptrc); /* _mm256_sincos_pd */
/* Single precision Libmvec Intel SVML function */
/* SSE4 */
__m128 _ZGVbN4v_cosf(__m128 x); /* _mm_cos_ps(x) */
__m128 _ZGVbN4v_expf(__m128 x); /* _mm_exp_ps(x) */
__m128 _ZGVbN4v_logf(__m128 x); /* _mm_log_ps(x) */
__m128 _ZGVbN4v_sinf(__m128 x); /* _mm_sin_ps(x) */
__m128 _ZGVbN4vv_powf(__m128 x, __m128 y); /* _mm_pow_ps(x, y) */
void _ZGVbN4vvv_sincosf(__m128 x, __m128i ptrs_lo, __m128i ptrs_hi, __m128i ptrc_lo, __m128i ptrc_hi); /* _mm_sincos_ps */
/* AVX */
__m256 _ZGVcN8v_cosf(__m256 x); /* _mm256_cos_ps(x) */
__m256 _ZGVcN8v_expf(__m256 x); /* _mm256_exp_ps(x) */
__m256 _ZGVcN8v_logf(__m256 x); /* _mm256_log_ps(x) */
__m256 _ZGVcN8v_sinf(__m256 x); /* _mm256_sin_ps(x) */
__m256 _ZGVcN8vv_powf(__m256 x, __m256 y); /* _mm256_pow_ps(x, y) */
void _ZGVcN8vvv_sincosf(__m256 x, __m256i ptrs_lo, __m256i ptrs_hi, __m256i ptrc_lo, __m256i ptrc_hi); /* _mm256_sincos_ps */
/* AVX2 */
__m256 _ZGVdN8v_cosf(__m256 x); /* _mm256_cos_ps(x) */
__m256 _ZGVdN8v_expf(__m256 x); /* _mm256_exp_ps(x) */
__m256 _ZGVdN8v_logf(__m256 x); /* _mm256_log_ps(x) */
__m256 _ZGVdN8v_sinf(__m256 x); /* _mm256_sin_ps(x) */
__m256 _ZGVdN8vv_powf(__m256 x, __m256 y); /* _mm256_pow_ps(x, y) */
void _ZGVdN8vvv_sincosf(__m256 x, __m256i ptrs_lo, __m256i ptrs_hi, __m256i ptrc_lo, __m256i ptrc_hi); /* _mm256_sincos_ps */
/* Note that I only tested the AVX2 function declarations. */
/* AVX-512: _ZGVeN16v_cosf, _ZGVeN8v_cos etc. */
Useful links:
https://sourceware.org/glibc/wiki/libmvec
https://sourceware.org/glibc/wiki/libmvec?action=AttachFile&do=view&target=VectorABI.txt
https://github.com/sgallagher/glibc/blob/master/sysdeps/unix/sysv/linux/x86_64/libmvec.abilist
https://software.intel.com/sites/default/files/managed/b4/c8/Intel-Vector-Function-ABI.pdf