I found an interesting Gamasutra article about SIMD pitfalls, which claims that it is not possible to reach the performance of the "pure" __m128 type with wrapper types. I was skeptical, so I downloaded the project files and put together a comparable test case.
It turned out (to my surprise) that the wrapper version is significantly slower. Since I don't want to argue from thin air, the test cases are the following:
In the 1st case Vec4 is a simple alias of the __m128 type with some operators:
#include <xmmintrin.h>
#include <emmintrin.h>

using Vec4 = __m128;

inline __m128 VLoad(float f)
{
    return _mm_set_ps(f, f, f, f);
}

inline Vec4& operator+=(Vec4 &va, Vec4 vb)
{
    return (va = _mm_add_ps(va, vb));
}

inline Vec4& operator*=(Vec4 &va, Vec4 vb)
{
    return (va = _mm_mul_ps(va, vb));
}

inline Vec4 operator+(Vec4 va, Vec4 vb)
{
    return _mm_add_ps(va, vb);
}

inline Vec4 operator-(Vec4 va, Vec4 vb)
{
    return _mm_sub_ps(va, vb);
}

inline Vec4 operator*(Vec4 va, Vec4 vb)
{
    return _mm_mul_ps(va, vb);
}
In the 2nd case Vec4 is a lightweight wrapper around __m128. It is not a complete wrapper, just a short sketch that covers the issue. The arithmetic functions wrap exactly the same intrinsics; the only difference is that they take Vec4 by const reference, since 16-byte alignment cannot be applied to by-value function arguments (see the by-value sketch after the listing):
#include <xmmintrin.h>
#include <emmintrin.h>

struct Vec4
{
    __m128 simd;

    inline Vec4() = default;
    inline Vec4(const Vec4&) = default;
    inline Vec4& operator=(const Vec4&) = default;

    inline Vec4(__m128 s)
        : simd(s)
    {}

    inline operator __m128() const
    {
        return simd;
    }

    inline operator __m128&()
    {
        return simd;
    }
};

inline __m128 VLoad(float f)
{
    return _mm_set_ps(f, f, f, f);
}

inline Vec4 VAdd(const Vec4 &va, const Vec4 &vb)
{
    return _mm_add_ps(va, vb);
    // return _mm_add_ps(va.simd, vb.simd); // makes no difference
}

inline Vec4 VSub(const Vec4 &va, const Vec4 &vb)
{
    return _mm_sub_ps(va, vb);
    // return _mm_sub_ps(va.simd, vb.simd); // makes no difference
}

inline Vec4 VMul(const Vec4 &va, const Vec4 &vb)
{
    return _mm_mul_ps(va, vb);
    // return _mm_mul_ps(va.simd, vb.simd); // makes no difference
}
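As an aside, I believe MSVC's __vectorcall convention can pass such a wrapper by value, since a struct with a single __m128 member should qualify as a homogeneous vector aggregate and travel in XMM registers. A sketch of that by-value variant (an untested assumption on my part, not part of the measured code):

// By-value variant using MSVC's __vectorcall (assumption: Vec4 is a
// homogeneous vector aggregate, so it is passed in XMM registers instead
// of being spilled to the stack).
inline Vec4 __vectorcall VAddV(Vec4 va, Vec4 vb)
{
    return _mm_add_ps(va.simd, vb.simd);
}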
And here is the test kernel, which produces different performance with the different versions of Vec4:
#include <xmmintrin.h>
#include <emmintrin.h>

struct EQSTATE
{
    // Filter #1 (Low band)
    Vec4 lf;    // Frequency
    Vec4 f1p0;  // Poles ...
    Vec4 f1p1;
    Vec4 f1p2;
    Vec4 f1p3;

    // Filter #2 (High band)
    Vec4 hf;    // Frequency
    Vec4 f2p0;  // Poles ...
    Vec4 f2p1;
    Vec4 f2p2;
    Vec4 f2p3;

    // Sample history buffer
    Vec4 sdm1;  // Sample data minus 1
    Vec4 sdm2;  //                   2
    Vec4 sdm3;  //                   3

    // Gain Controls
    Vec4 lg;    // low  gain
    Vec4 mg;    // mid  gain
    Vec4 hg;    // high gain
};
static float vsaf = (1.0f / 4294967295.0f); // Very small amount (Denormal Fix)
static Vec4 vsa = VLoad(vsaf);
Vec4 TestEQ(EQSTATE* es, Vec4& sample)
{
    // Locals
    Vec4 l, m, h; // Low / Mid / High - Sample Values

    // Filter #1 (lowpass)
    es->f1p0 += (es->lf * (sample - es->f1p0)) + vsa;
    //es->f1p0 = VAdd(es->f1p0, VAdd(VMul(es->lf, VSub(sample, es->f1p0)), vsa));

    es->f1p1 += (es->lf * (es->f1p0 - es->f1p1));
    //es->f1p1 = VAdd(es->f1p1, VMul(es->lf, VSub(es->f1p0, es->f1p1)));

    es->f1p2 += (es->lf * (es->f1p1 - es->f1p2));
    //es->f1p2 = VAdd(es->f1p2, VMul(es->lf, VSub(es->f1p1, es->f1p2)));

    es->f1p3 += (es->lf * (es->f1p2 - es->f1p3));
    //es->f1p3 = VAdd(es->f1p3, VMul(es->lf, VSub(es->f1p2, es->f1p3)));

    l = es->f1p3;

    // Filter #2 (highpass)
    es->f2p0 += (es->hf * (sample - es->f2p0)) + vsa;
    //es->f2p0 = VAdd(es->f2p0, VAdd(VMul(es->hf, VSub(sample, es->f2p0)), vsa));

    es->f2p1 += (es->hf * (es->f2p0 - es->f2p1));
    //es->f2p1 = VAdd(es->f2p1, VMul(es->hf, VSub(es->f2p0, es->f2p1)));

    es->f2p2 += (es->hf * (es->f2p1 - es->f2p2));
    //es->f2p2 = VAdd(es->f2p2, VMul(es->hf, VSub(es->f2p1, es->f2p2)));

    es->f2p3 += (es->hf * (es->f2p2 - es->f2p3));
    //es->f2p3 = VAdd(es->f2p3, VMul(es->hf, VSub(es->f2p2, es->f2p3)));

    h = es->sdm3 - es->f2p3;
    //h = VSub(es->sdm3, es->f2p3);

    // Calculate midrange (signal - (low + high))
    m = es->sdm3 - (h + l);
    //m = VSub(es->sdm3, VAdd(h, l));

    // Scale, Combine and store
    l *= es->lg;
    m *= es->mg;
    h *= es->hg;
    //l = VMul(l, es->lg);
    //m = VMul(m, es->mg);
    //h = VMul(h, es->hg);

    // Shuffle history buffer
    es->sdm3 = es->sdm2;
    es->sdm2 = es->sdm1;
    es->sdm1 = sample;

    // Return result
    return (l + m + h);
    //return VAdd(l, VAdd(m, h));
}
// make these globals to enforce an actual function call
static Vec4 sample[1024], result[1024];
static EQSTATE es;
#include <chrono>
#include <iostream>

int main()
{
    auto t0 = std::chrono::high_resolution_clock::now();

    for (int ii = 0; ii < 1024; ii++)
    {
        result[ii] = TestEQ(&es, sample[ii]);
    }

    auto t1 = std::chrono::high_resolution_clock::now();
    auto t = std::chrono::duration_cast<std::chrono::nanoseconds>(t1 - t0).count();
    std::cout << "timing: " << t << '\n';

    std::cin.get();
    return 0;
}
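A single pass over 1024 samples is a fairly noisy measurement; below is a sketch of a slightly more robust harness (my own addition, driving the same kernel and globals) that repeats the pass and keeps the fastest run:

#include <limits>

// Sketch: repeat the whole pass and keep the best run, so one-off cache
// misses and scheduler noise don't dominate the measurement.
long long BestOfRuns(int runs)
{
    long long best = std::numeric_limits<long long>::max();
    for (int r = 0; r < runs; r++)
    {
        auto t0 = std::chrono::high_resolution_clock::now();
        for (int ii = 0; ii < 1024; ii++)
            result[ii] = TestEQ(&es, sample[ii]);
        auto t1 = std::chrono::high_resolution_clock::now();
        long long t = std::chrono::duration_cast<std::chrono::nanoseconds>(t1 - t0).count();
        if (t < best)
            best = t;
    }
    return best;
}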
Link to working code: https://godbolt.org/g/fZ8X0N
MSVC 2015 generated assembly for the 1st version:
; COMDAT ?TestEQ@@YA?AT__m128@@PAUEQSTATE@@AAT1@@Z
_TEXT SEGMENT
?TestEQ@@YA?AT__m128@@PAUEQSTATE@@AAT1@@Z PROC ; TestEQ, COMDAT
; _es$dead$ = ecx
; _sample$ = edx
vmovaps xmm0, XMMWORD PTR [edx]
vsubps xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+16
vmovaps xmm2, XMMWORD PTR ?es@@3UEQSTATE@@A
vmulps xmm0, xmm0, xmm2
vaddps xmm0, xmm0, XMMWORD PTR ?vsa@@3T__m128@@A
vaddps xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+16
vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+16, xmm0
vsubps xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+32
vmulps xmm0, xmm0, xmm2
vaddps xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+32
vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+32, xmm0
vsubps xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+48
vmulps xmm0, xmm0, xmm2
vaddps xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+48
vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+48, xmm0
vsubps xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+64
vmulps xmm0, xmm0, xmm2
vaddps xmm4, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+64
vmovaps xmm2, XMMWORD PTR ?es@@3UEQSTATE@@A+80
vmovaps xmm1, XMMWORD PTR ?es@@3UEQSTATE@@A+192
vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+64, xmm4
vmovaps xmm0, XMMWORD PTR [edx]
vsubps xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+96
vmulps xmm0, xmm0, xmm2
vaddps xmm0, xmm0, XMMWORD PTR ?vsa@@3T__m128@@A
vaddps xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+96
vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+96, xmm0
vsubps xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+112
vmulps xmm0, xmm0, xmm2
vaddps xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+112
vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+112, xmm0
vsubps xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+128
vmulps xmm0, xmm0, xmm2
vaddps xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+128
vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+128, xmm0
vsubps xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+144
vmulps xmm0, xmm0, xmm2
vaddps xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+144
vsubps xmm2, xmm1, xmm0
vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+144, xmm0
vmovaps xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+176
vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+192, xmm0
vmovaps xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+160
vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+176, xmm0
vmovaps xmm0, XMMWORD PTR [edx]
vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+160, xmm0
vaddps xmm0, xmm4, xmm2
vsubps xmm0, xmm1, xmm0
vmulps xmm1, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+224
vmulps xmm0, xmm2, XMMWORD PTR ?es@@3UEQSTATE@@A+240
vaddps xmm1, xmm1, xmm0
vmulps xmm0, xmm4, XMMWORD PTR ?es@@3UEQSTATE@@A+208
vaddps xmm0, xmm1, xmm0
ret 0
?TestEQ@@YA?AT__m128@@PAUEQSTATE@@AAT1@@Z ENDP ; TestEQ
MSVC 2015 generated assembly for the 2nd version:
?TestEQ@@YA?AUVec4@VMATH@@PAUEQSTATE@@AAU12@@Z PROC ; TestEQ, COMDAT
; ___$ReturnUdt$ = ecx
; _es$dead$ = edx
push ebx
mov ebx, esp
sub esp, 8
and esp, -8 ; fffffff8H
add esp, 4
push ebp
mov ebp, DWORD PTR [ebx+4]
mov eax, DWORD PTR _sample$[ebx]
vmovaps xmm2, XMMWORD PTR ?es@@3UEQSTATE@@A
vmovaps xmm1, XMMWORD PTR ?es@@3UEQSTATE@@A+192
mov DWORD PTR [esp+4], ebp
vmovaps xmm0, XMMWORD PTR [eax]
vsubps xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+16
vmulps xmm0, xmm0, xmm2
vaddps xmm0, xmm0, XMMWORD PTR ?vsa@@3UVec4@VMATH@@A
vaddps xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+16
vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+16, xmm0
vsubps xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+32
vmulps xmm0, xmm0, xmm2
vaddps xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+32
vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+32, xmm0
vsubps xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+48
vmulps xmm0, xmm0, xmm2
vaddps xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+48
vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+48, xmm0
vsubps xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+64
vmulps xmm0, xmm0, xmm2
vaddps xmm4, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+64
vmovaps xmm2, XMMWORD PTR ?es@@3UEQSTATE@@A+80
vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+64, xmm4
vmovaps xmm0, XMMWORD PTR [eax]
vsubps xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+96
vmulps xmm0, xmm0, xmm2
vaddps xmm0, xmm0, XMMWORD PTR ?vsa@@3UVec4@VMATH@@A
vaddps xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+96
vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+96, xmm0
vsubps xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+112
vmulps xmm0, xmm0, xmm2
vaddps xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+112
vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+112, xmm0
vsubps xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+128
vmulps xmm0, xmm0, xmm2
vaddps xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+128
vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+128, xmm0
vsubps xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+144
vmulps xmm0, xmm0, xmm2
vaddps xmm0, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+144
vsubps xmm2, xmm1, xmm0
vmovaps XMMWORD PTR ?es@@3UEQSTATE@@A+144, xmm0
vaddps xmm0, xmm2, xmm4
vsubps xmm0, xmm1, xmm0
vmulps xmm1, xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+224
vmovdqu xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+176
vmovdqu XMMWORD PTR ?es@@3UEQSTATE@@A+192, xmm0
vmovdqu xmm0, XMMWORD PTR ?es@@3UEQSTATE@@A+160
vmovdqu XMMWORD PTR ?es@@3UEQSTATE@@A+176, xmm0
vmovdqu xmm0, XMMWORD PTR [eax]
vmovdqu XMMWORD PTR ?es@@3UEQSTATE@@A+160, xmm0
vmulps xmm0, xmm4, XMMWORD PTR ?es@@3UEQSTATE@@A+208
vaddps xmm1, xmm0, xmm1
vmulps xmm0, xmm2, XMMWORD PTR ?es@@3UEQSTATE@@A+240
vaddps xmm0, xmm1, xmm0
vmovaps XMMWORD PTR [ecx], xmm0
mov eax, ecx
pop ebp
mov esp, ebx
pop ebx
ret 0
?TestEQ@@YA?AUVec4@VMATH@@PAUEQSTATE@@AAU12@@Z ENDP ; TestEQ
The assembly produced for the 2nd version is significantly longer and slower. This is not strictly a Visual Studio issue, since Clang 3.8 produces similar performance results.
Clang 3.8 generated assembly for the 1st version:
"?TestEQ@@YAT__m128@@PAUEQSTATE@@AAT1@@Z": # @"\01?TestEQ@@YAT__m128@@PAUEQSTATE@@AAT1@@Z"
Lfunc_begin0:
Ltmp0:
# BB#0: # %entry
movl 8(%esp), %eax
movl 4(%esp), %ecx
vmovaps _vsa, %xmm0
vmovaps (%ecx), %xmm1
vmovaps 16(%ecx), %xmm2
vmovaps (%eax), %xmm3
vsubps %xmm2, %xmm3, %xmm3
vmulps %xmm3, %xmm1, %xmm3
vaddps %xmm3, %xmm0, %xmm3
vaddps %xmm3, %xmm2, %xmm2
vmovaps %xmm2, 16(%ecx)
vmovaps 32(%ecx), %xmm3
vsubps %xmm3, %xmm2, %xmm2
vmulps %xmm2, %xmm1, %xmm2
vaddps %xmm2, %xmm3, %xmm2
vmovaps %xmm2, 32(%ecx)
vmovaps 48(%ecx), %xmm3
vsubps %xmm3, %xmm2, %xmm2
vmulps %xmm2, %xmm1, %xmm2
vaddps %xmm2, %xmm3, %xmm2
vmovaps %xmm2, 48(%ecx)
vmovaps 64(%ecx), %xmm3
vsubps %xmm3, %xmm2, %xmm2
vmulps %xmm2, %xmm1, %xmm1
vaddps %xmm1, %xmm3, %xmm1
vmovaps %xmm1, 64(%ecx)
vmovaps 80(%ecx), %xmm2
vmovaps 96(%ecx), %xmm3
vmovaps (%eax), %xmm4
vsubps %xmm3, %xmm4, %xmm4
vmulps %xmm4, %xmm2, %xmm4
vaddps %xmm4, %xmm0, %xmm0
vaddps %xmm0, %xmm3, %xmm0
vmovaps %xmm0, 96(%ecx)
vmovaps 112(%ecx), %xmm3
vsubps %xmm3, %xmm0, %xmm0
vmulps %xmm0, %xmm2, %xmm0
vaddps %xmm0, %xmm3, %xmm0
vmovaps %xmm0, 112(%ecx)
vmovaps 128(%ecx), %xmm3
vsubps %xmm3, %xmm0, %xmm0
vmulps %xmm0, %xmm2, %xmm0
vaddps %xmm0, %xmm3, %xmm0
vmovaps %xmm0, 128(%ecx)
vmovaps 144(%ecx), %xmm3
vsubps %xmm3, %xmm0, %xmm0
vmulps %xmm0, %xmm2, %xmm0
vaddps %xmm0, %xmm3, %xmm0
vmovaps %xmm0, 144(%ecx)
vmovaps 192(%ecx), %xmm2
vsubps %xmm0, %xmm2, %xmm0
vaddps %xmm0, %xmm1, %xmm3
vsubps %xmm3, %xmm2, %xmm2
vmulps 208(%ecx), %xmm1, %xmm1
vmulps 224(%ecx), %xmm2, %xmm2
vmulps 240(%ecx), %xmm0, %xmm0
vmovaps 176(%ecx), %xmm3
vmovaps %xmm3, 192(%ecx)
vmovaps 160(%ecx), %xmm3
vmovaps %xmm3, 176(%ecx)
vmovaps (%eax), %xmm3
vmovaps %xmm3, 160(%ecx)
vaddps %xmm2, %xmm0, %xmm0
vaddps %xmm0, %xmm1, %xmm0
retl
Lfunc_end0:
Clang 3.8 generated assembly for the 2nd version:
"?TestEQ@@YA?AUVec4@@PAUEQSTATE@@AAU1@@Z": # @"\01?TestEQ@@YA?AUVec4@@PAUEQSTATE@@AAU1@@Z"
Lfunc_begin0:
Ltmp0:
# BB#0: # %entry
movl 12(%esp), %ecx
movl 8(%esp), %edx
vmovaps (%edx), %xmm0
vmovaps 16(%edx), %xmm1
vmovaps (%ecx), %xmm2
vsubps %xmm1, %xmm2, %xmm2
vmulps %xmm0, %xmm2, %xmm2
vaddps _vsa, %xmm2, %xmm2
vaddps %xmm2, %xmm1, %xmm1
vmovaps %xmm1, 16(%edx)
vmovaps 32(%edx), %xmm2
vsubps %xmm2, %xmm1, %xmm1
vmulps %xmm0, %xmm1, %xmm1
vaddps %xmm1, %xmm2, %xmm1
vmovaps %xmm1, 32(%edx)
vmovaps 48(%edx), %xmm2
vsubps %xmm2, %xmm1, %xmm1
vmulps %xmm0, %xmm1, %xmm1
vaddps %xmm1, %xmm2, %xmm1
vmovaps %xmm1, 48(%edx)
vmovaps 64(%edx), %xmm2
vsubps %xmm2, %xmm1, %xmm1
vmulps %xmm0, %xmm1, %xmm0
vaddps %xmm0, %xmm2, %xmm0
vmovaps %xmm0, 64(%edx)
vmovaps 80(%edx), %xmm1
vmovaps 96(%edx), %xmm2
vmovaps (%ecx), %xmm3
vsubps %xmm2, %xmm3, %xmm3
vmulps %xmm1, %xmm3, %xmm3
vaddps _vsa, %xmm3, %xmm3
vaddps %xmm3, %xmm2, %xmm2
vmovaps %xmm2, 96(%edx)
vmovaps 112(%edx), %xmm3
vsubps %xmm3, %xmm2, %xmm2
vmulps %xmm1, %xmm2, %xmm2
vaddps %xmm2, %xmm3, %xmm2
vmovaps %xmm2, 112(%edx)
vmovaps 128(%edx), %xmm3
vsubps %xmm3, %xmm2, %xmm2
vmulps %xmm1, %xmm2, %xmm2
vaddps %xmm2, %xmm3, %xmm2
vmovaps %xmm2, 128(%edx)
vmovaps 144(%edx), %xmm3
vsubps %xmm3, %xmm2, %xmm2
vmulps %xmm1, %xmm2, %xmm1
vaddps %xmm1, %xmm3, %xmm1
vmovaps %xmm1, 144(%edx)
vmovaps 192(%edx), %xmm2
vsubps %xmm1, %xmm2, %xmm1
vaddps %xmm1, %xmm0, %xmm3
vsubps %xmm3, %xmm2, %xmm2
vmulps 208(%edx), %xmm0, %xmm0
vmulps 224(%edx), %xmm2, %xmm2
movl 4(%esp), %eax
vmulps 240(%edx), %xmm1, %xmm1
vmovaps 176(%edx), %xmm3
vmovaps %xmm3, 192(%edx)
vmovaps 160(%edx), %xmm3
vmovaps %xmm3, 176(%edx)
vmovaps (%ecx), %xmm3
vmovaps %xmm3, 160(%edx)
vaddps %xmm2, %xmm0, %xmm0
vaddps %xmm0, %xmm1, %xmm0
vmovaps %xmm0, (%eax)
retl
Lfunc_end0:
Although the number of instructions is almost the same, the 1st version is still about 50% faster.
I tried to identify the cause of the issue, without success. There are suspicious things, like those ugly vmovdqu instructions in the 2nd MSVC assembly, and the fact that the 2nd version returns the result through a hidden pointer (___$ReturnUdt$ in ecx) rather than directly in xmm0. Construction, the copy assignment operator, and pass-by-reference can also needlessly move the data from SSE registers back to memory, but all my attempts to solve, or even exactly pinpoint, the issue were unsuccessful.
I really don't believe that such a simple wrapper cannot reach the same performance as the bare __m128; whatever causes the overhead, it should be possible to eliminate it.
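The hidden return pointer suggests one more experiment (my own sketch; I haven't verified that it helps): hand the result back as a raw __m128 through a thin forwarder, so the ABI can return it in xmm0, while keeping the wrapper everywhere else:

// Hypothetical forwarder: relies on Vec4's implicit conversion to __m128,
// so only the return convention changes, not the kernel itself.
__m128 TestEQRaw(EQSTATE* es, Vec4& sample)
{
    return TestEQ(es, sample);
}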
So what is going on there?