I am attempting to optimise some loops and I have managed but I wonder if I have only done it partially correct. Say for example that I have this loop:
for(i=0;i<n;i++){
b[i] = a[i]*2;
}
unrolling this by a factor of 3, produces this:
int unroll = (n/4)*4;
for(i=0;i<unroll;i+=4)
{
b[i] = a[i]*2;
b[i+1] = a[i+1]*2;
b[i+2] = a[i+2]*2;
b[i+3] = a[i+3]*2;
}
for(;i<n;i++)
{
b[i] = a[i]*2;
}
Now is the SSE translation equivalent:
__m128 ai_v = _mm_loadu_ps(&a[i]);
__m128 two_v = _mm_set1_ps(2);
__m128 ai2_v = _mm_mul_ps(ai_v, two_v);
_mm_storeu_ps(&b[i], ai2_v);
or is it:
__m128 ai_v = _mm_loadu_ps(&a[i]);
__m128 two_v = _mm_set1_ps(2);
__m128 ai2_v = _mm_mul_ps(ai_v, two_v);
_mm_storeu_ps(&b[i], ai2_v);
__m128 ai1_v = _mm_loadu_ps(&a[i+1]);
__m128 two1_v = _mm_set1_ps(2);
__m128 ai_1_2_v = _mm_mul_ps(ai1_v, two1_v);
_mm_storeu_ps(&b[i+1], ai_1_2_v);
__m128 ai2_v = _mm_loadu_ps(&a[i+2]);
__m128 two2_v = _mm_set1_ps(2);
__m128 ai_2_2_v = _mm_mul_ps(ai2_v, two2_v);
_mm_storeu_ps(&b[i+2], ai_2_2_v);
__m128 ai3_v = _mm_loadu_ps(&a[i+3]);
__m128 two3_v = _mm_set1_ps(2);
__m128 ai_3_2_v = _mm_mul_ps(ai3_v, two3_v);
_mm_storeu_ps(&b[i+3], ai_3_2_v);
I am slightly confused about the section of code:
for(;i<n;i++)
{
b[i] = a[i]*2;
}
what does this do? Is it just to do the extra parts for example if the loop is not dividable by the factor you choose to unroll it by? Thank you.
The answer is the first block:
__m128 ai_v = _mm_loadu_ps(&a[i]);
__m128 two_v = _mm_set1_ps(2);
__m128 ai2_v = _mm_mul_ps(ai_v,two_v);
_mm_storeu_ps(&b[i],ai2_v);
It already takes four variables at a time.
Here is the full program with the equivalent section of code commented out:
#include <iostream>
int main()
{
int i{0};
float a[10] ={1,2,3,4,5,6,7,8,9,10};
float b[10] ={0,0,0,0,0,0,0,0,0,0};
int n = 10;
int unroll = (n/4)*4;
for (i=0; i<unroll; i+=4) {
//b[i] = a[i]*2;
//b[i+1] = a[i+1]*2;
//b[i+2] = a[i+2]*2;
//b[i+3] = a[i+3]*2;
__m128 ai_v = _mm_loadu_ps(&a[i]);
__m128 two_v = _mm_set1_ps(2);
__m128 ai2_v = _mm_mul_ps(ai_v,two_v);
_mm_storeu_ps(&b[i],ai2_v);
}
for (; i<n; i++) {
b[i] = a[i]*2;
}
for (auto i : a) { std::cout << i << "\t"; }
std::cout << "\n";
for (auto i : b) { std::cout << i << "\t"; }
std::cout << "\n";
return 0;
}
As for efficiency; it seems that the assembly on my system generates movups
instructions, whereas the hand rolled code could be made to use movaps
which should be faster.
I used the following program to do some benchmarks:
#include <iostream>
//#define NO_UNROLL
//#define UNROLL
//#define SSE_UNROLL
#define SSE_UNROLL_ALIGNED
int main()
{
const size_t array_size = 100003;
#ifdef SSE_UNROLL_ALIGNED
__declspec(align(16)) int i{0};
__declspec(align(16)) float a[array_size] ={1,2,3,4,5,6,7,8,9,10};
__declspec(align(16)) float b[array_size] ={0,0,0,0,0,0,0,0,0,0};
#endif
#ifndef SSE_UNROLL_ALIGNED
int i{0};
float a[array_size] ={1,2,3,4,5,6,7,8,9,10};
float b[array_size] ={0,0,0,0,0,0,0,0,0,0};
#endif
int n = array_size;
int unroll = (n/4)*4;
for (size_t j{0}; j < 100000; ++j) {
#ifdef NO_UNROLL
for (i=0; i<n; i++) {
b[i] = a[i]*2;
}
#endif
#ifdef UNROLL
for (i=0; i<unroll; i+=4) {
b[i] = a[i]*2;
b[i+1] = a[i+1]*2;
b[i+2] = a[i+2]*2;
b[i+3] = a[i+3]*2;
}
#endif
#ifdef SSE_UNROLL
for (i=0; i<unroll; i+=4) {
__m128 ai_v = _mm_loadu_ps(&a[i]);
__m128 two_v = _mm_set1_ps(2);
__m128 ai2_v = _mm_mul_ps(ai_v,two_v);
_mm_storeu_ps(&b[i],ai2_v);
}
#endif
#ifdef SSE_UNROLL_ALIGNED
for (i=0; i<unroll; i+=4) {
__m128 ai_v = _mm_load_ps(&a[i]);
__m128 two_v = _mm_set1_ps(2);
__m128 ai2_v = _mm_mul_ps(ai_v,two_v);
_mm_store_ps(&b[i],ai2_v);
}
#endif
#ifndef NO_UNROLL
for (; i<n; i++) {
b[i] = a[i]*2;
}
#endif
}
//for (auto i : a) { std::cout << i << "\t"; }
//std::cout << "\n";
//for (auto i : b) { std::cout << i << "\t"; }
//std::cout << "\n";
return 0;
}
I got the following results (x86):
NO_UNROLL
: 0.994 seconds, no SSE chosen by compiler
UNROLL
: 3.511 seconds, uses movups
SSE_UNROLL
: 3.315 seconds, uses movups
SSE_UNROLL_ALIGNED
: 3.276 seconds, uses movaps
So it is clear that unrolling the loop has not helped in this case. Even ensuring that we use the more efficient movaps
doesn't help much.
But I got an even stranger result when compiling to 64 bit (x64):
NO_UNROLL
: 1.138 seconds, no SSE chosen by compiler
UNROLL
: 1.409 seconds, no SSE chosen by compiler
SSE_UNROLL
: 1.420 seconds, still no SSE chosen by compiler!
SSE_UNROLL_ALIGNED
: 1.476 seconds, still no SSE chosen by compiler!
It seems MSVC sees through the proposal and generates better assembly regardless, albeit still slower than had we not tried any hand optimization at all.
As usual, it is not efficient to unroll loops and try to match SSE instructions manually. Compilers can do it better than you. For example, the provided sample is compiled into SSE-enabled ASM automatically:
foo:
.LFB0:
.cfi_startproc
testl %edi, %edi
jle .L7
movl %edi, %esi
shrl $2, %esi
cmpl $3, %edi
leal 0(,%rsi,4), %eax
jbe .L8
testl %eax, %eax
je .L8
vmovdqa .LC0(%rip), %xmm1
xorl %edx, %edx
xorl %ecx, %ecx
.p2align 4,,10
.p2align 3
.L6:
addl $1, %ecx
vpmulld a(%rdx), %xmm1, %xmm0
vmovdqa %xmm0, b(%rdx)
addq $16, %rdx
cmpl %esi, %ecx
jb .L6
cmpl %eax, %edi
je .L7
.p2align 4,,10
.p2align 3
.L9:
movslq %eax, %rdx
addl $1, %eax
movl a(,%rdx,4), %ecx
addl %ecx, %ecx
cmpl %eax, %edi
movl %ecx, b(,%rdx,4)
jg .L9
.L7:
rep
ret
.L8:
xorl %eax, %eax
jmp .L9
.cfi_endproc
Loops can be unrolled as well, it would just make for a longer code, which I do not want to paster here. You can trust me - compilers do unroll loops.
Conclusion
Manual unrolling will do you no good.