SSE 4 instructions generated by Visual Studio 2013

If I compile this code in VS 2013 Update 2 or Update 3: (below comes from Update 3)

#include "stdafx.h"
#include <iostream>
#include <random>

struct Buffer
{
  long* data;
  int   count;
};

#ifndef max
#define max(a,b)            (((a) > (b)) ? (a) : (b))
#endif

long Code(long* data, int count)
{
  long nMaxY = data[0];

  for (int nNode = 0; nNode < count; nNode++)
  {
    nMaxY = max(data[nNode], nMaxY);
  }

  return(nMaxY);
}

int _tmain(int argc, _TCHAR* argv[])
{
#ifdef __AVX__
  static_assert(false, "AVX should be disabled");
#endif
#ifdef __AVX2__
  static_assert(false, "AVX2 should be disabled");
#endif
  static_assert(_M_IX86_FP == 2, "SSE2 instructions should be enabled");
  Buffer buff;
  std::mt19937 engine;
  engine.seed(std::random_device{}());
  std::uniform_int_distribution<int> distribution(0, 100);

  buff.count = 1;
  buff.data = new long[1];
  buff.data[0] = distribution(engine);

  long result = Code(buff.data, buff.count);
  std::cout << result; // ensure result is used
  return result;
}

with SSE2 instructions enabled, but not AVX/AVX2, the compiler in release generates:

  {
    nMaxY = max(data[nNode], nMaxY);
010612E1  movdqu      xmm0,xmmword ptr [eax]  
010612E5  add         esi,8  
010612E8  lea         eax,[eax+20h]  
010612EB  pmaxsd      xmm1,xmm0  
010612F0  movdqu      xmm0,xmmword ptr [eax-10h]  
010612F5  pmaxsd      xmm2,xmm0  
010612FA  cmp         esi,ebx  
010612FC  jl          Code+41h (010612E1h)  
010612FE  pmaxsd      xmm1,xmm2  
01061303  movdqa      xmm0,xmm1  
01061307  psrldq      xmm0,8  
0106130C  pmaxsd      xmm1,xmm0  
01061311  movdqa      xmm0,xmm1  
01061315  psrldq      xmm0,4  
0106131A  pmaxsd      xmm1,xmm0  
0106131F  movd        eax,xmm1  
01061323  pop         ebx  
  long nMaxY = data[0];

which contains, among other things, pmaxsd instructions.

pmaxsd instructions are SSE4_1 instructions or AVX instructions as far as I can tell, not SSE2 instructions.

Intel core2s support sse3, but not sse4, and not pmaxsd.

This does not occur in VS2013 update 1 or update 0.

Is there a way to get Visual Studio to generate SSE2 instructions but not SSE4 instructions like pmaxsd? Is this a known bug in Visual Studio update 2/3? Is there a workaround? Does Visual Studio no longer support Core2 processors?

Here is a more complex version of the above code that compiles (under default release settings) to code that crashes a Core2 CPU:

#include "stdafx.h"
#include <iostream>
#include <random>
#include <array>

enum unused_name {
  _nNumPolygons = 10,
};


#ifndef max
#define max(a,b)            (((a) > (b)) ? (a) : (b))
#endif

struct Buffer
{
  std::array<long*, _nNumPolygons> data;
  std::array<int, _nNumPolygons>   count;
};

long Code(Buffer* buff)
{
  long  nMaxY = buff->data[0][0];


  for (int nPoly = 0; nPoly < _nNumPolygons; nPoly++)
  {
    for (int nNode = 0; nNode < buff->count[nPoly]; nNode++)
    {
      nMaxY = max(buff->data[nPoly][nNode], nMaxY);
    }
  }

  return(nMaxY);
}

extern "C" __int32 __isa_available;

int _tmain(int argc, _TCHAR* argv[])
{
#ifdef __AVX__
  static_assert(false, "AVX should be disabled");
#endif
#ifdef __AVX2__
  static_assert(false, "AVX2 should be disabled");
#endif
#if !( defined( _M_AMD64 ) || defined( _M_X64 ) )
  static_assert(_M_IX86_FP == 2, "SSE2 instructions should be enabled");
#endif
  // __isa_available = 1; // to force code to act as if SSE4_2 is not available
  Buffer buff;
  std::mt19937 engine;
  engine.seed(std::random_device{}());
  std::uniform_int_distribution<int> distribution(0, 100);

  for (int i = 0; i < _nNumPolygons; ++i) {
    buff.count[i] = 10;
    buff.data[i] = new long[10];
    for (int k = 0; k < 10; ++k)
    {
      buff.data[i][k] = distribution(engine);
    }
  }

  long result = Code(&buff);
  std::cout << result; // ensure result is used
  return result;
}

Here is a link to a bug for this issue that someone else opened around the same time I posted this question.

Here is the generated .asm:

?Code2@@YAJPAUBuffer@@@Z PROC        ; Code2, COMDAT
; _buff$ = ecx
; File c:\users\adam.nevraumont.corelcorp.000\documents\visual studio 2013\projects\consoleapplication1\consoleapplication1\consoleapplication1.cpp
; Line 22
  push  ebp
  mov  ebp, esp
  sub  esp, 12          ; 0000000cH
  push  ebx
  push  esi
  push  edi
  mov  edi, ecx
; Line 26
  xor  ebx, ebx
  mov  DWORD PTR _buff$1$[ebp], edi
  mov  DWORD PTR _nPoly$1$[ebp], ebx
  mov  eax, DWORD PTR [edi]
  mov  edx, DWORD PTR [eax]
; Line 28
  movd  xmm0, edx
  pshufd  xmm1, xmm0, 0
  movdqa  xmm2, xmm1
  npad  12
$LL6@Code2:
  lea  ecx, DWORD PTR [ebx*4]
  xor  eax, eax
  mov  esi, DWORD PTR [ecx+edi+40]
  mov  DWORD PTR tv443[ebp], ecx
  test  esi, esi
  jle  SHORT $LN5@Code2
  cmp  esi, 8
  jb  SHORT $LN25@Code2
  cmp  DWORD PTR ___isa_available, 2
  jl  SHORT $LN25@Code2
; Line 26
  mov  ebx, DWORD PTR [ecx+edi]
  mov  ecx, esi
  and  ecx, -2147483641      ; 80000007H
  jns  SHORT $LN33@Code2
  dec  ecx
  or  ecx, -8          ; fffffff8H
  inc  ecx
$LN33@Code2:
  mov  edi, esi
  sub  edi, ecx
  npad  8
$LL3@Code2:
; Line 30
  movdqu  xmm0, XMMWORD PTR [ebx+eax*4]
  pmaxsd  xmm1, xmm0
  movdqu  xmm0, XMMWORD PTR [ebx+eax*4+16]
  add  eax, 8
  pmaxsd  xmm2, xmm0
  cmp  eax, edi
  jl  SHORT $LL3@Code2
  mov  ebx, DWORD PTR _nPoly$1$[ebp]
  mov  ecx, DWORD PTR tv443[ebp]
  mov  edi, DWORD PTR _buff$1$[ebp]
$LN25@Code2:
; Line 28
  cmp  eax, esi
  jge  SHORT $LN5@Code2
; Line 26
  mov  edi, DWORD PTR [ecx+edi]
  npad  4
$LL23@Code2:
; Line 30
  cmp  DWORD PTR [edi+eax*4], edx
  cmovg  edx, DWORD PTR [edi+eax*4]
  inc  eax
  cmp  eax, esi
  jl  SHORT $LL23@Code2
$LN5@Code2:
; Line 26
  mov  edi, DWORD PTR _buff$1$[ebp]
  inc  ebx
  mov  DWORD PTR _nPoly$1$[ebp], ebx
  cmp  ebx, 10          ; 0000000aH
  jl  $LL6@Code2
; Line 28
  movd  xmm0, edx
  pshufd  xmm0, xmm0, 0
  pmaxsd  xmm1, xmm0
  pmaxsd  xmm1, xmm2
  movdqa  xmm0, xmm1
  psrldq  xmm0, 8
  pmaxsd  xmm1, xmm0
  movdqa  xmm0, xmm1
  pop  edi
  psrldq  xmm0, 4
  pmaxsd  xmm1, xmm0
  pop  esi
  movd  eax, xmm1
  pop  ebx
; Line 35
  mov  esp, ebp
  pop  ebp
  ret  0

Here:

  cmp  esi, 8
  jb  SHORT $LN25@Code2
  cmp  DWORD PTR ___isa_available, 2
  jl  SHORT $LN25@Code2

we have the test that branches to the "single step" version if either (A) the loop is less than 8 long, or (B) we don't have SSE3/SSE4 support.

The single step version is:

$LN5@Code2:
; Line 26
  mov  edi, DWORD PTR _buff$1$[ebp]
  inc  ebx
  mov  DWORD PTR _nPoly$1$[ebp], ebx
  cmp  ebx, 10          ; 0000000aH
  jl  $LL6@Code2

which has no SSE instructions. However, the important part is the fall through. If eax (the iteration parameter) passes 10, it falls through into:

; Line 28
  movd  xmm0, edx
  pshufd  xmm0, xmm0, 0
  pmaxsd  xmm1, xmm0

which is code that finds the max of both the single step version results and the SSE4 results. The 3rd instruction is pmaxsd, which is an SSE4_1 instruction, and it is not guarded by __isa_available.

Is there a compiler setting or workaround that can leave the auto-vectorization intact, while not invoking SSE4_1 instructions on Core2 SSE2 enabled computers? Is there a bug in my code that is causing this to happen?

Note that my attempts to remove the double-nested nature of the loop seem to make the problem go away.

This is documented behaviour:

The Auto-Vectorizer also uses the newer, SSE4.2 instruction set if your computer supports it.

If you look closer at the code the compiler generates you'll see that the use of the SSE4.2 instructions is dependent on a runtime test:

cmp DWORD PTR ___isa_available, 2
jl  SHORT $LN11@Code

The value 2 here apparently means SSE4.2.

I was however able to confirm the bug in your second example. It turns out the Core 2 PC I was using supports SSE4.1 and the PMAXSD instruction, so I had to test it in on a PC with a Pentium 4 CPU to get the illegal instruction exception. You should submit a bug report to Microsoft Connect. Be sure to mention the specific Core 2 CPU model your example code fails on.

As for a workaround I can only suggest changing the optimization level for the affected function. Switching from optimizing for speed to optimizing for size seems to generate much the same code as would be used with only SSE2 instructions. You can use #pragma optimize to switch the optimization level like this:

#pragma optimize("s", on)

long Code(Buffer* buff)
{
     ...
}

#pragma optimize("", on)

As documented on this bug report, /d2Qvec-sse2only is an undocumented flag that works on update 3 (and possibly update 2) to prevent the compiler from outputing SSE4 instructions. This can prevent some loops from being vectorized, naturally. /d2Qvec-sse2only may cease to work at any point (it is "subject to future change without notice"), possibly on future versions of VC.

Microsoft claims that this problem is fixed in Update 4, and in the Update 4 CTP 2 (not for production use).