Transpose an 8x8 float using AVX/AVX2

2020-01-27 04:08发布

Transposing an 8x8 matrix can be achieved by making four 4x4 matrices, and transposing each of them. This is not what I'm going for.

In another question, one answer gave a solution that would only require 24 instructions for an 8x8 matrix. However, this does not apply to floats.

Since the AVX2 contains registers of 256 bits, each register would fit eight 32 bits integers (floats). But the question is:

How to transpose an 8x8 float matrix, using AVX/AVX2, with the fewest instructions possible?

标签: simd avx avx2
5条回答
成全新的幸福
2楼-- · 2020-01-27 04:41

Further to previous answers, the usage of shuffleps is pretty much overkill in this scenario, since we can just unpacklo/unpackhi our way to the result. shuffle & unpack instructions have the same latency/throughput, however shuffle generates an additional byte in the machine code op (i.e. 5 bytes for shuffle, 4 for unpack).

At some point, we will require 8 permutes across lanes. This is a slower operation (at 3 cycles latency), so we want to kick off those ops earlier if possible. Assuming the transpose8f method gets inlined (it should do!), then any loads required for the a->h args should be fused into the unpack instructions.

The only minor issue you may face is that because you are using more than 8 registers here, you may spill into YMM9 and up. That can cause VEX2 ops to be generated as VEX3, which will add a byte per op.

As a result, with a bit of jiggling around, you'll end up with this:

// 256-bit float vector shorthand and unpack aliases used by transpose8f below.
typedef __m256 f256;
#define unpacklo8f _mm256_unpacklo_ps
#define unpackhi8f _mm256_unpackhi_ps

// Pick one 128-bit lane for each half of the result: X selects the low output
// lane, Y the high one. Selector values 0/1 address the lanes of `a`, 2/3 the
// lanes of `b`, matching the vperm2f128 imm8 encoding X | (Y << 4).
template<uint8_t X, uint8_t Y>
inline f256 permute128f(const f256 a, const f256 b)
{
  return _mm256_permute2f128_ps(a, b, X | (Y << 4)); 
}

// Transpose the 8x8 float matrix whose rows are a..h.
// Outputs: s,t,u,v receive transposed rows 0-3; x,y,z,w receive rows 4-7.
// The unpacks and cross-lane permutes are deliberately interleaved so the
// higher-latency vperm2f128 ops are issued as soon as their inputs are ready;
// keep the statement order if editing.
inline void transpose8f(
  const f256 a, const f256 b, const f256 c, const f256 d, 
  const f256 e, const f256 f, const f256 g, const f256 h, 
  f256& s, f256& t, f256& u, f256& v,
  f256& x, f256& y, f256& z, f256& w)
{
  // Stage 1: interleave element pairs of rows (a,c), (b,d), (e,g), (f,h).
  const f256 t00 = unpacklo8f(a, c);
  const f256 t02 = unpacklo8f(b, d);
  const f256 t20 = unpacklo8f(e, g);
  const f256 t22 = unpacklo8f(f, h);

  // Stage 2 + lane permutes, interleaved. E.g. t10 = a0 b0 c0 d0 | a4 b4 c4 d4.
  const f256 t10 = unpacklo8f(t00, t02);
  const f256 t30 = unpacklo8f(t20, t22);
  const f256 t11 = unpackhi8f(t00, t02);
  s = permute128f<0, 2>(t10, t30);   // column 0 -> output row 0
  const f256 t31 = unpackhi8f(t20, t22);
  x = permute128f<1, 3>(t10, t30);   // column 4 -> output row 4
  const f256 t01 = unpackhi8f(a, c);
  t = permute128f<0, 2>(t11, t31);   // column 1 -> output row 1
  const f256 t21 = unpackhi8f(e, g);
  y = permute128f<1, 3>(t11, t31);   // column 5 -> output row 5
  const f256 t03 = unpackhi8f(b, d);
  const f256 t23 = unpackhi8f(f, h);

  const f256 t12 = unpacklo8f(t01, t03);
  const f256 t13 = unpackhi8f(t01, t03);
  const f256 t32 = unpacklo8f(t21, t23);
  const f256 t33 = unpackhi8f(t21, t23);

  u = permute128f<0, 2>(t12, t32);   // column 2 -> output row 2
  z = permute128f<1, 3>(t12, t32);   // column 6 -> output row 6
  v = permute128f<0, 2>(t13, t33);   // column 3 -> output row 3
  w = permute128f<1, 3>(t13, t33);   // column 7 -> output row 7
}

You won't improve on this (You can do the 128bit permutes first, and unpacks second, but they'll end up being identical).

查看更多
戒情不戒烟
3楼-- · 2020-01-27 04:57

This is my solution with fewer instructions, and the performance is very good — about 8 times faster. I've tested it using ICC, GCC and Clang in Fedora.

#include <stdio.h>
#include <x86intrin.h>
#define MAX1    128
#define MAX2 MAX1

float __attribute__(( aligned(32))) a_tra[MAX2][MAX1], __attribute__(( aligned(32))) a[MAX1][MAX2] ;
// Transpose the global MAX1 x MAX2 matrix `a` into `a_tra`, one 8x8 tile at a
// time: each gather reads one column of the current tile, each 256-bit store
// writes one row of the transposed tile.
int main()
{

    int i,j;//, ii=0,jj=0;
    // variables for vector section
    // FIX: the index vector was loaded with the *aligned* _mm256_load_si256,
    // but a plain local array is only guaranteed the ABI's natural alignment
    // (typically 16 bytes), so the aligned load could fault. Align the array
    // explicitly to 32 bytes so the aligned load is legal.
    int vindexm [8] __attribute__((aligned(32))) = {0, MAX1, MAX1*2, MAX1*3, MAX1*4, MAX1*5, MAX1*6, MAX1*7 };
    __m256i vindex = _mm256_load_si256((__m256i *) &vindexm[0]);
    __m256 vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8;

        for(i=0; i<MAX1;  i+=8){            
            for(j=0; j<MAX2;  j+=8){
                // Gather one column of the 8x8 tile per vector (stride MAX1 floats).
                vec1 = _mm256_i32gather_ps (&a[i][j+0],vindex,4);
                vec2 = _mm256_i32gather_ps (&a[i][j+1],vindex,4);
                vec3 = _mm256_i32gather_ps (&a[i][j+2],vindex,4);
                vec4 = _mm256_i32gather_ps (&a[i][j+3],vindex,4);
                vec5 = _mm256_i32gather_ps (&a[i][j+4],vindex,4);
                vec6 = _mm256_i32gather_ps (&a[i][j+5],vindex,4);
                vec7 = _mm256_i32gather_ps (&a[i][j+6],vindex,4);
                vec8 = _mm256_i32gather_ps (&a[i][j+7],vindex,4);

                // Store each gathered column as a contiguous row of a_tra
                // (the global arrays are 32-byte aligned, so aligned stores are safe).
                _mm256_store_ps(&a_tra[j+0][i], vec1);
                _mm256_store_ps(&a_tra[j+1][i], vec2);
                _mm256_store_ps(&a_tra[j+2][i], vec3);
                _mm256_store_ps(&a_tra[j+3][i], vec4);
                _mm256_store_ps(&a_tra[j+4][i], vec5);
                _mm256_store_ps(&a_tra[j+5][i], vec6);
                _mm256_store_ps(&a_tra[j+6][i], vec7);
                _mm256_store_ps(&a_tra[j+7][i], vec8);  
            }
        }
    return 0;
}
查看更多
不美不萌又怎样
4楼-- · 2020-01-27 04:59

I already answered this question Fast memory transpose with SSE, AVX, and OpenMP.

Let me repeat the solution for transposing an 8x8 float matrix with AVX. Let me know if this is any faster than using 4x4 blocks and _MM_TRANSPOSE4_PS. I used it for a kernel in a larger matrix transpose which was memory bound so that was probably not a fair test.

// In-place transpose of 8 AVX rows: 8 unpacks, 8 in-lane shuffles, then
// 8 cross-lane permutes (imm 0x20 = low lanes of both args, 0x31 = high lanes).
inline void transpose8_ps(__m256 &row0, __m256 &row1, __m256 &row2, __m256 &row3, __m256 &row4, __m256 &row5, __m256 &row6, __m256 &row7) {
__m256 __t0, __t1, __t2, __t3, __t4, __t5, __t6, __t7;
__m256 __tt0, __tt1, __tt2, __tt3, __tt4, __tt5, __tt6, __tt7;
// Stage 1: interleave adjacent row pairs 32 bits at a time.
__t0 = _mm256_unpacklo_ps(row0, row1);
__t1 = _mm256_unpackhi_ps(row0, row1);
__t2 = _mm256_unpacklo_ps(row2, row3);
__t3 = _mm256_unpackhi_ps(row2, row3);
__t4 = _mm256_unpacklo_ps(row4, row5);
__t5 = _mm256_unpackhi_ps(row4, row5);
__t6 = _mm256_unpacklo_ps(row6, row7);
__t7 = _mm256_unpackhi_ps(row6, row7);
// Stage 2: gather 64-bit pairs; _MM_SHUFFLE(1,0,1,0) keeps the low halves
// of each 128-bit lane, _MM_SHUFFLE(3,2,3,2) the high halves.
__tt0 = _mm256_shuffle_ps(__t0,__t2,_MM_SHUFFLE(1,0,1,0));
__tt1 = _mm256_shuffle_ps(__t0,__t2,_MM_SHUFFLE(3,2,3,2));
__tt2 = _mm256_shuffle_ps(__t1,__t3,_MM_SHUFFLE(1,0,1,0));
__tt3 = _mm256_shuffle_ps(__t1,__t3,_MM_SHUFFLE(3,2,3,2));
__tt4 = _mm256_shuffle_ps(__t4,__t6,_MM_SHUFFLE(1,0,1,0));
__tt5 = _mm256_shuffle_ps(__t4,__t6,_MM_SHUFFLE(3,2,3,2));
__tt6 = _mm256_shuffle_ps(__t5,__t7,_MM_SHUFFLE(1,0,1,0));
__tt7 = _mm256_shuffle_ps(__t5,__t7,_MM_SHUFFLE(3,2,3,2));
// Stage 3: swap 128-bit lanes across register pairs to finish the transpose.
row0 = _mm256_permute2f128_ps(__tt0, __tt4, 0x20);
row1 = _mm256_permute2f128_ps(__tt1, __tt5, 0x20);
row2 = _mm256_permute2f128_ps(__tt2, __tt6, 0x20);
row3 = _mm256_permute2f128_ps(__tt3, __tt7, 0x20);
row4 = _mm256_permute2f128_ps(__tt0, __tt4, 0x31);
row5 = _mm256_permute2f128_ps(__tt1, __tt5, 0x31);
row6 = _mm256_permute2f128_ps(__tt2, __tt6, 0x31);
row7 = _mm256_permute2f128_ps(__tt3, __tt7, 0x31);
}

Based on this comment I learned that there are more efficient methods which to do the 8x8 transpose. See Examples 11-19 and 11-20 in the Intel optimization manual under section "11.11 Handling Port 5 Pressure". Example 11-19 uses the same number of instructions but reduces the pressure on port5 by using blends which go to port0 as well. I may implement this with intrinsics at some point but I don't have a need for this at this point.


I looked more carefully into Example 11-19 and 11-20 in the Intel Manuals I mentioned above. It turns out that example 11-19 uses 4 more shuffle operations than necessary. It has 8 unpack, 12 shuffles, and 8 128-bit permutes. My method uses 4 fewer shuffles. They replace 8 of the shuffles with blends. So 4 shuffles and 8 blends. I doubt that's better than my method with only eight shuffles.

Example 11-20 is, however, an improvement if you need to load the matrix from memory. This uses 8 unpacks, 8 inserts, 8 shuffles, 8 128-bit loads, and 8 stores. The 128-bit loads reduce the port pressure. I went ahead and implemented this using intrinsics.

//Example 11-20. 8x8 Matrix Transpose Using VINSERTF128 loads
// 8x8 float transpose (Intel optimization manual, Example 11-20): each row
// register is built from two 128-bit loads via vinsertf128, pairing row i
// with row i+4 so no cross-lane permute is needed after the shuffles.
// mat / matT are row-major 8x8; stores require 32-byte-aligned matT.
void tran(float* mat, float* matT) {
  __m256  r0, r1, r2, r3, r4, r5, r6, r7;
  __m256  t0, t1, t2, t3, t4, t5, t6, t7;

  // Low lane = 4 floats of row k, high lane = same columns of row k+4.
  r0 = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_load_ps(&mat[0*8+0])), _mm_load_ps(&mat[4*8+0]), 1);
  r1 = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_load_ps(&mat[1*8+0])), _mm_load_ps(&mat[5*8+0]), 1);
  r2 = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_load_ps(&mat[2*8+0])), _mm_load_ps(&mat[6*8+0]), 1);
  r3 = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_load_ps(&mat[3*8+0])), _mm_load_ps(&mat[7*8+0]), 1);
  r4 = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_load_ps(&mat[0*8+4])), _mm_load_ps(&mat[4*8+4]), 1);
  r5 = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_load_ps(&mat[1*8+4])), _mm_load_ps(&mat[5*8+4]), 1);
  r6 = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_load_ps(&mat[2*8+4])), _mm_load_ps(&mat[6*8+4]), 1);
  r7 = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_load_ps(&mat[3*8+4])), _mm_load_ps(&mat[7*8+4]), 1);

  // Interleave 32-bit elements of adjacent row pairs.
  t0 = _mm256_unpacklo_ps(r0,r1);
  t1 = _mm256_unpackhi_ps(r0,r1);
  t2 = _mm256_unpacklo_ps(r2,r3);
  t3 = _mm256_unpackhi_ps(r2,r3);
  t4 = _mm256_unpacklo_ps(r4,r5);
  t5 = _mm256_unpackhi_ps(r4,r5);
  t6 = _mm256_unpacklo_ps(r6,r7);
  t7 = _mm256_unpackhi_ps(r6,r7);

  // 0x44 = _MM_SHUFFLE(1,0,1,0): low 64-bit halves of each lane;
  // 0xEE = _MM_SHUFFLE(3,2,3,2): high halves.
  r0 = _mm256_shuffle_ps(t0,t2, 0x44);
  r1 = _mm256_shuffle_ps(t0,t2, 0xEE);
  r2 = _mm256_shuffle_ps(t1,t3, 0x44);
  r3 = _mm256_shuffle_ps(t1,t3, 0xEE);
  r4 = _mm256_shuffle_ps(t4,t6, 0x44);
  r5 = _mm256_shuffle_ps(t4,t6, 0xEE);
  r6 = _mm256_shuffle_ps(t5,t7, 0x44);
  r7 = _mm256_shuffle_ps(t5,t7, 0xEE);

  _mm256_store_ps(&matT[0*8], r0);
  _mm256_store_ps(&matT[1*8], r1);
  _mm256_store_ps(&matT[2*8], r2);
  _mm256_store_ps(&matT[3*8], r3);
  _mm256_store_ps(&matT[4*8], r4);
  _mm256_store_ps(&matT[5*8], r5);
  _mm256_store_ps(&matT[6*8], r6);
  _mm256_store_ps(&matT[7*8], r7);
}

So I looked into example 11-19 again. The basic idea as far as I can tell is that two shuffle instructions (shufps) can be replaced by one shuffle and two blends. For example

// Two in-lane shuffles: 0x44 (_MM_SHUFFLE(1,0,1,0)) keeps the low 64-bit half
// of each 128-bit lane, 0xEE (_MM_SHUFFLE(3,2,3,2)) the high half.
r0 = _mm256_shuffle_ps(t0,t2, 0x44);
r1 = _mm256_shuffle_ps(t0,t2, 0xEE);

can be replaced with

// 0x4E cross-picks the halves both outputs need: per lane v = t0[2],t0[3],t2[0],t2[1].
v = _mm256_shuffle_ps(t0,t2, 0x4E);
r0 = _mm256_blend_ps(t0, v, 0xCC);  // 0xCC: take v in elements 2,3,6,7 -> t0 lows + t2 lows
r1 = _mm256_blend_ps(t2, v, 0x33);  // 0x33: take v in elements 0,1,4,5 -> t0 highs + t2 highs

This explains why my original code used 8 shuffles and Example 11-19 uses 4 shuffles and eight blends.

The blends are good for throughput because shuffles only go to one port (creating a bottleneck on the shuffle port), but blends can run on multiple ports and thus don't compete. But what is better: 8 shuffles or 4 shuffles and 8 blends?

This has to be tested, and can depend on surrounding code. If you mostly bottleneck on total uop throughput with a lot of other uops in the loop that don't need port 5, you might go for the pure shuffle version. Ideally you should do some computation on the transposed data before storing it, while it's already in registers. See https://agner.org/optimize/ and other performance links in the x86 tag wiki.

I don't, however, see a way to replace the unpack instructions with blends.

Here is full code which combines Example 11-19 converting 2 shuffles to 1 shuffle and two blends and Example 11-20 which uses vinsertf128 loads (which on Intel Haswell/Skylake CPUs are 2 uops: one ALU for any port, one memory. They unfortunately don't micro-fuse. vinsertf128 with all register operands is 1 uop for the shuffle port on Intel, so this is good because the compiler folds the load into a memory operand for vinsertf128.) This has the advantage of only needing the source data 16-byte aligned for maximum performance, avoiding any cache-line splits.

#include <stdio.h>
#include <x86intrin.h>
#include <omp.h>   

/*
void tran(float* mat, float* matT) {
  __m256  r0, r1, r2, r3, r4, r5, r6, r7;
  __m256  t0, t1, t2, t3, t4, t5, t6, t7;

  r0 = _mm256_load_ps(&mat[0*8]);
  r1 = _mm256_load_ps(&mat[1*8]);
  r2 = _mm256_load_ps(&mat[2*8]);
  r3 = _mm256_load_ps(&mat[3*8]);
  r4 = _mm256_load_ps(&mat[4*8]);
  r5 = _mm256_load_ps(&mat[5*8]);
  r6 = _mm256_load_ps(&mat[6*8]);
  r7 = _mm256_load_ps(&mat[7*8]);

  t0 = _mm256_unpacklo_ps(r0, r1);
  t1 = _mm256_unpackhi_ps(r0, r1);
  t2 = _mm256_unpacklo_ps(r2, r3);
  t3 = _mm256_unpackhi_ps(r2, r3);
  t4 = _mm256_unpacklo_ps(r4, r5);
  t5 = _mm256_unpackhi_ps(r4, r5);
  t6 = _mm256_unpacklo_ps(r6, r7);
  t7 = _mm256_unpackhi_ps(r6, r7);

  r0 = _mm256_shuffle_ps(t0,t2,_MM_SHUFFLE(1,0,1,0));  
  r1 = _mm256_shuffle_ps(t0,t2,_MM_SHUFFLE(3,2,3,2));
  r2 = _mm256_shuffle_ps(t1,t3,_MM_SHUFFLE(1,0,1,0));
  r3 = _mm256_shuffle_ps(t1,t3,_MM_SHUFFLE(3,2,3,2));
  r4 = _mm256_shuffle_ps(t4,t6,_MM_SHUFFLE(1,0,1,0));
  r5 = _mm256_shuffle_ps(t4,t6,_MM_SHUFFLE(3,2,3,2));
  r6 = _mm256_shuffle_ps(t5,t7,_MM_SHUFFLE(1,0,1,0));
  r7 = _mm256_shuffle_ps(t5,t7,_MM_SHUFFLE(3,2,3,2));

  t0 = _mm256_permute2f128_ps(r0, r4, 0x20);
  t1 = _mm256_permute2f128_ps(r1, r5, 0x20);
  t2 = _mm256_permute2f128_ps(r2, r6, 0x20);
  t3 = _mm256_permute2f128_ps(r3, r7, 0x20);
  t4 = _mm256_permute2f128_ps(r0, r4, 0x31);
  t5 = _mm256_permute2f128_ps(r1, r5, 0x31);
  t6 = _mm256_permute2f128_ps(r2, r6, 0x31);
  t7 = _mm256_permute2f128_ps(r3, r7, 0x31);

  _mm256_store_ps(&matT[0*8], t0);
  _mm256_store_ps(&matT[1*8], t1);
  _mm256_store_ps(&matT[2*8], t2);
  _mm256_store_ps(&matT[3*8], t3);
  _mm256_store_ps(&matT[4*8], t4);
  _mm256_store_ps(&matT[5*8], t5);
  _mm256_store_ps(&matT[6*8], t6);
  _mm256_store_ps(&matT[7*8], t7);
}
*/

// Hybrid of Examples 11-19 and 11-20: 128-bit vinsertf128 loads (avoids
// cross-lane permutes and only needs 16-byte source alignment), then each
// pair of shuffles is replaced by one shuffle + two blends to move work off
// the shuffle port. mat / matT are row-major 8x8 floats; matT stores are
// 32-byte aligned.
void tran(float* mat, float* matT) {
  __m256  r0, r1, r2, r3, r4, r5, r6, r7;
  __m256  t0, t1, t2, t3, t4, t5, t6, t7;

  // Low lane = 4 floats of row k, high lane = same columns of row k+4.
  r0 = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_load_ps(&mat[0*8+0])), _mm_load_ps(&mat[4*8+0]), 1);
  r1 = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_load_ps(&mat[1*8+0])), _mm_load_ps(&mat[5*8+0]), 1);
  r2 = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_load_ps(&mat[2*8+0])), _mm_load_ps(&mat[6*8+0]), 1);
  r3 = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_load_ps(&mat[3*8+0])), _mm_load_ps(&mat[7*8+0]), 1);
  r4 = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_load_ps(&mat[0*8+4])), _mm_load_ps(&mat[4*8+4]), 1);
  r5 = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_load_ps(&mat[1*8+4])), _mm_load_ps(&mat[5*8+4]), 1);
  r6 = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_load_ps(&mat[2*8+4])), _mm_load_ps(&mat[6*8+4]), 1);
  r7 = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_load_ps(&mat[3*8+4])), _mm_load_ps(&mat[7*8+4]), 1);

  t0 = _mm256_unpacklo_ps(r0,r1);
  t1 = _mm256_unpackhi_ps(r0,r1);
  t2 = _mm256_unpacklo_ps(r2,r3);
  t3 = _mm256_unpackhi_ps(r2,r3);
  t4 = _mm256_unpacklo_ps(r4,r5);
  t5 = _mm256_unpackhi_ps(r4,r5);
  t6 = _mm256_unpacklo_ps(r6,r7);
  t7 = _mm256_unpackhi_ps(r6,r7);

  __m256 v;

  // Each shuffle+blend+blend triple below reproduces the commented-out
  // shuffle pair: blend mask 0xCC takes v in elements 2,3,6,7; 0x33 in 0,1,4,5.
  //r0 = _mm256_shuffle_ps(t0,t2, 0x44);
  //r1 = _mm256_shuffle_ps(t0,t2, 0xEE);  
  v = _mm256_shuffle_ps(t0,t2, 0x4E);
  r0 = _mm256_blend_ps(t0, v, 0xCC);
  r1 = _mm256_blend_ps(t2, v, 0x33);

  //r2 = _mm256_shuffle_ps(t1,t3, 0x44);
  //r3 = _mm256_shuffle_ps(t1,t3, 0xEE);
  v = _mm256_shuffle_ps(t1,t3, 0x4E);
  r2 = _mm256_blend_ps(t1, v, 0xCC);
  r3 = _mm256_blend_ps(t3, v, 0x33);

  //r4 = _mm256_shuffle_ps(t4,t6, 0x44);
  //r5 = _mm256_shuffle_ps(t4,t6, 0xEE);
  v = _mm256_shuffle_ps(t4,t6, 0x4E);
  r4 = _mm256_blend_ps(t4, v, 0xCC);
  r5 = _mm256_blend_ps(t6, v, 0x33);

  //r6 = _mm256_shuffle_ps(t5,t7, 0x44);
  //r7 = _mm256_shuffle_ps(t5,t7, 0xEE);
  v = _mm256_shuffle_ps(t5,t7, 0x4E);
  r6 = _mm256_blend_ps(t5, v, 0xCC);
  r7 = _mm256_blend_ps(t7, v, 0x33);

  _mm256_store_ps(&matT[0*8], r0);
  _mm256_store_ps(&matT[1*8], r1);
  _mm256_store_ps(&matT[2*8], r2);
  _mm256_store_ps(&matT[3*8], r3);
  _mm256_store_ps(&matT[4*8], r4);
  _mm256_store_ps(&matT[5*8], r5);
  _mm256_store_ps(&matT[6*8], r6);
  _mm256_store_ps(&matT[7*8], r7);
}


// Count how many elements of `mat` differ from the transpose of the canonical
// test pattern value(i,j) = i*8 + j, i.e. expect mat[j*8+i] == i*8+j exactly
// (values are small integers, so exact float comparison is safe here).
int verify(float *mat) {
    int mismatches = 0;
    for (int col = 0; col < 8; col++)
      for (int row = 0; row < 8; row++)
        mismatches += (mat[row*8 + col] != 1.0f*col*8 + row);
    return mismatches;
}

// Print an 8x8 row-major float matrix, one row per line, followed by a blank
// separator line.
void print_mat(float *mat) {
    for (int r = 0; r < 8; r++) {
      for (int c = 0; c < 8; c++) {
        printf("%2.0f ", mat[r*8 + c]);
      }
      puts("");
    }
    puts("");
}

// Driver: fill mat with 0..63 (so element (i,j) == i*8+j), transpose once,
// and report mismatches against the expected transpose via verify().
int main(void) {
    int i,j, rep;   // j, rep and dtime are only used by the commented-out timed run
    float mat[64] __attribute__((aligned(64)));
    float matT[64] __attribute__((aligned(64)));
    double dtime;

    rep = 10000000;
    for(i=0; i<64; i++) mat[i] = i;
    print_mat(mat);

    tran(mat,matT);
    //dtime = -omp_get_wtime();
    //tran(mat, matT, rep);
    //dtime += omp_get_wtime();
    printf("errors %d\n", verify(matT));   // 0 when tran() produced a correct transpose
    //printf("dtime %f\n", dtime);
    print_mat(matT);
}
查看更多
叛逆
5楼-- · 2020-01-27 05:01

I decided to do a full test of 3 different routines in an apples to apples comparison.

  // transpose.cpp : 
  /*
  Transpose8x8Shuff 100,000,000 times
  (0.750000 seconds).
  Transpose8x8Permute 100,000,000 times
  (0.749000 seconds).
  Transpose8x8Insert 100,000,000 times
  (0.858000 seconds).
  */

  #include <stdio.h>
  #include <time.h>
  #include <thread>
  #include <chrono>
  #include <xmmintrin.h>
  #include <emmintrin.h>
  #include <tmmintrin.h>
  #include <immintrin.h>

  // In-place 8x8 float transpose of 64 dwords viewed as eight __m256 rows:
  // 8 unpacks, 8 in-lane shuffles, then 8 cross-lane vperm2f128 (0x20 = low
  // lanes of both args, 0x31 = high lanes).
  // NOTE(review): reinterpret_cast between unsigned long* and __m256* relies
  // on the caller passing 32-byte-aligned data — confirm at call sites.
  inline void Transpose8x8Shuff(unsigned long *in)
  {
     __m256 *inI = reinterpret_cast<__m256 *>(in);
     __m256 rI[8];
     rI[0] = _mm256_unpacklo_ps(inI[0], inI[1]); 
     rI[1] = _mm256_unpackhi_ps(inI[0], inI[1]); 
     rI[2] = _mm256_unpacklo_ps(inI[2], inI[3]); 
     rI[3] = _mm256_unpackhi_ps(inI[2], inI[3]); 
     rI[4] = _mm256_unpacklo_ps(inI[4], inI[5]); 
     rI[5] = _mm256_unpackhi_ps(inI[4], inI[5]); 
     rI[6] = _mm256_unpacklo_ps(inI[6], inI[7]); 
     rI[7] = _mm256_unpackhi_ps(inI[6], inI[7]); 

     // Gather 64-bit pairs within each 128-bit lane.
     __m256 rrF[8];
     __m256 *rF = reinterpret_cast<__m256 *>(rI);
     rrF[0] = _mm256_shuffle_ps(rF[0], rF[2], _MM_SHUFFLE(1,0,1,0));
     rrF[1] = _mm256_shuffle_ps(rF[0], rF[2], _MM_SHUFFLE(3,2,3,2));
     rrF[2] = _mm256_shuffle_ps(rF[1], rF[3], _MM_SHUFFLE(1,0,1,0)); 
     rrF[3] = _mm256_shuffle_ps(rF[1], rF[3], _MM_SHUFFLE(3,2,3,2));
     rrF[4] = _mm256_shuffle_ps(rF[4], rF[6], _MM_SHUFFLE(1,0,1,0));
     rrF[5] = _mm256_shuffle_ps(rF[4], rF[6], _MM_SHUFFLE(3,2,3,2));
     rrF[6] = _mm256_shuffle_ps(rF[5], rF[7], _MM_SHUFFLE(1,0,1,0));
     rrF[7] = _mm256_shuffle_ps(rF[5], rF[7], _MM_SHUFFLE(3,2,3,2));

     // Final cross-lane swap writes the result back over the input.
     rF = reinterpret_cast<__m256 *>(in);
     rF[0] = _mm256_permute2f128_ps(rrF[0], rrF[4], 0x20);
     rF[1] = _mm256_permute2f128_ps(rrF[1], rrF[5], 0x20);
     rF[2] = _mm256_permute2f128_ps(rrF[2], rrF[6], 0x20);
     rF[3] = _mm256_permute2f128_ps(rrF[3], rrF[7], 0x20);
     rF[4] = _mm256_permute2f128_ps(rrF[0], rrF[4], 0x31);
     rF[5] = _mm256_permute2f128_ps(rrF[1], rrF[5], 0x31);
     rF[6] = _mm256_permute2f128_ps(rrF[2], rrF[6], 0x31);
     rF[7] = _mm256_permute2f128_ps(rrF[3], rrF[7], 0x31);
  }

  // In-place 8x8 transpose doing the cross-lane vperm2f128 FIRST, then two
  // unpack stages — same result as Transpose8x8Shuff with the stages reordered.
  inline void Transpose8x8Permute(unsigned long *in)
  {
     __m256i *inI = reinterpret_cast<__m256i *>(in);
     __m256i rI[8];
     // 0x20 = low 128-bit lanes of both args, 0x31 = high lanes.
     rI[0] = _mm256_permute2f128_si256(inI[0], inI[4], 0x20); 
     rI[1] = _mm256_permute2f128_si256(inI[0], inI[4], 0x31); 
     rI[2] = _mm256_permute2f128_si256(inI[1], inI[5], 0x20); 
     rI[3] = _mm256_permute2f128_si256(inI[1], inI[5], 0x31); 
     rI[4] = _mm256_permute2f128_si256(inI[2], inI[6], 0x20); 
     rI[5] = _mm256_permute2f128_si256(inI[2], inI[6], 0x31); 
     rI[6] = _mm256_permute2f128_si256(inI[3], inI[7], 0x20); 
     rI[7] = _mm256_permute2f128_si256(inI[3], inI[7], 0x31); 

     // First 32-bit interleave stage.
     __m256 rrF[8];
     __m256 *rF = reinterpret_cast<__m256 *>(rI);
     rrF[0] = _mm256_unpacklo_ps(rF[0], rF[4]); 
     rrF[1] = _mm256_unpackhi_ps(rF[0], rF[4]); 
     rrF[2] = _mm256_unpacklo_ps(rF[1], rF[5]); 
     rrF[3] = _mm256_unpackhi_ps(rF[1], rF[5]); 
     rrF[4] = _mm256_unpacklo_ps(rF[2], rF[6]); 
     rrF[5] = _mm256_unpackhi_ps(rF[2], rF[6]); 
     rrF[6] = _mm256_unpacklo_ps(rF[3], rF[7]); 
     rrF[7] = _mm256_unpackhi_ps(rF[3], rF[7]); 

     // Second interleave stage writes the transposed rows back over the input.
     rF = reinterpret_cast<__m256 *>(in);
     rF[0] = _mm256_unpacklo_ps(rrF[0], rrF[4]); 
     rF[1] = _mm256_unpackhi_ps(rrF[0], rrF[4]); 
     rF[2] = _mm256_unpacklo_ps(rrF[1], rrF[5]); 
     rF[3] = _mm256_unpackhi_ps(rrF[1], rrF[5]); 
     rF[4] = _mm256_unpacklo_ps(rrF[2], rrF[6]); 
     rF[5] = _mm256_unpackhi_ps(rrF[2], rrF[6]); 
     rF[6] = _mm256_unpacklo_ps(rrF[3], rrF[7]); 
     rF[7] = _mm256_unpackhi_ps(rrF[3], rrF[7]); 
  }

  // Variant of Transpose8x8Permute that builds the lane-swapped rows with
  // vinsertf128 instead of vperm2f128: inIa/inIb view the buffer at 0- and
  // 16-byte offsets, and each 128-bit half inI128[k] is inserted as the high
  // lane. The two unpack stages afterwards are identical to Transpose8x8Permute.
  // NOTE(review): aliasing the same buffer through __m256i*, __m128i* and an
  // offset __m256i* is a strict-aliasing gray area — verify codegen per compiler.
  inline void Transpose8x8Insert(unsigned long *in)
  {
     __m256i *inIa = reinterpret_cast<__m256i *>(in);
     __m256i *inIb = reinterpret_cast<__m256i *>(&reinterpret_cast<__m128i *>(in)[1]);  // in + 16 bytes
     __m128i *inI128 = reinterpret_cast<__m128i *>(in);
     __m256i rI[8];
     rI[0] = _mm256_insertf128_si256(inIa[0], inI128[8], 1); 
     rI[1] = _mm256_insertf128_si256(inIb[0], inI128[9], 1); 
     rI[2] = _mm256_insertf128_si256(inIa[1], inI128[10], 1); 
     rI[3] = _mm256_insertf128_si256(inIb[1], inI128[11], 1); 
     rI[4] = _mm256_insertf128_si256(inIa[2], inI128[12], 1); 
     rI[5] = _mm256_insertf128_si256(inIb[2], inI128[13], 1); 
     rI[6] = _mm256_insertf128_si256(inIa[3], inI128[14], 1); 
     rI[7] = _mm256_insertf128_si256(inIb[3], inI128[15], 1); 

     // First 32-bit interleave stage.
     __m256 rrF[8];
     __m256 *rF = reinterpret_cast<__m256 *>(rI);
     rrF[0] = _mm256_unpacklo_ps(rF[0], rF[4]); 
     rrF[1] = _mm256_unpackhi_ps(rF[0], rF[4]); 
     rrF[2] = _mm256_unpacklo_ps(rF[1], rF[5]); 
     rrF[3] = _mm256_unpackhi_ps(rF[1], rF[5]); 
     rrF[4] = _mm256_unpacklo_ps(rF[2], rF[6]); 
     rrF[5] = _mm256_unpackhi_ps(rF[2], rF[6]); 
     rrF[6] = _mm256_unpacklo_ps(rF[3], rF[7]); 
     rrF[7] = _mm256_unpackhi_ps(rF[3], rF[7]); 

     // Second interleave stage writes the transposed rows back over the input.
     rF = reinterpret_cast<__m256 *>(in);
     rF[0] = _mm256_unpacklo_ps(rrF[0], rrF[4]); 
     rF[1] = _mm256_unpackhi_ps(rrF[0], rrF[4]); 
     rF[2] = _mm256_unpacklo_ps(rrF[1], rrF[5]); 
     rF[3] = _mm256_unpackhi_ps(rrF[1], rrF[5]); 
     rF[4] = _mm256_unpacklo_ps(rrF[2], rrF[6]); 
     rF[5] = _mm256_unpackhi_ps(rrF[2], rrF[6]); 
     rF[6] = _mm256_unpacklo_ps(rrF[3], rrF[7]); 
     rF[7] = _mm256_unpackhi_ps(rrF[3], rrF[7]); 
  }

  // Benchmark driver: warms up each transpose, then times 100,000,000
  // back-to-back in-place calls (this measures latency, since each call's
  // output feeds the next call's input).
  // FIXES vs. the original: `volatile int dummy` was *defined* three times in
  // the same scope — a compile error — so it is now declared once and
  // re-assigned; the unused `char c` from getchar() is dropped.
  int main()
  {
  #define dwordCount 64
     alignas(32) unsigned long mat[dwordCount];
     for (int i = 0; i < dwordCount; i++) {
        mat[i] = i;
     }
     clock_t t;
     printf ("Transpose8x8Shuff 100,000,000 times\n");
     Transpose8x8Shuff(mat);            // warm-up, outside the timed region
     t = clock();
     int i = 100000000;
     do {
        Transpose8x8Shuff(mat);
     } while (--i >= 0);
     t = clock() - t;
     volatile int dummy = mat[2];       // consume the result so the loop isn't optimized away
     printf ("(%f seconds).\n",((float)t)/CLOCKS_PER_SEC);

     printf ("Transpose8x8Permute 100,000,000 times\n");
     Transpose8x8Permute(mat);          // warm-up
     t = clock();
     i = 100000000;
     do {
        Transpose8x8Permute(mat);
     } while (--i >= 0);
     t = clock() - t;
     dummy = mat[2];
     printf ("(%f seconds).\n",((float)t)/CLOCKS_PER_SEC);

     printf ("Transpose8x8Insert 100,000,000 times\n");
     Transpose8x8Insert(mat);           // warm-up
     t = clock();
     i = 100000000;
     do {
        Transpose8x8Insert(mat);
     } while (--i >= 0);
     t = clock() - t;
     dummy = mat[2];
     printf ("(%f seconds).\n",((float)t)/CLOCKS_PER_SEC);
     (void)dummy;
     (void)getchar();                   // pause before exit (was: unused `char c`)
     return 0;
  }

This is benchmarking latency, not throughput (because the output for one transpose is the input for the next), but it probably bottlenecks on shuffle throughput anyway.


Results on Skylake i7-6700k @ 3.9GHz for a modified version of the above code (see it on the Godbolt compiler explorer), fixing the following bugs:

  • printf outside the timed regions, before starting the clock()
  • volatile dummy = in[2] at the end so all the transposes don't optimize away (which gcc actually does otherwise).
  • portable C++11, doesn't require MSVC (alignas(32) instead of __declspec, and don't include stdafx.h.)
  • sleep removed so the CPU won't downclock to idle speed between tests.

I didn't fix the unnecessary mixing of __m256i* / __m256*, and I didn't check if that led to worse code-gen with gcc or clang. I also didn't use a std::chrono high-rez clock because clock() was accurate enough for this many repeats.

g++7.3 -O3 -march=native on Arch Linux: Z Boson's version is fastest

Transpose8x8Shuff 100,000,000 times
(0.637479 seconds).
Transpose8x8Permute 100,000,000 times
(0.792658 seconds).
Transpose8x8Insert 100,000,000 times
(0.846590 seconds).

clang++ 5.0.1 -O3 -march=native: 8x8Permute gets optimized to something even faster than anything gcc did, but 8x8Insert is pessimized horribly.

Transpose8x8Shuff 100,000,000 times
(0.642185 seconds).
Transpose8x8Permute 100,000,000 times
(0.622157 seconds).
Transpose8x8Insert 100,000,000 times
(2.149958 seconds).

The asm instructions generated from the source won't match the intrinsics exactly: especially clang has a shuffle optimizer that really compiles the shuffles the same way it optimizes scalar code like + on integers. Transpose8x8Insert should not be that much slower, so clang must have chosen poorly.

查看更多
混吃等死
6楼-- · 2020-01-27 05:06

Here's an AVX2 solution which works for 8 x 8 32 bit ints. You can of course cast float vectors to int and back if you want to transpose 8 x 8 floats. It might also be possible to do an AVX-only version (i.e. not requiring AVX2) just for floats but I haven't tried that yet.

//
// transpose_8_8_avx2.c
//

#include <stdio.h>

#include <immintrin.h>

#define V_ELEMS 8

// Full-width 32-bit interleave of v0/v1: *vl gets the interleaved low halves,
// *vh the high halves. The vpermq pre-swizzle (_MM_SHUFFLE(3,1,2,0) reorders
// 64-bit chunks as 0,2,1,3) turns the lane-local unpacks into a true
// 256-bit-wide merge.
// NOTE(review): names starting with _mm256_ imitate the reserved intrinsic
// namespace; renaming would be safer but would touch all callers.
static inline void _mm256_merge_epi32(const __m256i v0, const __m256i v1, __m256i *vl, __m256i *vh)
{
    __m256i va = _mm256_permute4x64_epi64(v0, _MM_SHUFFLE(3, 1, 2, 0));
    __m256i vb = _mm256_permute4x64_epi64(v1, _MM_SHUFFLE(3, 1, 2, 0));
    *vl = _mm256_unpacklo_epi32(va, vb);
    *vh = _mm256_unpackhi_epi32(va, vb);
}

// Same as _mm256_merge_epi32 but interleaving 64-bit elements: pre-swizzle
// with vpermq, then lane-local unpacklo/unpackhi give a full-width merge.
static inline void _mm256_merge_epi64(const __m256i v0, const __m256i v1, __m256i *vl, __m256i *vh)
{
    __m256i va = _mm256_permute4x64_epi64(v0, _MM_SHUFFLE(3, 1, 2, 0));
    __m256i vb = _mm256_permute4x64_epi64(v1, _MM_SHUFFLE(3, 1, 2, 0));
    *vl = _mm256_unpacklo_epi64(va, vb);
    *vh = _mm256_unpackhi_epi64(va, vb);
}

// Merge 128-bit lanes: *vl = (v0.lo, v1.lo), *vh = (v0.hi, v1.hi).
// _MM_SHUFFLE(0,2,0,0) == 0x20 and _MM_SHUFFLE(0,3,0,1) == 0x31, the standard
// vperm2x128 "low lanes" / "high lanes" immediates.
static inline void _mm256_merge_si128(const __m256i v0, const __m256i v1, __m256i *vl, __m256i *vh)
{
    *vl = _mm256_permute2x128_si256(v0, v1, _MM_SHUFFLE(0, 2, 0, 0));
    *vh = _mm256_permute2x128_si256(v0, v1, _MM_SHUFFLE(0, 3, 0, 1));
}

//
// Transpose_8_8
//
// in place transpose of 8 x 8 int array
//

//
// Transpose_8_8
//
// in place transpose of 8 x 8 int array
//
// Three merge stages of increasing granularity (32-bit, 64-bit, 128-bit),
// each halving the distance between elements that belong together; the final
// si128 stage writes the transposed rows back through the v0..v7 pointers.

static void Transpose_8_8(
    __m256i *v0,
    __m256i *v1,
    __m256i *v2,
    __m256i *v3,
    __m256i *v4,
    __m256i *v5,
    __m256i *v6,
    __m256i *v7)
{
    __m256i w0, w1, w2, w3, w4, w5, w6, w7;
    __m256i x0, x1, x2, x3, x4, x5, x6, x7;

    // Stage 1: interleave 32-bit elements of adjacent row pairs.
    _mm256_merge_epi32(*v0, *v1, &w0, &w1);
    _mm256_merge_epi32(*v2, *v3, &w2, &w3);
    _mm256_merge_epi32(*v4, *v5, &w4, &w5);
    _mm256_merge_epi32(*v6, *v7, &w6, &w7);

    // Stage 2: interleave 64-bit pairs.
    _mm256_merge_epi64(w0, w2, &x0, &x1);
    _mm256_merge_epi64(w1, w3, &x2, &x3);
    _mm256_merge_epi64(w4, w6, &x4, &x5);
    _mm256_merge_epi64(w5, w7, &x6, &x7);

    // Stage 3: combine 128-bit lanes and store results in place.
    _mm256_merge_si128(x0, x4, v0, v1);
    _mm256_merge_si128(x1, x5, v2, v3);
    _mm256_merge_si128(x2, x6, v4, v5);
    _mm256_merge_si128(x3, x7, v6, v7);
}

// Demo driver: fill an 8x8 int matrix with 0..63, print it, then transpose it
// in place twice (printing after each pass) to show Transpose_8_8 is an
// involution.

static void show_buff(const char *title, int32_t buff[V_ELEMS][V_ELEMS])
{
    printf("%s", title);
    for (int r = 0; r < V_ELEMS; ++r)
    {
        for (int c = 0; c < V_ELEMS; ++c)
        {
            printf("%4d", buff[r][c]);
        }
        printf("\n");
    }
}

int main(void)
{
    int32_t buff[V_ELEMS][V_ELEMS] __attribute__ ((aligned(32)));

    // init buff with sequential values 0..63
    int value = 0;
    for (int r = 0; r < V_ELEMS; ++r)
    {
        for (int c = 0; c < V_ELEMS; ++c)
        {
            buff[r][c] = value++;
        }
    }

    show_buff("\nBEFORE:\n", buff);

    // first transpose
    Transpose_8_8((__m256i *)buff[0], (__m256i *)buff[1], (__m256i *)buff[2], (__m256i *)buff[3], (__m256i *)buff[4], (__m256i *)buff[5], (__m256i *)buff[6], (__m256i *)buff[7]);
    show_buff("\nAFTER:\n", buff);

    // second transpose restores the original order
    Transpose_8_8((__m256i *)buff[0], (__m256i *)buff[1], (__m256i *)buff[2], (__m256i *)buff[3], (__m256i *)buff[4], (__m256i *)buff[5], (__m256i *)buff[6], (__m256i *)buff[7]);
    show_buff("\nAFTER x2:\n", buff);

    return 0;
}

Transpose_8_8 compiles to around 56 instructions with clang, including loads and stores - I think it should be possible to improve on this with some more effort.

Compile and test:

$ gcc -Wall -mavx2 -O3 transpose_8_8_avx2.c && ./a.out

BEFORE:
   0   1   2   3   4   5   6   7
   8   9  10  11  12  13  14  15
  16  17  18  19  20  21  22  23
  24  25  26  27  28  29  30  31
  32  33  34  35  36  37  38  39
  40  41  42  43  44  45  46  47
  48  49  50  51  52  53  54  55
  56  57  58  59  60  61  62  63

AFTER:
   0   8  16  24  32  40  48  56
   1   9  17  25  33  41  49  57
   2  10  18  26  34  42  50  58
   3  11  19  27  35  43  51  59
   4  12  20  28  36  44  52  60
   5  13  21  29  37  45  53  61
   6  14  22  30  38  46  54  62
   7  15  23  31  39  47  55  63

AFTER x2:
   0   1   2   3   4   5   6   7
   8   9  10  11  12  13  14  15
  16  17  18  19  20  21  22  23
  24  25  26  27  28  29  30  31
  32  33  34  35  36  37  38  39
  40  41  42  43  44  45  46  47
  48  49  50  51  52  53  54  55
  56  57  58  59  60  61  62  63

$ 
查看更多
登录 后发表回答