Fast memory transpose with SSE, AVX, and OpenMP


Question:

I need a fast memory transpose algorithm for my Gaussian convolution function in C/C++. What I do now is

convolute_1D
transpose
convolute_1D
transpose

It turns out that with this method the filter size has to be large (or larger than I expected), or else the transpose takes longer than the convolution (e.g. for a 1920x1080 matrix the convolution takes the same time as the transpose for a filter size of 35). My current transpose algorithm uses loop blocking/tiling along with SSE and OpenMP. I have tried a version using AVX, but it's no faster. Any suggestions on how I can speed this up?

inline void transpose4x4_SSE(float *A, float *B, const int lda, const int ldb) {
    __m128 row1 = _mm_load_ps(&A[0*lda]);
    __m128 row2 = _mm_load_ps(&A[1*lda]);
    __m128 row3 = _mm_load_ps(&A[2*lda]);
    __m128 row4 = _mm_load_ps(&A[3*lda]);
     _MM_TRANSPOSE4_PS(row1, row2, row3, row4);
     _mm_store_ps(&B[0*ldb], row1);
     _mm_store_ps(&B[1*ldb], row2);
     _mm_store_ps(&B[2*ldb], row3);
     _mm_store_ps(&B[3*ldb], row4);
}
//block_size = 16 works best
inline void transpose_block_SSE4x4(float *A, float *B, const int n, const int m, const int lda, const int ldb ,const int block_size) {
    #pragma omp parallel for
    for(int i=0; i<n; i+=block_size) {
        for(int j=0; j<m; j+=block_size) {
            int max_i2 = i+block_size < n ? i + block_size : n;
            int max_j2 = j+block_size < m ? j + block_size : m;
            for(int i2=i; i2<max_i2; i2+=4) {
                for(int j2=j; j2<max_j2; j2+=4) {
                    transpose4x4_SSE(&A[i2*lda +j2], &B[j2*ldb + i2], lda, ldb);
                }
            }
        }
    }
}
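
As a usage note (my own sketch, not part of the question): _mm_load_ps and _mm_store_ps require 16-byte alignment, so the buffers should come from an aligned allocator and both strides must be multiples of 4 (as they are for 1920x1080). Compiled with something like -O3 -fopenmp:

#include <immintrin.h>

int main(void) {
    const int n = 1080, m = 1920;            // source is n rows by m columns
    float *A = (float*)_mm_malloc(sizeof(float) * n * m, 16);
    float *B = (float*)_mm_malloc(sizeof(float) * m * n, 16);
    for (int i = 0; i < n * m; i++) A[i] = (float)i;

    // lda is the row stride of A (= m), ldb is the row stride of B (= n)
    transpose_block_SSE4x4(A, B, n, m, m, n, 16);

    _mm_free(A);
    _mm_free(B);
    return 0;
}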

Here is an 8x8 float matrix transpose using AVX. It's no faster than doing four 4x4 transposes.

inline void transpose8_ps(__m256 &row0, __m256 &row1, __m256 &row2, __m256 &row3, __m256 &row4, __m256 &row5, __m256 &row6, __m256 &row7) {
    __m256 __t0, __t1, __t2, __t3, __t4, __t5, __t6, __t7;
    __m256 __tt0, __tt1, __tt2, __tt3, __tt4, __tt5, __tt6, __tt7;
    __t0 = _mm256_unpacklo_ps(row0, row1);
    __t1 = _mm256_unpackhi_ps(row0, row1);
    __t2 = _mm256_unpacklo_ps(row2, row3);
    __t3 = _mm256_unpackhi_ps(row2, row3);
    __t4 = _mm256_unpacklo_ps(row4, row5);
    __t5 = _mm256_unpackhi_ps(row4, row5);
    __t6 = _mm256_unpacklo_ps(row6, row7);
    __t7 = _mm256_unpackhi_ps(row6, row7);
    __tt0 = _mm256_shuffle_ps(__t0,__t2,_MM_SHUFFLE(1,0,1,0));
    __tt1 = _mm256_shuffle_ps(__t0,__t2,_MM_SHUFFLE(3,2,3,2));
    __tt2 = _mm256_shuffle_ps(__t1,__t3,_MM_SHUFFLE(1,0,1,0));
    __tt3 = _mm256_shuffle_ps(__t1,__t3,_MM_SHUFFLE(3,2,3,2));
    __tt4 = _mm256_shuffle_ps(__t4,__t6,_MM_SHUFFLE(1,0,1,0));
    __tt5 = _mm256_shuffle_ps(__t4,__t6,_MM_SHUFFLE(3,2,3,2));
    __tt6 = _mm256_shuffle_ps(__t5,__t7,_MM_SHUFFLE(1,0,1,0));
    __tt7 = _mm256_shuffle_ps(__t5,__t7,_MM_SHUFFLE(3,2,3,2));
    row0 = _mm256_permute2f128_ps(__tt0, __tt4, 0x20);
    row1 = _mm256_permute2f128_ps(__tt1, __tt5, 0x20);
    row2 = _mm256_permute2f128_ps(__tt2, __tt6, 0x20);
    row3 = _mm256_permute2f128_ps(__tt3, __tt7, 0x20);
    row4 = _mm256_permute2f128_ps(__tt0, __tt4, 0x31);
    row5 = _mm256_permute2f128_ps(__tt1, __tt5, 0x31);
    row6 = _mm256_permute2f128_ps(__tt2, __tt6, 0x31);
    row7 = _mm256_permute2f128_ps(__tt3, __tt7, 0x31);
}

inline void transpose8x8_avx(float *A, float *B, const int lda, const int ldb) {
    __m256 row0 = _mm256_load_ps(&A[0*lda]);
    __m256 row1 = _mm256_load_ps(&A[1*lda]);
    __m256 row2 = _mm256_load_ps(&A[2*lda]);
    __m256 row3 = _mm256_load_ps(&A[3*lda]);
    __m256 row4 = _mm256_load_ps(&A[4*lda]);
    __m256 row5 = _mm256_load_ps(&A[5*lda]);
    __m256 row6 = _mm256_load_ps(&A[6*lda]);
    __m256 row7 = _mm256_load_ps(&A[7*lda]);
    transpose8_ps(row0, row1, row2, row3, row4, row5, row6, row7);
    _mm256_store_ps(&B[0*ldb], row0);
    _mm256_store_ps(&B[1*ldb], row1);
    _mm256_store_ps(&B[2*ldb], row2);
    _mm256_store_ps(&B[3*ldb], row3);
    _mm256_store_ps(&B[4*ldb], row4);
    _mm256_store_ps(&B[5*ldb], row5);
    _mm256_store_ps(&B[6*ldb], row6);
    _mm256_store_ps(&B[7*ldb], row7);
}
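
For completeness, here is a hedged sketch (not the OP's posted code) of the blocked driver for the 8x8 kernel. It is the same loop as the SSE version, just stepping by 8, and it assumes both dimensions are multiples of 8 and each row is 32-byte aligned:

inline void transpose_block_AVX8x8(float *A, float *B, const int n, const int m,
                                   const int lda, const int ldb, const int block_size) {
    #pragma omp parallel for
    for (int i = 0; i < n; i += block_size) {
        for (int j = 0; j < m; j += block_size) {
            int max_i2 = i + block_size < n ? i + block_size : n;
            int max_j2 = j + block_size < m ? j + block_size : m;
            for (int i2 = i; i2 < max_i2; i2 += 8) {
                for (int j2 = j; j2 < max_j2; j2 += 8) {
                    transpose8x8_avx(&A[i2*lda + j2], &B[j2*ldb + i2], lda, ldb);
                }
            }
        }
    }
}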

Answer 1:

I'd guess that your best bet would be to try and combine the convolution and the transpose, i.e. write out the results of the convolution transposed as you go. You're almost certainly memory-bandwidth limited on the transpose, so reducing the number of instructions used for the transpose isn't really going to help (hence the lack of improvement from using AVX). Reducing the number of passes over your data is going to give you the best performance improvement.
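
For illustration, a minimal scalar sketch of that idea (my own code, not the OP's): a row convolution that stores its output transposed, assuming an odd kernel length and clamping at the borders.

void convolve_rows_store_transposed(const float *in, float *out,
                                    int w, int h,
                                    const float *kernel, int ksize) {
    int r = ksize / 2;                         // kernel radius, ksize assumed odd
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < w; x++) {
            float sum = 0.0f;
            for (int k = -r; k <= r; k++) {
                int xs = x + k;
                if (xs < 0)  xs = 0;           // clamp at the left border
                if (xs >= w) xs = w - 1;       // clamp at the right border
                sum += in[y*w + xs] * kernel[k + r];
            }
            out[x*h + y] = sum;                // transposed store: out has row stride h
        }
    }
}

Calling it twice (once on the w x h source, once on the h x w intermediate) produces the separable 2-D Gaussian back in the original orientation, so the two standalone transpose passes disappear.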



Answer 2:

FWIW, on a three-year-old Core i7 M laptop CPU, this naive 4x4 transpose was barely slower than your SSE version, while being almost 40% faster on a newer Intel Xeon E5-2630 v2 @ 2.60GHz desktop CPU.

inline void transpose4x4_naive(float *A, float *B, const int lda, const int ldb) {
    const float r0[] = { A[0], A[1], A[2], A[3] }; // memcpy instead?
    A += lda;
    const float r1[] = { A[0], A[1], A[2], A[3] };
    A += lda;
    const float r2[] = { A[0], A[1], A[2], A[3] };
    A += lda;
    const float r3[] = { A[0], A[1], A[2], A[3] };

    B[0] = r0[0];
    B[1] = r1[0];
    B[2] = r2[0];
    B[3] = r3[0];
    B += ldb;
    B[0] = r0[1];
    B[1] = r1[1];
    B[2] = r2[1];
    B[3] = r3[1];
    B += ldb;
    B[0] = r0[2];
    B[1] = r1[2];
    B[2] = r2[2];
    B[3] = r3[2];
    B += ldb;
    B[0] = r0[3];
    B[1] = r1[3];
    B[2] = r2[3];
    B[3] = r3[3];
}

Strangely enough, the older laptop CPU is faster than the dual E5-2630 v2 desktop with twice the cores, but that's a different story :)

Otherwise, you might also be interested in http://research.colfaxinternational.com/file.axd?file=2013%2F8%2FColfax_Transposition-7110P.pdf and http://colfaxresearch.com/multithreaded-transposition-of-square-matrices-with-common-code-for-intel-xeon-processors-and-intel-xeon-phi-coprocessors/ (requires a login now...)



Answer 3:

Consider this 4x4 transpose.

struct MATRIX {
    union {
        float  f[4][4];
        __m128 m[4];
        __m256 n[2];
    };
};
MATRIX myTranspose(MATRIX in) {

    // This takes 15 assembler instructions (compiled, not inlined),
    // and is faster than XMTranspose
    // Comes in like this  1  2  3  4  5  6  7  8
    //                     9 10 11 12 13 14 15 16
    //
    // Want the result     1  5  9 13  2  6 10 14
    //                     3  7 11 15  4  8 12 16

    __m256 t0, t1, t2, t3, t4, t5, n0, n1;
    MATRIX result;

    n0 = in.n[0];                                               // n0 =  1,  2,  3,  4,  5,  6,  7,  8
    n1 = in.n[1];                                               // n1 =  9, 10, 11, 12, 13, 14, 15, 16
    t0 = _mm256_unpacklo_ps(n0, n1);                            // t0 =  1,  9,  2, 10,  5, 13,  6, 14
    t1 = _mm256_unpackhi_ps(n0, n1);                            // t1 =  3, 11,  4, 12,  7, 15,  8, 16

    t2 = _mm256_permute2f128_ps(t0, t1, 0x20);                  // t2 =  1,  9,  2, 10,  3, 11,  4, 12 
    t3 = _mm256_permute2f128_ps(t0, t1, 0x31);                  // t3 =  5, 13,  6, 14,  7, 15,  8, 16

    t4 = _mm256_unpacklo_ps(t2, t3);                            // t4 =  1,  5,  9, 13,  3,  7, 11, 15
    t5 = _mm256_unpackhi_ps(t2, t3);                            // t5 =  2,  6, 10, 14,  4,  8, 12, 16

    result.n[0] = _mm256_permute2f128_ps(t4, t5, 0x20);         // result.n[0] =  1,  5,  9, 13,  2,  6, 10, 14
    result.n[1] = _mm256_permute2f128_ps(t4, t5, 0x31);         // result.n[1] =  3,  7, 11, 15,  4,  8, 12, 16
    return result;
}
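
A hypothetical usage sketch (my own, not from the answer): load a contiguous 4x4 block, transpose it in registers, and store it back. Unaligned loads/stores are used so the pointers only need to be valid.

#include <immintrin.h>

void transpose4x4_with_matrix(const float *src, float *dst) {
    MATRIX in, out;
    in.n[0] = _mm256_loadu_ps(src);       // rows 0 and 1 of the 4x4 block
    in.n[1] = _mm256_loadu_ps(src + 8);   // rows 2 and 3
    out = myTranspose(in);
    _mm256_storeu_ps(dst,     out.n[0]);  // transposed rows 0 and 1
    _mm256_storeu_ps(dst + 8, out.n[1]);  // transposed rows 2 and 3
}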