Version A is not vectorized by the compiler, while version B is.
How can I make version A vectorize while keeping the extents variable (i.e. without hard-coding literal extents)?
The nested loop performs an element-wise multiplication with broadcasting, as in Python's NumPy library and in MATLAB. A description of broadcasting in NumPy is here.
Version A code (no std::vector. no vectorization.)
This only uses `imull (%rsi), %edx` in `.L169`, which is not a SIMD instruction.
gcc godbolt
#include <iostream>
#include <stdint.h>
typedef int32_t DATA_TYPE;
// Returns the total number of elements described by an N-dimensional shape,
// i.e. the product of all N extents in Ashape.
template <size_t N>
size_t cal_size(size_t (&Ashape)[N]){
    size_t total = 1;
    for (const size_t extent : Ashape) {
        total *= extent;
    }
    return total;
}
// Computes the row-major strides (in elements) for an N-dimensional shape.
//
// Astride[i] is the product of the extents of all axes after axis i, so the
// last axis always has stride 1.
//
// The strides are accumulated from the last axis towards the first with a
// running product. Unlike dividing the total size by each extent in turn, this
// never divides, so a shape containing a zero extent is handled without a
// division by zero.
//
// Returns a heap array of N strides allocated with new[]; the caller owns it
// and must release it with delete[].
template <size_t N>
size_t * cal_stride(size_t (&Ashape)[N] ) {
    size_t * Astride = new size_t[N];
    size_t running = 1;  // product of the extents of the axes already visited
    for (size_t axis = N; axis-- > 0; ) {
        Astride[axis] = running;
        running *= Ashape[axis];
    }
    return Astride;
}
// Allocates a buffer with one DATA_TYPE element per entry of the given shape
// and fills it with 1, 2, 3, ... in storage order.
//
// Returns a heap array allocated with new[]; the caller owns it and must
// release it with delete[].
template <size_t N>
DATA_TYPE * init_data( size_t (&Ashape)[N] ){
    const size_t count = cal_size(Ashape);
    DATA_TYPE * const buffer = new DATA_TYPE[count];
    size_t pos = 0;
    while (pos < count) {
        // Element at position pos holds pos + 1 (converted to DATA_TYPE).
        buffer[pos] = pos + 1;
        ++pos;
    }
    return buffer;
}
// Writes every element of Adata to std::cout in storage order, each followed
// by ", ", then ends the line and flushes the stream.
// The number of elements printed is the product of the extents in Ashape.
template <size_t N>
void print_data(DATA_TYPE * Adata, size_t (&Ashape)[N] ){
    const DATA_TYPE * const last = Adata + cal_size(Ashape);
    for (const DATA_TYPE * cur = Adata; cur != last; ++cur) {
        std::cout << *cur << ", ";
    }
    std::cout << std::endl;
}
int main(void){
constexpr size_t nd = 3;
size_t Ashape[] = {20,3,4};
size_t Bshape[] = {1,3,1};
auto Astride = cal_stride(Ashape);
auto Bstride = cal_stride(Bshape);
auto Adata = init_data(Ashape);
auto Bdata = init_data(Bshape);
size_t c[nd] = {0,0,0};
///counter
size_t hint[nd] = {0,2,0};
//hint tells which are the broadcasting axes.
size_t A_i, B_i;
for(c[0] = 0; c[0] < Ashape[0]; ++c[0]){ // Ashape as hint[0] = 0
for(c[1] = 0; c[1] < Bshape[1]; ++c[1]){ //Bshape as hint[1] = 2
for(c[2] = 0; c[2] < Ashape[2];++c[2]){ //Asape as hint[2] = 0
A_i = c[0]*Astride[0] + c[1]*Astride[1] + c[2]*Astride[2];
B_i = c[1]*Bstride[1];
Adata[A_i] *= Bdata[B_i];
}
}
}
print_data(Adata, Ashape);
}
Version B code (no std::vector, literal extents; this one vectorizes)
This uses `pmulld %xmm3, %xmm2` in `.L20`, which is a SIMD instruction.
gcc godbolt
#include <iostream>
#include <stdint.h>
typedef int32_t DATA_TYPE;
// Writes the first `size` elements of Adata to std::cout, each followed by
// ", ", then ends the line and flushes the stream.
void print_data(DATA_TYPE * Adata, size_t size ){
    const DATA_TYPE * cursor = Adata;
    for (size_t remaining = size; remaining != 0; --remaining) {
        std::cout << *cursor << ", ";
        ++cursor;
    }
    std::cout << std::endl;
}
// Same broadcasting multiply as a 20x3x4 array times a 1x3x1 array, but with
// every extent and stride written as a literal, then prints the result.
int main(void){
    int32_t Adata[240];
    int32_t Bdata[3];

    // Give both arrays defined contents (1, 2, 3, ...) before they are read;
    // multiplying indeterminate values would be undefined behaviour.
    // NOTE(review): re-check the generated assembly after this change to
    // confirm the multiply loop is still emitted as SIMD code.
    for(size_t n = 0; n < 240; ++n){
        Adata[n] = static_cast<int32_t>(n + 1);
    }
    for(size_t n = 0; n < 3; ++n){
        Bdata[n] = static_cast<int32_t>(n + 1);
    }

    size_t A_i, B_i, i,j,k;
    for(i = 0; i < 20; ++i){
        for(j = 0; j < 3; ++j){
            for(k = 0; k < 4;++k){
                A_i = i*12 + j*4 + k*1;  // literal row-major strides 12, 4, 1
                B_i = j*1;               // B only varies along axis 1
                Adata[A_i] *= Bdata[B_i];
            }
        }
    }
    print_data(Adata, 240);
}
The boost::multi_array version also vectorizes, but why? I am not sure whether it uses SIMD-aligned memory.
gcc godbolt
#include "boost/multi_array.hpp"
#include <iostream>
// Broadcasting multiply using boost::multi_array: A (d1 x d2 x d3) is
// multiplied in place by B (1 x d2 x 1), then A is printed.
int
main () {
    // Create a 3D array that is 3 x 4 x 2.
    // The extents are loaded from volatile storage so that they have defined
    // values (reading uninitialized ints is undefined behaviour) while the
    // optimizer still has to treat them as run-time values rather than
    // literal extents.
    volatile int extent_source[3] = {3, 4, 2};
    const int d1 = extent_source[0];
    const int d2 = extent_source[1];
    const int d3 = extent_source[2];

    typedef boost::multi_array<int, 3> array_type;
    typedef array_type::index index;
    array_type A(boost::extents[d1][d2][d3]);
    array_type B(boost::extents[1][d2][1]);

    // Multiply every element of A by the B element with the same axis-1 index
    // (B is broadcast along axes 0 and 2).
    for(index i = 0; i != d1; ++i)
        for(index j = 0; j != d2; ++j)
            for(index k = 0; k != d3; ++k)
                A[i][j][k] *= B[0][j][0];

    // Print the elements of A in index order.
    for(index i = 0; i != d1; ++i)
        for(index j = 0; j != d2; ++j)
            for(index k = 0; k != d3; ++k)
                std::cout << A[i][j][k];
    return 0;
}
There is a 2004 PDF at gcc.gnu.org that describes some of GCC's loop optimizations. I hope the "Symbolic Chrecs" it mentions (which correspond to unanalyzed variables) mean that GCC can fuse nested loops with variable extents.
As a last resort, loop fusion could be implemented with template meta-programming.