Running a multithreaded program, I noticed that the program was running faster using 1 thread compared to 4 threads, despite the CPU having 4 cores.
After investigating, I found out that the issue appears only when shuffling something.
Below the minimal program I created to reproduce the problem:
#include <math.h>
#include <future>
#include <ctime>
#include <vector>
#include <iostream>
#include <algorithm>
#define NB_JOBS 5000.0
#define MAX_CORES 8
static bool _no_shuffle(int nb_jobs){
bool b=false;
for(int i=0;i<nb_jobs;i++){
std::vector<float> v;
for(float i=0;i<100.0;i+=0.01) v.push_back(i);
float sum = 0;
// no meaning, just using CPU
for(int i=0;i<v.size();i++) sum+=pow(sin(v[i]),1.1);
if(sum==100) b=true;
}
return b;
}
static bool _shuffle(int nb_jobs){
bool b=false;
for(int i=0;i<nb_jobs;i++){
std::vector<float> v;
for(float i=0;i<100.0;i+=0.01) v.push_back(i);
std::random_shuffle(v.begin(), v.end()); // !!!
if (v[0]==0.0) b=true;
}
return b;
}
static double _duration(int nb_cores){
auto start = std::chrono::system_clock::now();
int nb_jobs_per_core = rint ( NB_JOBS / (float)nb_cores );
std::vector < std::future<bool> > futures;
for(int i=0;i<nb_cores;i++){
futures.push_back( std::async(std::launch::async,_shuffle,nb_jobs_per_core));
}
for (auto &e: futures) {
bool foo = e.get();
}
auto end = std::chrono::system_clock::now();
std::chrono::duration<double> elapsed = end - start;
return elapsed.count();
}
int main(){
for(int nb_cores=1 ; nb_cores<=MAX_CORES ; nb_cores++){
double duration = _duration(nb_cores);
std::cout << nb_cores << " threads: " << duration << " seconds\n";
}
return 0;
}
which prints:
1 threads: 1.18503 seconds
2 threads: 9.6502 seconds
3 threads: 3.64973 seconds
4 threads: 9.8834 seconds
5 threads: 10.7937 seconds
6 threads: 11.6447 seconds
7 threads: 11.9236 seconds
8 threads: 12.1254 seconds
Using threads slow down the program !
on the other hand, when replacing:
std::async(std::launch::async,_shuffle,nb_jobs_per_core));
with:
std::async(std::launch::async,_no_shuffle,nb_jobs_per_core));
then:
1 threads: 3.24132 seconds
2 threads: 1.62207 seconds
3 threads: 1.1745 seconds
4 threads: 1.09769 seconds
5 threads: 1.03182 seconds
6 threads: 0.892274 seconds
7 threads: 0.782815 seconds
8 threads: 0.704777 seconds
which seems as expected, indicating shuffling is indeed the issue.
Is shuffling not thread friendly, and if so, how to shuffle a vector in a multi-threaded program ?