RBM no improvement with OpenACC on the code yet

RBM algorithm is open source algorithm the source code is available here: https://github.com/yusugomori/DeepLearning/tree/master/cpp

I tried to get improvement with OpenACC by different ways but the sequential code still better So can you tell me what should be done (part needs to improved) to get high improvement

#include <iostream>
#include <math.h>
#include "utils.h"
#include "RBM.h"
using namespace std;
using namespace utils;


RBM::RBM(int size, int n_v, int n_h, double **w, double *hb, double *vb) {
N = size;
n_visible = n_v;
n_hidden = n_h;

#pragma acc enter data copyin ( this)

//#pragma acc enter data copy ( W[0:n_hidden][0:n_visible] )
if(w == NULL) {
W = new double*[n_hidden];
for(int i=0; i<n_hidden; i++) W[i] = new double[n_visible];
double a = 1.0 / n_visible;

for(int i=0; i<n_hidden; i++) {
for(int j=0; j<n_visible; j++) {
W[i][j] = uniform(-a, a);
}
}
} else {
W = w;
}

if(hb == NULL) {
hbias = new double[n_hidden];
for(int i=0; i<n_hidden; i++) hbias[i] = 0;
} else {
hbias = hb;
}

if(vb == NULL) {
vbias = new double[n_visible];
for(int i=0; i<n_visible; i++) vbias[i] = 0;
} else {
vbias = vb;
}
}

RBM::~RBM() {

#pragma acc exit data delete ( W[0:n_hidden][0:n_visible],this )

for(int i=0; i<n_hidden; i++) delete[] W[i];
delete[] W;
delete[] hbias;
delete[] vbias;
}


void RBM::contrastive_divergence(int *input, double lr, int k) {
double *ph_mean = new double[n_hidden];
int *ph_sample = new int[n_hidden];
double *nv_means = new double[n_visible];
int *nv_samples = new int[n_visible];
double *nh_means = new double[n_hidden];
int *nh_samples = new int[n_hidden];

/* CD-k */
sample_h_given_v(input, ph_mean, ph_sample);

for(int step=0; step<k; step++) {
if(step == 0) {
gibbs_hvh(ph_sample, nv_means, nv_samples, nh_means, nh_samples);
} else {
gibbs_hvh(nh_samples, nv_means, nv_samples, nh_means, nh_samples);
}
}

for(int i=0; i<n_hidden; i++) {
for(int j=0; j<n_visible; j++) {
// W[i][j] += lr * (ph_sample[i] * input[j] - nh_means[i] * nv_samples[j]) / N;
W[i][j] += lr * (ph_mean[i] * input[j] - nh_means[i] * nv_samples[j]) / N;
}
hbias[i] += lr * (ph_sample[i] - nh_means[i]) / N;
}

for(int i=0; i<n_visible; i++) {
vbias[i] += lr * (input[i] - nv_samples[i]) / N;
}

delete[] ph_mean;
delete[] ph_sample;
delete[] nv_means;
delete[] nv_samples;
delete[] nh_means;
delete[] nh_samples;
}

void RBM::sample_h_given_v(int *v0_sample, double *mean, int *sample) {
for(int i=0; i<n_hidden; i++) {
mean[i] = propup(v0_sample, W[i], hbias[i]);
sample[i] = binomial(1, mean[i]);
}
}

void RBM::sample_v_given_h(int *h0_sample, double *mean, int *sample) {
for(int i=0; i<n_visible; i++) {
mean[i] = propdown(h0_sample, i, vbias[i]);
sample[i] = binomial(1, mean[i]);
}
}

double RBM::propup(int *v, double *w, double b) {

double pre_sigmoid_activation = 0.0;
#pragma acc enter data present ( this )
#pragma acc data copyin(v[0:n_visible],w[0:n_visible])


#pragma acc parallel
{

#pragma acc loop reduction(+:pre_sigmoid_activation) 
for(int j=0; j<n_visible; j++) {
pre_sigmoid_activation += w[j] * v[j];
}
}

pre_sigmoid_activation += b;
return sigmoid(pre_sigmoid_activation);
}
double RBM::propdown(int *h, int i, double b) {

double pre_sigmoid_activation = 0.0;

#pragma acc enter data present ( this)//,W[0:n_hidden][0:n_visible] )
#pragma acc enter data copyin ( W[0:n_hidden][0:n_visible] )
#pragma acc data copyin(h[0:n_hidden]) 

#pragma acc parallel 

{
#pragma acc loop reduction(+:pre_sigmoid_activation) 
for(int j=0; j<n_hidden; j++) {
pre_sigmoid_activation += W[j][i] * h[j];
}
}

pre_sigmoid_activation += b;

return sigmoid(pre_sigmoid_activation);

}
void RBM::gibbs_hvh(int *h0_sample, double *nv_means, int *nv_samples, \
        double *nh_means, int *nh_samples) {
sample_v_given_h(h0_sample, nv_means, nv_samples);
sample_h_given_v(nv_samples, nh_means, nh_samples);
}

void RBM::reconstruct(int *v, double *reconstructed_v) {
double *h = new double[n_hidden];
double pre_sigmoid_activation;

for(int i=0; i<n_hidden; i++) {
h[i] = propup(v, W[i], hbias[i]);
}

for(int i=0; i<n_visible; i++) {
pre_sigmoid_activation = 0.0;
for(int j=0; j<n_hidden; j++) {
pre_sigmoid_activation += W[j][i] * h[j];
}
pre_sigmoid_activation += vbias[i];

reconstructed_v[i] = sigmoid(pre_sigmoid_activation);
}

delete[] h;

//----------------------------------------------------The main
void test_rbm() {

srand(0);

double learning_rate = 0.1;

int training_epochs = 1000;

int k = 1;



int train_N = 6;

int test_N = 2;

int n_visible = 6;

int n_hidden = 3;



// training data

int train_X[6][6] = {

{1, 1, 1, 0, 0, 0},

{1, 0, 1, 0, 0, 0},

{1, 1, 1, 0, 0, 0},

{0, 0, 1, 1, 1, 0},

{0, 0, 1, 0, 1, 0},

{0, 0, 1, 1, 1, 0}

};





// construct RBM

RBM rbm(train_N, n_visible, n_hidden, NULL, NULL, NULL);



// train

for(int epoch=0; epoch<training_epochs; epoch++) {

for(int i=0; i<train_N; i++) {

rbm.contrastive_divergence(train_X[i], learning_rate, k);

}

}



// test data

int test_X[2][6] = {

{1, 1, 0, 0, 0, 0},

{0, 0, 0, 1, 1, 0}

};

double reconstructed_X[2][6];





// test

for(int i=0; i<test_N; i++) {

rbm.reconstruct(test_X[i], reconstructed_X[i]);

for(int j=0; j<n_visible; j++) {

printf("%.5f ", reconstructed_X[i][j]);

}

cout << endl;

}



}







int main() {

test_rbm();

return 0;

标签： c++ parallel-processing openacc

2条回答

兄弟一词,经得起流年.

2楼-- · 2019-09-14 21:23

You have a few errors which were giving you wrong answers. I've correct these below.

As for performance, you don't have enough parallelism to out perform running the code sequentially. The loops you're parallelizing have very little compute, use a reductions, and are very small. To see speed-up over the host, you'll need to use larger sizes (lengths of many thousands) and preferably push the gang level of the parallelism to a higher loop. I tried this, but the binomial routine has a dependency (the call to rand) which prevents the parallelization of the loops in "sample_[vh]_given[_vh]".

#include <iostream>
#include <math.h>
#include "utils.h"
#include "RBM.h"
using namespace std;
using namespace utils;

RBM::RBM(int size, int n_v, int n_h, double **w, double *hb, double *vb) {
        N = size;
        n_visible = n_v;
        n_hidden = n_h;

        if(w == NULL) {
                W = new double*[n_hidden];
                for(int i=0; i<n_hidden; i++) W[i] = new double[n_visible];
                double a = 1.0 / n_visible;

                for(int i=0; i<n_hidden; i++) {
                        for(int j=0; j<n_visible; j++) {
                                W[i][j] = uniform(-a, a);
                        }
                }
        } else {
                W = w;
        }

        if(hb == NULL) {
                hbias = new double[n_hidden];
                for(int i=0; i<n_hidden; i++) hbias[i] = 0;
        } else {
                hbias = hb;
        }

        if(vb == NULL) {
                vbias = new double[n_visible];
                for(int i=0; i<n_visible; i++) vbias[i] = 0;
        } else {
                vbias = vb;
        }
#pragma acc enter data copyin (this,W[0:n_hidden][0:n_visible],hbias[0:n_hidden],vbias[0:n_visible])
}

RBM::~RBM() {

#pragma acc exit data delete ( hbias[0:n_hidden],vbias[0:n_visible],W[0:n_hidden][0:n_visible],this )
        for(int i=0; i<n_hidden; i++) delete[] W[i];
        delete[] W;
        delete[] hbias;
        delete[] vbias;
}

void RBM::contrastive_divergence(int *input, double lr, int k) {
        double *ph_mean = new double[n_hidden];
        int *ph_sample = new int[n_hidden];
        double *nv_means = new double[n_visible];
        int *nv_samples = new int[n_visible];
        double *nh_means = new double[n_hidden];
        int *nh_samples = new int[n_hidden];

        /* CD-k */
        sample_h_given_v(input, ph_mean, ph_sample);

        for(int step=0; step<k; step++) {
                if(step == 0) {
                        gibbs_hvh(ph_sample, nv_means, nv_samples, nh_means, nh_samples);
                } else {
                        gibbs_hvh(nh_samples, nv_means, nv_samples, nh_means, nh_samples);
                }
        }

        for(int i=0; i<n_hidden; i++) {
                for(int j=0; j<n_visible; j++) {
                        // W[i][j] += lr * (ph_sample[i] * input[j] - nh_means[i] * nv_samples[j]) / N;
                        W[i][j] += lr * (ph_mean[i] * input[j] - nh_means[i] * nv_samples[j]) / N;
                }
                hbias[i] += lr * (ph_sample[i] - nh_means[i]) / N;
        }

        for(int i=0; i<n_visible; i++) {
                vbias[i] += lr * (input[i] - nv_samples[i]) / N;
        }
#pragma acc update device(vbias[0:n_visible],hbias[0:n_hidden],W[0:n_hidden][0:n_visible])

        delete[] ph_mean;
        delete[] ph_sample;
        delete[] nv_means;
        delete[] nv_samples;
        delete[] nh_means;
        delete[] nh_samples;
}

void RBM::sample_h_given_v(int *v0_sample, double *mean, int *sample) {

#pragma acc data copyin(v0_sample[0:n_visible])
        {
                for(int i=0; i<n_hidden; i++) {
                        mean[i] = propup(v0_sample, W[i], hbias[i]);
                        sample[i] = binomial(1, mean[i]);
                }
        }
}

void RBM::sample_v_given_h(int *h0_sample, double *mean, int *sample) {
#pragma acc data copyin(h0_sample[0:n_visible])
        {
                for(int i=0; i<n_visible; i++) {
                        mean[i] = propdown(h0_sample, i, vbias[i]);
                        sample[i] = binomial(1, mean[i]);
                }
        }
}

double RBM::propup(int *v, double *w, double b) {

        double pre_sigmoid_activation = 0.0;
#pragma acc parallel present(w,v)
        {
#pragma acc loop reduction(+:pre_sigmoid_activation)
                for(int j=0; j<n_visible; j++) {
                        pre_sigmoid_activation += w[j] * v[j];
                }
        }

        pre_sigmoid_activation += b;
        return sigmoid(pre_sigmoid_activation);
}
double RBM::propdown(int *h, int i, double b) {

        double pre_sigmoid_activation = 0.0;

#pragma acc parallel present(W,h)
        {
#pragma acc loop reduction(+:pre_sigmoid_activation)
                for(int j=0; j<n_hidden; j++) {
                        pre_sigmoid_activation += W[j][i] * h[j];
                }
        }

        pre_sigmoid_activation += b;

        return sigmoid(pre_sigmoid_activation);

}
void RBM::gibbs_hvh(int *h0_sample, double *nv_means, int *nv_samples, \
                double *nh_means, int *nh_samples) {
        sample_v_given_h(h0_sample, nv_means, nv_samples);
        sample_h_given_v(nv_samples, nh_means, nh_samples);
}

void RBM::reconstruct(int *v, double *reconstructed_v) {
        double *h = new double[n_hidden];
        double pre_sigmoid_activation;

#pragma acc data copyin(v[0:n_visible])
        {

                for(int i=0; i<n_hidden; i++) {
                        h[i] = propup(v, W[i], hbias[i]);
                }

                for(int i=0; i<n_visible; i++) {
                        pre_sigmoid_activation = 0.0;
                        for(int j=0; j<n_hidden; j++) {
                                pre_sigmoid_activation += W[j][i] * h[j];
                        }
                        pre_sigmoid_activation += vbias[i];

                        reconstructed_v[i] = sigmoid(pre_sigmoid_activation);
                }
        }
        delete[] h;
}

//----------------------------------------------------The main
void test_rbm() {

        srand(0);
        double learning_rate = 0.1;
        int training_epochs = 1000;
        int k = 1;
        int train_N = 6;
        int test_N = 2;
        int n_visible = 6;
        int n_hidden = 3;

        // training data
        int train_X[6][6] = {
                {1, 1, 1, 0, 0, 0},
                {1, 0, 1, 0, 0, 0},
                {1, 1, 1, 0, 0, 0},
                {0, 0, 1, 1, 1, 0},
                {0, 0, 1, 0, 1, 0},
                {0, 0, 1, 1, 1, 0}
        };

        // construct RBM
        RBM rbm(train_N, n_visible, n_hidden, NULL, NULL, NULL);

        // train
        for(int epoch=0; epoch<training_epochs; epoch++) {
                for(int i=0; i<train_N; i++) {
                        rbm.contrastive_divergence(train_X[i], learning_rate, k);
                }
        }

        // test data
        int test_X[2][6] = {
                {1, 1, 0, 0, 0, 0},
                {0, 0, 0, 1, 1, 0}
        };

        double reconstructed_X[2][6];

        // test
        for(int i=0; i<test_N; i++) {
                rbm.reconstruct(test_X[i], reconstructed_X[i]);
                for(int j=0; j<n_visible; j++) {
                        printf("%20.15f ", reconstructed_X[i][j]);
                }
                cout << endl;
        }
}

int main() {
        test_rbm();
        return 0;
}

0人赞添加讨论(0) 举报

别忘想泡老子

3楼-- · 2019-09-14 21:33

You could put a bounded binomial distribution (as an array of int) inside the kernel function. Thereby ruling out the need for rand().

0人赞添加讨论(0) 举报

RBM no improvement with OpenACC on the code yet

采纳回答

编辑标签

举报内容

检举类型

检举原因

检举说明(必填)

打开微信“扫一扫”，打开网页后点击屏幕右上角分享按钮

付费偷看金额在0.1-10元之间