I have following matlab code;
tempx = full(sum(X.^2, 2));
tempc = full(sum(C.^2, 2).');
D = -2*(X * C.');
D = bsxfun(@plus, D, tempx);
D = bsxfun(@plus, D, tempc);
where X is nxm and W is kxm matrices realtively. One is the data and the other is the weight matrix. I find the distance matrix D with the given code. I am watching an efficient Cublas or Thrust implementation of this operations. I succeeded the line D = -2*(X * C.');
by cublas but the residual part is still a question as a newbie? Can anybody help with a snippet or give suggestions?
Here is what I have so far:
Edit: I add some more code and I need bsxfun like implementation of summation. Sum vector V with all the columns and sum V2 with all the rows as a final step.
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda_runtime.h>
#include "cublas_v2.h"
#include <algorithm>
#include <cuda.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/functional.h>
#include <thrust/system_error.h>
#include <thrust/sequence.h>
#include <thrust/copy.h>
#define N 4
#define K 4
#define M 3
template <typename T>
struct square_op{
__host__ __device__ T operator()(const T& x)const{
return x*x;
}
};
int main (void){
cudaError_t cudaStat;
cublasStatus_t stat;
cublasHandle_t handle;
stat = cublasCreate(&handle);
if (stat != CUBLAS_STATUS_SUCCESS){
printf("CUBLAS initialization failure!!\n");
return EXIT_FAILURE;
}
// Fill with random data
thrust::host_vector<float> C_h(N*K);
thrust::host_vector<float> A_h(N*M); //data matrix
thrust::host_vector<float> B_h(K*M); //weight matrix
thrust::sequence(A_h.begin(),A_h.end());
thrust::sequence(B_h.begin(),B_h.end());
// std::generate(A_h.begin(), A_h.end(), rand);
// std::generate(B_h.begin(), B_h.end(), rand);
thrust::device_vector<float> A_d = A_h;
thrust::device_vector<float> B_d = B_h;
thrust::device_vector<float> C_d(N*K);
thrust::device_vector<float> dummy_x(M,1);
thrust::device_vector<float> A_sum_vec_d(N,0);
thrust::device_vector<float> B_sum_vec_d(K,0);
// TEST variables
thrust::host_vector<float> A_sum_vec_h(N,0);
thrust::host_vector<float> B_sum_vec_h(K,0);
for (int i = 0; i < N; ++i) {
for (int j = 0; j < M; ++j) {
printf("%f ",A_h[i*M+j]);
}
printf("\n");
}
printf("\n");
for (int i = 0; i < K; ++i) {
for (int j = 0; j < M; ++j) {
printf("%f ",B_h[i*M+j]);
}
printf("\n");
}
printf("\n");
std::cout<< "Starting GPU run" <<std::endl; //add this line
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
//************************************
// Calculate Square Elements
//************************************
square_op<float> unary_op = square_op<float>();
thrust::transform(A_d.begin(),A_d.end(),A_d.begin(),unary_op);
thrust::transform(B_d.begin(),B_d.end(),B_d.begin(),unary_op);
// TEST
thrust::copy(A_d.begin(),A_d.end(), A_h.begin());
printf("Matrix A after square!!\n");
for (int i = 0; i < N; ++i) {
for (int j = 0; j < M; ++j) {
printf("%f ",A_h[i*M+j]);
}
printf("\n");
}
printf("\n");
thrust::copy(B_d.begin(),B_d.end(), B_h.begin());
printf("Matrix B after square!!\n");
for (int i = 0; i < K; ++i) {
for (int j = 0; j < M; ++j) {
printf("%f ",B_h[i*M+j]);
}
printf("\n");
}
printf("\n");
//************************************
// Sum of the Rows
//************************************
float alpha = 1.0f;
float beta = 0.0f;
stat = cublasSgemv_v2(handle,CUBLAS_OP_T,M,N,&alpha,thrust::raw_pointer_cast(&A_d[0]),M,thrust::raw_pointer_cast(&dummy_x[0]),1,&beta,thrust::raw_pointer_cast(&A_sum_vec_d[0]),1);
if (stat != CUBLAS_STATUS_SUCCESS){
printf("1 CUBLAS initialization failure!!\n");
return EXIT_FAILURE;
}
stat = cublasSgemv_v2(handle,CUBLAS_OP_T,M,K,&alpha,thrust::raw_pointer_cast(&B_d[0]),M,thrust::raw_pointer_cast(&dummy_x[0]),1,&beta,thrust::raw_pointer_cast(&B_sum_vec_d[0]),1);
if (stat != CUBLAS_STATUS_SUCCESS){
printf("2 CUBLAS initialization failure!!\n");
return EXIT_FAILURE;
}
// TEST
thrust::copy(A_sum_vec_d.begin(), A_sum_vec_d.end(), A_sum_vec_h.begin());
printf("A_vec after row sum!!\n");
for (int j = 0; j < N; ++j) {
printf("%f ",A_sum_vec_h[j]);
}
printf("\n \n");
thrust::copy(B_sum_vec_d.begin(), B_sum_vec_d.end(), B_sum_vec_h.begin());
printf("B_vec after row sum!!\n");
for (int j = 0; j < K; ++j) {
printf("%f ",B_sum_vec_h[j]);
}
printf("\n \n");
//************************************
// Matrix Multiplication
//************************************
alpha = 2.0f;
beta = 0.0f;
//alpha*(A*B')+beta in row_major_order
stat = cublasSgemm_v2(handle,CUBLAS_OP_T,CUBLAS_OP_N,N,K,M,&alpha,thrust::raw_pointer_cast(&A_d[0]),M,thrust::raw_pointer_cast(&B_d[0]), M, &beta,thrust::raw_pointer_cast(&C_d[0]), N);
if (stat != CUBLAS_STATUS_SUCCESS){
printf("CUBLAS initialization failure!!\n");
return EXIT_FAILURE;
}
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
float elapsedTime;
float totalTime;
cudaEventElapsedTime(&elapsedTime, start, stop);
cudaEventDestroy(start);
cudaEventDestroy(stop);
totalTime = elapsedTime/1000;
std::cout<<"Elapsed_time:"<< totalTime<<std::endl;
//copy back data
thrust::copy(C_d.begin(),C_d.end(),C_h.begin());
for (int i = 0; i < N; ++i) {
for (int j = 0; j < K; ++j) {
printf("%f ",C_h[i*K+j]);
}
printf("\n");
}
//************************************
// Final summation
//************************************
//.... NEED CODE
if (stat != CUBLAS_STATUS_SUCCESS){
printf("CUBLAS initialization failure!!\n");
return EXIT_FAILURE;
}
printf("Execution ends!!\n");
return EXIT_SUCCESS;
}
Here is my final code that is working as in my description. It is possibly not the most efficient one but works at least.
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda_runtime.h>
#include "cublas_v2.h"
#include <algorithm>
#include <cuda.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/functional.h>
#include <thrust/system_error.h>
#include <thrust/sequence.h>
#include <thrust/device_free.h>
#include <thrust/reduce.h>
#include <thrust/copy.h>
#define N 4
#define K 4
#define M 3
//----------------------------
// Find min argument of array
// ---------------------------
// C-style indexing
int ci(int row, int column, int nColumns) {
return row*nColumns+column;
}
// Convert a linear index to a row index
template <typename T>
struct linear_index_to_row_index : public thrust::unary_function<T,T>
{
T C; // number of columns
__host__ __device__
linear_index_to_row_index(T C) : C(C) {}
__host__ __device__
T operator()(T i)
{
return i / C;
}
};
typedef thrust::tuple<int,float> argMinType;
struct argMin : public thrust::binary_function<argMinType,argMinType,argMinType>
{
__host__ __device__
argMinType operator()(const argMinType& a, const argMinType& b) const
{
if (thrust::get<1>(a) < thrust::get<1>(b)){
return a;
} else {
return b;
}
}
};
thrust::device_vector<argMinType> minsOfRowSpace(thrust::device_vector<float> A, int nRows, int nColumns) {
// allocate storage for row argmins and indices
thrust::device_vector<argMinType> row_argmins(nRows);
thrust::device_vector<int> row_indices(nRows);
// compute row argmins by finding argmin values with equal row indices
thrust::reduce_by_key
(thrust::make_transform_iterator(thrust::counting_iterator<int>(0), linear_index_to_row_index<int>(nColumns)),
thrust::make_transform_iterator(thrust::counting_iterator<int>(0), linear_index_to_row_index<int>(nColumns)) + (nRows*nColumns),
thrust::make_zip_iterator(thrust::make_tuple(thrust::counting_iterator<int>(0),A.begin())),
row_indices.begin(),
row_argmins.begin(),
thrust::equal_to<int>(),
argMin());
return row_argmins;
}
template <typename T>
struct square_op{
__host__ __device__ T operator()(const T& x)const{
return x*x;
}
};
int main (void){
cublasStatus_t stat;
cublasHandle_t handle;
stat = cublasCreate(&handle);
if (stat != CUBLAS_STATUS_SUCCESS){
printf("CUBLAS initialization failure!!\n");
return EXIT_FAILURE;
}
// Fill with random data
thrust::host_vector<float> C_h(N*K);
thrust::host_vector<argMinType> minOfC_h(N);
thrust::host_vector<float> A_h(N*M); //data matrix
thrust::host_vector<float> B_h(K*M); //weight matrix
thrust::sequence(A_h.begin(),A_h.end());
thrust::sequence(B_h.begin(),B_h.end());
// std::generate(A_h.begin(), A_h.end(), rand);
// std::generate(B_h.begin(), B_h.end(), rand);
thrust::device_vector<float> A_d = A_h;
thrust::device_vector<float> B_d = B_h;
thrust::device_vector<float> C_d(N*K);
thrust::device_vector<float> dummy_x(M,1);
thrust::device_vector<float> A_sum_vec_d(N,0);
thrust::device_vector<float> B_sum_vec_d(K,0);
thrust::device_vector<argMinType> minsOfC_d(N);
// TEST variables
thrust::host_vector<float> A_sum_vec_h(N,0);
thrust::host_vector<float> B_sum_vec_h(K,0);
// TEST
// for (int i = 0; i < N; ++i) {
// for (int j = 0; j < M; ++j) {
// printf("%f ",A_h[i*M+j]);
// }
// printf("\n");
// }
//
// printf("\n");
//
// for (int i = 0; i < K; ++i) {
// for (int j = 0; j < M; ++j) {
// printf("%f ",B_h[i*M+j]);
// }
// printf("\n");
// }
//
// printf("\n");
std::cout<< "Starting GPU run" <<std::endl; //add this line
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start, 0);
//************************************
// Matrix Multiplication
//************************************
float alpha = -2.0f;
float beta = 0.0f;
//alpha*(A*B')+beta in row_major_order
stat = cublasSgemm_v2(handle,CUBLAS_OP_T,CUBLAS_OP_N,N,K,M,&alpha,thrust::raw_pointer_cast(&A_d[0]),M,thrust::raw_pointer_cast(&B_d[0]), M, &beta,thrust::raw_pointer_cast(&C_d[0]), N);
if (stat != CUBLAS_STATUS_SUCCESS){
printf("CUBLAS initialization failure!!\n");
return EXIT_FAILURE;
}
//TEST
// C_h = C_d;
// printf("After Matrix Multiplicaton\n");
// for (int i = 0; i < N; ++i) {
// for (int j = 0; j < K; ++j) {
// printf("%f ",C_h[i*K+j]);
// }
// printf("\n");
// }
// printf("\n");
//************************************
// Calculate Square Elements
//************************************
try{
square_op<float> unary_op = square_op<float>();
thrust::transform(A_d.begin(),A_d.end(),A_d.begin(),unary_op);
thrust::transform(B_d.begin(),B_d.end(),B_d.begin(),unary_op);
}catch(thrust::system_error &e)
{
// output an error message and exit
std::cerr << "Error Transform square elements: " << e.what() << std::endl;
exit(-1);
}
// TEST
// thrust::copy(A_d.begin(),A_d.end(), A_h.begin());
// printf("Matrix A after square!!\n");
// for (int i = 0; i < N; ++i) {
// for (int j = 0; j < M; ++j) {
// printf("%f ",A_h[i*M+j]);
// }
// printf("\n");
// }
//
// printf("\n");
//
// thrust::copy(B_d.begin(),B_d.end(), B_h.begin());
// printf("Matrix B after square!!\n");
// for (int i = 0; i < K; ++i) {
// for (int j = 0; j < M; ++j) {
// printf("%f ",B_h[i*M+j]);
// }
// printf("\n");
// }
//
// printf("\n");
//************************************
// Sum of the Rows
//************************************
alpha = 1.0f;
beta = 0.0f;
stat = cublasSgemv_v2(handle,CUBLAS_OP_T,M,N,&alpha,thrust::raw_pointer_cast(&A_d[0]),M,thrust::raw_pointer_cast(&dummy_x[0]),1,&beta,thrust::raw_pointer_cast(&A_sum_vec_d[0]),1);
if (stat != CUBLAS_STATUS_SUCCESS){
printf("1 CUBLAS initialization failure!!\n");
return EXIT_FAILURE;
}
stat = cublasSgemv_v2(handle,CUBLAS_OP_T,M,K,&alpha,thrust::raw_pointer_cast(&B_d[0]),M,thrust::raw_pointer_cast(&dummy_x[0]),1,&beta,thrust::raw_pointer_cast(&B_sum_vec_d[0]),1);
if (stat != CUBLAS_STATUS_SUCCESS){
printf("2 CUBLAS initialization failure!!\n");
return EXIT_FAILURE;
}
//thrust::device_free(thrust::raw_pointer_cast(&dummy_x[0]));
// TEST
// thrust::copy(A_sum_vec_d.begin(), A_sum_vec_d.end(), A_sum_vec_h.begin());
//
// printf("A_vec after row sum!!\n");
// for (int j = 0; j < N; ++j) {
// printf("%f ",A_sum_vec_h[j]);
// }
// printf("\n \n");
//
// thrust::copy(B_sum_vec_d.begin(), B_sum_vec_d.end(), B_sum_vec_h.begin());
//
// printf("B_vec after row sum!!\n");
// for (int j = 0; j < K; ++j) {
// printf("%f ",B_sum_vec_h[j]);
// }
// printf("\n \n");
//************************************
// Final summation
//************************************
thrust::device_vector<float> dummy_y(N,1);
alpha = 1.0f;
// Column-wise sum
stat = cublasSger_v2(handle,K,N,&alpha,thrust::raw_pointer_cast(&dummy_y[0]),1,thrust::raw_pointer_cast(&A_sum_vec_d[0]),1,thrust::raw_pointer_cast(&C_d[0]),K);
if (stat != CUBLAS_STATUS_SUCCESS){
printf("CUBLAS final summation failure!!\n");
return EXIT_FAILURE;
}
// Row-wise sum
stat = cublasSger_v2(handle,K,N,&alpha,thrust::raw_pointer_cast(&B_sum_vec_d[0]),1,thrust::raw_pointer_cast(&dummy_y[0]),1,thrust::raw_pointer_cast(&C_d[0]),K);
if (stat != CUBLAS_STATUS_SUCCESS){
printf("CUBLAS final summation failure!!\n");
return EXIT_FAILURE;
}
//thrust::device_free(&dummy_y[0]);
// TEST
// printf("C matrix after first final sum\n");
// C_h = C_d;
// for (int i = 0; i < N; ++i) {
// for (int j = 0; j < K; ++j) {
// printf("%f ",C_h[i*K+j]);
// }
// printf("\n");
// }
// printf("\n");
//*****************************
// Find min elements
// ****************************
minsOfC_d=minsOfRowSpace(C_d, N, K);
//thrust::copy(minsOfC_d.begin(),minsOfC_d.end(),minOfC_h.begin());
minOfC_h = minsOfC_d;
// TEST
// for (size_t i = 0; i < N; i++){
// for (size_t j = 0; j < 1; j++){
// argMinType tmp=minOfC_h[ci(i,j,1)];
// std::cout << thrust::get<0>(tmp) << " ";
// }
// std::cout << "\n";
// }
// std::cout << "\n";
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
float elapsedTime;
float totalTime;
cudaEventElapsedTime(&elapsedTime, start, stop);
cudaEventDestroy(start);
cudaEventDestroy(stop);
totalTime = elapsedTime/1000;
std::cout<<"Elapsed_time:"<< totalTime<<std::endl;
printf("Execution ends!!\n");
return EXIT_SUCCESS;
}