I have two arrays array1 and array2 with n and m elements respectively. I want to find all pair distances between the elements. A brute force algorithm on the CPU is then:
for(int i =0; i<n; i++)
{
for(int j =0; j<m; j++)
{
array_pair_distances[i][j] = array1[i]-array2[j];
}
}
Using CUDA Thrust I have simply turned this n*m problem into a n or m problem by using thrust::transform and a single for-loop. My question is how can I remove this last for-loop using Thrust?
EDIT: Added example of implementation with Thrust and one for-loop. The code checks if the pair-distance is greater than 0.1 and returns an int.
#include <stdio.h>
#include <iostream>
#include <cuda.h>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/random.h>
#include <thrust/fill.h>
#include <thrust/transform.h>
#include <thrust/reduce.h>
struct PairDistanceCheck : public thrust::binary_function<float,float,int>
{
__host__ __device__
int operator()(const float& a, const float& b) const
{
if(thrust::get<0>(a) - thrust::get<0>(b) > 0.1)
{
return 1;
}
else return 0;
}
};
void function()
{
int n = 1000;
int m = 2000;
// Initialization of host vectors
thrust::host_vector<float> h_1 (n);
thrust::host_vector<float> h_2 (m);
// Fill host_vectors with data
*
*
*
//
// Copy host_vectors to device_vectors
thrust::device_vector<float> d_1 = h_1;
thrust::device_vector<float> d_2 = h_2;
thrust::device_vector<float> d_temp (m);
thrust::device_vector<int> d_sum (m);
thrust::fill(d_sum.begin(), d_sum.end(), 0);
thrust::device_vector<int> d_result (m);
for (int i=0; i<n; i++)
{
// Filling device_vector d_temp with element i from d_2
thrust::fill(d_temp.begin(), d_temp.end(), d_2[i]);
thrust::transform((d_1.begin(), d_1.end(), d_temp.begin(), d_result.begin(), PairDistanceCheck());
// Summing the vectors
thrust::transform(d_sum.begin(), d_sum.end(), d_result.begin(), d_sum.begin(), thrust::plus<int>());
}
// Final sum
int sum = thrust::reduce(d_sum.begin(), d_sum.end(), (int) 0, thrust::plus<int>());
return 0;
}
The very short answer is that you can't.
Thrust has no outer product algorithms, which is what would be required to perform the sort of calculation you would be interested in. You could do this by filling two matrices with rows/columns of the two input vectors and then directly subtract those. But that would be very inefficient (both memory and performance) compared to a proper outer product implementation.