I'm refactoring thrust code by converting from an AoS to SoA approach to take advantage of memory coalescing. To that end, I have two vectors that are reduced by a common key, and which are then used to calculate the values for an output vector. The original code did this with a single functor, which I'd like to emulate.
Essentially:
Oᵢ = Rᵢ / Sᵢ, where Rᵢ and Sᵢ are vectors reduced by the same key, and Oᵢ is the corresponding output vector.
Below is code that exemplifies what I'm trying to do:
typedef tuple<int,int> Tuple;
struct BinaryTupleOp : public thrust::binary_function<Tuple const &, Tuple const &, int>
{
__host__ __device__
int operator()(Tuple const & lhs, Tuple const & rhs) const {
// get<0> = vals, get<1> = other_vals
return (get<0>(lhs) + get<0>(rhs)) / (get<1>(lhs) + get<1>(rhs));
}
};
int main(int argc, char ** argv)
{
const int N = 7;
device_vector<int> keys(N);
keys[0] = 1; // represents sorted keys
keys[1] = 1;
keys[2] = 2;
keys[3] = 2;
keys[4] = 3;
keys[5] = 3;
keys[6] = 3;
device_vector<int> vals(N);
vals[0] = 6; // just some random numbers
vals[1] = 3;
vals[2] = 9;
vals[3] = 4;
vals[4] = 6;
vals[5] = 1;
vals[6] = 5;
device_vector<int> other_vals(N);
other_vals[0] = 4; // more randomness
other_vals[1] = 1;
other_vals[2] = 3;
other_vals[3] = 6;
other_vals[4] = 2;
other_vals[5] = 5;
other_vals[6] = 7;
device_vector<int> new_keys(N);
device_vector<int> output(N);
typedef device_vector<int>::iterator Iterator;
thrust::pair<Iterator, Iterator> new_end;
thrust::equal_to<int> binary_pred;
new_end = thrust::reduce_by_key(keys.begin(), keys.end(),
make_zip_iterator(make_tuple(vals.begin(), other_vals.begin())),
new_keys.begin(),
output.begin(),
binary_pred,
BinaryTupleOp() );
Iterator i = new_keys.begin();
Iterator j = output.begin();
for (;
i != new_end.first;
i++, j++ ) {
std::cout << "key " << *i << " sum " << *j << endl;
}
return 0;
}
Unfortunately this yields such errors as error: no operator "=" matches these operands
, error: no suitable conversion function from "InputValueType" to "TemporaryType" exists
, and error: no suitable conversion function from "const thrust::tuple<int, int, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type, thrust::null_type>" to "int" exists
. I've played around with parameter type variants in the functor since I think that's the ultimate source of the problem, but to no avail.
As a workaround I'll probably break out the two reductions separately, then use a transform to create the output vector. (Which maybe suggests linking together various transform_reduce()
calls, but it seems I'd want the inverse, something like a reduce_transform()
, which doesn't exist, AFAIK.)
Meanwhile, what am I doing wrong?
Meanwhile, what am I doing wrong?
thrust::reduce
(or thrust::reduce_by_key
) will perform a parallel reduction. This parallel reduction requires a reduction operator that can be applied pairwise. To take a very simple example, suppose we want to reduce 3 elements (E1
, E2
, and E3
) and we have a binary operation (bOp
) that we will use to define the reduction operation. Thrust might do something like this:
E1 E2 E3
\ /
bOp
\ /
bOp
|
result
That is, the binary op will be used to combine or "reduce" elements E1
and E2
into a single temporary partial result, and this result will be fed back into the binary op combined with element E3
to produce the final result
.
The implication of this is that the output of the binary op (and therefore the output type of the values output iterator) must match its input type (and therefore the input type of the values input iterator).
But your binary op does not meet this requirement, nor do the types of the iterators you have passed for values input and values output:
new_end = thrust::reduce_by_key(keys.begin(), keys.end(),
make_zip_iterator(make_tuple(vals.begin(), other_vals.begin())),
/* dereferencing the above iterator produces an <int, int> tuple */
new_keys.begin(),
output.begin(),
/* dereferencing the above iterator produces an int */
binary_pred,
BinaryTupleOp() );
I believe the above general requirement for the binary op (i.e. for the values input and output iterator types) is expressed in the thrust docs for this function as:
InputIterator2's value_type is convertible to OutputIterator2's value_type.
I can think of two approaches to solve the problem you indicated here:
Oᵢ = Rᵢ / Sᵢ, where Rᵢ and Sᵢ are vectors reduced by the same key, and Oᵢ is the corresponding output vector.
the first of which I think you've mentioned already:
Perform a reduce_by_key
just on the two values sequences zipped together. This can be done in a single reduce_by_key
call. Then pass the zipped result sequence to a thrust::transform
call to perform the elementwise division.
If you're desperate to accomplish everything in a single thrust call, then the clever output iterator work done by @m.s. here may be a possible approach, to perform the transform operation on the values output. (edit: after some study, I'm not sure this method can be used with a reduction)
If you didn't like either of the above suggestions, it's possible to realize your desired reduction in a single call to reduce_by_key
, at the expense of some additional storage (and arguably some wasted operations). The basic idea is to organize around a 3-tuple instead of a 2-tuple. At each reduction step, we will combine (sum) the corresponding (first, and second) components of our lhs
and rhs
tuples, storing these results in the first and second positions of the output tuple. In addition, we will compute the value of the 3rd position of the output tuple as the result of the division of the first and second positions of the output tuple. You are doing int
operations here, including division, so I chose to modify your input data slightly for easy result verification. Here is a worked example, based loosely on your code:
$ cat t1143.cu
#include <thrust/device_vector.h>
#include <thrust/reduce.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/iterator/constant_iterator.h>
#include <iostream>
using namespace std;
using namespace thrust;
typedef tuple<int,int,int> Tuple;
struct BinaryTupleOp : public thrust::binary_function<const Tuple &, const Tuple &, Tuple>
{
__host__ __device__
Tuple operator()(const Tuple & lhs, const Tuple & rhs) const {
Tuple temp;
get<0>(temp) = get<0>(lhs)+get<0>(rhs);
get<1>(temp) = get<1>(lhs)+get<1>(rhs);
get<2>(temp) = get<0>(temp)/get<1>(temp);
return temp;
}
};
int main(int argc, char ** argv)
{
const int N = 7;
device_vector<int> keys(N);
keys[0] = 1; // represents sorted keys
keys[1] = 1;
keys[2] = 2;
keys[3] = 2;
keys[4] = 3;
keys[5] = 3;
keys[6] = 3;
device_vector<int> vals(N);
vals[0] = 6; // just some random numbers
vals[1] = 3;
vals[2] = 8;
vals[3] = 4;
vals[4] = 5;
vals[5] = 5;
vals[6] = 5;
device_vector<int> other_vals(N);
other_vals[0] = 1; // more randomness
other_vals[1] = 2;
other_vals[2] = 1;
other_vals[3] = 2;
other_vals[4] = 1;
other_vals[5] = 1;
other_vals[6] = 1;
device_vector<int> new_keys(N);
device_vector<int> output(N);
device_vector<int> v1(N);
device_vector<int> v2(N);
thrust::equal_to<int> binary_pred;
int rsize = thrust::get<0>(thrust::reduce_by_key(keys.begin(), keys.end(),
make_zip_iterator(make_tuple(vals.begin(), other_vals.begin(), thrust::constant_iterator<int>(0))),
new_keys.begin(),
make_zip_iterator(make_tuple(v1.begin(), v2.begin(), output.begin())),
binary_pred,
BinaryTupleOp() )) - new_keys.begin();
for (int i = 0; i < rsize; i++){
int key = new_keys[i];
int val = output[i];
std::cout << "key " << key << " sum " << val << endl;
}
return 0;
}
$ nvcc -o t1143 t1143.cu
$ ./t1143
key 1 sum 3
key 2 sum 4
key 3 sum 5
$
Others may have better ideas about how to craft your desired result.