Is it possible to do segmented sort in with CUDPP in CUDA? By segmented sort, I mean to sort elements of array which are protected by flags like below.
A[10,9,8,7,6,5,4,3,2,1]
Flag array[1,0,1,0,0,1,0,0,0,0]
Sort elements of A which are between consecutive 1.
Expected output
[9,10,6,7,8,1,2,3,4,5]
you can do this in a single sorting pass: the idea is to adjust the elements in your array such that sort will relocate elements only within the "segments"
for your example:
A[10,9,8,7,6,5,4,3,2,1]
flag[0,0,1,0,0,1,0,0,0,0]
(I removed the first 1 since it's not needed)
first scan the flag array:
scanned_flag[0,0,1,1,1,2,2,2,2,2]
then you have many options depending on the number types, e.g. for unsigned integers you can set the highest bits to distinguish the "segments".
The easiest way is just to add the largest element multiplied by scanned_flags:
A + scanned_flag*10 = [10,9,18,17,16,25,24,23,22,21]
the rest is easy: sort the array and reverse the transformation.
Here are the two versions: using Arrayfire and thrust. Check whichever you like more.
Arrayfire:
void af_test() {
int A[] = {10,9,8,7,6,5,4,3,2,1};
int S[] = {0, 0,1,0,0,1,0,0,0,0};
int n = sizeof(A) / sizeof(int);
af::array devA(n, A, af::afHost);
af::array devS(n, S, af::afHost);
// obtain the max element
int maxi = af::max< int >(devS);
// scan the keys
// keys = 0,0,1,1,1,2,2,2,2,2
af::array keys = af::accum(devS);
// compute: A = A + keys * maxi
// A = 10,9,18,17,16,25,24,23,22,21
devA = devA + keys * maxi;
// sort the array
// A = 9,10,16,17,18,21,22,23,24,25
devA = af::sort(devA);
// compute: A = A - keys * maxi
// A = 9,10,6,7,8,1,2,3,4,5
devA = devA - keys * maxi;
// print the results
print(devA);
}
Thrust:
template<typename T>
struct add_mul : public binary_function<T,T,T>
{
add_mul(const T& _factor) : factor(_factor) {
}
__host__ __device__ T operator()(const T& a, const T& b) const
{
return (a + b * factor);
}
const T factor;
};
void thrust_test()
{
int A[] = {10,9,8,7,6,5,4,3,2,1};
int S[] = {0, 0,1,0,0,1,0,0,0,0};
int n = sizeof(A) / sizeof(int);
thrust::host_vector< int > hA(A, A + n), hS(S, S + n);
thrust::device_vector< int > devA = hA, devS = hS, keys(n);
// scan the keys
thrust::inclusive_scan(devS.begin(), devS.end(), keys.begin());
// obtain the maximal element
int maxi = *(thrust::max_element(devA.begin(), devA.end()));
// compute: A = A + keys * maxi
thrust::transform(devA.begin(), devA.end(), keys.begin(), devA.begin(), add_mul< int >(maxi));
// sort the array
thrust::sort(devA.begin(), devA.end());
// compute: A = A - keys * maxi
thrust::transform(devA.begin(), devA.end(), keys.begin(), devA.begin(), add_mul< int >(-maxi));
// copy back to the host
hA = devA;
std::cout << "\nSorted array\n";
thrust::copy(hA.begin(), hA.end(), std::ostream_iterator<int>(std::cout, "\n"));
}