It's possible to do this in Thrust. One approach is to create a custom sort functor that is handed two row indices, walks the two corresponding rows column by column, and decides which of the two rows should come first (i.e. a lexicographic comparison of rows).
To implement this, we can create an index array, one index per row, that we will sort. We will sort this index array based on the given data array (using the custom sort functor that orders rows).
At the end, the only thing we have sorted is the index array, but it is now in the order needed to re-arrange the rows if desired.
Here's a fully worked example:
$ cat t631.cu
#include <iostream>
#include <iterator>
#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/sequence.h>
#include <thrust/sort.h>
#define DWIDTH 10
typedef int mytype;
// Comparator that orders ROW INDICES of a dense row-major matrix by the
// lexicographic order of the rows they refer to.
//
// Element (r, c) is read from my_data[r * my_width + c]. The functor does not
// own my_data; the pointer must stay valid, and be dereferenceable wherever
// the comparator is executed, for as long as the sort is running.
struct my_sort_functor
{
  int my_width;     // number of columns (elements per row)
  mytype *my_data;  // base address of the row-major matrix (not owned)

  // _my_width: elements per row; _my_data: base address of the matrix.
  my_sort_functor(int _my_width, mytype * _my_data): my_width(_my_width), my_data(_my_data) {}

  // Returns true when row idx1 is lexicographically less than row idx2.
  // Identical rows compare as "not less" in both directions, as a
  // strict weak ordering requires.
  __host__ __device__
  bool operator()(const int idx1, const int idx2) const
  {
    // Widen before multiplying: computed in int, idx * my_width overflows
    // once the matrix holds more than INT_MAX elements.
    const mytype *row1 = my_data + static_cast<long long>(idx1) * my_width;
    const mytype *row2 = my_data + static_cast<long long>(idx2) * my_width;
    for (int col_idx = 0; col_idx < my_width; col_idx++){
      const mytype d1 = row1[col_idx];
      const mytype d2 = row2[col_idx];
      // The first column where the rows differ decides the ordering.
      if (d1 > d2) return false;
      if (d1 < d2) return true;
    }
    return false;  // rows are equal in every column
  }
};
int main(){
mytype data[] = {
3, 2, 2, 3, 2, 2, 3, 3, 3, 3,
3, 3, 2, 2, 2, 2, 3, 3, 2, 2,
3, 2, 2, 3, 2, 2, 3, 3, 3, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
3, 2, 2, 2, 2, 2, 3, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
3, 3, 2, 3, 2, 2, 3, 3, 2, 3,
3, 3, 2, 2, 2, 2, 3, 3, 3, 3,
3, 2, 2, 3, 2, 2, 3, 3, 2, 3,
3, 3, 2, 3, 2, 2, 3, 3, 3, 3 };
int cols = DWIDTH;
int dsize = sizeof(data)/sizeof(mytype);
int rows = dsize/cols;
thrust::host_vector<mytype> h_data(data, data+dsize);
thrust::device_vector<mytype> d_data = h_data;
thrust::device_vector<int> idxs(rows);
thrust::sequence(idxs.begin(), idxs.end());
thrust::sort(idxs.begin(), idxs.end(), my_sort_functor(cols, thrust::raw_pointer_cast(d_data.data())));
thrust::host_vector<int> h_idxs = idxs;
for (int i = 0; i<rows; i++){
thrust::copy(h_data.begin()+h_idxs[i]*cols, h_data.begin()+(h_idxs[i]+1)*cols, std::ostream_iterator<mytype>(std::cout, ", "));
std::cout << std::endl;}
return 0;
}
$ nvcc -o t631 t631.cu
$ ./t631
2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
3, 2, 2, 2, 2, 2, 3, 2, 2, 2,
3, 2, 2, 3, 2, 2, 3, 3, 2, 3,
3, 2, 2, 3, 2, 2, 3, 3, 3, 2,
3, 2, 2, 3, 2, 2, 3, 3, 3, 3,
3, 3, 2, 2, 2, 2, 3, 3, 2, 2,
3, 3, 2, 2, 2, 2, 3, 3, 3, 3,
3, 3, 2, 3, 2, 2, 3, 3, 2, 3,
3, 3, 2, 3, 2, 2, 3, 3, 3, 3,
$
I'm pretty sure this would be significantly more efficient if the data could be delivered in transposed form and the code rearranged to sort columns instead of rows (i.e. sort the index vector based on columns in the data array, rather than rows). With that layout, the elements of neighboring rows/indices that are compared at a given step sit next to each other in memory, so the data access driven by the sort functor would be more efficient.
I've omitted the step that actually moves the rows to their new positions, but hopefully this should be straightforward. The general methodology is hinted at in the method to output the result, although it can be done with a single thrust call if desired.