Thrust copy - OutputIterator column-major order

I have a vector of matrices (stored as column-major arrays) that I want to concatenate vertically. To do this, I want to use the copy function from the Thrust library, as in the following example snippet:

// Append every matrix to `result` back to back: matrix i is copied as one
// contiguous run of rows[i] * cols[i] floats, starting right after the
// elements of the matrices copied before it.
int offset = 0;
for(int i = 0; i < matrices.size(); ++i) {
    const int count = rows[i] * cols[i];
    const thrust::device_ptr<float> first(matrices[i]);

    thrust::copy(first, first + count, thrust::device_ptr<float>(result) + offset);

    offset += count;
}

EDIT: extended example:

The problem is that if I have a matrix A = [[1, 2, 3], [4, 5, 6]] (2 rows, 3 cols; in memory [1, 4, 2, 5, 3, 6]) and another B = [[7, 8, 9]] (1 row, 3 cols; in memory [7, 8, 9]), the resulting matrix C is not [[1, 2, 3], [4, 5, 6], [7, 8, 9]] (3 rows, 3 cols; in memory [1, 4, 7, 2, 5, 8, 3, 6, 9]), but [[1, 5, 7], [4, 3, 8], [2, 6, 9]] (3 rows, 3 cols; in memory [1, 4, 2, 5, 3, 6, 7, 8, 9]).

Is there a way to create a special OutputIterator for this problem (I have searched for one, but found nothing), or a fast alternative?

EDIT: SSCCE

#include <thrust/host_vector.h>
#include <thrust/generate.h>
#include <thrust/device_vector.h>
#include <iostream>

/**
 * Prints a column-major matrix to std::cout, one matrix row per output line.
 *
 * Element (row, col) is read from linear index row + col * rows, and every
 * value is followed by a single space.
 *
 * The device vector is copied to the host once up front. Reading a
 * thrust::device_vector element by element from host code costs one
 * device-to-host transfer per access, so the bulk copy replaces rows * cols
 * tiny transfers with a single one. The printed text is unchanged.
 *
 * @param mat  device vector expected to hold at least rows * cols elements
 *             (not checked here)
 * @param rows number of matrix rows
 * @param cols number of matrix columns
 */
void printMat2d(thrust::device_vector<float>& mat, int rows, int cols) {
    const thrust::host_vector<float> hostMat(mat);
    for(int row = 0; row < rows; ++row) {
        for(int col = 0; col < cols; ++col) {
            std::cout << hostMat[row + col * rows] << " ";
        }
        std::cout << std::endl;
    }
}

/**
 * Prints the first rows * cols elements of `mat` in memory order on a single
 * line, each followed by a space, then ends the line.
 *
 * The device vector is copied to the host once up front. Reading a
 * thrust::device_vector element by element from host code costs one
 * device-to-host transfer per access, so the bulk copy replaces rows * cols
 * tiny transfers with a single one. The printed text is unchanged.
 *
 * @param mat  device vector expected to hold at least rows * cols elements
 *             (not checked here)
 * @param rows number of matrix rows
 * @param cols number of matrix columns
 */
void printMat1d(thrust::device_vector<float>& mat, int rows, int cols) {
    const thrust::host_vector<float> hostMat(mat);
    for(int idx = 0; idx < cols*rows; ++idx) {
        std::cout << hostMat[idx] << " ";
    }
    std::cout << std::endl;
}

/**
 * Fills `mat` with a rows x cols test matrix stored in column-major order.
 *
 * Read row by row, the matrix contains add, add + 1, add + 2, ...; the values
 * are staged in a host vector and then assigned to the device vector.
 *
 * @param mat  device vector that receives the rows * cols generated values
 * @param rows number of matrix rows
 * @param cols number of matrix columns
 * @param add  value of the top-left element
 */
void generateMat(thrust::device_vector<float>& mat, int rows, int cols, int add) {
    thrust::host_vector<float> staging(rows * cols);
    for(int col = 0; col < cols; ++col) {
        for(int row = 0; row < rows; ++row) {
            // (row, col) is element number row * cols + col when the matrix
            // is read row by row; it lives at row + col * rows in memory.
            staging[row + col * rows] = row * cols + col + add;
        }
    }
    mat = staging;
}

int main() {
    std::vector<int> rows(2);
    rows[0] = 2;
    rows[1] = 3;
    std::vector<int> cols(2);
    cols[0] = 3;
    cols[1] = 3;

    //generate matrices
    std::vector<thrust::device_vector<float> > matrices(2);
    for(size_t i = 0; i < matrices.size(); ++i) {
        generateMat(matrices[i], rows[i], cols[i], i*10);

        std::cout << "mat_ " << i << " = " << std::endl;
        printMat2d(matrices[i], rows[i], cols[i]);
        printMat1d(matrices[i], rows[i], cols[i]);
    }

    //copy
    int resultRows = 5;
    int resultCols = 3;
    thrust::device_vector<float> result(resultRows * resultCols);
    int offset = 0;
    for(int i = 0; i < matrices.size(); ++i) {
        thrust::copy(
            matrices[i].begin(),
            matrices[i].end(),
            result.begin() + offset
        );

        offset += rows[i] * cols[i];
    }

    std::cout << "result = " << std::endl;
    printMat2d(result, resultRows, resultCols);
    printMat1d(result, resultRows, resultCols);

    return 0;
}

EDIT: I've replaced my previous answer that used the strided range per row method, with a slightly different approach, that gets the copy operation down to a single thrust call (per matrix to be copied).

The key idea here was to use a functor that converts row-major memory indexing to column-major memory indexing. This functor can then be used with a counting_iterator to create arbitrary row-major to column major memory indices (via make_transform_iterator). These indices can then be used in a permutation_iterator for the source matrix to select the element to be copied and a permutation_iterator for the destination matrix to select the memory position to copy to. For a general review of transform_iterator, counting_iterator, and permutation_iterator, refer to the thrust quick start guide. I happened to be using CUDA 5.0 and thrust 1.5.3 for this exercise.

#include <cstddef>
#include <iostream>
#include <stdexcept>
#include <vector>

#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/host_vector.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/iterator/transform_iterator.h>

/**
 * Converts a row-major linear index of an r x c matrix into the linear index
 * of the same element in column-major storage.
 *
 * For idx = row * c + col the result is col * r + row.
 */
struct rm2cm_idx_functor : public thrust::unary_function<int, int>
{
  int r;  // number of rows of the matrix being indexed
  int c;  // number of columns of the matrix being indexed

  __host__ __device__
  rm2cm_idx_functor(int _r, int _c) : r(_r), c(_c) {}

  // const: the call only reads r and c, so the functor can also be invoked
  // through a const object or reference.
  __host__ __device__
  int operator() (int idx) const {
    const int row = idx / c;
    const int col = idx % c;
    return (col * r) + row;
  }
};

typedef float my_type;


/**
 * Prints a column-major matrix to std::cout, one matrix row per output line.
 *
 * Element (row, col) is read from linear index row + col * rows, and every
 * value is followed by a single space.
 *
 * The device vector is copied to the host once up front. Reading a
 * thrust::device_vector element by element from host code costs one
 * device-to-host transfer per access, so the bulk copy replaces rows * cols
 * tiny transfers with a single one. The printed text is unchanged.
 *
 * @param mat  device vector expected to hold at least rows * cols elements
 *             (not checked here)
 * @param rows number of matrix rows
 * @param cols number of matrix columns
 */
void printMat2d(thrust::device_vector<my_type>& mat, int rows, int cols) {
    const thrust::host_vector<my_type> hostMat(mat);
    for(int row = 0; row < rows; ++row) {
        for(int col = 0; col < cols; ++col) {
            std::cout << hostMat[row + col * rows] << " ";
        }
        std::cout << std::endl;
    }
}

/**
 * Prints the first rows * cols elements of `mat` in memory order on a single
 * line, each followed by a space, then ends the line.
 *
 * The device vector is copied to the host once up front. Reading a
 * thrust::device_vector element by element from host code costs one
 * device-to-host transfer per access, so the bulk copy replaces rows * cols
 * tiny transfers with a single one. The printed text is unchanged.
 *
 * @param mat  device vector expected to hold at least rows * cols elements
 *             (not checked here)
 * @param rows number of matrix rows
 * @param cols number of matrix columns
 */
void printMat1d(thrust::device_vector<my_type>& mat, int rows, int cols) {
    const thrust::host_vector<my_type> hostMat(mat);
    for(int idx = 0; idx < cols*rows; ++idx) {
        std::cout << hostMat[idx] << " ";
    }
    std::cout << std::endl;
}

/**
 * Fills `mat` with a rows x cols test matrix stored in column-major order.
 *
 * Read row by row, the matrix contains add, add + 1, add + 2, ...; the values
 * are staged in a host vector and then assigned to the device vector.
 *
 * @param mat  device vector that receives the rows * cols generated values
 * @param rows number of matrix rows
 * @param cols number of matrix columns
 * @param add  value of the top-left element
 */
void generateMat(thrust::device_vector<my_type>& mat, int rows, int cols, int add) {
    thrust::host_vector<my_type> staging(rows * cols);
    for(int col = 0; col < cols; ++col) {
        for(int row = 0; row < rows; ++row) {
            // (row, col) is element number row * cols + col when the matrix
            // is read row by row; it lives at row + col * rows in memory.
            staging[row + col * rows] = row * cols + col + add;
        }
    }
    mat = staging;
}


/**
 * Copies a column-major src_rows x src_cols matrix from `src` into the
 * column-major matrix stored in `dst`, which has dst_rows rows and is treated
 * as having src_cols columns.
 *
 * Source and destination are both walked in row-major element order: a
 * counting iterator supplies row-major indices, rm2cm_idx_functor turns each
 * one into the matching column-major memory index, and a permutation
 * iterator reads or writes the element at that index. The whole transfer is
 * a single thrust::copy_n call.
 *
 * Indices are generated as int, so dst_rows * src_cols is expected to fit
 * in an int.
 *
 * @param src      source elements, column-major
 * @param dst      destination elements, column-major
 * @param src_rows number of rows in the source matrix
 * @param src_cols number of columns in the source (and destination) matrix
 * @param dst_rows number of rows in the destination matrix
 * @param offset   row-major position in the destination that receives the
 *                 first source element; pass (first destination row) *
 *                 src_cols to place the block at the start of a row
 * @throws std::invalid_argument if `src` or `dst` holds fewer elements than
 *         its stated dimensions require
 * @throws std::out_of_range if the block starting at `offset` would extend
 *         past the last destination row
 */
void copyMat(thrust::device_vector<my_type>& src, thrust::device_vector<my_type>& dst, unsigned src_rows, unsigned src_cols, unsigned dst_rows, unsigned offset){
    // Widen before multiplying so the element count cannot wrap in unsigned.
    const std::size_t count = static_cast<std::size_t>(src_rows) * src_cols;
    if (count == 0) {
        return;  // nothing to copy
    }

    if (src.size() < count) {
        throw std::invalid_argument("copyMat: src holds fewer than src_rows * src_cols elements");
    }
    if (dst.size() < static_cast<std::size_t>(dst_rows) * src_cols) {
        throw std::invalid_argument("copyMat: dst holds fewer than dst_rows * src_cols elements");
    }
    // Row of the last element written; it has to be an existing destination
    // row, otherwise the scatter would land outside the destination matrix.
    const std::size_t lastRow = (static_cast<std::size_t>(offset) + count - 1) / src_cols;
    if (lastRow >= dst_rows) {
        throw std::out_of_range("copyMat: block at offset does not fit into dst_rows rows");
    }

    typedef thrust::counting_iterator<int> RowMajorIndex;
    typedef thrust::transform_iterator<rm2cm_idx_functor, RowMajorIndex> ColMajorIndex;

    // Column-major memory positions, enumerated in row-major element order.
    const ColMajorIndex srcIndex = thrust::make_transform_iterator(
        RowMajorIndex(0),
        rm2cm_idx_functor(static_cast<int>(src_rows), static_cast<int>(src_cols)));
    const ColMajorIndex dstIndex = thrust::make_transform_iterator(
        RowMajorIndex(static_cast<int>(offset)),
        rm2cm_idx_functor(static_cast<int>(dst_rows), static_cast<int>(src_cols)));

    thrust::copy_n(
        thrust::make_permutation_iterator(src.begin(), srcIndex),
        count,
        thrust::make_permutation_iterator(dst.begin(), dstIndex));
}



int main() {
    std::vector<int> rows(2);
    rows[0] = 2;
    rows[1] = 3;
    std::vector<int> cols(2);
    cols[0] = 3;
    cols[1] = 3;

    //generate matrices
    std::vector<thrust::device_vector<my_type> > matrices(2);
    for(size_t i = 0; i < matrices.size(); ++i) {
        generateMat(matrices[i], rows[i], cols[i], i*10);

        std::cout << "mat_ " << i << " = " << std::endl;
        printMat2d(matrices[i], rows[i], cols[i]);
        printMat1d(matrices[i], rows[i], cols[i]);
    }

    //copy
    int resultRows = 5;
    int resultCols = 3;
    thrust::device_vector<my_type> result(resultRows * resultCols);
    int offset = 0;

    for(int i = 0; i < matrices.size(); ++i) {
      copyMat(matrices[i], result, rows[i], cols[i], resultRows, offset);
      offset += rows[i]*cols[i];
    }


    std::cout << "result = " << std::endl;
    printMat2d(result, resultRows, resultCols);
    printMat1d(result, resultRows, resultCols);

    return 0;
}

This also assumes that source columns == destination columns, which seems to be implicit in your problem statement. Standard caveat: not saying this is bug free, but it seems to work for the test case built into the original problem statement.

This approach can probably be improved further. Right now both the read and the write performed by the thrust::copy_n call are uncoalesced. We could improve this by making one of the two coalesced. That would require combining the effect of the index-conversion functors for the read and the write into a single mapping functor that takes both the source and the destination dimensions into account. With a single mapping functor, the first argument of the copy_n call could be just the source vector. I think it should also be possible to use thrust::gather or thrust::scatter instead. However, I haven't fully worked it out.