This problem has been on my mind for several years. I have been learning a great deal of C++ and CUDA from this forum. Previously I wrote this in Fortran as serial code with a lot of conditional statements and gotos, because I could not find a cleverer way to do it.
Here is the problem.
Given 4 vectors:
int indx[nshape];
float dnx[nshape];
/* nshape > nord */
int indy[nord];
float dny[nord];
indx and indy are index vectors (keys for the values dnx and dny respectively) containing global coordinates. Their global ranges are not known before they are passed to the desired interlace/splice function; all that is known is that the possible local range is [0, nord*nord], plus the max and min values within indx and indy.
I want to create new vectors dn1 and dn2, both of the same (extended) length, containing the original values of dnx and dny but padded with zeros at every global coordinate of the other vector that they DON'T contain. They will form the vectors for an outer product, which needs the global addresses aligned.
I have not been able to find any reference on the web to using logical masks in C++, as in Fortran, to parallelise this. My starting point is to use the Thrust library's stable_sort to get things into ascending order, binary_search to compare the arrays, partition, etc. Perhaps there is a clear and concise way of doing this.
(The example indices and value vectors below do not generally start from 0, coincide with the local addressing of temporary indexing vectors, or follow any even/odd pattern; these values are just to help illustrate.)
indx[]={0,2,4,6,8,10,12}; indy[]={1, 2, 3, 4};
dnx[]={99,99,99,99,99,99,99}; dny[]={66,66,66,66};
ind[]={0,1,2,3,4,6,8,10,12}
dn1[]={99,0,99,0,99,99,99,99,99}
dn2[]={0,66,66,66,66,0,0,0,0}
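For clarity, this is the effect I'm after, written as a plain serial C++ sketch (untested; the name splice is just a placeholder, and it assumes indx and indy are already sorted ascending). The real code of course needs to do this in parallel:
#include <vector>
#include <iterator>
#include <algorithm>

// Serial reference for the desired splice: build the union of the two sorted
// index vectors, then scatter each value vector onto it with zeros elsewhere.
void splice(const std::vector<int> &indx, const std::vector<float> &dnx,
            const std::vector<int> &indy, const std::vector<float> &dny,
            std::vector<int> &ind, std::vector<float> &dn1, std::vector<float> &dn2)
{
  std::set_union(indx.begin(), indx.end(), indy.begin(), indy.end(),
                 std::back_inserter(ind));          // ind = union of global indices
  dn1.assign(ind.size(), 0.0f);
  dn2.assign(ind.size(), 0.0f);
  for (size_t i = 0, x = 0, y = 0; i < ind.size(); i++) {
    if (x < indx.size() && indx[x] == ind[i]) dn1[i] = dnx[x++];
    if (y < indy.size() && indy[y] == ind[i]) dn2[i] = dny[y++];
  }
}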
Previously I did something like the following, where each kernel applied the comparisons and fill-ins, and control flow jumped back through one of these conditional lines until the largest local index exceeded the length of the largest vector, i.e. i, j > nshape:
label_3:
if (indx[i] <  indy[j]) { kernel_1; i++;      if (i > nshape) return;                goto label_3; }
if (indx[i] == indy[j]) { kernel_2; i++; j++; if (i > nshape || j > nshape) return;  goto label_3; }
if (indx[i] >  indy[j]) { kernel_3; j++;      if (j > nshape) return;                goto label_3; }
Sorry about the mongrel pseudocode. I really look forward to any ideas, or better still solutions, in C++, CUDA, or thrust.
Many thanks.
Mark
One approach that occurs to me is to do a parallel binary search, taking each value from the global index vector and seeing if it has a match in the key vector. If it does, place the corresponding value in the result; if it does not, place 0 in the result.
So for each position in:
ind[]={0,1,2,3,4,6,8,10,12}
see if there is a matching index in:
indy[]={1, 2, 3, 4};
We'll use a parallel binary search to do that, and to return the corresponding index of the matching value in indy. If we do find a match, then at the position in question in the result we place the value corresponding to indy[matching_index], i.e. dny[matching_index]. Otherwise we put zero in the result.
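In serial form, just to pin down the per-element logic (lookup_or_zero is a name I'm making up here, and std::lower_bound plays the role of the binary search), the operation is:
#include <vector>
#include <algorithm>

// Serial analogue of the per-element step: binary-search the key vector for
// the global index; on an exact match return the corresponding value, else 0.
float lookup_or_zero(const std::vector<int> &key, const std::vector<float> &val, int g)
{
  std::vector<int>::const_iterator it = std::lower_bound(key.begin(), key.end(), g);
  if (it != key.end() && *it == g)          // lower bound is an exact match
    return val[it - key.begin()];
  return 0.0f;
}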
In the case of thrust, I was able to reduce this to two thrust calls.
The first is a thrust::lower_bound operation, which is effectively a vectorized/parallel binary search. Just as in the CUDA case, we use the binary search to take each element of the global vector (ind) and see if there is a match in the key vector (e.g. indx), returning the index of the matching location in the key vector (the lower bound).
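To make the lower-bound step concrete, here is what it should produce for the case-x example data (worked by hand, so worth double-checking):
ind    = {0, 1, 2, 3, 4, 6, 8, 10, 12}
indx   = {0, 2, 4, 6, 8, 10, 12}
tbound = {0, 1, 1, 2, 2, 3,  4,  5,  6}   // index of first indx element not less than ind[i]
Positions where indx[tbound[i]] == ind[i] (here 0, 2, 4, 5, 6, 7, 8) receive dnx[tbound[i]]; the rest receive zero.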
The second call is a somewhat complicated use of thrust::for_each. We create a special functor (extend_functor) for the for_each operation, which is initialized with a pointer to the start of the key vector (e.g. indx), its length, and a pointer to the values vector (e.g. dnx). The extend_functor then takes a 3-tuple of the global-vector, lower-bound-vector, and result-vector values, and performs the remaining steps: if the lower-bound value is within the length of the key vector, check whether the lower bound produces a match between the key and global vectors; if it does, place the corresponding value into the result vector, otherwise place zero in the result vector.
The following code implements this in CUDA, and also with thrust.
#include <iostream>
#include <iterator>   // std::ostream_iterator
#include <cstring>    // memcpy
#include <cstdlib>    // malloc
#include <thrust/device_vector.h>
#include <thrust/binary_search.h>
#include <thrust/copy.h>
#define MAX_DSIZE 32768
#define nTPB 256
// Functor for thrust::for_each: for each (global index, lower-bound index,
// result) tuple, write the matching value from the values vector, or zero.
struct extend_functor
{
  int vec_len;
  int *vec1;
  float *vec2;
  extend_functor(int *_vec1, int _vec_len, float *_vec2) : vec1(_vec1), vec2(_vec2), vec_len(_vec_len) {};
  template <typename Tuple>
  __device__ __host__
  void operator()(const Tuple &my_tuple) {
    float result = 0.0f;
    if (thrust::get<1>(my_tuple) < vec_len)
      if (thrust::get<0>(my_tuple) == vec1[thrust::get<1>(my_tuple)]) result = vec2[thrust::get<1>(my_tuple)];
    thrust::get<2>(my_tuple) = result;
  }
};
// binary search, looking for key in a
__device__ void bsearch_range(const int *a, const int key, const unsigned len_a, unsigned *idx){
  unsigned lower = 0;
  unsigned upper = len_a;
  unsigned midpt;
  while (lower < upper){
    midpt = (lower + upper)>>1;
    if (a[midpt] < key) lower = midpt +1;
    else upper = midpt;
    }
  *idx = lower;
  return;
}
// k is the key vector
// g is the global index vector
// v is the value vector
// r is the result vector
__global__ void extend_kernel(const int *k, const unsigned len_k, const int *g, const unsigned len_g, const float *v, float *r){
  unsigned idx = (blockDim.x * blockIdx.x) + threadIdx.x;
  if (idx < len_g) {                  // one thread per element of the global vector
    unsigned my_idx;
    int g_key = g[idx];
    bsearch_range(k, g_key, len_k, &my_idx);
    int my_key = -1;
    if (my_idx < len_k)
      my_key = k[my_idx];
    float my_val;
    if (g_key == my_key) my_val = v[my_idx];
    else my_val = 0.0f;
    r[idx] = my_val;
    }
}
int main(){

  int len_x = 7;
  int len_y = 4;
  int len_g = 9;
  int indx[]={0,2,4,6,8,10,12};
  int indy[]={1, 2, 3, 4};
  float dnx[]={91.0f,92.0f,93.0f,94.0f,95.0f,96.0f,97.0f};
  float dny[]={61.0f,62.0f,63.0f,64.0f};
  int ind[]={0,1,2,3,4,6,8,10,12};

  int *h_k, *d_k, *h_g, *d_g;
  float *h_v, *d_v, *h_r, *d_r;

  h_k = (int *)malloc(MAX_DSIZE*sizeof(int));
  h_g = (int *)malloc(MAX_DSIZE*sizeof(int));
  h_v = (float *)malloc(MAX_DSIZE*sizeof(float));
  h_r = (float *)malloc(MAX_DSIZE*sizeof(float));

  cudaMalloc(&d_k, MAX_DSIZE*sizeof(int));
  cudaMalloc(&d_g, MAX_DSIZE*sizeof(int));
  cudaMalloc(&d_v, MAX_DSIZE*sizeof(float));
  cudaMalloc(&d_r, MAX_DSIZE*sizeof(float));

  // test case x
  memcpy(h_k, indx, len_x*sizeof(int));
  memcpy(h_g, ind, len_g*sizeof(int));
  memcpy(h_v, dnx, len_x*sizeof(float));
  cudaMemcpy(d_k, h_k, len_x*sizeof(int), cudaMemcpyHostToDevice);
  cudaMemcpy(d_g, h_g, len_g*sizeof(int), cudaMemcpyHostToDevice);
  cudaMemcpy(d_v, h_v, len_x*sizeof(float), cudaMemcpyHostToDevice);
  extend_kernel<<<(len_g+nTPB-1)/nTPB, nTPB >>>(d_k, len_x, d_g, len_g, d_v, d_r);
  cudaMemcpy(h_r, d_r, len_g*sizeof(float), cudaMemcpyDeviceToHost);
  std::cout << "case x result: ";
  for (int i=0; i < len_g; i++)
    std::cout << h_r[i] << " ";
  std::cout << std::endl;

  // test case y
  memcpy(h_k, indy, len_y*sizeof(int));
  memcpy(h_g, ind, len_g*sizeof(int));
  memcpy(h_v, dny, len_y*sizeof(float));
  cudaMemcpy(d_k, h_k, len_y*sizeof(int), cudaMemcpyHostToDevice);
  cudaMemcpy(d_g, h_g, len_g*sizeof(int), cudaMemcpyHostToDevice);
  cudaMemcpy(d_v, h_v, len_y*sizeof(float), cudaMemcpyHostToDevice);
  extend_kernel<<<(len_g+nTPB-1)/nTPB, nTPB >>>(d_k, len_y, d_g, len_g, d_v, d_r);
  cudaMemcpy(h_r, d_r, len_g*sizeof(float), cudaMemcpyDeviceToHost);
  std::cout << "case y result: ";
  for (int i=0; i < len_g; i++)
    std::cout << h_r[i] << " ";
  std::cout << std::endl;

  // using thrust
  thrust::device_vector<int> tind(ind, ind+len_g);
  thrust::device_vector<int> tindx(indx, indx+len_x);
  thrust::device_vector<float> tdnx(dnx, dnx+len_x);
  thrust::device_vector<float> tresult(len_g);
  thrust::device_vector<int> tbound(len_g);
  thrust::lower_bound(tindx.begin(), tindx.end(), tind.begin(), tind.end(), tbound.begin());
  thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(tind.begin(), tbound.begin(), tresult.begin())),
                   thrust::make_zip_iterator(thrust::make_tuple(tind.end(), tbound.end(), tresult.end())),
                   extend_functor(thrust::raw_pointer_cast(tindx.data()), len_x, thrust::raw_pointer_cast(tdnx.data())));
  std::cout << "thrust case x result: ";
  thrust::copy(tresult.begin(), tresult.end(), std::ostream_iterator<float>(std::cout, " "));
  std::cout << std::endl;

  return 0;
}
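For completeness, the same two thrust calls can be repeated for the y case by dropping something like this into main before the return (a sketch only, reusing tind and extend_functor from the listing above; tindy, tdny, tboundy, tresulty are just names I chose here):
  // thrust, case y (mirrors the case-x calls above)
  thrust::device_vector<int>   tindy(indy, indy+len_y);
  thrust::device_vector<float> tdny(dny, dny+len_y);
  thrust::device_vector<int>   tboundy(len_g);
  thrust::device_vector<float> tresulty(len_g);
  thrust::lower_bound(tindy.begin(), tindy.end(), tind.begin(), tind.end(), tboundy.begin());
  thrust::for_each(thrust::make_zip_iterator(thrust::make_tuple(tind.begin(), tboundy.begin(), tresulty.begin())),
                   thrust::make_zip_iterator(thrust::make_tuple(tind.end(), tboundy.end(), tresulty.end())),
                   extend_functor(thrust::raw_pointer_cast(tindy.data()), len_y, thrust::raw_pointer_cast(tdny.data())));
  std::cout << "thrust case y result: ";
  thrust::copy(tresulty.begin(), tresulty.end(), std::ostream_iterator<float>(std::cout, " "));
  std::cout << std::endl;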