How to implement nested loops in cuda thrust

I currently have to run a nested loop as follow:

for(int i = 0; i < N; i++){
    for(int j = i+1; j <= N; j++){
        compute(...)//some calculation here
    }
}

I've tried leaving the first loop in CPU and do the second loop in GPU. Results are too many memory access. Is there any other ways to do it? For example by thrust::reduce_by_key?

The whole program is here:

#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/generate.h>
#include <thrust/sort.h>
#include <thrust/binary_search.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/random.h>
#include <cmath>
#include <iostream>
#include <iomanip>

#define N 1000000

// define a 2d point pair
typedef thrust::tuple<float, float> Point;

// return a random Point in [0,1)^2
Point make_point(void)
{
  static thrust::default_random_engine rng(12345);
  static thrust::uniform_real_distribution<float> dist(0.0f, 1.0f);
  float x = dist(rng);
  float y = dist(rng);
  return Point(x,y);
}

struct sqrt_dis: public thrust::unary_function<Point, double>
{
  float x, y;
  double tmp;
  sqrt_dis(float _x, float _y): x(_x), y(_y){}
  __host__ __device__
  float operator()(Point a)
 {
    tmp =(thrust::get<0>(a)-x)*(thrust::get<0>(a)-x)+\
    (thrust::get<1>(a)-y)*(thrust::get<1>(a)-y);
    tmp = -1.0*(sqrt(tmp));
    return (1.0/tmp);
 }

};


int main(void) {
  clock_t t1, t2;
  double result;

  t1 = clock();
  // allocate some random points in the unit square on the host
  thrust::host_vector<Point> h_points(N);
  thrust::generate(h_points.begin(), h_points.end(), make_point);

  // transfer to device
  thrust::device_vector<Point> points = h_points;

  thrust::plus<double> binary_op;
  float init = 0;

  for(int i = 0; i < N; i++){
    Point tmp_i = points[i];
    float x = thrust::get<0>(tmp_i);
    float y = thrust::get<1>(tmp_i);
    result += thrust::transform_reduce(points.begin()+i,\
                                       points.end(),sqrt_dis(x,y),\
                                       init,binary_op);
    std::cout<<"result"<<i<<": "<<result<<std::endl;
  }
  t2 = clock()-t1;
  std::cout<<"result: ";
  std::cout.precision(10);
  std::cout<< result <<std::endl;
  std::cout<<"run time: "<<t2/CLOCKS_PER_SEC<<"s"<<std::endl;
  return 0;
 }

EDIT: Now that you have posted an example, here is how you could solve it:

You have n 2D points stored in a linear array like this (here n=4)

points = [p0 p1 p2 p3]

Based on your code I assume you want to calculate:

result = f(p0, p1) + f(p0, p2) + f(p0, p3) +
         f(p1, p2) + f(p1, p3) +
         f(p2, p3)

Where f() is your distance function which needs to be executed m times in total:

m = (n-1)*n/2

in this example: m=6

You can look at this problem as a triangular matrix:

[ p0 p1 p2 p3 ] 
[    p1 p2 p3 ]
[       p2 p3 ]
[          p3 ]

Transforming this matrix into a linear vector with m elements while leaving out the diagonal elements results in:

[p1 p2 p3 p2 p3 p3]

The index of an element in the vector is k = [0,m-1]. Index k can be remapped to columns and rows of the triangular matrix to k -> (i,j):

i = n - 2 - floor(sqrt(-8*k + 4*n*(n-1)-7)/2.0 - 0.5)
j = k + i + 1 - n*(n-1)/2 + (n-i)*((n-i)-1)/2

i is the row and j is the column.

In our example:

0 -> (0, 1)
1 -> (0, 2)
2 -> (0, 3)
3 -> (1, 2)
4 -> (1, 3)
5 -> (2, 3)

Now you can put all this together and execute a modified distance functor m times which applies the aforementioned mapping to get the corresponding pairs based on the index and then sum up everything.

I modified your code accordingly:

#include <thrust/device_vector.h>
#include <thrust/generate.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/transform_reduce.h>
#include <thrust/random.h>
#include <math.h>
#include <iostream>
#include <stdio.h>
#include <stdint.h>

#define PRINT_DEBUG

typedef float Float;

// define a 2d point pair
typedef thrust::tuple<Float, Float> Point;

// return a random Point in [0,1)^2
Point make_point(void)
{
    static thrust::default_random_engine rng(12345);
    static thrust::uniform_real_distribution<Float> dist(0.0, 1.0);
    Float x = dist(rng);
    Float y = dist(rng);
    return Point(x,y);
}


struct sqrt_dis_new
{
    typedef thrust::device_ptr<Point> DevPtr;

    DevPtr points;
    const uint64_t n;

    __host__
    sqrt_dis_new(uint64_t n, DevPtr p) : n(n), points(p)
    {
    }

    __device__ 
    Float operator()(uint64_t k) const
    {
        // calculate indices in triangular matrix
        const uint64_t i = n - 2 - floor(sqrt((double)(-8*k + 4*n*(n-1)-7))/2.0 - 0.5);
        const uint64_t j = k + i + 1 - n*(n-1)/2 + (n-i)*((n-i)-1)/2;

#ifdef PRINT_DEBUG
        printf("%llu -> (%llu, %llu)\n", k,i,j);
#endif

        const Point& p1 = *(points.get()+j);
        const Point& p2 = *(points.get()+i);

        const Float xm = thrust::get<0>(p1)-thrust::get<0>(p2);
        const Float ym = thrust::get<1>(p1)-thrust::get<1>(p2);

        return 1.0/(-1.0 * sqrt(xm*xm + ym*ym));
    }
};


int main()
{
    const uint64_t N = 4;

    // allocate some random points in the unit square on the host
    thrust::host_vector<Point> h_points(N);
    thrust::generate(h_points.begin(), h_points.end(), make_point);

    // transfer to device
    thrust::device_vector<Point> d_points = h_points;

    const uint64_t count = (N-1)*N/2;

    std::cout << count << std::endl;


    thrust::plus<Float> binary_op;
    const Float init = 0.0;

    Float result = thrust::transform_reduce(thrust::make_counting_iterator((uint64_t)0),
                                            thrust::make_counting_iterator(count),
                                            sqrt_dis_new(N, d_points.data()),
                                            init,
                                            binary_op);

    std::cout.precision(10);  
    std::cout<<"result: " << result << std::endl;

    return 0;
}

It depends on your compute function which you do not specify. Usually you unroll the loops and launch the kernel in a 2D manner for every combination of i and j if the computations are independent. Have a look at the Thrust examples and identify similar use cases to your problem.