I've been developing a cryptographic algorithm on the GPU and am currently stuck on an algorithm for large integer addition. Large integers are represented in the usual way as an array of 32-bit words.
For example, we can use one thread to add two 32-bit words. For simplicity, let's assume
that the numbers to be added are of the same length and that the number of threads per block equals the number of words. Then:
__global__ void add_kernel(unsigned *C, const unsigned *A, const unsigned *B) {
    unsigned x = A[threadIdx.x];
    unsigned y = B[threadIdx.x];
    unsigned z = x + y;
    unsigned carry = (z < x);  // unsigned wrap-around indicates a carry out of this word
    /** do carry propagation in parallel somehow ? */
    ............
    z = z + newcarry; // update the resulting words after carry propagation
    C[threadIdx.x] = z;
}
I am pretty sure that there is a way to do carry propagation via some tricky reduction procedure, but I could not figure it out.
I had a look at the CUDA Thrust extensions, but a big-integer package does not seem to be implemented yet.
Perhaps someone can give me a hint on how to do that in CUDA?
You are right: carry propagation can be done via a prefix sum computation, but it is a bit tricky to define the binary function for this operation and prove that it is associative (which is needed for a parallel prefix sum). As a matter of fact, this is the algorithm used (theoretically) in a carry-lookahead adder.
Suppose we have two large integers a[0..n-1] and b[0..n-1].
Then we compute (i = 0..n-1):
s[i] = a[i] + b[i];
carryin[i] = (s[i] < a[i]);
We define two functions:
generate[i] = carryin[i];
propagate[i] = (s[i] == 0xffffffff);
with quite intuitive meanings: generate[i] == 1 means that a carry is generated at position i, while propagate[i] == 1 means that a carry will be propagated from position (i - 1) to (i + 1). Our goal is to compute the function carryout[0..n-1] used to update the resulting sum s[0..n-1]. carryout can be computed recursively as follows:
carryout[i] = generate[i] OR (propagate[i] AND carryout[i-1])
carryout[0] = generate[0]
Here carryout[i] == 1 if a carry is generated at position i OR it is generated somewhere earlier AND propagated up to position i. Finally, we update the resulting sum:
s[i] = s[i] + carryout[i-1]; for i = 1..n-1
carry = carryout[n-1];
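For illustration, here is a small host-side sketch (not part of the CUDA kernel below; the names GP and combine are just made up for this example) that phrases the recursion as an associative operator on (generate, propagate) pairs and checks it against the sequential recursion on a 4-word example:
#include <stdio.h>

// A (generate, propagate) pair describing a contiguous range of words.
struct GP { int g, p; };

// Associative combine: 'hi' covers the higher word positions and sees the
// carry-out of 'lo' as its carry-in.
static GP combine(GP lo, GP hi) {
    GP r;
    r.g = hi.g | (hi.p & lo.g);   // carry out of the combined range
    r.p = hi.p & lo.p;            // the range propagates only if both halves do
    return r;
}

int main(void) {
    // word 0 is the least significant; a + b carries through words 0..2 into word 3
    unsigned a[4] = { 0xffffffffu, 0xffffffffu, 0xffffffffu, 1u };
    unsigned b[4] = { 1u, 0u, 0u, 0u };
    unsigned s[4];
    int carryout[4];

    // sequential reference: carryout[i] = generate[i] OR (propagate[i] AND carryout[i-1])
    for (int i = 0; i < 4; i++) {
        s[i] = a[i] + b[i];
        int generate  = (s[i] < a[i]);
        int propagate = (s[i] == 0xffffffffu);
        carryout[i] = generate | (propagate & (i ? carryout[i - 1] : 0));
    }
    for (int i = 1; i < 4; i++) s[i] += carryout[i - 1];

    // the same carries obtained by folding the associative operator left to right
    GP acc = { 0, 1 };  // identity element: no carry generated, carry-in passes through
    for (int i = 0; i < 4; i++) {
        unsigned t = a[i] + b[i];
        GP w = { t < a[i], t == 0xffffffffu };
        acc = combine(acc, w);
        printf("carryout[%d]: scan=%d  ref=%d\n", i, acc.g, carryout[i]);
    }
    printf("result words:");
    for (int i = 0; i < 4; i++) printf(" 0x%08x", s[i]);
    printf("\n");
    return 0;
}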
Now it is quite straightforward to prove that this carryout operator is indeed associative, and hence a parallel prefix sum computation applies. To implement this in CUDA, we can merge the flags 'generate' and 'propagate' into a single variable, since they are mutually exclusive, i.e.:
cy[i] = (s[i] == -1u ? -1u : 0) | carryin[i];
In other words,
cy[i] = 0xffffffff if propagate[i]
cy[i] = 1 if generate[i]
cy[i] = 0 otherwise
Then one can verify that the following formula computes the prefix sum for the carryout function:
cy[i] = max((int)cy[i], (int)cy[k]) & cy[i];
for all k < i. The example code below shows the addition of 2048-word integers. Here I used CUDA blocks with 512 threads:
// add & output carry flag
#define UADDO(c, a, b) \
asm volatile("add.cc.u32 %0, %1, %2;" : "=r"(c) : "r"(a) , "r"(b));
// add with carry & output carry flag
#define UADDC(c, a, b) \
asm volatile("addc.cc.u32 %0, %1, %2;" : "=r"(c) : "r"(a) , "r"(b));
#define WS 32
__global__ void bignum_add(unsigned *g_R, const unsigned *g_A,const unsigned *g_B) {
extern __shared__ unsigned shared[];
unsigned *r = shared;
const unsigned N_THIDS = 512;
unsigned thid = threadIdx.x, thid_in_warp = thid & WS-1;
unsigned ofs, cf;
uint4 a = ((const uint4 *)g_A)[thid],
b = ((const uint4 *)g_B)[thid];
UADDO(a.x, a.x, b.x) // adding 128-bit chunks with carry flag
UADDC(a.y, a.y, b.y)
UADDC(a.z, a.z, b.z)
UADDC(a.w, a.w, b.w)
UADDC(cf, 0, 0) // save carry-out
// memory consumption: 49 * N_THIDS / 64
// use "alternating" data layout for each pair of warps
volatile short *scan = (volatile short *)(r + 16 + thid_in_warp +
49 * (thid / 64)) + ((thid / 32) & 1);
scan[-32] = -1; // put identity element
if(a.x == -1u && a.x == a.y && a.x == a.z && a.x == a.w)
// this indicates that carry will propagate through the number
cf = -1u;
// "Hillis-and-Steele-style" reduction
scan[0] = cf;
cf = max((int)cf, (int)scan[-2]) & cf;
scan[0] = cf;
cf = max((int)cf, (int)scan[-4]) & cf;
scan[0] = cf;
cf = max((int)cf, (int)scan[-8]) & cf;
scan[0] = cf;
cf = max((int)cf, (int)scan[-16]) & cf;
scan[0] = cf;
cf = max((int)cf, (int)scan[-32]) & cf;
scan[0] = cf;
int *postscan = (int *)r + 16 + 49 * (N_THIDS / 64);
if(thid_in_warp == WS - 1) // scan leading carry-outs once again
postscan[thid >> 5] = cf;
__syncthreads();
if(thid < N_THIDS / 32) {
volatile int *t = (volatile int *)postscan + thid;
t[-8] = -1; // load identity symbol
cf = t[0];
cf = max((int)cf, (int)t[-1]) & cf;
t[0] = cf;
cf = max((int)cf, (int)t[-2]) & cf;
t[0] = cf;
cf = max((int)cf, (int)t[-4]) & cf;
t[0] = cf;
}
__syncthreads();
cf = scan[0];
int ps = postscan[(int)((thid >> 5) - 1)]; // postscan[-1] equals to -1
scan[0] = max((int)cf, ps) & cf; // update carry flags within warps
cf = scan[-2];
if(thid_in_warp == 0)
cf = ps;
if((int)cf < 0)
cf = 0;
UADDO(a.x, a.x, cf) // propagate carry flag if needed
UADDC(a.y, a.y, 0)
UADDC(a.z, a.z, 0)
UADDC(a.w, a.w, 0)
((uint4 *)g_R)[thid] = a;
}
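The kernel uses dynamically allocated shared memory, so a launch needs an explicit shared-memory size. A rough launch sketch (this part is not in the original code, the pointer names are placeholders, and the shared-memory size is a conservative guess derived from the indexing above rather than a tight bound):
// d_R, d_A, d_B: device pointers to 2048 unsigned words each (one 65536-bit number per array)
size_t shmem = 1024 * sizeof(unsigned);       // generous upper bound for the scan/postscan buffers
bignum_add<<<1, 512, shmem>>>(d_R, d_A, d_B); // one block of 512 threads, 4 words per thread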
Note that the macros UADDO / UADDC might not be necessary anymore, since CUDA 4.0 has corresponding intrinsics (however, I am not entirely sure).
Also note that, although the parallel reduction is quite fast, if you need to add several large integers in a row it might be better to use a redundant representation (as was suggested in the comments above), i.e., first accumulate the results of the additions in 64-bit words, and then perform one carry propagation at the very end in "one sweep".
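For example, a minimal sketch of that redundant-accumulation idea (the kernel name and layout here are made up for illustration): each thread owns one 32-bit digit but accumulates into a 64-bit word, so carries from many additions pile up in the upper half and only one propagation pass is needed at the end.
// accumulate n_addends numbers of 'words' 32-bit digits each without propagating carries
__global__ void accumulate_many(unsigned long long *acc, const unsigned *addends,
                                int n_addends, int words) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= words) return;
    unsigned long long sum = 0;
    for (int k = 0; k < n_addends; k++)
        sum += addends[k * words + i];   // each term < 2^32, so up to 2^32 terms fit
    acc[i] = sum;                        // low 32 bits = digit, high 32 bits = pending carries
}
// A single carry-propagation pass (e.g., the scan above applied to the low/high halves
// of acc) then normalizes the result back to 32-bit digits.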
I thought I would post my answer as well, in addition to @asm's, so this SO question can be a sort of repository of ideas. Similar to @asm, I detect and store the carry condition as well as the "carry-through" condition, i.e., when the intermediate word result is all 1's (0xF...FFF), so that if a carry were to propagate into this word, it would "carry through" to the next word.
I didn't use any PTX or inline asm in my code, so I chose to use 64-bit unsigned ints instead of 32-bit ones to achieve the 2048x32-bit capability, using 1024 threads.
A larger difference from @asm's code is in my parallel carry-propagation scheme. I construct a bit-packed array ("carry") where each bit represents the carry condition generated by the independent intermediate 64-bit adds from each of the 1024 threads. I also construct a bit-packed array ("carry_through") where each bit represents the carry-through condition of the individual 64-bit intermediate results. For 1024 threads, this amounts to 1024/64 = 16 x 64-bit words of shared memory for each bit-packed array, so total shared mem usage is 64+3 32-bit quantities. With these bit-packed arrays, I perform the following to generate a combined propagated carry indicator:
carry = carry | (carry_through ^ ((carry & carry_through) + carry_through));
(Note that carry is shifted left by one: bit i of carry indicates that the result of a[i-1] + b[i-1] generated a carry.)
The explanation is as follows:
- the bitwise AND of carry and carry_through generates the candidates where a carry will interact with a sequence of one or more carry-through conditions
- adding the result of step 1 to carry_through produces a result whose changed bits represent all the words that will be affected by the propagation of the carry into the carry_through sequence
- taking the exclusive-or of carry_through with the result from step 2 exposes the affected results, indicated with a 1 bit
- taking the bitwise OR of the result from step 3 and the ordinary carry indicators gives a combined carry condition, which is then used to update all the intermediate results.
Note that the addition in step 2 requires another multi-word add (for big ints composed of more than 64 words). I believe this algorithm works, and it has passed the test cases I have thrown at it.
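To make the bit manipulation concrete, here is a small single-word demonstration I'm adding for illustration (for 64 or fewer intermediate words the packed arrays fit in one 64-bit value, so no multi-word add is needed):
#include <stdio.h>

int main(void) {
    // word 0 generated a carry; words 1..3 are all 1's (carry-through).
    // carry is shifted left by one, so bit 1 marks the carry out of word 0.
    unsigned long long carry         = 0x0000000000000002ull;
    unsigned long long carry_through = 0x000000000000000Eull;

    // carry & carry_through: the carry lands on a carry-through word (bit 1).
    // Adding that to carry_through ripples through the run of 1-bits (bits 1..3),
    // and the XOR with carry_through exposes exactly the bits that changed.
    carry = carry | (carry_through ^ ((carry & carry_through) + carry_through));

    printf("combined carry mask = 0x%llX\n", carry);  // prints 0x1E: words 1..4 get +1
    return 0;
}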
Here is my example code which implements this:
// parallel add of large integers
// requires CC 2.0 or higher
// compile with:
// nvcc -O3 -arch=sm_20 -o paradd2 paradd2.cu
#include <stdio.h>
#include <stdlib.h>
#define MAXSIZE 1024 // the number of 64 bit quantities that can be added
#define LLBITS 64 // the number of bits in a long long
#define BSIZE ((MAXSIZE + LLBITS -1)/LLBITS) // MAXSIZE when packed into bits
#define nTPB MAXSIZE
// define either GPU or GPUCOPY, not both -- for timing
#define GPU
//#define GPUCOPY
#define LOOPCNT 1000
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
// perform c = a + b, for unsigned integers of psize*64 bits.
// all work done in a single threadblock.
// multiple threadblocks are handling multiple separate addition problems
// least significant word is at a[0], etc.
__global__ void paradd(const unsigned size, const unsigned psize, unsigned long long *c, const unsigned long long *a, const unsigned long long *b){
__shared__ unsigned long long carry_through[BSIZE];
__shared__ unsigned long long carry[BSIZE+1];
__shared__ volatile unsigned mcarry;
__shared__ volatile unsigned mcarry_through;
unsigned idx = threadIdx.x + (psize * blockIdx.x);
if ((threadIdx.x < psize) && (idx < size)){
// handle 64 bit unsigned add first
unsigned long long cr1 = a[idx];
unsigned long long lc = cr1 + b[idx];
// handle carry
if (threadIdx.x < BSIZE){
carry[threadIdx.x] = 0;
carry_through[threadIdx.x] = 0;
}
if (threadIdx.x == 0){
mcarry = 0;
mcarry_through = 0;
}
__syncthreads();
if (lc < cr1){
if ((threadIdx.x%LLBITS) != (LLBITS-1))
atomicAdd(&(carry[threadIdx.x/LLBITS]), (2ull<<(threadIdx.x%LLBITS)));
else atomicAdd(&(carry[(threadIdx.x/LLBITS)+1]), 1);
}
// handle carry-through
if (lc == 0xFFFFFFFFFFFFFFFFull)
atomicAdd(&(carry_through[threadIdx.x/LLBITS]), (1ull<<(threadIdx.x%LLBITS)));
__syncthreads();
if (threadIdx.x < ((psize + LLBITS-1)/LLBITS)){
// only 1 warp executing within this if statement
unsigned long long cr3 = carry_through[threadIdx.x];
cr1 = carry[threadIdx.x] & cr3;
// start of sub-add
unsigned long long cr2 = cr3 + cr1;
if (cr2 < cr1) atomicAdd((unsigned *)&mcarry, (2u<<(threadIdx.x)));
if (cr2 == 0xFFFFFFFFFFFFFFFFull) atomicAdd((unsigned *)&mcarry_through, (1u<<threadIdx.x));
if (threadIdx.x == 0) {
unsigned cr4 = mcarry & mcarry_through;
cr4 += mcarry_through;
mcarry |= (mcarry_through ^ cr4);
}
if (mcarry & (1u<<threadIdx.x)) cr2++;
// end of sub-add
carry[threadIdx.x] |= (cr2 ^ cr3);
}
__syncthreads();
if (carry[threadIdx.x/LLBITS] & (1ull<<(threadIdx.x%LLBITS))) lc++;
c[idx] = lc;
}
}
int main() {
unsigned long long *h_a, *h_b, *h_c, *d_a, *d_b, *d_c, *c;
unsigned at_once = 256; // valid range = 1 .. 65535
unsigned prob_size = MAXSIZE ; // valid range = 1 .. MAXSIZE
unsigned dsize = at_once * prob_size;
cudaEvent_t t_start_gpu, t_start_cpu, t_end_gpu, t_end_cpu;
float et_gpu, et_cpu, tot_gpu, tot_cpu;
tot_gpu = 0;
tot_cpu = 0;
if (sizeof(unsigned long long) != (LLBITS/8)) {printf("Word Size Error\n"); return 1;}
if ((c = (unsigned long long *)malloc(dsize * sizeof(unsigned long long))) == 0) {printf("Malloc Fail\n"); return 1;}
cudaHostAlloc((void **)&h_a, dsize * sizeof(unsigned long long), cudaHostAllocDefault);
cudaCheckErrors("cudaHostAlloc1 fail");
cudaHostAlloc((void **)&h_b, dsize * sizeof(unsigned long long), cudaHostAllocDefault);
cudaCheckErrors("cudaHostAlloc2 fail");
cudaHostAlloc((void **)&h_c, dsize * sizeof(unsigned long long), cudaHostAllocDefault);
cudaCheckErrors("cudaHostAlloc3 fail");
cudaMalloc((void **)&d_a, dsize * sizeof(unsigned long long));
cudaCheckErrors("cudaMalloc1 fail");
cudaMalloc((void **)&d_b, dsize * sizeof(unsigned long long));
cudaCheckErrors("cudaMalloc2 fail");
cudaMalloc((void **)&d_c, dsize * sizeof(unsigned long long));
cudaCheckErrors("cudaMalloc3 fail");
cudaMemset(d_c, 0, dsize*sizeof(unsigned long long));
cudaEventCreate(&t_start_gpu);
cudaEventCreate(&t_end_gpu);
cudaEventCreate(&t_start_cpu);
cudaEventCreate(&t_end_cpu);
for (unsigned loops = 0; loops <LOOPCNT; loops++){
//create some test cases
if (loops == 0){
for (int j=0; j<at_once; j++)
for (int k=0; k<prob_size; k++){
int i= (j*prob_size) + k;
h_a[i] = 0xFFFFFFFFFFFFFFFFull;
h_b[i] = 0;
}
h_a[prob_size-1] = 0;
h_b[prob_size-1] = 1;
h_b[0] = 1;
}
else if (loops == 1){
for (int i=0; i<dsize; i++){
h_a[i] = 0xFFFFFFFFFFFFFFFFull;
h_b[i] = 0;
}
h_b[0] = 1;
}
else if (loops == 2){
for (int i=0; i<dsize; i++){
h_a[i] = 0xFFFFFFFFFFFFFFFEull;
h_b[i] = 2;
}
h_b[0] = 1;
}
else {
for (int i = 0; i<dsize; i++){
h_a[i] = (((unsigned long long)lrand48())<<33) + (unsigned long long)lrand48();
h_b[i] = (((unsigned long long)lrand48())<<33) + (unsigned long long)lrand48();
}
}
#ifdef GPUCOPY
cudaEventRecord(t_start_gpu, 0);
#endif
cudaMemcpy(d_a, h_a, dsize*sizeof(unsigned long long), cudaMemcpyHostToDevice);
cudaCheckErrors("cudaMemcpy1 fail");
cudaMemcpy(d_b, h_b, dsize*sizeof(unsigned long long), cudaMemcpyHostToDevice);
cudaCheckErrors("cudaMemcpy2 fail");
#ifdef GPU
cudaEventRecord(t_start_gpu, 0);
#endif
paradd<<<at_once, nTPB>>>(dsize, prob_size, d_c, d_a, d_b);
cudaCheckErrors("Kernel Fail");
#ifdef GPU
cudaEventRecord(t_end_gpu, 0);
#endif
cudaMemcpy(h_c, d_c, dsize*sizeof(unsigned long long), cudaMemcpyDeviceToHost);
cudaCheckErrors("cudaMemcpy3 fail");
#ifdef GPUCOPY
cudaEventRecord(t_end_gpu, 0);
#endif
cudaEventSynchronize(t_end_gpu);
cudaEventElapsedTime(&et_gpu, t_start_gpu, t_end_gpu);
tot_gpu += et_gpu;
cudaEventRecord(t_start_cpu, 0);
//also compute result on CPU for comparison
for (int j=0; j<at_once; j++) {
unsigned rc=0;
for (int n=0; n<prob_size; n++){
unsigned i = (j*prob_size) + n;
c[i] = h_a[i] + h_b[i];
if (c[i] < h_a[i]) {
c[i] += rc;
rc=1;}
else {
if ((c[i] += rc) != 0) rc=0;
}
if (c[i] != h_c[i]) {printf("Results mismatch at offset %d, GPU = 0x%llX, CPU = 0x%llX\n", i, h_c[i], c[i]); return 1;}
}
}
cudaEventRecord(t_end_cpu, 0);
cudaEventSynchronize(t_end_cpu);
cudaEventElapsedTime(&et_cpu, t_start_cpu, t_end_cpu);
tot_cpu += et_cpu;
if ((loops%(LOOPCNT/10)) == 0) printf("*\n");
}
printf("\nResults Match!\n");
printf("Average GPU time = %fms\n", (tot_gpu/LOOPCNT));
printf("Average CPU time = %fms\n", (tot_cpu/LOOPCNT));
return 0;
}