I'm trying to parallelize a code I've written. I'm having problems performing reducitons on arrays. It all seems to work fine for smallish arrays, however when the array size goes above a certain point I either get a stack overflow error or a crash.
I've tried to increased the stack size using the /F at compile time, I'm using ifort on windows, I've also tried passing set KMP_STACKSIZE=xxx the intel specific stacksize decleration. This sometimes helps and allows the code to progress further through my loop but in the end doesn't resolve the issue, even with a stack size of 1Gb or greater.
Below is a small self-contained working example of my code. It works in serial, and with one thread. Or with many threads but a small 'N'. A large N (i.e. like 250,000 in the example) causes problems.
I didn't think these arrays were so massive so as to cause major problems, and presumed increasing my stack space would help - are there any other options, or have I missed something important in my coding ?
program testreduction
use omp_lib
implicit none
integer :: i, j, nthreads, Nsize
integer iseed /3/
REAL, allocatable :: A(:,:), B(:), C(:), posi(:,:)
REAL :: dx, dy, r, Axi, Ayi, m, F
!Set size of matrix, and main loop
Nsize = 250000
m = 1.0
F = 1.0
!Allocate posi array
allocate(posi(2,Nsize))
!Fill with random numbers
do i=1,Nsize
do j=1,2
posi(j,i) = (ran(iseed))
end do
end do
!Allocate other arrays
allocate(A(2,Nsize), C(Nsize), B(Nsize))
print*, sizeof(A)+sizeof(B)+sizeof(C)
!$OMP parallel
nthreads = omp_get_num_threads()
!$OMP end parallel
print*, "Number of threads ", nthreads
!Go through each array and do some work, calculating a reduction on A, B and C.
!$OMP parallel do schedule(static) private(i, j, dx, dy, r, Axi, Ayi) reduction(+:C, B, A)
do i=1,Nsize
do j=1,Nsize
!print*, i
dx = posi(1,i) - posi(1,j)
dy = posi(2,i) - posi(2,j)
r = sqrt(dx**2+dy**2)
Axi = -m*(F)*(dx/(r))
Ayi = -m*(F)*(dy/(r))
A(1,i) = A(1,i) + Axi
A(2,i) = A(2,i) + Ayi
B(i) = B(i) + (Axi+Ayi)
C(i) = C(i) + dx/(r) + dy/(r)
end do
END DO
!$OMP END parallel do
end program
UPDATE
A better example of what I'm talking about ..
program testreduction2
use omp_lib
implicit none
integer :: i, j, nthreads, Nsize, q, k, nsize2
REAL, allocatable :: A(:,:), B(:), C(:)
integer, ALLOCATABLE :: PAIRI(:), PAIRJ(:)
Nsize = 25
Nsize2 = 19
q=0
allocate(A(2,Nsize), C(Nsize), B(Nsize))
ALLOCATE(PAIRI(nsize*nsize2), PAIRJ(nsize*nsize2))
do i=1,nsize
do j =1,nsize2
q=q+1
PAIRI(q) = i
PAIRJ(q) = j
end do
end do
A = 0
B = 0
C = 0
!$OMP parallel do schedule(static) private(i, j, k)
do k=1,q
i=PAIRI(k)
j=PAIRJ(k)
A(1,i) = A(1,i) + 1
A(2,i) = A(2,i) + 1
B(i) = B(i) + 1
C(i) = C(i) + 1
END DO
!$OMP END parallel do
PRINT*, A
PRINT*, B
PRINT*, C
END PROGRAM