Calculate rowSums in Chapel for a matrix

Continuing my Chapel adventures...

I have a matrix A.

var idx = {1..n};
var adom = {idx, idx};
var A: [adom] int;
//populate A;

var rowsums: [idx] int;

What is the most efficient way to populate rowsums?

回答1:

The most efficient solution is hard to define. However, here is one way to compute rowsums that is both parallel and elegant:

config const       n = 8;          // "naked" n would cause compilation to fail
const indices = 1..n;              // tio.chpl:1: error: 'n' undeclared (first use this function)
const adom = {indices, indices};
var A: [adom] int;

// Populate A
[(i,j) in adom] A[i, j] = i*j;

var rowsums: [indices] int;


forall i in indices {
  rowsums[i] = + reduce(A[i, ..]);
}

writeln(rowsums);

Try it online!

This is utilizing the + reduction over array slices of A.

Note that both the forall and + reduce introduce parallelism to the program above. It may be more efficient to only use a for loop, avoiding task-spawning overhead, if the size of indices is sufficiently small.

回答2:

A few hints
to make the code actually run-live in both `SEQ` and `PAR` mode:

Besides a few implementation details, the above stated @bencray's assumption about the assumed overhead costs for a PAR setup, which may favor a purely serial processing in a SEQ setup, was not experimentally confirmed. It is fair to also note here, that a distributed mode was not tested on live <TiO>-IDE due to obvious reasons, whereas a small-if-not-tiny-scale distributed implementation is by far more an oxymoron, than a scientifically meaningful experiment to run.

Facts matter

A rowsums[] processing, even at a smallest possible scale of 2x2, was in the SEQ mode yet slower, than the same for 256x256 in the PAR mode.

Good job, chapel Team, indeed cool results on optimum alignment for harnessing the compact silicon resources to the max in PAR!

For records on exact run-time performance, ( ref. self-documented tables ) below, or do not hesistate to visit the live-IDE-run ( ref.'d above ) and experiment on your own.

Readers may also recognise extrinsic noise on small-scale experimentations, as O/S- and hosted-IDE-related processes intervene with resources-usage and influence onto the <SECTION-UNDER-TEST> runtime performance via adverse CPU / Lx-CACHE / memIO / process / et al conflicts, which fact exludes these measurements from being used for some generalised interpretations.

Hope all will enjoy the chapel lovely `[TIME]` results
_{demonstrated across the growing [EXPSPACE]-scaled computing landscapes}

/* ---------------------------------------SETUP-SECTION-UNDER-TEST--*/ use Time;
/* ---------------------------------------SETUP-SECTION-UNDER-TEST--*/ var aStopWATCH_SEQ: Timer;
/* ---------------------------------------SETUP-SECTION-UNDER-TEST--*/ var aStopWATCH_PAR: Timer;

//nst max_idx =    123456;                   // seems to be too fat  for <TiO>-IDE to allocate                  <TiO>--   /wrappers/chapel: line 6: 24467 Killed
const max_idx =      4096;
//nst max_idx =      8192;                   // seems to be too long for <TiO>-IDE to let it run [SEQ] part     <TiO>--  The request exceeded the 60 second time limit and was terminated
//nst max_idx =     16384;                   // seems to be too long for <TiO>-IDE to let it run [PAR] part too <TiO>--   /wrappers/chapel: line 6: 12043 Killed
const indices = 1..max_idx;

const   adom  = {indices, indices};
var A: [adom] int;

[(i,j) in adom] A[i, j] = i*j;               // Populate A[,]

var rowsums: [indices] int;

/* ---------------------------------------------SECTION-UNDER-TEST--*/ aStopWATCH_SEQ.start();
for       i in indices {                     // SECTION-UNDER-TEST--
  rowsums[i] = + reduce(A[i, ..]);           // SECTION-UNDER-TEST--
}                                            // SECTION-UNDER-TEST--
/* ---------------------------------------------SECTION-UNDER-TEST--*/ aStopWATCH_SEQ.stop();

/* 
                                               <SECTION-UNDER-TEST> took     8973 [us] to run in [SEQ] mode for    2 elements on <TiO>-IDE
                                               <SECTION-UNDER-TEST> took    28611 [us] to run in [SEQ] mode for    4 elements on <TiO>-IDE
                                               <SECTION-UNDER-TEST> took    58824 [us] to run in [SEQ] mode for    8 elements on <TiO>-IDE
                                               <SECTION-UNDER-TEST> took   486786 [us] to run in [SEQ] mode for   64 elements on <TiO>-IDE
                                               <SECTION-UNDER-TEST> took  1019990 [us] to run in [SEQ] mode for  128 elements on <TiO>-IDE
                                               <SECTION-UNDER-TEST> took  2010680 [us] to run in [SEQ] mode for  256 elements on <TiO>-IDE
                                               <SECTION-UNDER-TEST> took  4154970 [us] to run in [SEQ] mode for  512 elements on <TiO>-IDE
                                               <SECTION-UNDER-TEST> took  8260960 [us] to run in [SEQ] mode for 1024 elements on <TiO>-IDE
                                               <SECTION-UNDER-TEST> took 15853000 [us] to run in [SEQ] mode for 2048 elements on <TiO>-IDE
                                               <SECTION-UNDER-TEST> took 33126800 [us] to run in [SEQ] mode for 4096 elements on <TiO>-IDE
                                               <SECTION-UNDER-TEST> took      n/a [us] to run in [SEQ] mode for 8192 elements on <TiO>-IDE

   ============================================ */


/* ---------------------------------------------SECTION-UNDER-TEST--*/ aStopWATCH_PAR.start();
forall    i in indices {                     // SECTION-UNDER-TEST--
  rowsums[i] = + reduce(A[i, ..]);           // SECTION-UNDER-TEST--
}                                            // SECTION-UNDER-TEST--
/* ---------------------------------------------SECTION-UNDER-TEST--*/ aStopWATCH_PAR.stop();
/*
                                               <SECTION-UNDER-TEST> took  12131 [us] to run in [PAR] mode for    2 elements on <TiO>-IDE
                                               <SECTION-UNDER-TEST> took   8095 [us] to run in [PAR] mode for    4 elements on <TiO>-IDE
                                               <SECTION-UNDER-TEST> took   8023 [us] to run in [PAR] mode for    8 elements on <TiO>-IDE
                                               <SECTION-UNDER-TEST> took   8156 [us] to run in [PAR] mode for   64 elements on <TiO>-IDE
                                               <SECTION-UNDER-TEST> took   7990 [us] to run in [PAR] mode for  128 elements on <TiO>-IDE
                                               <SECTION-UNDER-TEST> took   8692 [us] to run in [PAR] mode for  256 elements on <TiO>-IDE
                                               <SECTION-UNDER-TEST> took  15134 [us] to run in [PAR] mode for  512 elements on <TiO>-IDE
                                               <SECTION-UNDER-TEST> took  16926 [us] to run in [PAR] mode for 1024 elements on <TiO>-IDE
                                               <SECTION-UNDER-TEST> took  30671 [us] to run in [PAR] mode for 2048 elements on <TiO>-IDE
                                               <SECTION-UNDER-TEST> took 105323 [us] to run in [PAR] mode for 4096 elements on <TiO>-IDE
                                               <SECTION-UNDER-TEST> took 292232 [us] to run in [PAR] mode for 8192 elements on <TiO>-IDE

   ============================================ */



writeln( rowsums,
        "\n <SECTION-UNDER-TEST> took ", aStopWATCH_SEQ.elapsed( Time.TimeUnits.microseconds ), " [us] to run in [SEQ] mode for ", max_idx, " elements on <TiO>-IDE",
        "\n <SECTION-UNDER-TEST> took ", aStopWATCH_PAR.elapsed( Time.TimeUnits.microseconds ), " [us] to run in [PAR] mode for ", max_idx, " elements on <TiO>-IDE"
         );