MPI_Type_vector does not seem to receive/send what

Posted 2019-06-09 03:32

Question:

I define a square matrix of size grid_size and work on its inner part (grid_size-2), while I Isend the rows and columns just inside the outer edges to other processes. I define a toroidal topology, so each submatrix-process easily computes its neighbors. The rows (say [1][1] to [1][grid_size-2]) are sent correctly, but the columns (say [1][1] to [grid_size-2][1]) are not: I use MPI_Type_contiguous for the rows and MPI_Type_vector for the columns. I tested with empty matrices (they are matrices of chars, so I initialize them to \0): the rows always arrive as 0, but the columns differ at (semi) random positions. What am I missing?

typedef char bool;
typedef bool **grid_t;

/* create a toroidal topology */
void cart_create(MPI_Comm *new_comm, int Proc_Root) {
    int reorder = 1; /* allows processes reordered for efficiency */
    int periods[2], dim_size[2];
    dim_size[0] = Proc_Root; /* rows */
    dim_size[1] = Proc_Root; /* columns */
    periods[0] = 1; /* rows periodic (each column forms a ring) */
    periods[1] = 1; /* columns periodic (each row forms a ring) */
    int comm_size;
    MPI_Comm_size(MPI_COMM_WORLD, &comm_size);
    MPI_Cart_create(MPI_COMM_WORLD, 2, dim_size, periods, reorder, new_comm);
}
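
/* neighboors_ranks, the direction indices N/S/E/W and the constant EIGHT
   used in main() are not shown in this excerpt; on the Cartesian
   communicator the four face neighbors would typically be obtained with
   MPI_Cart_shift, for example:

       MPI_Cart_shift(new_comm, 0, 1, &neighboors_ranks[N], &neighboors_ranks[S]);
       MPI_Cart_shift(new_comm, 1, 1, &neighboors_ranks[W], &neighboors_ranks[E]);
*/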

int main(int argc, char** argv) {

    /* ! MPI ! */
    MPI_Init(&argc, &argv);
    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    int Num_of_Proc;
    MPI_Comm_size(MPI_COMM_WORLD, &Num_of_Proc);
    int Proc_Root = sqrt(Num_of_Proc);
    int Inner_Grid_Size = Num_of_Rows / Proc_Root; // size of each process' submatrix
    int Grid_Size = Inner_Grid_Size + 2; //grid size plus the ghost shells

    /* topology */
    MPI_Comm new_comm;
    cart_create(&new_comm, Proc_Root);

    /* allocate the grid */
    grid_t grid;
    create_grid(&grid, Grid_Size); // I fill it with 0
    grid_t grid2;
    create_empty_grid(&grid2, Grid_Size);
    grid_t new, old;

    bool *north_row = malloc(Inner_Grid_Size * sizeof *north_row);
    bool *south_row = malloc(Inner_Grid_Size * sizeof *south_row);
    bool *west_column = malloc(Inner_Grid_Size * sizeof *west_column);
    bool *east_column = malloc(Inner_Grid_Size * sizeof *east_column);
    // Works !
    MPI_Datatype rowtype;
    MPI_Type_contiguous(Inner_Grid_Size, MPI_CHAR, &rowtype); // MPI_C_BOOL
    MPI_Type_commit(&rowtype);
    // Where is the bug ?
    MPI_Datatype columntype;
    MPI_Type_vector(Inner_Grid_Size, 1, Grid_Size, MPI_CHAR, &columntype);
    MPI_Type_commit(&columntype);
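    /* columntype describes Inner_Grid_Size single chars spaced Grid_Size
       chars apart, i.e. one element per row; this assumes the whole grid
       is laid out as one contiguous Grid_Size x Grid_Size block of chars */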

    for (int k = 0; k < generations; k++) {
        if (k % 2) {
            old = grid2;
            new = grid;
        } else {
            old = grid;
            new = grid2;
        }
        MPI_Status status[16];
        MPI_Request reqs[16];
        MPI_Isend(&old[Inner_Grid_Size][1], 1, rowtype, neighboors_ranks[S],
                S, new_comm, &reqs[S]); //send to S
        MPI_Irecv(north_row, Inner_Grid_Size, MPI_CHAR, neighboors_ranks[N],
                S, new_comm, &reqs[S + EIGHT]); //receive from N
        // above works
        // below not
        MPI_Isend(&old[1][1], 1, columntype, neighboors_ranks[W], W,
                new_comm, &reqs[W]); //send to W
        MPI_Irecv(east_column, Inner_Grid_Size, MPI_CHAR, neighboors_ranks[E],
                W, new_comm, &reqs[W + EIGHT]); //receive from E
        MPI_Isend(&old[1][Inner_Grid_Size], 1, columntype, neighboors_ranks[E],
                E, new_comm, &reqs[E]); //send to E
        MPI_Irecv(west_column, Inner_Grid_Size, MPI_CHAR, neighboors_ranks[W],
                E, new_comm, &reqs[E + EIGHT]); //receive from W

        MPI_Waitall(EIGHT, reqs + EIGHT, status + EIGHT); //Wait receives
        if (rank == root)
            for (int p = 0; p < Inner_Grid_Size; p++) {
                printf("east[%d]=%d\n", p, east_column[p]); // should be 0 !?
                //  printf("north,%d\n", north_row[p]); // prints ok
                printf("west[%d]=%d\n", p, west_column[p]); // should be 0 !?
            }
        //...
    }
}

EDIT : allocation

void create_grid(grid_t *grid, int size) {
    srand(time(NULL) ^get_rank() << 16);
    if ((*grid = malloc(size * (sizeof **grid))) == NULL) return;
    for (int i = 0; i < size; ++i) {
        (*grid)[i] = malloc(size * (sizeof *((*grid)[i])));
        for (int j = 0; j < size; ++j) {
            (*grid)[i][j] = 0; /*was random */
        }
    }
}

/* the grid will be full of 0 */
void create_empty_grid(grid_t *grid, int size) {
    if ((*grid = malloc(size * (sizeof **grid))) == NULL) return;
    // the outer edges will be filled by the other processes
    for (int i = 0; i < size; ++i) {
        (*grid)[i] = malloc(size * (sizeof *((*grid)[i])));
        memset((*grid)[i], 0, sizeof (*(*grid)[i]) * size);
    }
}

void print_grid(grid_t grid, int start, int size) {
    for (int i = start; i < size; ++i) {
        for (int j = start; j < size; ++j) {
            if (grid[i][j]) {
                printf("@");
            } else {
                printf(".");
            }
        }
        printf("\n");
    }
    printf("\n");
}

Answer 1:

This comes up often here (e.g., see this question/answer and this one) when dealing with "multidimensional arrays" in C with MPI. It's not really an MPI thing; it's a C thing.

The standard way of allocating arrays-of-arrays in C to get a multidimensional array doesn't give you a contiguous block of memory. Each row (that is, each individual malloc) is contiguous, but the next row could be anywhere else in memory.

So the formula for skipping Grid_Size items to find the next element of a column won't work: the stride of Grid_Size in MPI_Type_vector assumes that &old[i+1][j] lies exactly Grid_Size chars after &old[i][j], which is only true when the whole grid is one contiguous block (and, depending on where the rows land in memory, the send may even segfault).

So, as with the answers linked above, change the allocation to look something like this:

data = malloc(size * size * sizeof(type));
grid = malloc(size * sizeof(type *));
for (int i = 0; i < size; i++)
    grid[i] = &(data[i * size]);

or any of a number of variations you'll see kicking around. This gives you one contiguous block of size*size elements of your type, with the grid[] array of row pointers pointing into it. Deallocation is then done with:

free(grid[0]);   /* frees the single data block (grid[0] == data) */
free(grid);      /* frees the array of row pointers */
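
For completeness, here is a minimal, self-contained sketch of the contiguous layout with a column exchange (the sizes and names here are chosen for illustration, they are not from the question). It allocates the grid as one block, builds the same kind of MPI_Type_vector, and sends one column from rank 0 to rank 1; with this layout the stride of Grid_Size is valid and the received column matches what was sent. Run it with at least two processes:

#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

enum { Inner_Grid_Size = 4, Grid_Size = Inner_Grid_Size + 2 };

int main(int argc, char **argv) {
    MPI_Init(&argc, &argv);
    int rank, nproc;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &nproc);

    /* one contiguous block plus an array of row pointers into it */
    char *data = malloc(Grid_Size * Grid_Size * sizeof *data);
    char **grid = malloc(Grid_Size * sizeof *grid);
    for (int i = 0; i < Grid_Size; i++)
        grid[i] = &data[i * Grid_Size];
    memset(data, 0, Grid_Size * Grid_Size);

    /* Inner_Grid_Size single chars, each Grid_Size chars apart: one column */
    MPI_Datatype columntype;
    MPI_Type_vector(Inner_Grid_Size, 1, Grid_Size, MPI_CHAR, &columntype);
    MPI_Type_commit(&columntype);

    if (rank == 0 && nproc > 1) {
        for (int i = 1; i <= Inner_Grid_Size; i++)
            grid[i][1] = (char) i;                    /* fill the west column */
        MPI_Send(&grid[1][1], 1, columntype, 1, 0, MPI_COMM_WORLD);
    } else if (rank == 1) {
        char column[Inner_Grid_Size];
        MPI_Recv(column, Inner_Grid_Size, MPI_CHAR, 0, 0,
                 MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        for (int i = 0; i < Inner_Grid_Size; i++)
            printf("column[%d] = %d\n", i, column[i]); /* prints 1, 2, 3, 4 */
    }

    MPI_Type_free(&columntype);
    free(data);
    free(grid);
    MPI_Finalize();
    return 0;
}

With the grids allocated this way, the original Isend of &old[1][1] with columntype picks up exactly the first inner column, and the ghost-cell exchange behaves the same way for rows and columns.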


Tags: mpi