MPI_Gatherv hangs when recomposing a matrix

I have to decompose and recompose a matrix in MPI (I'm using MPICH), and I'm using Scatterv and Gatherv as in the example from this question. Everything works well for small matrices, but when the matrix size increases (starting from 800x800), the program hangs when it reaches MPI_Gatherv. By printing debug messages, I can see that every process passes the call to Gatherv, except the one with rank 0 (the root process in the Gatherv call). Any suggestion? Here's the code:

#include <iostream>
#include <cstring>
#include <fstream>
#include <cstdlib>
#include "mpi.h"
using namespace std;

#define TOP_ROW_TAG 1
#define BOTTOM_ROW_TAG 2
#define LEFT_COL_TAG 3
#define RIGHT_COL_TAG 4



int main(int argc, char ** argv) {
    int me, nproc, width, height, wloc, hloc;
    double k, d,c, wdouble, hdouble, discr, delta_t, t;
    char* initial, end;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &me); 
    MPI_Comm_size(MPI_COMM_WORLD, &nproc);
    MPI_Comm cart_top;



    wdouble = atof(argv[1]);
    hdouble = atof(argv[2]);
    discr = atof(argv[3]);
    k = atof(argv[4]);
    d = atof(argv[5]);
    c = atof(argv[6]);
    delta_t = atof(argv[7]);
    t = atof(argv[8]);
    initial = argv[9];
    end = argv[10];
    double p = k/(d*c);
    double dsc = delta_t/(discr*discr);

    width = wdouble / discr;
    height = hdouble / discr; 
    const int NPROWS=4;  /* number of rows in _decomposition_ */
    const int NPCOLS=4;  /* number of cols in _decomposition_ */
    const int BLOCKROWS = width/NPROWS;  /* number of rows in _block_ */
    const int BLOCKCOLS = height/NPCOLS;

const int dims[2] = {NPROWS, NPCOLS};
const int periods[2] = {0,0};
int* mycoords = new int[2];

int locsz = (width*height)/nproc;





double* T, *Tnew, *local, *locnew;
local = new double[BLOCKROWS*BLOCKCOLS];
locnew = new double[BLOCKROWS*BLOCKCOLS];
T = new double[width * height];
Tnew = new double[width * height];

ifstream infile;
infile.open(initial);
if(me==0) {
    cout<<"BLOCKROWS: "<<BLOCKROWS;
    cout<<"BLOCKCOLS: "<<BLOCKCOLS<<endl;
    cout<<"width: "<<width;
    cout<<"height: "<<height<<endl;
    int idx, jdx, temp;
    for (int i=0; i<width*height; i++) {
        string currline;
        getline(infile, currline);
        idx = atoi(strtok(currline.c_str(), " "));
        jdx = atoi(strtok(NULL, " "));
        temp = atof(strtok(NULL, " "));
        T[idx*height+jdx] = temp;
       infile.close();
}

MPI_Datatype blocktype;
MPI_Datatype blocktype2;
MPI_Datatype coltype, coltype2;


MPI_Type_vector(BLOCKROWS, 1, BLOCKCOLS, MPI_DOUBLE, &coltype);
MPI_Type_create_resized( coltype, 0, sizeof(double), &coltype2);
MPI_Type_commit(&coltype2);


MPI_Type_vector(BLOCKROWS, BLOCKCOLS, height, MPI_DOUBLE, &blocktype2);
MPI_Type_create_resized( blocktype2, 0, sizeof(double), &blocktype);
MPI_Type_commit(&blocktype);

int disps[NPROWS*NPCOLS];
int counts[NPROWS*NPCOLS];
for (int ii=0; ii<NPROWS; ii++) {
    for (int jj=0; jj<NPCOLS; jj++) {
        disps[ii*NPCOLS+jj] = ii*height*BLOCKROWS+jj*BLOCKCOLS;
        counts [ii*NPCOLS+jj] = 1;
    }
}

int myrank, lb_i, lb_j, ub_i, ub_j;
lb_i=0;
lb_j=0;
ub_i=BLOCKROWS;
ub_j=BLOCKCOLS;
/*
        0= left neighbor;
        1= right neighbor;
        2=top neighbor;
        3=bottom neighbor;
 */
int neighs[4] = {};
double* leftcol, *rightcol, *myleftcol, *myrightcol, *toprow, *bottomrow;
leftcol = new double[BLOCKROWS];
rightcol= new double[BLOCKROWS];
myleftcol = new double[BLOCKROWS];
myrightcol= new double[BLOCKROWS];
toprow = new double[BLOCKCOLS];
bottomrow = new double[BLOCKCOLS];


//Create topology and get neighbor's rank
MPI_Cart_create(MPI_COMM_WORLD, 2, dims, periods, 0, &cart_top);
MPI_Barrier(MPI_COMM_WORLD);
MPI_Comm_rank(cart_top, &myrank);


MPI_Cart_shift(cart_top, 0, -1, &myrank, &neighs[0]);
MPI_Cart_shift(cart_top, 0, 1, &myrank, &neighs[1]);
MPI_Cart_shift(cart_top, 1, 1, &myrank, &neighs[2]);
MPI_Cart_shift(cart_top, 1, -1, &myrank, &neighs[3]);
MPI_Scatterv(T, counts, disps, blocktype, local, BLOCKROWS*BLOCKCOLS, 
MPI_DOUBLE, 0, cart_top);
double curr_t=0;
for(double curr_t = 0; curr_t < t; curr_t+=delta_t) {
    MPI_Barrier(cart_top);

    //Send border columns to neighbors
    if(neighs[2] != MPI_PROC_NULL) {
        MPI_Send(&local[BLOCKCOLS-1], 1, coltype2, neighs[2], LEFT_COL_TAG+(int)(curr_t*1000), cart_top);
    }
    if(neighs[3] != MPI_PROC_NULL) {
        MPI_Send(local, 1, coltype2, neighs[3], RIGHT_COL_TAG+(int)(curr_t*1000), cart_top);
    }
    if(neighs[0] != MPI_PROC_NULL) {
        MPI_Send(local, BLOCKCOLS, MPI_DOUBLE, neighs[0], TOP_ROW_TAG+(int)(curr_t*1000), cart_top);
    }
    if(neighs[1] != MPI_PROC_NULL) {
        MPI_Send(&local[(BLOCKROWS-1)*BLOCKCOLS], BLOCKCOLS, MPI_DOUBLE, neighs[1], BOTTOM_ROW_TAG+(int)(curr_t*1000), cart_top);
    }

    if(neighs[3] != MPI_PROC_NULL) {
        MPI_Recv(leftcol, BLOCKROWS, MPI_DOUBLE, neighs[3], LEFT_COL_TAG+(int)(curr_t*1000), cart_top, MPI_STATUS_IGNORE);
    }
    if(neighs[2] != MPI_PROC_NULL) {
        MPI_Recv(rightcol, BLOCKROWS, MPI_DOUBLE, neighs[2], RIGHT_COL_TAG+(int)(curr_t*1000), cart_top, MPI_STATUS_IGNORE);
    }
    if(neighs[1] != MPI_PROC_NULL) {
        MPI_Recv(bottomrow, BLOCKCOLS, MPI_DOUBLE, neighs[1], TOP_ROW_TAG+(int)(curr_t*1000), cart_top, MPI_STATUS_IGNORE);
    }
    if(neighs[0] != MPI_PROC_NULL) {
        MPI_Recv(toprow, BLOCKCOLS, MPI_DOUBLE, neighs[0], BOTTOM_ROW_TAG+(int)(curr_t*1000), cart_top, MPI_STATUS_IGNORE);
    }

    MPI_Barrier(cart_top);
    double* aux;
    //cout<<" t in process "<<me<<" is " <<t<<endl;
    int i, j;
    MPI_Comm_rank(cart_top, &myrank);
    MPI_Barrier(cart_top);

        for(i=lb_i; i<ub_i; i++) {
            for(j=lb_j; j<ub_j; j++) {
                double curr,c1,c2,c3,c4;
       curr = local[i*BLOCKCOLS+j];
                 c1 = i==0 ? toprow[j] : local[(i-1)*BLOCKCOLS+j];
                 c2 = i==BLOCKROWS-1 ? bottomrow[j] : local[(i+1)*BLOCKCOLS+j];
                 c3 = j==0 ? leftcol[i] : local[i*BLOCKCOLS+(j-1)];
                 c4 = j==BLOCKCOLS-1 ? rightcol[i] : local[i*BLOCKCOLS+(j+1)];

       locnew[i*BLOCKCOLS+j] = curr*(1-4*dsc*p) + dsc*p*(c1+c2+c3+c4);
                /*if(i==0) locnew[i*BLOCKCOLS+j] = toprow[j];
                 else if(i==BLOCKROWS-1) locnew[i*BLOCKCOLS+j] = bottomrow[j];
                 if(j==0) locnew[i*BLOCKCOLS+j] = leftcol[i];
                 else if(j==BLOCKCOLS-1) locnew[i*BLOCKCOLS+j] = rightcol[i];
                 if(i!=0 && i!=BLOCKROWS-1 && j!=0 && j!=BLOCKCOLS-1)    locnew[i*BLOCKCOLS+j] = local[i*BLOCKCOLS+j];*/
       /*if(i==0) locnew[i*BLOCKCOLS+j] = (double)5000;
       else if(i==BLOCKROWS-1) locnew[i*BLOCKCOLS+j] = (double)5000;
       if(j==0) locnew[i*BLOCKCOLS+j] = (double)5000;
       else if(j==BLOCKCOLS-1) locnew[i*BLOCKCOLS+j] = (double)5000;
       if(i!=0 && i!=BLOCKROWS-1 && j!=0 && j!=BLOCKCOLS-1)    locnew[i*BLOCKCOLS+j] = local[i*BLOCKCOLS+j];*/
            }
        }


    aux = local;
    local = locnew;
    locnew = aux;



    MPI_Barrier(cart_top);


/*  aux = T;
    T=Tnew;
    Tnew = aux;*/

}
MPI_Gatherv(local, BLOCKROWS*BLOCKCOLS, MPI_DOUBLE, Tnew, counts, disps, blocktype, 0,cart_top);
if(me == 0) {
    ofstream outfile;
    outfile.open(argv[10]);
    for(int i=0; i<width; i++) {
        for(int j=0; j<height; j++) {
            outfile<< i<<" " <<j<<" "<<Tnew[i*height+j]<<endl;
        }

    }
    outfile.close();
}
MPI_Finalize();


}