Hi,
My matrix multiplication C code looks like this:
---------------------------------------------------------------------------
#include <mpi.h>
/* Matrix dimensions for the product c = a * b. */
/* NRA: number of rows in matrix a and in result matrix c. */
#define NRA 300
/* NCA: number of columns in matrix a, which is also the number of rows in matrix b. */
#define NCA 300
/* NCB: number of columns in matrix b and in result matrix c. */
#define NCB 300
.
.
.
/*
 * Entry point of the MPI matrix-multiplication example (c = a * b).
 *
 * Visible steps: initialize MPI and query the communicator size and this
 * task's rank, allocate the three matrices as arrays of row pointers, branch
 * on master vs. worker for data distribution, compute the product for `rows`
 * rows, branch again for result collection, free the matrices and finalize
 * MPI.
 *
 * Lines consisting of a single "." mark code that was omitted from this
 * listing; the declarations of i, j, k, rows, numtasks and taskid as well as
 * all send/receive calls are among the omitted parts.
 *
 * Returns 0 on the path shown here.
 */
int main(int argc, char *argv[]) {
// a, b and c are arrays of row pointers; every row is a separate allocation.
double **a, **b, **c;
.
.
.
MPI_Init(&argc, &argv);
// numtasks receives the number of tasks in MPI_COMM_WORLD, taskid this task's rank.
MPI_Comm_size(MPI_COMM_WORLD, &numtasks);
MPI_Comm_rank(MPI_COMM_WORLD, &taskid);
.
.
.
// Every task (master and workers alike) allocates full-size a, b and c.
// NOTE(review): none of the malloc results below are checked for NULL.
// NOTE(review): each row comes from its own malloc call, so consecutive rows
// are not guaranteed to be adjacent in memory. If the omitted send/receive
// code transfers more than one row per MPI call (e.g. a count of rows*NCA
// starting at &a[offset][0]), it would run past the end of a single row --
// confirm against the omitted code.
// a: NRA rows of NCA doubles.
a = (double **)malloc(NRA*sizeof(double *));
for(i=0; i<NRA; i++) {
a[i] = (double *)malloc(NCA*sizeof(double));
}
// b: NCA rows of NCB doubles.
b = (double **)malloc(NCA*sizeof(double *));
for(i=0; i<NCA; i++) {
b[i] = (double *)malloc(NCB*sizeof(double));
}
// c: NRA rows of NCB doubles.
c = (double **)malloc(NRA*sizeof(double *));
for(i=0; i<NRA; i++) {
c[i] = (double *)malloc(NCB*sizeof(double));
}
if(taskid == MASTER) {
// initialize matrix a and matrix b
.
.
.
// send matrix data to worker processes
.
.
.
}
else {
// receive matrix data from master process
.
.
.
}
// for both master and worker processes -- matrix calculation
// Computes c[i][k] as the sum over j of a[i][j]*b[j][k] for the first `rows`
// rows of a; the loops run with column k outermost, then row i, then j.
// NOTE(review): `rows` is not declared or assigned anywhere in the visible
// code; presumably the omitted sections set it to the number of rows this
// task works on -- TODO confirm it never exceeds NRA.
for(k=0; k<NCB; k++) {
for(i=0; i<rows; i++) {
// Reset the accumulator before summing the dot product.
c[i][k] = 0.0;
for(j=0; j<NCA; j++) {
c[i][k] = c[i][k] + a[i][j]*b[j][k];
}
}
}
if(taskid == MASTER) {
// receive results from worker processes
.
.
.
}
else {
// send results to master process
.
.
.
}
// free all dynamically allocated memories
// Each matrix is released in the reverse of its allocation: every row first,
// then the array of row pointers that held them.
for(i=0; i<NRA; i++) {
free(a[i]);
}
free(a);
for(i=0; i<NCA; i++) {
free(b[i]);
}
free(b);
for(i=0; i<NRA; i++) {
free(c[i]);
}
free(c);
MPI_Finalize();
return 0;
}
---------------------------------------------------------------------------
When I run this program on two machines, I get the following error message:
---------------------------------------------------------------------------
Rank (0, MPI_COMM_WORLD): Call stack within LAM:
MPI_Recv: process in local group is dead (rank 0, MPI_COMM_WORLD)
Rank (0, MPI_COMM_WORLD): - MPI_Recv()
Rank (0, MPI_COMM_WORLD): - main()
One of the processes started by mpirun has exited with a nonzero exit
code. This typically indicates that the process finished in error.
If your process did not finish in error, be sure to include a "return
0" or "exit(0)" in your C code before exiting the application.
PID 4672 failed on node n1 (169.237.108.13) due to signal 9.
---------------------------------------------------------------------------
Any hints as to what might be causing this?
|