I unfortunately don't have time to debug your program, but I would
strongly recommend running it through a memory-checking debugger such
as Valgrind, Bcheck, or purify. I don't say this lightly -- I use
these tools myself in the normal course of application development.
See the section "Debugging MPI Programs under LAM/MPI" in the LAM FAQ
for more specific information.
On Jan 18, 2005, at 11:16 PM, Yu-Cheng Chou wrote:
>
> hi,
> I ran a matrix multiplication program on two machines separately and it
> worked.
> Later I ran the same program on the same machines concurrently and it
> didn't work.
>
> Can anyone give me some hint?
>
> -----------------------------------------------------------------------
> ----
> matrix multiplication program code:
>
> #include <mpi.h>
> #include <stdio.h>
> #include <stdlib.h>
> #include <string.h>
> #define MASTER 0 // taskid of first task
> #define FROM_MASTER 1 // setting a message type
> #define FROM_WORKER 2 // setting a message type
>
> int main(int argc, char *argv[]) {
> int N[3], // N[0]: number of rows of matrix A
> // N[1]: number of columns of
> matrix A
> // N[2]: number of columns of
> matrix B
> numtasks, // number of tasks
> taskid, // task identifier
> source, // task id of message source
> dest, // task id of message destination
> mtype, // message type
> i, j, k; // misc
>
> double stime, etime; // start time and stop time
> double **a, **b, **c; // c = a * b
> int *start, *end, *row;
>
> FILE *stream;
> char line[1024], *file = "download/matrix.txt";
> MPI_Status status;
>
> stream = fopen("upload/dimension.txt", "r");
> for(i=0; i<3; i++) {
> fgets(line, sizeof(line), stream);
> N[i] = atoi(strtok(line, " "));
> }
> fclose(stream);
>
> a = (double **)malloc(N[0]*sizeof(double *));
> for(i=0; i<N[0]; i++) {
> a[i] = (double *)malloc(N[1]*sizeof(double));
> }
>
> b = (double **)malloc(N[1]*sizeof(double *));
> for(i=0; i<N[1]; i++) {
> b[i] = (double *)malloc(N[2]*sizeof(double));
> }
>
> c = (double **)malloc(N[0]*sizeof(double *));
> for(i=0; i<N[2]; i++) {
> c[i] = (double *)malloc(N[2]*sizeof(double));
> }
>
> MPI_Init(&argc, &argv);
> MPI_Comm_size(MPI_COMM_WORLD, &numtasks);
> MPI_Comm_rank(MPI_COMM_WORLD, &taskid);
>
> start = (int *)malloc(numtasks*sizeof(int));
>
> end = (int *)malloc(numtasks*sizeof(int));
>
> row = (int *)malloc(numtasks*sizeof(int));
>
> stream = fopen("upload/start.txt", "r");
> for(i=0; i<numtasks; i++) {
> fgets(line, sizeof(line), stream);
> start[i] = atoi(strtok(line, " "));
> }
> fclose(stream);
>
> stream = fopen("upload/end.txt", "r");
> for(i=0; i<numtasks; i++) {
> fgets(line, sizeof(line), stream);
> end[i] = atoi(strtok(line, " "));
> }
> fclose(stream);
>
> for(i=0; i<numtasks; i++) {
> fgets(line, sizeof(line), stream);
> row[i] = end[i] - start[i] + 1;
> }
>
> stime = MPI_Wtime();
>
> if(taskid == MASTER) {
> // initialize matrix a and matrix b
> for(i=0; i<N[0]; i++) {
> for(j=0; j<N[1]; j++) {
> a[i][j] = i+j;
> }
> }
> for(i=0; i<N[1]; i++) {
> for(j=0; j<N[2]; j++) {
> b[i][j] = i*j;
> }
> }
>
> // send matrix data to worker processes
> mtype = FROM_MASTER;
> for(dest=1; dest<numtasks; dest++) {
> MPI_Send(&a[start[dest]-1][0], row[dest]*N[1], MPI_DOUBLE,
> dest, mtype, MPI_COMM_WORLD);
> MPI_Send(&b, N[1]*N[2], MPI_DOUBLE, dest, mtype,
> MPI_COMM_WORLD);
> }
>
> }
>
> else {
> // receive matrix data from master process
> mtype = FROM_MASTER;
> MPI_Recv(&a, row[taskid]*N[1], MPI_DOUBLE, MASTER, mtype,
> MPI_COMM_WORLD, &status);
> MPI_Recv(&b, N[1]*N[2], MPI_DOUBLE, MASTER, mtype,
> MPI_COMM_WORLD, &status);
> }
>
> // common part for master and worker processes -- matrix
> calculation
> for(k=0; k<N[2]; k++) {
> for(i=0; i<row[taskid]; i++) {
> c[i][k] = 0.0;
> for(j=0; j<N[1]; j++) {
> c[i][k] = c[i][k] + a[i][j]*b[j][k];
> }
> }
> }
>
> if(taskid == MASTER) {
> // receive results from worker processes
> mtype = FROM_WORKER;
> for(source=1; source<numtasks; source++) {
> MPI_Recv(&c[start[source]-1][0], row[source]*N[2],
> MPI_DOUBLE,
> source, mtype, MPI_COMM_WORLD, &status);
> }
> }
> else {
> // send results to master process
> mtype = FROM_WORKER;
> MPI_Send(&c, row[taskid]*N[2], MPI_DOUBLE, MASTER, mtype,
> MPI_COMM_WORLD);
> }
>
> etime = MPI_Wtime();
>
> if(taskid == MASTER) {
> printf("CPU time : %f\n", etime-stime);
>
> for(i=0; i<N[0]; i++) {
> for(j=0; j<N[2]; j++) {
> printf("%.2f ", c[i][j]);
> }
> printf("\n");
> }
> }
>
> free(start);
>
> free(end);
>
> free(row);
>
> MPI_Finalize();
>
> for(i=0; i<N[0]; i++) {
> free(a[i]);
> }
> free(a);
>
> for(i=0; i<N[1]; i++) {
> free(b[i]);
> }
> free(b);
>
> for(i=0; i<N[0]; i++) {
> free(c[i]);
> }
> free(c);
>
> return 0;
> }
> -----------------------------------------------------------------------
> ----
> dimension.txt:
>
> 3
> 3
> 3
> -----------------------------------------------------------------------
> ----
> start.txt:
>
> 1
> 3
> -----------------------------------------------------------------------
> ----
> end.txt:
>
> 2
> 3
> -----------------------------------------------------------------------
> ----
> error message:
>
> $ mpirun -f -v -np 2 matrix
> 2264 matrix running on n0 (o)
> 8142 matrix running on n1
> Rank (0, MPI_COMM_WORLD): Call stack within LAM:
> Rank (0, MPI_COMM_WORLD): - MPI_Recv()
> Rank (0, MPI_COMM_WORLD): - main()
> MPI_Recv: process in local group is dead (rank 0, MPI_COMM_WORLD)
> -----------------------------------------------------------------------
> ----
> One of the processes started by mpirun has exited with a nonzero exit
> code. This typically indicates that the process finished in error.
> If your process did not finish in error, be sure to include a "return
> 0" or "exit(0)" in your C code before exiting the application.
>
> PID 8142 failed on node n1 (169.237.108.72) due to signal 11.
> -----------------------------------------------------------------------
> ----
>
> Yucheng
>
>
>
>
>
>
>
>
>
>
>
> _______________________________________________
> This list is archived at http://www.lam-mpi.org/MailArchives/lam/
>
--
{+} Jeff Squyres
{+} jsquyres_at_[hidden]
{+} http://www.lam-mpi.org/
|