I have a problem which runs nicely with OpenMPI, LAM 7.0.5 and
Sun's LAM 6.3-b3, but fails on LAM 7.1.3. This doesn't rule out the
possibility that a programming error is involved, but it does make LAM
7.1.3 somewhat more suspect. In any case, I have been chasing this
problem for about a week now and would appreciate some help. The
problem concerns my subroutine AC_multiply, which in its present
configuration has the form:
**************************************************************************
subroutine AC_multiply(X, rhs)
! ... matrix-vector multiply: A*X=rhs
! ... Find rhs=A*X, where A is the partitioned matrix.
! ... Each process corresponds to a partition of matrix A.
! ...
! ... argument list
! ...
real(kind=kv), dimension(:), intent(in) :: X
real(kind=kv), dimension(:), intent(out) :: rhs
! ...
! ... local variables
! ...
integer :: i, ierr
integer :: status(MPI_STATUS_SIZE)
! ..............................................................
stride=nx; reach=ny; top=nz
! ...
! ... Nonblocking sends; do not function correctly with LAM 7.1.3?
! ...
if (no_partitions>1) call vec_isend(X, part_intracomm, 1, 1, 1)
! ...
! ... Multiply partition by vector X corresponding to partition nodes.
! ...
call a_multiply(X, rhs)
! ...
! ... Receive X vector info from black partitions
! ...
if (no_partitions>1) then
call vec_fnc_recv(recv_prod_1, rhs, part_intracomm, 1, 1, 1)
! ...
do i=1, 6
call MPI_WAIT(request(i), status, ierr)
enddo
endif
! ...
end subroutine AC_multiply
***************************************************************************
As the name implies, this is a vector/matrix multiply subroutine where
the matrix has been partitioned into blocks based on a CCFD
representation. The interior of the blocks, where the bulk of the
work is done, uses a general-purpose vector/matrix multiply scheme for
compressed diagonal storage: subroutine a_multiply(X, rhs). The
various block vector/matrix multiplies obtained from subroutine
a_multiply are tied together by passing the appropriate sections of
the X vector to the processes representing surrounding blocks; this is
done with the call to subroutine vec_isend, where up to six X-vector
sections can be passed to the processes representing surrounding
blocks. The current process also looks to receive up to six X-vector
sections from surrounding blocks (processes); this is done in
subroutine vec_fnc_recv, whereupon these sections are multiplied by
the appropriate coefficients and added to the RHS vector. In this
manner, a complete block-by-block vector/matrix multiply is carried
out. I have chosen to use non-blocking sends (MPI_ISEND) to send the
small X-vector sections in subroutine vec_isend; this seems logical,
as blocking sends could lead to a deadlock. The integer request flags
associated with the non-blocking sends are cleared with MPI_WAIT
subsequent to the call to subroutine vec_fnc_recv. When an X-vector
section is received in subroutine vec_fnc_recv, the sending process is
first identified; knowing the sending process allows the receiving
process to form the appropriate addition to the RHS vector.
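To make the intended pattern concrete, here is a minimal,
self-contained sketch of the kind of non-blocking exchange I have in
mind (a hypothetical ring exchange between neighboring ranks, not the
actual vec_isend/vec_fnc_recv pair; the buffer size and tag are made
up for the example):
**************************************************************************
program isend_wait_sketch
! ... Sketch only: post MPI_ISEND to one neighbor, receive from the
! ... other, then clear the request with MPI_WAIT.
implicit none
include 'mpif.h'
integer, parameter :: n=4
integer :: ierr, my_rank, nprocs, right, left, request
integer :: status(MPI_STATUS_SIZE)
double precision :: sendbuf(n), recvbuf(n)
! ...
call MPI_INIT(ierr)
call MPI_COMM_RANK(MPI_COMM_WORLD, my_rank, ierr)
call MPI_COMM_SIZE(MPI_COMM_WORLD, nprocs, ierr)
right=mod(my_rank+1, nprocs); left=mod(my_rank-1+nprocs, nprocs)
sendbuf=dble(my_rank)
! ... Non-blocking send; the send buffer stays intact until MPI_WAIT.
call MPI_ISEND(sendbuf, n, MPI_DOUBLE_PRECISION, right, n, &
MPI_COMM_WORLD, request, ierr)
! ... Blocking receive from the other neighbor; no deadlock, since the
! ... matching send has already been posted without blocking.
call MPI_RECV(recvbuf, n, MPI_DOUBLE_PRECISION, left, n, &
MPI_COMM_WORLD, status, ierr)
call MPI_WAIT(request, status, ierr)
call MPI_FINALIZE(ierr)
end program isend_wait_sketch
**************************************************************************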
My problem is that the X-vector sections received are not always the
correct X-vector sections. This could be a problem with the receive
construct:
*****************************************************************************
subroutine vec_fnc_recv(load_fnc, r_vec, communicator, lag, start, step)
! ... Receive R_value array corresponding to entries in C_{i,j};
! ... apply load_fnc to R_value and add to R.
! ... step=1: accept R_value array from all adjoining partitions.
! ... step=2: accept R_value array from alternate partitions
! ... start=1: lower faces or all.
! ... start=2: upper faces, only step=2
! ... if (communicator==cp_intracomm) lag=0
! ... if (communicator==part_intracomm) lag=1
! ... R, R_value and part_con passed via module
! ... i_val: upon return, has value of 1; debugging purposes only
! ...
! ... load_fnc specified by calling program;
! ... see RECEIVE FUNCTIONS below.
! ...
! ... argument list
! ...
integer, external :: load_fnc
integer, intent(in) :: communicator, lag, start, step
real(kind=kv), dimension(:), intent(inout), target :: r_vec
! ...
! ... local variables
! ...
integer :: ierr, error
integer :: i, j, tag, sender, process_no, i_val
integer :: status(MPI_STATUS_SIZE)
character(len=64) :: err_loc_message
! ...
! ...................................................................
! ...
nullify (part_con, R_value)
R=>r_vec
ierr=0; i_val=-1
err_loc_message='SR_utilities_child vec_fnc_recv 1'
do i=start,6,step
process_no=p_adj(i)
! ... i indicates partition face
if (process_no<1) cycle
CALL MPI_PROBE(MPI_ANY_SOURCE, MPI_ANY_TAG, communicator, &
status, ierr)
sender=status(MPI_SOURCE)
tag=status(MPI_TAG)
if (tag==0) then
! ... no content in array
call MPI_RECV(MPI_BOTTOM, 0, MPI_INTEGER, sender, &
tag, communicator, status, ierr)
write(unit=interim_print,fmt=*) "child: SR_utilities p=", &
process_id," LOOP i=", i," sender=",sender," empty"
else
! ... Size of array R_value returned in tag.
allocate(R_value(1:tag), stat=error)
call MPI_RECV(R_value, tag, MPI_DOUBLE_PRECISION, sender, &
tag, communicator, status, ierr)
call error_class(communicator, ierr, err_loc_message)
! ...
! ... Add R_value to R using load_fnc
! ...
do j=start,6,step
if (p_adj(j)/=sender+lag) cycle
! ... j indicates partition face
Select case(j)
case(1)
part_con=>part_con_1l
i_val=load_fnc(index_1, tag)
case(2)
part_con=>part_con_1u
i_val=load_fnc(index_2, tag)
case(3)
part_con=>part_con_2l
i_val=load_fnc(index_3, tag)
case(4)
part_con=>part_con_2u
i_val=load_fnc(index_4, tag)
case(5)
part_con=>part_con_3l
i_val=load_fnc(index_5, tag)
case(6)
part_con=>part_con_3u
i_val=load_fnc(index_6, tag)
end select
nullify (part_con)
exit
enddo
endif
! ...
if (associated(R_value)) deallocate (R_value)
! ...
enddo
nullify (R)
! ...
end subroutine vec_fnc_recv
function recv_prod_1(index_fn, loop_size)
! ... index_fn: function for cell face ordering indicator
integer :: recv_prod_1
integer, intent(in) :: loop_size
integer, external :: index_fn
integer :: i, ii
! ...
recv_prod_1=0
do i=1,loop_size
ii=index_fn(i)
R(ii)=R(ii)+part_con(i)*R_value(i)
enddo
recv_prod_1=1
end function recv_prod_1
****************************************************************************
Through the array p_adj(i), each process knows the identification of
its neighboring processes (blocks). By checking
sender=status(MPI_SOURCE) against the values entered in p_adj(i) (give
or take an adjustment for communicator differences), the originating
process of the incoming X-array section is identified. What has
occurred to me is that there is some incompatibility between using a
non-blocking send and using MPI_PROBE to identify the source process
on the receiving end. It is evident from my debugging that the array
received with the call to MPI_RECV(R_value ...) is not identical to
the array sent by that process, even though status(MPI_SOURCE) has
identified the source process; the probe-then-receive construct,
reduced to its essentials, is sketched below. So either I have a
logic error of some sort at this point, or LAM 7.1.3 has a problem.
It should be noted that I also have a blocking version of subroutine
vec_isend (vec_send), which uses standard MPI_SEND calls and works
perfectly in this case. I could use this version to send the X-array
sections in the AC_multiply subroutine above, but that puts me in the
uncomfortable position of starting all processes with blocking sends,
which could lead to a deadlock. In this particular case, I haven't
actually seen a deadlock resulting from the use of blocking sends, but
I am enough of a newcomer to be wary. The subroutine vec_isend
follows; I am attaching, in separate files, information concerning the
LAM installation and running of the code. The entire code is tarred
(tar czf ../vec_mat_test.gz *) and attached to this message.
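The probe-then-receive construct, with hypothetical names and stripped
of the face bookkeeping (this is not the production vec_fnc_recv), is
essentially:
*****************************************************************************
subroutine probe_recv_sketch(communicator)
! ... Sketch only: the source and tag reported by MPI_PROBE are passed
! ... explicitly to MPI_RECV so the receive matches the probed message;
! ... the message length rides in the tag, as in vec_fnc_recv.
implicit none
include 'mpif.h'
integer, intent(in) :: communicator
integer :: ierr, error, sender, tag
integer :: status(MPI_STATUS_SIZE)
double precision, allocatable :: buf(:)
! ...
call MPI_PROBE(MPI_ANY_SOURCE, MPI_ANY_TAG, communicator, status, ierr)
sender=status(MPI_SOURCE)
tag=status(MPI_TAG)
if (tag>0) then
allocate(buf(1:tag), stat=error)
call MPI_RECV(buf, tag, MPI_DOUBLE_PRECISION, sender, tag, &
communicator, status, ierr)
deallocate(buf)
else
! ... zero-length message: no content in array
call MPI_RECV(MPI_BOTTOM, 0, MPI_INTEGER, sender, tag, &
communicator, status, ierr)
endif
end subroutine probe_recv_sketch
*****************************************************************************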
-- Rich Naff
****************************************************************************
subroutine vec_isend(X, communicator, lag, start, step)
! ... Send segment of X array, corresponding to C_{i,j} connectors,
! ... to surrounding partitions.
! ... nonblocking version
! ...
! ... argument list
! ...
integer, intent(in) :: communicator, lag, start, step
real(kind=kv), dimension(:), intent(in) :: X
! ...
! ... local variables
! ...
integer :: ierr, error
integer :: i, process_no
integer :: status(MPI_STATUS_SIZE)
real(kind=kv), dimension(:), pointer :: X_value
character(len=64) :: err_loc_message
! ...
! ...................................................................
! ...
ierr=0; request=MPI_REQUEST_NULL
do i=start,6,step
process_no=p_adj(i)
! ... i indicates partition face
if (process_no<1) cycle
! ...
! ... Array X_value filled in subroutine load_send.
! ...
Select case(i)
case(1)
allocate(X_value(1:CC_size(1)), stat=error)
call load_send(index_1, CC_size(1))
call MPI_ISEND(X_value, CC_size(1), MPI_DOUBLE_PRECISION, &
process_no-lag, CC_size(1), communicator, request(1), ierr)
err_loc_message='SR_utilities_child vec_isend MPI_ISEND 1'
call error_class(communicator, ierr, err_loc_message)
case(2)
allocate(X_value(1:CC_size(1)), stat=error)
call load_send(index_2, CC_size(1))
call MPI_ISEND(X_value, CC_size(1), MPI_DOUBLE_PRECISION, &
process_no-lag, CC_size(1), communicator, request(2), ierr)
err_loc_message='SR_utilities_child vec_isend MPI_ISEND 2'
call error_class(communicator, ierr, err_loc_message)
case(3)
allocate(X_value(1:CC_size(2)), stat=error)
call load_send(index_3, CC_size(2))
call MPI_ISEND(X_value, CC_size(2), MPI_DOUBLE_PRECISION, &
process_no-lag, CC_size(2), communicator, request(3), ierr)
err_loc_message='SR_utilities_child vec_isend MPI_ISEND 3'
call error_class(communicator, ierr, err_loc_message)
case(4)
allocate(X_value(1:CC_size(2)), stat=error)
call load_send(index_4, CC_size(2))
call MPI_ISEND(X_value, CC_size(2), MPI_DOUBLE_PRECISION, &
process_no-lag, CC_size(2), communicator, request(4), ierr)
err_loc_message='SR_utilities_child vec_isend MPI_ISEND 4'
call error_class(communicator, ierr, err_loc_message)
case(5)
allocate(X_value(1:CC_size(3)), stat=error)
call load_send(index_5, CC_size(3))
call MPI_ISEND(X_value, CC_size(3), MPI_DOUBLE_PRECISION, &
process_no-lag, CC_size(3), communicator, request(5), ierr)
err_loc_message='SR_utilities_child vec_isend MPI_ISEND 5'
call error_class(communicator, ierr, err_loc_message)
case(6)
allocate(X_value(1:CC_size(3)), stat=error)
call load_send(index_6, CC_size(3))
call MPI_ISEND(X_value, CC_size(3), MPI_DOUBLE_PRECISION, &
process_no-lag, CC_size(3), communicator, request(6), ierr)
err_loc_message='SR_utilities_child vec_isend MPI_ISEND 6'
call error_class(communicator, ierr, err_loc_message)
end select
! ...
deallocate (X_value)
! ...
enddo
! ...
contains
subroutine load_send(index_fn, loop_size)
! ... Extract segment from X array; store in X_value array.
! ... index_fn: function for cell face ordering indicator
integer, intent(in) :: loop_size
integer, external :: index_fn
integer :: i
! ...
do i=1, loop_size
X_value(i)=X(index_fn(i))
enddo
end subroutine load_send
end subroutine vec_isend
*****************************************************************************
--
Rich Naff
U.S. Geological Survey
MS 413 Box 25046
Denver, CO 80225 USA
telephone: (303) 236-4986
Email: rlnaff_at_[hidden]