Background information
What version of Open MPI are you using? (e.g., v4.1.6, v5.0.1, git branch name and hash, etc.)
v4.1.7rc1
Describe how Open MPI was installed (e.g., from a source/distribution tarball, from a git clone, from an operating system distribution package, etc.)
Git clone.
If you are building/installing from a git clone, please copy-n-paste the output from git submodule status
.
Please describe the system on which you are running
- Operating system/version: Ubuntu 24.04.2 LTS (Noble Numbat), Kernel 6.8.0-57-generic
- Computer hardware:
- Network type: IPoIB
Details of the problem
With the following reproducer using ucc_perftest:
$ srun -A admin -p admin -N64 --mpi=pmix --ntasks-per-node=8 --container-image=<container-image> env UCX_TLS=self,tcp ucc_perftest -c alltoall -m host -b 1048576 -e 2147483648 -n 2
srun: job 357102 queued and waiting for resources
srun: job 357102 has been allocated resources
[1744257072.724104] [node0190:1259783:0] sock.c:334 UCX ERROR connect(fd=252, dest_addr=<ip address>) failed: Connection timed out
[node0190.<domain>:1259783] pml_ucx.c:424 Error: ucp_ep_create(proc=496) failed: Destination is unreachable
[node0190.<domain>:1259783] pml_ucx.c:477 Error: Failed to resolve UCX endpoint for rank 496 [LOG_CAT_COMMPATTERNS] isend failed in comm_allreduce_pml at iterations 7
.................
This leads to corefiles being created on several nodes.
I have therefore been able to dig into these corefiles to understand the reason for the SEGVs, which are likely caused by a wrong/missing error path in the OMPI/UCC code, triggered when the errors above occur due to some IPoIB networking issue.
Here are my findings.
The fully unwound stack is as follows:
#0 ompi_request_default_test_all (count=2, requests=0x555555a2f228, completed=0x7fffffffc5c4, statuses=0x0) at request/req_test.c:187
#1 0x00007ffff50139ac in oob_allgather_test (req=0x555555a2f200) at coll_ucc_module.c:182
#2 0x00007ffff7f8ea5c in ucc_core_addr_exchange (context=context@entry=0x555555a2e990, oob=oob@entry=0x555555a2e9a8, addr_storage=addr_storage@entry=0x555555a2eaa0) at core/ucc_context.c:461
#3 0x00007ffff7f8f657 in ucc_context_create_proc_info (lib=0x5555559d12b0, params=params@entry=0x7fffffffc960, config=0x555555a2e840, context=context@entry=0x7ffff50213c8 <mca_coll_ucc_component+392>, proc_info=0x7ffff7fbca60 <ucc_local_proc>)
at core/ucc_context.c:723
#4 0x00007ffff7f901f0 in ucc_context_create (lib=<optimized out>, params=params@entry=0x7fffffffc960, config=<optimized out>, context=context@entry=0x7ffff50213c8 <mca_coll_ucc_component+392>) at core/ucc_context.c:866
#5 0x00007ffff5013cb1 in mca_coll_ucc_init_ctx () at coll_ucc_module.c:302
#6 0x00007ffff501583f in mca_coll_ucc_comm_query (comm=0x55555557d240 <ompi_mpi_comm_world>, priority=0x7fffffffcb6c) at coll_ucc_module.c:488
#7 0x00007ffff7ee5e4c in query_2_0_0 (module=<synthetic pointer>, priority=0x7fffffffcb6c, comm=0x55555557d240 <ompi_mpi_comm_world>, component=0x7ffff5021240 <mca_coll_ucc_component>) at base/coll_base_comm_select.c:540
#8 query (module=<synthetic pointer>, priority=0x7fffffffcb6c, comm=<optimized out>, component=0x7ffff5021240 <mca_coll_ucc_component>) at base/coll_base_comm_select.c:523
#9 check_one_component (module=<synthetic pointer>, component=0x7ffff5021240 <mca_coll_ucc_component>, comm=<optimized out>) at base/coll_base_comm_select.c:486
#10 check_components (comm=comm@entry=0x55555557d240 <ompi_mpi_comm_world>, components=<optimized out>) at base/coll_base_comm_select.c:406
#11 0x00007ffff7ee6446 in mca_coll_base_comm_select (comm=0x55555557d240 <ompi_mpi_comm_world>) at base/coll_base_comm_select.c:114
#12 0x00007ffff7f33613 in ompi_mpi_init (argc=<optimized out>, argc@entry=0, argv=<optimized out>, argv@entry=0x0, requested=0, provided=0x7fffffffcdf4, reinit_ok=reinit_ok@entry=false) at runtime/ompi_mpi_init.c:957
#13 0x00007ffff7ed6c2c in PMPI_Init (argc=0x0, argv=0x0) at pinit.c:69
#14 0x000055555555dbf4 in ucc_pt_bootstrap_mpi::ucc_pt_bootstrap_mpi() ()
#15 0x0000555555565666 in ucc_pt_comm::ucc_pt_comm(ucc_pt_comm_config) ()
#16 0x0000555555558f2a in main ()
where you can see that the unresolved symbol/frame in the previously detailed stack is in fact in oob_allgather_test().
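For context, frames #2 and #1 are UCC's out-of-band address exchange polling the req_test callback that the coll/ucc component registers at context creation. Below is a minimal sketch of that interaction, assuming the public ucc_oob_coll_t callback struct from ucc.h; addr_exchange_sketch is a hypothetical simplification of what core/ucc_context.c actually does (the real code is a state machine with error handling):

#include <ucc/api/ucc.h>

/* Simplified sketch: how the UCC address exchange drives the OOB callbacks
 * registered by mca_coll_ucc_init_ctx() (oob_allgather, oob_allgather_test,
 * oob_allgather_free). Error handling omitted for brevity. */
static ucc_status_t addr_exchange_sketch(ucc_oob_coll_t *oob,
                                         void *sbuf, void *rbuf, size_t msglen)
{
    void        *req = NULL;
    ucc_status_t status;

    oob->allgather(sbuf, rbuf, msglen, oob->coll_info, &req); /* -> oob_allgather()                */
    do {
        status = oob->req_test(req);                          /* -> oob_allgather_test(), frame #1 */
    } while (UCC_INPROGRESS == status);
    oob->req_free(req);                                       /* -> oob_allgather_free()           */
    return status;
}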
And the reason for the SEGV is the following:
(gdb) p/x *(oob_allgather_req_t *)0x555555a2f200
$1 = {sbuf = 0x555555a2ea00, rbuf = 0x555555a710c0, oob_coll_ctx = 0x55555557d240, msglen = 0x8, iter = 0x1, reqs = {0x726568, 0x555555a8fa48}}
where reqs[0] is garbage when dereferenced (note that reqs sits at offset 0x28 in the struct, i.e. at 0x555555a2f228, which matches the requests argument in frame #0):
(gdb) p/x $rip
$3 = 0x7ffff7eb39e8
(gdb) x/10i ($rip - 0x18)
0x7ffff7eb39d0 <ompi_request_default_test_all+48>: cmpq $0x1,0x58(%rax)
0x7ffff7eb39d5 <ompi_request_default_test_all+53>: je 0x7ffff7eb39f0 <ompi_request_default_test_all+80>
0x7ffff7eb39d7 <ompi_request_default_test_all+55>: lea 0x1(%r12),%rax
0x7ffff7eb39dc <ompi_request_default_test_all+60>: cmp %rax,%rdi
0x7ffff7eb39df <ompi_request_default_test_all+63>: je 0x7ffff7eb39fe <ompi_request_default_test_all+94>
0x7ffff7eb39e1 <ompi_request_default_test_all+65>: mov %rax,%r12
0x7ffff7eb39e4 <ompi_request_default_test_all+68>: mov (%rbx,%r12,8),%rax
=> 0x7ffff7eb39e8 <ompi_request_default_test_all+72>: mov 0x60(%rax),%esi
0x7ffff7eb39eb <ompi_request_default_test_all+75>: cmp $0x1,%esi
0x7ffff7eb39ee <ompi_request_default_test_all+78>: jne 0x7ffff7eb39d0 <ompi_request_default_test_all+48>
(gdb) x/gx ($rax + 0x60)
0x7265c8: Cannot access memory at address 0x7265c8
(gdb) p/x $rbx + $r12 * 0x8
$4 = 0x555555a2f228
(gdb) x/gx ($rbx + $r12 * 0x8)
0x555555a2f228: 0x0000000000726568
(gdb) p/x $rax
$5 = 0x726568
(gdb) x/gx ($rax + 0x60)
0x7265c8: Cannot access memory at address 0x7265c8
(gdb)
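That garbage value is exactly what frame #0 trips over: ompi_request_default_test_all() walks the requests array and dereferences every entry to check its state. Here is a simplified sketch of that loop; test_all_sketch is a hypothetical reduction of request/req_test.c, and the mapping of the 0x60/0x58 offsets to req_state/req_complete is my assumption for this particular build:

#include "ompi/constants.h"
#include "ompi/request/request.h"

/* Simplified sketch of the per-request check in ompi_request_default_test_all()
 * (request/req_test.c), status handling omitted. With requests pointing at
 * oob_req->reqs, the entry for reqs[0] holds the garbage value 0x726568 and
 * the first field access on it faults. */
static int test_all_sketch(size_t count, ompi_request_t **requests, int *completed)
{
    size_t num_completed = 0;

    for (size_t i = 0; i < count; i++) {
        ompi_request_t *request = requests[i];              /* reqs[0] == 0x726568                    */
        if (OMPI_REQUEST_INACTIVE == request->req_state ||  /* mov 0x60(%rax),%esi -> SEGV (assumed)  */
            REQUEST_COMPLETE(request)) {                    /* cmpq $0x1,0x58(%rax)        (assumed)  */
            num_completed++;
        }
    }
    *completed = (count == num_completed);
    return OMPI_SUCCESS;
}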
Looking at the corresponding source code in "ompi/mca/coll/ucc/coll_ucc_module.c":
141
142 typedef struct oob_allgather_req{
143 void *sbuf;
144 void *rbuf;
145 void *oob_coll_ctx;
146 size_t msglen;
147 int iter;
148 ompi_request_t *reqs[2];
149 } oob_allgather_req_t;
150
151 static ucc_status_t oob_allgather_test(void *req)
152 {
153 oob_allgather_req_t *oob_req = (oob_allgather_req_t*)req;
154 ompi_communicator_t *comm = (ompi_communicator_t *)oob_req->oob_coll_ctx;
155 char *tmpsend = NULL;
156 char *tmprecv = NULL;
157 size_t msglen = oob_req->msglen;
158 int probe_count = 5;
159 int rank, size, sendto, recvfrom, recvdatafrom,
160 senddatafrom, completed, probe;
161
162 size = ompi_comm_size(comm);
163 rank = ompi_comm_rank(comm);
164 if (oob_req->iter == 0) {
165 tmprecv = (char*) oob_req->rbuf + (ptrdiff_t)rank * (ptrdiff_t)msglen;
166 memcpy(tmprecv, oob_req->sbuf, msglen);
167 }
168 sendto = (rank + 1) % size;
169 recvfrom = (rank - 1 + size) % size;
170 for (; oob_req->iter < size - 1; oob_req->iter++) {
171 if (oob_req->iter > 0) { <<<< iter is 0 for 1st loop ...
172 probe = 0;
173 do {
174 ompi_request_test_all(2, oob_req->reqs, &completed, MPI_STATUS_IGNORE);
<<<<<< during the 2nd loop iteration (iter == 1), ompi_request_test_all() is called with a garbled reqs[0] !!
175 probe++;
176 } while (!completed && probe < probe_count);
177 if (!completed) {
178 return UCC_INPROGRESS;
179 }
180 }
181 recvdatafrom = (rank - oob_req->iter - 1 + size) % size;
182 senddatafrom = (rank - oob_req->iter + size) % size;
183 tmprecv = (char*)oob_req->rbuf + (ptrdiff_t)recvdatafrom * (ptrdiff_t)msglen;
184 tmpsend = (char*)oob_req->rbuf + (ptrdiff_t)senddatafrom * (ptrdiff_t)msglen;
185 MCA_PML_CALL(isend(tmpsend, msglen, MPI_BYTE, sendto, MCA_COLL_BASE_TAG_UCC,
186 MCA_PML_BASE_SEND_STANDARD, comm, &oob_req->reqs[0]));
<<<<<< isend triggers an error, so reqs[0] is never populated !!
187 MCA_PML_CALL(irecv(tmprecv, msglen, MPI_BYTE, recvfrom,
188 MCA_COLL_BASE_TAG_UCC, comm, &oob_req->reqs[1]));
<<<<<< irecv does not report an error, so reqs[1] is populated.
189 }
190 probe = 0;
191 do {
192 ompi_request_test_all(2, oob_req->reqs, &completed, MPI_STATUS_IGNORE);
193 probe++;
194 } while (!completed && probe < probe_count);
195 if (!completed) {
196 return UCC_INPROGRESS;
197 }
198 return UCC_OK;
199 }
200
201 static ucc_status_t oob_allgather_free(void *req)
202 {
203 free(req);
204 return UCC_OK;
205 }
206
207 static ucc_status_t oob_allgather(void *sbuf, void *rbuf, size_t msglen,
208 void *oob_coll_ctx, void **req)
209 {
210 oob_allgather_req_t *oob_req = malloc(sizeof(*oob_req));
211 oob_req->sbuf = sbuf;
212 oob_req->rbuf = rbuf;
213 oob_req->msglen = msglen;
214 oob_req->oob_coll_ctx = oob_coll_ctx;
215 oob_req->iter = 0;
216 *req = oob_req;
217 return UCC_OK;
218 }
219
"ompi/mca/coll/ucc/coll_ucc_module.c" 528 lines --41%-- 219,0-1 37%
and, just to be complete, in "ompi/request/request.h":
#define ompi_request_test_all (ompi_request_functions.req_test_all)
(gdb) x/i ompi_request_functions.req_test_all
0x7ffff7eb39a0 <ompi_request_default_test_all>: endbr64
Based on all of this, it appears that the following patch (against v4.1.7rc1, the fairly recent OMPI version we are running) would keep OMPI/UCC from dumping core, by gracefully handling any error returned by isend/irecv:
~/ompi$ git status
HEAD detached at v4.1.7rc1
Changes not staged for commit:
(use "git add <file>..." to update what will be committed)
(use "git restore <file>..." to discard changes in working directory)
modified: ompi/mca/coll/ucc/coll_ucc_module.c
no changes added to commit (use "git add" and/or "git commit -a")
~/ompi$ git diff
diff --git a/ompi/mca/coll/ucc/coll_ucc_module.c b/ompi/mca/coll/ucc/coll_ucc_module.c
index 1686697618..dfa2674a3d 100644
--- a/ompi/mca/coll/ucc/coll_ucc_module.c
+++ b/ompi/mca/coll/ucc/coll_ucc_module.c
@@ -158,6 +158,7 @@ static ucc_status_t oob_allgather_test(void *req)
int probe_count = 5;
int rank, size, sendto, recvfrom, recvdatafrom,
senddatafrom, completed, probe;
+ int rc;
size = ompi_comm_size(comm);
rank = ompi_comm_rank(comm);
@@ -182,10 +183,12 @@ static ucc_status_t oob_allgather_test(void *req)
senddatafrom = (rank - oob_req->iter + size) % size;
tmprecv = (char*)oob_req->rbuf + (ptrdiff_t)recvdatafrom * (ptrdiff_t)msglen;
tmpsend = (char*)oob_req->rbuf + (ptrdiff_t)senddatafrom * (ptrdiff_t)msglen;
- MCA_PML_CALL(isend(tmpsend, msglen, MPI_BYTE, sendto, MCA_COLL_BASE_TAG_UCC,
+ rc = MCA_PML_CALL(isend(tmpsend, msglen, MPI_BYTE, sendto, MCA_COLL_BASE_TAG_UCC,
MCA_PML_BASE_SEND_STANDARD, comm, &oob_req->reqs[0]));
- MCA_PML_CALL(irecv(tmprecv, msglen, MPI_BYTE, recvfrom,
+ if (OMPI_SUCCESS != rc) return rc;
+ rc = MCA_PML_CALL(irecv(tmprecv, msglen, MPI_BYTE, recvfrom,
MCA_COLL_BASE_TAG_UCC, comm, &oob_req->reqs[1]));
+ if (OMPI_SUCCESS != rc) return rc;
}
probe = 0;
do {
@@ -213,6 +216,8 @@ static ucc_status_t oob_allgather(void *sbuf, void *rbuf, size_t msglen,
oob_req->msglen = msglen;
oob_req->oob_coll_ctx = oob_coll_ctx;
oob_req->iter = 0;
+ oob_req->reqs[0] = NULL;
+ oob_req->reqs[1] = NULL;
*req = oob_req;
return UCC_OK;
}
~/ompi$
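One refinement worth considering: oob_allgather_test() returns a ucc_status_t, while MCA_PML_CALL() yields an OMPI error code, so returning rc directly hands an OMPI value back to UCC. Below is a minimal sketch of the same guard with an explicit mapping, as a drop-in for the isend/irecv block inside oob_allgather_test(); choosing UCC_ERR_NO_MESSAGE as the mapped status is my assumption, any ucc_status_t error that the caller treats as fatal would do:

    rc = MCA_PML_CALL(isend(tmpsend, msglen, MPI_BYTE, sendto, MCA_COLL_BASE_TAG_UCC,
                            MCA_PML_BASE_SEND_STANDARD, comm, &oob_req->reqs[0]));
    if (OMPI_SUCCESS != rc) {
        return UCC_ERR_NO_MESSAGE;   /* map the OMPI failure to a UCC error status */
    }
    rc = MCA_PML_CALL(irecv(tmprecv, msglen, MPI_BYTE, recvfrom,
                            MCA_COLL_BASE_TAG_UCC, comm, &oob_req->reqs[1]));
    if (OMPI_SUCCESS != rc) {
        return UCC_ERR_NO_MESSAGE;
    }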