Skip to content

Commit a4bfac9

Browse files
Revoke coll/han subcomms
Signed-off-by: Matthew Whitlock <mwhitlo@sandia.gov>
1 parent 41a68b5 commit a4bfac9

File tree

9 files changed

+100
-0
lines changed

9 files changed

+100
-0
lines changed

ompi/communicator/ft/comm_ft_revoke.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,8 @@ static int ompi_comm_revoke_local(ompi_communicator_t* comm, ompi_comm_rbcast_me
9191
comm->any_source_enabled = false;
9292
/* purge the communicator unexpected fragments and matching logic */
9393
MCA_PML_CALL(revoke_comm(comm, false));
94+
/* revoke any subcomms created by coll */
95+
comm->c_coll->coll_revoke_local(comm);
9496
/* Signal the point-to-point stack to recheck requests */
9597
wait_sync_global_wakeup(MPI_ERR_REVOKED);
9698
return true;

ompi/mca/coll/base/Makefile.am

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ libmca_coll_la_SOURCES += \
3131
base/coll_base_comm_unselect.c \
3232
base/coll_base_find_available.c \
3333
base/coll_base_frame.c \
34+
base/coll_base_revoke_local.c \
3435
base/coll_base_bcast.c \
3536
base/coll_base_scatter.c \
3637
base/coll_base_topo.c \

ompi/mca/coll/base/coll_base_comm_select.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@
4949
#include "ompi/mca/coll/coll.h"
5050
#include "ompi/mca/coll/base/base.h"
5151
#include "ompi/mca/coll/base/coll_base_util.h"
52+
#include "ompi/mca/coll/base/coll_base_functions.h"
5253

5354
/*
5455
* Stuff for the OBJ interface
@@ -227,6 +228,7 @@ int mca_coll_base_comm_select(ompi_communicator_t * comm)
227228
/* Initialize all the relevant pointers, since they're used as
228229
* sentinel values */
229230
comm->c_coll = (mca_coll_base_comm_coll_t*)calloc(1, sizeof(mca_coll_base_comm_coll_t));
231+
comm->c_coll->coll_revoke_local = mca_coll_base_revoke_local;
230232

231233
opal_output_verbose(10, ompi_coll_base_framework.framework_output,
232234
"coll:base:comm_select: Checking all available modules");

ompi/mca/coll/base/coll_base_functions.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -306,6 +306,8 @@ int mca_coll_base_reduce_local(const void *inbuf, void *inoutbuf, size_t count,
306306
struct ompi_datatype_t * dtype, struct ompi_op_t * op,
307307
mca_coll_base_module_t *module);
308308

309+
int mca_coll_base_revoke_local(struct ompi_communicator_t *comm);
310+
309311
#if OPAL_ENABLE_FT_MPI
310312
/* Agreement */
311313
int ompi_coll_base_agree_noft(void *contrib,
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
2+
/*
3+
* Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
4+
* University Research and Technology
5+
* Corporation. All rights reserved.
6+
* Copyright (c) 2004-2017 The University of Tennessee and The University
7+
* of Tennessee Research Foundation. All rights
8+
* reserved.
9+
* Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
10+
* University of Stuttgart. All rights reserved.
11+
* Copyright (c) 2004-2005 The Regents of the University of California.
12+
* All rights reserved.
13+
* Copyright (c) 2013 Los Alamos National Security, LLC. All Rights
14+
* reserved.
15+
* Copyright (c) 2015-2016 Research Organization for Information Science
16+
* and Technology (RIST). All rights reserved.
17+
* Copyright (c) 2016-2017 IBM Corporation. All rights reserved.
18+
* Copyright (c) 2018 Siberian State University of Telecommunications
19+
* and Information Science. All rights reserved.
20+
* Copyright (c) 2022 Cisco Systems, Inc. All rights reserved.
21+
* Copyright (c) 2024 Stony Brook University. All rights reserved.
22+
* $COPYRIGHT$
23+
*
24+
* Additional copyrights may follow
25+
*
26+
* $HEADER$
27+
*/
28+
29+
#include "ompi_config.h"
30+
31+
#include "mpi.h"
32+
#include "opal/class/opal_list.h"
33+
#include "ompi/communicator/communicator.h"
34+
#include "ompi/mca/coll/base/coll_base_functions.h"
35+
#include "ompi/mca/coll/base/coll_base_util.h"
36+
37+
int mca_coll_base_revoke_local(ompi_communicator_t* comm){
38+
// Called on each initialized component, to give each the opportunity to
39+
// revoke any subcomms
40+
mca_coll_base_avail_coll_t* avail;
41+
OPAL_LIST_FOREACH(avail, comm->c_coll->module_list, mca_coll_base_avail_coll_t){
42+
if(NULL == avail->ac_module) continue;
43+
if(NULL == avail->ac_module->coll_revoke_local) continue;
44+
avail->ac_module->coll_revoke_local(comm, avail->ac_module);
45+
}
46+
return OMPI_SUCCESS;
47+
}

ompi/mca/coll/coll.h

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -484,6 +484,19 @@ typedef int (*mca_coll_base_module_reduce_local_fn_t)
484484
struct ompi_datatype_t * dtype, struct ompi_op_t * op,
485485
struct mca_coll_base_module_3_0_0_t *module);
486486

487+
/*
488+
* revoke_local
489+
* Even though this is not a collective operation, it is related to the
490+
* collectives. Adding to the framework allows a collective component the
491+
* option of intercepting it to, e.g., also revoke sub-communicators
492+
*/
493+
typedef int (*mca_coll_base_module_revoke_local_fn_t)
494+
(struct ompi_communicator_t* comm,
495+
struct mca_coll_base_module_3_0_0_t *module);
496+
/* revoke applies to all coll modules, so the comm's function differs */
497+
typedef int (*mca_coll_base_comm_revoke_local_fn_t)
498+
(struct ompi_communicator_t* comm);
499+
487500

488501
/* ******************************************************************** */
489502

@@ -627,6 +640,8 @@ struct mca_coll_base_module_3_0_0_t {
627640

628641
mca_coll_base_module_reduce_local_fn_t coll_reduce_local;
629642

643+
mca_coll_base_module_revoke_local_fn_t coll_revoke_local;
644+
630645
/** Data storage for all the algorithms defined in the base. Should
631646
not be used by other modules */
632647
struct mca_coll_base_comm_t* base_data;
@@ -802,6 +817,8 @@ struct mca_coll_base_comm_coll_t {
802817
mca_coll_base_module_iagree_fn_t coll_iagree;
803818
mca_coll_base_module_3_0_0_t *coll_iagree_module;
804819

820+
mca_coll_base_comm_revoke_local_fn_t coll_revoke_local;
821+
805822
/* List of modules initialized, queried and enabled */
806823
opal_list_t *module_list;
807824
};

ompi/mca/coll/han/coll_han.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -551,6 +551,9 @@ mca_coll_han_scatter_intra_dynamic(SCATTER_BASE_ARGS,
551551
int
552552
mca_coll_han_scatterv_intra_dynamic(SCATTERV_BASE_ARGS,
553553
mca_coll_base_module_t *module);
554+
int
555+
mca_coll_han_revoke_local(struct ompi_communicator_t *comm,
556+
mca_coll_base_module_t *module);
554557

555558
int mca_coll_han_barrier_intra_simple(struct ompi_communicator_t *comm,
556559
mca_coll_base_module_t *module);

ompi/mca/coll/han/coll_han_module.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,8 @@ static void mca_coll_han_module_construct(mca_coll_han_module_t * module)
9292
module->dynamic_errors = 0;
9393

9494
han_module_clear(module);
95+
96+
module->super.coll_revoke_local = mca_coll_han_revoke_local;
9597
}
9698

9799
/*
@@ -257,6 +259,7 @@ mca_coll_han_comm_query(struct ompi_communicator_t * comm, int *priority)
257259
/* We are on a topologic sub-communicator, return only the selector */
258260
han_module->super.coll_allgatherv = mca_coll_han_allgatherv_intra_dynamic;
259261
}
262+
han_module->super.coll_revoke_local = mca_coll_han_revoke_local;
260263

261264
opal_output_verbose(10, ompi_coll_base_framework.framework_output,
262265
"coll:han:comm_query (%s/%s): pick me! pick me!",

ompi/mca/coll/han/coll_han_subcomms.c

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -414,3 +414,26 @@ int mca_coll_han_comm_create(struct ompi_communicator_t *comm,
414414
OBJ_DESTRUCT(&comm_info);
415415
return OMPI_SUCCESS;
416416
}
417+
418+
int mca_coll_han_revoke_local(ompi_communicator_t *comm,
419+
mca_coll_base_module_t *module)
420+
{
421+
mca_coll_han_module_t *han_module = (mca_coll_han_module_t*) module;
422+
for(int i = 0; i < NB_TOPO_LVL; i++){
423+
if(NULL == han_module->sub_comm[i]) continue;
424+
ompi_comm_revoke_internal(han_module->sub_comm[i]);
425+
}
426+
if(han_module->cached_low_comms != NULL){
427+
for(int i = 0; i < COLL_HAN_LOW_MODULES; i++){
428+
if(NULL == han_module->cached_low_comms[i]) continue;
429+
ompi_comm_revoke_internal(han_module->cached_low_comms[i]);
430+
}
431+
}
432+
if(han_module->cached_up_comms != NULL){
433+
for(int i = 0; i < COLL_HAN_LOW_MODULES; i++){
434+
if(NULL == han_module->cached_low_comms[i]) continue;
435+
ompi_comm_revoke_internal(han_module->cached_low_comms[i]);
436+
}
437+
}
438+
return MPI_SUCCESS;
439+
}

0 commit comments

Comments
 (0)