Skip to content

Fix Libfabric MR caching issues #13327

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Jul 14, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion ompi/mca/mtl/ofi/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
# and Technology (RIST). All rights reserved.
# Copyright (c) 2020 Triad National Security, LLC. All rights
# reserved.
# Copyright (c) 2022 Amazon.com, Inc. or its affiliates. All Rights reserved.
# Copyright (c) 2022-2025 Amazon.com, Inc. or its affiliates. All Rights reserved.
# Copyright (c) 2025 Jeffrey M. Squyres. All rights reserved.
# $COPYRIGHT$
#
Expand Down Expand Up @@ -48,6 +48,7 @@ mtl_ofi_sources = \
mtl_ofi_component.c \
mtl_ofi_endpoint.h \
mtl_ofi_endpoint.c \
mtl_ofi_mr.c \
mtl_ofi_request.h \
mtl_ofi_types.h \
mtl_ofi_opt.h \
Expand Down
80 changes: 20 additions & 60 deletions ompi/mca/mtl/ofi/mtl_ofi.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,7 @@
* reserved.
* Copyright (c) 2019-2024 Triad National Security, LLC. All rights
* reserved.
* Copyright (c) 2018-2023 Amazon.com, Inc. or its affiliates. All Rights reserved.
* reserved.
* Copyright (c) 2018-2025 Amazon.com, Inc. or its affiliates. All Rights reserved.
* Copyright (c) 2021 Cisco Systems, Inc. All rights reserved
* Copyright (c) 2021 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
Expand Down Expand Up @@ -73,6 +72,8 @@ extern int ompi_mtl_ofi_del_comm(struct mca_mtl_base_module_t *mtl,

int ompi_mtl_ofi_progress_no_inline(void);

/**
 * Create the registration cache stored in ompi_mtl_ofi.rcache.
 *
 * Does nothing if the cache already exists.
 *
 * @return OMPI_SUCCESS if the cache exists on return, OMPI_ERROR if it
 *         could not be created.
 */
int ompi_mtl_ofi_rcache_init(void);

#if OPAL_HAVE_THREAD_LOCAL
extern opal_thread_local int ompi_mtl_ofi_per_thread_ctx;
#endif
Expand Down Expand Up @@ -291,78 +292,37 @@ ompi_mtl_ofi_set_mr_null(ompi_mtl_ofi_request_t *ofi_req) {
static
int ompi_mtl_ofi_register_buffer(struct opal_convertor_t *convertor,
ompi_mtl_ofi_request_t *ofi_req,
void* buffer) {
void* buffer)
{
int ret;
uint32_t cache_flags = 0;

ofi_req->mr = NULL;
if (ofi_req->length <= 0 || NULL == buffer) {
return OMPI_SUCCESS;
}

#if OPAL_OFI_HAVE_FI_MR_IFACE

if ((convertor->flags & CONVERTOR_ACCELERATOR) && ompi_mtl_ofi.hmem_needs_reg) {
/* Register buffer */
int ret;
struct fi_mr_attr attr = {0};
struct iovec iov = {0};

iov.iov_base = buffer;
iov.iov_len = ofi_req->length;
attr.mr_iov = &iov;
attr.iov_count = 1;
attr.access = FI_SEND | FI_RECV;
attr.offset = 0;
attr.context = NULL;
if (false == ompi_mtl_base_selected_component->accelerator_support) {
goto reg;
} else if (0 == strcmp(opal_accelerator_base_selected_component.base_version.mca_component_name, "cuda")) {
attr.iface = FI_HMEM_CUDA;
opal_accelerator.get_device(&attr.device.cuda);
#if OPAL_OFI_HAVE_FI_HMEM_ROCR
} else if (0 == strcmp(opal_accelerator_base_selected_component.base_version.mca_component_name, "rocm")) {
attr.iface = FI_HMEM_ROCR;
opal_accelerator.get_device(&attr.device.cuda);
#endif
#if OPAL_OFI_HAVE_FI_HMEM_ZE
} else if (0 == strcmp(opal_accelerator_base_selected_component.base_version.mca_component_name, "ze")) {
attr.iface = FI_HMEM_ZE;
opal_accelerator.get_device(&attr.device.ze);
#endif
} else {
return OPAL_ERROR;
}
reg:
ret = fi_mr_regattr(ompi_mtl_ofi.domain, &attr, 0, &ofi_req->mr);

if (ret) {
opal_show_help("help-mtl-ofi.txt", "Buffer Memory Registration Failed", true,
opal_accelerator_base_selected_component.base_version.mca_component_name,
buffer, ofi_req->length,
fi_strerror(-ret), ret);
ofi_req->mr = NULL;
return OMPI_ERROR;
}
if (! ((convertor->flags & CONVERTOR_ACCELERATOR) && ompi_mtl_ofi.hmem_needs_reg)) {
return OMPI_SUCCESS;
}

#endif

return OMPI_SUCCESS;
/* note - the cache access flags are a little broken, because rcache doesn't
* understand send/recv requirements. Since this rcache is only used in the
* MTL, that isn't a problem and we fix it in the underlying register call.
*/
ret = ompi_mtl_ofi.rcache->rcache_register(ompi_mtl_ofi.rcache, buffer, ofi_req->length,
cache_flags, MCA_RCACHE_ACCESS_ANY,
(mca_rcache_base_registration_t **) &ofi_req->mr);
return ret;
}

/** Deregister buffer */
__opal_attribute_always_inline__ static inline int
ompi_mtl_ofi_deregister_buffer(ompi_mtl_ofi_request_t *ofi_req) {
if (ofi_req->mr) {
int ret;
ret = fi_close(&ofi_req->mr->fid);
if (ret) {
opal_show_help("help-mtl-ofi.txt", "OFI call fail", true,
"fi_close",
ompi_process_info.nodename, __FILE__, __LINE__,
fi_strerror(-ret), ofi_req->mr->fid);
return OMPI_ERROR;
}
ofi_req->mr = NULL;
(void)ompi_mtl_ofi.rcache->rcache_deregister(ompi_mtl_ofi.rcache, &ofi_req->mr->base);
}

return OMPI_SUCCESS;
}

Expand Down
34 changes: 20 additions & 14 deletions ompi/mca/mtl/ofi/mtl_ofi_component.c
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
* Copyright (c) 2014-2021 Cisco Systems, Inc. All rights reserved
* Copyright (c) 2015-2016 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2018-2022 Amazon.com, Inc. or its affiliates. All Rights reserved.
* Copyright (c) 2018-2025 Amazon.com, Inc. or its affiliates. All Rights reserved.
* Copyright (c) 2020-2023 Triad National Security, LLC. All rights
* reserved.
* $COPYRIGHT$
Expand Down Expand Up @@ -823,27 +823,28 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
}
} else {
*accelerator_support = true;
ompi_mtl_ofi.hmem_needs_reg = true;
/*
* Workaround for the fact that the CXI provider actually doesn't need for accelerator memory to be registered
* for local buffers, but if one does do so using fi_mr_regattr, one actually needs to manage the
* requested_key field in the fi_mr_attr attr argument, and the OFI MTL doesn't track which requested_keys
* have already been registered. So just set a flag to disable local registration. Note the OFI BTL doesn't
* have a problem here since it uses fi_mr_regattr only within the context of an rcache, and manages the
* requested_key field in this way.
*/
if ((NULL != strstr(prov->fabric_attr->prov_name, "cxi")) ||
(NULL != strstr(prov->fabric_attr->prov_name, "CXI")) ) {
ompi_mtl_ofi.hmem_needs_reg = false;
}

/* Only explicitly register domain buffers if the provider requires it.
For example, CXI does not require it but EFA does require it. */
if ((prov->domain_attr->mr_mode & FI_MR_HMEM) != 0) {
ompi_mtl_ofi.hmem_needs_reg = true;
opal_output_verbose(50, opal_common_ofi.output,
"Support for device buffers enabled with explicit registration");
} else {
opal_output_verbose(50, opal_common_ofi.output,
"Support for device buffers enabled with implicit registration");
}
}
#else
opal_output_verbose(50, opal_common_ofi.output,
"%s:%d: Libfabric provider does not support device buffers. Continuing with device to host copies.\n",
__FILE__, __LINE__);
#endif

if (ompi_mtl_ofi.hmem_needs_reg) {
ompi_mtl_ofi_rcache_init();
}

/**
* Select the format of the OFI tag
*/
Expand Down Expand Up @@ -1177,6 +1178,11 @@ ompi_mtl_ofi_finalize(struct mca_mtl_base_module_t *mtl)
{
ssize_t ret;

if (NULL != ompi_mtl_ofi.rcache) {
mca_rcache_base_module_destroy(ompi_mtl_ofi.rcache);
ompi_mtl_ofi.rcache = NULL;
}

opal_progress_unregister(ompi_mtl_ofi_progress_no_inline);

/* Close all the OFI objects */
Expand Down
124 changes: 124 additions & 0 deletions ompi/mca/mtl/ofi/mtl_ofi_mr.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
/*
* Copyright (c) 2025 Amazon.com, Inc. or its affiliates. All Rights reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
*
* $HEADER$
*/

#include "opal_config.h"

#include "mtl_ofi.h"

/**
 * Registration callback given to the rcache: register a memory region with
 * the OFI domain and store the resulting handle in the cache entry.
 *
 * @param reg_data  opaque pointer supplied when the rcache was created (unused)
 * @param base      start address of the region to register
 * @param size      length of the region in bytes
 * @param reg       rcache registration entry; accessed as an ompi_mtl_ofi_reg_t
 *
 * @return OPAL_SUCCESS on success;
 *         the negative value returned by opal_accelerator.check_addr() if
 *         that call fails;
 *         OPAL_ERROR if the buffer is accelerator memory but the selected
 *         accelerator component has no HMEM interface compiled in here;
 *         OPAL_ERR_OUT_OF_RESOURCE if fi_mr_regattr() fails (ofi_mr is set
 *         to NULL in that case).
 */
static int
ompi_mtl_ofi_reg_mem(void *reg_data, void *base, size_t size,
                     mca_rcache_base_registration_t *reg)
{
    ompi_mtl_ofi_reg_t *mtl_reg = (ompi_mtl_ofi_reg_t *) reg;
    struct iovec iov = {
        .iov_base = base,
        .iov_len = size,
    };
    struct fi_mr_attr attr = {
        .mr_iov = &iov,
        .iov_count = 1,
        .access = FI_SEND | FI_RECV,
        .offset = 0,
        .context = NULL,
    };
    int ret;

    (void) reg_data;

#if OPAL_OFI_HAVE_FI_MR_IFACE
    if (OPAL_LIKELY(NULL != base)) {
        /* check_addr() requires these out-parameters; the device id handed
         * to libfabric below is taken from get_device() instead. */
        int dev_id;
        uint64_t flags;

        ret = opal_accelerator.check_addr(base, &dev_id, &flags);
        if (ret < 0) {
            return ret;
        }

        if (ret > 0) {
            /* Positive result: treat as accelerator memory and pick the
             * HMEM interface matching the selected accelerator component. */
            const char *accel_name =
                opal_accelerator_base_selected_component.base_version.mca_component_name;

            if (0 == strcmp(accel_name, "cuda")) {
                attr.iface = FI_HMEM_CUDA;
                opal_accelerator.get_device(&attr.device.cuda);
#if OPAL_OFI_HAVE_FI_HMEM_ROCR
            } else if (0 == strcmp(accel_name, "rocm")) {
                attr.iface = FI_HMEM_ROCR;
                /* the ROCR path stores its device id in the cuda member */
                opal_accelerator.get_device(&attr.device.cuda);
#endif
#if OPAL_OFI_HAVE_FI_HMEM_ZE
            } else if (0 == strcmp(accel_name, "ze")) {
                attr.iface = FI_HMEM_ZE;
                opal_accelerator.get_device(&attr.device.ze);
#endif
            } else {
                return OPAL_ERROR;
            }
        }
    }
#endif

    ret = fi_mr_regattr(ompi_mtl_ofi.domain, &attr, 0, &mtl_reg->ofi_mr);
    if (0 != ret) {
        opal_show_help("help-mtl-ofi.txt", "Buffer Memory Registration Failed", true,
                       opal_accelerator_base_selected_component.base_version.mca_component_name,
                       base, size, fi_strerror(-ret), ret);
        mtl_reg->ofi_mr = NULL;
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    /* Cache the local descriptor alongside the MR handle. */
    mtl_reg->mem_desc = fi_mr_desc(mtl_reg->ofi_mr);

    return OPAL_SUCCESS;
}


/**
 * Deregistration callback given to the rcache: close the OFI memory region
 * held by a cache entry, if there is one.
 *
 * @param reg_data  opaque pointer supplied when the rcache was created (unused)
 * @param reg       rcache registration entry; accessed as an ompi_mtl_ofi_reg_t
 *
 * @return OPAL_SUCCESS if the entry holds no MR or the MR was closed,
 *         OPAL_ERROR if fi_close() reported a failure.
 */
static int
ompi_mtl_ofi_dereg_mem(void *reg_data, mca_rcache_base_registration_t *reg)
{
    struct fid_mr *mr = ((ompi_mtl_ofi_reg_t *) reg)->ofi_mr;
    int rc;

    /* Nothing was registered for this entry. */
    if (NULL == mr) {
        return OPAL_SUCCESS;
    }

    rc = fi_close(&mr->fid);
    if (0 == rc) {
        return OPAL_SUCCESS;
    }

    opal_output_verbose(1, opal_common_ofi.output,
                        "%s: error unpinning memory mr=%p: %s",
                        __func__, (void *) mr,
                        fi_strerror(-rc));
    return OPAL_ERROR;
}


int
ompi_mtl_ofi_rcache_init(void)
{
mca_rcache_base_resources_t rcache_resources;
char *tmp;

if (NULL != ompi_mtl_ofi.rcache) {
return OMPI_SUCCESS;
}

tmp = strdup("mtl-ofi");
rcache_resources.cache_name = tmp;
rcache_resources.reg_data = NULL;
rcache_resources.sizeof_reg = sizeof(ompi_mtl_ofi_reg_t);
rcache_resources.register_mem = ompi_mtl_ofi_reg_mem;
rcache_resources.deregister_mem = ompi_mtl_ofi_dereg_mem;

ompi_mtl_ofi.rcache = mca_rcache_base_module_create("grdma", &ompi_mtl_ofi, &rcache_resources);
free(tmp);

if (NULL == ompi_mtl_ofi.rcache) {
/* something when horribly wrong */
opal_output_verbose(1, opal_common_ofi.output,
"creating rcache failed");
return OMPI_ERROR;
}

return OMPI_SUCCESS;
}
7 changes: 5 additions & 2 deletions ompi/mca/mtl/ofi/mtl_ofi_request.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
* Copyright (c) 2013-2016 Intel, Inc. All rights reserved
* Copyright (c) 2017 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2025 Amazon.com, Inc. or its affiliates. All Rights reserved.
*
* $COPYRIGHT$
*
Expand All @@ -25,6 +26,7 @@ typedef enum {
OMPI_MTL_OFI_PROBE
} ompi_mtl_ofi_request_type_t;

struct ompi_mtl_ofi_reg_t;
struct ompi_mtl_ofi_request_t;

struct ompi_mtl_ofi_request_t {
Expand Down Expand Up @@ -89,8 +91,9 @@ struct ompi_mtl_ofi_request_t {
struct mca_mtl_request_t *mrecv_req;

/** Stores reference to memory region from registration */
/* Set to NULL if memory not registered or if non accelerator buffer */
struct fid_mr *mr;

/* Set to NULL if memory not registered */
struct ompi_mtl_ofi_reg_t *mr;
};
typedef struct ompi_mtl_ofi_request_t ompi_mtl_ofi_request_t;

Expand Down
14 changes: 14 additions & 0 deletions ompi/mca/mtl/ofi/mtl_ofi_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
* Copyright (c) 2015 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2022-2023 Triad National Security, LLC. All rights
* reserved.
* Copyright (c) 2025 Amazon.com, Inc. or its affiliates. All Rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand All @@ -16,6 +17,9 @@

#include "mtl_ofi.h"

#include "opal/mca/rcache/base/base.h"


BEGIN_C_DECLS

/**
Expand Down Expand Up @@ -102,6 +106,8 @@ typedef struct mca_mtl_ofi_module_t {
bool has_posted_initial_buffer;
bool hmem_needs_reg;

/** registration cache */
mca_rcache_base_module_t *rcache;
} mca_mtl_ofi_module_t;

extern mca_mtl_ofi_module_t ompi_mtl_ofi;
Expand All @@ -116,6 +122,14 @@ typedef enum {
OFI_SCALABLE_EP,
} mca_mtl_ofi_ep_type;

/**
 * Registration cache entry for the OFI MTL.
 *
 * The rcache base registration must remain the first member: the MTL casts
 * between mca_rcache_base_registration_t * and ompi_mtl_ofi_reg_t *.
 */
typedef struct ompi_mtl_ofi_reg_t {
    /** generic rcache registration (must be first) */
    mca_rcache_base_registration_t base;
    /** memory region handle produced by fi_mr_regattr(); NULL if registration failed */
    struct fid_mr *ofi_mr;
    /** descriptor obtained from fi_mr_desc() for ofi_mr */
    void *mem_desc;
} ompi_mtl_ofi_reg_t;


/*
* Define upper limit for number of events read from a CQ.
* Setting this to 100 as this was deemed optimal from empirical data.
Expand Down
Loading