Skip to content

Commit f944b37

Browse files
committed
prov/efa: fix missing mem_desc and iface initialization in non-p2p path
When p2p is not supported for CUDA/ROCR memory, the non-p2p path in efa_rdm_mr_reg_impl() was missing two critical initializations: efa_mr->mr_fid.mem_desc was not set (causing fi_mr_desc() to return NULL/invalid pointers) and efa_mr_hmem_setup() was not called (leaving efa_mr->iface uninitialized). Fix by adding the missing mem_desc assignment, calling efa_mr_hmem_setup() in the non-p2p path, making efa_mr_hmem_setup() non-static, and adding a unit test to verify both fields are properly initialized when p2p is disabled. Signed-off-by: Shi Jin <sjina@amazon.com>
1 parent 3f76ad5 commit f944b37

5 files changed

Lines changed: 103 additions & 3 deletions

File tree

prov/efa/src/efa_mr.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -89,9 +89,9 @@ int efa_mr_regattr_validate(struct fid *fid, const struct fi_mr_attr *attr,
8989
*
9090
* @return FI_SUCCESS or negative FI error code
9191
*/
92-
static int efa_mr_hmem_setup(struct efa_mr *efa_mr,
93-
const struct fi_mr_attr *attr,
94-
uint64_t flags)
92+
int efa_mr_hmem_setup(struct efa_mr *efa_mr,
93+
const struct fi_mr_attr *attr,
94+
uint64_t flags)
9595
{
9696
if (attr->iface == FI_HMEM_SYSTEM) {
9797
efa_mr->iface = FI_HMEM_SYSTEM;

prov/efa/src/efa_mr.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ struct efa_mr {
1818

1919
int efa_mr_reg_impl(struct efa_mr *efa_mr, uint64_t flags, const struct fi_mr_attr *mr_attr);
2020
int efa_mr_dereg_impl(struct efa_mr *efa_mr);
21+
int efa_mr_hmem_setup(struct efa_mr *efa_mr, const struct fi_mr_attr *attr, uint64_t flags);
2122
int efa_mr_validate_regattr(struct fid *fid, const struct fi_mr_attr *attr, uint64_t flags);
2223

2324
#define EFA_MR_ATTR_INIT_SYSTEM(iov, count, access, offset, requested_key, context) \

prov/efa/src/rdm/efa_rdm_mr.c

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -497,7 +497,11 @@ static int efa_rdm_mr_reg_impl(struct efa_rdm_mr *efa_rdm_mr, uint64_t flags,
497497
*/
498498
if ((mr_attr->iface == FI_HMEM_CUDA || mr_attr->iface == FI_HMEM_ROCR)
499499
&& !g_efa_hmem_info[mr_attr->iface].p2p_supported_by_device) {
500+
ret = efa_mr_hmem_setup(&efa_rdm_mr->efa_mr, mr_attr, flags);
501+
if (ret)
502+
return ret;
500503
efa_rdm_mr->efa_mr.mr_fid.key = efa_rdm_mr_non_p2p_keygen();
504+
efa_rdm_mr->efa_mr.mr_fid.mem_desc = &efa_rdm_mr->efa_mr;
501505
} else {
502506
/* base mr registration (ibv mr), must be called the first before RDM specific fields are setup */
503507
ret = efa_mr_reg_impl(&efa_rdm_mr->efa_mr, flags, mr_attr);

prov/efa/test/efa_unit_test_mr.c

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1356,3 +1356,97 @@ void test_efa_rdm_mr_cache_reference_counting(struct efa_resource **state)
13561356
free(buf);
13571357
#endif
13581358
}
1359+
1360+
#if HAVE_CUDA
1361+
/**
1362+
* @brief Test RDM MR registration for CUDA memory in non-p2p path
1363+
*
1364+
* This test verifies that when p2p is not supported, the non-p2p path
1365+
* properly initializes the MR structure, including the mem_desc field
1366+
* and iface field. It uses fi_mr_desc to verify the descriptor is correct.
1367+
*/
1368+
void test_efa_rdm_mr_reg_cuda_memory_non_p2p(struct efa_resource **state)
1369+
{
1370+
struct efa_resource *resource = *state;
1371+
struct efa_domain *efa_domain;
1372+
size_t mr_size = 64;
1373+
void *buf;
1374+
struct fid_mr *mr = NULL;
1375+
struct fi_mr_attr mr_reg_attr = { 0 };
1376+
struct iovec iovec;
1377+
struct efa_rdm_mr *efa_rdm_mr;
1378+
void *desc;
1379+
int err, baseline_ct, baseline_sz;
1380+
1381+
if (!g_efa_hmem_info[FI_HMEM_CUDA].initialized) {
1382+
skip();
1383+
return;
1384+
}
1385+
1386+
resource->hints = efa_unit_test_alloc_hints_hmem(
1387+
FI_EP_RDM, EFA_FABRIC_NAME);
1388+
efa_unit_test_resource_construct_with_hints(resource, FI_EP_RDM,
1389+
FI_VERSION(2, 0),
1390+
resource->hints,
1391+
true, true);
1392+
1393+
efa_domain = container_of(resource->domain, struct efa_domain,
1394+
util_domain.domain_fid);
1395+
1396+
/* Mock p2p as not supported to force non-p2p path */
1397+
g_efa_hmem_info[FI_HMEM_CUDA].p2p_supported_by_device = false;
1398+
1399+
/* fi_endpoint calls ofi_bufpool_grow, which registers mr */
1400+
baseline_ct = ofi_atomic_get64(&efa_domain->ibv_mr_reg_ct);
1401+
baseline_sz = ofi_atomic_get64(&efa_domain->ibv_mr_reg_sz);
1402+
1403+
err = ofi_cudaMalloc(&buf, mr_size);
1404+
assert_int_equal(err, 0);
1405+
assert_non_null(buf);
1406+
1407+
mr_reg_attr.access = FI_SEND | FI_RECV;
1408+
mr_reg_attr.iface = FI_HMEM_CUDA;
1409+
iovec.iov_base = buf;
1410+
iovec.iov_len = mr_size;
1411+
mr_reg_attr.mr_iov = &iovec;
1412+
mr_reg_attr.iov_count = 1;
1413+
1414+
err = fi_mr_regattr(resource->domain, &mr_reg_attr, 0, &mr);
1415+
assert_int_equal(err, 0);
1416+
assert_non_null(mr);
1417+
1418+
/* Verify this is an RDM MR */
1419+
efa_rdm_mr = container_of(mr, struct efa_rdm_mr, efa_mr.mr_fid);
1420+
assert_non_null(efa_rdm_mr);
1421+
1422+
/* Test fi_mr_desc returns the correct descriptor */
1423+
desc = fi_mr_desc(mr);
1424+
assert_non_null(desc);
1425+
assert_ptr_equal(desc, &efa_rdm_mr->efa_mr);
1426+
1427+
/* Verify iface is properly set (second bug fix) */
1428+
assert_int_equal(efa_rdm_mr->efa_mr.iface, FI_HMEM_CUDA);
1429+
1430+
/* Verify key is set (non-p2p path should generate proprietary key) */
1431+
assert_true(fi_mr_key(mr) != FI_KEY_NOTAVAIL);
1432+
assert_true(fi_mr_key(mr) > UINT32_MAX); /* Non-p2p keys should be > UINT32_MAX */
1433+
1434+
/* Since we're in non-p2p path, no ibv_mr should be registered */
1435+
assert_int_equal(ofi_atomic_get64(&efa_domain->ibv_mr_reg_ct), baseline_ct);
1436+
assert_int_equal(ofi_atomic_get64(&efa_domain->ibv_mr_reg_sz), baseline_sz);
1437+
1438+
/* Verify RDM-specific fields are properly initialized */
1439+
assert_true(efa_rdm_mr->inserted_to_mr_map);
1440+
assert_true(efa_rdm_mr->needs_sync); /* CUDA memory should need sync */
1441+
1442+
/* Cleanup */
1443+
assert_int_equal(fi_close(&mr->fid), 0);
1444+
err = ofi_cudaFree(buf);
1445+
assert_int_equal(err, 0);
1446+
}
1447+
#else
1448+
void test_efa_rdm_mr_reg_cuda_memory_non_p2p(struct efa_resource **state)
1449+
{
1450+
skip();
1451+
}
1452+
#endif

prov/efa/test/efa_unit_tests.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -505,6 +505,7 @@ void test_efa_rdm_mr_reg_host_memory();
505505
void test_efa_rdm_mr_reg_host_memory_no_mr_local();
506506
void test_efa_rdm_mr_reg_host_memory_overlapping_buffers();
507507
void test_efa_rdm_mr_reg_cuda_memory();
508+
void test_efa_rdm_mr_reg_cuda_memory_non_p2p();
508509
void test_efa_direct_mr_reg_cuda_memory();
509510
void test_efa_direct_mr_reg_rdma_read_not_supported();
510511
void test_efa_direct_mr_reg_rdma_write_not_supported();

0 commit comments

Comments
 (0)