Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

UCT/API: introduce sys_dev field for mem_alloc #10448

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 11 additions & 1 deletion src/uct/api/uct.h
Original file line number Diff line number Diff line change
Expand Up @@ -1712,6 +1712,7 @@ typedef struct uct_allocated_memory {
ucs_memory_type_t mem_type; /**< type of allocated memory */
uct_md_h md; /**< if method==MD: MD used to allocate the memory */
uct_mem_h memh; /**< if method==MD: MD memory handle */
ucs_sys_device_t sys_dev; /**< System device for allocated memory */
} uct_allocated_memory_t;


Expand Down Expand Up @@ -2501,7 +2502,10 @@ typedef enum {
UCT_MEM_ALLOC_PARAM_FIELD_MDS = UCS_BIT(3),

/** Enables @ref uct_mem_alloc_params_t::name */
UCT_MEM_ALLOC_PARAM_FIELD_NAME = UCS_BIT(4)
UCT_MEM_ALLOC_PARAM_FIELD_NAME = UCS_BIT(4),

/** Enables @ref uct_mem_alloc_params_t::sys_device */
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it is called sys_dev in uct_mem_alloc_params_t. Pls align one of them with the other.
also UCT_MEM_ALLOC_PARAM_FIELD_SYS_DEVICE

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

UCT_MEM_ALLOC_PARAM_FIELD_SYS_DEV

UCT_MEM_ALLOC_PARAM_FIELD_SYS_DEV = UCS_BIT(5)
} uct_mem_alloc_params_field_t;


Expand Down Expand Up @@ -2564,6 +2568,12 @@ typedef struct {
* "anonymous-uct_mem_alloc" is used by default.
*/
const char *name;

/**
* System device on which memory is to be allocated, or
* UCS_SYS_DEVICE_ID_UNKNOWN to allow allocating on any device.
*/
ucs_sys_device_t sys_dev;
} uct_mem_alloc_params_t;


Expand Down
5 changes: 3 additions & 2 deletions src/uct/base/uct_md.c
Original file line number Diff line number Diff line change
Expand Up @@ -539,10 +539,11 @@ ucs_status_t uct_mem_alloc_check_params(size_t length,

ucs_status_t uct_md_mem_alloc(uct_md_h md, size_t *length_p, void **address_p,
ucs_memory_type_t mem_type, unsigned flags,
const char *alloc_name, uct_mem_h *memh_p)
const char *alloc_name, ucs_sys_device_t sys_dev,
uct_mem_h *memh_p)
{
return md->ops->mem_alloc(md, length_p, address_p, mem_type, flags,
alloc_name, memh_p);
alloc_name, sys_dev, memh_p);
}

ucs_status_t uct_md_mem_free(uct_md_h md, uct_mem_h memh)
Expand Down
5 changes: 4 additions & 1 deletion src/uct/base/uct_md.h
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ typedef ucs_status_t (*uct_md_mem_alloc_func_t)(uct_md_h md,
ucs_memory_type_t mem_type,
unsigned flags,
const char *alloc_name,
ucs_sys_device_t sys_dev,
uct_mem_h *memh_p);

typedef ucs_status_t (*uct_md_mem_free_func_t)(uct_md_h md, uct_mem_h memh);
Expand Down Expand Up @@ -192,11 +193,13 @@ uct_md_query_empty_md_resource(uct_md_resource_desc_t **resources_p,
* @param [in] flags Memory allocation flags, see @ref uct_md_mem_flags.
* @param [in] name Name of the allocated region, used to track memory
* usage for debugging and profiling.
* @param [in] sys_dev System device for the allocation.
* @param [out] memh_p Filled with handle for allocated region.
*/
ucs_status_t uct_md_mem_alloc(uct_md_h md, size_t *length_p, void **address_p,
ucs_memory_type_t mem_type, unsigned flags,
const char *alloc_name, uct_mem_h *memh_p);
const char *alloc_name, ucs_sys_device_t sys_dev,
uct_mem_h *memh_p);

/**
* @ingroup UCT_MD
Expand Down
14 changes: 9 additions & 5 deletions src/uct/base/uct_mem.c
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ ucs_status_t uct_mem_alloc(size_t length, const uct_alloc_method_t *methods,
const uct_alloc_method_t *method;
ucs_log_level_t log_level;
ucs_memory_type_t mem_type;
ucs_sys_device_t sys_dev;
uct_md_attr_t md_attr;
ucs_status_t status;
unsigned flags;
Expand Down Expand Up @@ -101,12 +102,15 @@ ucs_status_t uct_mem_alloc(size_t length, const uct_alloc_method_t *methods,
params->name : "anonymous-uct_mem_alloc";
mem_type = (params->field_mask & UCT_MEM_ALLOC_PARAM_FIELD_MEM_TYPE) ?
params->mem_type : UCS_MEMORY_TYPE_HOST;
sys_dev = (params->field_mask & UCT_MEM_ALLOC_PARAM_FIELD_SYS_DEV) ?
params->sys_dev : UCS_SYS_DEVICE_ID_UNKNOWN;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

whould we return INVALID_PARAM if sys_dev is not UCS_SYS_DEVICE_ID_UNKNOWN and mem_type is host?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@brminich This assumes that GPU MDs cannot allocate host memory which is true today but maybe it is better for UCT MDs to return that error than catch it here?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if you think it can be a valid case in the future, then ok

alloc_length = length;
log_level = (flags & UCT_MD_MEM_FLAG_HIDE_ERRORS) ? UCS_LOG_LEVEL_DEBUG :
UCS_LOG_LEVEL_ERROR;

ucs_trace("allocating %s: %s memory length %zu flags 0x%x", alloc_name,
ucs_memory_type_names[mem_type], alloc_length, flags);
ucs_trace("allocating %s: %s memory length %zu sys_dev %s flags 0x%x",
alloc_name, ucs_memory_type_names[mem_type], alloc_length,
ucs_topo_sys_device_get_name(sys_dev), flags);
ucs_log_indent(1);

for (method = methods; method < methods + num_methods; ++method) {
Expand Down Expand Up @@ -151,9 +155,8 @@ ucs_status_t uct_mem_alloc(size_t length, const uct_alloc_method_t *methods,
* fall-back, because this MD already exposed support for memory
* allocation.
*/
status = uct_md_mem_alloc(md, &alloc_length, &address,
mem_type, flags, alloc_name,
&memh);
status = uct_md_mem_alloc(md, &alloc_length, &address, mem_type,
flags, alloc_name, sys_dev, &memh);
if (status != UCS_OK) {
ucs_log(log_level,
"failed to allocate %zu bytes using md %s for %s: %s",
Expand All @@ -165,6 +168,7 @@ ucs_status_t uct_mem_alloc(size_t length, const uct_alloc_method_t *methods,
ucs_assert(memh != UCT_MEM_HANDLE_NULL);
mem->md = md;
mem->mem_type = mem_type;
mem->sys_dev = sys_dev;
rakhmets marked this conversation as resolved.
Show resolved Hide resolved
mem->memh = memh;
goto allocated;
}
Expand Down
3 changes: 2 additions & 1 deletion src/uct/cuda/cuda_copy/cuda_copy_md.c
Original file line number Diff line number Diff line change
Expand Up @@ -360,7 +360,8 @@ static void uct_cuda_copy_sync_memops(uct_cuda_copy_md_t *md,
static ucs_status_t
uct_cuda_copy_mem_alloc(uct_md_h uct_md, size_t *length_p, void **address_p,
ucs_memory_type_t mem_type, unsigned flags,
const char *alloc_name, uct_mem_h *memh_p)
const char *alloc_name, ucs_sys_device_t sys_dev,
uct_mem_h *memh_p)
{
uct_cuda_copy_md_t *md = ucs_derived_of(uct_md, uct_cuda_copy_md_t);
ucs_status_t status;
Expand Down
2 changes: 1 addition & 1 deletion src/uct/ib/mlx5/dv/ib_mlx5dv_md.c
Original file line number Diff line number Diff line change
Expand Up @@ -1986,7 +1986,7 @@ ucs_status_t
uct_ib_mlx5_devx_device_mem_alloc(uct_md_h uct_md, size_t *length_p,
void **address_p, ucs_memory_type_t mem_type,
unsigned flags, const char *alloc_name,
uct_mem_h *memh_p)
ucs_sys_device_t sys_dev, uct_mem_h *memh_p)
rakhmets marked this conversation as resolved.
Show resolved Hide resolved
{
#if HAVE_IBV_DM
uct_ib_md_t *md = ucs_derived_of(uct_md, uct_ib_md_t);
Expand Down
3 changes: 2 additions & 1 deletion src/uct/rocm/copy/rocm_copy_md.c
Original file line number Diff line number Diff line change
Expand Up @@ -240,7 +240,8 @@ static void uct_rocm_copy_md_close(uct_md_h uct_md) {
static ucs_status_t
uct_rocm_copy_mem_alloc(uct_md_h md, size_t *length_p, void **address_p,
ucs_memory_type_t mem_type, unsigned flags,
const char *alloc_name, uct_mem_h *memh_p)
const char *alloc_name, ucs_sys_device_t sys_dev,
uct_mem_h *memh_p)
{
ucs_status_t status;
hsa_status_t hsa_status;
Expand Down
3 changes: 2 additions & 1 deletion src/uct/sm/mm/posix/mm_posix.c
Original file line number Diff line number Diff line change
Expand Up @@ -516,7 +516,8 @@ uct_posix_segment_open(uct_mm_md_t *md, uct_mm_seg_id_t *seg_id_p, int *fd_p)
static ucs_status_t
uct_posix_mem_alloc(uct_md_h tl_md, size_t *length_p, void **address_p,
ucs_memory_type_t mem_type, unsigned flags,
const char *alloc_name, uct_mem_h *memh_p)
const char *alloc_name, ucs_sys_device_t sys_dev,
uct_mem_h *memh_p)
{
uct_mm_md_t *md = ucs_derived_of(tl_md, uct_mm_md_t);
uct_posix_md_config_t *posix_config = ucs_derived_of(md->config,
Expand Down
3 changes: 2 additions & 1 deletion src/uct/sm/mm/sysv/mm_sysv.c
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,8 @@ static ucs_status_t uct_sysv_mem_attach_common(int shmid, void **address_p)
static ucs_status_t
uct_sysv_mem_alloc(uct_md_h tl_md, size_t *length_p, void **address_p,
ucs_memory_type_t mem_type, unsigned flags,
const char *alloc_name, uct_mem_h *memh_p)
const char *alloc_name, ucs_sys_device_t sys_dev,
uct_mem_h *memh_p)
{
uct_mm_md_t *md = ucs_derived_of(tl_md, uct_mm_md_t);
ucs_status_t status;
Expand Down
3 changes: 2 additions & 1 deletion src/uct/ze/copy/ze_copy_md.c
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,8 @@ static ucs_status_t uct_ze_copy_md_query(uct_md_h md, uct_md_attr_v2_t *md_attr)
static ucs_status_t
uct_ze_copy_mem_alloc(uct_md_h tl_md, size_t *length_p, void **address_p,
ucs_memory_type_t mem_type, unsigned flags,
const char *alloc_name, uct_mem_h *memh_p)
const char *alloc_name, ucs_sys_device_t sys_dev,
uct_mem_h *memh_p)
{
uct_ze_copy_md_t *md = ucs_derived_of(tl_md, uct_ze_copy_md_t);
ze_host_mem_alloc_desc_t host_desc = {};
Expand Down
Loading