Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow asarray to work on sequences of usm_array #1139

Merged
merged 9 commits into from
Mar 29, 2023
42 changes: 15 additions & 27 deletions dpctl/memory/_memory.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -679,17 +679,13 @@ cdef class _Memory:
cdef class MemoryUSMShared(_Memory):
"""
MemoryUSMShared(nbytes, alignment=0, queue=None, copy=False)
allocates nbytes of USM shared memory.

Non-positive alignments are not used (malloc_shared is used instead).
For the queue=None case the ``dpctl.SyclQueue()`` is used to allocate
memory.
An object representing allocation of SYCL USM-shared memory.

MemoryUSMShared(usm_obj) constructor creates instance from `usm_obj`
expected to implement `__sycl_usm_array_interface__` protocol and to expose
a contiguous block of USM shared allocation. Use `copy=True` to
perform a copy if USM type of the allocation represented by the argument
is other than 'shared'.
Non-positive ``alignment`` values are ignored and
the allocator ``malloc_shared`` is used for allocation instead.
If ``queue`` is ``None`` a cached default-constructed
:class:`dpctl.SyclQueue` is used to allocate memory.
"""
def __cinit__(self, other, *, Py_ssize_t alignment=0,
SyclQueue queue=None, int copy=False):
Expand Down Expand Up @@ -720,17 +716,13 @@ cdef class MemoryUSMShared(_Memory):
cdef class MemoryUSMHost(_Memory):
"""
MemoryUSMHost(nbytes, alignment=0, queue=None, copy=False)
allocates nbytes of USM host memory.

Non-positive alignments are not used (malloc_host is used instead).
For the queue=None case the ``dpctl.SyclQueue()`` is used to allocate
memory.
An object representing allocation of SYCL USM-host memory.

MemoryUSMHost(usm_obj) constructor creates instance from `usm_obj`
expected to implement `__sycl_usm_array_interface__` protocol and to expose
a contiguous block of USM host allocation. Use `copy=True` to
perform a copy if USM type of the allocation represented by the argument
is other than 'host'.
Non-positive ``alignment`` values are ignored and
the allocator ``malloc_host`` is used for allocation instead.
If ``queue`` is ``None`` a cached default-constructed
:class:`dpctl.SyclQueue` is used to allocate memory.
"""
def __cinit__(self, other, *, Py_ssize_t alignment=0,
SyclQueue queue=None, int copy=False):
Expand Down Expand Up @@ -762,17 +754,13 @@ cdef class MemoryUSMHost(_Memory):
cdef class MemoryUSMDevice(_Memory):
"""
MemoryUSMDevice(nbytes, alignment=0, queue=None, copy=False)
allocates nbytes of USM device memory.

Non-positive alignments are not used (malloc_device is used instead).
For the queue=None case the ``dpctl.SyclQueue()`` is used to allocate
memory.
An object representing allocation of SYCL USM-device memory.

MemoryUSMDevice(usm_obj) constructor creates instance from `usm_obj`
expected to implement `__sycl_usm_array_interface__` protocol and to expose
a contiguous block of USM device allocation. Use `copy=True` to
perform a copy if USM type of the allocation represented by the argument
is other than 'device'.
Non-positive ``alignment`` values are ignored and
the allocator ``malloc_device`` is used for allocation instead.
If ``queue`` is ``None`` a cached default-constructed
:class:`dpctl.SyclQueue` is used to allocate memory.
"""
def __cinit__(self, other, *, Py_ssize_t alignment=0,
SyclQueue queue=None, int copy=False):
Expand Down
23 changes: 17 additions & 6 deletions dpctl/memory/_sycl_usm_array_interface_utils.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -88,27 +88,38 @@ cdef object _pointers_from_shape_and_stride(

Returns: tuple(min_disp, nbytes)
"""
cdef Py_ssize_t nelems = 1
cdef Py_ssize_t min_disp = 0
cdef Py_ssize_t max_disp = 0
cdef int i
cdef Py_ssize_t sh_i = 0
cdef Py_ssize_t str_i = 0
if (nd > 0):
if (ary_strides is None):
nelems = 1
for si in ary_shape:
sh_i = int(si)
if (sh_i <= 0):
if (sh_i < 0):
raise ValueError("Array shape elements need to be positive")
nelems = nelems * sh_i
return (ary_offset, nelems * itemsize)
return (ary_offset, max(nelems, 1) * itemsize)
else:
min_disp = ary_offset
max_disp = ary_offset
for i in range(nd):
str_i = int(ary_strides[i])
sh_i = int(ary_shape[i])
if (sh_i <= 0):
if (sh_i < 0):
raise ValueError("Array shape elements need to be positive")
if (str_i > 0):
max_disp += str_i * (sh_i - 1)
if (sh_i > 0):
if (str_i > 0):
max_disp += str_i * (sh_i - 1)
else:
min_disp += str_i * (sh_i - 1)
else:
min_disp += str_i * (sh_i - 1);
nelems = 0
if nelems == 0:
return (ary_offset, itemsize)
return (min_disp, (max_disp - min_disp + 1) * itemsize)
elif (nd == 0):
return (ary_offset, itemsize)
Expand Down
180 changes: 152 additions & 28 deletions dpctl/tensor/_ctors.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
import dpctl.tensor._tensor_impl as ti
import dpctl.utils
from dpctl.tensor._device import normalize_queue_device
from dpctl.tensor._usmarray import _is_object_with_buffer_protocol

__doc__ = "Implementation of creation functions in :module:`dpctl.tensor`"

Expand Down Expand Up @@ -66,11 +67,12 @@ def _array_info_dispatch(obj):
return _empty_tuple, complex, _host_set
if isinstance(obj, (list, tuple, range)):
return _array_info_sequence(obj)
if any(
isinstance(obj, s)
for s in [np.integer, np.floating, np.complexfloating, np.bool_]
):
return _empty_tuple, obj.dtype, _host_set
if _is_object_with_buffer_protocol(obj):
np_obj = np.array(obj)
return np_obj.shape, np_obj.dtype, _host_set
if hasattr(obj, "__sycl_usm_array_interface__"):
usm_ar = _usm_ndarray_from_suai(obj)
return usm_ar.shape, usm_ar.dtype, frozenset([usm_ar.sycl_queue])
raise ValueError(type(obj))


Expand Down Expand Up @@ -219,6 +221,18 @@ def _map_to_device_dtype(dt, q):
raise RuntimeError(f"Unrecognized data type '{dt}' encountered.")


def _usm_ndarray_from_suai(obj):
    """Wrap `obj` in a :class:`dpctl.tensor.usm_ndarray`.

    `obj` is expected to expose ``__sycl_usm_array_interface__``. The
    ``"shape"``, ``"typestr"`` and optional ``"strides"`` entries of that
    dictionary describe the array, and ``dpm.as_usm_memory(obj)`` supplies
    the buffer passed to the ``usm_ndarray`` constructor.
    """
    iface = obj.__sycl_usm_array_interface__
    usm_buffer = dpm.as_usm_memory(obj)
    return dpt.usm_ndarray(
        iface["shape"],
        dtype=iface["typestr"],
        buffer=usm_buffer,
        strides=iface.get("strides", None),
    )


def _asarray_from_numpy_ndarray(
ary, dtype=None, usm_type=None, sycl_queue=None, order="K"
):
Expand Down Expand Up @@ -276,17 +290,6 @@ def _asarray_from_numpy_ndarray(
return res


def _is_object_with_buffer_protocol(obj):
"Returns `True` if object support Python buffer protocol"
try:
# use context manager to ensure
# buffer is instantly released
with memoryview(obj):
return True
except TypeError:
return False


def _ensure_native_dtype_device_support(dtype, dev) -> None:
"""Check that dtype is natively supported by device.

Expand Down Expand Up @@ -318,6 +321,122 @@ def _ensure_native_dtype_device_support(dtype, dev) -> None:
)


def _usm_types_walker(o, usm_types_list):
    """Append the USM type of every array found in `o` to `usm_types_list`.

    `o` may be a :class:`dpctl.tensor.usm_ndarray`, an object exposing
    ``__sycl_usm_array_interface__``, or an arbitrarily nested list/tuple
    of such objects. Nested sequences are visited recursively, in order.

    Args:
        o: object to inspect.
        usm_types_list (list): output list, extended in place.

    Raises:
        TypeError: if `o` (or a nested element) is none of the supported
            kinds listed above.
    """
    if isinstance(o, dpt.usm_ndarray):
        usm_types_list.append(o.usm_type)
        return
    if hasattr(o, "__sycl_usm_array_interface__"):
        usm_types_list.append(_usm_ndarray_from_suai(o).usm_type)
        return
    if isinstance(o, (list, tuple)):
        for el in o:
            _usm_types_walker(el, usm_types_list)
        return
    # Same exception type as before, now with a diagnostic naming the
    # offending element type.
    raise TypeError(
        "Expected usm_ndarray, an object with "
        "`__sycl_usm_array_interface__`, or a list/tuple of such objects, "
        f"got {type(o)}"
    )


def _device_copy_walker(seq_o, res, events):
    """Copy arrays nested in `seq_o` into matching sub-arrays of `res`.

    Leaves of `seq_o` must be :class:`dpctl.tensor.usm_ndarray` instances
    or objects exposing ``__sycl_usm_array_interface__``; lists and tuples
    are traversed recursively, pairing element ``i`` with ``res[i]``.
    Each leaf is copied with ``ti._copy_usm_ndarray_into_usm_ndarray`` on
    ``res.sycl_queue``.

    Args:
        seq_o: source array or nested list/tuple of arrays.
        res: destination array (or sub-array) to copy into.
        events (list): output list; the first element of the pair returned
            by each copy call is appended so the caller can wait on it.

    Raises:
        TypeError: if `seq_o` (or a nested element) is not one of the
            supported kinds.
    """
    if isinstance(seq_o, dpt.usm_ndarray):
        src = seq_o
    elif hasattr(seq_o, "__sycl_usm_array_interface__"):
        src = _usm_ndarray_from_suai(seq_o)
    elif isinstance(seq_o, (list, tuple)):
        for i, el in enumerate(seq_o):
            _device_copy_walker(el, res[i], events)
        return
    else:
        # Same exception type as before, now with a diagnostic naming the
        # offending element type.
        raise TypeError(
            "Expected usm_ndarray, an object with "
            "`__sycl_usm_array_interface__`, or a list/tuple of such "
            f"objects, got {type(seq_o)}"
        )
    # Both array-like leaf kinds share a single copy submission.
    ht_ev, _ = ti._copy_usm_ndarray_into_usm_ndarray(
        src=src, dst=res, sycl_queue=res.sycl_queue
    )
    events.append(ht_ev)


def _copy_through_host_walker(seq_o, usm_res):
    """Fill `usm_res` from `seq_o` by staging the data through NumPy.

    Arrays (``usm_ndarray`` or objects exposing
    ``__sycl_usm_array_interface__``) are converted with ``dpt.asnumpy``
    and assigned into `usm_res`. Lists and tuples are traversed
    recursively, pairing element ``i`` with ``usm_res[i]``. Anything else
    is converted with ``np.asarray`` and assigned.
    """
    if isinstance(seq_o, dpt.usm_ndarray):
        usm_res[...] = dpt.asnumpy(seq_o).copy()
    elif hasattr(seq_o, "__sycl_usm_array_interface__"):
        wrapped = _usm_ndarray_from_suai(seq_o)
        usm_res[...] = dpt.asnumpy(wrapped).copy()
    elif isinstance(seq_o, (list, tuple)):
        for idx, item in enumerate(seq_o):
            _copy_through_host_walker(item, usm_res[idx])
    else:
        usm_res[...] = np.asarray(seq_o)


def _asarray_from_seq(
    seq_obj,
    seq_shape,
    seq_dt,
    seq_dev,
    dtype=None,
    usm_type=None,
    sycl_queue=None,
    order="C",
):
    """Create a usm_ndarray from a (nested) sequence of USM arrays.

    Args:
        seq_obj: list/tuple (possibly nested) whose leaves are USM arrays.
        seq_shape: shape of the resulting array.
        seq_dt: data type inferred for the sequence; used when `dtype`
            is ``None``.
        seq_dev: queue associated with the arrays in the sequence; used
            for allocation and execution when `sycl_queue` is ``None``.
        dtype (optional): requested data type of the result.
        usm_type (optional): requested USM allocation type. When ``None``
            it is coerced from the USM types found in `seq_obj`.
        sycl_queue (optional): queue used to allocate the result.
        order (optional): memory layout; "K" and "A" are treated as "C".

    Returns:
        The newly allocated array populated from `seq_obj`.

    Raises:
        ValueError: if `dtype` is given but is not natively supported by
            the device of the allocation queue.
    """
    if usm_type is None:
        usm_types_in_seq = []
        _usm_types_walker(seq_obj, usm_types_in_seq)
        usm_type = dpctl.utils.get_coerced_usm_type(usm_types_in_seq)
    dpctl.utils.validate_usm_type(usm_type)
    if sycl_queue is None:
        exec_q = seq_dev
        alloc_q = seq_dev
    else:
        exec_q = dpctl.utils.get_execution_queue(
            (
                sycl_queue,
                seq_dev,
            )
        )
        alloc_q = sycl_queue
    if dtype is None:
        dtype = _map_to_device_dtype(seq_dt, alloc_q)
    else:
        _mapped_dt = _map_to_device_dtype(dtype, alloc_q)
        if _mapped_dt != dtype:
            # Report the allocation queue's device: `sycl_queue` may be
            # None here, in which case `sycl_queue.sycl_device` would
            # raise AttributeError instead of this ValueError.
            raise ValueError(
                f"Device {alloc_q.sycl_device} "
                f"does not support {dtype} natively."
            )
        dtype = _mapped_dt
    # Exact membership test: a substring test against "KA" would also
    # accept "" and "KA" as valid orders.
    if order in ("K", "A"):
        order = "C"
    res = dpt.empty(
        seq_shape,
        dtype=dtype,
        usm_type=usm_type,
        sycl_queue=alloc_q,
        order=order,
    )
    if isinstance(exec_q, dpctl.SyclQueue):
        # A usable execution queue exists: copy directly on the device and
        # wait for all submitted copies before returning.
        ht_events = []
        _device_copy_walker(seq_obj, res, ht_events)
        dpctl.SyclEvent.wait_for(ht_events)
    else:
        # No SyclQueue to execute on (presumably the queues are not
        # compatible -- confirm against dpctl.utils.get_execution_queue),
        # so stage the data through host memory.
        _copy_through_host_walker(seq_obj, res)
    return res


def asarray(
obj,
dtype=None,
Expand All @@ -327,7 +446,9 @@ def asarray(
sycl_queue=None,
order="K",
):
"""
""" asarray(obj, dtype=None, copy=None, device=None, \
usm_type=None, sycl_queue=None, order="K")

Converts `obj` to :class:`dpctl.tensor.usm_ndarray`.

Args:
Expand All @@ -347,7 +468,7 @@ def asarray(
allocations if possible, but allowed to perform a copy otherwise.
Default: `None`.
order ("C","F","A","K", optional): memory layout of the output array.
Default: "C"
Default: "K"
device (optional): array API concept of device where the output array
is created. `device` can be `None`, a oneAPI filter selector string,
an instance of :class:`dpctl.SyclDevice` corresponding to a
Expand Down Expand Up @@ -407,14 +528,7 @@ def asarray(
order=order,
)
if hasattr(obj, "__sycl_usm_array_interface__"):
sua_iface = getattr(obj, "__sycl_usm_array_interface__")
membuf = dpm.as_usm_memory(obj)
ary = dpt.usm_ndarray(
sua_iface["shape"],
dtype=sua_iface["typestr"],
buffer=membuf,
strides=sua_iface.get("strides", None),
)
ary = _usm_ndarray_from_suai(obj)
return _asarray_from_usm_ndarray(
ary,
dtype=dtype,
Expand Down Expand Up @@ -452,7 +566,7 @@ def asarray(
raise ValueError(
"Converting Python sequence to usm_ndarray requires a copy"
)
_, _, devs = _array_info_sequence(obj)
seq_shape, seq_dt, devs = _array_info_sequence(obj)
if devs == _host_set:
return _asarray_from_numpy_ndarray(
np.asarray(obj, dtype=dtype, order=order),
Expand All @@ -461,7 +575,17 @@ def asarray(
sycl_queue=sycl_queue,
order=order,
)
# for sequences
elif len(devs) == 1:
return _asarray_from_seq(
obj,
seq_shape,
seq_dt,
list(devs)[0],
dtype=dtype,
usm_type=usm_type,
sycl_queue=sycl_queue,
order=order,
)
raise NotImplementedError(
"Converting Python sequences is not implemented"
)
Expand Down
5 changes: 5 additions & 0 deletions dpctl/tensor/_usmarray.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1629,3 +1629,8 @@ cdef api object UsmNDArray_MakeFromPtr(
offset=offset
)
return arr


def _is_object_with_buffer_protocol(o):
    "Return True if `o` supports the Python buffer protocol (delegates to `_is_buffer`)"
    return _is_buffer(o)
Loading