Skip to content
Merged
42 changes: 15 additions & 27 deletions dpctl/memory/_memory.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -679,17 +679,13 @@ cdef class _Memory:
cdef class MemoryUSMShared(_Memory):
"""
MemoryUSMShared(nbytes, alignment=0, queue=None, copy=False)
allocates nbytes of USM shared memory.

Non-positive alignments are not used (malloc_shared is used instead).
For the queue=None case the ``dpctl.SyclQueue()`` is used to allocate
memory.
An object representing allocation of SYCL USM-shared memory.

MemoryUSMShared(usm_obj) constructor creates instance from `usm_obj`
expected to implement `__sycl_usm_array_interface__` protocol and to expose
a contiguous block of USM shared allocation. Use `copy=True` to
perform a copy if USM type of the allocation represented by the argument
is other than 'shared'.
Non-positive ``alignment`` values are ignored and
the allocator ``malloc_shared`` is used for allocation instead.
If ``queue`` is ``None`` a cached default-constructed
:class:`dpctl.SyclQueue` is used to allocate memory.
"""
def __cinit__(self, other, *, Py_ssize_t alignment=0,
SyclQueue queue=None, int copy=False):
Expand Down Expand Up @@ -720,17 +716,13 @@ cdef class MemoryUSMShared(_Memory):
cdef class MemoryUSMHost(_Memory):
"""
MemoryUSMHost(nbytes, alignment=0, queue=None, copy=False)
allocates nbytes of USM host memory.

Non-positive alignments are not used (malloc_host is used instead).
For the queue=None case the ``dpctl.SyclQueue()`` is used to allocate
memory.
An object representing allocation of SYCL USM-host memory.

MemoryUSMHost(usm_obj) constructor creates instance from `usm_obj`
expected to implement `__sycl_usm_array_interface__` protocol and to expose
a contiguous block of USM host allocation. Use `copy=True` to
perform a copy if USM type of the allocation represented by the argument
is other than 'host'.
Non-positive ``alignment`` values are ignored and
the allocator ``malloc_host`` is used for allocation instead.
If ``queue`` is ``None`` a cached default-constructed
:class:`dpctl.SyclQueue` is used to allocate memory.
"""
def __cinit__(self, other, *, Py_ssize_t alignment=0,
SyclQueue queue=None, int copy=False):
Expand Down Expand Up @@ -762,17 +754,13 @@ cdef class MemoryUSMHost(_Memory):
cdef class MemoryUSMDevice(_Memory):
"""
MemoryUSMDevice(nbytes, alignment=0, queue=None, copy=False)
allocates nbytes of USM device memory.

Non-positive alignments are not used (malloc_device is used instead).
For the queue=None case the ``dpctl.SyclQueue()`` is used to allocate
memory.
An object representing allocation of SYCL USM-device memory.

MemoryUSMDevice(usm_obj) constructor creates instance from `usm_obj`
expected to implement `__sycl_usm_array_interface__` protocol and to expose
a contiguous block of USM device allocation. Use `copy=True` to
perform a copy if USM type of the allocation represented by the argument
is other than 'device'.
Non-positive ``alignment`` values are ignored and
the allocator ``malloc_device`` is used for allocation instead.
If ``queue`` is ``None`` a cached default-constructed
:class:`dpctl.SyclQueue` is used to allocate memory.
"""
def __cinit__(self, other, *, Py_ssize_t alignment=0,
SyclQueue queue=None, int copy=False):
Expand Down
23 changes: 17 additions & 6 deletions dpctl/memory/_sycl_usm_array_interface_utils.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -88,27 +88,38 @@ cdef object _pointers_from_shape_and_stride(

Returns: tuple(min_disp, nbytes)
"""
cdef Py_ssize_t nelems = 1
cdef Py_ssize_t min_disp = 0
cdef Py_ssize_t max_disp = 0
cdef int i
cdef Py_ssize_t sh_i = 0
cdef Py_ssize_t str_i = 0
if (nd > 0):
if (ary_strides is None):
nelems = 1
for si in ary_shape:
sh_i = int(si)
if (sh_i <= 0):
if (sh_i < 0):
raise ValueError("Array shape elements need to be positive")
nelems = nelems * sh_i
return (ary_offset, nelems * itemsize)
return (ary_offset, max(nelems, 1) * itemsize)
else:
min_disp = ary_offset
max_disp = ary_offset
for i in range(nd):
str_i = int(ary_strides[i])
sh_i = int(ary_shape[i])
if (sh_i <= 0):
if (sh_i < 0):
raise ValueError("Array shape elements need to be positive")
if (str_i > 0):
max_disp += str_i * (sh_i - 1)
if (sh_i > 0):
if (str_i > 0):
max_disp += str_i * (sh_i - 1)
else:
min_disp += str_i * (sh_i - 1)
else:
min_disp += str_i * (sh_i - 1);
nelems = 0
if nelems == 0:
return (ary_offset, itemsize)
return (min_disp, (max_disp - min_disp + 1) * itemsize)
elif (nd == 0):
return (ary_offset, itemsize)
Expand Down
180 changes: 152 additions & 28 deletions dpctl/tensor/_ctors.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
import dpctl.tensor._tensor_impl as ti
import dpctl.utils
from dpctl.tensor._device import normalize_queue_device
from dpctl.tensor._usmarray import _is_object_with_buffer_protocol

__doc__ = "Implementation of creation functions in :module:`dpctl.tensor`"

Expand Down Expand Up @@ -66,11 +67,12 @@ def _array_info_dispatch(obj):
return _empty_tuple, complex, _host_set
if isinstance(obj, (list, tuple, range)):
return _array_info_sequence(obj)
if any(
isinstance(obj, s)
for s in [np.integer, np.floating, np.complexfloating, np.bool_]
):
return _empty_tuple, obj.dtype, _host_set
if _is_object_with_buffer_protocol(obj):
np_obj = np.array(obj)
return np_obj.shape, np_obj.dtype, _host_set
if hasattr(obj, "__sycl_usm_array_interface__"):
usm_ar = _usm_ndarray_from_suai(obj)
return usm_ar.shape, usm_ar.dtype, frozenset([usm_ar.sycl_queue])
raise ValueError(type(obj))


Expand Down Expand Up @@ -219,6 +221,18 @@ def _map_to_device_dtype(dt, q):
raise RuntimeError(f"Unrecognized data type '{dt}' encountered.")


def _usm_ndarray_from_suai(obj):
    """Wrap an object exposing ``__sycl_usm_array_interface__`` as an array.

    The interface dictionary supplies the ``"shape"``, ``"typestr"`` and
    optional ``"strides"`` entries; the underlying buffer is whatever
    ``dpm.as_usm_memory(obj)`` returns for the same object.

    Returns:
        The :class:`dpctl.tensor.usm_ndarray` constructed from those pieces.
    """
    iface = obj.__sycl_usm_array_interface__
    usm_buffer = dpm.as_usm_memory(obj)
    return dpt.usm_ndarray(
        iface["shape"],
        dtype=iface["typestr"],
        buffer=usm_buffer,
        strides=iface.get("strides", None),
    )


def _asarray_from_numpy_ndarray(
ary, dtype=None, usm_type=None, sycl_queue=None, order="K"
):
Expand Down Expand Up @@ -276,17 +290,6 @@ def _asarray_from_numpy_ndarray(
return res


def _is_object_with_buffer_protocol(obj):
"Returns `True` if object support Python buffer protocol"
try:
# use context manager to ensure
# buffer is instantly released
with memoryview(obj):
return True
except TypeError:
return False


def _ensure_native_dtype_device_support(dtype, dev) -> None:
"""Check that dtype is natively supported by device.

Expand Down Expand Up @@ -318,6 +321,122 @@ def _ensure_native_dtype_device_support(dtype, dev) -> None:
)


def _usm_types_walker(o, usm_types_list):
    """Collect USM types of all arrays found in a nested sequence.

    Appends to ``usm_types_list`` (in traversal order) the ``usm_type`` of
    every :class:`dpctl.tensor.usm_ndarray`, or object exposing
    ``__sycl_usm_array_interface__``, reachable from ``o`` through nested
    lists and tuples.

    Raises:
        TypeError: if a leaf is neither of the above nor a list/tuple.
    """
    if isinstance(o, dpt.usm_ndarray):
        usm_types_list.append(o.usm_type)
    elif hasattr(o, "__sycl_usm_array_interface__"):
        usm_types_list.append(_usm_ndarray_from_suai(o).usm_type)
    elif isinstance(o, (list, tuple)):
        for item in o:
            _usm_types_walker(item, usm_types_list)
    else:
        raise TypeError


def _device_copy_walker(seq_o, res, events):
    """Copy arrays of a nested sequence into matching slices of ``res``.

    Lists and tuples are traversed recursively, element ``i`` being copied
    into ``res[i]``. Leaves must be :class:`dpctl.tensor.usm_ndarray`
    instances or objects exposing ``__sycl_usm_array_interface__``; each leaf
    is copied with ``ti._copy_usm_ndarray_into_usm_ndarray`` on
    ``res.sycl_queue`` and the first event of the returned pair is appended
    to ``events`` for the caller to wait on.

    Raises:
        TypeError: if a leaf is of any other type.
    """
    if isinstance(seq_o, dpt.usm_ndarray):
        src_ary = seq_o
    elif hasattr(seq_o, "__sycl_usm_array_interface__"):
        src_ary = _usm_ndarray_from_suai(seq_o)
    elif isinstance(seq_o, (list, tuple)):
        for idx, item in enumerate(seq_o):
            _device_copy_walker(item, res[idx], events)
        return
    else:
        raise TypeError
    copy_q = res.sycl_queue
    host_task_ev, _ = ti._copy_usm_ndarray_into_usm_ndarray(
        src=src_ary, dst=res, sycl_queue=copy_q
    )
    events.append(host_task_ev)


def _copy_through_host_walker(seq_o, usm_res):
    """Fill ``usm_res`` from a nested sequence, staging data on the host.

    Lists and tuples are traversed recursively, element ``i`` being written
    into ``usm_res[i]``. USM arrays (or objects exposing
    ``__sycl_usm_array_interface__``) are first converted with
    ``dpt.asnumpy`` and copied; any other leaf is converted with
    ``np.asarray`` before being assigned.
    """
    if isinstance(seq_o, dpt.usm_ndarray):
        host_data = dpt.asnumpy(seq_o).copy()
    elif hasattr(seq_o, "__sycl_usm_array_interface__"):
        host_data = dpt.asnumpy(_usm_ndarray_from_suai(seq_o)).copy()
    elif isinstance(seq_o, (list, tuple)):
        for idx, item in enumerate(seq_o):
            _copy_through_host_walker(item, usm_res[idx])
        return
    else:
        host_data = np.asarray(seq_o)
    usm_res[...] = host_data


def _asarray_from_seq(
    seq_obj,
    seq_shape,
    seq_dt,
    seq_dev,
    dtype=None,
    usm_type=None,
    sycl_queue=None,
    order="C",
):
    """Build a usm_ndarray from a (nested) sequence containing USM arrays.

    Args:
        seq_obj: the sequence to convert (``obj`` is a sequence).
        seq_shape: shape of the resulting array, passed to ``dpt.empty``.
        seq_dt: data type inferred for the sequence; used, after mapping to
            a device-supported type, when ``dtype`` is ``None``.
        seq_dev: queue associated with the arrays in the sequence; used for
            allocation and execution when ``sycl_queue`` is ``None``.
        dtype: requested data type, or ``None`` to infer from ``seq_dt``.
        usm_type: requested USM type, or ``None`` to coerce it from the USM
            types of the arrays found in ``seq_obj``.
        sycl_queue: queue to allocate the result on, or ``None``.
        order: memory layout; ``"K"`` and ``"A"`` are treated as ``"C"``.

    Returns:
        The newly allocated array populated from ``seq_obj``.

    Raises:
        ValueError: if ``dtype`` is given but is not natively supported by
            the device of the allocation queue.
    """
    if usm_type is None:
        usm_types_in_seq = []
        _usm_types_walker(seq_obj, usm_types_in_seq)
        usm_type = dpctl.utils.get_coerced_usm_type(usm_types_in_seq)
    dpctl.utils.validate_usm_type(usm_type)
    if sycl_queue is None:
        exec_q = seq_dev
        alloc_q = seq_dev
    else:
        exec_q = dpctl.utils.get_execution_queue(
            (
                sycl_queue,
                seq_dev,
            )
        )
        alloc_q = sycl_queue
    if dtype is None:
        dtype = _map_to_device_dtype(seq_dt, alloc_q)
    else:
        _mapped_dt = _map_to_device_dtype(dtype, alloc_q)
        if _mapped_dt != dtype:
            # Report the allocation queue's device: `sycl_queue` may be None
            # here, in which case the allocation queue is `seq_dev`.
            raise ValueError(
                f"Device {alloc_q.sycl_device} "
                f"does not support {dtype} natively."
            )
        dtype = _mapped_dt
    # Substring membership is deliberate: in review it was measured to be
    # no slower (and for non-matching values faster) than testing
    # membership in the tuple ("K", "A").
    if order in "KA":
        order = "C"
    res = dpt.empty(
        seq_shape,
        dtype=dtype,
        usm_type=usm_type,
        sycl_queue=alloc_q,
        order=order,
    )
    if isinstance(exec_q, dpctl.SyclQueue):
        # a usable execution queue exists: copy on device and wait for
        # the collected events
        ht_events = []
        _device_copy_walker(seq_obj, res, ht_events)
        dpctl.SyclEvent.wait_for(ht_events)
    else:
        # no execution queue (presumably `get_execution_queue` found the
        # queues incompatible): stage the data through host memory
        _copy_through_host_walker(seq_obj, res)
    return res


def asarray(
obj,
dtype=None,
Expand All @@ -327,7 +446,9 @@ def asarray(
sycl_queue=None,
order="K",
):
"""
""" asarray(obj, dtype=None, copy=None, device=None, \
usm_type=None, sycl_queue=None, order="K")

Converts `obj` to :class:`dpctl.tensor.usm_ndarray`.

Args:
Expand All @@ -347,7 +468,7 @@ def asarray(
allocations if possible, but allowed to perform a copy otherwise.
Default: `None`.
order ("C","F","A","K", optional): memory layout of the output array.
Default: "C"
Default: "K"
device (optional): array API concept of device where the output array
is created. `device` can be `None`, a oneAPI filter selector string,
an instance of :class:`dpctl.SyclDevice` corresponding to a
Expand Down Expand Up @@ -407,14 +528,7 @@ def asarray(
order=order,
)
if hasattr(obj, "__sycl_usm_array_interface__"):
sua_iface = getattr(obj, "__sycl_usm_array_interface__")
membuf = dpm.as_usm_memory(obj)
ary = dpt.usm_ndarray(
sua_iface["shape"],
dtype=sua_iface["typestr"],
buffer=membuf,
strides=sua_iface.get("strides", None),
)
ary = _usm_ndarray_from_suai(obj)
return _asarray_from_usm_ndarray(
ary,
dtype=dtype,
Expand Down Expand Up @@ -452,7 +566,7 @@ def asarray(
raise ValueError(
"Converting Python sequence to usm_ndarray requires a copy"
)
_, _, devs = _array_info_sequence(obj)
seq_shape, seq_dt, devs = _array_info_sequence(obj)
if devs == _host_set:
return _asarray_from_numpy_ndarray(
np.asarray(obj, dtype=dtype, order=order),
Expand All @@ -461,7 +575,17 @@ def asarray(
sycl_queue=sycl_queue,
order=order,
)
# for sequences
elif len(devs) == 1:
return _asarray_from_seq(
obj,
seq_shape,
seq_dt,
list(devs)[0],
dtype=dtype,
usm_type=usm_type,
sycl_queue=sycl_queue,
order=order,
)
raise NotImplementedError(
"Converting Python sequences is not implemented"
)
Expand Down
5 changes: 5 additions & 0 deletions dpctl/tensor/_usmarray.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1629,3 +1629,8 @@ cdef api object UsmNDArray_MakeFromPtr(
offset=offset
)
return arr


def _is_object_with_buffer_protocol(o):
    """Return True if object `o` supports the Python buffer protocol.

    Thin Python-visible wrapper that forwards to `_is_buffer`.
    """
    return _is_buffer(o)
Loading