Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions dpctl/tensor/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,9 @@
isfinite,
isinf,
isnan,
multiply,
sqrt,
subtract,
)

__all__ = [
Expand Down Expand Up @@ -186,5 +188,7 @@
"isfinite",
"sqrt",
"divide",
"multiply",
"subtract",
"equal",
]
57 changes: 51 additions & 6 deletions dpctl/tensor/_elementwise_funcs.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
# B01: ===== ADD (x1, x2)

_add_docstring_ = """
add(x1, x2, order='K')
add(x1, x2, out=None, order='K')

Calculates the sum for each element `x1_i` of the input array `x1` with
the respective element `x2_i` of the input array `x2`.
Expand Down Expand Up @@ -94,7 +94,7 @@

# U11: ==== COS (x)
_cos_docstring = """
cos(x, order='K')
cos(x, out=None, order='K')

Computes cosine for each element `x_i` for input array `x`.
"""
Expand All @@ -106,7 +106,7 @@

# B08: ==== DIVIDE (x1, x2)
_divide_docstring_ = """
divide(x1, x2, order='K')
divide(x1, x2, out=None, order='K')

Calculates the ratio for each element `x1_i` of the input array `x1` with
the respective element `x2_i` of the input array `x2`.
Expand All @@ -128,7 +128,7 @@

# B09: ==== EQUAL (x1, x2)
_equal_docstring_ = """
equal(x1, x2, order='K')
equal(x1, x2, out=None, order='K')

Calculates equality test results for each element `x1_i` of the input array `x1`
with the respective element `x2_i` of the input array `x2`.
Expand Down Expand Up @@ -172,6 +172,8 @@

# U17: ==== ISFINITE (x)
_isfinite_docstring_ = """
isfinite(x, out=None, order='K')

Computes if every element of input array is a finite number.
"""

Expand All @@ -181,6 +183,8 @@

# U18: ==== ISINF (x)
_isinf_docstring_ = """
isinf(x, out=None, order='K')

Computes if every element of input array is an infinity.
"""

Expand All @@ -190,6 +194,8 @@

# U19: ==== ISNAN (x)
_isnan_docstring_ = """
isnan(x, out=None, order='K')

Computes if every element of input array is a NaN.
"""

Expand Down Expand Up @@ -231,7 +237,25 @@
# FIXME: implement B18

# B19: ==== MULTIPLY (x1, x2)
# FIXME: implement B19
_multiply_docstring_ = """
multiply(x1, x2, out=None, order='K')

Calculates the product for each element `x1_i` of the input array `x1`
with the respective element `x2_i` of the input array `x2`.

Args:
x1 (usm_ndarray):
First input array, expected to have numeric data type.
x2 (usm_ndarray):
Second input array, also expected to have numeric data type.
Returns:
    usm_ndarray:
an array containing the element-wise products. The data type of
the returned array is determined by the Type Promotion Rules.
"""
multiply = BinaryElementwiseFunc(
"multiply", ti._multiply_result_type, ti._multiply, _multiply_docstring_
)

# U25: ==== NEGATIVE (x)
# FIXME: implement U25
Expand Down Expand Up @@ -268,6 +292,8 @@

# U33: ==== SQRT (x)
_sqrt_docstring_ = """
sqrt(x, out=None, order='K')

Computes sqrt for each element `x_i` for input array `x`.
"""

Expand All @@ -276,7 +302,26 @@
)

# B23: ==== SUBTRACT (x1, x2)
# FIXME: implement B23
_subtract_docstring_ = """
subtract(x1, x2, out=None, order='K')

Calculates the difference between each element `x1_i` of the input
array `x1` and the respective element `x2_i` of the input array `x2`.

Args:
x1 (usm_ndarray):
First input array, expected to have numeric data type.
x2 (usm_ndarray):
Second input array, also expected to have numeric data type.
Returns:
    usm_ndarray:
an array containing the element-wise differences. The data type
of the returned array is determined by the Type Promotion Rules.
"""
subtract = BinaryElementwiseFunc(
"subtract", ti._subtract_result_type, ti._subtract, _subtract_docstring_
)


# U34: ==== TAN (x)
# FIXME: implement U34
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -115,27 +115,9 @@ sycl::event abs_contig_impl(sycl::queue exec_q,
char *res_p,
const std::vector<sycl::event> &depends = {})
{
sycl::event abs_ev = exec_q.submit([&](sycl::handler &cgh) {
cgh.depends_on(depends);

size_t lws = 64;
constexpr unsigned int vec_sz = 4;
constexpr unsigned int n_vecs = 2;
const size_t n_groups =
((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz));
const auto gws_range = sycl::range<1>(n_groups * lws);
const auto lws_range = sycl::range<1>(lws);

using resTy = typename AbsOutputType<argTy>::value_type;
const argTy *arg_tp = reinterpret_cast<const argTy *>(arg_p);
resTy *res_tp = reinterpret_cast<resTy *>(res_p);

cgh.parallel_for<abs_contig_kernel<argTy, resTy, vec_sz, n_vecs>>(
sycl::nd_range<1>(gws_range, lws_range),
AbsContigFunctor<argTy, resTy, vec_sz, n_vecs>(arg_tp, res_tp,
nelems));
});
return abs_ev;
return elementwise_common::unary_contig_impl<
argTy, AbsOutputType, AbsContigFunctor, abs_contig_kernel>(
exec_q, nelems, arg_p, res_p, depends);
}

template <typename fnT, typename T> struct AbsContigFactory
Expand Down Expand Up @@ -182,24 +164,10 @@ sycl::event abs_strided_impl(sycl::queue exec_q,
const std::vector<sycl::event> &depends,
const std::vector<sycl::event> &additional_depends)
{
sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
cgh.depends_on(depends);
cgh.depends_on(additional_depends);

using resTy = typename AbsOutputType<argTy>::value_type;
using IndexerT =
typename dpctl::tensor::offset_utils::TwoOffsets_StridedIndexer;

IndexerT indexer{nd, arg_offset, res_offset, shape_and_strides};

const argTy *arg_tp = reinterpret_cast<const argTy *>(arg_p);
resTy *res_tp = reinterpret_cast<resTy *>(res_p);

cgh.parallel_for<abs_strided_kernel<argTy, resTy, IndexerT>>(
{nelems},
AbsStridedFunctor<argTy, resTy, IndexerT>(arg_tp, res_tp, indexer));
});
return comp_ev;
return elementwise_common::unary_strided_impl<
argTy, AbsOutputType, AbsStridedFunctor, abs_strided_kernel>(
exec_q, nelems, nd, shape_and_strides, arg_p, arg_offset, res_p,
res_offset, depends, additional_depends);
}

template <typename fnT, typename T> struct AbsStridedFactory
Expand Down
118 changes: 14 additions & 104 deletions dpctl/tensor/libtensor/include/kernels/elementwise_functions/add.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -184,32 +184,10 @@ sycl::event add_contig_impl(sycl::queue exec_q,
py::ssize_t res_offset,
const std::vector<sycl::event> &depends = {})
{
sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
cgh.depends_on(depends);

size_t lws = 64;
constexpr unsigned int vec_sz = 4;
constexpr unsigned int n_vecs = 2;
const size_t n_groups =
((nelems + lws * n_vecs * vec_sz - 1) / (lws * n_vecs * vec_sz));
const auto gws_range = sycl::range<1>(n_groups * lws);
const auto lws_range = sycl::range<1>(lws);

using resTy = typename AddOutputType<argTy1, argTy2>::value_type;

const argTy1 *arg1_tp =
reinterpret_cast<const argTy1 *>(arg1_p) + arg1_offset;
const argTy2 *arg2_tp =
reinterpret_cast<const argTy2 *>(arg2_p) + arg2_offset;
resTy *res_tp = reinterpret_cast<resTy *>(res_p) + res_offset;

cgh.parallel_for<
add_contig_kernel<argTy1, argTy2, resTy, vec_sz, n_vecs>>(
sycl::nd_range<1>(gws_range, lws_range),
AddContigFunctor<argTy1, argTy2, resTy, vec_sz, n_vecs>(
arg1_tp, arg2_tp, res_tp, nelems));
});
return comp_ev;
return elementwise_common::binary_contig_impl<
argTy1, argTy2, AddOutputType, AddContigFunctor, add_contig_kernel>(
exec_q, nelems, arg1_p, arg1_offset, arg2_p, arg2_offset, res_p,
res_offset, depends);
}

template <typename fnT, typename T1, typename T2> struct AddContigFactory
Expand Down Expand Up @@ -256,28 +234,11 @@ sycl::event add_strided_impl(sycl::queue exec_q,
const std::vector<sycl::event> &depends,
const std::vector<sycl::event> &additional_depends)
{
sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
cgh.depends_on(depends);
cgh.depends_on(additional_depends);

using resTy = typename AddOutputType<argTy1, argTy2>::value_type;

using IndexerT =
typename dpctl::tensor::offset_utils::ThreeOffsets_StridedIndexer;

IndexerT indexer{nd, arg1_offset, arg2_offset, res_offset,
shape_and_strides};

const argTy1 *arg1_tp = reinterpret_cast<const argTy1 *>(arg1_p);
const argTy2 *arg2_tp = reinterpret_cast<const argTy2 *>(arg2_p);
resTy *res_tp = reinterpret_cast<resTy *>(res_p);

cgh.parallel_for<
add_strided_strided_kernel<argTy1, argTy2, resTy, IndexerT>>(
{nelems}, AddStridedFunctor<argTy1, argTy2, resTy, IndexerT>(
arg1_tp, arg2_tp, res_tp, indexer));
});
return comp_ev;
return elementwise_common::binary_strided_impl<
argTy1, argTy2, AddOutputType, AddStridedFunctor,
add_strided_strided_kernel>(
exec_q, nelems, nd, shape_and_strides, arg1_p, arg1_offset, arg2_p,
arg2_offset, res_p, res_offset, depends, additional_depends);
}

template <typename fnT, typename T1, typename T2> struct AddStridedFactory
Expand Down Expand Up @@ -322,62 +283,11 @@ sycl::event add_contig_matrix_contig_row_broadcast_impl(
py::ssize_t res_offset,
const std::vector<sycl::event> &depends = {})
{
const argT1 *mat = reinterpret_cast<const argT1 *>(mat_p) + mat_offset;
const argT2 *vec = reinterpret_cast<const argT2 *>(vec_p) + vec_offset;
resT *res = reinterpret_cast<resT *>(res_p) + res_offset;

const auto &dev = exec_q.get_device();
const auto &sg_sizes = dev.get_info<sycl::info::device::sub_group_sizes>();
// Get device-specific kernel info max_sub_group_size
size_t max_sgSize =
*(std::max_element(std::begin(sg_sizes), std::end(sg_sizes)));

size_t n1_padded = n1 + max_sgSize;
argT2 *padded_vec = sycl::malloc_device<argT2>(n1_padded, exec_q);

if (padded_vec == nullptr) {
throw std::runtime_error("Could not allocate memory on the device");
}
sycl::event make_padded_vec_ev = exec_q.submit([&](sycl::handler &cgh) {
cgh.depends_on(depends); // ensure vec contains actual data
cgh.parallel_for({n1_padded}, [=](sycl::id<1> id) {
auto i = id[0];
padded_vec[i] = vec[i % n1];
});
});

// sub-group spans work-items [I, I + sgSize)
// base = ndit.get_global_linear_id() - sg.get_local_id()[0]
// Generically, sg.load( &mat[base]) may load arrays from
// different rows of mat. The start corresponds to row (base / n0)
// We read sg.load(&padded_vec[(base / n0)]). The vector is padded to
// ensure that reads are accessible

size_t lws = 64;

sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
cgh.depends_on(make_padded_vec_ev);

auto lwsRange = sycl::range<1>(lws);
size_t n_elems = n0 * n1;
size_t n_groups = (n_elems + lws - 1) / lws;
auto gwsRange = sycl::range<1>(n_groups * lws);

cgh.parallel_for<
class add_matrix_row_broadcast_sg_krn<argT1, argT2, resT>>(
sycl::nd_range<1>(gwsRange, lwsRange),
AddContigMatrixContigRowBroadcastingFunctor<argT1, argT2, resT>(
mat, padded_vec, res, n_elems, n1));
});

sycl::event tmp_cleanup_ev = exec_q.submit([&](sycl::handler &cgh) {
cgh.depends_on(comp_ev);
sycl::context ctx = exec_q.get_context();
cgh.host_task([ctx, padded_vec]() { sycl::free(padded_vec, ctx); });
});
host_tasks.push_back(tmp_cleanup_ev);

return comp_ev;
return elementwise_common::binary_contig_matrix_contig_row_broadcast_impl<
argT1, argT2, resT, AddContigMatrixContigRowBroadcastingFunctor,
add_matrix_row_broadcast_sg_krn>(exec_q, host_tasks, n0, n1, mat_p,
mat_offset, vec_p, vec_offset, res_p,
res_offset, depends);
}

template <typename fnT, typename T1, typename T2>
Expand Down
Loading