Commit dd2812f
Merge pull request #1890 from IntelPython/fix-gh-1887
Check pointer alignment when copying from strided array to C-contiguous array
ndgrigorian authored Nov 13, 2024
2 parents 79bc274 + 0bed4aa commit dd2812f
Showing 3 changed files with 125 additions and 56 deletions.
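For context on the fix: SYCL sub-group block stores such as sg.store<vec_sz>(...) require a suitably aligned destination pointer, which the strided-to-contiguous copy kernel previously took for granted. A minimal sketch of the alignment predicate the dispatch below relies on (a standalone illustration with assumed names, not dpctl's alignment_utils implementation):

    #include <cstddef>
    #include <cstdint>

    // True when ptr is aligned to Alignment bytes; Alignment must be a
    // power of two for the check below to be valid.
    template <std::size_t Alignment>
    bool is_aligned(const void *ptr)
    {
        static_assert((Alignment & (Alignment - 1)) == 0,
                      "Alignment must be a power of two");
        return reinterpret_cast<std::uintptr_t>(ptr) % Alignment == 0;
    }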
2 changes: 1 addition & 1 deletion CHANGELOG.md
@@ -18,8 +18,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

 ### Fixed
 * Fix for `tensor.result_type` when all inputs are Python built-in scalars [gh-1877](https://github.com/IntelPython/dpctl/pull/1877)
-
 * Improved error in constructors `tensor.full` and `tensor.full_like` when provided a non-numeric fill value [gh-1878](https://github.com/IntelPython/dpctl/pull/1878)
+* Added a check for pointer alignment when copying to C-contiguous memory [gh-1890](https://github.com/IntelPython/dpctl/pull/1890)
 
 ### Maintenance
 
163 changes: 108 additions & 55 deletions dpctl/tensor/libtensor/include/kernels/copy_as_contiguous.hpp
@@ -44,8 +44,8 @@ namespace copy_as_contig

 template <typename T,
           typename IndexerT,
-          int vec_sz = 4,
-          int n_vecs = 2,
+          std::uint32_t vec_sz = 4u,
+          std::uint32_t n_vecs = 2u,
           bool enable_sg_loadstore = true>
 class CopyAsCContigFunctor
 {
@@ -66,53 +66,63 @@ class CopyAsCContigFunctor

     void operator()(sycl::nd_item<1> ndit) const
     {
+        static_assert(vec_sz > 0);
+        static_assert(n_vecs > 0);
+        static_assert(vec_sz * n_vecs < (std::uint32_t(1) << 8));
+
+        constexpr std::uint8_t elems_per_wi =
+            static_cast<std::uint8_t>(vec_sz * n_vecs);
+
         using dpctl::tensor::type_utils::is_complex;
         if constexpr (!enable_sg_loadstore || is_complex<T>::value) {
-            const std::uint32_t sgSize =
+            const std::uint16_t sgSize =
                 ndit.get_sub_group().get_local_range()[0];
             const std::size_t gid = ndit.get_global_linear_id();
 
-            const std::size_t base =
-                (gid / sgSize) * sgSize * n_vecs * vec_sz + (gid % sgSize);
-            for (size_t offset = base;
-                 offset < std::min(nelems, base + sgSize * (n_vecs * vec_sz));
-                 offset += sgSize)
-            {
+            // base = (gid / sgSize) * sgSize * elems_per_wi + (gid % sgSize)
+            // gid % sgSize == gid - (gid / sgSize) * sgSize
+            const std::size_t elems_per_sg = sgSize * (elems_per_wi - 1);
+            const std::size_t base = (gid / sgSize) * elems_per_sg + gid;
+            const std::size_t offset_max =
+                std::min(nelems, base + sgSize * elems_per_wi);
+
+            for (size_t offset = base; offset < offset_max; offset += sgSize) {
                 auto src_offset = src_indexer(offset);
                 dst_p[offset] = src_p[src_offset];
             }
         }
         else {
             auto sg = ndit.get_sub_group();
-            const std::uint32_t sgSize = sg.get_local_range()[0];
-            const size_t base = n_vecs * vec_sz *
-                                (ndit.get_group(0) * ndit.get_local_range(0) +
-                                 sg.get_group_id()[0] * sgSize);
+            const std::uint16_t sgSize = sg.get_max_local_range()[0];
+            const size_t base =
+                elems_per_wi * (ndit.get_group(0) * ndit.get_local_range(0) +
+                                sg.get_group_id()[0] * sgSize);
 
-            if (base + n_vecs * vec_sz * sgSize < nelems) {
+            if (base + elems_per_wi * sgSize < nelems) {
                 sycl::vec<T, vec_sz> dst_vec;
 
 #pragma unroll
-                for (std::uint32_t it = 0; it < n_vecs * vec_sz; it += vec_sz) {
+                for (std::uint8_t it = 0; it < elems_per_wi; it += vec_sz) {
+                    const size_t block_start_id = base + it * sgSize;
                     auto dst_multi_ptr = sycl::address_space_cast<
                         sycl::access::address_space::global_space,
-                        sycl::access::decorated::yes>(
-                        &dst_p[base + it * sgSize]);
+                        sycl::access::decorated::yes>(&dst_p[block_start_id]);
 
+                    const size_t elem_id0 = block_start_id + sg.get_local_id();
 #pragma unroll
-                    for (std::uint32_t k = 0; k < vec_sz; k++) {
-                        ssize_t src_offset = src_indexer(
-                            base + (it + k) * sgSize + sg.get_local_id());
+                    for (std::uint8_t k = 0; k < vec_sz; k++) {
+                        const size_t elem_id = elem_id0 + k * sgSize;
+                        const ssize_t src_offset = src_indexer(elem_id);
                         dst_vec[k] = src_p[src_offset];
                     }
                     sg.store<vec_sz>(dst_multi_ptr, dst_vec);
                 }
             }
             else {
-                for (size_t k = base + sg.get_local_id()[0]; k < nelems;
-                     k += sgSize)
-                {
-                    ssize_t src_offset = src_indexer(k);
+                const size_t lane_id = sg.get_local_id()[0];
+                const size_t k0 = base + lane_id;
+                for (size_t k = k0; k < nelems; k += sgSize) {
+                    const ssize_t src_offset = src_indexer(k);
                     dst_p[k] = src_p[src_offset];
                 }
             }
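A note on the rewritten index arithmetic in this hunk: since gid == (gid / sgSize) * sgSize + (gid % sgSize), the new base, (gid / sgSize) * sgSize * (elems_per_wi - 1) + gid, equals the old (gid / sgSize) * sgSize * elems_per_wi + (gid % sgSize) while saving a modulo. A quick standalone check of the identity (illustrative values):

    #include <cassert>
    #include <cstddef>

    int main()
    {
        const std::size_t sgSize = 32, elems_per_wi = 8;
        for (std::size_t gid = 0; gid < 4096; ++gid) {
            const std::size_t old_base =
                (gid / sgSize) * sgSize * elems_per_wi + (gid % sgSize);
            // elems_per_sg folds the (gid / sgSize) * sgSize term into gid
            const std::size_t elems_per_sg = sgSize * (elems_per_wi - 1);
            assert((gid / sgSize) * elems_per_sg + gid == old_base);
        }
        return 0;
    }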
@@ -121,36 +131,23 @@
 };
 
 template <typename T,
-          typename IndexT,
-          int vec_sz,
-          int n_vecs,
-          bool enable_sgload>
-class as_contig_krn;
-
-template <typename T>
-sycl::event
-as_c_contiguous_array_generic_impl(sycl::queue &exec_q,
-                                   size_t nelems,
-                                   int nd,
-                                   const ssize_t *shape_and_strides,
-                                   const char *src_p,
-                                   char *dst_p,
-                                   const std::vector<sycl::event> &depends)
+          typename IndexerT,
+          std::uint32_t vec_sz,
+          std::uint32_t n_vecs,
+          bool enable_sg_load,
+          typename KernelName>
+sycl::event submit_c_contiguous_copy(sycl::queue &exec_q,
+                                     size_t nelems,
+                                     const T *src,
+                                     T *dst,
+                                     const IndexerT &src_indexer,
+                                     const std::vector<sycl::event> &depends)
 {
-    dpctl::tensor::type_utils::validate_type_for_device<T>(exec_q);
-
-    const T *src_tp = reinterpret_cast<const T *>(src_p);
-    T *dst_tp = reinterpret_cast<T *>(dst_p);
-
-    using IndexerT = dpctl::tensor::offset_utils::StridedIndexer;
-    const IndexerT src_indexer(nd, ssize_t(0), shape_and_strides);
+    static_assert(vec_sz > 0);
+    static_assert(n_vecs > 0);
+    static_assert(vec_sz * n_vecs < (std::uint32_t(1) << 8));
 
     constexpr std::size_t preferred_lws = 256;
-    constexpr std::uint32_t n_vecs = 2;
-    constexpr std::uint32_t vec_sz = 4;
-    constexpr bool enable_sg_load = true;
-    using KernelName =
-        as_contig_krn<T, IndexerT, vec_sz, n_vecs, enable_sg_load>;
 
     const auto &kernel_id = sycl::get_kernel_id<KernelName>();
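Threading KernelName through as an explicit template parameter lets each caller name its own instantiation, so the aligned and unaligned submissions below compile to distinctly named SYCL kernels while sharing one submission routine. A sketch of the general pattern (simplified, with assumed names):

    #include <sycl/sycl.hpp>

    // Each distinct KernelName yields a separately named kernel even
    // though the submission logic is identical.
    template <typename KernelName, typename Fn>
    sycl::event submit_named(sycl::queue &q, sycl::range<1> r, Fn fn)
    {
        return q.submit(
            [&](sycl::handler &cgh) { cgh.parallel_for<KernelName>(r, fn); });
    }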

@@ -167,9 +164,11 @@
     const std::size_t lws =
         ((preferred_lws + max_sg_size - 1) / max_sg_size) * max_sg_size;
 
-    constexpr std::uint32_t nelems_per_wi = n_vecs * vec_sz;
-    size_t n_groups =
-        (nelems + nelems_per_wi * lws - 1) / (nelems_per_wi * lws);
+    constexpr std::uint8_t nelems_per_wi = n_vecs * vec_sz;
+
+    const size_t nelems_per_group = nelems_per_wi * lws;
+    const size_t n_groups =
+        (nelems + nelems_per_group - 1) / (nelems_per_group);
 
     sycl::event copy_ev = exec_q.submit([&](sycl::handler &cgh) {
         cgh.depends_on(depends);
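The group count here is a ceiling division of nelems by the number of elements one work-group covers. A worked example with illustrative numbers:

    #include <cstddef>

    int main()
    {
        const std::size_t nelems = 100000, lws = 256, nelems_per_wi = 8;
        const std::size_t nelems_per_group = nelems_per_wi * lws; // 2048
        const std::size_t n_groups =
            (nelems + nelems_per_group - 1) / nelems_per_group;
        return n_groups == 49 ? 0 : 1; // ceil(100000 / 2048) == 49
    }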
@@ -181,8 +180,62 @@
         cgh.parallel_for<KernelName>(
             sycl::nd_range<1>(gRange, lRange),
             CopyAsCContigFunctor<T, IndexerT, vec_sz, n_vecs, enable_sg_load>(
-                nelems, src_tp, dst_tp, src_indexer));
+                nelems, src, dst, src_indexer));
     });
     return copy_ev;
 }
 
+template <typename T,
+          typename IndexT,
+          std::uint32_t vec_sz,
+          std::uint32_t n_vecs,
+          bool enable_sgload>
+class as_contig_krn;
+
+template <typename T>
+sycl::event
+as_c_contiguous_array_generic_impl(sycl::queue &exec_q,
+                                   size_t nelems,
+                                   int nd,
+                                   const ssize_t *shape_and_strides,
+                                   const char *src_p,
+                                   char *dst_p,
+                                   const std::vector<sycl::event> &depends)
+{
+    dpctl::tensor::type_utils::validate_type_for_device<T>(exec_q);
+
+    const T *src_tp = reinterpret_cast<const T *>(src_p);
+    T *dst_tp = reinterpret_cast<T *>(dst_p);
+
+    using IndexerT = dpctl::tensor::offset_utils::StridedIndexer;
+    const IndexerT src_indexer(nd, ssize_t(0), shape_and_strides);
+
+    constexpr std::uint32_t vec_sz = 4u;
+    constexpr std::uint32_t n_vecs = 2u;
+
+    using dpctl::tensor::kernels::alignment_utils::
+        disabled_sg_loadstore_wrapper_krn;
+    using dpctl::tensor::kernels::alignment_utils::is_aligned;
+    using dpctl::tensor::kernels::alignment_utils::required_alignment;
+
+    sycl::event copy_ev;
+    if (is_aligned<required_alignment>(dst_p)) {
+        constexpr bool enable_sg_load = true;
+        using KernelName =
+            as_contig_krn<T, IndexerT, vec_sz, n_vecs, enable_sg_load>;
+        copy_ev = submit_c_contiguous_copy<T, IndexerT, vec_sz, n_vecs,
+                                           enable_sg_load, KernelName>(
+            exec_q, nelems, src_tp, dst_tp, src_indexer, depends);
+    }
+    else {
+        constexpr bool disable_sg_load = false;
+        using InnerKernelName =
+            as_contig_krn<T, IndexerT, vec_sz, n_vecs, disable_sg_load>;
+        using KernelName = disabled_sg_loadstore_wrapper_krn<InnerKernelName>;
+        copy_ev = submit_c_contiguous_copy<T, IndexerT, vec_sz, n_vecs,
+                                           disable_sg_load, KernelName>(
+            exec_q, nelems, src_tp, dst_tp, src_indexer, depends);
+    }
+
+    return copy_ev;
+}
16 changes: 16 additions & 0 deletions dpctl/tests/test_usm_ndarray_ctor.py
@@ -2632,3 +2632,19 @@ def test_full_functions_raise_type_error():
     x = dpt.ones(1, dtype="i4")
     with pytest.raises(TypeError):
         dpt.full_like(x, "0")
+
+
+@pytest.mark.parametrize("dt", _all_dtypes)
+def test_setitem_copy_as_contig_alignment(dt):
+    q = get_queue_or_skip()
+    skip_if_dtype_not_supported(dt, q)
+
+    dtype_ = dpt.dtype(dt)
+    n0, n1 = 8, 23
+
+    x = dpt.zeros((n0, n1), dtype=dtype_, sycl_queue=q)
+
+    vals = dpt.ones(n1, dtype=dtype_, sycl_queue=q)[dpt.newaxis, :]
+    x[1:, ...] = vals
+    assert dpt.all(x[0] == 0)
+    assert dpt.all(x[1:, :] == vals)
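The row length n1 = 23 is chosen so that the destination of x[1:, ...] = vals begins n1 elements into x's allocation; for most itemsizes that byte offset is not a multiple of a typical required alignment, which steers the copy onto the unaligned kernel path. A standalone illustration of the offset arithmetic (the 64-byte figure is an assumed alignment, not necessarily dpctl's required_alignment):

    #include <cstddef>
    #include <iostream>

    int main()
    {
        const std::size_t assumed_alignment = 64;
        const std::size_t n1 = 23; // row length from the test
        for (std::size_t itemsize : {1, 2, 4, 8}) {
            const std::size_t byte_offset = n1 * itemsize; // 23, 46, 92, 184
            std::cout << "itemsize=" << itemsize
                      << " aligned=" << (byte_offset % assumed_alignment == 0)
                      << "\n";
        }
        return 0;
    }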
