From 1b2ec24f9f1d86ba9db0a1c512f3bf32ad08a76e Mon Sep 17 00:00:00 2001 From: Fangjun Zhou Date: Sat, 10 May 2025 06:45:20 -0700 Subject: [PATCH 1/9] Fixed StridedBufferView::to_numpy on Metal devices Fixed 4 elements alignment issue on Metal devices. --- src/slangpy_ext/device/cursor_utils.h | 24 +++++++++++++++++++ .../utils/slangpystridedbufferview.cpp | 23 ++++++++++++++++-- 2 files changed, 45 insertions(+), 2 deletions(-) diff --git a/src/slangpy_ext/device/cursor_utils.h b/src/slangpy_ext/device/cursor_utils.h index 1ebdc476..7cb05049 100644 --- a/src/slangpy_ext/device/cursor_utils.h +++ b/src/slangpy_ext/device/cursor_utils.h @@ -14,6 +14,30 @@ namespace sgl { +inline size_t get_scalar_type_size(TypeReflection::ScalarType type) +{ + switch (type) { + case TypeReflection::ScalarType::int8: + case TypeReflection::ScalarType::uint8: + return 1; + case TypeReflection::ScalarType::int16: + case TypeReflection::ScalarType::uint16: + case TypeReflection::ScalarType::float16: + return 2; + case TypeReflection::ScalarType::bool_: + case TypeReflection::ScalarType::int32: + case TypeReflection::ScalarType::uint32: + case TypeReflection::ScalarType::float32: + return 4; + case TypeReflection::ScalarType::int64: + case TypeReflection::ScalarType::uint64: + case TypeReflection::ScalarType::float64: + return 8; + default: + return 0; + } +} + /// Helper to convert from numpy type mask to slang scalar type. inline std::optional dtype_to_scalar_type(nb::dlpack::dtype dtype) { diff --git a/src/slangpy_ext/utils/slangpystridedbufferview.cpp b/src/slangpy_ext/utils/slangpystridedbufferview.cpp index a833415f..b2ae5d2a 100644 --- a/src/slangpy_ext/utils/slangpystridedbufferview.cpp +++ b/src/slangpy_ext/utils/slangpystridedbufferview.cpp @@ -1,6 +1,7 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception #include +#include "device/cursor_utils.h" #include "nanobind.h" #include "sgl/device/device.h" @@ -362,8 +363,26 @@ nb::ndarray StridedBufferView::to_numpy() const size_t dtype_size = desc().element_layout->stride(); size_t byte_offset = desc().offset * dtype_size; size_t data_size = m_storage->size() - byte_offset; - void* data = new uint8_t[data_size]; - m_storage->get_data(data, data_size, byte_offset); + uint8_t* data = new uint8_t[data_size]; +#if SGL_MACOS + auto type = this->cursor()->element_type(); + bool is_matrix = type->kind() == TypeReflection::Kind::matrix; + uint32_t num_col = type->col_count(); + if (is_matrix && num_col < 4 && num_col != 2) { + auto element_count = this->element_count(); + uint32_t num_row = type->row_count(); + size_t scalar_size = get_scalar_type_size(type->scalar_type()); + for (size_t i = 0; i < num_row * element_count; i++) { + size_t block = i / num_row; + size_t row = i % num_row; + uint8_t* target = data + block * dtype_size + num_col * row * scalar_size; + m_storage->get_data(target, num_col * scalar_size, byte_offset + 4 * i * scalar_size); + } + } else +#endif + { + m_storage->get_data(data, data_size, byte_offset); + } nb::capsule owner(data, [](void* p) noexcept { delete[] reinterpret_cast(p); }); return to_ndarray(data, owner, desc()); From 2915a51dd7ebcaddb7aea3b1cd2f47ec33aaaada Mon Sep 17 00:00:00 2001 From: Fangjun Zhou Date: Sat, 10 May 2025 16:34:37 -0700 Subject: [PATCH 2/9] Revert "Fixed StridedBufferView::to_numpy on Metal devices" This reverts commit ff18b9bd8b83e3f141387f50ed4db905717a22c4. --- src/slangpy_ext/device/cursor_utils.h | 24 ------------------- .../utils/slangpystridedbufferview.cpp | 23 ++---------------- 2 files changed, 2 insertions(+), 45 deletions(-) diff --git a/src/slangpy_ext/device/cursor_utils.h b/src/slangpy_ext/device/cursor_utils.h index 7cb05049..1ebdc476 100644 --- a/src/slangpy_ext/device/cursor_utils.h +++ b/src/slangpy_ext/device/cursor_utils.h @@ -14,30 +14,6 @@ namespace sgl { -inline size_t get_scalar_type_size(TypeReflection::ScalarType type) -{ - switch (type) { - case TypeReflection::ScalarType::int8: - case TypeReflection::ScalarType::uint8: - return 1; - case TypeReflection::ScalarType::int16: - case TypeReflection::ScalarType::uint16: - case TypeReflection::ScalarType::float16: - return 2; - case TypeReflection::ScalarType::bool_: - case TypeReflection::ScalarType::int32: - case TypeReflection::ScalarType::uint32: - case TypeReflection::ScalarType::float32: - return 4; - case TypeReflection::ScalarType::int64: - case TypeReflection::ScalarType::uint64: - case TypeReflection::ScalarType::float64: - return 8; - default: - return 0; - } -} - /// Helper to convert from numpy type mask to slang scalar type. inline std::optional dtype_to_scalar_type(nb::dlpack::dtype dtype) { diff --git a/src/slangpy_ext/utils/slangpystridedbufferview.cpp b/src/slangpy_ext/utils/slangpystridedbufferview.cpp index b2ae5d2a..a833415f 100644 --- a/src/slangpy_ext/utils/slangpystridedbufferview.cpp +++ b/src/slangpy_ext/utils/slangpystridedbufferview.cpp @@ -1,7 +1,6 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception #include -#include "device/cursor_utils.h" #include "nanobind.h" #include "sgl/device/device.h" @@ -363,26 +362,8 @@ nb::ndarray StridedBufferView::to_numpy() const size_t dtype_size = desc().element_layout->stride(); size_t byte_offset = desc().offset * dtype_size; size_t data_size = m_storage->size() - byte_offset; - uint8_t* data = new uint8_t[data_size]; -#if SGL_MACOS - auto type = this->cursor()->element_type(); - bool is_matrix = type->kind() == TypeReflection::Kind::matrix; - uint32_t num_col = type->col_count(); - if (is_matrix && num_col < 4 && num_col != 2) { - auto element_count = this->element_count(); - uint32_t num_row = type->row_count(); - size_t scalar_size = get_scalar_type_size(type->scalar_type()); - for (size_t i = 0; i < num_row * element_count; i++) { - size_t block = i / num_row; - size_t row = i % num_row; - uint8_t* target = data + block * dtype_size + num_col * row * scalar_size; - m_storage->get_data(target, num_col * scalar_size, byte_offset + 4 * i * scalar_size); - } - } else -#endif - { - m_storage->get_data(data, data_size, byte_offset); - } + void* data = new uint8_t[data_size]; + m_storage->get_data(data, data_size, byte_offset); nb::capsule owner(data, [](void* p) noexcept { delete[] reinterpret_cast(p); }); return to_ndarray(data, owner, desc()); From b95db93d7e94112eacff5aeeb9c3ffa713f25b63 Mon Sep 17 00:00:00 2001 From: Fangjun Zhou Date: Sat, 10 May 2025 17:01:37 -0700 Subject: [PATCH 3/9] Fixed StridedBufferView matrix alignment issue on Metal devices Fix the issue in to_ndarra instead. This version should be more efficient and flexible. --- .../utils/slangpystridedbufferview.cpp | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/src/slangpy_ext/utils/slangpystridedbufferview.cpp b/src/slangpy_ext/utils/slangpystridedbufferview.cpp index a833415f..657c99ca 100644 --- a/src/slangpy_ext/utils/slangpystridedbufferview.cpp +++ b/src/slangpy_ext/utils/slangpystridedbufferview.cpp @@ -7,6 +7,7 @@ #include "sgl/device/command.h" #include "sgl/device/buffer_cursor.h" +#include "sgl/device/reflection.h" #include "utils/slangpybuffer.h" namespace sgl::slangpy { @@ -324,7 +325,19 @@ static nb::ndarray to_ndarray(void* data, nb::handle owner, const Str // Buffer with shape (5, ) of struct Foo { ... } -> ndarray of shape (5, sizeof(Foo)) and dtype uint8 bool is_scalar = innermost_layout->type()->kind() == TypeReflection::Kind::scalar; auto dtype_shape = desc.dtype->get_shape(); - auto dtype_strides = dtype_shape.calc_contiguous_strides(); + Shape dtype_strides; +#if SGL_MACOS + auto type_refl = desc.dtype->buffer_type_layout()->type(); + bool is_matrix = type_refl->kind() == TypeReflection::Kind::matrix; + auto row_count = type_refl->row_count(); + auto col_count = type_refl->col_count(); + if (is_matrix && col_count < 4 && col_count != 2) { + dtype_strides = Shape({4, 1}); + } else +#endif + { + dtype_strides = dtype_shape.calc_contiguous_strides(); + } size_t innermost_size = is_scalar ? innermost_layout->stride() : 1; TypeReflection::ScalarType scalar_type From 587332c3d2e058b8199c65cef9463966cf098ed4 Mon Sep 17 00:00:00 2001 From: Fangjun Zhou Date: Sat, 10 May 2025 17:56:50 -0700 Subject: [PATCH 4/9] Fixed StridedBufferView matrix alignment issue for copy_from_numpy Fixed matrix alignment stride by copying the data from ndarray to a temperary buffer. --- .../utils/slangpystridedbufferview.cpp | 29 ++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/src/slangpy_ext/utils/slangpystridedbufferview.cpp b/src/slangpy_ext/utils/slangpystridedbufferview.cpp index 657c99ca..15f39804 100644 --- a/src/slangpy_ext/utils/slangpystridedbufferview.cpp +++ b/src/slangpy_ext/utils/slangpystridedbufferview.cpp @@ -1,6 +1,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +#include #include +#include #include "nanobind.h" #include "sgl/device/device.h" @@ -410,7 +412,32 @@ void StridedBufferView::copy_from_numpy(nb::ndarray data) size_t buffer_size = m_storage->size() - byte_offset; SGL_CHECK(data_size <= buffer_size, "Numpy array is larger than the buffer ({} > {})", data_size, buffer_size); - m_storage->set_data(data.data(), data_size, byte_offset); +#if SGL_MACOS + auto type_refl = this->desc().dtype->buffer_type_layout()->type(); + bool is_matrix = type_refl->kind() == TypeReflection::Kind::matrix; + auto row_count = type_refl->row_count(); + auto col_count = type_refl->col_count(); + if (is_matrix && col_count < 4 && col_count != 2) { + // Get dlpack type from scalar type. + ref innermost = innermost_type(this->desc().dtype); + ref innermost_layout = innermost->buffer_type_layout(); + size_t scalar_size = innermost_layout->stride(); + // Temporary buffer. + std::vector buffer(data_size / col_count * 4); + // Copy row by row. + for (size_t i = 0; i < data_size / (col_count * scalar_size); i++) { + std::memcpy( + buffer.data() + i * 4 * scalar_size, + static_cast(data.data()) + i * col_count * scalar_size, + col_count * scalar_size + ); + } + m_storage->set_data(buffer.data(), buffer.size(), byte_offset); + } else +#endif + { + m_storage->set_data(data.data(), data_size, byte_offset); + } } } // namespace sgl::slangpy From b2f0fb4734f832377768d7873cd6d24e539034f5 Mon Sep 17 00:00:00 2001 From: Fangjun Zhou Date: Mon, 12 May 2025 21:06:58 -0700 Subject: [PATCH 5/9] Revert "Fixed StridedBufferView matrix alignment issue for copy_from_numpy" This reverts commit 3f9d5bd26a76fc9042cb40851abdf9d77b43b95f. --- .../utils/slangpystridedbufferview.cpp | 29 +------------------ 1 file changed, 1 insertion(+), 28 deletions(-) diff --git a/src/slangpy_ext/utils/slangpystridedbufferview.cpp b/src/slangpy_ext/utils/slangpystridedbufferview.cpp index 15f39804..657c99ca 100644 --- a/src/slangpy_ext/utils/slangpystridedbufferview.cpp +++ b/src/slangpy_ext/utils/slangpystridedbufferview.cpp @@ -1,8 +1,6 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -#include #include -#include #include "nanobind.h" #include "sgl/device/device.h" @@ -412,32 +410,7 @@ void StridedBufferView::copy_from_numpy(nb::ndarray data) size_t buffer_size = m_storage->size() - byte_offset; SGL_CHECK(data_size <= buffer_size, "Numpy array is larger than the buffer ({} > {})", data_size, buffer_size); -#if SGL_MACOS - auto type_refl = this->desc().dtype->buffer_type_layout()->type(); - bool is_matrix = type_refl->kind() == TypeReflection::Kind::matrix; - auto row_count = type_refl->row_count(); - auto col_count = type_refl->col_count(); - if (is_matrix && col_count < 4 && col_count != 2) { - // Get dlpack type from scalar type. - ref innermost = innermost_type(this->desc().dtype); - ref innermost_layout = innermost->buffer_type_layout(); - size_t scalar_size = innermost_layout->stride(); - // Temporary buffer. - std::vector buffer(data_size / col_count * 4); - // Copy row by row. - for (size_t i = 0; i < data_size / (col_count * scalar_size); i++) { - std::memcpy( - buffer.data() + i * 4 * scalar_size, - static_cast(data.data()) + i * col_count * scalar_size, - col_count * scalar_size - ); - } - m_storage->set_data(buffer.data(), buffer.size(), byte_offset); - } else -#endif - { - m_storage->set_data(data.data(), data_size, byte_offset); - } + m_storage->set_data(data.data(), data_size, byte_offset); } } // namespace sgl::slangpy From 0762b70a9dfc3d8fb12ddedd7ed8f1b8a4c87289 Mon Sep 17 00:00:00 2001 From: Fangjun Zhou Date: Mon, 12 May 2025 21:07:00 -0700 Subject: [PATCH 6/9] Revert "Fixed StridedBufferView matrix alignment issue on Metal devices" This reverts commit 25c9e9f7f697a37a0eafee2b531c1665f60ddd87. --- .../utils/slangpystridedbufferview.cpp | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/src/slangpy_ext/utils/slangpystridedbufferview.cpp b/src/slangpy_ext/utils/slangpystridedbufferview.cpp index 657c99ca..a833415f 100644 --- a/src/slangpy_ext/utils/slangpystridedbufferview.cpp +++ b/src/slangpy_ext/utils/slangpystridedbufferview.cpp @@ -7,7 +7,6 @@ #include "sgl/device/command.h" #include "sgl/device/buffer_cursor.h" -#include "sgl/device/reflection.h" #include "utils/slangpybuffer.h" namespace sgl::slangpy { @@ -325,19 +324,7 @@ static nb::ndarray to_ndarray(void* data, nb::handle owner, const Str // Buffer with shape (5, ) of struct Foo { ... } -> ndarray of shape (5, sizeof(Foo)) and dtype uint8 bool is_scalar = innermost_layout->type()->kind() == TypeReflection::Kind::scalar; auto dtype_shape = desc.dtype->get_shape(); - Shape dtype_strides; -#if SGL_MACOS - auto type_refl = desc.dtype->buffer_type_layout()->type(); - bool is_matrix = type_refl->kind() == TypeReflection::Kind::matrix; - auto row_count = type_refl->row_count(); - auto col_count = type_refl->col_count(); - if (is_matrix && col_count < 4 && col_count != 2) { - dtype_strides = Shape({4, 1}); - } else -#endif - { - dtype_strides = dtype_shape.calc_contiguous_strides(); - } + auto dtype_strides = dtype_shape.calc_contiguous_strides(); size_t innermost_size = is_scalar ? innermost_layout->stride() : 1; TypeReflection::ScalarType scalar_type From 8e12752ba3f58a1ef68fc03dbd9965e2640c4ce9 Mon Sep 17 00:00:00 2001 From: Fangjun Zhou Date: Mon, 12 May 2025 21:44:46 -0700 Subject: [PATCH 7/9] Calculate to_numpy dtype_stride using element type layout reflection --- src/slangpy_ext/utils/slangpystridedbufferview.cpp | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/slangpy_ext/utils/slangpystridedbufferview.cpp b/src/slangpy_ext/utils/slangpystridedbufferview.cpp index a833415f..be8418a8 100644 --- a/src/slangpy_ext/utils/slangpystridedbufferview.cpp +++ b/src/slangpy_ext/utils/slangpystridedbufferview.cpp @@ -1,6 +1,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +#include #include +#include #include "nanobind.h" #include "sgl/device/device.h" @@ -324,7 +326,6 @@ static nb::ndarray to_ndarray(void* data, nb::handle owner, const Str // Buffer with shape (5, ) of struct Foo { ... } -> ndarray of shape (5, sizeof(Foo)) and dtype uint8 bool is_scalar = innermost_layout->type()->kind() == TypeReflection::Kind::scalar; auto dtype_shape = desc.dtype->get_shape(); - auto dtype_strides = dtype_shape.calc_contiguous_strides(); size_t innermost_size = is_scalar ? innermost_layout->stride() : 1; TypeReflection::ScalarType scalar_type @@ -339,9 +340,13 @@ static nb::ndarray to_ndarray(void* data, nb::handle owner, const Str sizes.push_back(desc.shape[i]); strides.push_back(desc.strides[i] * dtype_size / innermost_size); } + // Use cursor reflection to calculate dtype stride. + ref curr_type = desc.dtype; for (size_t i = 0; i < dtype_shape.size(); ++i) { sizes.push_back(dtype_shape[i]); - strides.push_back(dtype_strides[i]); + curr_type = curr_type->element_type(); + auto dtype_stride = curr_type->buffer_type_layout()->stride() / innermost_size; + strides.push_back(dtype_stride); } // If the innermost dtype is not a scalar, add one innermost dimension over // the bytes of the element From 5e1756ddf712bee9a971525ea744274abf9e3131 Mon Sep 17 00:00:00 2001 From: Fangjun Zhou Date: Tue, 13 May 2025 00:00:50 -0700 Subject: [PATCH 8/9] Convert the returned ndarray as contiguous as the stride might be off on metal device --- slangpy/tests/slangpy_tests/test_buffer_views.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/slangpy/tests/slangpy_tests/test_buffer_views.py b/slangpy/tests/slangpy_tests/test_buffer_views.py index f634afc9..7835821b 100644 --- a/slangpy/tests/slangpy_tests/test_buffer_views.py +++ b/slangpy/tests/slangpy_tests/test_buffer_views.py @@ -108,7 +108,7 @@ def test_to_numpy( strides = Shape(unravelled_shape).calc_contiguous_strides() byte_strides = tuple(s * np_dtype.itemsize for s in strides) - ndarray = buffer.to_numpy() + ndarray = np.ascontiguousarray(buffer.to_numpy()) assert ndarray.shape == unravelled_shape assert ndarray.strides == byte_strides assert ndarray.dtype == np_dtype From ca156a9b93eee800dc5f106b3fd121e404055d77 Mon Sep 17 00:00:00 2001 From: Fangjun Zhou Date: Tue, 13 May 2025 00:01:42 -0700 Subject: [PATCH 9/9] Fix StridedBufferView::copy_from_numpy without platform specific macro --- .../utils/slangpystridedbufferview.cpp | 52 ++++++++++++++++++- 1 file changed, 51 insertions(+), 1 deletion(-) diff --git a/src/slangpy_ext/utils/slangpystridedbufferview.cpp b/src/slangpy_ext/utils/slangpystridedbufferview.cpp index be8418a8..dc34e411 100644 --- a/src/slangpy_ext/utils/slangpystridedbufferview.cpp +++ b/src/slangpy_ext/utils/slangpystridedbufferview.cpp @@ -9,6 +9,7 @@ #include "sgl/device/command.h" #include "sgl/device/buffer_cursor.h" +#include "sgl/device/reflection.h" #include "utils/slangpybuffer.h" namespace sgl::slangpy { @@ -66,6 +67,21 @@ ref innermost_type(ref type) return result; } +std::vector> type_stack(ref type) +{ + std::vector> res; + ref curr = type; + while (true) { + res.push_back(curr); + ref child = curr->element_type(); + if (!child || child == curr) { + break; + } + curr = child; + } + return res; +} + StridedBufferView::StridedBufferView(Device* device, const StridedBufferViewDesc& desc, ref storage) { if (!storage) { @@ -393,6 +409,8 @@ nb::ndarray StridedBufferView::to_torch() const void StridedBufferView::copy_from_numpy(nb::ndarray data) { + // StridedBufferView::is_contiguous() == true does not necessarily means the internal buffer is continuous in memory + // (4 element alignment requirement on metal will break this continuity). SGL_CHECK(is_ndarray_contiguous(data), "Source Numpy array must be contiguous"); SGL_CHECK(is_contiguous(), "Destination buffer view must be contiguous"); @@ -402,7 +420,39 @@ void StridedBufferView::copy_from_numpy(nb::ndarray data) size_t buffer_size = m_storage->size() - byte_offset; SGL_CHECK(data_size <= buffer_size, "Numpy array is larger than the buffer ({} > {})", data_size, buffer_size); - m_storage->set_data(data.data(), data_size, byte_offset); + // At this point, the only possible way to break stride in a contiguous buffer is metal buffer alignment in the + // second last dimension. (matrix or vector) + auto kind = desc().dtype->buffer_type_layout()->kind(); + if (kind != TypeReflection::Kind::vector && kind != TypeReflection::Kind::matrix) { + m_storage->set_data(data.data(), data_size, byte_offset); + return; + } + // Get dlpack type from scalar type. + auto stack = type_stack(desc().dtype); + ref innermost = stack[stack.size() - 1]; + ref innermost_layout = innermost->buffer_type_layout(); + size_t innermost_size = innermost_layout->stride(); + ref second_innermost = stack[stack.size() - 2]; + ref second_innermost_layout = second_innermost->buffer_type_layout(); + size_t second_innermost_size = second_innermost_layout->stride(); + // Alignment fits. + if (second_innermost_size == second_innermost_layout->type()->col_count() * innermost_size) { + m_storage->set_data(data.data(), data_size, byte_offset); + return; + } + + // Copy with local buffer. + std::vector buffer(buffer_size); + auto actual_size = second_innermost_layout->type()->col_count() * innermost_size; + // Write element. + for (size_t i = 0; i < data_size / actual_size; i++) { + std::memcpy( + buffer.data() + i * second_innermost_size, + static_cast(data.data()) + i * actual_size, + actual_size + ); + } + m_storage->set_data(buffer.data(), buffer.size(), byte_offset); } } // namespace sgl::slangpy