From 9ae0c55e34e5675fda986dd925727fc931d3a441 Mon Sep 17 00:00:00 2001 From: "Li, Ian" Date: Wed, 2 Jul 2025 13:10:26 -0700 Subject: [PATCH 01/33] Initial non-ABI breaking impl in runtime --- sycl/include/sycl/detail/cg_types.hpp | 1 + .../oneapi/experimental/enqueue_functions.hpp | 11 +- .../ext/oneapi/experimental/enqueue_types.hpp | 33 +++++ sycl/include/sycl/handler.hpp | 12 ++ sycl/source/detail/cg.hpp | 14 +++ sycl/source/detail/memory_manager.cpp | 6 +- sycl/source/detail/memory_manager.hpp | 14 ++- sycl/source/detail/scheduler/commands.cpp | 34 ++++- sycl/source/handler.cpp | 11 ++ sycl/test-e2e/USM/prefetch_exp.cpp | 116 ++++++++++++++++++ 10 files changed, 242 insertions(+), 10 deletions(-) create mode 100644 sycl/include/sycl/ext/oneapi/experimental/enqueue_types.hpp create mode 100644 sycl/test-e2e/USM/prefetch_exp.cpp diff --git a/sycl/include/sycl/detail/cg_types.hpp b/sycl/include/sycl/detail/cg_types.hpp index 843db64dea661..53c9b764c41a4 100644 --- a/sycl/include/sycl/detail/cg_types.hpp +++ b/sycl/include/sycl/detail/cg_types.hpp @@ -67,6 +67,7 @@ enum class CGType : unsigned int { EnqueueNativeCommand = 27, AsyncAlloc = 28, AsyncFree = 29, + PrefetchUSMExpD2H = 30, }; template struct check_fn_signature { diff --git a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp index f599078a6769e..6bda2d58c030d 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp @@ -12,6 +12,7 @@ #include #include +#include #include #include #include @@ -369,11 +370,17 @@ void fill(sycl::queue Q, T *Ptr, const T &Pattern, size_t Count, CodeLoc); } -inline void prefetch(handler &CGH, void *Ptr, size_t NumBytes) { - CGH.prefetch(Ptr, NumBytes); +inline void prefetch(handler &CGH, void *Ptr, size_t NumBytes, + prefetch_type Type = prefetch_type::device) { + if (Type == prefetch_type::device) { + CGH.prefetch(Ptr, NumBytes); + } else { + CGH.ext_oneapi_prefetch_d2h(Ptr, NumBytes); + } } inline void prefetch(queue Q, void *Ptr, size_t NumBytes, + prefetch_type Type = prefetch_type::device, const sycl::detail::code_location &CodeLoc = sycl::detail::code_location::current()) { submit( diff --git a/sycl/include/sycl/ext/oneapi/experimental/enqueue_types.hpp b/sycl/include/sycl/ext/oneapi/experimental/enqueue_types.hpp new file mode 100644 index 0000000000000..dacd45126a7fb --- /dev/null +++ b/sycl/include/sycl/ext/oneapi/experimental/enqueue_types.hpp @@ -0,0 +1,33 @@ +//==--------------- enqueue_types.hpp ---- SYCL enqueue types --------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include + +namespace sycl { +inline namespace _V1 { +namespace ext::oneapi::experimental { + +/// @brief Indicates the destination device for USM data to be prefetched to. +enum class prefetch_type { device, host }; + +inline std::string prefetchTypeToString(prefetch_type value) { + switch (value) { + case sycl::ext::oneapi::experimental::prefetch_type::device: + return "prefetch_type::device"; + case sycl::ext::oneapi::experimental::prefetch_type::host: + return "prefetch_type::host"; + default: + return "prefetch_type::unknown"; + } +} + +} // namespace ext::oneapi::experimental +} // namespace _V1 +} // namespace sycl diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp index 7045cfe670a62..3802a6334ef60 100644 --- a/sycl/include/sycl/handler.hpp +++ b/sycl/include/sycl/handler.hpp @@ -154,6 +154,9 @@ namespace ext ::oneapi ::experimental { template class work_group_memory; template class dynamic_work_group_memory; struct image_descriptor; +enum class prefetch_type; +void prefetch(handler &CGH, void *Ptr, size_t NumBytes, prefetch_type Type); + __SYCL_EXPORT void async_free(sycl::handler &h, void *ptr); __SYCL_EXPORT void *async_malloc(sycl::handler &h, sycl::usm::alloc kind, size_t size); @@ -3682,6 +3685,15 @@ class __SYCL_EXPORT handler { void ext_oneapi_memset2d_impl(void *Dest, size_t DestPitch, int Value, size_t Width, size_t Height); + // Implementation of prefetch from device back to host + void ext_oneapi_prefetch_d2h(const void *Ptr, size_t Count); + + // The enqueue_functions module's prefetch function is friended in order for + // it to be able to call private handler function ext_oneapi_prefetch_d2h. + friend void sycl::ext::oneapi::experimental::prefetch( + handler &CGH, void *Ptr, size_t NumBytes, + sycl::ext::oneapi::experimental::prefetch_type Type); + // Implementation of memcpy to device_global. void memcpyToDeviceGlobal(const void *DeviceGlobalPtr, const void *Src, bool IsDeviceImageScoped, size_t NumBytes, diff --git a/sycl/source/detail/cg.hpp b/sycl/source/detail/cg.hpp index f48f6ace13ddd..141cc029e9836 100644 --- a/sycl/source/detail/cg.hpp +++ b/sycl/source/detail/cg.hpp @@ -408,6 +408,20 @@ class CGPrefetchUSM : public CG { size_t getLength() { return MLength; } }; +/// "Prefetch USM" command group class. +class CGPrefetchUSMExpD2H : public CG { + void *MDst; + size_t MLength; + +public: + CGPrefetchUSMExpD2H(void *DstPtr, size_t Length, CG::StorageInitHelper CGData, + detail::code_location loc = {}) + : CG(CGType::PrefetchUSMExpD2H, std::move(CGData), std::move(loc)), + MDst(DstPtr), MLength(Length) {} + void *getDst() { return MDst; } + size_t getLength() { return MLength; } +}; + /// "Advise USM" command group class. class CGAdviseUSM : public CG { void *MDst; diff --git a/sycl/source/detail/memory_manager.cpp b/sycl/source/detail/memory_manager.cpp index aeba41eee8a17..264061c79d2aa 100644 --- a/sycl/source/detail/memory_manager.cpp +++ b/sycl/source/detail/memory_manager.cpp @@ -924,7 +924,8 @@ void MemoryManager::fill_usm(void *Mem, queue_impl &Queue, size_t Length, void MemoryManager::prefetch_usm(void *Mem, queue_impl &Queue, size_t Length, std::vector DepEvents, - ur_event_handle_t *OutEvent) { + ur_event_handle_t *OutEvent, + sycl::ext::oneapi::experimental::prefetch_type Dest) { const AdapterPtr &Adapter = Queue.getAdapter(); Adapter->call(Queue.getHandleRef(), Mem, Length, 0, DepEvents.size(), @@ -1537,7 +1538,8 @@ void MemoryManager::ext_oneapi_prefetch_usm_cmd_buffer( sycl::detail::context_impl *Context, ur_exp_command_buffer_handle_t CommandBuffer, void *Mem, size_t Length, std::vector Deps, - ur_exp_command_buffer_sync_point_t *OutSyncPoint) { + ur_exp_command_buffer_sync_point_t *OutSyncPoint, + sycl::ext::oneapi::experimental::prefetch_type Dest) { const AdapterPtr &Adapter = Context->getAdapter(); Adapter->call( CommandBuffer, Mem, Length, ur_usm_migration_flags_t(0), Deps.size(), diff --git a/sycl/source/detail/memory_manager.hpp b/sycl/source/detail/memory_manager.hpp index 728aaa0e9ebc7..c0ad42473fafe 100644 --- a/sycl/source/detail/memory_manager.hpp +++ b/sycl/source/detail/memory_manager.hpp @@ -11,6 +11,7 @@ #include #include #include +#include // for prefetch_type #include #include #include @@ -146,9 +147,12 @@ class MemoryManager { std::vector DepEvents, ur_event_handle_t *OutEvent); - static void prefetch_usm(void *Ptr, queue_impl &Queue, size_t Len, - std::vector DepEvents, - ur_event_handle_t *OutEvent); + static void prefetch_usm( + void *Ptr, queue_impl &Queue, size_t Len, + std::vector DepEvents, + ur_event_handle_t *OutEvent, + sycl::ext::oneapi::experimental::prefetch_type Dest = + sycl::ext::oneapi::experimental::prefetch_type::device); static void advise_usm(const void *Ptr, queue_impl &Queue, size_t Len, ur_usm_advice_flags_t Advice, @@ -245,7 +249,9 @@ class MemoryManager { sycl::detail::context_impl *Context, ur_exp_command_buffer_handle_t CommandBuffer, void *Mem, size_t Length, std::vector Deps, - ur_exp_command_buffer_sync_point_t *OutSyncPoint); + ur_exp_command_buffer_sync_point_t *OutSyncPoint, + sycl::ext::oneapi::experimental::prefetch_type Dest = + sycl::ext::oneapi::experimental::prefetch_type::device); static void ext_oneapi_advise_usm_cmd_buffer( sycl::detail::context_impl *Context, diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 427e6c5a53bd6..fc314a9b638bd 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -1911,6 +1911,9 @@ static std::string_view cgTypeToString(detail::CGType Type) { case detail::CGType::PrefetchUSM: return "prefetch usm"; break; + case detail::CGType::PrefetchUSMExpD2H: + return "prefetch usm (experimental, device to host)"; + break; case detail::CGType::CodeplayHostTask: return "host task"; break; @@ -2987,7 +2990,21 @@ ur_result_t ExecCGCommand::enqueueImpCommandBuffer() { if (auto Result = callMemOpHelper( MemoryManager::ext_oneapi_prefetch_usm_cmd_buffer, &MQueue->getContextImpl(), MCommandBuffer, Prefetch->getDst(), - Prefetch->getLength(), std::move(MSyncPointDeps), &OutSyncPoint); + Prefetch->getLength(), std::move(MSyncPointDeps), &OutSyncPoint, + sycl::ext::oneapi::experimental::prefetch_type::device); + Result != UR_RESULT_SUCCESS) + return Result; + + MEvent->setSyncPoint(OutSyncPoint); + return UR_RESULT_SUCCESS; + } + case CGType::PrefetchUSMExpD2H: { + CGPrefetchUSMExpD2H *Prefetch = (CGPrefetchUSMExpD2H *)MCommandGroup.get(); + if (auto Result = callMemOpHelper( + MemoryManager::ext_oneapi_prefetch_usm_cmd_buffer, + &MQueue->getContextImpl(), MCommandBuffer, Prefetch->getDst(), + Prefetch->getLength(), std::move(MSyncPointDeps), &OutSyncPoint, + sycl::ext::oneapi::experimental::prefetch_type::host); Result != UR_RESULT_SUCCESS) return Result; @@ -3298,7 +3315,20 @@ ur_result_t ExecCGCommand::enqueueImpQueue() { CGPrefetchUSM *Prefetch = (CGPrefetchUSM *)MCommandGroup.get(); if (auto Result = callMemOpHelper( MemoryManager::prefetch_usm, Prefetch->getDst(), *MQueue, - Prefetch->getLength(), std::move(RawEvents), Event); + Prefetch->getLength(), std::move(RawEvents), Event, + sycl::ext::oneapi::experimental::prefetch_type::device); + Result != UR_RESULT_SUCCESS) + return Result; + + SetEventHandleOrDiscard(); + return UR_RESULT_SUCCESS; + } + case CGType::PrefetchUSMExpD2H: { + CGPrefetchUSM *Prefetch = (CGPrefetchUSM *)MCommandGroup.get(); + if (auto Result = callMemOpHelper( + MemoryManager::prefetch_usm, Prefetch->getDst(), *MQueue, + Prefetch->getLength(), std::move(RawEvents), Event, + sycl::ext::oneapi::experimental::prefetch_type::host); Result != UR_RESULT_SUCCESS) return Result; diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index 8724e829fe326..994d0fcdbaa45 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -702,6 +702,10 @@ event handler::finalize() { CommandGroup.reset(new detail::CGPrefetchUSM( MDstPtr, MLength, std::move(impl->CGData), MCodeLoc)); break; + case detail::CGType::PrefetchUSMExpD2H: + CommandGroup.reset(new detail::CGPrefetchUSMExpD2H( + MDstPtr, MLength, std::move(impl->CGData), MCodeLoc)); + break; case detail::CGType::AdviseUSM: CommandGroup.reset(new detail::CGAdviseUSM(MDstPtr, MLength, impl->MAdvice, std::move(impl->CGData), @@ -1450,6 +1454,13 @@ void handler::prefetch(const void *Ptr, size_t Count) { setType(detail::CGType::PrefetchUSM); } +void handler::ext_oneapi_prefetch_d2h(const void *Ptr, size_t Count) { + throwIfActionIsCreated(); + MDstPtr = const_cast(Ptr); + MLength = Count; + setType(detail::CGType::PrefetchUSMExpD2H); +} + void handler::mem_advise(const void *Ptr, size_t Count, int Advice) { throwIfActionIsCreated(); MDstPtr = const_cast(Ptr); diff --git a/sycl/test-e2e/USM/prefetch_exp.cpp b/sycl/test-e2e/USM/prefetch_exp.cpp new file mode 100644 index 0000000000000..bd48007219771 --- /dev/null +++ b/sycl/test-e2e/USM/prefetch_exp.cpp @@ -0,0 +1,116 @@ +//==-------- prefetch_exp.cpp - Experimental 2-way USM prefetch test -------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// RUN: %{build} -o %t1.out +// RUN: %{run} %t1.out + +#include +#include +#include + +using namespace sycl; + +static constexpr int Count = 100; + +int main() { + queue q([](exception_list el) { + for (auto &e : el) + throw e; + }); + + if (!q.get_device().get_info()) { + // USM not supported, skipping test and returning early. + return 0; + } + + float *Src = (float *)malloc_shared(sizeof(float) * Count, q.get_device(), + q.get_context()); + float *Dest = (float *)malloc_shared(sizeof(float) * Count, q.get_device(), + q.get_context()); + for (int i = 0; i < Count; i++) + Src[i] = i; + + { + // Test host-to-device prefetch via prefetch(handler ...). + event InitPrefetch = + ext::oneapi::experimental::submit_with_event(q, [&](handler &CGH) { + ext::oneapi::experimental::prefetch(CGH, Src, sizeof(float) * Count); + }); + + q.submit([&](handler &CGH) { + CGH.depends_on(InitPrefetch); + CGH.single_task([=]() { + // for (int i = 0; i < Count; i++) + // Dest[i] = 2 * Src[i]; + }); + }); + q.wait_and_throw(); + + // for (int i = 0; i < Count; i++) { + // assert(Dest[i] == i * 2); + // } + + // Test device-to-host prefetch via prefetch(handler ...). + // event InitPrefetchBack = q.submit([&](handler &CGH) { + // CGH.single_task([=]() { + // for (int i = 0; i < Count; i++) + // Dest[i] = 4 * Src[i]; + // }); + // }); + + // ext::oneapi::experimental::submit(q, [&](handler &CGH) { + // CGH.depends_on(InitPrefetch); + // ext::oneapi::experimental::prefetch( + // CGH, Dest, sizeof(float) * Count, + // ext::oneapi::experimental::prefetch_type::host); + // }); + // q.wait_and_throw(); + + // for (int i = 0; i < Count; i++) { + // assert(Dest[i] == i * 4); + // } + } + + // { + // // Test host-to-device prefetch via prefetch(queue ...). + // ext::oneapi::experimental::prefetch( + // q, Src, sizeof(float) * Count, + // ext::oneapi::experimental::prefetch_type::device); + // q.wait_and_throw(); + // q.submit([&](handler &CGH) { + // CGH.single_task([=]() { + // for (int i = 0; i < Count; i++) + // Dest[i] = 3 * Src[i]; + // }); + // }); + // q.wait_and_throw(); + + // for (int i = 0; i < Count; i++) { + // assert(Dest[i] == i * 3); + // } + + // // Test device-to-host prefetch via prefetch(queue ...). + // q.submit([&](handler &CGH) { + // CGH.single_task([=]() { + // for (int i = 0; i < Count; i++) + // Dest[i] = 6 * Src[i]; + // }); + // }); + // q.wait_and_throw(); + // ext::oneapi::experimental::prefetch( + // q, Src, sizeof(float) * Count, + // ext::oneapi::experimental::prefetch_type::host); + // q.wait_and_throw(); + + // for (int i = 0; i < Count; i++) { + // assert(Dest[i] == i * 6); + // } + // } + free(Src, q); + free(Dest, q); +} From 59ae7776ded56512bc16d56fef0513e26ff7f252 Mon Sep 17 00:00:00 2001 From: "Li, Ian" Date: Thu, 3 Jul 2025 13:41:06 -0700 Subject: [PATCH 02/33] Initial UR implementation, + opencl adapter --- sycl/test-e2e/USM/prefetch_exp.cpp | 46 ++++++++--------- unified-runtime/include/ur_api.h | 12 +++-- unified-runtime/include/ur_print.hpp | 26 +++++++--- unified-runtime/scripts/core/enqueue.yml | 11 ++-- .../scripts/core/exp-command-buffer.yml | 2 +- .../source/adapters/opencl/common.hpp | 1 + .../adapters/opencl/extension_functions.def | 1 + .../source/adapters/opencl/usm.cpp | 51 ++++++++++--------- unified-runtime/source/ur_api.cpp | 2 +- 9 files changed, 89 insertions(+), 63 deletions(-) diff --git a/sycl/test-e2e/USM/prefetch_exp.cpp b/sycl/test-e2e/USM/prefetch_exp.cpp index bd48007219771..87542ff3bb767 100644 --- a/sycl/test-e2e/USM/prefetch_exp.cpp +++ b/sycl/test-e2e/USM/prefetch_exp.cpp @@ -45,35 +45,35 @@ int main() { q.submit([&](handler &CGH) { CGH.depends_on(InitPrefetch); CGH.single_task([=]() { - // for (int i = 0; i < Count; i++) - // Dest[i] = 2 * Src[i]; + for (int i = 0; i < Count; i++) + Dest[i] = 2 * Src[i]; }); }); q.wait_and_throw(); - // for (int i = 0; i < Count; i++) { - // assert(Dest[i] == i * 2); - // } + for (int i = 0; i < Count; i++) { + assert(Dest[i] == i * 2); + } // Test device-to-host prefetch via prefetch(handler ...). - // event InitPrefetchBack = q.submit([&](handler &CGH) { - // CGH.single_task([=]() { - // for (int i = 0; i < Count; i++) - // Dest[i] = 4 * Src[i]; - // }); - // }); - - // ext::oneapi::experimental::submit(q, [&](handler &CGH) { - // CGH.depends_on(InitPrefetch); - // ext::oneapi::experimental::prefetch( - // CGH, Dest, sizeof(float) * Count, - // ext::oneapi::experimental::prefetch_type::host); - // }); - // q.wait_and_throw(); - - // for (int i = 0; i < Count; i++) { - // assert(Dest[i] == i * 4); - // } + event InitPrefetchBack = q.submit([&](handler &CGH) { + CGH.single_task([=]() { + for (int i = 0; i < Count; i++) + Dest[i] = 4 * Src[i]; + }); + }); + + ext::oneapi::experimental::submit(q, [&](handler &CGH) { + CGH.depends_on(InitPrefetchBack); + ext::oneapi::experimental::prefetch( + CGH, Dest, sizeof(float) * Count, + ext::oneapi::experimental::prefetch_type::host); + }); + q.wait_and_throw(); + + for (int i = 0; i < Count; i++) { + assert(Dest[i] == i * 4); + } } // { diff --git a/unified-runtime/include/ur_api.h b/unified-runtime/include/ur_api.h index 484ca4ef94768..f4273ea907b13 100644 --- a/unified-runtime/include/ur_api.h +++ b/unified-runtime/include/ur_api.h @@ -8596,18 +8596,20 @@ typedef enum ur_map_flag_t { #define UR_MAP_FLAGS_MASK 0xfffffff8 /////////////////////////////////////////////////////////////////////////////// -/// @brief Map flags +/// @brief USM migration flags, indicating the direction data is migrated in typedef uint32_t ur_usm_migration_flags_t; typedef enum ur_usm_migration_flag_t { - /// Default migration TODO: Add more enums! - UR_USM_MIGRATION_FLAG_DEFAULT = UR_BIT(0), + /// Migrate data from host to device + UR_USM_MIGRATION_FLAG_HOST_TO_DEVICE = UR_BIT(0), + /// Migrate data from device to host + UR_USM_MIGRATION_FLAG_DEVICE_TO_HOST = UR_BIT(1), /// @cond UR_USM_MIGRATION_FLAG_FORCE_UINT32 = 0x7fffffff /// @endcond } ur_usm_migration_flag_t; /// @brief Bit Mask for validating ur_usm_migration_flags_t -#define UR_USM_MIGRATION_FLAGS_MASK 0xfffffffe +#define UR_USM_MIGRATION_FLAGS_MASK 0xfffffffc /////////////////////////////////////////////////////////////////////////////// /// @brief Enqueue a command to map a region of the buffer object into the host @@ -11841,7 +11843,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferAppendUSMPrefetchExp( const void *pMemory, /// [in] size in bytes to be fetched. size_t size, - /// [in] USM prefetch flags + /// [in] USM migration flags ur_usm_migration_flags_t flags, /// [in] The number of sync points in the provided dependency list. uint32_t numSyncPointsInWaitList, diff --git a/unified-runtime/include/ur_print.hpp b/unified-runtime/include/ur_print.hpp index f68ca6d086c33..75bfe74215950 100644 --- a/unified-runtime/include/ur_print.hpp +++ b/unified-runtime/include/ur_print.hpp @@ -10968,8 +10968,11 @@ inline ur_result_t printFlag(std::ostream &os, uint32_t flag) { inline std::ostream &operator<<(std::ostream &os, enum ur_usm_migration_flag_t value) { switch (value) { - case UR_USM_MIGRATION_FLAG_DEFAULT: - os << "UR_USM_MIGRATION_FLAG_DEFAULT"; + case UR_USM_MIGRATION_FLAG_HOST_TO_DEVICE: + os << "UR_USM_MIGRATION_FLAG_HOST_TO_DEVICE"; + break; + case UR_USM_MIGRATION_FLAG_DEVICE_TO_HOST: + os << "UR_USM_MIGRATION_FLAG_DEVICE_TO_HOST"; break; default: os << "unknown enumerator"; @@ -10987,15 +10990,26 @@ inline ur_result_t printFlag(std::ostream &os, uint32_t val = flag; bool first = true; - if ((val & UR_USM_MIGRATION_FLAG_DEFAULT) == - (uint32_t)UR_USM_MIGRATION_FLAG_DEFAULT) { - val ^= (uint32_t)UR_USM_MIGRATION_FLAG_DEFAULT; + if ((val & UR_USM_MIGRATION_FLAG_HOST_TO_DEVICE) == + (uint32_t)UR_USM_MIGRATION_FLAG_HOST_TO_DEVICE) { + val ^= (uint32_t)UR_USM_MIGRATION_FLAG_HOST_TO_DEVICE; + if (!first) { + os << " | "; + } else { + first = false; + } + os << UR_USM_MIGRATION_FLAG_HOST_TO_DEVICE; + } + + if ((val & UR_USM_MIGRATION_FLAG_DEVICE_TO_HOST) == + (uint32_t)UR_USM_MIGRATION_FLAG_DEVICE_TO_HOST) { + val ^= (uint32_t)UR_USM_MIGRATION_FLAG_DEVICE_TO_HOST; if (!first) { os << " | "; } else { first = false; } - os << UR_USM_MIGRATION_FLAG_DEFAULT; + os << UR_USM_MIGRATION_FLAG_DEVICE_TO_HOST; } if (val != 0) { std::bitset<32> bits(val); diff --git a/unified-runtime/scripts/core/enqueue.yml b/unified-runtime/scripts/core/enqueue.yml index 20d7d7bc2ab3f..a6148bd366f64 100644 --- a/unified-runtime/scripts/core/enqueue.yml +++ b/unified-runtime/scripts/core/enqueue.yml @@ -915,13 +915,16 @@ etors: value: "$X_BIT(2)" --- #-------------------------------------------------------------------------- type: enum -desc: "Map flags" -class: $xDevice +desc: "USM migration flags, indicating the direction data is migrated in" +class: $xEnqueue name: $x_usm_migration_flags_t etors: - - name: DEFAULT - desc: "Default migration TODO: Add more enums! " + - name: HOST_TO_DEVICE + desc: "Migrate data from host to device" value: "$X_BIT(0)" + - name: DEVICE_TO_HOST + desc: "Migrate data from device to host" + value: "$X_BIT(1)" --- #-------------------------------------------------------------------------- type: function desc: "Enqueue a command to map a region of the buffer object into the host address space and return a pointer to the mapped region" diff --git a/unified-runtime/scripts/core/exp-command-buffer.yml b/unified-runtime/scripts/core/exp-command-buffer.yml index e8f2caa15d59d..a194777f9e40b 100644 --- a/unified-runtime/scripts/core/exp-command-buffer.yml +++ b/unified-runtime/scripts/core/exp-command-buffer.yml @@ -1025,7 +1025,7 @@ params: desc: "[in] size in bytes to be fetched." - type: $x_usm_migration_flags_t name: flags - desc: "[in] USM prefetch flags" + desc: "[in] USM migration flags" - type: uint32_t name: numSyncPointsInWaitList desc: "[in] The number of sync points in the provided dependency list." diff --git a/unified-runtime/source/adapters/opencl/common.hpp b/unified-runtime/source/adapters/opencl/common.hpp index 0cfa916e49273..fc335186fce2a 100644 --- a/unified-runtime/source/adapters/opencl/common.hpp +++ b/unified-runtime/source/adapters/opencl/common.hpp @@ -187,6 +187,7 @@ CONSTFIX char CreateBufferWithPropertiesName[] = CONSTFIX char SetKernelArgMemPointerName[] = "clSetKernelArgMemPointerINTEL"; CONSTFIX char EnqueueMemFillName[] = "clEnqueueMemFillINTEL"; CONSTFIX char EnqueueMemcpyName[] = "clEnqueueMemcpyINTEL"; +CONSTFIX char EnqueueMigrateMemName[] = "clEnqueueMigrateMemINTEL"; CONSTFIX char GetMemAllocInfoName[] = "clGetMemAllocInfoINTEL"; CONSTFIX char SetProgramSpecializationConstantName[] = "clSetProgramSpecializationConstant"; diff --git a/unified-runtime/source/adapters/opencl/extension_functions.def b/unified-runtime/source/adapters/opencl/extension_functions.def index c7b4861807d98..47e85f918a222 100644 --- a/unified-runtime/source/adapters/opencl/extension_functions.def +++ b/unified-runtime/source/adapters/opencl/extension_functions.def @@ -8,6 +8,7 @@ CL_EXTENSION_FUNC(clMemBlockingFreeINTEL) CL_EXTENSION_FUNC(clSetKernelArgMemPointerINTEL) CL_EXTENSION_FUNC(clEnqueueMemFillINTEL) CL_EXTENSION_FUNC(clEnqueueMemcpyINTEL) +CL_EXTENSION_FUNC(clEnqueueMigrateMemINTEL) CL_EXTENSION_FUNC(clGetMemAllocInfoINTEL) CL_EXTENSION_FUNC(clEnqueueWriteGlobalVariable) CL_EXTENSION_FUNC(clEnqueueReadGlobalVariable) diff --git a/unified-runtime/source/adapters/opencl/usm.cpp b/unified-runtime/source/adapters/opencl/usm.cpp index e3c510c745766..d39b7906bdc0d 100644 --- a/unified-runtime/source/adapters/opencl/usm.cpp +++ b/unified-runtime/source/adapters/opencl/usm.cpp @@ -524,36 +524,41 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch( [[maybe_unused]] ur_usm_migration_flags_t flags, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + // Have to look up the context from the kernel + cl_context CLContext = hQueue->Context->CLContext; + + clEnqueueMigrateMemINTEL_fn EnqueueMigrateMem = nullptr; + UR_RETURN_ON_FAILURE( + cl_ext::getExtFuncFromContext( + CLContext, ur::cl::getAdapter()->fnCache.clEnqueueMigrateMemINTELCache, + cl_ext::EnqueueMigrateMemName, &EnqueueMigrateMem)); + cl_event Event; std::vector CLWaitEvents(numEventsInWaitList); for (uint32_t i = 0; i < numEventsInWaitList; i++) { CLWaitEvents[i] = phEventWaitList[i]->CLEvent; } - CL_RETURN_ON_FAILURE(clEnqueueMarkerWithWaitList( - hQueue->CLQueue, numEventsInWaitList, CLWaitEvents.data(), - ifUrEvent(phEvent, Event))); - UR_RETURN_ON_FAILURE(createUREvent(Event, hQueue->Context, hQueue, phEvent)); - return UR_RESULT_SUCCESS; - /* - // Use this once impls support it. - // Have to look up the context from the kernel - cl_context CLContext = hQueue->Context; - clEnqueueMigrateMemINTEL_fn FuncPtr; - ur_result_t Err = cl_ext::getExtFuncFromContext( - CLContext, "clEnqueueMigrateMemINTEL", &FuncPtr); - - ur_result_t RetVal; - if (Err != UR_RESULT_SUCCESS) { - RetVal = Err; - } else { - RetVal = map_cl_error_to_ur( - FuncPtr(hQueue->CLQueue, pMem, size, flags, - numEventsInWaitList, - reinterpret_cast(phEventWaitList), - reinterpret_cast(phEvent))); + cl_mem_migration_flags MigrationFlag; + switch (flags) { + case UR_USM_MIGRATION_FLAG_HOST_TO_DEVICE: + MigrationFlag = 0; + break; + case UR_USM_MIGRATION_FLAG_DEVICE_TO_HOST: + MigrationFlag = CL_MIGRATE_MEM_OBJECT_HOST; + break; + default: + cl_adapter::setErrorMessage("Invalid USM migration flag", + UR_RESULT_ERROR_INVALID_ENUMERATION); + return UR_RESULT_ERROR_INVALID_ENUMERATION; } - */ + + CL_RETURN_ON_FAILURE(EnqueueMigrateMem( + hQueue->CLQueue, pMem, size, MigrationFlag, numEventsInWaitList, + CLWaitEvents.data(), ifUrEvent(phEvent, Event))); + + UR_RETURN_ON_FAILURE(createUREvent(Event, hQueue->Context, hQueue, phEvent)); + return UR_RESULT_SUCCESS; } UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMAdvise( diff --git a/unified-runtime/source/ur_api.cpp b/unified-runtime/source/ur_api.cpp index fc2889cee9a94..4c9db9f122ab3 100644 --- a/unified-runtime/source/ur_api.cpp +++ b/unified-runtime/source/ur_api.cpp @@ -8205,7 +8205,7 @@ ur_result_t UR_APICALL urCommandBufferAppendUSMPrefetchExp( const void *pMemory, /// [in] size in bytes to be fetched. size_t size, - /// [in] USM prefetch flags + /// [in] USM migration flags ur_usm_migration_flags_t flags, /// [in] The number of sync points in the provided dependency list. uint32_t numSyncPointsInWaitList, From 6e6fe08d1ce47cf7b14e4abab161ff9c271331d1 Mon Sep 17 00:00:00 2001 From: "Li, Ian" Date: Mon, 7 Jul 2025 06:36:45 -0700 Subject: [PATCH 03/33] Add migration flags to memorymanager --- sycl/source/detail/memory_manager.cpp | 12 ++++++++++-- unified-runtime/source/adapters/opencl/usm.cpp | 2 +- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/sycl/source/detail/memory_manager.cpp b/sycl/source/detail/memory_manager.cpp index 264061c79d2aa..180ef88459110 100644 --- a/sycl/source/detail/memory_manager.cpp +++ b/sycl/source/detail/memory_manager.cpp @@ -927,8 +927,12 @@ void MemoryManager::prefetch_usm(void *Mem, queue_impl &Queue, size_t Length, ur_event_handle_t *OutEvent, sycl::ext::oneapi::experimental::prefetch_type Dest) { const AdapterPtr &Adapter = Queue.getAdapter(); + ur_usm_migration_flags_t MigrationFlag = + (Dest == sycl::ext::oneapi::experimental::prefetch_type::device) + ? UR_USM_MIGRATION_FLAG_HOST_TO_DEVICE + : UR_USM_MIGRATION_FLAG_DEVICE_TO_HOST; Adapter->call(Queue.getHandleRef(), Mem, - Length, 0, DepEvents.size(), + Length, MigrationFlag, DepEvents.size(), DepEvents.data(), OutEvent); } @@ -1541,8 +1545,12 @@ void MemoryManager::ext_oneapi_prefetch_usm_cmd_buffer( ur_exp_command_buffer_sync_point_t *OutSyncPoint, sycl::ext::oneapi::experimental::prefetch_type Dest) { const AdapterPtr &Adapter = Context->getAdapter(); + ur_usm_migration_flags_t MigrationFlag = + (Dest == sycl::ext::oneapi::experimental::prefetch_type::device) + ? UR_USM_MIGRATION_FLAG_HOST_TO_DEVICE + : UR_USM_MIGRATION_FLAG_DEVICE_TO_HOST; Adapter->call( - CommandBuffer, Mem, Length, ur_usm_migration_flags_t(0), Deps.size(), + CommandBuffer, Mem, Length, MigrationFlag, Deps.size(), Deps.data(), 0, nullptr, OutSyncPoint, nullptr, nullptr); } diff --git a/unified-runtime/source/adapters/opencl/usm.cpp b/unified-runtime/source/adapters/opencl/usm.cpp index d39b7906bdc0d..c1d428be72d15 100644 --- a/unified-runtime/source/adapters/opencl/usm.cpp +++ b/unified-runtime/source/adapters/opencl/usm.cpp @@ -533,7 +533,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch( CLContext, ur::cl::getAdapter()->fnCache.clEnqueueMigrateMemINTELCache, cl_ext::EnqueueMigrateMemName, &EnqueueMigrateMem)); - cl_event Event; + cl_event Event = nullptr; std::vector CLWaitEvents(numEventsInWaitList); for (uint32_t i = 0; i < numEventsInWaitList; i++) { CLWaitEvents[i] = phEventWaitList[i]->CLEvent; From fb827be2d95032575f7ecdc6fcc909344a61906c Mon Sep 17 00:00:00 2001 From: "Li, Ian" Date: Mon, 7 Jul 2025 13:54:39 -0700 Subject: [PATCH 04/33] Add CUDA adapter impl --- .../source/adapters/cuda/enqueue.cpp | 20 ++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/unified-runtime/source/adapters/cuda/enqueue.cpp b/unified-runtime/source/adapters/cuda/enqueue.cpp index 8eb00ccab2ca1..2724282d6689c 100644 --- a/unified-runtime/source/adapters/cuda/enqueue.cpp +++ b/unified-runtime/source/adapters/cuda/enqueue.cpp @@ -1503,14 +1503,28 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemcpy( UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch( ur_queue_handle_t hQueue, const void *pMem, size_t size, - ur_usm_migration_flags_t /*flags*/, uint32_t numEventsInWaitList, + ur_usm_migration_flags_t flags, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + ur_device_handle_t Device = hQueue->getDevice(); + int dstDevice; + switch (flags) { + case UR_USM_MIGRATION_FLAG_HOST_TO_DEVICE: + dstDevice = Device->get(); + break; + case UR_USM_MIGRATION_FLAG_DEVICE_TO_HOST: + dstDevice = CU_DEVICE_CPU; + break; + default: + setErrorMessage("Invalid USM migration flag", + UR_RESULT_ERROR_INVALID_ENUMERATION); + return UR_RESULT_ERROR_INVALID_ENUMERATION; + } + size_t PointerRangeSize = 0; UR_CHECK_ERROR(cuPointerGetAttribute( &PointerRangeSize, CU_POINTER_ATTRIBUTE_RANGE_SIZE, (CUdeviceptr)pMem)); UR_ASSERT(size <= PointerRangeSize, UR_RESULT_ERROR_INVALID_SIZE); - ur_device_handle_t Device = hQueue->getDevice(); std::unique_ptr EventPtr{nullptr}; try { @@ -1551,7 +1565,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch( } UR_CHECK_ERROR( - cuMemPrefetchAsync((CUdeviceptr)pMem, size, Device->get(), CuStream)); + cuMemPrefetchAsync((CUdeviceptr)pMem, size, dstDevice, CuStream)); } catch (ur_result_t Err) { return Err; } From c42c2335e92ca5f88dd6eba6ed7ab61ffdd88a44 Mon Sep 17 00:00:00 2001 From: "Li, Ian" Date: Wed, 9 Jul 2025 12:32:05 -0700 Subject: [PATCH 05/33] More preliminary adapter support --- .../source/adapters/hip/enqueue.cpp | 19 +++++++++++++++++-- .../adapters/level_zero/command_buffer.cpp | 14 ++++++++++++-- .../source/adapters/level_zero/memory.cpp | 15 +++++++++++++-- 3 files changed, 42 insertions(+), 6 deletions(-) diff --git a/unified-runtime/source/adapters/hip/enqueue.cpp b/unified-runtime/source/adapters/hip/enqueue.cpp index dc0fac8050eb9..b5504981471bd 100644 --- a/unified-runtime/source/adapters/hip/enqueue.cpp +++ b/unified-runtime/source/adapters/hip/enqueue.cpp @@ -1324,11 +1324,26 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemcpy( UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch( ur_queue_handle_t hQueue, const void *pMem, size_t size, - ur_usm_migration_flags_t /*flags*/, uint32_t numEventsInWaitList, + ur_usm_migration_flags_t flags, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { void *HIPDevicePtr = const_cast(pMem); ur_device_handle_t Device = hQueue->getDevice(); + hipDevice_t TargetDevice; + switch (flags) { + case UR_USM_MIGRATION_FLAG_HOST_TO_DEVICE: + TargetDevice = Device->get(); + break; + case UR_USM_MIGRATION_FLAG_DEVICE_TO_HOST: + // HIP doesn't have a constant for host like CUDA does; -1 is used instead + // https://github.com/ROCm/HIP/blob/3d60bd3a6415c2/docs/how-to/unified_memory.rst#L376 + TargetDevice = hipCpuDeviceId; + break; + default: + setErrorMessage("Invalid USM migration flag", + UR_RESULT_ERROR_INVALID_ENUMERATION); + return UR_RESULT_ERROR_INVALID_ENUMERATION; + } // HIP_POINTER_ATTRIBUTE_RANGE_SIZE is not an attribute in ROCM < 5, // so we can't perform this check for such cases. @@ -1386,7 +1401,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch( } UR_CHECK_ERROR( - hipMemPrefetchAsync(pMem, size, hQueue->getDevice()->get(), HIPStream)); + hipMemPrefetchAsync(pMem, size, TargetDevice, HIPStream)); releaseEvent(); } catch (ur_result_t Err) { return Err; diff --git a/unified-runtime/source/adapters/level_zero/command_buffer.cpp b/unified-runtime/source/adapters/level_zero/command_buffer.cpp index 1e68069db51b2..b1091cc1814aa 100644 --- a/unified-runtime/source/adapters/level_zero/command_buffer.cpp +++ b/unified-runtime/source/adapters/level_zero/command_buffer.cpp @@ -1313,7 +1313,7 @@ ur_result_t urCommandBufferAppendMemBufferReadRectExp( ur_result_t urCommandBufferAppendUSMPrefetchExp( ur_exp_command_buffer_handle_t CommandBuffer, const void *Mem, size_t Size, - ur_usm_migration_flags_t /*Flags*/, uint32_t NumSyncPointsInWaitList, + ur_usm_migration_flags_t Flags, uint32_t NumSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *SyncPointWaitList, uint32_t /*NumEventsInWaitList*/, const ur_event_handle_t * /*EventWaitList*/, @@ -1327,6 +1327,16 @@ ur_result_t urCommandBufferAppendUSMPrefetchExp( UR_COMMAND_USM_PREFETCH, CommandBuffer, CommandBuffer->ZeComputeCommandList, NumSyncPointsInWaitList, SyncPointWaitList, true, RetSyncPoint, ZeEventList, ZeLaunchEvent)); + switch(Flags) { + case UR_USM_MIGRATION_FLAG_HOST_TO_DEVICE: + break; + case UR_USM_MIGRATION_FLAG_DEVICE_TO_HOST: + UR_LOG(ERR, "commandBufferAppendUSMPrefetch: L0 does not support prefetch to host yet"); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + default: + UR_LOG(ERR, "commandBufferAppendUSMPrefetch: invalid USM migration flag"); + return UR_RESULT_ERROR_INVALID_ENUMERATION; + } if (!ZeEventList.empty()) { ZE2UR_CALL(zeCommandListAppendWaitOnEvents, @@ -1335,7 +1345,7 @@ ur_result_t urCommandBufferAppendUSMPrefetchExp( } // Add the prefetch command to the command-buffer. - // Note that L0 does not handle migration flags. + // TODO Support migration flags after L0 backend support is added. ZE2UR_CALL(zeCommandListAppendMemoryPrefetch, (CommandBuffer->ZeComputeCommandList, Mem, Size)); diff --git a/unified-runtime/source/adapters/level_zero/memory.cpp b/unified-runtime/source/adapters/level_zero/memory.cpp index 3b1158645e77a..e7c2a9e341dc4 100644 --- a/unified-runtime/source/adapters/level_zero/memory.cpp +++ b/unified-runtime/source/adapters/level_zero/memory.cpp @@ -1265,7 +1265,7 @@ ur_result_t urEnqueueUSMPrefetch( /// [in] size in bytes to be fetched size_t Size, /// [in] USM prefetch flags - ur_usm_migration_flags_t /*Flags*/, + ur_usm_migration_flags_t Flags, /// [in] size of the event wait list uint32_t NumEventsInWaitList, /// [in][optional][range(0, numEventsInWaitList)] pointer to a list of @@ -1276,6 +1276,17 @@ ur_result_t urEnqueueUSMPrefetch( /// [in,out][optional] return an event object that identifies this /// particular command instance. ur_event_handle_t *OutEvent) { + switch(Flags) { + case UR_USM_MIGRATION_FLAG_HOST_TO_DEVICE: + break; + case UR_USM_MIGRATION_FLAG_DEVICE_TO_HOST: + UR_LOG(ERR, "enqueueUSMPrefetch: L0 does not support prefetch to host yet"); + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + default: + UR_LOG(ERR, "enqueueUSMPrefetch: invalid USM migration flag"); + return UR_RESULT_ERROR_INVALID_ENUMERATION; + } + // Lock automatically releases when this goes out of scope. std::scoped_lock lock(Queue->Mutex); @@ -1315,7 +1326,7 @@ ur_result_t urEnqueueUSMPrefetch( ZE2UR_CALL(zeCommandListAppendWaitOnEvents, (ZeCommandList, WaitList.Length, WaitList.ZeEventList)); } - // TODO: figure out how to translate "flags" + // TODO: Support migration flags after L0 backend support is added ZE2UR_CALL(zeCommandListAppendMemoryPrefetch, (ZeCommandList, Mem, Size)); // TODO: Level Zero does not have a completion "event" with the prefetch API, From 41d9a6f3faa5bc86d6a1c953395af6110b56d884 Mon Sep 17 00:00:00 2001 From: "Li, Ian" Date: Mon, 14 Jul 2025 13:51:30 -0700 Subject: [PATCH 06/33] Update USM testing --- .../enqueue/urEnqueueUSMPrefetch.cpp | 17 +++++++++-------- .../conformance/exp_command_buffer/commands.cpp | 7 +++++-- .../exp_command_buffer/event_sync.cpp | 2 +- .../conformance/exp_command_buffer/in-order.cpp | 4 ++-- .../exp_command_buffer/update/event_sync.cpp | 2 +- 5 files changed, 18 insertions(+), 14 deletions(-) diff --git a/unified-runtime/test/conformance/enqueue/urEnqueueUSMPrefetch.cpp b/unified-runtime/test/conformance/enqueue/urEnqueueUSMPrefetch.cpp index e0cb371ff09ac..88ef85cd93c4d 100644 --- a/unified-runtime/test/conformance/enqueue/urEnqueueUSMPrefetch.cpp +++ b/unified-runtime/test/conformance/enqueue/urEnqueueUSMPrefetch.cpp @@ -20,7 +20,8 @@ struct urEnqueueUSMPrefetchWithParamTest UUR_DEVICE_TEST_SUITE_WITH_PARAM( urEnqueueUSMPrefetchWithParamTest, - ::testing::Values(UR_USM_MIGRATION_FLAG_DEFAULT), + ::testing::Values(UR_USM_MIGRATION_FLAG_HOST_TO_DEVICE, + UR_USM_MIGRATION_FLAG_DEVICE_TO_HOST), uur::deviceTestWithParamPrinter); TEST_P(urEnqueueUSMPrefetchWithParamTest, Success) { @@ -102,14 +103,14 @@ UUR_INSTANTIATE_DEVICE_TEST_SUITE(urEnqueueUSMPrefetchTest); TEST_P(urEnqueueUSMPrefetchTest, InvalidNullHandleQueue) { ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_NULL_HANDLE, urEnqueueUSMPrefetch(nullptr, ptr, allocation_size, - UR_USM_MIGRATION_FLAG_DEFAULT, 0, + UR_USM_MIGRATION_FLAG_HOST_TO_DEVICE, 0, nullptr, nullptr)); } TEST_P(urEnqueueUSMPrefetchTest, InvalidNullPointerMem) { ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_NULL_POINTER, urEnqueueUSMPrefetch(queue, nullptr, allocation_size, - UR_USM_MIGRATION_FLAG_DEFAULT, 0, + UR_USM_MIGRATION_FLAG_HOST_TO_DEVICE, 0, nullptr, nullptr)); } @@ -123,7 +124,7 @@ TEST_P(urEnqueueUSMPrefetchTest, InvalidEnumeration) { TEST_P(urEnqueueUSMPrefetchTest, InvalidSizeZero) { ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_SIZE, urEnqueueUSMPrefetch(queue, ptr, 0, - UR_USM_MIGRATION_FLAG_DEFAULT, 0, + UR_USM_MIGRATION_FLAG_HOST_TO_DEVICE, 0, nullptr, nullptr)); } @@ -132,14 +133,14 @@ TEST_P(urEnqueueUSMPrefetchTest, InvalidSizeTooLarge) { ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_SIZE, urEnqueueUSMPrefetch(queue, ptr, allocation_size * 2, - UR_USM_MIGRATION_FLAG_DEFAULT, 0, + UR_USM_MIGRATION_FLAG_HOST_TO_DEVICE, 0, nullptr, nullptr)); } TEST_P(urEnqueueUSMPrefetchTest, InvalidEventWaitList) { ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST, urEnqueueUSMPrefetch(queue, ptr, allocation_size, - UR_USM_MIGRATION_FLAG_DEFAULT, 1, + UR_USM_MIGRATION_FLAG_HOST_TO_DEVICE, 1, nullptr, nullptr)); ur_event_handle_t validEvent; @@ -147,12 +148,12 @@ TEST_P(urEnqueueUSMPrefetchTest, InvalidEventWaitList) { ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST, urEnqueueUSMPrefetch(queue, ptr, allocation_size, - UR_USM_MIGRATION_FLAG_DEFAULT, 0, + UR_USM_MIGRATION_FLAG_HOST_TO_DEVICE, 0, &validEvent, nullptr)); ur_event_handle_t inv_evt = nullptr; ASSERT_EQ_RESULT(urEnqueueUSMPrefetch(queue, ptr, allocation_size, - UR_USM_MIGRATION_FLAG_DEFAULT, 1, + UR_USM_MIGRATION_FLAG_HOST_TO_DEVICE, 1, &inv_evt, nullptr), UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST); diff --git a/unified-runtime/test/conformance/exp_command_buffer/commands.cpp b/unified-runtime/test/conformance/exp_command_buffer/commands.cpp index 22ac628c2726f..625b6e7a446c1 100644 --- a/unified-runtime/test/conformance/exp_command_buffer/commands.cpp +++ b/unified-runtime/test/conformance/exp_command_buffer/commands.cpp @@ -140,10 +140,13 @@ TEST_P(urCommandBufferCommandsTest, urCommandBufferAppendMemBufferFillExp) { TEST_P(urCommandBufferCommandsTest, urCommandBufferAppendUSMPrefetchExp) { // No Prefetch command in cl_khr_command_buffer - UUR_KNOWN_FAILURE_ON(uur::OpenCL{}); + // UUR_KNOWN_FAILURE_ON(uur::OpenCL{}); ASSERT_SUCCESS(urCommandBufferAppendUSMPrefetchExp( - cmd_buf_handle, device_ptrs[0], allocation_size, 0, 0, nullptr, 0, + cmd_buf_handle, device_ptrs[0], allocation_size, UR_USM_MIGRATION_FLAG_HOST_TO_DEVICE, 0, nullptr, 0, + nullptr, nullptr, nullptr, nullptr)); + ASSERT_SUCCESS(urCommandBufferAppendUSMPrefetchExp( + cmd_buf_handle, device_ptrs[0], allocation_size, UR_USM_MIGRATION_FLAG_DEVICE_TO_HOST, 0, nullptr, 0, nullptr, nullptr, nullptr, nullptr)); } diff --git a/unified-runtime/test/conformance/exp_command_buffer/event_sync.cpp b/unified-runtime/test/conformance/exp_command_buffer/event_sync.cpp index ba592053876cd..9248d653e5b0c 100644 --- a/unified-runtime/test/conformance/exp_command_buffer/event_sync.cpp +++ b/unified-runtime/test/conformance/exp_command_buffer/event_sync.cpp @@ -426,7 +426,7 @@ TEST_P(CommandEventSyncTest, USMPrefetchExp) { // Test prefetch command waiting on queue event ASSERT_SUCCESS(urCommandBufferAppendUSMPrefetchExp( - cmd_buf_handle, device_ptrs[1], allocation_size, 0 /* migration flags*/, + cmd_buf_handle, device_ptrs[1], allocation_size, UR_USM_MIGRATION_FLAG_HOST_TO_DEVICE, 0, nullptr, 1, &external_events[0], nullptr, &external_events[1], nullptr)); ASSERT_SUCCESS(urCommandBufferFinalizeExp(cmd_buf_handle)); diff --git a/unified-runtime/test/conformance/exp_command_buffer/in-order.cpp b/unified-runtime/test/conformance/exp_command_buffer/in-order.cpp index fd6335197cdf0..45357340ed6df 100644 --- a/unified-runtime/test/conformance/exp_command_buffer/in-order.cpp +++ b/unified-runtime/test/conformance/exp_command_buffer/in-order.cpp @@ -101,7 +101,7 @@ struct urInOrderUSMCommandBufferExpTest : urInOrderCommandBufferExpTest { if (hints) { ASSERT_SUCCESS(urCommandBufferAppendUSMPrefetchExp( in_order_cb, device_ptrs[0], allocation_size, - UR_USM_MIGRATION_FLAG_DEFAULT, 0, nullptr, 0, nullptr, nullptr, + UR_USM_MIGRATION_FLAG_HOST_TO_DEVICE, 0, nullptr, 0, nullptr, nullptr, nullptr, nullptr)); } @@ -124,7 +124,7 @@ struct urInOrderUSMCommandBufferExpTest : urInOrderCommandBufferExpTest { if (hints) { ASSERT_SUCCESS(urCommandBufferAppendUSMPrefetchExp( in_order_cb, device_ptrs[0], allocation_size, - UR_USM_MIGRATION_FLAG_DEFAULT, 0, nullptr, 0, nullptr, nullptr, + UR_USM_MIGRATION_FLAG_HOST_TO_DEVICE, 0, nullptr, 0, nullptr, nullptr, nullptr, nullptr)); } diff --git a/unified-runtime/test/conformance/exp_command_buffer/update/event_sync.cpp b/unified-runtime/test/conformance/exp_command_buffer/update/event_sync.cpp index fe0dc03728545..a518ac03b4655 100644 --- a/unified-runtime/test/conformance/exp_command_buffer/update/event_sync.cpp +++ b/unified-runtime/test/conformance/exp_command_buffer/update/event_sync.cpp @@ -723,7 +723,7 @@ TEST_P(CommandEventSyncUpdateTest, USMPrefetchExp) { // Test prefetch command waiting on queue event ASSERT_SUCCESS(urCommandBufferAppendUSMPrefetchExp( updatable_cmd_buf_handle, device_ptrs[1], allocation_size, - 0 /* migration flags*/, 0, nullptr, 1, &external_events[0], nullptr, + UR_USM_MIGRATION_FLAG_HOST_TO_DEVICE, 0, nullptr, 1, &external_events[0], nullptr, &external_events[1], &command_handles[0])); ASSERT_NE(nullptr, command_handles[0]); ASSERT_SUCCESS(urCommandBufferFinalizeExp(updatable_cmd_buf_handle)); From 742a6363cbd78a866952053377053cf224cd8d3a Mon Sep 17 00:00:00 2001 From: "Li, Ian" Date: Thu, 17 Jul 2025 13:18:43 -0700 Subject: [PATCH 07/33] Revise UR impl to not error, add graph testing --- sycl/source/detail/graph/node_impl.hpp | 12 +++ sycl/source/detail/memory_manager.cpp | 11 +-- .../ext_oneapi_enqueue_functions_prefetch.cpp | 89 +++++++++++++++++++ .../source/adapters/cuda/enqueue.cpp | 3 +- .../source/adapters/hip/enqueue.cpp | 7 +- .../adapters/level_zero/command_buffer.cpp | 10 ++- .../source/adapters/level_zero/memory.cpp | 8 +- .../adapters/level_zero/v2/command_buffer.cpp | 10 +++ .../level_zero/v2/command_list_manager.cpp | 10 ++- .../adapters/level_zero/v2/queue_api.cpp | 10 +++ .../source/adapters/opencl/usm.cpp | 28 +++--- 11 files changed, 159 insertions(+), 39 deletions(-) create mode 100644 sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions_prefetch.cpp diff --git a/sycl/source/detail/graph/node_impl.hpp b/sycl/source/detail/graph/node_impl.hpp index aed90d3f04906..3de9e70a95542 100644 --- a/sycl/source/detail/graph/node_impl.hpp +++ b/sycl/source/detail/graph/node_impl.hpp @@ -62,6 +62,7 @@ inline node_type getNodeTypeFromCG(sycl::detail::CGType CGType) { case sycl::detail::CGType::FillUSM: return node_type::memfill; case sycl::detail::CGType::PrefetchUSM: + case sycl::detail::CGType::PrefetchUSMExpD2H: return node_type::prefetch; case sycl::detail::CGType::AdviseUSM: return node_type::memadvise; @@ -258,6 +259,8 @@ class node_impl : public std::enable_shared_from_this { return createCGCopy(); case sycl::detail::CGType::PrefetchUSM: return createCGCopy(); + case sycl::detail::CGType::PrefetchUSMExpD2H: + return createCGCopy(); case sycl::detail::CGType::AdviseUSM: return createCGCopy(); case sycl::detail::CGType::Copy2DUSM: @@ -671,6 +674,15 @@ class node_impl : public std::enable_shared_from_this { << " Length: " << Prefetch->getLength() << "\\n"; } break; + case sycl::detail::CGType::PrefetchUSMExpD2H: + Stream << "CGPrefetchUSMExpD2H (Experimental, Device to host) \\n"; + if (Verbose) { + sycl::detail::CGPrefetchUSMExpD2H *Prefetch = + static_cast(MCommandGroup.get()); + Stream << "Dst: " << Prefetch->getDst() + << " Length: " << Prefetch->getLength() << "\\n"; + } + break; case sycl::detail::CGType::AdviseUSM: Stream << "CGAdviseUSM \\n"; if (Verbose) { diff --git a/sycl/source/detail/memory_manager.cpp b/sycl/source/detail/memory_manager.cpp index a66916b997110..412472e48fbb1 100644 --- a/sycl/source/detail/memory_manager.cpp +++ b/sycl/source/detail/memory_manager.cpp @@ -1543,22 +1543,15 @@ void MemoryManager::ext_oneapi_prefetch_usm_cmd_buffer( sycl::detail::context_impl *Context, ur_exp_command_buffer_handle_t CommandBuffer, void *Mem, size_t Length, std::vector Deps, -<<<<<<< HEAD ur_exp_command_buffer_sync_point_t *OutSyncPoint, sycl::ext::oneapi::experimental::prefetch_type Dest) { - const AdapterPtr &Adapter = Context->getAdapter(); + adapter_impl &Adapter = Context->getAdapter(); ur_usm_migration_flags_t MigrationFlag = (Dest == sycl::ext::oneapi::experimental::prefetch_type::device) ? UR_USM_MIGRATION_FLAG_HOST_TO_DEVICE : UR_USM_MIGRATION_FLAG_DEVICE_TO_HOST; - Adapter->call( - CommandBuffer, Mem, Length, MigrationFlag, Deps.size(), -======= - ur_exp_command_buffer_sync_point_t *OutSyncPoint) { - adapter_impl &Adapter = Context->getAdapter(); Adapter.call( - CommandBuffer, Mem, Length, ur_usm_migration_flags_t(0), Deps.size(), ->>>>>>> c2eaf0f95b0aac823118912b648fb385593c99b9 + CommandBuffer, Mem, Length, MigrationFlag, Deps.size(), Deps.data(), 0, nullptr, OutSyncPoint, nullptr, nullptr); } diff --git a/sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions_prefetch.cpp b/sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions_prefetch.cpp new file mode 100644 index 0000000000000..06b98ee3320b0 --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions_prefetch.cpp @@ -0,0 +1,89 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +// Tests prefetch functionality in enqueue functions + +#include "../graph_common.hpp" +#include + +static constexpr int N = 100; +static constexpr int Pattern = 42; + +int main() { + queue Q{}; + if (!Q.get_device().get_info()) { + // USM not supported, skipping test and returning early. + return 0; + } + + int *Src = + (int *)malloc_shared(sizeof(int) * N, Q.get_device(), Q.get_context()); + int *Dst = + (int *)malloc_shared(sizeof(int) * N, Q.get_device(), Q.get_context()); + for (int i = 0; i < N; i++) + Src[i] = Pattern; + + { + exp_ext::command_graph Graph{Q.get_context(), Q.get_device(), {}}; + + Graph.begin_recording(Q); + + // Test submitting host-to-device prefetch + event TestH2D = exp_ext::submit_with_event( + Q, [&](handler &CGH) { exp_ext::prefetch(CGH, Src, sizeof(int) * N); }); + + exp_ext::submit(Q, [&](handler &CGH) { + CGH.depends_on(TestH2D); + exp_ext::parallel_for(CGH, range<1>(N), [=](id<1> i) { + Dst[i] = Src[i] * 2; + }); + }); + + Graph.end_recording(); + + auto GraphExec = Graph.finalize(); + + exp_ext::execute_graph(Q, GraphExec); + Q.wait_and_throw(); + } + + // Check host-to-device prefetch results + for (int i = 0; i < N; i++) + assert(Dst[i] == Pattern * 2); + + { + exp_ext::command_graph Graph{Q.get_context(), Q.get_device(), {}}; + + Graph.begin_recording(Q); + + // Test submitting device-to-host prefetch + event TestD2H = exp_ext::submit_with_event(Q, [&](handler &CGH) { + exp_ext::parallel_for(CGH, range<1>(N), [=](id<1> i) { + Dst[i] = Src[i] + 1; + }); + }); + + exp_ext::submit(Q, [&](handler &CGH) { + CGH.depends_on(TestD2H); + exp_ext::prefetch(CGH, Dst, sizeof(int) * N, + exp_ext::prefetch_type::host); + }); + + Graph.end_recording(); + + auto GraphExec = Graph.finalize(); + + exp_ext::execute_graph(Q, GraphExec); + Q.wait_and_throw(); + } + + // Check device-to-host prefetch results + for (int i = 0; i < N; i++) + assert(Dst[i] == Pattern + 1); + + return 0; +} \ No newline at end of file diff --git a/unified-runtime/source/adapters/cuda/enqueue.cpp b/unified-runtime/source/adapters/cuda/enqueue.cpp index 2724282d6689c..e84288ab20663 100644 --- a/unified-runtime/source/adapters/cuda/enqueue.cpp +++ b/unified-runtime/source/adapters/cuda/enqueue.cpp @@ -1506,7 +1506,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch( ur_usm_migration_flags_t flags, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - ur_device_handle_t Device = hQueue->getDevice(); int dstDevice; switch (flags) { case UR_USM_MIGRATION_FLAG_HOST_TO_DEVICE: @@ -1521,6 +1520,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch( return UR_RESULT_ERROR_INVALID_ENUMERATION; } + ur_device_handle_t Device = hQueue->getDevice(); + size_t PointerRangeSize = 0; UR_CHECK_ERROR(cuPointerGetAttribute( &PointerRangeSize, CU_POINTER_ATTRIBUTE_RANGE_SIZE, (CUdeviceptr)pMem)); diff --git a/unified-runtime/source/adapters/hip/enqueue.cpp b/unified-runtime/source/adapters/hip/enqueue.cpp index b5504981471bd..23a27c4349a57 100644 --- a/unified-runtime/source/adapters/hip/enqueue.cpp +++ b/unified-runtime/source/adapters/hip/enqueue.cpp @@ -1327,16 +1327,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch( ur_usm_migration_flags_t flags, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - void *HIPDevicePtr = const_cast(pMem); - ur_device_handle_t Device = hQueue->getDevice(); hipDevice_t TargetDevice; switch (flags) { case UR_USM_MIGRATION_FLAG_HOST_TO_DEVICE: TargetDevice = Device->get(); break; case UR_USM_MIGRATION_FLAG_DEVICE_TO_HOST: - // HIP doesn't have a constant for host like CUDA does; -1 is used instead - // https://github.com/ROCm/HIP/blob/3d60bd3a6415c2/docs/how-to/unified_memory.rst#L376 TargetDevice = hipCpuDeviceId; break; default: @@ -1345,6 +1341,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch( return UR_RESULT_ERROR_INVALID_ENUMERATION; } + void *HIPDevicePtr = const_cast(pMem); + ur_device_handle_t Device = hQueue->getDevice(); + // HIP_POINTER_ATTRIBUTE_RANGE_SIZE is not an attribute in ROCM < 5, // so we can't perform this check for such cases. #if HIP_VERSION_MAJOR >= 5 diff --git a/unified-runtime/source/adapters/level_zero/command_buffer.cpp b/unified-runtime/source/adapters/level_zero/command_buffer.cpp index b1091cc1814aa..7385dae6336f1 100644 --- a/unified-runtime/source/adapters/level_zero/command_buffer.cpp +++ b/unified-runtime/source/adapters/level_zero/command_buffer.cpp @@ -1331,8 +1331,8 @@ ur_result_t urCommandBufferAppendUSMPrefetchExp( case UR_USM_MIGRATION_FLAG_HOST_TO_DEVICE: break; case UR_USM_MIGRATION_FLAG_DEVICE_TO_HOST: - UR_LOG(ERR, "commandBufferAppendUSMPrefetch: L0 does not support prefetch to host yet"); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + UR_LOG(WARN, "commandBufferAppendUSMPrefetch: L0 does not support prefetch to host yet"); + break; default: UR_LOG(ERR, "commandBufferAppendUSMPrefetch: invalid USM migration flag"); return UR_RESULT_ERROR_INVALID_ENUMERATION; @@ -1346,8 +1346,10 @@ ur_result_t urCommandBufferAppendUSMPrefetchExp( // Add the prefetch command to the command-buffer. // TODO Support migration flags after L0 backend support is added. - ZE2UR_CALL(zeCommandListAppendMemoryPrefetch, - (CommandBuffer->ZeComputeCommandList, Mem, Size)); + if (Flags == UR_USM_MIGRATION_FLAG_HOST_TO_DEVICE) { + ZE2UR_CALL(zeCommandListAppendMemoryPrefetch, + (CommandBuffer->ZeComputeCommandList, Mem, Size)); + } if (!CommandBuffer->IsInOrderCmdList) { // Level Zero does not have a completion "event" with the prefetch API, diff --git a/unified-runtime/source/adapters/level_zero/memory.cpp b/unified-runtime/source/adapters/level_zero/memory.cpp index e7c2a9e341dc4..550233a050fcd 100644 --- a/unified-runtime/source/adapters/level_zero/memory.cpp +++ b/unified-runtime/source/adapters/level_zero/memory.cpp @@ -1280,8 +1280,8 @@ ur_result_t urEnqueueUSMPrefetch( case UR_USM_MIGRATION_FLAG_HOST_TO_DEVICE: break; case UR_USM_MIGRATION_FLAG_DEVICE_TO_HOST: - UR_LOG(ERR, "enqueueUSMPrefetch: L0 does not support prefetch to host yet"); - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + UR_LOG(WARN, "enqueueUSMPrefetch: L0 does not support prefetch to host yet"); + break; default: UR_LOG(ERR, "enqueueUSMPrefetch: invalid USM migration flag"); return UR_RESULT_ERROR_INVALID_ENUMERATION; @@ -1327,7 +1327,9 @@ ur_result_t urEnqueueUSMPrefetch( (ZeCommandList, WaitList.Length, WaitList.ZeEventList)); } // TODO: Support migration flags after L0 backend support is added - ZE2UR_CALL(zeCommandListAppendMemoryPrefetch, (ZeCommandList, Mem, Size)); + if (Flags == UR_USM_MIGRATION_FLAG_HOST_TO_DEVICE) { + ZE2UR_CALL(zeCommandListAppendMemoryPrefetch, (ZeCommandList, Mem, Size)); + } // TODO: Level Zero does not have a completion "event" with the prefetch API, // so manually add command to signal our event. diff --git a/unified-runtime/source/adapters/level_zero/v2/command_buffer.cpp b/unified-runtime/source/adapters/level_zero/v2/command_buffer.cpp index 92118587d4da3..763e4350ad7ca 100644 --- a/unified-runtime/source/adapters/level_zero/v2/command_buffer.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/command_buffer.cpp @@ -586,6 +586,16 @@ ur_result_t urCommandBufferAppendUSMPrefetchExp( // the same issue as in urCommandBufferAppendKernelLaunchExp + switch(flags) { + case UR_USM_MIGRATION_FLAG_HOST_TO_DEVICE: + break; + case UR_USM_MIGRATION_FLAG_DEVICE_TO_HOST: + UR_LOG(WARN, "commandBufferAppendUSMPrefetch: L0 does not support prefetch to host yet"); + break; + default: + UR_LOG(ERR, "commandBufferAppendUSMPrefetch: invalid USM migration flag"); + return UR_RESULT_ERROR_INVALID_ENUMERATION; + } auto commandListLocked = hCommandBuffer->commandListManager.lock(); auto eventsWaitList = hCommandBuffer->getWaitListFromSyncPoints( pSyncPointWaitList, numSyncPointsInWaitList); diff --git a/unified-runtime/source/adapters/level_zero/v2/command_list_manager.cpp b/unified-runtime/source/adapters/level_zero/v2/command_list_manager.cpp index 728db1360b0bd..2e5c85593f698 100644 --- a/unified-runtime/source/adapters/level_zero/v2/command_list_manager.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/command_list_manager.cpp @@ -283,7 +283,7 @@ ur_result_t ur_command_list_manager::appendUSMFill( } ur_result_t ur_command_list_manager::appendUSMPrefetch( - const void *pMem, size_t size, ur_usm_migration_flags_t /*flags*/, + const void *pMem, size_t size, ur_usm_migration_flags_t flags, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent) { TRACK_SCOPE_LATENCY("ur_command_list_manager::appendUSMPrefetch"); @@ -296,9 +296,11 @@ ur_result_t ur_command_list_manager::appendUSMPrefetch( ZE2UR_CALL(zeCommandListAppendWaitOnEvents, (zeCommandList.get(), numWaitEvents, pWaitEvents)); } - // TODO: figure out how to translate "flags" - ZE2UR_CALL(zeCommandListAppendMemoryPrefetch, - (zeCommandList.get(), pMem, size)); + // TODO: Support migration flags after L0 backend support is added + if (flags == UR_USM_MIGRATION_FLAG_HOST_TO_DEVICE) { + ZE2UR_CALL(zeCommandListAppendMemoryPrefetch, + (zeCommandList.get(), pMem, size)); + } if (zeSignalEvent) { ZE2UR_CALL(zeCommandListAppendSignalEvent, (zeCommandList.get(), zeSignalEvent)); diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_api.cpp b/unified-runtime/source/adapters/level_zero/v2/queue_api.cpp index d043a68dcaec7..1dfe7450f5404 100644 --- a/unified-runtime/source/adapters/level_zero/v2/queue_api.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/queue_api.cpp @@ -261,6 +261,16 @@ ur_result_t urEnqueueUSMPrefetch(ur_queue_handle_t hQueue, const void *pMem, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) try { + switch(flags) { + case UR_USM_MIGRATION_FLAG_HOST_TO_DEVICE: + break; + case UR_USM_MIGRATION_FLAG_DEVICE_TO_HOST: + UR_LOG(WARN, "enqueueUSMPrefetch: L0 does not support prefetch to host yet"); + break; + default: + UR_LOG(ERR, "enqueueUSMPrefetch: invalid USM migration flag"); + return UR_RESULT_ERROR_INVALID_ENUMERATION; + } return hQueue->get().enqueueUSMPrefetch( pMem, size, flags, numEventsInWaitList, phEventWaitList, phEvent); } catch (...) { diff --git a/unified-runtime/source/adapters/opencl/usm.cpp b/unified-runtime/source/adapters/opencl/usm.cpp index c1d428be72d15..0c4de31acf3bf 100644 --- a/unified-runtime/source/adapters/opencl/usm.cpp +++ b/unified-runtime/source/adapters/opencl/usm.cpp @@ -524,6 +524,20 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch( [[maybe_unused]] ur_usm_migration_flags_t flags, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + cl_mem_migration_flags MigrationFlag; + switch (flags) { + case UR_USM_MIGRATION_FLAG_HOST_TO_DEVICE: + MigrationFlag = 0; + break; + case UR_USM_MIGRATION_FLAG_DEVICE_TO_HOST: + MigrationFlag = CL_MIGRATE_MEM_OBJECT_HOST; + break; + default: + cl_adapter::setErrorMessage("Invalid USM migration flag", + UR_RESULT_ERROR_INVALID_ENUMERATION); + return UR_RESULT_ERROR_INVALID_ENUMERATION; + } + // Have to look up the context from the kernel cl_context CLContext = hQueue->Context->CLContext; @@ -539,20 +553,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch( CLWaitEvents[i] = phEventWaitList[i]->CLEvent; } - cl_mem_migration_flags MigrationFlag; - switch (flags) { - case UR_USM_MIGRATION_FLAG_HOST_TO_DEVICE: - MigrationFlag = 0; - break; - case UR_USM_MIGRATION_FLAG_DEVICE_TO_HOST: - MigrationFlag = CL_MIGRATE_MEM_OBJECT_HOST; - break; - default: - cl_adapter::setErrorMessage("Invalid USM migration flag", - UR_RESULT_ERROR_INVALID_ENUMERATION); - return UR_RESULT_ERROR_INVALID_ENUMERATION; - } - CL_RETURN_ON_FAILURE(EnqueueMigrateMem( hQueue->CLQueue, pMem, size, MigrationFlag, numEventsInWaitList, CLWaitEvents.data(), ifUrEvent(phEvent, Event))); From 6654b6ef51a0117eaf679133f3c87f25a5a6efb6 Mon Sep 17 00:00:00 2001 From: "Li, Ian" Date: Thu, 17 Jul 2025 14:31:50 -0700 Subject: [PATCH 08/33] Fix bug --- unified-runtime/source/adapters/cuda/enqueue.cpp | 3 +-- unified-runtime/source/adapters/hip/enqueue.cpp | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/unified-runtime/source/adapters/cuda/enqueue.cpp b/unified-runtime/source/adapters/cuda/enqueue.cpp index e84288ab20663..2724282d6689c 100644 --- a/unified-runtime/source/adapters/cuda/enqueue.cpp +++ b/unified-runtime/source/adapters/cuda/enqueue.cpp @@ -1506,6 +1506,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch( ur_usm_migration_flags_t flags, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + ur_device_handle_t Device = hQueue->getDevice(); int dstDevice; switch (flags) { case UR_USM_MIGRATION_FLAG_HOST_TO_DEVICE: @@ -1520,8 +1521,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch( return UR_RESULT_ERROR_INVALID_ENUMERATION; } - ur_device_handle_t Device = hQueue->getDevice(); - size_t PointerRangeSize = 0; UR_CHECK_ERROR(cuPointerGetAttribute( &PointerRangeSize, CU_POINTER_ATTRIBUTE_RANGE_SIZE, (CUdeviceptr)pMem)); diff --git a/unified-runtime/source/adapters/hip/enqueue.cpp b/unified-runtime/source/adapters/hip/enqueue.cpp index 23a27c4349a57..182d9a487ec23 100644 --- a/unified-runtime/source/adapters/hip/enqueue.cpp +++ b/unified-runtime/source/adapters/hip/enqueue.cpp @@ -1327,6 +1327,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch( ur_usm_migration_flags_t flags, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + ur_device_handle_t Device = hQueue->getDevice(); hipDevice_t TargetDevice; switch (flags) { case UR_USM_MIGRATION_FLAG_HOST_TO_DEVICE: @@ -1340,9 +1341,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch( UR_RESULT_ERROR_INVALID_ENUMERATION); return UR_RESULT_ERROR_INVALID_ENUMERATION; } - void *HIPDevicePtr = const_cast(pMem); - ur_device_handle_t Device = hQueue->getDevice(); // HIP_POINTER_ATTRIBUTE_RANGE_SIZE is not an attribute in ROCM < 5, // so we can't perform this check for such cases. From 96059fc758ac4a023486d1b628d7817044bf8e6a Mon Sep 17 00:00:00 2001 From: "Li, Ian" Date: Fri, 18 Jul 2025 08:06:04 -0700 Subject: [PATCH 09/33] Fix bug in enqueue function header --- sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp index 6bda2d58c030d..2f5ab3c4f8cf7 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp @@ -384,7 +384,7 @@ inline void prefetch(queue Q, void *Ptr, size_t NumBytes, const sycl::detail::code_location &CodeLoc = sycl::detail::code_location::current()) { submit( - std::move(Q), [&](handler &CGH) { prefetch(CGH, Ptr, NumBytes); }, + std::move(Q), [&](handler &CGH) { prefetch(CGH, Ptr, NumBytes, Type); }, CodeLoc); } From b427472bd012418f164034d0f7d0ff1469897bf4 Mon Sep 17 00:00:00 2001 From: "Li, Ian" Date: Fri, 18 Jul 2025 08:18:47 -0700 Subject: [PATCH 10/33] update ur testing --- .../enqueue/urEnqueueUSMPrefetch.cpp | 11 +++++++++ .../exp_command_buffer/commands.cpp | 23 +++++++++++++------ 2 files changed, 27 insertions(+), 7 deletions(-) diff --git a/unified-runtime/test/conformance/enqueue/urEnqueueUSMPrefetch.cpp b/unified-runtime/test/conformance/enqueue/urEnqueueUSMPrefetch.cpp index 88ef85cd93c4d..d6e692ff5c2e5 100644 --- a/unified-runtime/test/conformance/enqueue/urEnqueueUSMPrefetch.cpp +++ b/unified-runtime/test/conformance/enqueue/urEnqueueUSMPrefetch.cpp @@ -31,6 +31,12 @@ TEST_P(urEnqueueUSMPrefetchWithParamTest, Success) { // this file. uur::NativeCPU{}); + // if (getParam() == UR_USM_MIGRATION_FLAG_DEVICE_TO_HOST) { + // // Intel GPU drivers do not currently support prefetching memory from + // // device back to host. + // UUR_KNOWN_FAILURE_ON(uur::LevelZero{}, uur::OpenCL); + // } + ur_event_handle_t prefetch_event = nullptr; ASSERT_SUCCESS(urEnqueueUSMPrefetch(queue, ptr, allocation_size, getParam(), 0, nullptr, &prefetch_event)); @@ -51,6 +57,11 @@ TEST_P(urEnqueueUSMPrefetchWithParamTest, Success) { */ TEST_P(urEnqueueUSMPrefetchWithParamTest, CheckWaitEvent) { UUR_KNOWN_FAILURE_ON(uur::NativeCPU{}); + // if (getParam() == UR_USM_MIGRATION_FLAG_DEVICE_TO_HOST) { + // // Intel GPU drivers do not currently support prefetching memory from + // // device back to host. + // UUR_KNOWN_FAILURE_ON(uur::LevelZero{}, uur::OpenCL); + // } ur_queue_handle_t fill_queue; ASSERT_SUCCESS(urQueueCreate(context, device, nullptr, &fill_queue)); diff --git a/unified-runtime/test/conformance/exp_command_buffer/commands.cpp b/unified-runtime/test/conformance/exp_command_buffer/commands.cpp index 625b6e7a446c1..fc59195aff676 100644 --- a/unified-runtime/test/conformance/exp_command_buffer/commands.cpp +++ b/unified-runtime/test/conformance/exp_command_buffer/commands.cpp @@ -138,16 +138,25 @@ TEST_P(urCommandBufferCommandsTest, urCommandBufferAppendMemBufferFillExp) { 0, nullptr, 0, nullptr, nullptr, nullptr, nullptr)); } -TEST_P(urCommandBufferCommandsTest, urCommandBufferAppendUSMPrefetchExp) { +struct urCommandBufferCommandsPrefetchTest : urCommandBufferCommandsTest { }; + +UUR_DEVICE_TEST_SUITE_WITH_PARAM( + urCommandBufferCommandsPrefetchTest, + ::testing::Values(UR_USM_MIGRATION_FLAG_HOST_TO_DEVICE, + UR_USM_MIGRATION_FLAG_DEVICE_TO_HOST), + uur::deviceTestWithParamPrinter); + +TEST_P(urCommandBufferCommandsPrefetchTest, + urCommandBufferAppendUSMPrefetchExp) { // No Prefetch command in cl_khr_command_buffer - // UUR_KNOWN_FAILURE_ON(uur::OpenCL{}); + UUR_KNOWN_FAILURE_ON(uur::OpenCL{}); + if (getParam() == UR_USM_MIGRATION_FLAG_DEVICE_TO_HOST) { + UUR_KNOWN_FAILURE_ON(uur::LevelZero); + } ASSERT_SUCCESS(urCommandBufferAppendUSMPrefetchExp( - cmd_buf_handle, device_ptrs[0], allocation_size, UR_USM_MIGRATION_FLAG_HOST_TO_DEVICE, 0, nullptr, 0, - nullptr, nullptr, nullptr, nullptr)); - ASSERT_SUCCESS(urCommandBufferAppendUSMPrefetchExp( - cmd_buf_handle, device_ptrs[0], allocation_size, UR_USM_MIGRATION_FLAG_DEVICE_TO_HOST, 0, nullptr, 0, - nullptr, nullptr, nullptr, nullptr)); + cmd_buf_handle, device_ptrs[0], allocation_size, getParam(), 0, nullptr, + 0, nullptr, nullptr, nullptr, nullptr)); } TEST_P(urCommandBufferCommandsTest, urCommandBufferAppendUSMAdviseExp) { From 4f09c40ea7c2b37c287349a94264959e2f75bb64 Mon Sep 17 00:00:00 2001 From: "Li, Ian" Date: Mon, 21 Jul 2025 12:30:41 -0700 Subject: [PATCH 11/33] fix build issue in new command buffer ur test --- .../exp_command_buffer/commands.cpp | 28 +++++++++---------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/unified-runtime/test/conformance/exp_command_buffer/commands.cpp b/unified-runtime/test/conformance/exp_command_buffer/commands.cpp index fc59195aff676..7645fcd952542 100644 --- a/unified-runtime/test/conformance/exp_command_buffer/commands.cpp +++ b/unified-runtime/test/conformance/exp_command_buffer/commands.cpp @@ -138,25 +138,23 @@ TEST_P(urCommandBufferCommandsTest, urCommandBufferAppendMemBufferFillExp) { 0, nullptr, 0, nullptr, nullptr, nullptr, nullptr)); } -struct urCommandBufferCommandsPrefetchTest : urCommandBufferCommandsTest { }; +TEST_P(urCommandBufferCommandsTest, urCommandBufferAppendUSMPrefetchExp) { + //UUR_KNOWN_FAILURE_ON(uur::OpenCL{}); -UUR_DEVICE_TEST_SUITE_WITH_PARAM( - urCommandBufferCommandsPrefetchTest, - ::testing::Values(UR_USM_MIGRATION_FLAG_HOST_TO_DEVICE, - UR_USM_MIGRATION_FLAG_DEVICE_TO_HOST), - uur::deviceTestWithParamPrinter); + ASSERT_SUCCESS(urCommandBufferAppendUSMPrefetchExp( + cmd_buf_handle, device_ptrs[0], allocation_size, + UR_USM_MIGRATION_FLAG_HOST_TO_DEVICE, 0, nullptr, 0, nullptr, nullptr, + nullptr, nullptr)); +} -TEST_P(urCommandBufferCommandsPrefetchTest, - urCommandBufferAppendUSMPrefetchExp) { - // No Prefetch command in cl_khr_command_buffer - UUR_KNOWN_FAILURE_ON(uur::OpenCL{}); - if (getParam() == UR_USM_MIGRATION_FLAG_DEVICE_TO_HOST) { - UUR_KNOWN_FAILURE_ON(uur::LevelZero); - } +TEST_P(urCommandBufferCommandsTest, + urCommandBufferAppendUSMPrefetchExpDeviceToHost) { + UUR_KNOWN_FAILURE_ON(uur::LevelZero); ASSERT_SUCCESS(urCommandBufferAppendUSMPrefetchExp( - cmd_buf_handle, device_ptrs[0], allocation_size, getParam(), 0, nullptr, - 0, nullptr, nullptr, nullptr, nullptr)); + cmd_buf_handle, device_ptrs[0], allocation_size, + UR_USM_MIGRATION_FLAG_DEVICE_TO_HOST, 0, nullptr, 0, nullptr, nullptr, + nullptr, nullptr)); } TEST_P(urCommandBufferCommandsTest, urCommandBufferAppendUSMAdviseExp) { From e3b9e9ed759a6a05cd3a577f828f83b2c3c693c2 Mon Sep 17 00:00:00 2001 From: "Li, Ian" Date: Mon, 21 Jul 2025 13:07:47 -0700 Subject: [PATCH 12/33] Fix bug --- .../test/conformance/exp_command_buffer/commands.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unified-runtime/test/conformance/exp_command_buffer/commands.cpp b/unified-runtime/test/conformance/exp_command_buffer/commands.cpp index 7645fcd952542..7c56ceeb933e3 100644 --- a/unified-runtime/test/conformance/exp_command_buffer/commands.cpp +++ b/unified-runtime/test/conformance/exp_command_buffer/commands.cpp @@ -149,7 +149,7 @@ TEST_P(urCommandBufferCommandsTest, urCommandBufferAppendUSMPrefetchExp) { TEST_P(urCommandBufferCommandsTest, urCommandBufferAppendUSMPrefetchExpDeviceToHost) { - UUR_KNOWN_FAILURE_ON(uur::LevelZero); + UUR_KNOWN_FAILURE_ON(uur::LevelZero{}); ASSERT_SUCCESS(urCommandBufferAppendUSMPrefetchExp( cmd_buf_handle, device_ptrs[0], allocation_size, From ba1f9f6f61ffe3926b521216695d92d9e2e95261 Mon Sep 17 00:00:00 2001 From: "Li, Ian" Date: Mon, 21 Jul 2025 14:46:50 -0700 Subject: [PATCH 13/33] Fix memory leak --- .../ext_oneapi_enqueue_functions_prefetch.cpp | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions_prefetch.cpp b/sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions_prefetch.cpp index 06b98ee3320b0..eb6e3f5c663e0 100644 --- a/sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions_prefetch.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions_prefetch.cpp @@ -1,5 +1,14 @@ // RUN: %{build} -o %t.out // RUN: %{run} %t.out +// +// Opencl currently does not support: +// - Prefetching from device to host for any device, as there is no driver +// support yet +// - Prefetching from host to device on specifically CPU, due to a bug that +// required a spec change: Currently waiting for said spec change to be +// implemented. +// See: https://github.com/KhronosGroup/OpenCL-Docs/pull/1412/files#diff-7e4c12789cfc81c40637d32b7113b0cca2c3ee0beabaabb9acd9da743f7b5780R974 +// // Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG // RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // Extra run to check for immediate-command-list in Level Zero @@ -85,5 +94,8 @@ int main() { for (int i = 0; i < N; i++) assert(Dst[i] == Pattern + 1); + free(Src, Q); + free(Dst, Q); + return 0; -} \ No newline at end of file +} From 294702ca97be1c8b788ada960cc853a3db60d279 Mon Sep 17 00:00:00 2001 From: Ian Li Date: Tue, 22 Jul 2025 10:44:59 -0700 Subject: [PATCH 14/33] Disable opencl adapter --- .../source/adapters/opencl/usm.cpp | 25 ++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/unified-runtime/source/adapters/opencl/usm.cpp b/unified-runtime/source/adapters/opencl/usm.cpp index 0c4de31acf3bf..6964433d84d81 100644 --- a/unified-runtime/source/adapters/opencl/usm.cpp +++ b/unified-runtime/source/adapters/opencl/usm.cpp @@ -524,13 +524,22 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch( [[maybe_unused]] ur_usm_migration_flags_t flags, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - cl_mem_migration_flags MigrationFlag; + // TODO: Uncomment implementation when issues with impl are resolved. + + // cl_mem_migration_flags MigrationFlag; switch (flags) { case UR_USM_MIGRATION_FLAG_HOST_TO_DEVICE: - MigrationFlag = 0; + // Note: currently opencl:cpu will break with this value, but opencl:gpu + // will work just fine. A spec change has been made to address this issue, + // and is waiting to be implemented: + // https://github.com/KhronosGroup/OpenCL-Docs/pull/1412/files#diff-7e4c12789cfc81c40637d32b7113b0cca2c3ee0beabaabb9acd9da743f7b5780R974 + + // MigrationFlag = 0; // OpenCL spec stipulates 0 as host break; case UR_USM_MIGRATION_FLAG_DEVICE_TO_HOST: - MigrationFlag = CL_MIGRATE_MEM_OBJECT_HOST; + // Note: there is currently no driver support for this. + + // MigrationFlag = CL_MIGRATE_MEM_OBJECT_HOST; break; default: cl_adapter::setErrorMessage("Invalid USM migration flag", @@ -538,6 +547,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch( return UR_RESULT_ERROR_INVALID_ENUMERATION; } + /* // Have to look up the context from the kernel cl_context CLContext = hQueue->Context->CLContext; @@ -546,6 +556,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch( cl_ext::getExtFuncFromContext( CLContext, ur::cl::getAdapter()->fnCache.clEnqueueMigrateMemINTELCache, cl_ext::EnqueueMigrateMemName, &EnqueueMigrateMem)); + */ cl_event Event = nullptr; std::vector CLWaitEvents(numEventsInWaitList); @@ -553,9 +564,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch( CLWaitEvents[i] = phEventWaitList[i]->CLEvent; } + /* CL_RETURN_ON_FAILURE(EnqueueMigrateMem( hQueue->CLQueue, pMem, size, MigrationFlag, numEventsInWaitList, CLWaitEvents.data(), ifUrEvent(phEvent, Event))); + */ + + // TODO: when issues with impl are fully resolved, delete this and use + // waitlisting from EnqueueMigrateMem instead. + CL_RETURN_ON_FAILURE(clEnqueueMarkerWithWaitList( + hQueue->CLQueue, numEventsInWaitList, CLWaitEvents.data(), + ifUrEvent(phEvent, Event))); UR_RETURN_ON_FAILURE(createUREvent(Event, hQueue->Context, hQueue, phEvent)); return UR_RESULT_SUCCESS; From 6dbf10a9c65e3a4ef0ca72b718a74c8d9b8fbcd3 Mon Sep 17 00:00:00 2001 From: Ian Li Date: Tue, 22 Jul 2025 11:33:02 -0700 Subject: [PATCH 15/33] Disable opencl enqueue function grpah tests --- .../ext_oneapi_enqueue_functions_prefetch.cpp | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions_prefetch.cpp b/sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions_prefetch.cpp index eb6e3f5c663e0..ea99b113f7dcd 100644 --- a/sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions_prefetch.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions_prefetch.cpp @@ -1,13 +1,8 @@ // RUN: %{build} -o %t.out // RUN: %{run} %t.out // -// Opencl currently does not support: -// - Prefetching from device to host for any device, as there is no driver -// support yet -// - Prefetching from host to device on specifically CPU, due to a bug that -// required a spec change: Currently waiting for said spec change to be -// implemented. -// See: https://github.com/KhronosGroup/OpenCL-Docs/pull/1412/files#diff-7e4c12789cfc81c40637d32b7113b0cca2c3ee0beabaabb9acd9da743f7b5780R974 +// OpenCL currently does not support command buffers: +// UNSUPPORTED: opencl // // Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG // RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} From a2263f6f83ca4d5d8da8ae0738f1f5d274913d71 Mon Sep 17 00:00:00 2001 From: Ian Li Date: Tue, 22 Jul 2025 12:13:43 -0700 Subject: [PATCH 16/33] ammend test --- .../test/conformance/exp_command_buffer/commands.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/unified-runtime/test/conformance/exp_command_buffer/commands.cpp b/unified-runtime/test/conformance/exp_command_buffer/commands.cpp index 7c56ceeb933e3..38546ee4fc963 100644 --- a/unified-runtime/test/conformance/exp_command_buffer/commands.cpp +++ b/unified-runtime/test/conformance/exp_command_buffer/commands.cpp @@ -139,7 +139,8 @@ TEST_P(urCommandBufferCommandsTest, urCommandBufferAppendMemBufferFillExp) { } TEST_P(urCommandBufferCommandsTest, urCommandBufferAppendUSMPrefetchExp) { - //UUR_KNOWN_FAILURE_ON(uur::OpenCL{}); + // No Prefetch command in cl_khr_command_buffer + UUR_KNOWN_FAILURE_ON(uur::OpenCL{}); ASSERT_SUCCESS(urCommandBufferAppendUSMPrefetchExp( cmd_buf_handle, device_ptrs[0], allocation_size, @@ -149,7 +150,9 @@ TEST_P(urCommandBufferCommandsTest, urCommandBufferAppendUSMPrefetchExp) { TEST_P(urCommandBufferCommandsTest, urCommandBufferAppendUSMPrefetchExpDeviceToHost) { - UUR_KNOWN_FAILURE_ON(uur::LevelZero{}); + // No Prefetch command in cl_khr_command_buffer + // No driver support for prefetching from device to host on Intel GPUs + UUR_KNOWN_FAILURE_ON(uur::OpenCL{}, uur::LevelZero{}); ASSERT_SUCCESS(urCommandBufferAppendUSMPrefetchExp( cmd_buf_handle, device_ptrs[0], allocation_size, From 1d16e609adc5441d3641d98e6a5ee15eb7317975 Mon Sep 17 00:00:00 2001 From: Ian Li Date: Wed, 23 Jul 2025 07:53:33 -0700 Subject: [PATCH 17/33] Add breaking changes preview hotpath --- sycl/include/sycl/detail/cg_types.hpp | 4 ++ .../oneapi/experimental/enqueue_functions.hpp | 4 ++ sycl/include/sycl/handler.hpp | 27 ++++++++-- sycl/source/detail/cg.hpp | 23 ++++++++- sycl/source/detail/graph/node_impl.hpp | 25 ++++++++++ sycl/source/detail/scheduler/commands.cpp | 49 ++++++++++++++++--- sycl/source/handler.cpp | 18 +++++++ 7 files changed, 138 insertions(+), 12 deletions(-) diff --git a/sycl/include/sycl/detail/cg_types.hpp b/sycl/include/sycl/detail/cg_types.hpp index 53c9b764c41a4..b082c34fa5cbe 100644 --- a/sycl/include/sycl/detail/cg_types.hpp +++ b/sycl/include/sycl/detail/cg_types.hpp @@ -67,7 +67,11 @@ enum class CGType : unsigned int { EnqueueNativeCommand = 27, AsyncAlloc = 28, AsyncFree = 29, +#ifdef __INTEL_PREVIEW_BREAKING_CHANGES + PrefetchUSMExp = 30, +#else PrefetchUSMExpD2H = 30, +#endif }; template struct check_fn_signature { diff --git a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp index 2f5ab3c4f8cf7..0b82f0ce10fa9 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp @@ -372,11 +372,15 @@ void fill(sycl::queue Q, T *Ptr, const T &Pattern, size_t Count, inline void prefetch(handler &CGH, void *Ptr, size_t NumBytes, prefetch_type Type = prefetch_type::device) { +#ifdef __INTEL_PREVIEW_BREAKING_CHANGES + CGH.ext_oneapi_prefetch_exp(Ptr, NumBytes, Type); +#else if (Type == prefetch_type::device) { CGH.prefetch(Ptr, NumBytes); } else { CGH.ext_oneapi_prefetch_d2h(Ptr, NumBytes); } +#endif } inline void prefetch(queue Q, void *Ptr, size_t NumBytes, diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp index 9f8d78f5d3433..c42a177f37878 100644 --- a/sycl/include/sycl/handler.hpp +++ b/sycl/include/sycl/handler.hpp @@ -3424,6 +3424,10 @@ class __SYCL_EXPORT handler { void *MDstPtr = nullptr; /// Length to copy or fill (for USM operations). size_t MLength = 0; +#ifdef __INTEL_PREVIEW_BREAKING_CHANGES + // Prefetch direction for ext_oneapi_prefetch_exp + ext::oneapi::experimental::prefetch_type MPrefetchType; +#endif /// Pattern that is used to fill memory object in case command type is fill. std::vector MPattern; /// Storage for a lambda or function object. @@ -3690,14 +3694,27 @@ class __SYCL_EXPORT handler { void ext_oneapi_memset2d_impl(void *Dest, size_t DestPitch, int Value, size_t Width, size_t Height); - // Implementation of prefetch from device back to host + +// Implementation of enqueue_functions extension's USM prefetch, allowing for +// prefetching memory from both host to device and vice versa. +#ifdef __INTEL_PREVIEW_BREAKING_CHANGES + // Prefetch implementation that accounts for prefetching both directions, but + // introduces a "prefetch type" field to handler/CG nodes: this results in an + // ABI break. + void ext_oneapi_prefetch_exp(const void *Ptr, size_t Count, + ext::oneapi::experimental::prefetch_type Type); +#else + // Non-ABI breaking implementation that implements prefetching from device to + // host as a separate function. void ext_oneapi_prefetch_d2h(const void *Ptr, size_t Count); + // TODO upon next ABI-breaking cycle, decide which approach to go with. +#endif - // The enqueue_functions module's prefetch function is friended in order for - // it to be able to call private handler function ext_oneapi_prefetch_d2h. - friend void sycl::ext::oneapi::experimental::prefetch( + // Enqueue_functions extension's prefetch function is friended in order to + // call private handler function ext_oneapi_prefetch_d2h. + friend void ext::oneapi::experimental::prefetch( handler &CGH, void *Ptr, size_t NumBytes, - sycl::ext::oneapi::experimental::prefetch_type Type); + ext::oneapi::experimental::prefetch_type Type); // Implementation of memcpy to device_global. void memcpyToDeviceGlobal(const void *DeviceGlobalPtr, const void *Src, diff --git a/sycl/source/detail/cg.hpp b/sycl/source/detail/cg.hpp index 6c67b379c2239..47693d4674f32 100644 --- a/sycl/source/detail/cg.hpp +++ b/sycl/source/detail/cg.hpp @@ -408,7 +408,27 @@ class CGPrefetchUSM : public CG { size_t getLength() { return MLength; } }; -/// "Prefetch USM" command group class. +#ifdef __INTEL_PREVIEW_BREAKING_CHANGES +/// Enqueue_functions extension USM Prefetch command group class +class CGPrefetchUSMExp : public CG { + void *MDst; + size_t MLength; + ext::oneapi::experimental::prefetch_type MPrefetchType; + +public: + CGPrefetchUSMExp(void *DstPtr, size_t Length, CG::StorageInitHelper CGData, + ext::oneapi::experimental::prefetch_type PrefetchType, + detail::code_location loc = {}) + : CG(CGType::PrefetchUSMExp, std::move(CGData), std::move(loc)), + MDst(DstPtr), MLength(Length), MPrefetchType(PrefetchType) {} + void *getDst() { return MDst; } + size_t getLength() { return MLength; } + ext::oneapi::experimental::prefetch_type getPrefetchType() { + return MPrefetchType; + } +}; +#else +/// Enqueue_functions USM device-to-host prefetch command group class class CGPrefetchUSMExpD2H : public CG { void *MDst; size_t MLength; @@ -421,6 +441,7 @@ class CGPrefetchUSMExpD2H : public CG { void *getDst() { return MDst; } size_t getLength() { return MLength; } }; +#endif /// "Advise USM" command group class. class CGAdviseUSM : public CG { diff --git a/sycl/source/detail/graph/node_impl.hpp b/sycl/source/detail/graph/node_impl.hpp index 0424421d4e086..d4c3498cb1e8f 100644 --- a/sycl/source/detail/graph/node_impl.hpp +++ b/sycl/source/detail/graph/node_impl.hpp @@ -16,6 +16,7 @@ #include // for kernel_param_kind_t #include // for node +#include // for prefetchType #include #include @@ -55,7 +56,11 @@ inline node_type getNodeTypeFromCG(sycl::detail::CGType CGType) { case sycl::detail::CGType::FillUSM: return node_type::memfill; case sycl::detail::CGType::PrefetchUSM: +#ifdef __INTEL_PREVIEW_BREAKING_CHANGES + case sycl::detail::CGType::PrefetchUSMExp: +#else case sycl::detail::CGType::PrefetchUSMExpD2H: +#endif return node_type::prefetch; case sycl::detail::CGType::AdviseUSM: return node_type::memadvise; @@ -248,8 +253,13 @@ class node_impl : public std::enable_shared_from_this { return createCGCopy(); case sycl::detail::CGType::PrefetchUSM: return createCGCopy(); +#ifdef __INTEL_PREVIEW_BREAKING_CHANGES + case sycl::detail::CGType::PrefetchUSMExp: + return createCGCopy(); +#else case sycl::detail::CGType::PrefetchUSMExpD2H: return createCGCopy(); +#endif case sycl::detail::CGType::AdviseUSM: return createCGCopy(); case sycl::detail::CGType::Copy2DUSM: @@ -661,6 +671,20 @@ class node_impl : public std::enable_shared_from_this { << " Length: " << Prefetch->getLength() << "\\n"; } break; +#ifdef __INTEL_PREVIEW_BREAKING_CHANGES + case sycl::detail::CGType::PrefetchUSMExp: + Stream << "CGPrefetchUSMExp \\n"; + if (Verbose) { + sycl::detail::CGPrefetchUSMExp *PrefetchExp = + static_cast(MCommandGroup.get()); + Stream << "Dst: " << PrefetchExp->getDst() + << " Length: " << PrefetchExp->getLength() + << " PrefetchType: " + << sycl::ext::oneapi::experimental::prefetchTypeToString( + PrefetchExp->getPrefetchType()) << "\\n"; + } + break; +#else case sycl::detail::CGType::PrefetchUSMExpD2H: Stream << "CGPrefetchUSMExpD2H (Experimental, Device to host) \\n"; if (Verbose) { @@ -670,6 +694,7 @@ class node_impl : public std::enable_shared_from_this { << " Length: " << Prefetch->getLength() << "\\n"; } break; +#endif case sycl::detail::CGType::AdviseUSM: Stream << "CGAdviseUSM \\n"; if (Verbose) { diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 858c5a5a1bbaf..082abaf747e79 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -1913,9 +1913,15 @@ static std::string_view cgTypeToString(detail::CGType Type) { case detail::CGType::PrefetchUSM: return "prefetch usm"; break; +#ifdef __INTEL_PREVIEW_BREAKING_CHANGES + case detail::CGType::PrefetchUSMExp: + return "prefetch usm (experimental)"; + break; +#else case detail::CGType::PrefetchUSMExpD2H: return "prefetch usm (experimental, device to host)"; break; +#endif case detail::CGType::CodeplayHostTask: return "host task"; break; @@ -3000,12 +3006,27 @@ ur_result_t ExecCGCommand::enqueueImpCommandBuffer() { MEvent->setSyncPoint(OutSyncPoint); return UR_RESULT_SUCCESS; } +#ifdef __INTEL_PREVIEW_BREAKING_CHANGES + case CGType::PrefetchUSMExp: { + CGPrefetchUSMExp *PrefetchExp = (CGPrefetchUSMExp *)MCommandGroup.get(); + if (auto Result = callMemOpHelper( + MemoryManager::ext_oneapi_prefetch_usm_cmd_buffer, + &MQueue->getContextImpl(), MCommandBuffer, PrefetchExp->getDst(), + PrefetchExp->getLength(), std::move(MSyncPointDeps), &OutSyncPoint, + PrefetchExp->getPrefetchType()); + Result != UR_RESULT_SUCCESS) + return Result; + + MEvent->setSyncPoint(OutSyncPoint); + return UR_RESULT_SUCCESS; + } +#else case CGType::PrefetchUSMExpD2H: { - CGPrefetchUSMExpD2H *Prefetch = (CGPrefetchUSMExpD2H *)MCommandGroup.get(); + CGPrefetchUSMExpD2H *PrefetchD2H = (CGPrefetchUSMExpD2H *)MCommandGroup.get(); if (auto Result = callMemOpHelper( MemoryManager::ext_oneapi_prefetch_usm_cmd_buffer, - &MQueue->getContextImpl(), MCommandBuffer, Prefetch->getDst(), - Prefetch->getLength(), std::move(MSyncPointDeps), &OutSyncPoint, + &MQueue->getContextImpl(), MCommandBuffer, PrefetchD2H->getDst(), + PrefetchD2H->getLength(), std::move(MSyncPointDeps), &OutSyncPoint, sycl::ext::oneapi::experimental::prefetch_type::host); Result != UR_RESULT_SUCCESS) return Result; @@ -3013,6 +3034,7 @@ ur_result_t ExecCGCommand::enqueueImpCommandBuffer() { MEvent->setSyncPoint(OutSyncPoint); return UR_RESULT_SUCCESS; } +#endif case CGType::AdviseUSM: { CGAdviseUSM *Advise = (CGAdviseUSM *)MCommandGroup.get(); if (auto Result = callMemOpHelper( @@ -3325,11 +3347,25 @@ ur_result_t ExecCGCommand::enqueueImpQueue() { SetEventHandleOrDiscard(); return UR_RESULT_SUCCESS; } +#ifdef __INTEL_PREVIEW_BREAKING_CHANGES + case CGType::PrefetchUSMExp: { + CGPrefetchUSMExp *PrefetchExp = (CGPrefetchUSMExp *)MCommandGroup.get(); + if (auto Result = callMemOpHelper( + MemoryManager::prefetch_usm, PrefetchExp->getDst(), *MQueue, + PrefetchExp->getLength(), std::move(RawEvents), Event, + PrefetchExp->getPrefetchType()); + Result != UR_RESULT_SUCCESS) + return Result; + + SetEventHandleOrDiscard(); + return UR_RESULT_SUCCESS; + } +#else case CGType::PrefetchUSMExpD2H: { - CGPrefetchUSM *Prefetch = (CGPrefetchUSM *)MCommandGroup.get(); + CGPrefetchUSMExpD2H *PrefetchD2H = (CGPrefetchUSMExpD2H *)MCommandGroup.get(); if (auto Result = callMemOpHelper( - MemoryManager::prefetch_usm, Prefetch->getDst(), *MQueue, - Prefetch->getLength(), std::move(RawEvents), Event, + MemoryManager::prefetch_usm, PrefetchD2H->getDst(), *MQueue, + PrefetchD2H->getLength(), std::move(RawEvents), Event, sycl::ext::oneapi::experimental::prefetch_type::host); Result != UR_RESULT_SUCCESS) return Result; @@ -3337,6 +3373,7 @@ ur_result_t ExecCGCommand::enqueueImpQueue() { SetEventHandleOrDiscard(); return UR_RESULT_SUCCESS; } +#endif case CGType::AdviseUSM: { CGAdviseUSM *Advise = (CGAdviseUSM *)MCommandGroup.get(); if (auto Result = diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index 0f475b547ac2d..f1f14ac49ded1 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -722,10 +722,17 @@ event handler::finalize() { CommandGroup.reset(new detail::CGPrefetchUSM( MDstPtr, MLength, std::move(impl->CGData), MCodeLoc)); break; +#ifdef __INTEL_PREVIEW_BREAKING_CHANGES + case detail::CGType::PrefetchUSMExp: + CommandGroup.reset(new detail::CGPrefetchUSMExp( + MDstPtr, MLength, std::move(impl->CGData), MPrefetchType, MCodeLoc)); + break; +#else case detail::CGType::PrefetchUSMExpD2H: CommandGroup.reset(new detail::CGPrefetchUSMExpD2H( MDstPtr, MLength, std::move(impl->CGData), MCodeLoc)); break; +#endif case detail::CGType::AdviseUSM: CommandGroup.reset(new detail::CGAdviseUSM(MDstPtr, MLength, impl->MAdvice, std::move(impl->CGData), @@ -1483,12 +1490,23 @@ void handler::prefetch(const void *Ptr, size_t Count) { setType(detail::CGType::PrefetchUSM); } +#ifdef __INTEL_PREVIEW_BREAKING_CHANGES +void handler::ext_oneapi_prefetch_exp(const void *Ptr, size_t Count, + ext::oneapi::experimental::prefetch_type Type) { + throwIfActionIsCreated(); + MDstPtr = const_cast(Ptr); + MLength = Count; + MPrefetchType = Type; + setType(detail::CGType::PrefetchUSMExp); +} +#else void handler::ext_oneapi_prefetch_d2h(const void *Ptr, size_t Count) { throwIfActionIsCreated(); MDstPtr = const_cast(Ptr); MLength = Count; setType(detail::CGType::PrefetchUSMExpD2H); } +#endif void handler::mem_advise(const void *Ptr, size_t Count, int Advice) { throwIfActionIsCreated(); From 64bec808a2ac6125891bcc0a2817346a5e4ded59 Mon Sep 17 00:00:00 2001 From: Ian Li Date: Wed, 23 Jul 2025 10:35:23 -0700 Subject: [PATCH 18/33] formatting --- sycl/include/sycl/handler.hpp | 9 +- sycl/source/detail/cg.hpp | 10 +- sycl/source/detail/graph/node_impl.hpp | 11 +- sycl/source/detail/memory_manager.cpp | 20 +- sycl/source/detail/memory_manager.hpp | 14 +- sycl/source/detail/scheduler/commands.cpp | 6 +- sycl/source/handler.cpp | 5 +- .../ext_oneapi_enqueue_functions_prefetch.cpp | 12 +- .../source/adapters/hip/enqueue.cpp | 3 +- .../source/adapters/level_zero/adapter.cpp | 339 +++++++++--------- .../adapters/level_zero/command_buffer.cpp | 7 +- .../source/adapters/level_zero/memory.cpp | 5 +- .../source/adapters/level_zero/platform.cpp | 8 +- .../adapters/level_zero/v2/command_buffer.cpp | 10 - .../level_zero/v2/command_list_manager.cpp | 14 +- .../adapters/level_zero/v2/queue_api.cpp | 10 - .../source/adapters/mock/ur_mockddi.cpp | 2 +- .../source/adapters/opencl/usm.cpp | 6 +- .../loader/layers/tracing/ur_trcddi.cpp | 2 +- .../loader/layers/validation/ur_valddi.cpp | 2 +- unified-runtime/source/loader/ur_ldrddi.cpp | 2 +- unified-runtime/source/loader/ur_libapi.cpp | 2 +- .../exp_command_buffer/commands.cpp | 4 +- .../exp_command_buffer/event_sync.cpp | 6 +- .../exp_command_buffer/update/event_sync.cpp | 4 +- 25 files changed, 252 insertions(+), 261 deletions(-) diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp index c42a177f37878..fd93c61e63ae5 100644 --- a/sycl/include/sycl/handler.hpp +++ b/sycl/include/sycl/handler.hpp @@ -3694,15 +3694,14 @@ class __SYCL_EXPORT handler { void ext_oneapi_memset2d_impl(void *Dest, size_t DestPitch, int Value, size_t Width, size_t Height); - // Implementation of enqueue_functions extension's USM prefetch, allowing for -// prefetching memory from both host to device and vice versa. +// prefetching memory from both host to device and vice versa. #ifdef __INTEL_PREVIEW_BREAKING_CHANGES // Prefetch implementation that accounts for prefetching both directions, but // introduces a "prefetch type" field to handler/CG nodes: this results in an // ABI break. void ext_oneapi_prefetch_exp(const void *Ptr, size_t Count, - ext::oneapi::experimental::prefetch_type Type); + ext::oneapi::experimental::prefetch_type Type); #else // Non-ABI breaking implementation that implements prefetching from device to // host as a separate function. @@ -3713,8 +3712,8 @@ class __SYCL_EXPORT handler { // Enqueue_functions extension's prefetch function is friended in order to // call private handler function ext_oneapi_prefetch_d2h. friend void ext::oneapi::experimental::prefetch( - handler &CGH, void *Ptr, size_t NumBytes, - ext::oneapi::experimental::prefetch_type Type); + handler &CGH, void *Ptr, size_t NumBytes, + ext::oneapi::experimental::prefetch_type Type); // Implementation of memcpy to device_global. void memcpyToDeviceGlobal(const void *DeviceGlobalPtr, const void *Src, diff --git a/sycl/source/detail/cg.hpp b/sycl/source/detail/cg.hpp index 47693d4674f32..aa35b4f89403a 100644 --- a/sycl/source/detail/cg.hpp +++ b/sycl/source/detail/cg.hpp @@ -409,7 +409,7 @@ class CGPrefetchUSM : public CG { }; #ifdef __INTEL_PREVIEW_BREAKING_CHANGES -/// Enqueue_functions extension USM Prefetch command group class +/// Enqueue_functions extension USM Prefetch command group class class CGPrefetchUSMExp : public CG { void *MDst; size_t MLength; @@ -417,8 +417,8 @@ class CGPrefetchUSMExp : public CG { public: CGPrefetchUSMExp(void *DstPtr, size_t Length, CG::StorageInitHelper CGData, - ext::oneapi::experimental::prefetch_type PrefetchType, - detail::code_location loc = {}) + ext::oneapi::experimental::prefetch_type PrefetchType, + detail::code_location loc = {}) : CG(CGType::PrefetchUSMExp, std::move(CGData), std::move(loc)), MDst(DstPtr), MLength(Length), MPrefetchType(PrefetchType) {} void *getDst() { return MDst; } @@ -428,14 +428,14 @@ class CGPrefetchUSMExp : public CG { } }; #else -/// Enqueue_functions USM device-to-host prefetch command group class +/// Enqueue_functions USM device-to-host prefetch command group class class CGPrefetchUSMExpD2H : public CG { void *MDst; size_t MLength; public: CGPrefetchUSMExpD2H(void *DstPtr, size_t Length, CG::StorageInitHelper CGData, - detail::code_location loc = {}) + detail::code_location loc = {}) : CG(CGType::PrefetchUSMExpD2H, std::move(CGData), std::move(loc)), MDst(DstPtr), MLength(Length) {} void *getDst() { return MDst; } diff --git a/sycl/source/detail/graph/node_impl.hpp b/sycl/source/detail/graph/node_impl.hpp index d4c3498cb1e8f..9a357f81b0095 100644 --- a/sycl/source/detail/graph/node_impl.hpp +++ b/sycl/source/detail/graph/node_impl.hpp @@ -15,8 +15,8 @@ #include // for CGType #include // for kernel_param_kind_t -#include // for node #include // for prefetchType +#include // for node #include #include @@ -678,10 +678,10 @@ class node_impl : public std::enable_shared_from_this { sycl::detail::CGPrefetchUSMExp *PrefetchExp = static_cast(MCommandGroup.get()); Stream << "Dst: " << PrefetchExp->getDst() - << " Length: " << PrefetchExp->getLength() - << " PrefetchType: " + << " Length: " << PrefetchExp->getLength() << " PrefetchType: " << sycl::ext::oneapi::experimental::prefetchTypeToString( - PrefetchExp->getPrefetchType()) << "\\n"; + PrefetchExp->getPrefetchType()) + << "\\n"; } break; #else @@ -689,7 +689,8 @@ class node_impl : public std::enable_shared_from_this { Stream << "CGPrefetchUSMExpD2H (Experimental, Device to host) \\n"; if (Verbose) { sycl::detail::CGPrefetchUSMExpD2H *Prefetch = - static_cast(MCommandGroup.get()); + static_cast( + MCommandGroup.get()); Stream << "Dst: " << Prefetch->getDst() << " Length: " << Prefetch->getLength() << "\\n"; } diff --git a/sycl/source/detail/memory_manager.cpp b/sycl/source/detail/memory_manager.cpp index 798b98186c282..528989a359c9c 100644 --- a/sycl/source/detail/memory_manager.cpp +++ b/sycl/source/detail/memory_manager.cpp @@ -922,18 +922,18 @@ void MemoryManager::fill_usm(void *Mem, queue_impl &Queue, size_t Length, DepEvents.size(), DepEvents.data(), OutEvent); } -void MemoryManager::prefetch_usm(void *Mem, queue_impl &Queue, size_t Length, - std::vector DepEvents, - ur_event_handle_t *OutEvent, - sycl::ext::oneapi::experimental::prefetch_type Dest) { +void MemoryManager::prefetch_usm( + void *Mem, queue_impl &Queue, size_t Length, + std::vector DepEvents, ur_event_handle_t *OutEvent, + sycl::ext::oneapi::experimental::prefetch_type Dest) { adapter_impl &Adapter = Queue.getAdapter(); ur_usm_migration_flags_t MigrationFlag = (Dest == sycl::ext::oneapi::experimental::prefetch_type::device) ? UR_USM_MIGRATION_FLAG_HOST_TO_DEVICE : UR_USM_MIGRATION_FLAG_DEVICE_TO_HOST; - Adapter.call(Queue.getHandleRef(), Mem, - Length, MigrationFlag, DepEvents.size(), - DepEvents.data(), OutEvent); + Adapter.call( + Queue.getHandleRef(), Mem, Length, MigrationFlag, DepEvents.size(), + DepEvents.data(), OutEvent); } void MemoryManager::advise_usm(const void *Mem, queue_impl &Queue, @@ -1544,7 +1544,7 @@ void MemoryManager::ext_oneapi_prefetch_usm_cmd_buffer( sycl::detail::context_impl *Context, ur_exp_command_buffer_handle_t CommandBuffer, void *Mem, size_t Length, std::vector Deps, - ur_exp_command_buffer_sync_point_t *OutSyncPoint, + ur_exp_command_buffer_sync_point_t *OutSyncPoint, sycl::ext::oneapi::experimental::prefetch_type Dest) { adapter_impl &Adapter = Context->getAdapter(); ur_usm_migration_flags_t MigrationFlag = @@ -1552,8 +1552,8 @@ void MemoryManager::ext_oneapi_prefetch_usm_cmd_buffer( ? UR_USM_MIGRATION_FLAG_HOST_TO_DEVICE : UR_USM_MIGRATION_FLAG_DEVICE_TO_HOST; Adapter.call( - CommandBuffer, Mem, Length, MigrationFlag, Deps.size(), - Deps.data(), 0, nullptr, OutSyncPoint, nullptr, nullptr); + CommandBuffer, Mem, Length, MigrationFlag, Deps.size(), Deps.data(), 0, + nullptr, OutSyncPoint, nullptr, nullptr); } void MemoryManager::ext_oneapi_advise_usm_cmd_buffer( diff --git a/sycl/source/detail/memory_manager.hpp b/sycl/source/detail/memory_manager.hpp index c0ad42473fafe..049394964a3d2 100644 --- a/sycl/source/detail/memory_manager.hpp +++ b/sycl/source/detail/memory_manager.hpp @@ -147,12 +147,12 @@ class MemoryManager { std::vector DepEvents, ur_event_handle_t *OutEvent); - static void prefetch_usm( - void *Ptr, queue_impl &Queue, size_t Len, - std::vector DepEvents, - ur_event_handle_t *OutEvent, - sycl::ext::oneapi::experimental::prefetch_type Dest = - sycl::ext::oneapi::experimental::prefetch_type::device); + static void + prefetch_usm(void *Ptr, queue_impl &Queue, size_t Len, + std::vector DepEvents, + ur_event_handle_t *OutEvent, + sycl::ext::oneapi::experimental::prefetch_type Dest = + sycl::ext::oneapi::experimental::prefetch_type::device); static void advise_usm(const void *Ptr, queue_impl &Queue, size_t Len, ur_usm_advice_flags_t Advice, @@ -251,7 +251,7 @@ class MemoryManager { std::vector Deps, ur_exp_command_buffer_sync_point_t *OutSyncPoint, sycl::ext::oneapi::experimental::prefetch_type Dest = - sycl::ext::oneapi::experimental::prefetch_type::device); + sycl::ext::oneapi::experimental::prefetch_type::device); static void ext_oneapi_advise_usm_cmd_buffer( sycl::detail::context_impl *Context, diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 082abaf747e79..6d043fb88266a 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -3022,7 +3022,8 @@ ur_result_t ExecCGCommand::enqueueImpCommandBuffer() { } #else case CGType::PrefetchUSMExpD2H: { - CGPrefetchUSMExpD2H *PrefetchD2H = (CGPrefetchUSMExpD2H *)MCommandGroup.get(); + CGPrefetchUSMExpD2H *PrefetchD2H = + (CGPrefetchUSMExpD2H *)MCommandGroup.get(); if (auto Result = callMemOpHelper( MemoryManager::ext_oneapi_prefetch_usm_cmd_buffer, &MQueue->getContextImpl(), MCommandBuffer, PrefetchD2H->getDst(), @@ -3362,7 +3363,8 @@ ur_result_t ExecCGCommand::enqueueImpQueue() { } #else case CGType::PrefetchUSMExpD2H: { - CGPrefetchUSMExpD2H *PrefetchD2H = (CGPrefetchUSMExpD2H *)MCommandGroup.get(); + CGPrefetchUSMExpD2H *PrefetchD2H = + (CGPrefetchUSMExpD2H *)MCommandGroup.get(); if (auto Result = callMemOpHelper( MemoryManager::prefetch_usm, PrefetchD2H->getDst(), *MQueue, PrefetchD2H->getLength(), std::move(RawEvents), Event, diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index f1f14ac49ded1..647ab51cf91d2 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -1491,8 +1491,9 @@ void handler::prefetch(const void *Ptr, size_t Count) { } #ifdef __INTEL_PREVIEW_BREAKING_CHANGES -void handler::ext_oneapi_prefetch_exp(const void *Ptr, size_t Count, - ext::oneapi::experimental::prefetch_type Type) { +void handler::ext_oneapi_prefetch_exp( + const void *Ptr, size_t Count, + ext::oneapi::experimental::prefetch_type Type) { throwIfActionIsCreated(); MDstPtr = const_cast(Ptr); MLength = Count; diff --git a/sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions_prefetch.cpp b/sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions_prefetch.cpp index ea99b113f7dcd..dd59858153fd4 100644 --- a/sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions_prefetch.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions_prefetch.cpp @@ -9,7 +9,7 @@ // Extra run to check for immediate-command-list in Level Zero // RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} -// Tests prefetch functionality in enqueue functions +// Tests prefetch functionality in enqueue functions #include "../graph_common.hpp" #include @@ -42,9 +42,8 @@ int main() { exp_ext::submit(Q, [&](handler &CGH) { CGH.depends_on(TestH2D); - exp_ext::parallel_for(CGH, range<1>(N), [=](id<1> i) { - Dst[i] = Src[i] * 2; - }); + exp_ext::parallel_for(CGH, range<1>(N), + [=](id<1> i) { Dst[i] = Src[i] * 2; }); }); Graph.end_recording(); @@ -66,9 +65,8 @@ int main() { // Test submitting device-to-host prefetch event TestD2H = exp_ext::submit_with_event(Q, [&](handler &CGH) { - exp_ext::parallel_for(CGH, range<1>(N), [=](id<1> i) { - Dst[i] = Src[i] + 1; - }); + exp_ext::parallel_for(CGH, range<1>(N), + [=](id<1> i) { Dst[i] = Src[i] + 1; }); }); exp_ext::submit(Q, [&](handler &CGH) { diff --git a/unified-runtime/source/adapters/hip/enqueue.cpp b/unified-runtime/source/adapters/hip/enqueue.cpp index 182d9a487ec23..2206fbbf3eb3a 100644 --- a/unified-runtime/source/adapters/hip/enqueue.cpp +++ b/unified-runtime/source/adapters/hip/enqueue.cpp @@ -1398,8 +1398,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch( return UR_RESULT_SUCCESS; } - UR_CHECK_ERROR( - hipMemPrefetchAsync(pMem, size, TargetDevice, HIPStream)); + UR_CHECK_ERROR(hipMemPrefetchAsync(pMem, size, TargetDevice, HIPStream)); releaseEvent(); } catch (ur_result_t Err) { return Err; diff --git a/unified-runtime/source/adapters/level_zero/adapter.cpp b/unified-runtime/source/adapters/level_zero/adapter.cpp index 388af44695578..9de4138f7e433 100644 --- a/unified-runtime/source/adapters/level_zero/adapter.cpp +++ b/unified-runtime/source/adapters/level_zero/adapter.cpp @@ -373,207 +373,204 @@ ur_adapter_handle_t_::ur_adapter_handle_t_() setEnvVar("ZEL_ENABLE_BASIC_LEAK_CHECKER", "1"); } - uint32_t UserForcedSysManInit = 0; - // Check if the user has disabled the default L0 Env initialization. - const int UrSysManEnvInitEnabled = [&UserForcedSysManInit] { - const char *UrRet = std::getenv("UR_L0_ENABLE_SYSMAN_ENV_DEFAULT"); - if (!UrRet) - return 1; - UserForcedSysManInit &= 1; - return std::atoi(UrRet); - }(); - - // Dynamically load the new L0 apis separately. - // This must be done to avoid attempting to use symbols that do - // not exist in older loader runtimes. + uint32_t UserForcedSysManInit = 0; + // Check if the user has disabled the default L0 Env initialization. + const int UrSysManEnvInitEnabled = [&UserForcedSysManInit] { + const char *UrRet = std::getenv("UR_L0_ENABLE_SYSMAN_ENV_DEFAULT"); + if (!UrRet) + return 1; + UserForcedSysManInit &= 1; + return std::atoi(UrRet); + }(); + + // Dynamically load the new L0 apis separately. + // This must be done to avoid attempting to use symbols that do + // not exist in older loader runtimes. #ifndef UR_STATIC_LEVEL_ZERO #ifdef _WIN32 - processHandle = GetModuleHandle(NULL); + processHandle = GetModuleHandle(NULL); #else - processHandle = nullptr; + processHandle = nullptr; #endif #endif - // Setting these environment variables before running zeInit will enable - // the validation layer in the Level Zero loader. - if (UrL0Debug & UR_L0_DEBUG_VALIDATION) { - setEnvVar("ZE_ENABLE_VALIDATION_LAYER", "1"); - setEnvVar("ZE_ENABLE_PARAMETER_VALIDATION", "1"); - } - - if (getenv("SYCL_ENABLE_PCI") != nullptr) { - UR_LOG( - WARN, - "WARNING: SYCL_ENABLE_PCI is deprecated and no longer needed.\n"); - } + // Setting these environment variables before running zeInit will enable + // the validation layer in the Level Zero loader. + if (UrL0Debug & UR_L0_DEBUG_VALIDATION) { + setEnvVar("ZE_ENABLE_VALIDATION_LAYER", "1"); + setEnvVar("ZE_ENABLE_PARAMETER_VALIDATION", "1"); + } - // TODO: We can still safely recover if something goes wrong during the - // init. Implement handling segfault using sigaction. + if (getenv("SYCL_ENABLE_PCI") != nullptr) { + UR_LOG(WARN, + "WARNING: SYCL_ENABLE_PCI is deprecated and no longer needed.\n"); + } - // We must only initialize the driver once, even if urPlatformGet() is - // called multiple times. Declaring the return value as "static" ensures - // it's only called once. + // TODO: We can still safely recover if something goes wrong during the + // init. Implement handling segfault using sigaction. - // Set ZES_ENABLE_SYSMAN by default if the user has not set it. - if (UrSysManEnvInitEnabled) { - setEnvVar("ZES_ENABLE_SYSMAN", "1"); - } + // We must only initialize the driver once, even if urPlatformGet() is + // called multiple times. Declaring the return value as "static" ensures + // it's only called once. - // Init with all flags set to enable for all driver types to be init in - // the application. - ze_init_flags_t L0InitFlags = ZE_INIT_FLAG_GPU_ONLY; - if (UrL0InitAllDrivers) { - L0InitFlags |= ZE_INIT_FLAG_VPU_ONLY; - } - UR_LOG(DEBUG, "\nzeInit with flags value of {}\n", - static_cast(L0InitFlags)); - ZeInitResult = ZE_CALL_NOCHECK(zeInit, (L0InitFlags)); - if (ZeInitResult != ZE_RESULT_SUCCESS) { - const char *ErrorString = "Unknown"; - zeParseError(ZeInitResult, ErrorString); - UR_LOG(ERR, "\nzeInit failed with {}\n", ErrorString); - } + // Set ZES_ENABLE_SYSMAN by default if the user has not set it. + if (UrSysManEnvInitEnabled) { + setEnvVar("ZES_ENABLE_SYSMAN", "1"); + } - bool useInitDrivers = false; - zel_version_t loader_version = {}; - size_t num_components; - auto result = zelLoaderGetVersions(&num_components, nullptr); - if (result == ZE_RESULT_SUCCESS) { - zel_component_version_t *versions = - new zel_component_version_t[num_components]; - result = zelLoaderGetVersions(&num_components, versions); - if (result == ZE_RESULT_SUCCESS) { - for (size_t i = 0; i < num_components; ++i) { - if (strncmp(versions[i].component_name, "loader", - strlen("loader")) == 0) { - loader_version = versions[i].component_lib_version; - UR_LOG(DEBUG, "\nLevel Zero Loader Version: {}.{}.{}\n", - loader_version.major, loader_version.minor, - loader_version.patch); - break; - } - } - } - delete[] versions; - if (loader_version.major > 1 || - (loader_version.major == 1 && loader_version.minor > 19) || - (loader_version.major == 1 && loader_version.minor == 19 && - loader_version.patch >= 2)) { - useInitDrivers = true; - } + // Init with all flags set to enable for all driver types to be init in + // the application. + ze_init_flags_t L0InitFlags = ZE_INIT_FLAG_GPU_ONLY; + if (UrL0InitAllDrivers) { + L0InitFlags |= ZE_INIT_FLAG_VPU_ONLY; + } + UR_LOG(DEBUG, "\nzeInit with flags value of {}\n", + static_cast(L0InitFlags)); + ZeInitResult = ZE_CALL_NOCHECK(zeInit, (L0InitFlags)); + if (ZeInitResult != ZE_RESULT_SUCCESS) { + const char *ErrorString = "Unknown"; + zeParseError(ZeInitResult, ErrorString); + UR_LOG(ERR, "\nzeInit failed with {}\n", ErrorString); + } - if ((loader_version.major == 1 && loader_version.minor < 21) || - (loader_version.major == 1 && loader_version.minor == 21 && - loader_version.patch < 2)) { - UR_LOG( - WARN, - "WARNING: Level Zero Loader version is older than 1.21.2. " - "Please update to the latest version for API logging support.\n"); + bool useInitDrivers = false; + zel_version_t loader_version = {}; + size_t num_components; + auto result = zelLoaderGetVersions(&num_components, nullptr); + if (result == ZE_RESULT_SUCCESS) { + zel_component_version_t *versions = + new zel_component_version_t[num_components]; + result = zelLoaderGetVersions(&num_components, versions); + if (result == ZE_RESULT_SUCCESS) { + for (size_t i = 0; i < num_components; ++i) { + if (strncmp(versions[i].component_name, "loader", strlen("loader")) == + 0) { + loader_version = versions[i].component_lib_version; + UR_LOG(DEBUG, "\nLevel Zero Loader Version: {}.{}.{}\n", + loader_version.major, loader_version.minor, + loader_version.patch); + break; } } + } + delete[] versions; + if (loader_version.major > 1 || + (loader_version.major == 1 && loader_version.minor > 19) || + (loader_version.major == 1 && loader_version.minor == 19 && + loader_version.patch >= 2)) { + useInitDrivers = true; + } - if (useInitDrivers) { + if ((loader_version.major == 1 && loader_version.minor < 21) || + (loader_version.major == 1 && loader_version.minor == 21 && + loader_version.patch < 2)) { + UR_LOG(WARN, + "WARNING: Level Zero Loader version is older than 1.21.2. " + "Please update to the latest version for API logging support.\n"); + } + } + + if (useInitDrivers) { #ifdef UR_STATIC_LEVEL_ZERO - initDriversFunctionPtr = zeInitDrivers; + initDriversFunctionPtr = zeInitDrivers; #else - initDriversFunctionPtr = - (ze_pfnInitDrivers_t)ur_loader::LibLoader::getFunctionPtr( - processHandle, "zeInitDrivers"); + initDriversFunctionPtr = + (ze_pfnInitDrivers_t)ur_loader::LibLoader::getFunctionPtr( + processHandle, "zeInitDrivers"); #endif - if (initDriversFunctionPtr) { - UR_LOG(DEBUG, "\nzeInitDrivers with flags value of {}\n", - static_cast(InitDriversDesc.flags)); - ZeInitDriversResult = - ZE_CALL_NOCHECK(initDriversFunctionPtr, - (&ZeInitDriversCount, nullptr, &InitDriversDesc)); - if (ZeInitDriversResult == ZE_RESULT_SUCCESS) { - InitDriversSupported = true; - } else { - const char *ErrorString = "Unknown"; - zeParseError(ZeInitDriversResult, ErrorString); - UR_LOG(ERR, "\nzeInitDrivers failed with {}\n", ErrorString); - } - } + if (initDriversFunctionPtr) { + UR_LOG(DEBUG, "\nzeInitDrivers with flags value of {}\n", + static_cast(InitDriversDesc.flags)); + ZeInitDriversResult = + ZE_CALL_NOCHECK(initDriversFunctionPtr, + (&ZeInitDriversCount, nullptr, &InitDriversDesc)); + if (ZeInitDriversResult == ZE_RESULT_SUCCESS) { + InitDriversSupported = true; + } else { + const char *ErrorString = "Unknown"; + zeParseError(ZeInitDriversResult, ErrorString); + UR_LOG(ERR, "\nzeInitDrivers failed with {}\n", ErrorString); } + } + } - if (ZeInitResult != ZE_RESULT_SUCCESS && - ZeInitDriversResult != ZE_RESULT_SUCCESS) { - // Absorb the ZE_RESULT_ERROR_UNINITIALIZED and just return 0 Platforms. - UR_LOG(ERR, "Level Zero Uninitialized\n"); - return; - } + if (ZeInitResult != ZE_RESULT_SUCCESS && + ZeInitDriversResult != ZE_RESULT_SUCCESS) { + // Absorb the ZE_RESULT_ERROR_UNINITIALIZED and just return 0 Platforms. + UR_LOG(ERR, "Level Zero Uninitialized\n"); + return; + } - PlatformVec platforms; + PlatformVec platforms; - bool forceLoadedAdapter = ur_getenv("UR_ADAPTERS_FORCE_LOAD").has_value(); - if (!forceLoadedAdapter) { + bool forceLoadedAdapter = ur_getenv("UR_ADAPTERS_FORCE_LOAD").has_value(); + if (!forceLoadedAdapter) { #ifdef UR_ADAPTER_LEVEL_ZERO_V2 - auto [useV2, reason] = shouldUseV2Adapter(); - if (!useV2) { - UR_LOG(INFO, "Skipping L0 V2 adapter: {}", reason); - return; - } + auto [useV2, reason] = shouldUseV2Adapter(); + if (!useV2) { + UR_LOG(INFO, "Skipping L0 V2 adapter: {}", reason); + return; + } #else - auto [useV1, reason] = shouldUseV1Adapter(); - if (!useV1) { - UR_LOG(INFO, "Skipping L0 V1 adapter: {}", reason); - return; - } + auto [useV1, reason] = shouldUseV1Adapter(); + if (!useV1) { + UR_LOG(INFO, "Skipping L0 V1 adapter: {}", reason); + return; + } #endif - } + } - // Check if the user has enabled the default L0 SysMan initialization. - const int UrSysmanZesinitEnable = [&UserForcedSysManInit] { - const char *UrRet = std::getenv("UR_L0_ENABLE_ZESINIT_DEFAULT"); - if (!UrRet) - return 0; - UserForcedSysManInit &= 2; - return std::atoi(UrRet); - }(); - - bool ZesInitNeeded = UrSysmanZesinitEnable && !UrSysManEnvInitEnabled; - // Unless the user has forced the SysMan init, we will check the device - // version to see if the zesInit is needed. - if (UserForcedSysManInit == 0 && checkDeviceIntelGPUIpVersionOrNewer( - 0x05004000) == UR_RESULT_SUCCESS) { - if (UrSysManEnvInitEnabled) { - setEnvVar("ZES_ENABLE_SYSMAN", "0"); - } - ZesInitNeeded = true; - } - if (ZesInitNeeded) { + // Check if the user has enabled the default L0 SysMan initialization. + const int UrSysmanZesinitEnable = [&UserForcedSysManInit] { + const char *UrRet = std::getenv("UR_L0_ENABLE_ZESINIT_DEFAULT"); + if (!UrRet) + return 0; + UserForcedSysManInit &= 2; + return std::atoi(UrRet); + }(); + + bool ZesInitNeeded = UrSysmanZesinitEnable && !UrSysManEnvInitEnabled; + // Unless the user has forced the SysMan init, we will check the device + // version to see if the zesInit is needed. + if (UserForcedSysManInit == 0 && + checkDeviceIntelGPUIpVersionOrNewer(0x05004000) == UR_RESULT_SUCCESS) { + if (UrSysManEnvInitEnabled) { + setEnvVar("ZES_ENABLE_SYSMAN", "0"); + } + ZesInitNeeded = true; + } + if (ZesInitNeeded) { #ifdef UR_STATIC_LEVEL_ZERO - getDeviceByUUIdFunctionPtr = zesDriverGetDeviceByUuidExp; - getSysManDriversFunctionPtr = zesDriverGet; - sysManInitFunctionPtr = zesInit; + getDeviceByUUIdFunctionPtr = zesDriverGetDeviceByUuidExp; + getSysManDriversFunctionPtr = zesDriverGet; + sysManInitFunctionPtr = zesInit; #else - getDeviceByUUIdFunctionPtr = (zes_pfnDriverGetDeviceByUuidExp_t) - ur_loader::LibLoader::getFunctionPtr(processHandle, - "zesDriverGetDeviceByUuidExp"); - getSysManDriversFunctionPtr = - (zes_pfnDriverGet_t)ur_loader::LibLoader::getFunctionPtr( - processHandle, "zesDriverGet"); - sysManInitFunctionPtr = - (zes_pfnInit_t)ur_loader::LibLoader::getFunctionPtr(processHandle, - "zesInit"); + getDeviceByUUIdFunctionPtr = + (zes_pfnDriverGetDeviceByUuidExp_t)ur_loader::LibLoader::getFunctionPtr( + processHandle, "zesDriverGetDeviceByUuidExp"); + getSysManDriversFunctionPtr = + (zes_pfnDriverGet_t)ur_loader::LibLoader::getFunctionPtr( + processHandle, "zesDriverGet"); + sysManInitFunctionPtr = (zes_pfnInit_t)ur_loader::LibLoader::getFunctionPtr( + processHandle, "zesInit"); #endif - } - if (getDeviceByUUIdFunctionPtr && getSysManDriversFunctionPtr && - sysManInitFunctionPtr) { - ze_init_flags_t L0ZesInitFlags = 0; - UR_LOG(DEBUG, "\nzesInit with flags value of {}\n", - static_cast(L0ZesInitFlags)); - ZesResult = ZE_CALL_NOCHECK(sysManInitFunctionPtr, (L0ZesInitFlags)); - } else { - ZesResult = ZE_RESULT_ERROR_UNINITIALIZED; - } + } + if (getDeviceByUUIdFunctionPtr && getSysManDriversFunctionPtr && + sysManInitFunctionPtr) { + ze_init_flags_t L0ZesInitFlags = 0; + UR_LOG(DEBUG, "\nzesInit with flags value of {}\n", + static_cast(L0ZesInitFlags)); + ZesResult = ZE_CALL_NOCHECK(sysManInitFunctionPtr, (L0ZesInitFlags)); + } else { + ZesResult = ZE_RESULT_ERROR_UNINITIALIZED; + } - ur_result_t err = initPlatforms(this, platforms, ZesResult); - if (err == UR_RESULT_SUCCESS) { - Platforms = std::move(platforms); - } else { - throw err; - } + ur_result_t err = initPlatforms(this, platforms, ZesResult); + if (err == UR_RESULT_SUCCESS) { + Platforms = std::move(platforms); + } else { + throw err; + } } void globalAdapterOnDemandCleanup() { diff --git a/unified-runtime/source/adapters/level_zero/command_buffer.cpp b/unified-runtime/source/adapters/level_zero/command_buffer.cpp index 7385dae6336f1..687c905417d8b 100644 --- a/unified-runtime/source/adapters/level_zero/command_buffer.cpp +++ b/unified-runtime/source/adapters/level_zero/command_buffer.cpp @@ -1327,11 +1327,12 @@ ur_result_t urCommandBufferAppendUSMPrefetchExp( UR_COMMAND_USM_PREFETCH, CommandBuffer, CommandBuffer->ZeComputeCommandList, NumSyncPointsInWaitList, SyncPointWaitList, true, RetSyncPoint, ZeEventList, ZeLaunchEvent)); - switch(Flags) { + switch (Flags) { case UR_USM_MIGRATION_FLAG_HOST_TO_DEVICE: break; case UR_USM_MIGRATION_FLAG_DEVICE_TO_HOST: - UR_LOG(WARN, "commandBufferAppendUSMPrefetch: L0 does not support prefetch to host yet"); + UR_LOG(WARN, "commandBufferAppendUSMPrefetch: L0 does not support prefetch " + "to host yet"); break; default: UR_LOG(ERR, "commandBufferAppendUSMPrefetch: invalid USM migration flag"); @@ -1348,7 +1349,7 @@ ur_result_t urCommandBufferAppendUSMPrefetchExp( // TODO Support migration flags after L0 backend support is added. if (Flags == UR_USM_MIGRATION_FLAG_HOST_TO_DEVICE) { ZE2UR_CALL(zeCommandListAppendMemoryPrefetch, - (CommandBuffer->ZeComputeCommandList, Mem, Size)); + (CommandBuffer->ZeComputeCommandList, Mem, Size)); } if (!CommandBuffer->IsInOrderCmdList) { diff --git a/unified-runtime/source/adapters/level_zero/memory.cpp b/unified-runtime/source/adapters/level_zero/memory.cpp index 550233a050fcd..107fcc2d1c2f5 100644 --- a/unified-runtime/source/adapters/level_zero/memory.cpp +++ b/unified-runtime/source/adapters/level_zero/memory.cpp @@ -1276,11 +1276,12 @@ ur_result_t urEnqueueUSMPrefetch( /// [in,out][optional] return an event object that identifies this /// particular command instance. ur_event_handle_t *OutEvent) { - switch(Flags) { + switch (Flags) { case UR_USM_MIGRATION_FLAG_HOST_TO_DEVICE: break; case UR_USM_MIGRATION_FLAG_DEVICE_TO_HOST: - UR_LOG(WARN, "enqueueUSMPrefetch: L0 does not support prefetch to host yet"); + UR_LOG(WARN, + "enqueueUSMPrefetch: L0 does not support prefetch to host yet"); break; default: UR_LOG(ERR, "enqueueUSMPrefetch: invalid USM migration flag"); diff --git a/unified-runtime/source/adapters/level_zero/platform.cpp b/unified-runtime/source/adapters/level_zero/platform.cpp index 47f2e519bb723..6c924f4827bc4 100644 --- a/unified-runtime/source/adapters/level_zero/platform.cpp +++ b/unified-runtime/source/adapters/level_zero/platform.cpp @@ -32,11 +32,11 @@ ur_result_t urPlatformGet( if (NumPlatforms) { *NumPlatforms = nplatforms; } - if (Platforms) { - for (uint32_t i = 0; i < std::min(nplatforms, NumEntries); ++i) { - Platforms[i] = GlobalAdapter->Platforms.at(i).get(); - } + if (Platforms) { + for (uint32_t i = 0; i < std::min(nplatforms, NumEntries); ++i) { + Platforms[i] = GlobalAdapter->Platforms.at(i).get(); } + } return UR_RESULT_SUCCESS; } diff --git a/unified-runtime/source/adapters/level_zero/v2/command_buffer.cpp b/unified-runtime/source/adapters/level_zero/v2/command_buffer.cpp index 66c60ec4576e8..b4c2674bd3364 100644 --- a/unified-runtime/source/adapters/level_zero/v2/command_buffer.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/command_buffer.cpp @@ -589,16 +589,6 @@ ur_result_t urCommandBufferAppendUSMPrefetchExp( // the same issue as in urCommandBufferAppendKernelLaunchExp - switch(flags) { - case UR_USM_MIGRATION_FLAG_HOST_TO_DEVICE: - break; - case UR_USM_MIGRATION_FLAG_DEVICE_TO_HOST: - UR_LOG(WARN, "commandBufferAppendUSMPrefetch: L0 does not support prefetch to host yet"); - break; - default: - UR_LOG(ERR, "commandBufferAppendUSMPrefetch: invalid USM migration flag"); - return UR_RESULT_ERROR_INVALID_ENUMERATION; - } auto commandListLocked = hCommandBuffer->commandListManager.lock(); auto eventsWaitList = hCommandBuffer->getWaitListFromSyncPoints( pSyncPointWaitList, numSyncPointsInWaitList); diff --git a/unified-runtime/source/adapters/level_zero/v2/command_list_manager.cpp b/unified-runtime/source/adapters/level_zero/v2/command_list_manager.cpp index 2e5c85593f698..753ad2e0af129 100644 --- a/unified-runtime/source/adapters/level_zero/v2/command_list_manager.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/command_list_manager.cpp @@ -288,6 +288,18 @@ ur_result_t ur_command_list_manager::appendUSMPrefetch( ur_event_handle_t phEvent) { TRACK_SCOPE_LATENCY("ur_command_list_manager::appendUSMPrefetch"); + switch (flags) { + case UR_USM_MIGRATION_FLAG_HOST_TO_DEVICE: + break; + case UR_USM_MIGRATION_FLAG_DEVICE_TO_HOST: + UR_LOG(WARN, + "appendUSMPrefetch: L0v2 does not support prefetch to host yet"); + break; + default: + UR_LOG(ERR, "appendUSMPrefetch: invalid USM migration flag"); + return UR_RESULT_ERROR_INVALID_ENUMERATION; + } + auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_USM_PREFETCH); auto [pWaitEvents, numWaitEvents] = getWaitListView(phEventWaitList, numEventsInWaitList); @@ -299,7 +311,7 @@ ur_result_t ur_command_list_manager::appendUSMPrefetch( // TODO: Support migration flags after L0 backend support is added if (flags == UR_USM_MIGRATION_FLAG_HOST_TO_DEVICE) { ZE2UR_CALL(zeCommandListAppendMemoryPrefetch, - (zeCommandList.get(), pMem, size)); + (zeCommandList.get(), pMem, size)); } if (zeSignalEvent) { ZE2UR_CALL(zeCommandListAppendSignalEvent, diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_api.cpp b/unified-runtime/source/adapters/level_zero/v2/queue_api.cpp index 1dfe7450f5404..d043a68dcaec7 100644 --- a/unified-runtime/source/adapters/level_zero/v2/queue_api.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/queue_api.cpp @@ -261,16 +261,6 @@ ur_result_t urEnqueueUSMPrefetch(ur_queue_handle_t hQueue, const void *pMem, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) try { - switch(flags) { - case UR_USM_MIGRATION_FLAG_HOST_TO_DEVICE: - break; - case UR_USM_MIGRATION_FLAG_DEVICE_TO_HOST: - UR_LOG(WARN, "enqueueUSMPrefetch: L0 does not support prefetch to host yet"); - break; - default: - UR_LOG(ERR, "enqueueUSMPrefetch: invalid USM migration flag"); - return UR_RESULT_ERROR_INVALID_ENUMERATION; - } return hQueue->get().enqueueUSMPrefetch( pMem, size, flags, numEventsInWaitList, phEventWaitList, phEvent); } catch (...) { diff --git a/unified-runtime/source/adapters/mock/ur_mockddi.cpp b/unified-runtime/source/adapters/mock/ur_mockddi.cpp index 3ab79444f32ea..97ffa4bf62d11 100644 --- a/unified-runtime/source/adapters/mock/ur_mockddi.cpp +++ b/unified-runtime/source/adapters/mock/ur_mockddi.cpp @@ -10681,7 +10681,7 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendUSMPrefetchExp( const void *pMemory, /// [in] size in bytes to be fetched. size_t size, - /// [in] USM prefetch flags + /// [in] USM migration flags ur_usm_migration_flags_t flags, /// [in] The number of sync points in the provided dependency list. uint32_t numSyncPointsInWaitList, diff --git a/unified-runtime/source/adapters/opencl/usm.cpp b/unified-runtime/source/adapters/opencl/usm.cpp index 6964433d84d81..09cf31aee8645 100644 --- a/unified-runtime/source/adapters/opencl/usm.cpp +++ b/unified-runtime/source/adapters/opencl/usm.cpp @@ -529,7 +529,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch( // cl_mem_migration_flags MigrationFlag; switch (flags) { case UR_USM_MIGRATION_FLAG_HOST_TO_DEVICE: - // Note: currently opencl:cpu will break with this value, but opencl:gpu + // Note: currently opencl:cpu will break with this value, but opencl:gpu // will work just fine. A spec change has been made to address this issue, // and is waiting to be implemented: // https://github.com/KhronosGroup/OpenCL-Docs/pull/1412/files#diff-7e4c12789cfc81c40637d32b7113b0cca2c3ee0beabaabb9acd9da743f7b5780R974 @@ -573,8 +573,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch( // TODO: when issues with impl are fully resolved, delete this and use // waitlisting from EnqueueMigrateMem instead. CL_RETURN_ON_FAILURE(clEnqueueMarkerWithWaitList( - hQueue->CLQueue, numEventsInWaitList, CLWaitEvents.data(), - ifUrEvent(phEvent, Event))); + hQueue->CLQueue, numEventsInWaitList, CLWaitEvents.data(), + ifUrEvent(phEvent, Event))); UR_RETURN_ON_FAILURE(createUREvent(Event, hQueue->Context, hQueue, phEvent)); return UR_RESULT_SUCCESS; diff --git a/unified-runtime/source/loader/layers/tracing/ur_trcddi.cpp b/unified-runtime/source/loader/layers/tracing/ur_trcddi.cpp index 0abbb7604c57d..fa24233db596e 100644 --- a/unified-runtime/source/loader/layers/tracing/ur_trcddi.cpp +++ b/unified-runtime/source/loader/layers/tracing/ur_trcddi.cpp @@ -9039,7 +9039,7 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendUSMPrefetchExp( const void *pMemory, /// [in] size in bytes to be fetched. size_t size, - /// [in] USM prefetch flags + /// [in] USM migration flags ur_usm_migration_flags_t flags, /// [in] The number of sync points in the provided dependency list. uint32_t numSyncPointsInWaitList, diff --git a/unified-runtime/source/loader/layers/validation/ur_valddi.cpp b/unified-runtime/source/loader/layers/validation/ur_valddi.cpp index b61356afd2b35..ef354b300b763 100644 --- a/unified-runtime/source/loader/layers/validation/ur_valddi.cpp +++ b/unified-runtime/source/loader/layers/validation/ur_valddi.cpp @@ -9822,7 +9822,7 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendUSMPrefetchExp( const void *pMemory, /// [in] size in bytes to be fetched. size_t size, - /// [in] USM prefetch flags + /// [in] USM migration flags ur_usm_migration_flags_t flags, /// [in] The number of sync points in the provided dependency list. uint32_t numSyncPointsInWaitList, diff --git a/unified-runtime/source/loader/ur_ldrddi.cpp b/unified-runtime/source/loader/ur_ldrddi.cpp index 74712c5c4d48e..0132b61da3940 100644 --- a/unified-runtime/source/loader/ur_ldrddi.cpp +++ b/unified-runtime/source/loader/ur_ldrddi.cpp @@ -5152,7 +5152,7 @@ __urdlllocal ur_result_t UR_APICALL urCommandBufferAppendUSMPrefetchExp( const void *pMemory, /// [in] size in bytes to be fetched. size_t size, - /// [in] USM prefetch flags + /// [in] USM migration flags ur_usm_migration_flags_t flags, /// [in] The number of sync points in the provided dependency list. uint32_t numSyncPointsInWaitList, diff --git a/unified-runtime/source/loader/ur_libapi.cpp b/unified-runtime/source/loader/ur_libapi.cpp index cad6de4dd9c38..647e2b752b576 100644 --- a/unified-runtime/source/loader/ur_libapi.cpp +++ b/unified-runtime/source/loader/ur_libapi.cpp @@ -9457,7 +9457,7 @@ ur_result_t UR_APICALL urCommandBufferAppendUSMPrefetchExp( const void *pMemory, /// [in] size in bytes to be fetched. size_t size, - /// [in] USM prefetch flags + /// [in] USM migration flags ur_usm_migration_flags_t flags, /// [in] The number of sync points in the provided dependency list. uint32_t numSyncPointsInWaitList, diff --git a/unified-runtime/test/conformance/exp_command_buffer/commands.cpp b/unified-runtime/test/conformance/exp_command_buffer/commands.cpp index 38546ee4fc963..4e0ced3502943 100644 --- a/unified-runtime/test/conformance/exp_command_buffer/commands.cpp +++ b/unified-runtime/test/conformance/exp_command_buffer/commands.cpp @@ -149,9 +149,9 @@ TEST_P(urCommandBufferCommandsTest, urCommandBufferAppendUSMPrefetchExp) { } TEST_P(urCommandBufferCommandsTest, - urCommandBufferAppendUSMPrefetchExpDeviceToHost) { + urCommandBufferAppendUSMPrefetchExpDeviceToHost) { // No Prefetch command in cl_khr_command_buffer - // No driver support for prefetching from device to host on Intel GPUs + // No driver support for prefetching from device to host on Intel GPUs UUR_KNOWN_FAILURE_ON(uur::OpenCL{}, uur::LevelZero{}); ASSERT_SUCCESS(urCommandBufferAppendUSMPrefetchExp( diff --git a/unified-runtime/test/conformance/exp_command_buffer/event_sync.cpp b/unified-runtime/test/conformance/exp_command_buffer/event_sync.cpp index 9248d653e5b0c..26ec26b2a05bd 100644 --- a/unified-runtime/test/conformance/exp_command_buffer/event_sync.cpp +++ b/unified-runtime/test/conformance/exp_command_buffer/event_sync.cpp @@ -426,9 +426,9 @@ TEST_P(CommandEventSyncTest, USMPrefetchExp) { // Test prefetch command waiting on queue event ASSERT_SUCCESS(urCommandBufferAppendUSMPrefetchExp( - cmd_buf_handle, device_ptrs[1], allocation_size, UR_USM_MIGRATION_FLAG_HOST_TO_DEVICE, - 0, nullptr, 1, &external_events[0], nullptr, &external_events[1], - nullptr)); + cmd_buf_handle, device_ptrs[1], allocation_size, + UR_USM_MIGRATION_FLAG_HOST_TO_DEVICE, 0, nullptr, 1, &external_events[0], + nullptr, &external_events[1], nullptr)); ASSERT_SUCCESS(urCommandBufferFinalizeExp(cmd_buf_handle)); ASSERT_SUCCESS( urEnqueueCommandBufferExp(queue, cmd_buf_handle, 0, nullptr, nullptr)); diff --git a/unified-runtime/test/conformance/exp_command_buffer/update/event_sync.cpp b/unified-runtime/test/conformance/exp_command_buffer/update/event_sync.cpp index a518ac03b4655..16763eaf0c15d 100644 --- a/unified-runtime/test/conformance/exp_command_buffer/update/event_sync.cpp +++ b/unified-runtime/test/conformance/exp_command_buffer/update/event_sync.cpp @@ -723,8 +723,8 @@ TEST_P(CommandEventSyncUpdateTest, USMPrefetchExp) { // Test prefetch command waiting on queue event ASSERT_SUCCESS(urCommandBufferAppendUSMPrefetchExp( updatable_cmd_buf_handle, device_ptrs[1], allocation_size, - UR_USM_MIGRATION_FLAG_HOST_TO_DEVICE, 0, nullptr, 1, &external_events[0], nullptr, - &external_events[1], &command_handles[0])); + UR_USM_MIGRATION_FLAG_HOST_TO_DEVICE, 0, nullptr, 1, &external_events[0], + nullptr, &external_events[1], &command_handles[0])); ASSERT_NE(nullptr, command_handles[0]); ASSERT_SUCCESS(urCommandBufferFinalizeExp(updatable_cmd_buf_handle)); ASSERT_SUCCESS(urEnqueueCommandBufferExp(queue, updatable_cmd_buf_handle, 0, From 0a45ea3b0c6f8522013cbc1528db657cc16c72ad Mon Sep 17 00:00:00 2001 From: Ian Li Date: Wed, 23 Jul 2025 13:03:23 -0700 Subject: [PATCH 19/33] reenable queue test --- sycl/test-e2e/USM/prefetch_exp.cpp | 70 +++++++++++++++--------------- 1 file changed, 35 insertions(+), 35 deletions(-) diff --git a/sycl/test-e2e/USM/prefetch_exp.cpp b/sycl/test-e2e/USM/prefetch_exp.cpp index 87542ff3bb767..b3145d919bcfb 100644 --- a/sycl/test-e2e/USM/prefetch_exp.cpp +++ b/sycl/test-e2e/USM/prefetch_exp.cpp @@ -76,41 +76,41 @@ int main() { } } - // { - // // Test host-to-device prefetch via prefetch(queue ...). - // ext::oneapi::experimental::prefetch( - // q, Src, sizeof(float) * Count, - // ext::oneapi::experimental::prefetch_type::device); - // q.wait_and_throw(); - // q.submit([&](handler &CGH) { - // CGH.single_task([=]() { - // for (int i = 0; i < Count; i++) - // Dest[i] = 3 * Src[i]; - // }); - // }); - // q.wait_and_throw(); - - // for (int i = 0; i < Count; i++) { - // assert(Dest[i] == i * 3); - // } - - // // Test device-to-host prefetch via prefetch(queue ...). - // q.submit([&](handler &CGH) { - // CGH.single_task([=]() { - // for (int i = 0; i < Count; i++) - // Dest[i] = 6 * Src[i]; - // }); - // }); - // q.wait_and_throw(); - // ext::oneapi::experimental::prefetch( - // q, Src, sizeof(float) * Count, - // ext::oneapi::experimental::prefetch_type::host); - // q.wait_and_throw(); - - // for (int i = 0; i < Count; i++) { - // assert(Dest[i] == i * 6); - // } - // } + { + // Test host-to-device prefetch via prefetch(queue ...). + ext::oneapi::experimental::prefetch( + q, Src, sizeof(float) * Count, + ext::oneapi::experimental::prefetch_type::device); + q.wait_and_throw(); + q.submit([&](handler &CGH) { + CGH.single_task([=]() { + for (int i = 0; i < Count; i++) + Dest[i] = 3 * Src[i]; + }); + }); + q.wait_and_throw(); + + for (int i = 0; i < Count; i++) { + assert(Dest[i] == i * 3); + } + + // Test device-to-host prefetch via prefetch(queue ...). + q.submit([&](handler &CGH) { + CGH.single_task([=]() { + for (int i = 0; i < Count; i++) + Dest[i] = 6 * Src[i]; + }); + }); + q.wait_and_throw(); + ext::oneapi::experimental::prefetch( + q, Src, sizeof(float) * Count, + ext::oneapi::experimental::prefetch_type::host); + q.wait_and_throw(); + + for (int i = 0; i < Count; i++) { + assert(Dest[i] == i * 6); + } + } free(Src, q); free(Dest, q); } From 0f4ed1feb75058d1834cb9444d4ffeee05cfe3d7 Mon Sep 17 00:00:00 2001 From: Ian Li Date: Thu, 24 Jul 2025 13:21:33 -0700 Subject: [PATCH 20/33] Add unittesting to ensure the runtime calls the UR with the right args --- sycl/unittests/Extensions/CMakeLists.txt | 1 + .../Extensions/EnqueueFunctionsPrefetch.cpp | 80 +++++++++++++++++++ .../enqueue/urEnqueueUSMPrefetch.cpp | 11 --- 3 files changed, 81 insertions(+), 11 deletions(-) create mode 100644 sycl/unittests/Extensions/EnqueueFunctionsPrefetch.cpp diff --git a/sycl/unittests/Extensions/CMakeLists.txt b/sycl/unittests/Extensions/CMakeLists.txt index b82c9f798a94c..851ae8720b856 100644 --- a/sycl/unittests/Extensions/CMakeLists.txt +++ b/sycl/unittests/Extensions/CMakeLists.txt @@ -12,6 +12,7 @@ add_sycl_unittest(ExtensionsTests OBJECT CompositeDevice.cpp OneAPIProd.cpp EnqueueFunctionsEvents.cpp + EnqueueFunctionsPrefetch.cpp ProfilingTag.cpp KernelProperties.cpp NoDeviceIPVersion.cpp diff --git a/sycl/unittests/Extensions/EnqueueFunctionsPrefetch.cpp b/sycl/unittests/Extensions/EnqueueFunctionsPrefetch.cpp new file mode 100644 index 0000000000000..6386ee8fdf367 --- /dev/null +++ b/sycl/unittests/Extensions/EnqueueFunctionsPrefetch.cpp @@ -0,0 +1,80 @@ +//==------------------- EnqueueFunctionsPrefetch.cpp -----------------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// Tests enqueue_functions prefetch calls UR functions with the right arguments. + +#include + +#include +#include +#include +#include +#include +#include + +using namespace sycl; + +namespace oneapiext = ext::oneapi::experimental; + +namespace { + +static ur_usm_migration_flags_t SubmittedPrefetchType = + UR_USM_MIGRATION_FLAG_FORCE_UINT32; + +inline ur_result_t replace_urUSMEnqueuePrefetch(void *pParams) { + auto params = *static_cast(pParams); + SubmittedPrefetchType = *params.pflags; + return UR_RESULT_SUCCESS; +} + +static constexpr size_t N = 1024; +class EnqueueFunctionsPrefetchTests : public ::testing::Test { +public: + EnqueueFunctionsPrefetchTests() + : Mock{}, Q{context(sycl::platform()), default_selector_v, + property::queue::in_order{}} {} + +protected: + void SetUp() override { + SubmittedPrefetchType = UR_USM_MIGRATION_FLAG_FORCE_UINT32; + Dst = malloc_shared(N, Q); + } + + unittest::UrMock<> Mock; + queue Q; + int *Dst; +}; + +TEST_F(EnqueueFunctionsPrefetchTests, SubmitHostToDevicePrefetch) { + mock::getCallbacks().set_replace_callback("urEnqueueUSMPrefetch", + replace_urUSMEnqueuePrefetch); + + oneapiext::submit(Q, [&](handler &CGH) { + oneapiext::prefetch(CGH, Dst, sizeof(int) * N, + oneapiext::prefetch_type::device); + }); + + ASSERT_EQ(SubmittedPrefetchType, UR_USM_MIGRATION_FLAG_HOST_TO_DEVICE); + + free(Dst, Q); +} + +TEST_F(EnqueueFunctionsPrefetchTests, SubmitDeviceToHostPrefetch) { + mock::getCallbacks().set_replace_callback("urEnqueueUSMPrefetch", + replace_urUSMEnqueuePrefetch); + + oneapiext::submit(Q, [&](handler &CGH) { + oneapiext::prefetch(CGH, Dst, sizeof(int) * N, + oneapiext::prefetch_type::host); + }); + + ASSERT_EQ(SubmittedPrefetchType, UR_USM_MIGRATION_FLAG_DEVICE_TO_HOST); + + free(Dst, Q); +} + +} // namespace diff --git a/unified-runtime/test/conformance/enqueue/urEnqueueUSMPrefetch.cpp b/unified-runtime/test/conformance/enqueue/urEnqueueUSMPrefetch.cpp index d6e692ff5c2e5..88ef85cd93c4d 100644 --- a/unified-runtime/test/conformance/enqueue/urEnqueueUSMPrefetch.cpp +++ b/unified-runtime/test/conformance/enqueue/urEnqueueUSMPrefetch.cpp @@ -31,12 +31,6 @@ TEST_P(urEnqueueUSMPrefetchWithParamTest, Success) { // this file. uur::NativeCPU{}); - // if (getParam() == UR_USM_MIGRATION_FLAG_DEVICE_TO_HOST) { - // // Intel GPU drivers do not currently support prefetching memory from - // // device back to host. - // UUR_KNOWN_FAILURE_ON(uur::LevelZero{}, uur::OpenCL); - // } - ur_event_handle_t prefetch_event = nullptr; ASSERT_SUCCESS(urEnqueueUSMPrefetch(queue, ptr, allocation_size, getParam(), 0, nullptr, &prefetch_event)); @@ -57,11 +51,6 @@ TEST_P(urEnqueueUSMPrefetchWithParamTest, Success) { */ TEST_P(urEnqueueUSMPrefetchWithParamTest, CheckWaitEvent) { UUR_KNOWN_FAILURE_ON(uur::NativeCPU{}); - // if (getParam() == UR_USM_MIGRATION_FLAG_DEVICE_TO_HOST) { - // // Intel GPU drivers do not currently support prefetching memory from - // // device back to host. - // UUR_KNOWN_FAILURE_ON(uur::LevelZero{}, uur::OpenCL); - // } ur_queue_handle_t fill_queue; ASSERT_SUCCESS(urQueueCreate(context, device, nullptr, &fill_queue)); From 236f70b0fd7b748a46014596396d98d0709d8127 Mon Sep 17 00:00:00 2001 From: Ian Li Date: Thu, 24 Jul 2025 13:25:45 -0700 Subject: [PATCH 21/33] clang-format --- sycl/unittests/Extensions/EnqueueFunctionsPrefetch.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sycl/unittests/Extensions/EnqueueFunctionsPrefetch.cpp b/sycl/unittests/Extensions/EnqueueFunctionsPrefetch.cpp index 6386ee8fdf367..18eac0a916bd4 100644 --- a/sycl/unittests/Extensions/EnqueueFunctionsPrefetch.cpp +++ b/sycl/unittests/Extensions/EnqueueFunctionsPrefetch.cpp @@ -11,10 +11,10 @@ #include #include -#include +#include #include +#include #include -#include using namespace sycl; @@ -23,7 +23,7 @@ namespace oneapiext = ext::oneapi::experimental; namespace { static ur_usm_migration_flags_t SubmittedPrefetchType = - UR_USM_MIGRATION_FLAG_FORCE_UINT32; + UR_USM_MIGRATION_FLAG_FORCE_UINT32; inline ur_result_t replace_urUSMEnqueuePrefetch(void *pParams) { auto params = *static_cast(pParams); @@ -55,7 +55,7 @@ TEST_F(EnqueueFunctionsPrefetchTests, SubmitHostToDevicePrefetch) { oneapiext::submit(Q, [&](handler &CGH) { oneapiext::prefetch(CGH, Dst, sizeof(int) * N, - oneapiext::prefetch_type::device); + oneapiext::prefetch_type::device); }); ASSERT_EQ(SubmittedPrefetchType, UR_USM_MIGRATION_FLAG_HOST_TO_DEVICE); @@ -69,7 +69,7 @@ TEST_F(EnqueueFunctionsPrefetchTests, SubmitDeviceToHostPrefetch) { oneapiext::submit(Q, [&](handler &CGH) { oneapiext::prefetch(CGH, Dst, sizeof(int) * N, - oneapiext::prefetch_type::host); + oneapiext::prefetch_type::host); }); ASSERT_EQ(SubmittedPrefetchType, UR_USM_MIGRATION_FLAG_DEVICE_TO_HOST); From 6de51cc398945cdfa4dc47a5cbcb140ea7be07a3 Mon Sep 17 00:00:00 2001 From: Ian Li Date: Fri, 25 Jul 2025 12:38:43 -0700 Subject: [PATCH 22/33] Remove overcomplicated prefetch alternatives --- sycl/include/sycl/detail/cg_types.hpp | 5 -- .../oneapi/experimental/enqueue_functions.hpp | 6 +- sycl/include/sycl/handler.hpp | 32 +++------ sycl/source/detail/cg.hpp | 40 ++--------- sycl/source/detail/graph/node_impl.hpp | 39 +--------- sycl/source/detail/handler_impl.hpp | 5 ++ sycl/source/detail/scheduler/commands.cpp | 71 +------------------ sycl/source/handler.cpp | 35 +++------ 8 files changed, 38 insertions(+), 195 deletions(-) diff --git a/sycl/include/sycl/detail/cg_types.hpp b/sycl/include/sycl/detail/cg_types.hpp index b082c34fa5cbe..843db64dea661 100644 --- a/sycl/include/sycl/detail/cg_types.hpp +++ b/sycl/include/sycl/detail/cg_types.hpp @@ -67,11 +67,6 @@ enum class CGType : unsigned int { EnqueueNativeCommand = 27, AsyncAlloc = 28, AsyncFree = 29, -#ifdef __INTEL_PREVIEW_BREAKING_CHANGES - PrefetchUSMExp = 30, -#else - PrefetchUSMExpD2H = 30, -#endif }; template struct check_fn_signature { diff --git a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp index 0b82f0ce10fa9..a661cd3b9a521 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp @@ -373,12 +373,14 @@ void fill(sycl::queue Q, T *Ptr, const T &Pattern, size_t Count, inline void prefetch(handler &CGH, void *Ptr, size_t NumBytes, prefetch_type Type = prefetch_type::device) { #ifdef __INTEL_PREVIEW_BREAKING_CHANGES - CGH.ext_oneapi_prefetch_exp(Ptr, NumBytes, Type); + CGH.prefetch(Ptr, NumBytes, Type); #else if (Type == prefetch_type::device) { + // Incase an older libsycl.so is used, don't call prefetch function overload + // with new prefetch_type parameter: CGH.prefetch(Ptr, NumBytes); } else { - CGH.ext_oneapi_prefetch_d2h(Ptr, NumBytes); + CGH.prefetch(Ptr, NumBytes, Type); } #endif } diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp index 74a8118833769..58e4bb1d10af9 100644 --- a/sycl/include/sycl/handler.hpp +++ b/sycl/include/sycl/handler.hpp @@ -150,7 +150,6 @@ template class work_group_memory; template class dynamic_work_group_memory; struct image_descriptor; enum class prefetch_type; -void prefetch(handler &CGH, void *Ptr, size_t NumBytes, prefetch_type Type); __SYCL_EXPORT void async_free(sycl::handler &h, void *ptr); __SYCL_EXPORT void *async_malloc(sycl::handler &h, sycl::usm::alloc kind, @@ -2622,6 +2621,16 @@ class __SYCL_EXPORT handler { /// \param Count is a number of bytes to be prefetched. void prefetch(const void *Ptr, size_t Count); + /// Provides hints to the runtime library that data should be made available + /// on a device earlier than Unified Shared Memory would normally require it + /// to be available. + /// + /// \param Ptr is a USM pointer to the memory to be prefetched to the device. + /// \param Count is a number of bytes to be prefetched. + /// \param Type is type of prefetch, i.e. fetch to device or fetch to host. + void prefetch(const void *Ptr, size_t Count, + ext::oneapi::experimental::prefetch_type Type); + /// Provides additional information to the underlying runtime about how /// different allocations are used. /// @@ -3461,27 +3470,6 @@ class __SYCL_EXPORT handler { void ext_oneapi_memset2d_impl(void *Dest, size_t DestPitch, int Value, size_t Width, size_t Height); -// Implementation of enqueue_functions extension's USM prefetch, allowing for -// prefetching memory from both host to device and vice versa. -#ifdef __INTEL_PREVIEW_BREAKING_CHANGES - // Prefetch implementation that accounts for prefetching both directions, but - // introduces a "prefetch type" field to handler/CG nodes: this results in an - // ABI break. - void ext_oneapi_prefetch_exp(const void *Ptr, size_t Count, - ext::oneapi::experimental::prefetch_type Type); -#else - // Non-ABI breaking implementation that implements prefetching from device to - // host as a separate function. - void ext_oneapi_prefetch_d2h(const void *Ptr, size_t Count); - // TODO upon next ABI-breaking cycle, decide which approach to go with. -#endif - - // Enqueue_functions extension's prefetch function is friended in order to - // call private handler function ext_oneapi_prefetch_d2h. - friend void ext::oneapi::experimental::prefetch( - handler &CGH, void *Ptr, size_t NumBytes, - ext::oneapi::experimental::prefetch_type Type); - // Implementation of memcpy to device_global. void memcpyToDeviceGlobal(const void *DeviceGlobalPtr, const void *Src, bool IsDeviceImageScoped, size_t NumBytes, diff --git a/sycl/source/detail/cg.hpp b/sycl/source/detail/cg.hpp index aa35b4f89403a..1861c64ad4bc1 100644 --- a/sycl/source/detail/cg.hpp +++ b/sycl/source/detail/cg.hpp @@ -398,50 +398,20 @@ class CGFillUSM : public CG { class CGPrefetchUSM : public CG { void *MDst; size_t MLength; + ext::oneapi::experimental::prefetch_type MPrefetchType; public: CGPrefetchUSM(void *DstPtr, size_t Length, CG::StorageInitHelper CGData, + ext::oneapi::experimental::prefetch_type PrefetchType, detail::code_location loc = {}) : CG(CGType::PrefetchUSM, std::move(CGData), std::move(loc)), - MDst(DstPtr), MLength(Length) {} - void *getDst() { return MDst; } - size_t getLength() { return MLength; } -}; - -#ifdef __INTEL_PREVIEW_BREAKING_CHANGES -/// Enqueue_functions extension USM Prefetch command group class -class CGPrefetchUSMExp : public CG { - void *MDst; - size_t MLength; - ext::oneapi::experimental::prefetch_type MPrefetchType; - -public: - CGPrefetchUSMExp(void *DstPtr, size_t Length, CG::StorageInitHelper CGData, - ext::oneapi::experimental::prefetch_type PrefetchType, - detail::code_location loc = {}) - : CG(CGType::PrefetchUSMExp, std::move(CGData), std::move(loc)), MDst(DstPtr), MLength(Length), MPrefetchType(PrefetchType) {} - void *getDst() { return MDst; } - size_t getLength() { return MLength; } - ext::oneapi::experimental::prefetch_type getPrefetchType() { + void *getDst() const { return MDst; } + size_t getLength() const { return MLength; } + ext::oneapi::experimental::prefetch_type getPrefetchType() const { return MPrefetchType; } }; -#else -/// Enqueue_functions USM device-to-host prefetch command group class -class CGPrefetchUSMExpD2H : public CG { - void *MDst; - size_t MLength; - -public: - CGPrefetchUSMExpD2H(void *DstPtr, size_t Length, CG::StorageInitHelper CGData, - detail::code_location loc = {}) - : CG(CGType::PrefetchUSMExpD2H, std::move(CGData), std::move(loc)), - MDst(DstPtr), MLength(Length) {} - void *getDst() { return MDst; } - size_t getLength() { return MLength; } -}; -#endif /// "Advise USM" command group class. class CGAdviseUSM : public CG { diff --git a/sycl/source/detail/graph/node_impl.hpp b/sycl/source/detail/graph/node_impl.hpp index 01ebf4e9168fc..45286db8b8863 100644 --- a/sycl/source/detail/graph/node_impl.hpp +++ b/sycl/source/detail/graph/node_impl.hpp @@ -56,11 +56,6 @@ inline node_type getNodeTypeFromCG(sycl::detail::CGType CGType) { case sycl::detail::CGType::FillUSM: return node_type::memfill; case sycl::detail::CGType::PrefetchUSM: -#ifdef __INTEL_PREVIEW_BREAKING_CHANGES - case sycl::detail::CGType::PrefetchUSMExp: -#else - case sycl::detail::CGType::PrefetchUSMExpD2H: -#endif return node_type::prefetch; case sycl::detail::CGType::AdviseUSM: return node_type::memadvise; @@ -253,13 +248,6 @@ class node_impl : public std::enable_shared_from_this { return createCGCopy(); case sycl::detail::CGType::PrefetchUSM: return createCGCopy(); -#ifdef __INTEL_PREVIEW_BREAKING_CHANGES - case sycl::detail::CGType::PrefetchUSMExp: - return createCGCopy(); -#else - case sycl::detail::CGType::PrefetchUSMExpD2H: - return createCGCopy(); -#endif case sycl::detail::CGType::AdviseUSM: return createCGCopy(); case sycl::detail::CGType::Copy2DUSM: @@ -668,34 +656,13 @@ class node_impl : public std::enable_shared_from_this { sycl::detail::CGPrefetchUSM *Prefetch = static_cast(MCommandGroup.get()); Stream << "Dst: " << Prefetch->getDst() - << " Length: " << Prefetch->getLength() << "\\n"; - } - break; -#ifdef __INTEL_PREVIEW_BREAKING_CHANGES - case sycl::detail::CGType::PrefetchUSMExp: - Stream << "CGPrefetchUSMExp \\n"; - if (Verbose) { - sycl::detail::CGPrefetchUSMExp *PrefetchExp = - static_cast(MCommandGroup.get()); - Stream << "Dst: " << PrefetchExp->getDst() - << " Length: " << PrefetchExp->getLength() << " PrefetchType: " + << " Length: " << Prefetch->getLength() << " PrefetchType: " << sycl::ext::oneapi::experimental::prefetchTypeToString( - PrefetchExp->getPrefetchType()) + Prefetch->getPrefetchType()) << "\\n"; + } break; -#else - case sycl::detail::CGType::PrefetchUSMExpD2H: - Stream << "CGPrefetchUSMExpD2H (Experimental, Device to host) \\n"; - if (Verbose) { - sycl::detail::CGPrefetchUSMExpD2H *Prefetch = - static_cast( - MCommandGroup.get()); - Stream << "Dst: " << Prefetch->getDst() - << " Length: " << Prefetch->getLength() << "\\n"; - } - break; -#endif case sycl::detail::CGType::AdviseUSM: Stream << "CGAdviseUSM \\n"; if (Verbose) { diff --git a/sycl/source/detail/handler_impl.hpp b/sycl/source/detail/handler_impl.hpp index 0fda3dd4f2769..fa96fe6bb1284 100644 --- a/sycl/source/detail/handler_impl.hpp +++ b/sycl/source/detail/handler_impl.hpp @@ -12,6 +12,7 @@ #include #include #include +#include namespace sycl { inline namespace _V1 { @@ -91,6 +92,10 @@ class handler_impl { /// property. bool MIsDeviceImageScoped = false; + /// Direction of USM prefetch / destination device. + sycl::ext::oneapi::experimental::prefetch_type MPrefetchType = + sycl::ext::oneapi::experimental::prefetch_type::device; + // Program scope pipe information. // Pipe name that uniquely identifies a pipe. diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 96edcbb2e1251..7cbce262e8be7 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -1913,15 +1913,6 @@ static std::string_view cgTypeToString(detail::CGType Type) { case detail::CGType::PrefetchUSM: return "prefetch usm"; break; -#ifdef __INTEL_PREVIEW_BREAKING_CHANGES - case detail::CGType::PrefetchUSMExp: - return "prefetch usm (experimental)"; - break; -#else - case detail::CGType::PrefetchUSMExpD2H: - return "prefetch usm (experimental, device to host)"; - break; -#endif case detail::CGType::CodeplayHostTask: return "host task"; break; @@ -2999,43 +2990,13 @@ ur_result_t ExecCGCommand::enqueueImpCommandBuffer() { MemoryManager::ext_oneapi_prefetch_usm_cmd_buffer, &MQueue->getContextImpl(), MCommandBuffer, Prefetch->getDst(), Prefetch->getLength(), std::move(MSyncPointDeps), &OutSyncPoint, - sycl::ext::oneapi::experimental::prefetch_type::device); - Result != UR_RESULT_SUCCESS) - return Result; - - MEvent->setSyncPoint(OutSyncPoint); - return UR_RESULT_SUCCESS; - } -#ifdef __INTEL_PREVIEW_BREAKING_CHANGES - case CGType::PrefetchUSMExp: { - CGPrefetchUSMExp *PrefetchExp = (CGPrefetchUSMExp *)MCommandGroup.get(); - if (auto Result = callMemOpHelper( - MemoryManager::ext_oneapi_prefetch_usm_cmd_buffer, - &MQueue->getContextImpl(), MCommandBuffer, PrefetchExp->getDst(), - PrefetchExp->getLength(), std::move(MSyncPointDeps), &OutSyncPoint, - PrefetchExp->getPrefetchType()); - Result != UR_RESULT_SUCCESS) - return Result; - - MEvent->setSyncPoint(OutSyncPoint); - return UR_RESULT_SUCCESS; - } -#else - case CGType::PrefetchUSMExpD2H: { - CGPrefetchUSMExpD2H *PrefetchD2H = - (CGPrefetchUSMExpD2H *)MCommandGroup.get(); - if (auto Result = callMemOpHelper( - MemoryManager::ext_oneapi_prefetch_usm_cmd_buffer, - &MQueue->getContextImpl(), MCommandBuffer, PrefetchD2H->getDst(), - PrefetchD2H->getLength(), std::move(MSyncPointDeps), &OutSyncPoint, - sycl::ext::oneapi::experimental::prefetch_type::host); + Prefetch->getPrefetchType()); Result != UR_RESULT_SUCCESS) return Result; MEvent->setSyncPoint(OutSyncPoint); return UR_RESULT_SUCCESS; } -#endif case CGType::AdviseUSM: { CGAdviseUSM *Advise = (CGAdviseUSM *)MCommandGroup.get(); if (auto Result = callMemOpHelper( @@ -3341,41 +3302,13 @@ ur_result_t ExecCGCommand::enqueueImpQueue() { if (auto Result = callMemOpHelper( MemoryManager::prefetch_usm, Prefetch->getDst(), *MQueue, Prefetch->getLength(), std::move(RawEvents), Event, - sycl::ext::oneapi::experimental::prefetch_type::device); - Result != UR_RESULT_SUCCESS) - return Result; - - SetEventHandleOrDiscard(); - return UR_RESULT_SUCCESS; - } -#ifdef __INTEL_PREVIEW_BREAKING_CHANGES - case CGType::PrefetchUSMExp: { - CGPrefetchUSMExp *PrefetchExp = (CGPrefetchUSMExp *)MCommandGroup.get(); - if (auto Result = callMemOpHelper( - MemoryManager::prefetch_usm, PrefetchExp->getDst(), *MQueue, - PrefetchExp->getLength(), std::move(RawEvents), Event, - PrefetchExp->getPrefetchType()); + Prefetch->getPrefetchType()); Result != UR_RESULT_SUCCESS) return Result; SetEventHandleOrDiscard(); return UR_RESULT_SUCCESS; } -#else - case CGType::PrefetchUSMExpD2H: { - CGPrefetchUSMExpD2H *PrefetchD2H = - (CGPrefetchUSMExpD2H *)MCommandGroup.get(); - if (auto Result = callMemOpHelper( - MemoryManager::prefetch_usm, PrefetchD2H->getDst(), *MQueue, - PrefetchD2H->getLength(), std::move(RawEvents), Event, - sycl::ext::oneapi::experimental::prefetch_type::host); - Result != UR_RESULT_SUCCESS) - return Result; - - SetEventHandleOrDiscard(); - return UR_RESULT_SUCCESS; - } -#endif case CGType::AdviseUSM: { CGAdviseUSM *Advise = (CGAdviseUSM *)MCommandGroup.get(); if (auto Result = diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index 1c1aba7868467..b8baa5c5ff158 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -38,6 +38,7 @@ #include #include #include +#include #include namespace sycl { @@ -720,19 +721,9 @@ event handler::finalize() { break; case detail::CGType::PrefetchUSM: CommandGroup.reset(new detail::CGPrefetchUSM( - MDstPtr, MLength, std::move(impl->CGData), MCodeLoc)); - break; -#ifdef __INTEL_PREVIEW_BREAKING_CHANGES - case detail::CGType::PrefetchUSMExp: - CommandGroup.reset(new detail::CGPrefetchUSMExp( - MDstPtr, MLength, std::move(impl->CGData), MPrefetchType, MCodeLoc)); - break; -#else - case detail::CGType::PrefetchUSMExpD2H: - CommandGroup.reset(new detail::CGPrefetchUSMExpD2H( - MDstPtr, MLength, std::move(impl->CGData), MCodeLoc)); + MDstPtr, MLength, std::move(impl->CGData), impl->MPrefetchType, + MCodeLoc)); break; -#endif case detail::CGType::AdviseUSM: CommandGroup.reset(new detail::CGAdviseUSM(MDstPtr, MLength, impl->MAdvice, std::move(impl->CGData), @@ -1484,27 +1475,19 @@ void handler::prefetch(const void *Ptr, size_t Count) { throwIfActionIsCreated(); MDstPtr = const_cast(Ptr); MLength = Count; + impl->MPrefetchType = ext::oneapi::experimental::prefetch_type::device; setType(detail::CGType::PrefetchUSM); } -#ifdef __INTEL_PREVIEW_BREAKING_CHANGES -void handler::ext_oneapi_prefetch_exp( - const void *Ptr, size_t Count, - ext::oneapi::experimental::prefetch_type Type) { +void handler::prefetch( + const void *Ptr, size_t Count, + ext::oneapi::experimental::prefetch_type Type) { throwIfActionIsCreated(); MDstPtr = const_cast(Ptr); MLength = Count; - MPrefetchType = Type; - setType(detail::CGType::PrefetchUSMExp); -} -#else -void handler::ext_oneapi_prefetch_d2h(const void *Ptr, size_t Count) { - throwIfActionIsCreated(); - MDstPtr = const_cast(Ptr); - MLength = Count; - setType(detail::CGType::PrefetchUSMExpD2H); + impl->MPrefetchType = Type; + setType(detail::CGType::PrefetchUSM); } -#endif void handler::mem_advise(const void *Ptr, size_t Count, int Advice) { throwIfActionIsCreated(); From 16d40d93172a7e1138be7fee4ddc68c0e94bbbfa Mon Sep 17 00:00:00 2001 From: Ian Li Date: Fri, 25 Jul 2025 14:52:49 -0700 Subject: [PATCH 23/33] Fix testing --- sycl/include/sycl/handler.hpp | 6 +----- .../ext_oneapi_enqueue_functions_prefetch.cpp | 12 +++++------- sycl/test-e2e/USM/prefetch_exp.cpp | 7 ++----- 3 files changed, 8 insertions(+), 17 deletions(-) diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp index 58e4bb1d10af9..b228543b3cb33 100644 --- a/sycl/include/sycl/handler.hpp +++ b/sycl/include/sycl/handler.hpp @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include @@ -149,7 +150,6 @@ namespace ext ::oneapi ::experimental { template class work_group_memory; template class dynamic_work_group_memory; struct image_descriptor; -enum class prefetch_type; __SYCL_EXPORT void async_free(sycl::handler &h, void *ptr); __SYCL_EXPORT void *async_malloc(sycl::handler &h, sycl::usm::alloc kind, @@ -3200,10 +3200,6 @@ class __SYCL_EXPORT handler { void *MDstPtr = nullptr; /// Length to copy or fill (for USM operations). size_t MLength = 0; -#ifdef __INTEL_PREVIEW_BREAKING_CHANGES - // Prefetch direction for ext_oneapi_prefetch_exp - ext::oneapi::experimental::prefetch_type MPrefetchType; -#endif /// Pattern that is used to fill memory object in case command type is fill. std::vector MPattern; /// Storage for a lambda or function object. diff --git a/sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions_prefetch.cpp b/sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions_prefetch.cpp index dd59858153fd4..fe925ed21fa99 100644 --- a/sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions_prefetch.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions_prefetch.cpp @@ -1,8 +1,10 @@ +// REQUIRES: aspect-usm_shared_allocations +// // RUN: %{build} -o %t.out // RUN: %{run} %t.out // -// OpenCL currently does not support command buffers: // UNSUPPORTED: opencl +// UNSUPPORTED-INTENDED: OpenCL currently does not support command buffers // // Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG // RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} @@ -19,10 +21,6 @@ static constexpr int Pattern = 42; int main() { queue Q{}; - if (!Q.get_device().get_info()) { - // USM not supported, skipping test and returning early. - return 0; - } int *Src = (int *)malloc_shared(sizeof(int) * N, Q.get_device(), Q.get_context()); @@ -56,7 +54,7 @@ int main() { // Check host-to-device prefetch results for (int i = 0; i < N; i++) - assert(Dst[i] == Pattern * 2); + assert(check_value(i, Pattern * 2, Dst[i], "Dst")); { exp_ext::command_graph Graph{Q.get_context(), Q.get_device(), {}}; @@ -85,7 +83,7 @@ int main() { // Check device-to-host prefetch results for (int i = 0; i < N; i++) - assert(Dst[i] == Pattern + 1); + assert(check_value(i, Pattern + 1, Dst[i], "Dst")); free(Src, Q); free(Dst, Q); diff --git a/sycl/test-e2e/USM/prefetch_exp.cpp b/sycl/test-e2e/USM/prefetch_exp.cpp index b3145d919bcfb..7aea460f0aa22 100644 --- a/sycl/test-e2e/USM/prefetch_exp.cpp +++ b/sycl/test-e2e/USM/prefetch_exp.cpp @@ -6,6 +6,8 @@ // //===----------------------------------------------------------------------===// +// REQUIRES: aspect-usm_shared_allocations +// // RUN: %{build} -o %t1.out // RUN: %{run} %t1.out @@ -23,11 +25,6 @@ int main() { throw e; }); - if (!q.get_device().get_info()) { - // USM not supported, skipping test and returning early. - return 0; - } - float *Src = (float *)malloc_shared(sizeof(float) * Count, q.get_device(), q.get_context()); float *Dest = (float *)malloc_shared(sizeof(float) * Count, q.get_device(), From 160af9d4136b3b0312e4115f2b23dfeeefeeb30a Mon Sep 17 00:00:00 2001 From: Ian Li Date: Fri, 25 Jul 2025 17:54:44 -0400 Subject: [PATCH 24/33] Apply suggestions for graph tests Co-authored-by: Pablo Reble --- .../ext_oneapi_enqueue_functions_prefetch.cpp | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions_prefetch.cpp b/sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions_prefetch.cpp index fe925ed21fa99..75e6fcbe7c4f1 100644 --- a/sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions_prefetch.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions_prefetch.cpp @@ -6,10 +6,7 @@ // UNSUPPORTED: opencl // UNSUPPORTED-INTENDED: OpenCL currently does not support command buffers // -// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG -// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} -// Extra run to check for immediate-command-list in Level Zero -// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// RUN: %if level_zero %{%{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} // Tests prefetch functionality in enqueue functions @@ -22,8 +19,7 @@ static constexpr int Pattern = 42; int main() { queue Q{}; - int *Src = - (int *)malloc_shared(sizeof(int) * N, Q.get_device(), Q.get_context()); + int *Src = malloc_shared(N, Q); int *Dst = (int *)malloc_shared(sizeof(int) * N, Q.get_device(), Q.get_context()); for (int i = 0; i < N; i++) From ddb53e5b950f50b59805b204d3335f1db7532d78 Mon Sep 17 00:00:00 2001 From: Ian Li Date: Fri, 25 Jul 2025 14:57:28 -0700 Subject: [PATCH 25/33] use new syntax for malloc_shared --- .../RecordReplay/ext_oneapi_enqueue_functions_prefetch.cpp | 3 +-- sycl/test-e2e/USM/prefetch_exp.cpp | 6 ++---- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions_prefetch.cpp b/sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions_prefetch.cpp index 75e6fcbe7c4f1..f057fa67c83c2 100644 --- a/sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions_prefetch.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions_prefetch.cpp @@ -20,8 +20,7 @@ int main() { queue Q{}; int *Src = malloc_shared(N, Q); - int *Dst = - (int *)malloc_shared(sizeof(int) * N, Q.get_device(), Q.get_context()); + int *Dst = malloc_shared(N, Q) for (int i = 0; i < N; i++) Src[i] = Pattern; diff --git a/sycl/test-e2e/USM/prefetch_exp.cpp b/sycl/test-e2e/USM/prefetch_exp.cpp index 7aea460f0aa22..56fa19c527814 100644 --- a/sycl/test-e2e/USM/prefetch_exp.cpp +++ b/sycl/test-e2e/USM/prefetch_exp.cpp @@ -25,10 +25,8 @@ int main() { throw e; }); - float *Src = (float *)malloc_shared(sizeof(float) * Count, q.get_device(), - q.get_context()); - float *Dest = (float *)malloc_shared(sizeof(float) * Count, q.get_device(), - q.get_context()); + float *Src = malloc_shared(Count, q); + float *Dest = malloc_shared(Count, q); for (int i = 0; i < Count; i++) Src[i] = i; From 8de6a9a4f0dc37be93c6304fb723ba27bd8bfd76 Mon Sep 17 00:00:00 2001 From: Ian Li Date: Fri, 25 Jul 2025 15:05:17 -0700 Subject: [PATCH 26/33] clang-format --- sycl/include/sycl/handler.hpp | 2 +- sycl/source/detail/graph/node_impl.hpp | 1 - sycl/source/detail/handler_impl.hpp | 2 +- sycl/source/handler.cpp | 13 ++++++------- .../ext_oneapi_enqueue_functions_prefetch.cpp | 2 +- 5 files changed, 9 insertions(+), 11 deletions(-) diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp index b228543b3cb33..32383b43cefa1 100644 --- a/sycl/include/sycl/handler.hpp +++ b/sycl/include/sycl/handler.hpp @@ -32,10 +32,10 @@ #include #include #include +#include #include #include #include -#include #include #include #include diff --git a/sycl/source/detail/graph/node_impl.hpp b/sycl/source/detail/graph/node_impl.hpp index 45286db8b8863..fdcae10a5ceb4 100644 --- a/sycl/source/detail/graph/node_impl.hpp +++ b/sycl/source/detail/graph/node_impl.hpp @@ -660,7 +660,6 @@ class node_impl : public std::enable_shared_from_this { << sycl::ext::oneapi::experimental::prefetchTypeToString( Prefetch->getPrefetchType()) << "\\n"; - } break; case sycl::detail::CGType::AdviseUSM: diff --git a/sycl/source/detail/handler_impl.hpp b/sycl/source/detail/handler_impl.hpp index fa96fe6bb1284..23ce36d691dc2 100644 --- a/sycl/source/detail/handler_impl.hpp +++ b/sycl/source/detail/handler_impl.hpp @@ -94,7 +94,7 @@ class handler_impl { /// Direction of USM prefetch / destination device. sycl::ext::oneapi::experimental::prefetch_type MPrefetchType = - sycl::ext::oneapi::experimental::prefetch_type::device; + sycl::ext::oneapi::experimental::prefetch_type::device; // Program scope pipe information. diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index b8baa5c5ff158..f0ecf11affc99 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -36,9 +36,9 @@ #include #include +#include #include #include -#include #include namespace sycl { @@ -720,9 +720,9 @@ event handler::finalize() { MCodeLoc)); break; case detail::CGType::PrefetchUSM: - CommandGroup.reset(new detail::CGPrefetchUSM( - MDstPtr, MLength, std::move(impl->CGData), impl->MPrefetchType, - MCodeLoc)); + CommandGroup.reset( + new detail::CGPrefetchUSM(MDstPtr, MLength, std::move(impl->CGData), + impl->MPrefetchType, MCodeLoc)); break; case detail::CGType::AdviseUSM: CommandGroup.reset(new detail::CGAdviseUSM(MDstPtr, MLength, impl->MAdvice, @@ -1479,9 +1479,8 @@ void handler::prefetch(const void *Ptr, size_t Count) { setType(detail::CGType::PrefetchUSM); } -void handler::prefetch( - const void *Ptr, size_t Count, - ext::oneapi::experimental::prefetch_type Type) { +void handler::prefetch(const void *Ptr, size_t Count, + ext::oneapi::experimental::prefetch_type Type) { throwIfActionIsCreated(); MDstPtr = const_cast(Ptr); MLength = Count; diff --git a/sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions_prefetch.cpp b/sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions_prefetch.cpp index f057fa67c83c2..982d2a52014a7 100644 --- a/sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions_prefetch.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions_prefetch.cpp @@ -20,7 +20,7 @@ int main() { queue Q{}; int *Src = malloc_shared(N, Q); - int *Dst = malloc_shared(N, Q) + int *Dst = malloc_shared(N, Q); for (int i = 0; i < N; i++) Src[i] = Pattern; From e601af9800540adcbb3a5f393cf5216a5d711a52 Mon Sep 17 00:00:00 2001 From: Ian Li Date: Fri, 25 Jul 2025 15:16:57 -0700 Subject: [PATCH 27/33] remove import from handler --- sycl/include/sycl/handler.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp index 32383b43cefa1..6f96d771b13bf 100644 --- a/sycl/include/sycl/handler.hpp +++ b/sycl/include/sycl/handler.hpp @@ -32,7 +32,6 @@ #include #include #include -#include #include #include #include @@ -150,6 +149,7 @@ namespace ext ::oneapi ::experimental { template class work_group_memory; template class dynamic_work_group_memory; struct image_descriptor; +enum class prefetch_type; __SYCL_EXPORT void async_free(sycl::handler &h, void *ptr); __SYCL_EXPORT void *async_malloc(sycl::handler &h, sycl::usm::alloc kind, From 514474d08ca46623e5f19a0d826efe5c46802e85 Mon Sep 17 00:00:00 2001 From: Ian Li Date: Tue, 29 Jul 2025 08:43:59 -0700 Subject: [PATCH 28/33] add new ABI symbol --- sycl/test/abi/sycl_symbols_linux.dump | 1 + sycl/test/abi/sycl_symbols_windows.dump | 1 + 2 files changed, 2 insertions(+) diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump index dd392cf315b88..62d363b8d4189 100644 --- a/sycl/test/abi/sycl_symbols_linux.dump +++ b/sycl/test/abi/sycl_symbols_linux.dump @@ -3632,6 +3632,7 @@ _ZN4sycl3_V17handler7setTypeENS0_6detail6CGTypeE _ZN4sycl3_V17handler8finalizeEv _ZN4sycl3_V17handler8getQueueEv _ZN4sycl3_V17handler8prefetchEPKvm +_ZN4sycl3_V17handler8prefetchEPKvmNS0_3ext6oneapi12experimental13prefetch_typeE _ZN4sycl3_V17handler9clearArgsEv _ZN4sycl3_V17handler9fill_implEPvPKvmm _ZN4sycl3_V17handlerC1EOSt10unique_ptrINS0_6detail12handler_implESt14default_deleteIS4_EE diff --git a/sycl/test/abi/sycl_symbols_windows.dump b/sycl/test/abi/sycl_symbols_windows.dump index 748d74482a1a5..2bf236e2a5d35 100644 --- a/sycl/test/abi/sycl_symbols_windows.dump +++ b/sycl/test/abi/sycl_symbols_windows.dump @@ -4358,6 +4358,7 @@ ?postProcess@HandlerAccess@detail@_V1@sycl@@SAXAEAVhandler@34@Vtype_erased_cgfo_ty@234@@Z ?preProcess@HandlerAccess@detail@_V1@sycl@@SAXAEAVhandler@34@Vtype_erased_cgfo_ty@234@@Z ?prefetch@handler@_V1@sycl@@QEAAXPEBX_K@Z +?prefetch@handler@_V1@sycl@@QEAAXPEBX_KW4prefetch_type@experimental@oneapi@ext@23@@Z ?prefetch@queue@_V1@sycl@@QEAA?AVevent@23@PEBX_KAEBUcode_location@detail@23@@Z ?prefetch@queue@_V1@sycl@@QEAA?AVevent@23@PEBX_KAEBV?$vector@Vevent@_V1@sycl@@V?$allocator@Vevent@_V1@sycl@@@std@@@std@@AEBUcode_location@detail@23@@Z ?prefetch@queue@_V1@sycl@@QEAA?AVevent@23@PEBX_KV423@AEBUcode_location@detail@23@@Z From b074aa7b15f8fa3e4368741c02447806f4ea3365 Mon Sep 17 00:00:00 2001 From: Ian Li Date: Tue, 29 Jul 2025 08:44:50 -0700 Subject: [PATCH 29/33] remove preview_breaking_changes --- .../sycl/ext/oneapi/experimental/enqueue_functions.hpp | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp index a661cd3b9a521..36f14b23845c1 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp @@ -372,17 +372,7 @@ void fill(sycl::queue Q, T *Ptr, const T &Pattern, size_t Count, inline void prefetch(handler &CGH, void *Ptr, size_t NumBytes, prefetch_type Type = prefetch_type::device) { -#ifdef __INTEL_PREVIEW_BREAKING_CHANGES CGH.prefetch(Ptr, NumBytes, Type); -#else - if (Type == prefetch_type::device) { - // Incase an older libsycl.so is used, don't call prefetch function overload - // with new prefetch_type parameter: - CGH.prefetch(Ptr, NumBytes); - } else { - CGH.prefetch(Ptr, NumBytes, Type); - } -#endif } inline void prefetch(queue Q, void *Ptr, size_t NumBytes, From 24516095f10b299975acddb27e3e914220f25d28 Mon Sep 17 00:00:00 2001 From: Ian Li Date: Tue, 29 Jul 2025 12:33:06 -0700 Subject: [PATCH 30/33] Add new unit testing for normal USM prefetch --- sycl/unittests/Extensions/CMakeLists.txt | 1 + sycl/unittests/Extensions/USMPrefetch.cpp | 71 +++++++++++++++++++++++ 2 files changed, 72 insertions(+) create mode 100644 sycl/unittests/Extensions/USMPrefetch.cpp diff --git a/sycl/unittests/Extensions/CMakeLists.txt b/sycl/unittests/Extensions/CMakeLists.txt index 851ae8720b856..59d57f0851ec1 100644 --- a/sycl/unittests/Extensions/CMakeLists.txt +++ b/sycl/unittests/Extensions/CMakeLists.txt @@ -23,6 +23,7 @@ add_sycl_unittest(ExtensionsTests OBJECT EventMode.cpp DeviceInfo.cpp RootGroup.cpp + USMPrefetch.cpp ) add_subdirectory(CommandGraph) diff --git a/sycl/unittests/Extensions/USMPrefetch.cpp b/sycl/unittests/Extensions/USMPrefetch.cpp new file mode 100644 index 0000000000000..d8abff74b0248 --- /dev/null +++ b/sycl/unittests/Extensions/USMPrefetch.cpp @@ -0,0 +1,71 @@ +//==------------------------- USMPrefetch.cpp ------------------------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// Test SYCL prefetch calls UR prefetch functions with the correct arguments. + +#include + +#include +#include +#include +#include +#include + +using namespace sycl; + +namespace { + +static ur_usm_migration_flags_t SubmittedPrefetchType = + UR_USM_MIGRATION_FLAG_FORCE_UINT32; + +inline ur_result_t replace_urUSMEnqueuePrefetch(void *pParams) { + auto params = *static_cast(pParams); + SubmittedPrefetchType = *params.pflags; + return UR_RESULT_SUCCESS; +} + +static constexpr size_t N = 1024; +class USMPrefetchTests : public ::testing::Test { +public: + USMPrefetchTests() + : Mock{}, Q{context(sycl::platform()), default_selector_v, + property::queue::in_order{}} {} + +protected: + void SetUp() override { + SubmittedPrefetchType = UR_USM_MIGRATION_FLAG_FORCE_UINT32; + Dst = malloc_shared(N, Q); + } + + unittest::UrMock<> Mock; + queue Q; + int *Dst; +}; + +TEST_F(USMPrefetchTests, QueuePrefetch) { + mock::getCallbacks().set_replace_callback("urEnqueueUSMPrefetch", + replace_urUSMEnqueuePrefetch); + + Q.prefetch(Dst, sizeof(int) * N); + ASSERT_EQ(SubmittedPrefetchType, UR_USM_MIGRATION_FLAG_HOST_TO_DEVICE); + + free(Dst, Q); +} + +TEST_F(USMPrefetchTests, HandlerPrefetch) { + mock::getCallbacks().set_replace_callback("urEnqueueUSMPrefetch", + replace_urUSMEnqueuePrefetch); + + Q.submit([&](handler &CGH) { + CGH.prefetch(Dst, sizeof(int) * N); + }); + ASSERT_EQ(SubmittedPrefetchType, UR_USM_MIGRATION_FLAG_HOST_TO_DEVICE); + + free(Dst, Q); +} + +} // namespace From 09e9a92609450ba5da33a4119c16ac560c3fb782 Mon Sep 17 00:00:00 2001 From: Ian Li Date: Tue, 29 Jul 2025 12:43:13 -0700 Subject: [PATCH 31/33] clang-format --- sycl/unittests/Extensions/USMPrefetch.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/sycl/unittests/Extensions/USMPrefetch.cpp b/sycl/unittests/Extensions/USMPrefetch.cpp index d8abff74b0248..512eb638d51ef 100644 --- a/sycl/unittests/Extensions/USMPrefetch.cpp +++ b/sycl/unittests/Extensions/USMPrefetch.cpp @@ -60,9 +60,7 @@ TEST_F(USMPrefetchTests, HandlerPrefetch) { mock::getCallbacks().set_replace_callback("urEnqueueUSMPrefetch", replace_urUSMEnqueuePrefetch); - Q.submit([&](handler &CGH) { - CGH.prefetch(Dst, sizeof(int) * N); - }); + Q.submit([&](handler &CGH) { CGH.prefetch(Dst, sizeof(int) * N); }); ASSERT_EQ(SubmittedPrefetchType, UR_USM_MIGRATION_FLAG_HOST_TO_DEVICE); free(Dst, Q); From 152c1f5379ab8540800aa5b33cdcd69de608d854 Mon Sep 17 00:00:00 2001 From: Ian Li Date: Wed, 30 Jul 2025 12:44:09 -0400 Subject: [PATCH 32/33] Be more precise about opencl command buffer support Co-authored-by: Pablo Reble --- .../RecordReplay/ext_oneapi_enqueue_functions_prefetch.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions_prefetch.cpp b/sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions_prefetch.cpp index 982d2a52014a7..7e4ceb20e0351 100644 --- a/sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions_prefetch.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions_prefetch.cpp @@ -4,7 +4,7 @@ // RUN: %{run} %t.out // // UNSUPPORTED: opencl -// UNSUPPORTED-INTENDED: OpenCL currently does not support command buffers +// UNSUPPORTED-INTENDED: OpenCL currently has limited support for command buffers // // RUN: %if level_zero %{%{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} From b81a531155c3e0df8a6039a5df8719ba1e88b0cf Mon Sep 17 00:00:00 2001 From: Ian Li Date: Wed, 30 Jul 2025 09:48:22 -0700 Subject: [PATCH 33/33] clang-format --- .../RecordReplay/ext_oneapi_enqueue_functions_prefetch.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions_prefetch.cpp b/sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions_prefetch.cpp index 7e4ceb20e0351..d998f6d9a21c6 100644 --- a/sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions_prefetch.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions_prefetch.cpp @@ -4,7 +4,8 @@ // RUN: %{run} %t.out // // UNSUPPORTED: opencl -// UNSUPPORTED-INTENDED: OpenCL currently has limited support for command buffers +// UNSUPPORTED-INTENDED: OpenCL currently has limited support for command +// buffers // // RUN: %if level_zero %{%{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}