4242#include " utils.h"
4343#include " ze_event_cache.h"
4444#include " ze_local_collection_helpers.h"
45+ #include " ze_timer_helper.h"
4546#include " ze_utils.h"
4647#include " ze_wrappers.h"
4748
@@ -64,6 +65,10 @@ struct ZeInstanceData {
6465
6566inline thread_local ZeInstanceData ze_instance_data;
6667
68+ // Used for CPU/GPU sync points. See description to ZeCollector::GetDeviceTimestamps function
69+ inline thread_local std::map<ze_device_handle_t , std::unique_ptr<CPUGPUTimeInterpolationHelper>>
70+ timer_helpers_;
71+
6772struct ZeKernelGroupSize {
6873 uint32_t x;
6974 uint32_t y;
@@ -140,6 +145,7 @@ struct ZeDeviceDescriptor {
140145 uint64_t device_time_origin = 0 ;
141146 uint64_t device_timer_frequency = 0 ;
142147 uint64_t device_timer_mask = 0 ;
148+ uint64_t device_sync_delta = 50'000'000ULL ; // 50ms
143149 ze_driver_handle_t driver = nullptr ;
144150 ze_context_handle_t context = nullptr ;
145151 ze_pci_ext_properties_t pci_properties{};
@@ -247,6 +253,74 @@ class ZeCollector {
247253 enum class ZeCollectionMode { Full = 0 , Hybrid = 1 , Local = 2 };
248254 enum class ZeCollectionState { Normal = 0 , Abnormal = 1 };
249255
256+ /* *
257+ * \internal
258+ * \brief Returns the current device timestamps, CPU in nanoceconds and GPU in ticks
259+ *
260+ * \param [in] device - the device to get the timestamps from
261+ * \param [out] host_time - the CPU time in nanoseconds
262+ * \param [out] device_time - the GPU time in ticks
263+ * \return ZE_RESULT_SUCCESS on success, ZE_RESULT_ERROR on failure
264+ *
265+ * Previously, zeDeviceGetGlobalTimestamps was called every time CPU and GPU timestamps
266+ * needed to be synced (via utils::ze::GetDeviceTimestamps(device, host_time, device_time);)
267+ * GPU cycles were then converted to CPU (aka host) timescale.
268+ * However, zeDeviceGetGlobalTimestamps has a high latency,
269+ * so it is not sutable for frequent calls (e.g. each dozen of micro-secs), especially in profiler
270+ *
271+ * The current implementation calls zeDeviceGetGlobalTimestamp less often:
272+ * once in ~ dozens (or hundreds) of milliseconds
273+ * (see CPUGPUTimeInterpolationHelper.delta) - for sync CPU/GPU point
274+ * per thread per device. (per thread - to avoid any synchronization between threads)
275+ * The delta between sycn points is selected empirically, but it is important keep in mind that
276+ * on systems with 32 GPU time counter - this counter would wrap around every few seconds.
277+ * The delta should be less than this wrap around time.
278+ *
279+ * Another change - the GPU timer frequency is not interpolated anymore but rather
280+ * taken from the device descriptor. This assumes that it is constant.
281+ *
282+ * The function is called synchroniously in a profiled thread, once per device.
283+ *
284+ * InterpolationHelper keeps the sync point from some recent past.
285+ * If CPU time, from the recent sych point, exceeded delta - makes a new sych point.
286+ * The sync point data are returned to a caller. The caller ueses the sync point data
287+ * to convert GPU cycles to CPU time or do other ops with them.
288+ *
289+ */
290+ ze_result_t GetDeviceTimestamps (ze_device_handle_t device, uint64_t * host_time,
291+ uint64_t * device_time) {
292+ PTI_ASSERT (device != nullptr );
293+ PTI_ASSERT (host_time != nullptr );
294+ PTI_ASSERT (device_time != nullptr );
295+ if (timer_helpers_.find (device) == timer_helpers_.end ()) {
296+ timer_helpers_[device] = std::make_unique<CPUGPUTimeInterpolationHelper>(
297+ device, device_descriptors_[device].device_timer_frequency ,
298+ device_descriptors_[device].device_timer_mask ,
299+ device_descriptors_[device].device_sync_delta );
300+ }
301+ uint64_t anchor_host_time = 0 ;
302+ uint64_t anchor_device_time = 0 ;
303+ auto helper = timer_helpers_[device].get ();
304+ uint64_t current_host_time = utils::GetTime ();
305+ if (current_host_time - helper->cpu_timestamp_ > helper->delta_ ) {
306+ ze_result_t res =
307+ utils::ze::GetDeviceTimestamps (device, &anchor_host_time, &anchor_device_time);
308+ PTI_ASSERT (res == ZE_RESULT_SUCCESS);
309+ helper->cpu_timestamp_ = anchor_host_time;
310+ helper->gpu_timestamp_ = anchor_device_time;
311+ } else {
312+ anchor_host_time = helper->cpu_timestamp_ ;
313+ anchor_device_time = helper->gpu_timestamp_ ;
314+ }
315+ current_host_time = utils::GetTime ();
316+ uint64_t current_device_time = anchor_device_time + ((current_host_time - anchor_host_time) /
317+ timer_helpers_[device]->coeff_ );
318+
319+ *host_time = current_host_time;
320+ *device_time = current_device_time;
321+ return ZE_RESULT_SUCCESS;
322+ }
323+
250324 static ZeCollectionMode SelectZeCollectionMode (bool introspection_capable, bool & disabled_mode,
251325 bool & hybrid_mode) {
252326 ZeCollector::ZeCollectionMode mode = ZeCollectionMode::Full;
@@ -375,6 +449,7 @@ class ZeCollector {
375449 swap_event_pool_(512 ),
376450 startstop_mode_changer(this ) {
377451 CreateDeviceMap ();
452+ UpdateDeviceSyncDelta ();
378453 ze_result_t res = l0_wrapper_.InitDynamicTracingWrappers ();
379454 if (ZE_RESULT_SUCCESS == res) {
380455 loader_dynamic_tracing_capable_ = true ;
@@ -450,6 +525,27 @@ class ZeCollector {
450525 }
451526 }
452527
528+ void UpdateDeviceSyncDelta () {
529+ // in future we can try make it per device
530+ SPDLOG_DEBUG (" In {}" , __FUNCTION__);
531+ uint64_t delta = 0 ;
532+ auto env_string = utils::GetEnv (" PTI_DEVICE_SYNC_DELTA" );
533+ SPDLOG_INFO (" Checking DeviceSyncDelta by PTI_DEVICE_SYNC_DELTA environment variable" );
534+ if (!env_string.empty ()) {
535+ try {
536+ delta = std::stoll (env_string);
537+ SPDLOG_INFO (" \t PTI_DEVICE_SYNC_DELTA is {} ns, will use it in device tracing" , delta);
538+ } catch (std::invalid_argument const & /* ex*/ ) {
539+ delta = CPUGPUTimeInterpolationHelper::kSycnDeltaDefault ; // fallback to default
540+ } catch (std::out_of_range const & /* ex*/ ) {
541+ delta = CPUGPUTimeInterpolationHelper::kSycnDeltaDefault ; // fallback to default
542+ }
543+ }
544+ for (auto & [device, descriptor] : device_descriptors_) {
545+ descriptor.device_sync_delta = delta;
546+ }
547+ }
548+
453549 void MarkIntrospection () {
454550 const auto drivers = utils::ze::GetDriverList ();
455551 for (auto const driver : drivers) {
@@ -502,21 +598,6 @@ class ZeCollector {
502598 }
503599
504600 desc.pci_properties = pci_device_properties;
505- uint64_t host_time = 0 ;
506- uint64_t ticks = 0 ;
507- uint64_t device_time = 0 ;
508-
509- overhead::Init ();
510- status = zeDeviceGetGlobalTimestamps (device, &host_time, &ticks);
511- overhead_fini (zeDeviceGetGlobalTimestamps_id);
512- PTI_ASSERT (status == ZE_RESULT_SUCCESS);
513-
514- device_time = ticks & desc.device_timer_mask ;
515- if (desc.device_timer_frequency ) {
516- device_time = device_time * NSEC_IN_SEC / desc.device_timer_frequency ;
517- }
518- desc.host_time_origin = host_time;
519- desc.device_time_origin = device_time;
520601 return desc;
521602 }
522603
@@ -1033,10 +1114,7 @@ class ZeCollector {
10331114 // as all command lists submitted to the execution into queue - they are not immediate
10341115 PTI_ASSERT (!info.immediate );
10351116 PTI_ASSERT (info.device != nullptr );
1036- overhead::Init ();
1037- ze_result_t status =
1038- zeDeviceGetGlobalTimestamps (info.device , &host_time_sync, &device_time_sync);
1039- overhead_fini (zeDeviceGetGlobalTimestamps_id);
1117+ ze_result_t status = GetDeviceTimestamps (info.device , &host_time_sync, &device_time_sync);
10401118 PTI_ASSERT (status == ZE_RESULT_SUCCESS);
10411119
10421120 const std::lock_guard<std::mutex> lock (lock_);
@@ -1434,9 +1512,7 @@ class ZeCollector {
14341512 uint64_t host_timestamp = 0 ;
14351513 uint64_t device_timestamp = 0 ; // in ticks
14361514
1437- overhead::Init ();
1438- ze_result_t status = zeDeviceGetGlobalTimestamps (device, &host_timestamp, &device_timestamp);
1439- overhead_fini (zeDeviceGetGlobalTimestamps_id);
1515+ ze_result_t status = collector->GetDeviceTimestamps (device, &host_timestamp, &device_timestamp);
14401516 PTI_ASSERT (status == ZE_RESULT_SUCCESS);
14411517
14421518 ze_instance_data.timestamp_host = host_timestamp;
@@ -1539,13 +1615,12 @@ class ZeCollector {
15391615 } else if (command->props .type == KernelCommandType::kMemory ) {
15401616 SPDLOG_TRACE (" \t\t Devices in Memory command: src: {}, dst {}" ,
15411617 (void *)command->props .src_device , (void *)command->props .dst_device );
1542- bool is_two_devices =
1543- (command->props .src_device != nullptr && command->props .dst_device != nullptr ) ? true
1544- : false ;
1545- append_res = A2AppendBridgeMemoryCopyOrFill (
1546- command->command_list , command->event_self , command->event_swap , command->props .dst ,
1547- command->props .src , command->props .bytes_transferred , command->props .value_size ,
1548- is_two_devices);
1618+
1619+ auto buffer = device_buffer_pool_.GetBuffers (command->context , command->device );
1620+ PTI_ASSERT (buffer != nullptr );
1621+ append_res = A2AppendBridgeMemoryCopyOrFillEx (command->command_list , command->event_self ,
1622+ command->event_swap , buffer,
1623+ device_buffer_pool_.buffer_size_ );
15491624 } else if (command->props .type == KernelCommandType::kCommand ) {
15501625 append_res =
15511626 A2AppendBridgeBarrier (command->command_list , command->event_self , command->event_swap );
@@ -2397,6 +2472,7 @@ class ZeCollector {
23972472 std::map<ze_command_queue_handle_t , ZeCommandQueue> command_queues_;
23982473
23992474 A2BridgeKernelPool bridge_kernel_pool_;
2475+ A2DeviceBufferPool device_buffer_pool_;
24002476 A2EventPool swap_event_pool_;
24012477
24022478 Level0Wrapper l0_wrapper_;
0 commit comments