42
42
#include " utils.h"
43
43
#include " ze_event_cache.h"
44
44
#include " ze_local_collection_helpers.h"
45
+ #include " ze_timer_helper.h"
45
46
#include " ze_utils.h"
46
47
#include " ze_wrappers.h"
47
48
@@ -64,6 +65,10 @@ struct ZeInstanceData {
64
65
65
66
inline thread_local ZeInstanceData ze_instance_data;
66
67
68
+ // Used for CPU/GPU sync points. See the description of the ZeCollector::GetDeviceTimestamps function.
69
+ inline thread_local std::map<ze_device_handle_t , std::unique_ptr<CPUGPUTimeInterpolationHelper>>
70
+ timer_helpers_;
71
+
67
72
struct ZeKernelGroupSize {
68
73
uint32_t x;
69
74
uint32_t y;
@@ -140,6 +145,7 @@ struct ZeDeviceDescriptor {
140
145
uint64_t device_time_origin = 0 ;
141
146
uint64_t device_timer_frequency = 0 ;
142
147
uint64_t device_timer_mask = 0 ;
148
+ uint64_t device_sync_delta = 50'000'000ULL ; // 50ms
143
149
ze_driver_handle_t driver = nullptr ;
144
150
ze_context_handle_t context = nullptr ;
145
151
ze_pci_ext_properties_t pci_properties{};
@@ -247,6 +253,74 @@ class ZeCollector {
247
253
enum class ZeCollectionMode { Full = 0 , Hybrid = 1 , Local = 2 };
248
254
enum class ZeCollectionState { Normal = 0 , Abnormal = 1 };
249
255
256
+ /* *
257
+ * \internal
258
+ * \brief Returns the current device timestamps, CPU in nanoseconds and GPU in ticks
259
+ *
260
+ * \param [in] device - the device to get the timestamps from
261
+ * \param [out] host_time - the CPU time in nanoseconds
262
+ * \param [out] device_time - the GPU time in ticks
263
+ * \return ZE_RESULT_SUCCESS on success, ZE_RESULT_ERROR on failure
264
+ *
265
+ * Previously, zeDeviceGetGlobalTimestamps was called every time CPU and GPU timestamps
266
+ * needed to be synced (via utils::ze::GetDeviceTimestamps(device, host_time, device_time);)
267
+ * GPU cycles were then converted to CPU (aka host) timescale.
268
+ * However, zeDeviceGetGlobalTimestamps has a high latency,
269
+ * so it is not suitable for frequent calls (e.g. every dozen microseconds), especially in a profiler.
270
+ *
271
+ * The current implementation calls zeDeviceGetGlobalTimestamps less often:
272
+ * once in ~ dozens (or hundreds) of milliseconds
273
+ * (see CPUGPUTimeInterpolationHelper::delta_) - to establish a CPU/GPU sync point
274
+ * per thread per device. (per thread - to avoid any synchronization between threads)
275
+ * The delta between sync points is selected empirically, but it is important to keep in mind that
276
+ * on systems with a 32-bit GPU time counter, this counter wraps around every few seconds.
277
+ * The delta should be less than this wrap around time.
278
+ *
279
+ * Another change - the GPU timer frequency is not interpolated anymore but rather
280
+ * taken from the device descriptor. This assumes that it is constant.
281
+ *
282
+ * The function is called synchronously in a profiled thread, once per device.
283
+ *
284
+ * InterpolationHelper keeps the sync point from some recent past.
285
+ * If the CPU time elapsed since the most recent sync point exceeds delta, it makes a new sync point.
286
+ * The sync point data are returned to a caller. The caller uses the sync point data
287
+ * to convert GPU cycles to CPU time or do other ops with them.
288
+ *
289
+ */
290
+ ze_result_t GetDeviceTimestamps (ze_device_handle_t device, uint64_t * host_time,
291
+ uint64_t * device_time) {
292
+ PTI_ASSERT (device != nullptr );
293
+ PTI_ASSERT (host_time != nullptr );
294
+ PTI_ASSERT (device_time != nullptr );
295
+ if (timer_helpers_.find (device) == timer_helpers_.end ()) {
296
+ timer_helpers_[device] = std::make_unique<CPUGPUTimeInterpolationHelper>(
297
+ device, device_descriptors_[device].device_timer_frequency ,
298
+ device_descriptors_[device].device_timer_mask ,
299
+ device_descriptors_[device].device_sync_delta );
300
+ }
301
+ uint64_t anchor_host_time = 0 ;
302
+ uint64_t anchor_device_time = 0 ;
303
+ auto helper = timer_helpers_[device].get ();
304
+ uint64_t current_host_time = utils::GetTime ();
305
+ if (current_host_time - helper->cpu_timestamp_ > helper->delta_ ) {
306
+ ze_result_t res =
307
+ utils::ze::GetDeviceTimestamps (device, &anchor_host_time, &anchor_device_time);
308
+ PTI_ASSERT (res == ZE_RESULT_SUCCESS);
309
+ helper->cpu_timestamp_ = anchor_host_time;
310
+ helper->gpu_timestamp_ = anchor_device_time;
311
+ } else {
312
+ anchor_host_time = helper->cpu_timestamp_ ;
313
+ anchor_device_time = helper->gpu_timestamp_ ;
314
+ }
315
+ current_host_time = utils::GetTime ();
316
+ uint64_t current_device_time = anchor_device_time + ((current_host_time - anchor_host_time) /
317
+ timer_helpers_[device]->coeff_ );
318
+
319
+ *host_time = current_host_time;
320
+ *device_time = current_device_time;
321
+ return ZE_RESULT_SUCCESS;
322
+ }
323
+
250
324
static ZeCollectionMode SelectZeCollectionMode (bool introspection_capable, bool & disabled_mode,
251
325
bool & hybrid_mode) {
252
326
ZeCollector::ZeCollectionMode mode = ZeCollectionMode::Full;
@@ -375,6 +449,7 @@ class ZeCollector {
375
449
swap_event_pool_(512 ),
376
450
startstop_mode_changer(this ) {
377
451
CreateDeviceMap ();
452
+ UpdateDeviceSyncDelta ();
378
453
ze_result_t res = l0_wrapper_.InitDynamicTracingWrappers ();
379
454
if (ZE_RESULT_SUCCESS == res) {
380
455
loader_dynamic_tracing_capable_ = true ;
@@ -450,6 +525,27 @@ class ZeCollector {
450
525
}
451
526
}
452
527
528
+ void UpdateDeviceSyncDelta () {
529
+ // in future we can try make it per device
530
+ SPDLOG_DEBUG (" In {}" , __FUNCTION__);
531
+ uint64_t delta = 0 ;
532
+ auto env_string = utils::GetEnv (" PTI_DEVICE_SYNC_DELTA" );
533
+ SPDLOG_INFO (" Checking DeviceSyncDelta by PTI_DEVICE_SYNC_DELTA environment variable" );
534
+ if (!env_string.empty ()) {
535
+ try {
536
+ delta = std::stoll (env_string);
537
+ SPDLOG_INFO (" \t PTI_DEVICE_SYNC_DELTA is {} ns, will use it in device tracing" , delta);
538
+ } catch (std::invalid_argument const & /* ex*/ ) {
539
+ delta = CPUGPUTimeInterpolationHelper::kSycnDeltaDefault ; // fallback to default
540
+ } catch (std::out_of_range const & /* ex*/ ) {
541
+ delta = CPUGPUTimeInterpolationHelper::kSycnDeltaDefault ; // fallback to default
542
+ }
543
+ }
544
+ for (auto & [device, descriptor] : device_descriptors_) {
545
+ descriptor.device_sync_delta = delta;
546
+ }
547
+ }
548
+
453
549
void MarkIntrospection () {
454
550
const auto drivers = utils::ze::GetDriverList ();
455
551
for (auto const driver : drivers) {
@@ -502,21 +598,6 @@ class ZeCollector {
502
598
}
503
599
504
600
desc.pci_properties = pci_device_properties;
505
- uint64_t host_time = 0 ;
506
- uint64_t ticks = 0 ;
507
- uint64_t device_time = 0 ;
508
-
509
- overhead::Init ();
510
- status = zeDeviceGetGlobalTimestamps (device, &host_time, &ticks);
511
- overhead_fini (zeDeviceGetGlobalTimestamps_id);
512
- PTI_ASSERT (status == ZE_RESULT_SUCCESS);
513
-
514
- device_time = ticks & desc.device_timer_mask ;
515
- if (desc.device_timer_frequency ) {
516
- device_time = device_time * NSEC_IN_SEC / desc.device_timer_frequency ;
517
- }
518
- desc.host_time_origin = host_time;
519
- desc.device_time_origin = device_time;
520
601
return desc;
521
602
}
522
603
@@ -1033,10 +1114,7 @@ class ZeCollector {
1033
1114
// as all command lists submitted to the execution into queue - they are not immediate
1034
1115
PTI_ASSERT (!info.immediate );
1035
1116
PTI_ASSERT (info.device != nullptr );
1036
- overhead::Init ();
1037
- ze_result_t status =
1038
- zeDeviceGetGlobalTimestamps (info.device , &host_time_sync, &device_time_sync);
1039
- overhead_fini (zeDeviceGetGlobalTimestamps_id);
1117
+ ze_result_t status = GetDeviceTimestamps (info.device , &host_time_sync, &device_time_sync);
1040
1118
PTI_ASSERT (status == ZE_RESULT_SUCCESS);
1041
1119
1042
1120
const std::lock_guard<std::mutex> lock (lock_);
@@ -1434,9 +1512,7 @@ class ZeCollector {
1434
1512
uint64_t host_timestamp = 0 ;
1435
1513
uint64_t device_timestamp = 0 ; // in ticks
1436
1514
1437
- overhead::Init ();
1438
- ze_result_t status = zeDeviceGetGlobalTimestamps (device, &host_timestamp, &device_timestamp);
1439
- overhead_fini (zeDeviceGetGlobalTimestamps_id);
1515
+ ze_result_t status = collector->GetDeviceTimestamps (device, &host_timestamp, &device_timestamp);
1440
1516
PTI_ASSERT (status == ZE_RESULT_SUCCESS);
1441
1517
1442
1518
ze_instance_data.timestamp_host = host_timestamp;
@@ -1539,13 +1615,12 @@ class ZeCollector {
1539
1615
} else if (command->props .type == KernelCommandType::kMemory ) {
1540
1616
SPDLOG_TRACE (" \t\t Devices in Memory command: src: {}, dst {}" ,
1541
1617
(void *)command->props .src_device , (void *)command->props .dst_device );
1542
- bool is_two_devices =
1543
- (command->props .src_device != nullptr && command->props .dst_device != nullptr ) ? true
1544
- : false ;
1545
- append_res = A2AppendBridgeMemoryCopyOrFill (
1546
- command->command_list , command->event_self , command->event_swap , command->props .dst ,
1547
- command->props .src , command->props .bytes_transferred , command->props .value_size ,
1548
- is_two_devices);
1618
+
1619
+ auto buffer = device_buffer_pool_.GetBuffers (command->context , command->device );
1620
+ PTI_ASSERT (buffer != nullptr );
1621
+ append_res = A2AppendBridgeMemoryCopyOrFillEx (command->command_list , command->event_self ,
1622
+ command->event_swap , buffer,
1623
+ device_buffer_pool_.buffer_size_ );
1549
1624
} else if (command->props .type == KernelCommandType::kCommand ) {
1550
1625
append_res =
1551
1626
A2AppendBridgeBarrier (command->command_list , command->event_self , command->event_swap );
@@ -2397,6 +2472,7 @@ class ZeCollector {
2397
2472
std::map<ze_command_queue_handle_t , ZeCommandQueue> command_queues_;
2398
2473
2399
2474
A2BridgeKernelPool bridge_kernel_pool_;
2475
+ A2DeviceBufferPool device_buffer_pool_;
2400
2476
A2EventPool swap_event_pool_;
2401
2477
2402
2478
Level0Wrapper l0_wrapper_;
0 commit comments