Skip to content

Commit 00e4bbc

Browse files
authored
[PTI-LIB] Reduce L0 usage for device tracing (#449)
- call less zeDeviceGetGlobalTimestamps - rely on gpu frequency as a constant when convert gpu cycles to cpu - use of gpu ze..mem..fill for tracing mem ops on gpu in local mode - adjust tests correspondingly --------- Signed-off-by: jfedorov <julia.fedorova@intel.com>
1 parent 0d47510 commit 00e4bbc

8 files changed

+259
-53
lines changed

sdk/src/levelzero/ze_collector.h

Lines changed: 105 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@
4242
#include "utils.h"
4343
#include "ze_event_cache.h"
4444
#include "ze_local_collection_helpers.h"
45+
#include "ze_timer_helper.h"
4546
#include "ze_utils.h"
4647
#include "ze_wrappers.h"
4748

@@ -64,6 +65,10 @@ struct ZeInstanceData {
6465

6566
inline thread_local ZeInstanceData ze_instance_data;
6667

68+
// Used for CPU/GPU sync points. See description to ZeCollector::GetDeviceTimestamps function
69+
inline thread_local std::map<ze_device_handle_t, std::unique_ptr<CPUGPUTimeInterpolationHelper>>
70+
timer_helpers_;
71+
6772
struct ZeKernelGroupSize {
6873
uint32_t x;
6974
uint32_t y;
@@ -140,6 +145,7 @@ struct ZeDeviceDescriptor {
140145
uint64_t device_time_origin = 0;
141146
uint64_t device_timer_frequency = 0;
142147
uint64_t device_timer_mask = 0;
148+
uint64_t device_sync_delta = 50'000'000ULL; // 50ms
143149
ze_driver_handle_t driver = nullptr;
144150
ze_context_handle_t context = nullptr;
145151
ze_pci_ext_properties_t pci_properties{};
@@ -247,6 +253,74 @@ class ZeCollector {
247253
enum class ZeCollectionMode { Full = 0, Hybrid = 1, Local = 2 };
248254
enum class ZeCollectionState { Normal = 0, Abnormal = 1 };
249255

256+
/**
257+
* \internal
258+
* \brief Returns the current device timestamps, CPU in nanoceconds and GPU in ticks
259+
*
260+
* \param [in] device - the device to get the timestamps from
261+
* \param [out] host_time - the CPU time in nanoseconds
262+
* \param [out] device_time - the GPU time in ticks
263+
* \return ZE_RESULT_SUCCESS on success, ZE_RESULT_ERROR on failure
264+
*
265+
* Previously, zeDeviceGetGlobalTimestamps was called every time CPU and GPU timestamps
266+
* needed to be synced (via utils::ze::GetDeviceTimestamps(device, host_time, device_time);)
267+
* GPU cycles were then converted to CPU (aka host) timescale.
268+
* However, zeDeviceGetGlobalTimestamps has a high latency,
269+
* so it is not sutable for frequent calls (e.g. each dozen of micro-secs), especially in profiler
270+
*
271+
* The current implementation calls zeDeviceGetGlobalTimestamp less often:
272+
* once in ~ dozens (or hundreds) of milliseconds
273+
* (see CPUGPUTimeInterpolationHelper.delta) - for sync CPU/GPU point
274+
* per thread per device. (per thread - to avoid any synchronization between threads)
275+
* The delta between sycn points is selected empirically, but it is important keep in mind that
276+
* on systems with 32 GPU time counter - this counter would wrap around every few seconds.
277+
* The delta should be less than this wrap around time.
278+
*
279+
* Another change - the GPU timer frequency is not interpolated anymore but rather
280+
* taken from the device descriptor. This assumes that it is constant.
281+
*
282+
* The function is called synchroniously in a profiled thread, once per device.
283+
*
284+
* InterpolationHelper keeps the sync point from some recent past.
285+
* If CPU time, from the recent sych point, exceeded delta - makes a new sych point.
286+
* The sync point data are returned to a caller. The caller ueses the sync point data
287+
* to convert GPU cycles to CPU time or do other ops with them.
288+
*
289+
*/
290+
ze_result_t GetDeviceTimestamps(ze_device_handle_t device, uint64_t* host_time,
291+
uint64_t* device_time) {
292+
PTI_ASSERT(device != nullptr);
293+
PTI_ASSERT(host_time != nullptr);
294+
PTI_ASSERT(device_time != nullptr);
295+
if (timer_helpers_.find(device) == timer_helpers_.end()) {
296+
timer_helpers_[device] = std::make_unique<CPUGPUTimeInterpolationHelper>(
297+
device, device_descriptors_[device].device_timer_frequency,
298+
device_descriptors_[device].device_timer_mask,
299+
device_descriptors_[device].device_sync_delta);
300+
}
301+
uint64_t anchor_host_time = 0;
302+
uint64_t anchor_device_time = 0;
303+
auto helper = timer_helpers_[device].get();
304+
uint64_t current_host_time = utils::GetTime();
305+
if (current_host_time - helper->cpu_timestamp_ > helper->delta_) {
306+
ze_result_t res =
307+
utils::ze::GetDeviceTimestamps(device, &anchor_host_time, &anchor_device_time);
308+
PTI_ASSERT(res == ZE_RESULT_SUCCESS);
309+
helper->cpu_timestamp_ = anchor_host_time;
310+
helper->gpu_timestamp_ = anchor_device_time;
311+
} else {
312+
anchor_host_time = helper->cpu_timestamp_;
313+
anchor_device_time = helper->gpu_timestamp_;
314+
}
315+
current_host_time = utils::GetTime();
316+
uint64_t current_device_time = anchor_device_time + ((current_host_time - anchor_host_time) /
317+
timer_helpers_[device]->coeff_);
318+
319+
*host_time = current_host_time;
320+
*device_time = current_device_time;
321+
return ZE_RESULT_SUCCESS;
322+
}
323+
250324
static ZeCollectionMode SelectZeCollectionMode(bool introspection_capable, bool& disabled_mode,
251325
bool& hybrid_mode) {
252326
ZeCollector::ZeCollectionMode mode = ZeCollectionMode::Full;
@@ -375,6 +449,7 @@ class ZeCollector {
375449
swap_event_pool_(512),
376450
startstop_mode_changer(this) {
377451
CreateDeviceMap();
452+
UpdateDeviceSyncDelta();
378453
ze_result_t res = l0_wrapper_.InitDynamicTracingWrappers();
379454
if (ZE_RESULT_SUCCESS == res) {
380455
loader_dynamic_tracing_capable_ = true;
@@ -450,6 +525,27 @@ class ZeCollector {
450525
}
451526
}
452527

528+
void UpdateDeviceSyncDelta() {
529+
// in future we can try make it per device
530+
SPDLOG_DEBUG("In {}", __FUNCTION__);
531+
uint64_t delta = 0;
532+
auto env_string = utils::GetEnv("PTI_DEVICE_SYNC_DELTA");
533+
SPDLOG_INFO("Checking DeviceSyncDelta by PTI_DEVICE_SYNC_DELTA environment variable");
534+
if (!env_string.empty()) {
535+
try {
536+
delta = std::stoll(env_string);
537+
SPDLOG_INFO("\tPTI_DEVICE_SYNC_DELTA is {} ns, will use it in device tracing", delta);
538+
} catch (std::invalid_argument const& /*ex*/) {
539+
delta = CPUGPUTimeInterpolationHelper::kSycnDeltaDefault; // fallback to default
540+
} catch (std::out_of_range const& /*ex*/) {
541+
delta = CPUGPUTimeInterpolationHelper::kSycnDeltaDefault; // fallback to default
542+
}
543+
}
544+
for (auto& [device, descriptor] : device_descriptors_) {
545+
descriptor.device_sync_delta = delta;
546+
}
547+
}
548+
453549
void MarkIntrospection() {
454550
const auto drivers = utils::ze::GetDriverList();
455551
for (auto const driver : drivers) {
@@ -502,21 +598,6 @@ class ZeCollector {
502598
}
503599

504600
desc.pci_properties = pci_device_properties;
505-
uint64_t host_time = 0;
506-
uint64_t ticks = 0;
507-
uint64_t device_time = 0;
508-
509-
overhead::Init();
510-
status = zeDeviceGetGlobalTimestamps(device, &host_time, &ticks);
511-
overhead_fini(zeDeviceGetGlobalTimestamps_id);
512-
PTI_ASSERT(status == ZE_RESULT_SUCCESS);
513-
514-
device_time = ticks & desc.device_timer_mask;
515-
if (desc.device_timer_frequency) {
516-
device_time = device_time * NSEC_IN_SEC / desc.device_timer_frequency;
517-
}
518-
desc.host_time_origin = host_time;
519-
desc.device_time_origin = device_time;
520601
return desc;
521602
}
522603

@@ -1033,10 +1114,7 @@ class ZeCollector {
10331114
// as all command lists submitted to the execution into queue - they are not immediate
10341115
PTI_ASSERT(!info.immediate);
10351116
PTI_ASSERT(info.device != nullptr);
1036-
overhead::Init();
1037-
ze_result_t status =
1038-
zeDeviceGetGlobalTimestamps(info.device, &host_time_sync, &device_time_sync);
1039-
overhead_fini(zeDeviceGetGlobalTimestamps_id);
1117+
ze_result_t status = GetDeviceTimestamps(info.device, &host_time_sync, &device_time_sync);
10401118
PTI_ASSERT(status == ZE_RESULT_SUCCESS);
10411119

10421120
const std::lock_guard<std::mutex> lock(lock_);
@@ -1434,9 +1512,7 @@ class ZeCollector {
14341512
uint64_t host_timestamp = 0;
14351513
uint64_t device_timestamp = 0; // in ticks
14361514

1437-
overhead::Init();
1438-
ze_result_t status = zeDeviceGetGlobalTimestamps(device, &host_timestamp, &device_timestamp);
1439-
overhead_fini(zeDeviceGetGlobalTimestamps_id);
1515+
ze_result_t status = collector->GetDeviceTimestamps(device, &host_timestamp, &device_timestamp);
14401516
PTI_ASSERT(status == ZE_RESULT_SUCCESS);
14411517

14421518
ze_instance_data.timestamp_host = host_timestamp;
@@ -1539,13 +1615,12 @@ class ZeCollector {
15391615
} else if (command->props.type == KernelCommandType::kMemory) {
15401616
SPDLOG_TRACE("\t\tDevices in Memory command: src: {}, dst {}",
15411617
(void*)command->props.src_device, (void*)command->props.dst_device);
1542-
bool is_two_devices =
1543-
(command->props.src_device != nullptr && command->props.dst_device != nullptr) ? true
1544-
: false;
1545-
append_res = A2AppendBridgeMemoryCopyOrFill(
1546-
command->command_list, command->event_self, command->event_swap, command->props.dst,
1547-
command->props.src, command->props.bytes_transferred, command->props.value_size,
1548-
is_two_devices);
1618+
1619+
auto buffer = device_buffer_pool_.GetBuffers(command->context, command->device);
1620+
PTI_ASSERT(buffer != nullptr);
1621+
append_res = A2AppendBridgeMemoryCopyOrFillEx(command->command_list, command->event_self,
1622+
command->event_swap, buffer,
1623+
device_buffer_pool_.buffer_size_);
15491624
} else if (command->props.type == KernelCommandType::kCommand) {
15501625
append_res =
15511626
A2AppendBridgeBarrier(command->command_list, command->event_self, command->event_swap);
@@ -2397,6 +2472,7 @@ class ZeCollector {
23972472
std::map<ze_command_queue_handle_t, ZeCommandQueue> command_queues_;
23982473

23992474
A2BridgeKernelPool bridge_kernel_pool_;
2475+
A2DeviceBufferPool device_buffer_pool_;
24002476
A2EventPool swap_event_pool_;
24012477

24022478
Level0Wrapper l0_wrapper_;

sdk/src/levelzero/ze_local_collection_helpers.h

Lines changed: 61 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,13 +85,42 @@ bool A2AppendBridgeMemoryCopyOrFill(ze_command_list_handle_t command_list,
8585
return result == ZE_RESULT_SUCCESS;
8686
}
8787

88+
/**
89+
* \internal
90+
* ze..MemoryFill still has smaller latency than ze..MemoryCopy. So we use it to lower the overhead
91+
*/
92+
bool A2AppendBridgeMemoryCopyOrFillEx(ze_command_list_handle_t command_list,
93+
ze_event_handle_t signal_event, ze_event_handle_t wait_event,
94+
void* dst, size_t size) {
95+
PTI_ASSERT(command_list != nullptr);
96+
PTI_ASSERT(wait_event != nullptr);
97+
98+
SPDLOG_TRACE(
99+
" --- In: {}, CmdList: {}, Signal event: {}, Wait event: {}, dst: {}, \
100+
size: {}",
101+
__FUNCTION__, static_cast<void*>(command_list), static_cast<void*>(signal_event),
102+
static_cast<void*>(wait_event), dst, size);
103+
104+
uint32_t count = 1;
105+
ze_result_t result = ZE_RESULT_SUCCESS;
106+
SPDLOG_TRACE("\tAppending Bridge MemoryFill dst: {}, size: {}", dst, size);
107+
uint32_t pattern = 0;
108+
overhead::Init();
109+
result = zeCommandListAppendMemoryFill(command_list, dst, &pattern, sizeof(pattern), size,
110+
signal_event, count, &wait_event);
111+
overhead_fini(zeCommandListAppendMemoryFill_id);
112+
SPDLOG_DEBUG("\t\tBridge MemOp Append MemoryFill result: {}", (uint32_t)result);
113+
return result == ZE_RESULT_SUCCESS;
114+
}
115+
88116
bool A2AppendBridgeBarrier(ze_command_list_handle_t command_list, ze_event_handle_t signal_event,
89117
ze_event_handle_t wait_event) {
90118
PTI_ASSERT(command_list != nullptr);
91119
PTI_ASSERT(wait_event != nullptr);
92120

93121
SPDLOG_DEBUG(" --- In: {}, CmdList: {}, Signal event: {}, Wait event: {}", __FUNCTION__,
94-
(void*)command_list, (void*)signal_event, (void*)wait_event);
122+
static_cast<void*>(command_list), static_cast<void*>(signal_event),
123+
static_cast<void*>(wait_event));
95124

96125
uint32_t count = 1;
97126
overhead::Init();
@@ -425,4 +454,35 @@ std::vector<uint16_t> A2BridgeKernelPool::kernel_binary_{
425454
0x7274, 0x0079, 0x0000, 0x0013, 0x0002, 0x0002, 0x0000, 0x0021, 0x0003, 0x0003, 0x0000,
426455
0x0002, 0x0000, 0x0036, 0x0005, 0x0002, 0x0000, 0x0004, 0x0000, 0x0000, 0x0000, 0x0003,
427456
0x0000, 0x00f8, 0x0002, 0x0005, 0x0000, 0x00fd, 0x0001, 0x0038, 0x0001};
457+
458+
class A2DeviceBufferPool {
459+
public:
460+
A2DeviceBufferPool() {}
461+
462+
void* GetBuffers(ze_context_handle_t context, ze_device_handle_t device) {
463+
PTI_ASSERT(context != nullptr);
464+
PTI_ASSERT(device != nullptr);
465+
std::unique_lock lock(mutex_);
466+
if (buffer_map_.find(std::pair(context, device)) == buffer_map_.end()) {
467+
void* buff = nullptr;
468+
ze_device_mem_alloc_desc_t alloc_desc = {ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC, nullptr, 0,
469+
0};
470+
overhead::Init();
471+
ze_result_t status = zeMemAllocDevice(context, &alloc_desc, buffer_size_, 64, device, &buff);
472+
overhead_fini(zeMemAllocDevice_id);
473+
PTI_ASSERT(status == ZE_RESULT_SUCCESS);
474+
buffer_map_[std::pair(context, device)] = buff;
475+
SPDLOG_TRACE("Device buffers created in {} for context: {}, device: {}, buff {}, size: {} ",
476+
__FUNCTION__, static_cast<void*>(context), static_cast<void*>(device),
477+
static_cast<void*>(buff), buffer_size_);
478+
}
479+
return buffer_map_[std::pair(context, device)];
480+
}
481+
const size_t buffer_size_ = 64;
482+
483+
private:
484+
std::mutex mutex_;
485+
std::map<std::pair<ze_context_handle_t, ze_device_handle_t>, void*> buffer_map_;
486+
};
487+
428488
#endif // PTI_TOOLS_ZE_LOCAL_COLLECTION_HELPERS_H_

sdk/src/levelzero/ze_timer_helper.h

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
//
2+
//==============================================================
3+
// Copyright (C) Intel Corporation
4+
//
5+
// SPDX-License-Identifier: MIT
6+
// =============================================================
7+
8+
#ifndef PTI_TOOLS_ZE_TIMER_HELPER_H_
9+
#define PTI_TOOLS_ZE_TIMER_HELPER_H_
10+
11+
#include <level_zero/ze_api.h>
12+
13+
#include "pti_assert.h"
14+
15+
struct CPUGPUTimeInterpolationHelper {
16+
const static uint64_t kSycnDeltaDefault = 50'000'000;
17+
ze_device_handle_t device_;
18+
uint32_t gpu_freq_;
19+
uint64_t gpu_timer_mask_;
20+
uint64_t cpu_timestamp_;
21+
uint64_t gpu_timestamp_;
22+
uint64_t delta_ = kSycnDeltaDefault;
23+
uint64_t coeff_;
24+
CPUGPUTimeInterpolationHelper(ze_device_handle_t device, uint32_t gpu_freq,
25+
uint64_t gpu_timer_mask, uint64_t sync_delta)
26+
: device_(device),
27+
gpu_freq_(gpu_freq),
28+
gpu_timer_mask_(gpu_timer_mask),
29+
cpu_timestamp_(0),
30+
gpu_timestamp_(0) {
31+
PTI_ASSERT(device_ != nullptr);
32+
PTI_ASSERT(gpu_freq != 0ULL);
33+
PTI_ASSERT(gpu_timer_mask != 0ULL);
34+
if (sync_delta != 0ULL) {
35+
delta_ = sync_delta;
36+
}
37+
coeff_ = 1'000'000'000 / gpu_freq_;
38+
}
39+
};
40+
41+
#endif // PTI_TOOLS_ZE_TIMER_HELPER_H_

sdk/src/utils/ze_utils.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -361,15 +361,15 @@ inline std::string GetKernelName(ze_kernel_handle_t kernel, bool demangle = fals
361361
return std::string(name.begin(), name.end() - 1);
362362
}
363363

364-
inline void GetDeviceTimestamps(ze_device_handle_t device, uint64_t* host_timestamp,
365-
uint64_t* device_timestamp) {
364+
inline ze_result_t GetDeviceTimestamps(ze_device_handle_t device, uint64_t* host_timestamp,
365+
uint64_t* device_timestamp) {
366366
PTI_ASSERT(device != nullptr);
367367
PTI_ASSERT(host_timestamp != nullptr);
368368
PTI_ASSERT(device_timestamp != nullptr);
369369
overhead::Init();
370370
ze_result_t status = zeDeviceGetGlobalTimestamps(device, host_timestamp, device_timestamp);
371371
overhead_fini(zeDeviceGetGlobalTimestamps_id);
372-
PTI_ASSERT(status == ZE_RESULT_SUCCESS);
372+
return status;
373373
}
374374

375375
inline uint64_t GetDeviceTimerFrequency(ze_device_handle_t device) {

sdk/test/CMakeLists.txt

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -316,22 +316,34 @@ if(HAVE_SYCL)
316316
PROPERTIES LABELS "functional")
317317

318318
RequirePythonInterp()
319+
320+
set(L0_OVERHEAD_THRESHOLD 17)
321+
set(LINK_OVERHEAD_THRESHOLD 3)
322+
if (WIN32)
323+
# Timer we use on Windows (as of implementation 02.14.25) has 100 ns resolution,
324+
# due to that some functions latency measured as zero, so accumulated overhead is less than on Linux
325+
set(L0_OVERHEAD_THRESHOLD 11)
326+
# For some reason, on Windows, the link overhead is bigger than on Linux
327+
# Need to investigate it - look into UR and XPTI
328+
set(LINK_OVERHEAD_THRESHOLD 5)
329+
endif()
330+
319331
add_test(
320332
NAME perf-profiling-overhead
321333
COMMAND
322-
${Python_EXECUTABLE} ${PROJECT_SOURCE_DIR}/test/perf_test.py "${PTI_TEST_BIN_DIR}" 66 profiled
334+
${Python_EXECUTABLE} ${PROJECT_SOURCE_DIR}/test/perf_test.py "${PTI_TEST_BIN_DIR}" 60 profiled
323335
)
324336

325337
add_test(
326338
NAME perf-link-overhead
327339
COMMAND
328-
${Python_EXECUTABLE} ${PROJECT_SOURCE_DIR}/test/perf_test.py "${PTI_TEST_BIN_DIR}" 8 linkonly
340+
${Python_EXECUTABLE} ${PROJECT_SOURCE_DIR}/test/perf_test.py "${PTI_TEST_BIN_DIR}" ${LINK_OVERHEAD_THRESHOLD} linkonly
329341
)
330342

331343
add_test(
332344
NAME perf-overhead-view
333345
COMMAND
334-
${Python_EXECUTABLE} ${PROJECT_SOURCE_DIR}/test/perf_test.py "${PTI_TEST_BIN_DIR}" 50 overhead
346+
${Python_EXECUTABLE} ${PROJECT_SOURCE_DIR}/test/perf_test.py "${PTI_TEST_BIN_DIR}" ${L0_OVERHEAD_THRESHOLD} overhead
335347
)
336348

337349

0 commit comments

Comments
 (0)