@@ -75,30 +75,46 @@ ur_result_t setHipMemAdvise(const void *DevPtr, const size_t Size,
75
75
if (URAdviceFlags &
76
76
(UR_USM_ADVICE_FLAG_SET_NON_ATOMIC_MOSTLY |
77
77
UR_USM_ADVICE_FLAG_CLEAR_NON_ATOMIC_MOSTLY |
78
- UR_USM_ADVICE_FLAG_BIAS_CACHED | UR_USM_ADVICE_FLAG_BIAS_UNCACHED)) {
78
+ UR_USM_ADVICE_FLAG_BIAS_CACHED | UR_USM_ADVICE_FLAG_BIAS_UNCACHED
79
+ #if !defined(__HIP_PLATFORM_AMD__)
80
+ | UR_USM_ADVICE_FLAG_SET_NON_COHERENT_MEMORY |
81
+ UR_USM_ADVICE_FLAG_CLEAR_NON_COHERENT_MEMORY
82
+ #endif
83
+ )) {
79
84
return UR_RESULT_ERROR_INVALID_ENUMERATION;
80
85
}
81
86
82
87
using ur_to_hip_advice_t = std::pair<ur_usm_advice_flags_t , hipMemoryAdvise>;
83
88
84
- static constexpr std::array<ur_to_hip_advice_t , 6 >
85
- URToHIPMemAdviseDeviceFlags{
86
- std::make_pair (UR_USM_ADVICE_FLAG_SET_READ_MOSTLY,
87
- hipMemAdviseSetReadMostly),
88
- std::make_pair (UR_USM_ADVICE_FLAG_CLEAR_READ_MOSTLY,
89
- hipMemAdviseUnsetReadMostly),
90
- std::make_pair (UR_USM_ADVICE_FLAG_SET_PREFERRED_LOCATION,
91
- hipMemAdviseSetPreferredLocation),
92
- std::make_pair (UR_USM_ADVICE_FLAG_CLEAR_PREFERRED_LOCATION,
93
- hipMemAdviseUnsetPreferredLocation),
94
- std::make_pair (UR_USM_ADVICE_FLAG_SET_ACCESSED_BY_DEVICE,
95
- hipMemAdviseSetAccessedBy),
96
- std::make_pair (UR_USM_ADVICE_FLAG_CLEAR_ACCESSED_BY_DEVICE,
97
- hipMemAdviseUnsetAccessedBy),
98
- };
99
- for (auto &FlagPair : URToHIPMemAdviseDeviceFlags) {
100
- if (URAdviceFlags & FlagPair.first ) {
101
- UR_CHECK_ERROR (hipMemAdvise (DevPtr, Size, FlagPair.second , Device));
89
+ #if defined(__HIP_PLATFORM_AMD__)
90
+ constexpr size_t DeviceFlagCount = 8 ;
91
+ #else
92
+ constexpr size_t DeviceFlagCount = 6 ;
93
+ #endif
94
+ static constexpr std::array<ur_to_hip_advice_t , DeviceFlagCount>
95
+ URToHIPMemAdviseDeviceFlags {
96
+ std::make_pair (UR_USM_ADVICE_FLAG_SET_READ_MOSTLY,
97
+ hipMemAdviseSetReadMostly),
98
+ std::make_pair (UR_USM_ADVICE_FLAG_CLEAR_READ_MOSTLY,
99
+ hipMemAdviseUnsetReadMostly),
100
+ std::make_pair (UR_USM_ADVICE_FLAG_SET_PREFERRED_LOCATION,
101
+ hipMemAdviseSetPreferredLocation),
102
+ std::make_pair (UR_USM_ADVICE_FLAG_CLEAR_PREFERRED_LOCATION,
103
+ hipMemAdviseUnsetPreferredLocation),
104
+ std::make_pair (UR_USM_ADVICE_FLAG_SET_ACCESSED_BY_DEVICE,
105
+ hipMemAdviseSetAccessedBy),
106
+ std::make_pair (UR_USM_ADVICE_FLAG_CLEAR_ACCESSED_BY_DEVICE,
107
+ hipMemAdviseUnsetAccessedBy),
108
+ #if defined(__HIP_PLATFORM_AMD__)
109
+ std::make_pair (UR_USM_ADVICE_FLAG_SET_NON_COHERENT_MEMORY,
110
+ hipMemAdviseSetCoarseGrain),
111
+ std::make_pair (UR_USM_ADVICE_FLAG_CLEAR_NON_COHERENT_MEMORY,
112
+ hipMemAdviseUnsetCoarseGrain),
113
+ #endif
114
+ };
115
+ for (const auto &[URAdvice, HIPAdvice] : URToHIPMemAdviseDeviceFlags) {
116
+ if (URAdviceFlags & URAdvice) {
117
+ UR_CHECK_ERROR (hipMemAdvise (DevPtr, Size, HIPAdvice, Device));
102
118
}
103
119
}
104
120
@@ -113,10 +129,9 @@ ur_result_t setHipMemAdvise(const void *DevPtr, const size_t Size,
113
129
hipMemAdviseUnsetAccessedBy),
114
130
};
115
131
116
- for (auto &FlagPair : URToHIPMemAdviseHostFlags) {
117
- if (URAdviceFlags & FlagPair.first ) {
118
- UR_CHECK_ERROR (
119
- hipMemAdvise (DevPtr, Size, FlagPair.second , hipCpuDeviceId));
132
+ for (const auto &[URAdvice, HIPAdvice] : URToHIPMemAdviseHostFlags) {
133
+ if (URAdviceFlags & URAdvice) {
134
+ UR_CHECK_ERROR (hipMemAdvise (DevPtr, Size, HIPAdvice, hipCpuDeviceId));
120
135
}
121
136
}
122
137
@@ -300,15 +315,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
300
315
bool ProvidedLocalWorkGroupSize = (pLocalWorkSize != nullptr );
301
316
302
317
{
303
- ur_result_t Result = urDeviceGetInfo (
304
- hQueue->Device , UR_DEVICE_INFO_MAX_WORK_ITEM_SIZES,
305
- sizeof (MaxThreadsPerBlock), MaxThreadsPerBlock, nullptr );
306
- UR_ASSERT (Result == UR_RESULT_SUCCESS, Result);
318
+ MaxThreadsPerBlock[0 ] = hQueue->Device ->getMaxBlockDimX ();
319
+ MaxThreadsPerBlock[1 ] = hQueue->Device ->getMaxBlockDimY ();
320
+ MaxThreadsPerBlock[2 ] = hQueue->Device ->getMaxBlockDimZ ();
307
321
308
- Result =
309
- urDeviceGetInfo (hQueue->Device , UR_DEVICE_INFO_MAX_WORK_GROUP_SIZE,
310
- sizeof (MaxWorkGroupSize), &MaxWorkGroupSize, nullptr );
311
- UR_ASSERT (Result == UR_RESULT_SUCCESS, Result);
322
+ MaxWorkGroupSize = hQueue->Device ->getMaxWorkGroupSize ();
312
323
313
324
// The MaxWorkGroupSize = 1024 for AMD GPU
314
325
// The MaxThreadsPerBlock = {1024, 1024, 1024}
@@ -423,11 +434,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
423
434
: (LocalMemSzPtrPI ? LocalMemSzPtrPI : nullptr );
424
435
425
436
if (LocalMemSzPtr) {
426
- int DeviceMaxLocalMem = 0 ;
427
- UR_CHECK_ERROR (hipDeviceGetAttribute (
428
- &DeviceMaxLocalMem, hipDeviceAttributeMaxSharedMemoryPerBlock,
429
- Dev->get ()));
430
-
437
+ int DeviceMaxLocalMem = Dev->getDeviceMaxLocalMem ();
431
438
static const int EnvVal = std::atoi (LocalMemSzPtr);
432
439
if (EnvVal <= 0 || EnvVal > DeviceMaxLocalMem) {
433
440
setErrorMessage (LocalMemSzPtrUR ? " Invalid value specified for "
@@ -1484,7 +1491,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch(
1484
1491
1485
1492
// If the device does not support managed memory access, we can't set
1486
1493
// mem_advise.
1487
- if (!getAttribute ( Device, hipDeviceAttributeManagedMemory )) {
1494
+ if (!Device-> getManagedMemSupport ( )) {
1488
1495
releaseEvent ();
1489
1496
setErrorMessage (" mem_advise ignored as device does not support "
1490
1497
" managed memory access" ,
@@ -1558,7 +1565,7 @@ urEnqueueUSMAdvise(ur_queue_handle_t hQueue, const void *pMem, size_t size,
1558
1565
1559
1566
// If the device does not support managed memory access, we can't set
1560
1567
// mem_advise.
1561
- if (!getAttribute ( Device, hipDeviceAttributeManagedMemory )) {
1568
+ if (!Device-> getManagedMemSupport ( )) {
1562
1569
releaseEvent ();
1563
1570
setErrorMessage (" mem_advise ignored as device does not support "
1564
1571
" managed memory access" ,
@@ -1575,7 +1582,7 @@ urEnqueueUSMAdvise(ur_queue_handle_t hQueue, const void *pMem, size_t size,
1575
1582
UR_USM_ADVICE_FLAG_SET_ACCESSED_BY_DEVICE |
1576
1583
UR_USM_ADVICE_FLAG_CLEAR_ACCESSED_BY_DEVICE |
1577
1584
UR_USM_ADVICE_FLAG_DEFAULT)) {
1578
- if (!getAttribute ( Device, hipDeviceAttributeConcurrentManagedAccess )) {
1585
+ if (!Device-> getConcurrentManagedAccess ( )) {
1579
1586
releaseEvent ();
1580
1587
setErrorMessage (" mem_advise ignored as device does not support "
1581
1588
" concurrent managed access" ,
@@ -1598,6 +1605,10 @@ urEnqueueUSMAdvise(ur_queue_handle_t hQueue, const void *pMem, size_t size,
1598
1605
pMem, size, hipMemAdviseUnsetPreferredLocation, DeviceID));
1599
1606
UR_CHECK_ERROR (
1600
1607
hipMemAdvise (pMem, size, hipMemAdviseUnsetAccessedBy, DeviceID));
1608
+ #if defined(__HIP_PLATFORM_AMD__)
1609
+ UR_CHECK_ERROR (
1610
+ hipMemAdvise (pMem, size, hipMemAdviseUnsetCoarseGrain, DeviceID));
1611
+ #endif
1601
1612
} else {
1602
1613
Result = setHipMemAdvise (HIPDevicePtr, size, advice, DeviceID);
1603
1614
// UR_RESULT_ERROR_INVALID_ENUMERATION is returned when using a valid but
@@ -1663,8 +1674,57 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemcpy2D(
1663
1674
UR_CHECK_ERROR (RetImplEvent->start ());
1664
1675
}
1665
1676
1677
+ // There is an issue with hipMemcpy2D* when hipMemcpyDefault is used, which
1678
+ // makes the HIP runtime not correctly derive the copy kind (direction) for
1679
+ // the copies since ROCm 5.6.0+. See: https://github.com/ROCm/clr/issues/40
1680
+ // TODO: Add maximum HIP_VERSION when bug has been fixed.
1681
+ #if HIP_VERSION >= 50600000
1682
+ hipPointerAttribute_t srcAttribs{};
1683
+ hipPointerAttribute_t dstAttribs{};
1684
+
1685
+ bool srcIsSystemAlloc{false };
1686
+ bool dstIsSystemAlloc{false };
1687
+
1688
+ hipError_t hipRes{};
1689
+ // hipErrorInvalidValue returned from hipPointerGetAttributes for a non-null
1690
+ // pointer refers to an OS-allocation, hence pageable host memory. However,
1691
+ // this means we cannot rely on the attributes result, hence we mark system
1692
+ // pageable memory allocation manually as host memory. The HIP runtime can
1693
+ // handle the registering/unregistering of the memory as long as the right
1694
+ // copy-kind (direction) is provided to hipMemcpy2DAsync for this case.
1695
+ hipRes = hipPointerGetAttributes (&srcAttribs, (const void *)pSrc);
1696
+ if (hipRes == hipErrorInvalidValue && pSrc)
1697
+ srcIsSystemAlloc = true ;
1698
+ hipRes = hipPointerGetAttributes (&dstAttribs, (const void *)pDst);
1699
+ if (hipRes == hipErrorInvalidValue && pDst)
1700
+ dstIsSystemAlloc = true ;
1701
+
1702
+ const unsigned int srcMemType{srcAttribs.type };
1703
+ const unsigned int dstMemType{dstAttribs.type };
1704
+
1705
+ const bool srcIsHost{(srcMemType == hipMemoryTypeHost) || srcIsSystemAlloc};
1706
+ const bool srcIsDevice{srcMemType == hipMemoryTypeDevice};
1707
+ const bool dstIsHost{(dstMemType == hipMemoryTypeHost) || dstIsSystemAlloc};
1708
+ const bool dstIsDevice{dstMemType == hipMemoryTypeDevice};
1709
+
1710
+ unsigned int cpyKind{};
1711
+ if (srcIsHost && dstIsHost)
1712
+ cpyKind = hipMemcpyHostToHost;
1713
+ else if (srcIsHost && dstIsDevice)
1714
+ cpyKind = hipMemcpyHostToDevice;
1715
+ else if (srcIsDevice && dstIsHost)
1716
+ cpyKind = hipMemcpyDeviceToHost;
1717
+ else if (srcIsDevice && dstIsDevice)
1718
+ cpyKind = hipMemcpyDeviceToDevice;
1719
+ else
1720
+ cpyKind = hipMemcpyDefault;
1721
+
1722
+ UR_CHECK_ERROR (hipMemcpy2DAsync (pDst, dstPitch, pSrc, srcPitch, width,
1723
+ height, (hipMemcpyKind)cpyKind, HIPStream));
1724
+ #else
1666
1725
UR_CHECK_ERROR (hipMemcpy2DAsync (pDst, dstPitch, pSrc, srcPitch, width,
1667
1726
height, hipMemcpyDefault, HIPStream));
1727
+ #endif
1668
1728
1669
1729
if (phEvent) {
1670
1730
UR_CHECK_ERROR (RetImplEvent->record ());
0 commit comments