Skip to content

Commit de62ad0

Browse files
committed
Merge branch 'hip-support' into dev
2 parents 534c4cd + 9385655 commit de62ad0

File tree

25 files changed

+862
-635
lines changed

25 files changed

+862
-635
lines changed

CMakeLists.txt

Lines changed: 29 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,44 @@
11
cmake_minimum_required(VERSION 3.20)

set(PROJECT_NAME kernel_float)
project(${PROJECT_NAME} LANGUAGES CXX)

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

# Select the GPU language. Exposed as a cache variable so users can pass
# -DKERNEL_FLOAT_LANGUAGE=HIP on the configure command line.
set(KERNEL_FLOAT_LANGUAGE "CUDA" CACHE STRING "GPU language to target (CUDA or HIP)")
set_property(CACHE KERNEL_FLOAT_LANGUAGE PROPERTY STRINGS "CUDA" "HIP")

# Define BOTH helper flags unconditionally. The example subdirectories test
# these with if(${KERNEL_FLOAT_LANGUAGE_...}); if one of them were left
# undefined, that expands to `if()` with no arguments, which is a fatal
# configure-time error.
set(KERNEL_FLOAT_LANGUAGE_CUDA OFF)
set(KERNEL_FLOAT_LANGUAGE_HIP OFF)

if(KERNEL_FLOAT_LANGUAGE STREQUAL "CUDA")
    enable_language(CUDA)
    set(KERNEL_FLOAT_LANGUAGE_CUDA ON)
elseif(KERNEL_FLOAT_LANGUAGE STREQUAL "HIP")
    # NOTE(review): first-class HIP language support was added in CMake 3.21;
    # consider raising cmake_minimum_required if HIP builds are routine.
    enable_language(HIP)
    set(KERNEL_FLOAT_LANGUAGE_HIP ON)
else()
    message(FATAL_ERROR "KERNEL_FLOAT_LANGUAGE must be either 'HIP' or 'CUDA'")
endif()

# kernel_float is header-only, so it is modeled as an INTERFACE library:
# consumers inherit the include directory but nothing is compiled here.
add_library(${PROJECT_NAME} INTERFACE)
target_include_directories(${PROJECT_NAME} INTERFACE "${PROJECT_SOURCE_DIR}/include")

# Optionally build tests and examples if the corresponding flags are set
option(KERNEL_FLOAT_BUILD_TEST "Build kernel float tests" OFF)
option(KERNEL_FLOAT_BUILD_EXAMPLE "Build kernel float examples" OFF)

if(KERNEL_FLOAT_BUILD_TEST)
    add_subdirectory(tests)
endif()

if(KERNEL_FLOAT_BUILD_EXAMPLE)
    add_subdirectory(examples)
endif()

# Display configuration
message(STATUS "=== Kernel Float ===")
message(STATUS "Using GPU Language: ${KERNEL_FLOAT_LANGUAGE}")
message(STATUS "Building Tests: ${KERNEL_FLOAT_BUILD_TEST}")
message(STATUS "Building Examples: ${KERNEL_FLOAT_BUILD_EXAMPLE}")

README.md

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -9,12 +9,12 @@
99
![GitHub Repo stars](https://img.shields.io/github/stars/KernelTuner/kernel_float?style=social)
1010

1111

12-
_Kernel Float_ is a header-only library for CUDA that simplifies working with vector types and reduced precision floating-point arithmetic in GPU code.
12+
_Kernel Float_ is a header-only library for CUDA/HIP that simplifies working with vector types and reduced precision floating-point arithmetic in GPU code.
1313

1414

1515
## Summary
1616

17-
CUDA natively offers several reduced precision floating-point types (`__half`, `__nv_bfloat16`, `__nv_fp8_e4m3`, `__nv_fp8_e5m2`)
17+
CUDA and HIP natively offer several reduced precision floating-point types (`__half`, `__nv_bfloat16`, `__nv_fp8_e4m3`, `__nv_fp8_e5m2`)
1818
and vector types (e.g., `__half2`, `__nv_fp8x4_e4m3`, `float3`).
1919
However, working with these types is cumbersome:
2020
mathematical operations require intrinsics (e.g., `__hadd2` performs addition for `__half2`),
@@ -24,9 +24,9 @@ and some functionality is missing (e.g., one cannot convert a `__half` to `__nv_
2424
_Kernel Float_ resolves this by offering a single data type `kernel_float::vec<T, N>` that stores `N` elements of type `T`.
2525
Internally, the data is stored as a fixed-sized array of elements.
2626
Operator overloading (like `+`, `*`, `&&`) has been implemented such that the most optimal intrinsic for the available types is selected automatically.
27-
Many mathetical functions (like `log`, `exp`, `sin`) and common operations (such as `sum`, `range`, `for_each`) are also available.
27+
Many mathematical functions (like `log`, `exp`, `sin`) and common operations (such as `sum`, `range`, `for_each`) are also available.
2828

29-
By using this library, developers can avoid the complexity of working with reduced precision floating-point types in CUDA and focus on their applications.
29+
Using Kernel Float, developers avoid the complexity of reduced precision floating-point types in CUDA and can focus on their applications.
3030

3131

3232
## Features
@@ -40,6 +40,7 @@ In a nutshell, _Kernel Float_ offers the following features:
4040
* Easy integration as a single header file.
4141
* Written for C++17.
4242
* Compatible with NVCC (NVIDIA Compiler) and NVRTC (NVIDIA Runtime Compilation).
43+
* Compatible with HIPCC (AMD HIP Compiler).
4344

4445

4546
## Example
@@ -49,7 +50,7 @@ Check out the [examples](https://github.com/KernelTuner/kernel_float/tree/master
4950

5051
Below shows a simple example of a CUDA kernel that adds a `constant` to the `input` array and writes the results to the `output` array.
5152
Each thread processes two elements.
52-
Notice how easy it would be change the precision (for example, `double` to `half`) or the vector size (for example, 4 instead of 2 items per thread).
53+
Notice how easy it would be to change the precision (for example, `double` to `half`) or the vector size (for example, 4 instead of 2 items per thread).
5354

5455

5556
```cpp
@@ -63,14 +64,14 @@ __global__ void kernel(const kf::vec<half, 2>* input, float constant, kf::vec<fl
6364

6465
```
6566
66-
Here is how the same kernel would like without Kernel Float.
67+
Here is how the same kernel would look for CUDA without Kernel Float.
6768
6869
```cpp
6970
__global__ void kernel(const __half* input, float constant, float* output) {
7071
int i = blockIdx.x * blockDim.x + threadIdx.x;
7172
__half in0 = input[2 * i + 0];
72-
__half in1 = input[2 * 1 + 1];
73-
__half2 a = __halves2half2(in0, int1);
73+
__half in1 = input[2 * i + 1];
74+
__half2 a = __halves2half2(in0, in1);
7475
float b = float(constant);
7576
__half c = __float2half(b);
7677
__half2 d = __half2half2(c);

examples/hip_compat.h

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
#pragma once

/**
 * This header file provides a mapping from CUDA-specific function names and
 * types to their equivalent HIP counterparts, allowing for cross-platform
 * development between CUDA and HIP. By including this header, code originally
 * written for CUDA can be compiled with the HIP compiler (hipcc) by
 * automatically replacing CUDA API calls with their HIP equivalents.
 *
 * The mapping is only active under hipcc; when compiling with nvcc the names
 * resolve to the real CUDA API unchanged.
 */
#ifdef __HIPCC__
#define cudaError_t hipError_t
#define cudaSuccess hipSuccess
#define cudaGetErrorString hipGetErrorString
#define cudaGetLastError hipGetLastError
#define cudaMalloc hipMalloc
#define cudaFree hipFree
#define cudaMemcpy hipMemcpy
// Both explicit copy directions are mapped; the original header covered only
// DeviceToHost, which breaks any host-to-device upload under hipcc.
#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
#define cudaMemcpyDefault hipMemcpyDefault
#define cudaMemset hipMemset
#define cudaSetDevice hipSetDevice
#define cudaDeviceSynchronize hipDeviceSynchronize
#endif

examples/pi/CMakeLists.txt

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,18 @@
1-
cmake_minimum_required(VERSION 3.20)

set(PROJECT_NAME kernel_float_pi)
project(${PROJECT_NAME} LANGUAGES CXX)

set(CMAKE_CXX_STANDARD 17)
add_executable(${PROJECT_NAME} "${PROJECT_SOURCE_DIR}/main.cu")
target_link_libraries(${PROJECT_NAME} PRIVATE kernel_float)

# Test the flag by name, not via ${...} expansion: if(${VAR}) with an
# undefined VAR expands to `if()` with no arguments, which is a fatal
# configure error whenever the parent project only defines one language flag.
if(KERNEL_FLOAT_LANGUAGE_CUDA)
    # find_package(CUDA) is deprecated; FindCUDAToolkit (CMake >= 3.17)
    # provides imported targets that carry include directories transitively.
    find_package(CUDAToolkit REQUIRED)
    target_link_libraries(${PROJECT_NAME} PRIVATE CUDA::cudart)
    set_target_properties(${PROJECT_NAME} PROPERTIES CUDA_ARCHITECTURES "80")
endif()

if(KERNEL_FLOAT_LANGUAGE_HIP)
    # main.cu must be compiled by the HIP compiler despite its .cu extension.
    set_source_files_properties("${PROJECT_SOURCE_DIR}/main.cu" PROPERTIES LANGUAGE HIP)
endif()

examples/pi/main.cu

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#include <stdio.h>
22
#include <stdlib.h>
33

4+
#include "../hip_compat.h"
45
#include "kernel_float.h"
56

67
#define CUDA_CHECK(call) \
@@ -9,12 +10,12 @@
910
if (__err != cudaSuccess) { \
1011
fprintf( \
1112
stderr, \
12-
"CUDA error at %s:%d code=%d(%s) \"%s\" \n", \
13+
"CUDA error at %s:%d (%s): %s (code %d) \n", \
1314
__FILE__, \
1415
__LINE__, \
15-
__err, \
16+
#call, \
1617
cudaGetErrorString(__err), \
17-
#call); \
18+
__err); \
1819
exit(EXIT_FAILURE); \
1920
} \
2021
} while (0)

examples/vector_add/CMakeLists.txt

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,18 @@
11
cmake_minimum_required(VERSION 3.20)

set(PROJECT_NAME kernel_float_vecadd)
project(${PROJECT_NAME} LANGUAGES CXX)

set(CMAKE_CXX_STANDARD 17)
add_executable(${PROJECT_NAME} "${PROJECT_SOURCE_DIR}/main.cu")
target_link_libraries(${PROJECT_NAME} PRIVATE kernel_float)

# Test the flag by name, not via ${...} expansion: if(${VAR}) with an
# undefined VAR expands to `if()` with no arguments, which is a fatal
# configure error whenever the parent project only defines one language flag.
if(KERNEL_FLOAT_LANGUAGE_HIP)
    # main.cu must be compiled by the HIP compiler despite its .cu extension.
    set_source_files_properties("${PROJECT_SOURCE_DIR}/main.cu" PROPERTIES LANGUAGE HIP)
endif()

if(KERNEL_FLOAT_LANGUAGE_CUDA)
    # find_package(CUDA) is deprecated; FindCUDAToolkit (CMake >= 3.17)
    # provides imported targets that carry include directories transitively.
    find_package(CUDAToolkit REQUIRED)
    target_link_libraries(${PROJECT_NAME} PRIVATE CUDA::cudart)
    set_target_properties(${PROJECT_NAME} PROPERTIES CUDA_ARCHITECTURES "80")
endif()

examples/vector_add/main.cu

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
#include <stdexcept>
44
#include <vector>
55

6+
#include "../hip_compat.h"
67
#include "kernel_float.h"
78
namespace kf = kernel_float;
89

@@ -21,7 +22,7 @@ __global__ void my_kernel(
2122
int i = blockIdx.x * blockDim.x + threadIdx.x;
2223

2324
if (i * N < length) {
24-
output(i) = kf::fma(input[i], input[i], kf::cast<__half>(constant));
25+
output(i) = kf::fma(input[i], input[i], kf::cast<half>(constant));
2526
}
2627
}
2728

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,18 @@
11
cmake_minimum_required(VERSION 3.20)

set(PROJECT_NAME kernel_float_vecadd_tiling)
project(${PROJECT_NAME} LANGUAGES CXX)

set(CMAKE_CXX_STANDARD 17)
add_executable(${PROJECT_NAME} "${PROJECT_SOURCE_DIR}/main.cu")
target_link_libraries(${PROJECT_NAME} PRIVATE kernel_float)

# Test the flag by name, not via ${...} expansion: if(${VAR}) with an
# undefined VAR expands to `if()` with no arguments, which is a fatal
# configure error whenever the parent project only defines one language flag.
if(KERNEL_FLOAT_LANGUAGE_HIP)
    # main.cu must be compiled by the HIP compiler despite its .cu extension.
    set_source_files_properties("${PROJECT_SOURCE_DIR}/main.cu" PROPERTIES LANGUAGE HIP)
endif()

if(KERNEL_FLOAT_LANGUAGE_CUDA)
    # find_package(CUDA) is deprecated; FindCUDAToolkit (CMake >= 3.17)
    # provides imported targets that carry include directories transitively.
    find_package(CUDAToolkit REQUIRED)
    target_link_libraries(${PROJECT_NAME} PRIVATE CUDA::cudart)
    set_target_properties(${PROJECT_NAME} PROPERTIES CUDA_ARCHITECTURES "80")
endif()

examples/vector_add_tiling/main.cu

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
#include <stdexcept>
44
#include <vector>
55

6+
#include "../hip_compat.h"
67
#include "kernel_float.h"
78
#include "kernel_float/tiling.h"
89
namespace kf = kernel_float;

include/kernel_float/base.h

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,10 @@
44
#include "macros.h"
55
#include "meta.h"
66

7+
#if KERNEL_FLOAT_IS_HIP
8+
#include <hip/hip_vector_types.h>
9+
#endif
10+
711
namespace kernel_float {
812

913
template<typename T, size_t N, size_t Alignment = alignof(T)>
@@ -266,7 +270,7 @@ using promoted_vector_value_type = promote_t<vector_value_type<Vs>...>;
266270

267271
template<typename V>
268272
KERNEL_FLOAT_INLINE vector_storage_type<V> into_vector_storage(V&& input) {
269-
return into_vector_impl<V>::call(std::forward<V>(input));
273+
return into_vector_impl<V>::call(static_cast<V&&>(input));
270274
}
271275

272276
} // namespace kernel_float

0 commit comments

Comments
 (0)