Skip to content

Commit de62ad0

Browse files
committed
Merge branch 'hip-support' into dev
2 parents 534c4cd + 9385655 commit de62ad0

File tree

25 files changed

+862
-635
lines changed

25 files changed

+862
-635
lines changed

CMakeLists.txt

Lines changed: 29 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,44 @@
11
cmake_minimum_required(VERSION 3.20)

set(PROJECT_NAME kernel_float)
project(${PROJECT_NAME} LANGUAGES CXX)

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

# Select the GPU language. Exposed as a cache variable so users can pass
# -DKERNEL_FLOAT_LANGUAGE=HIP on the configure command line.
set(KERNEL_FLOAT_LANGUAGE "CUDA" CACHE STRING "GPU language to target (CUDA or HIP)")
set_property(CACHE KERNEL_FLOAT_LANGUAGE PROPERTY STRINGS "CUDA" "HIP")

# Define BOTH helper flags unconditionally. The example subdirectories test
# these with if(${KERNEL_FLOAT_LANGUAGE_...}); if one of them were left
# undefined, that expands to `if()` with no arguments, which is a fatal
# configure-time error.
set(KERNEL_FLOAT_LANGUAGE_CUDA OFF)
set(KERNEL_FLOAT_LANGUAGE_HIP OFF)

if(KERNEL_FLOAT_LANGUAGE STREQUAL "CUDA")
    enable_language(CUDA)
    set(KERNEL_FLOAT_LANGUAGE_CUDA ON)
elseif(KERNEL_FLOAT_LANGUAGE STREQUAL "HIP")
    # NOTE(review): first-class HIP language support was added in CMake 3.21;
    # consider raising cmake_minimum_required if HIP builds are routine.
    enable_language(HIP)
    set(KERNEL_FLOAT_LANGUAGE_HIP ON)
else()
    message(FATAL_ERROR "KERNEL_FLOAT_LANGUAGE must be either 'HIP' or 'CUDA'")
endif()

# kernel_float is header-only, so it is modeled as an INTERFACE library:
# consumers inherit the include directory but nothing is compiled here.
add_library(${PROJECT_NAME} INTERFACE)
target_include_directories(${PROJECT_NAME} INTERFACE "${PROJECT_SOURCE_DIR}/include")

# Optionally build tests and examples if the corresponding flags are set
option(KERNEL_FLOAT_BUILD_TEST "Build kernel float tests" OFF)
option(KERNEL_FLOAT_BUILD_EXAMPLE "Build kernel float examples" OFF)

if(KERNEL_FLOAT_BUILD_TEST)
    add_subdirectory(tests)
endif()

if(KERNEL_FLOAT_BUILD_EXAMPLE)
    add_subdirectory(examples)
endif()

# Display configuration
message(STATUS "=== Kernel Float ===")
message(STATUS "Using GPU Language: ${KERNEL_FLOAT_LANGUAGE}")
message(STATUS "Building Tests: ${KERNEL_FLOAT_BUILD_TEST}")
message(STATUS "Building Examples: ${KERNEL_FLOAT_BUILD_EXAMPLE}")

README.md

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -9,12 +9,12 @@
99
![GitHub Repo stars](https://img.shields.io/github/stars/KernelTuner/kernel_float?style=social)
1010

1111

12-
_Kernel Float_ is a header-only library for CUDA that simplifies working with vector types and reduced precision floating-point arithmetic in GPU code.
12+
_Kernel Float_ is a header-only library for CUDA/HIP that simplifies working with vector types and reduced precision floating-point arithmetic in GPU code.
1313

1414

1515
## Summary
1616

17-
CUDA natively offers several reduced precision floating-point types (`__half`, `__nv_bfloat16`, `__nv_fp8_e4m3`, `__nv_fp8_e5m2`)
17+
CUDA and HIP natively offer several reduced precision floating-point types (`__half`, `__nv_bfloat16`, `__nv_fp8_e4m3`, `__nv_fp8_e5m2`)
1818
and vector types (e.g., `__half2`, `__nv_fp8x4_e4m3`, `float3`).
1919
However, working with these types is cumbersome:
2020
mathematical operations require intrinsics (e.g., `__hadd2` performs addition for `__half2`),
@@ -24,9 +24,9 @@ and some functionality is missing (e.g., one cannot convert a `__half` to `__nv_
2424
_Kernel Float_ resolves this by offering a single data type `kernel_float::vec<T, N>` that stores `N` elements of type `T`.
2525
Internally, the data is stored as a fixed-sized array of elements.
2626
Operator overloading (like `+`, `*`, `&&`) has been implemented such that the most optimal intrinsic for the available types is selected automatically.
27-
Many mathetical functions (like `log`, `exp`, `sin`) and common operations (such as `sum`, `range`, `for_each`) are also available.
27+
Many mathematical functions (like `log`, `exp`, `sin`) and common operations (such as `sum`, `range`, `for_each`) are also available.
2828

29-
By using this library, developers can avoid the complexity of working with reduced precision floating-point types in CUDA and focus on their applications.
29+
Using Kernel Float, developers avoid the complexity of reduced precision floating-point types in CUDA and can focus on their applications.
3030

3131

3232
## Features
@@ -40,6 +40,7 @@ In a nutshell, _Kernel Float_ offers the following features:
4040
* Easy integration as a single header file.
4141
* Written for C++17.
4242
* Compatible with NVCC (NVIDIA Compiler) and NVRTC (NVIDIA Runtime Compilation).
43+
* Compatible with HIPCC (AMD HIP Compiler).
4344

4445

4546
## Example
@@ -49,7 +50,7 @@ Check out the [examples](https://github.com/KernelTuner/kernel_float/tree/master
4950

5051
Below shows a simple example of a CUDA kernel that adds a `constant` to the `input` array and writes the results to the `output` array.
5152
Each thread processes two elements.
52-
Notice how easy it would be change the precision (for example, `double` to `half`) or the vector size (for example, 4 instead of 2 items per thread).
53+
Notice how easy it would be to change the precision (for example, `double` to `half`) or the vector size (for example, 4 instead of 2 items per thread).
5354

5455

5556
```cpp
@@ -63,14 +64,14 @@ __global__ void kernel(const kf::vec<half, 2>* input, float constant, kf::vec<fl
6364

6465
```
6566
66-
Here is how the same kernel would like without Kernel Float.
67+
Here is how the same kernel would look for CUDA without Kernel Float.
6768
6869
```cpp
6970
__global__ void kernel(const __half* input, float constant, float* output) {
7071
int i = blockIdx.x * blockDim.x + threadIdx.x;
7172
__half in0 = input[2 * i + 0];
72-
__half in1 = input[2 * 1 + 1];
73-
__half2 a = __halves2half2(in0, int1);
73+
__half in1 = input[2 * i + 1];
74+
__half2 a = __halves2half2(in0, in1);
7475
float b = float(constant);
7576
__half c = __float2half(b);
7677
__half2 d = __half2half2(c);

examples/hip_compat.h

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
#pragma once

/**
 * This header file provides a mapping from CUDA-specific function names and
 * types to their equivalent HIP counterparts, allowing for cross-platform
 * development between CUDA and HIP. By including this header, code originally
 * written for CUDA can be compiled with the HIP compiler (hipcc) by
 * automatically replacing CUDA API calls with their HIP equivalents.
 *
 * The mapping is only active under hipcc; when compiling with nvcc the names
 * resolve to the real CUDA API unchanged.
 */
#ifdef __HIPCC__
#define cudaError_t hipError_t
#define cudaSuccess hipSuccess
#define cudaGetErrorString hipGetErrorString
#define cudaGetLastError hipGetLastError
#define cudaMalloc hipMalloc
#define cudaFree hipFree
#define cudaMemcpy hipMemcpy
// Both explicit copy directions are mapped; the original header covered only
// DeviceToHost, which breaks any host-to-device upload under hipcc.
#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
#define cudaMemcpyDefault hipMemcpyDefault
#define cudaMemset hipMemset
#define cudaSetDevice hipSetDevice
#define cudaDeviceSynchronize hipDeviceSynchronize
#endif

examples/pi/CMakeLists.txt

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,18 @@
1-
cmake_minimum_required(VERSION 3.20)

set(PROJECT_NAME kernel_float_pi)
project(${PROJECT_NAME} LANGUAGES CXX)

set(CMAKE_CXX_STANDARD 17)
add_executable(${PROJECT_NAME} "${PROJECT_SOURCE_DIR}/main.cu")
target_link_libraries(${PROJECT_NAME} PRIVATE kernel_float)

# Test the flag by name, not via ${...} expansion: if(${VAR}) with an
# undefined VAR expands to `if()` with no arguments, which is a fatal
# configure error whenever the parent project only defines one language flag.
if(KERNEL_FLOAT_LANGUAGE_CUDA)
    # find_package(CUDA) is deprecated; FindCUDAToolkit (CMake >= 3.17)
    # provides imported targets that carry include directories transitively.
    find_package(CUDAToolkit REQUIRED)
    target_link_libraries(${PROJECT_NAME} PRIVATE CUDA::cudart)
    set_target_properties(${PROJECT_NAME} PROPERTIES CUDA_ARCHITECTURES "80")
endif()

if(KERNEL_FLOAT_LANGUAGE_HIP)
    # main.cu must be compiled by the HIP compiler despite its .cu extension.
    set_source_files_properties("${PROJECT_SOURCE_DIR}/main.cu" PROPERTIES LANGUAGE HIP)
endif()

examples/pi/main.cu

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#include <stdio.h>
22
#include <stdlib.h>
33

4+
#include "../hip_compat.h"
45
#include "kernel_float.h"
56

67
#define CUDA_CHECK(call) \
@@ -9,12 +10,12 @@
910
if (__err != cudaSuccess) { \
1011
fprintf( \
1112
stderr, \
12-
"CUDA error at %s:%d code=%d(%s) \"%s\" \n", \
13+
"CUDA error at %s:%d (%s): %s (code %d) \n", \
1314
__FILE__, \
1415
__LINE__, \
15-
__err, \
16+
#call, \
1617
cudaGetErrorString(__err), \
17-
#call); \
18+
__err); \
1819
exit(EXIT_FAILURE); \
1920
} \
2021
} while (0)

examples/vector_add/CMakeLists.txt

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,18 @@
11
cmake_minimum_required(VERSION 3.20)

set(PROJECT_NAME kernel_float_vecadd)
project(${PROJECT_NAME} LANGUAGES CXX)

set(CMAKE_CXX_STANDARD 17)
add_executable(${PROJECT_NAME} "${PROJECT_SOURCE_DIR}/main.cu")
target_link_libraries(${PROJECT_NAME} PRIVATE kernel_float)

# Test the flag by name, not via ${...} expansion: if(${VAR}) with an
# undefined VAR expands to `if()` with no arguments, which is a fatal
# configure error whenever the parent project only defines one language flag.
if(KERNEL_FLOAT_LANGUAGE_HIP)
    # main.cu must be compiled by the HIP compiler despite its .cu extension.
    set_source_files_properties("${PROJECT_SOURCE_DIR}/main.cu" PROPERTIES LANGUAGE HIP)
endif()

if(KERNEL_FLOAT_LANGUAGE_CUDA)
    # find_package(CUDA) is deprecated; FindCUDAToolkit (CMake >= 3.17)
    # provides imported targets that carry include directories transitively.
    find_package(CUDAToolkit REQUIRED)
    target_link_libraries(${PROJECT_NAME} PRIVATE CUDA::cudart)
    set_target_properties(${PROJECT_NAME} PROPERTIES CUDA_ARCHITECTURES "80")
endif()

examples/vector_add/main.cu

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
#include <stdexcept>
44
#include <vector>
55

6+
#include "../hip_compat.h"
67
#include "kernel_float.h"
78
namespace kf = kernel_float;
89

@@ -21,7 +22,7 @@ __global__ void my_kernel(
2122
int i = blockIdx.x * blockDim.x + threadIdx.x;
2223

2324
if (i * N < length) {
24-
output(i) = kf::fma(input[i], input[i], kf::cast<__half>(constant));
25+
output(i) = kf::fma(input[i], input[i], kf::cast<half>(constant));
2526
}
2627
}
2728

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,18 @@
11
cmake_minimum_required(VERSION 3.20)

set(PROJECT_NAME kernel_float_vecadd_tiling)
project(${PROJECT_NAME} LANGUAGES CXX)

set(CMAKE_CXX_STANDARD 17)
add_executable(${PROJECT_NAME} "${PROJECT_SOURCE_DIR}/main.cu")
target_link_libraries(${PROJECT_NAME} PRIVATE kernel_float)

# Test the flag by name, not via ${...} expansion: if(${VAR}) with an
# undefined VAR expands to `if()` with no arguments, which is a fatal
# configure error whenever the parent project only defines one language flag.
if(KERNEL_FLOAT_LANGUAGE_HIP)
    # main.cu must be compiled by the HIP compiler despite its .cu extension.
    set_source_files_properties("${PROJECT_SOURCE_DIR}/main.cu" PROPERTIES LANGUAGE HIP)
endif()

if(KERNEL_FLOAT_LANGUAGE_CUDA)
    # find_package(CUDA) is deprecated; FindCUDAToolkit (CMake >= 3.17)
    # provides imported targets that carry include directories transitively.
    find_package(CUDAToolkit REQUIRED)
    target_link_libraries(${PROJECT_NAME} PRIVATE CUDA::cudart)
    set_target_properties(${PROJECT_NAME} PROPERTIES CUDA_ARCHITECTURES "80")
endif()

examples/vector_add_tiling/main.cu

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
#include <stdexcept>
44
#include <vector>
55

6+
#include "../hip_compat.h"
67
#include "kernel_float.h"
78
#include "kernel_float/tiling.h"
89
namespace kf = kernel_float;

include/kernel_float/base.h

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,10 @@
44
#include "macros.h"
55
#include "meta.h"
66

7+
#if KERNEL_FLOAT_IS_HIP
8+
#include <hip/hip_vector_types.h>
9+
#endif
10+
711
namespace kernel_float {
812

913
template<typename T, size_t N, size_t Alignment = alignof(T)>
@@ -266,7 +270,7 @@ using promoted_vector_value_type = promote_t<vector_value_type<Vs>...>;
266270

267271
template<typename V>
268272
KERNEL_FLOAT_INLINE vector_storage_type<V> into_vector_storage(V&& input) {
269-
return into_vector_impl<V>::call(std::forward<V>(input));
273+
return into_vector_impl<V>::call(static_cast<V&&>(input));
270274
}
271275

272276
} // namespace kernel_float

0 commit comments

Comments
 (0)