tracel-ai · torsteingrindvik · Dec 19, 2024 · Dec 20, 2024
diff --git a/crates/cubecl-cuda/tests/clamp_f16.cu b/crates/cubecl-cuda/tests/clamp_f16.cu
@@ -0,0 +1,34 @@
+#include <cuda_fp16.h>
+typedef unsigned char uint8;
+typedef unsigned short uint16;
+typedef unsigned int uint;
+typedef unsigned long long int uint64;
+typedef long long int int64;
+
+extern "C" __global__ void clamp_kernel_f16(__half input_0[], __half output_0[],
+                                            uint info[]) {
+
+  int3 absoluteIdx = make_int3(blockIdx.x * blockDim.x + threadIdx.x,
+                               blockIdx.y * blockDim.y + threadIdx.y,
+                               blockIdx.z * blockDim.z + threadIdx.z);
+
+  uint idxGlobal =
+      (absoluteIdx.z * gridDim.x * blockDim.x * gridDim.y * blockDim.y) +
+      (absoluteIdx.y * gridDim.x * blockDim.x) + absoluteIdx.x;
+  __half l_0_0;
+  uint l_0_1;
+  bool l_0_2;
+  __half l_0_3;
+  l_0_1 = info[uint(0)];
+  l_0_2 = uint(0) < l_0_1;
+  l_0_3 = input_0[uint(0)];
+  l_0_0 = (l_0_2) ? l_0_3 : __half(0.0);
+  l_0_0 = __hmax(__half(0.0), __hmin(__half(2.0), l_0_0));
+  uint l_0_4;
+  bool l_0_5;
+  l_0_4 = info[uint(1)];
+  l_0_5 = idxGlobal < l_0_4;
+  if (l_0_5) {
+    output_0[idxGlobal] = l_0_0;
+  }
+}
diff --git a/crates/cubecl-cuda/tests/clamp_f32.cu b/crates/cubecl-cuda/tests/clamp_f32.cu
@@ -0,0 +1,33 @@
+typedef unsigned char uint8;
+typedef unsigned short uint16;
+typedef unsigned int uint;
+typedef unsigned long long int uint64;
+typedef long long int int64;
+
+extern "C" __global__ void clamp_kernel_f32(float input_0[], float output_0[],
+                                            uint info[]) {
+
+  int3 absoluteIdx = make_int3(blockIdx.x * blockDim.x + threadIdx.x,
+                               blockIdx.y * blockDim.y + threadIdx.y,
+                               blockIdx.z * blockDim.z + threadIdx.z);
+
+  uint idxGlobal =
+      (absoluteIdx.z * gridDim.x * blockDim.x * gridDim.y * blockDim.y) +
+      (absoluteIdx.y * gridDim.x * blockDim.x) + absoluteIdx.x;
+  float l_0_0;
+  uint l_0_1;
+  bool l_0_2;
+  float l_0_3;
+  l_0_1 = info[uint(0)];
+  l_0_2 = uint(0) < l_0_1;
+  l_0_3 = input_0[uint(0)];
+  l_0_0 = (l_0_2) ? l_0_3 : float(0.0);
+  l_0_0 = max(float(0.0), min(float(2.0), l_0_0));
+  uint l_0_4;
+  bool l_0_5;
+  l_0_4 = info[uint(1)];
+  l_0_5 = idxGlobal < l_0_4;
+  if (l_0_5) {
+    output_0[idxGlobal] = l_0_0;
+  }
+}
diff --git a/crates/cubecl-cuda/tests/lined_clamp_f16.cu b/crates/cubecl-cuda/tests/lined_clamp_f16.cu
@@ -0,0 +1,56 @@
+#include <cuda_fp16.h>
+typedef unsigned char uint8;
+typedef unsigned short uint16;
+typedef unsigned int uint;
+typedef unsigned long long int uint64;
+typedef long long int int64;
+
+struct __align__(8) __half2_2 {
+  __half2 i_0;
+  __half2 i_1;
+};
+
+struct __align__(8) __half_4 {
+  __half i_0;
+  __half i_1;
+  __half i_2;
+  __half i_3;
+};
+
+extern "C" __global__ void
+lined_clamp_kernel_f16(__half_4 input_0[], __half_4 output_0[], uint info[]) {
+
+  int3 absoluteIdx = make_int3(blockIdx.x * blockDim.x + threadIdx.x,
+                               blockIdx.y * blockDim.y + threadIdx.y,
+                               blockIdx.z * blockDim.z + threadIdx.z);
+
+  uint idxGlobal =
+      (absoluteIdx.z * gridDim.x * blockDim.x * gridDim.y * blockDim.y) +
+      (absoluteIdx.y * gridDim.x * blockDim.x) + absoluteIdx.x;
+  __half_4 l_0_0;
+  uint l_0_1;
+  bool l_0_2;
+  __half_4 l_0_3;
+  l_0_1 = info[uint(0)];
+  l_0_2 = uint(0) < l_0_1;
+  l_0_3 = input_0[uint(0)];
+  l_0_0 = __half_4{
+      (l_0_2) ? l_0_3.i_0 : __half(0.0),
+      (l_0_2) ? l_0_3.i_1 : __half(0.0),
+      (l_0_2) ? l_0_3.i_2 : __half(0.0),
+      (l_0_2) ? l_0_3.i_3 : __half(0.0),
+  };
+  l_0_0 = __half2_2{
+      __hmax2(__half(0.0),
+              __hmin2(__half(2.0), (reinterpret_cast<__half2_2 &>(l_0_0)).i_0)),
+      __hmax2(__half(0.0),
+              __hmin2(__half(2.0), (reinterpret_cast<__half2_2 &>(l_0_0)).i_1)),
+  };
+  uint l_0_4;
+  bool l_0_5;
+  l_0_4 = info[uint(1)];
+  l_0_5 = idxGlobal < l_0_4;
+  if (l_0_5) {
+    output_0[idxGlobal] = l_0_0;
+  }
+}
diff --git a/crates/cubecl-cuda/tests/lined_clamp_f32.cu b/crates/cubecl-cuda/tests/lined_clamp_f32.cu
@@ -0,0 +1,50 @@
+typedef unsigned char uint8;
+typedef unsigned short uint16;
+typedef unsigned int uint;
+typedef unsigned long long int uint64;
+typedef long long int int64;
+
+struct __align__(16) float_4 {
+  float i_0;
+  float i_1;
+  float i_2;
+  float i_3;
+};
+
+extern "C" __global__ void
+lined_clamp_kernel_f32(float_4 input_0[], float_4 output_0[], uint info[]) {
+
+  int3 absoluteIdx = make_int3(blockIdx.x * blockDim.x + threadIdx.x,
+                               blockIdx.y * blockDim.y + threadIdx.y,
+                               blockIdx.z * blockDim.z + threadIdx.z);
+
+  uint idxGlobal =
+      (absoluteIdx.z * gridDim.x * blockDim.x * gridDim.y * blockDim.y) +
+      (absoluteIdx.y * gridDim.x * blockDim.x) + absoluteIdx.x;
+  float_4 l_0_0;
+  uint l_0_1;
+  bool l_0_2;
+  float_4 l_0_3;
+  l_0_1 = info[uint(0)];
+  l_0_2 = uint(0) < l_0_1;
+  l_0_3 = input_0[uint(0)];
+  l_0_0 = float_4{
+      (l_0_2) ? l_0_3.i_0 : float(0.0),
+      (l_0_2) ? l_0_3.i_1 : float(0.0),
+      (l_0_2) ? l_0_3.i_2 : float(0.0),
+      (l_0_2) ? l_0_3.i_3 : float(0.0),
+  };
+  l_0_0 = float_4{
+      max(float(0.0), min(float(2.0), l_0_0.i_0)),
+      max(float(0.0), min(float(2.0), l_0_0.i_1)),
+      max(float(0.0), min(float(2.0), l_0_0.i_2)),
+      max(float(0.0), min(float(2.0), l_0_0.i_3)),
+  };
+  uint l_0_4;
+  bool l_0_5;
+  l_0_4 = info[uint(1)];
+  l_0_5 = idxGlobal < l_0_4;
+  if (l_0_5) {
+    output_0[idxGlobal] = l_0_0;
+  }
+}
diff --git a/crates/cubecl-cuda/tests/main.rs b/crates/cubecl-cuda/tests/main.rs
@@ -1,10 +1,13 @@
+use std::num::NonZero;
+
 use common::*;
 use constant_array_kernel::ConstantArrayKernel;
 use cubecl_core as cubecl;
 use cubecl_core::prelude::*;
 use cubecl_cuda::CudaRuntime;
 use execute_unary_kernel::ExecuteUnaryKernel;
 use half::bf16;
+use half::f16;
 use kernel_sum::KernelSum;
 use naming_kernel::NamingKernel;
 use pretty_assertions::assert_eq;
@@ -134,3 +137,56 @@ pub fn naming() {
     let expected = expected.trim();
     assert_eq!(compile(kernel), expected);
 }
+
+#[cube(launch, create_dummy_kernel)]
+fn clamp_kernel<F: Float>(input: &Array<F>, out: &mut Array<F>) {
+    out[ABSOLUTE_POS] = F::clamp(input[0], F::from_int(0), F::from_int(2));
+}
+
+#[test]
+pub fn test_clamp() {
+    let kernel = clamp_kernel::ClampKernel::<f32, CudaRuntime>::new(settings(), array(), array());
+    let expected = include_str!("clamp_f32.cu").replace("\r\n", "\n");
+    assert_eq!(compile(kernel), expected);
+
+    let kernel = clamp_kernel::ClampKernel::<f16, CudaRuntime>::new(settings(), array(), array());
+    let expected = include_str!("clamp_f16.cu").replace("\r\n", "\n");
+    assert_eq!(compile(kernel), expected);
+}
+
+#[cube(launch, create_dummy_kernel)]
+fn lined_clamp_kernel<F: Float>(input: &Array<Line<F>>, out: &mut Array<Line<F>>) {
+    out[ABSOLUTE_POS] = Line::<F>::clamp(
+        input[0],
+        Line::new(F::from_int(0)),
+        Line::new(F::from_int(2)),
+    );
+}
+
+#[test]
+pub fn test_lined_clamp() {
+    let arg4 = ArrayCompilationArg {
+        inplace: None,
+        vectorisation: NonZero::new(4),
+    };
+
+    let kernel = lined_clamp_kernel::LinedClampKernel::<f32, CudaRuntime>::new(
+        settings(),
+        arg4.clone(),
+        arg4.clone(),
+    );
+
+    let expected = include_str!("lined_clamp_f32.cu").replace("\r\n", "\n");
+    assert_eq!(compile(kernel), expected);
+
+    let kernel = lined_clamp_kernel::LinedClampKernel::<f16, CudaRuntime>::new(
+        settings(),
+        arg4.clone(),
+        arg4.clone(),
+    );
+    // TODO: Regenerate when correct
+    // std::fs::write("tests/lined_clamp_f16.cu", compile(kernel));
+
+    let expected = include_str!("lined_clamp_f16.cu").replace("\r\n", "\n");
+    assert_eq!(compile(kernel), expected);
+}