Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 34 additions & 0 deletions crates/cubecl-cuda/tests/clamp_f16.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
#include <cuda_fp16.h>
typedef unsigned char uint8;
typedef unsigned short uint16;
typedef unsigned int uint;
typedef unsigned long long int uint64;
typedef long long int int64;

extern "C" __global__ void clamp_kernel_f16(__half input_0[], __half output_0[],
uint info[]) {

int3 absoluteIdx = make_int3(blockIdx.x * blockDim.x + threadIdx.x,
blockIdx.y * blockDim.y + threadIdx.y,
blockIdx.z * blockDim.z + threadIdx.z);

uint idxGlobal =
(absoluteIdx.z * gridDim.x * blockDim.x * gridDim.y * blockDim.y) +
(absoluteIdx.y * gridDim.x * blockDim.x) + absoluteIdx.x;
__half l_0_0;
uint l_0_1;
bool l_0_2;
__half l_0_3;
l_0_1 = info[uint(0)];
l_0_2 = uint(0) < l_0_1;
l_0_3 = input_0[uint(0)];
l_0_0 = (l_0_2) ? l_0_3 : __half(0.0);
l_0_0 = __hmax(__half(0.0), __hmin(__half(2.0), l_0_0));
uint l_0_4;
bool l_0_5;
l_0_4 = info[uint(1)];
l_0_5 = idxGlobal < l_0_4;
if (l_0_5) {
output_0[idxGlobal] = l_0_0;
}
}
33 changes: 33 additions & 0 deletions crates/cubecl-cuda/tests/clamp_f32.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
typedef unsigned char uint8;
typedef unsigned short uint16;
typedef unsigned int uint;
typedef unsigned long long int uint64;
typedef long long int int64;

extern "C" __global__ void clamp_kernel_f32(float input_0[], float output_0[],
uint info[]) {

int3 absoluteIdx = make_int3(blockIdx.x * blockDim.x + threadIdx.x,
blockIdx.y * blockDim.y + threadIdx.y,
blockIdx.z * blockDim.z + threadIdx.z);

uint idxGlobal =
(absoluteIdx.z * gridDim.x * blockDim.x * gridDim.y * blockDim.y) +
(absoluteIdx.y * gridDim.x * blockDim.x) + absoluteIdx.x;
float l_0_0;
uint l_0_1;
bool l_0_2;
float l_0_3;
l_0_1 = info[uint(0)];
l_0_2 = uint(0) < l_0_1;
l_0_3 = input_0[uint(0)];
l_0_0 = (l_0_2) ? l_0_3 : float(0.0);
l_0_0 = max(float(0.0), min(float(2.0), l_0_0));
uint l_0_4;
bool l_0_5;
l_0_4 = info[uint(1)];
l_0_5 = idxGlobal < l_0_4;
if (l_0_5) {
output_0[idxGlobal] = l_0_0;
}
}
56 changes: 56 additions & 0 deletions crates/cubecl-cuda/tests/lined_clamp_f16.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
#include <cuda_fp16.h>
typedef unsigned char uint8;
typedef unsigned short uint16;
typedef unsigned int uint;
typedef unsigned long long int uint64;
typedef long long int int64;

struct __align__(8) __half2_2 {
__half2 i_0;
__half2 i_1;
};

struct __align__(8) __half_4 {
__half i_0;
__half i_1;
__half i_2;
__half i_3;
};

extern "C" __global__ void
lined_clamp_kernel_f16(__half_4 input_0[], __half_4 output_0[], uint info[]) {

int3 absoluteIdx = make_int3(blockIdx.x * blockDim.x + threadIdx.x,
blockIdx.y * blockDim.y + threadIdx.y,
blockIdx.z * blockDim.z + threadIdx.z);

uint idxGlobal =
(absoluteIdx.z * gridDim.x * blockDim.x * gridDim.y * blockDim.y) +
(absoluteIdx.y * gridDim.x * blockDim.x) + absoluteIdx.x;
__half_4 l_0_0;
uint l_0_1;
bool l_0_2;
__half_4 l_0_3;
l_0_1 = info[uint(0)];
l_0_2 = uint(0) < l_0_1;
l_0_3 = input_0[uint(0)];
l_0_0 = __half_4{
(l_0_2) ? l_0_3.i_0 : __half(0.0),
(l_0_2) ? l_0_3.i_1 : __half(0.0),
(l_0_2) ? l_0_3.i_2 : __half(0.0),
(l_0_2) ? l_0_3.i_3 : __half(0.0),
};
l_0_0 = __half2_2{
__hmax2(__half(0.0),
__hmin2(__half(2.0), (reinterpret_cast<__half2_2 &>(l_0_0)).i_0)),
__hmax2(__half(0.0),
__hmin2(__half(2.0), (reinterpret_cast<__half2_2 &>(l_0_0)).i_1)),
};
uint l_0_4;
bool l_0_5;
l_0_4 = info[uint(1)];
l_0_5 = idxGlobal < l_0_4;
if (l_0_5) {
output_0[idxGlobal] = l_0_0;
}
}
50 changes: 50 additions & 0 deletions crates/cubecl-cuda/tests/lined_clamp_f32.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
typedef unsigned char uint8;
typedef unsigned short uint16;
typedef unsigned int uint;
typedef unsigned long long int uint64;
typedef long long int int64;

struct __align__(16) float_4 {
float i_0;
float i_1;
float i_2;
float i_3;
};

extern "C" __global__ void
lined_clamp_kernel_f32(float_4 input_0[], float_4 output_0[], uint info[]) {

int3 absoluteIdx = make_int3(blockIdx.x * blockDim.x + threadIdx.x,
blockIdx.y * blockDim.y + threadIdx.y,
blockIdx.z * blockDim.z + threadIdx.z);

uint idxGlobal =
(absoluteIdx.z * gridDim.x * blockDim.x * gridDim.y * blockDim.y) +
(absoluteIdx.y * gridDim.x * blockDim.x) + absoluteIdx.x;
float_4 l_0_0;
uint l_0_1;
bool l_0_2;
float_4 l_0_3;
l_0_1 = info[uint(0)];
l_0_2 = uint(0) < l_0_1;
l_0_3 = input_0[uint(0)];
l_0_0 = float_4{
(l_0_2) ? l_0_3.i_0 : float(0.0),
(l_0_2) ? l_0_3.i_1 : float(0.0),
(l_0_2) ? l_0_3.i_2 : float(0.0),
(l_0_2) ? l_0_3.i_3 : float(0.0),
};
l_0_0 = float_4{
max(float(0.0), min(float(2.0), l_0_0.i_0)),
max(float(0.0), min(float(2.0), l_0_0.i_1)),
max(float(0.0), min(float(2.0), l_0_0.i_2)),
max(float(0.0), min(float(2.0), l_0_0.i_3)),
};
uint l_0_4;
bool l_0_5;
l_0_4 = info[uint(1)];
l_0_5 = idxGlobal < l_0_4;
if (l_0_5) {
output_0[idxGlobal] = l_0_0;
}
}
56 changes: 56 additions & 0 deletions crates/cubecl-cuda/tests/main.rs
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
use std::num::NonZero;

use common::*;
use constant_array_kernel::ConstantArrayKernel;
use cubecl_core as cubecl;
use cubecl_core::prelude::*;
use cubecl_cuda::CudaRuntime;
use execute_unary_kernel::ExecuteUnaryKernel;
use half::bf16;
use half::f16;
use kernel_sum::KernelSum;
use naming_kernel::NamingKernel;
use pretty_assertions::assert_eq;
Expand Down Expand Up @@ -134,3 +137,56 @@ pub fn naming() {
let expected = expected.trim();
assert_eq!(compile(kernel), expected);
}

#[cube(launch, create_dummy_kernel)]
fn clamp_kernel<F: Float>(input: &Array<F>, out: &mut Array<F>) {
out[ABSOLUTE_POS] = F::clamp(input[0], F::from_int(0), F::from_int(2));
}

#[test]
pub fn test_clamp() {
let kernel = clamp_kernel::ClampKernel::<f32, CudaRuntime>::new(settings(), array(), array());
let expected = include_str!("clamp_f32.cu").replace("\r\n", "\n");
assert_eq!(compile(kernel), expected);

let kernel = clamp_kernel::ClampKernel::<f16, CudaRuntime>::new(settings(), array(), array());
let expected = include_str!("clamp_f16.cu").replace("\r\n", "\n");
assert_eq!(compile(kernel), expected);
}

#[cube(launch, create_dummy_kernel)]
fn lined_clamp_kernel<F: Float>(input: &Array<Line<F>>, out: &mut Array<Line<F>>) {
out[ABSOLUTE_POS] = Line::<F>::clamp(
input[0],
Line::new(F::from_int(0)),
Line::new(F::from_int(2)),
);
}

#[test]
pub fn test_lined_clamp() {
let arg4 = ArrayCompilationArg {
inplace: None,
vectorisation: NonZero::new(4),
};

let kernel = lined_clamp_kernel::LinedClampKernel::<f32, CudaRuntime>::new(
settings(),
arg4.clone(),
arg4.clone(),
);

let expected = include_str!("lined_clamp_f32.cu").replace("\r\n", "\n");
assert_eq!(compile(kernel), expected);

let kernel = lined_clamp_kernel::LinedClampKernel::<f16, CudaRuntime>::new(
settings(),
arg4.clone(),
arg4.clone(),
);
// TODO: Regenerate when correct
// std::fs::write("tests/lined_clamp_f16.cu", compile(kernel));

let expected = include_str!("lined_clamp_f16.cu").replace("\r\n", "\n");
assert_eq!(compile(kernel), expected);
}