KernelTuner
diff --git a/‎docs/api.rst
Lines changed: 1 addition & 0 deletions b/‎docs/api.rst
Lines changed: 1 addition & 0 deletions
diff --git a/‎docs/build_api.py
Lines changed: 14 additions & 7 deletions b/‎docs/build_api.py
Lines changed: 14 additions & 7 deletions
diff --git a/‎examples/vector_add/main.cu
Lines changed: 9 additions & 7 deletions b/‎examples/vector_add/main.cu
Lines changed: 9 additions & 7 deletions
diff --git a/‎examples/vector_add_tiling/main.cu
Lines changed: 5 additions & 12 deletions b/‎examples/vector_add_tiling/main.cu
Lines changed: 5 additions & 12 deletions
diff --git a/‎include/kernel_float/apply.h
Lines changed: 31 additions & 1 deletion b/‎include/kernel_float/apply.h
Lines changed: 31 additions & 1 deletion
diff --git a/‎include/kernel_float/base.h
Lines changed: 14 additions & 0 deletions b/‎include/kernel_float/base.h
Lines changed: 14 additions & 0 deletions
diff --git a/‎include/kernel_float/bf16.h
Lines changed: 2 additions & 8 deletions b/‎include/kernel_float/bf16.h
Lines changed: 2 additions & 8 deletions
diff --git a/‎include/kernel_float/binops.h
Lines changed: 58 additions & 39 deletions b/‎include/kernel_float/binops.h
Lines changed: 58 additions & 39 deletions
@@ -3,6 +3,7 @@ API Reference
 .. toctree::
    api/types.rst
    api/primitives.rst
+   api/conversion.rst
    api/generation.rst
    api/unary_operators.rst
    api/binary_operators.rst
 
@@ -83,15 +83,18 @@ def build_index_page(groups):
             "reduce",
             "zip",
             "zip_common",
-            "cast",
-            "broadcast",
-            "convert",
             "make_vec",
             "into_vec",
             "concat",
             "select",
             "for_each",
         ],
+        "Conversion": [
+            "convert",
+            "cast",
+            "cast_to",
+            "broadcast",
+        ],
         "Generation": [
             ("range", "range()"),
             ("range", "range(F fun)"),
@@ -186,13 +189,14 @@ def build_index_page(groups):
             "sin",
             "sinh",
             ("sqrt", "sqrt(const V&)"),
+            "rsqrt",
             "tan",
             "tanh",
             "tgamma",
-            "trunc",
+            "rcp",
             "rint",
-            "rsqrt",
             "round",
+            "trunc",
             "signbit",
             "isinf",
             "isnan",
@@ -203,6 +207,9 @@ def build_index_page(groups):
                 "fast_cos",
                 "fast_sin",
                 "fast_tan",
+                "fast_rcp",
+                "fast_sqrt",
+                "fast_rsqrt",
                 "fast_div",
         ],
         "Conditional": [
@@ -211,7 +218,6 @@ def build_index_page(groups):
             ("where", "where(const C&)"),
         ],
         "Memory read/write": [
-            "cast_to",
             ("read", "read(const T*, const I&, const M&)"),
             ("write", "write(T*, const I&, const V&, const M&)"),
 
@@ -220,8 +226,9 @@ def build_index_page(groups):
 
             ("read_aligned", "read_aligned(const T*)"),
             ("write_aligned", "write_aligned(T*, const V&)"),
+            "assert_aligned",
 
-            ("aligned_ptr", "aligned_ptr", "struct"),
+            ("vector_ptr", "vector_ptr", "struct"),
         ],
         "Utilities": [
             ("constant", "constant", "struct"),
 
@@ -13,13 +13,15 @@ void cuda_check(cudaError_t code) {
 }
 
 template<int N>
-__global__ void my_kernel(int length, const __half* input, double constant, float* output) {
+__global__ void my_kernel(
+    int length,
+    kf::vec_ptr<const half, N> input,
+    double constant,
+    kf::vec_ptr<half, N, float> output) {
     int i = blockIdx.x * blockDim.x + threadIdx.x;
 
     if (i * N < length) {
-        auto a = kf::read_aligned<N>(input + i * N);
-        auto b = kf::fma(a, a, kf::cast<__half>(constant));
-        kf::write_aligned<N>(output + i * N, b);
+        output(i) = kf::fma(input[i], input[i], kf::cast<__half>(constant));
     }
 }
 
@@ -51,9 +53,9 @@ void run_kernel(int n) {
     int grid_size = (n + items_per_block - 1) / items_per_block;
     my_kernel<items_per_thread><<<grid_size, block_size>>>(
         n,
-        kf::aligned_ptr(input_dev),
+        kf::assert_aligned(input_dev),
         constant,
-        kf::aligned_ptr(output_dev));
+        kf::assert_aligned(output_dev));
 
     // Copy results back
     cuda_check(cudaMemcpy(output_dev, output_result.data(), sizeof(float) * n, cudaMemcpyDefault));
@@ -80,7 +82,7 @@ int main() {
 
     run_kernel<1>(n);
     run_kernel<2>(n);
-    run_kernel<3>(n);
+    //    run_kernel<3>(n);
     run_kernel<4>(n);
     run_kernel<8>(n);
 
 
@@ -14,11 +14,7 @@ void cuda_check(cudaError_t code) {
 }
 
 template<int N, int B>
-__global__ void my_kernel(
-    int length,
-    kf::aligned_ptr<const __half> input,
-    double constant,
-    kf::aligned_ptr<float> output) {
+__global__ void my_kernel(int length, const __half* input, double constant, float* output) {
     auto tiling = kf::tiling<
         kf::tile_factor<N>,
         kf::block_size<B>,
@@ -27,9 +23,9 @@ __global__ void my_kernel(
     auto points = int(blockIdx.x * tiling.tile_size(0)) + tiling.local_points(0);
     auto mask = tiling.local_mask();
 
-    auto a = input.read(points, mask);
+    auto a = kf::read(input, points, mask);
     auto b = (a * a) * constant;
-    output.write(points, b, mask);
+    kf::write(output, points, b, mask);
 }
 
 template<int items_per_thread, int block_size = 256>
@@ -57,11 +53,8 @@ void run_kernel(int n) {
     // Launch kernel!
     int items_per_block = block_size * items_per_thread;
     int grid_size = (n + items_per_block - 1) / items_per_block;
-    my_kernel<items_per_thread, block_size><<<grid_size, block_size>>>(
-        n,
-        kf::aligned_ptr(input_dev),
-        constant,
-        kf::aligned_ptr(output_dev));
+    my_kernel<items_per_thread, block_size>
+        <<<grid_size, block_size>>>(n, input_dev, constant, output_dev);
 
     // Copy results back
     cuda_check(cudaMemcpy(output_dev, output_result.data(), sizeof(float) * n, cudaMemcpyDefault));
 
@@ -152,6 +152,9 @@ struct apply_recur_impl<1> {
         result[0] = fun(inputs[0]...);
     }
 };
+
+template<typename F, size_t N, typename Output, typename... Args>
+struct apply_fastmath_impl: apply_impl<F, N, Output, Args...> {};
 }  // namespace detail
 
 template<typename F, typename... Args>
@@ -174,7 +177,34 @@ KERNEL_FLOAT_INLINE map_type<F, Args...> map(F fun, const Args&... args) {
     using E = broadcast_vector_extent_type<Args...>;
     vector_storage<Output, E::value> result;
 
-    detail::apply_impl<F, E::value, Output, vector_value_type<Args>...>::call(
+    // Use the `apply_fastmath_impl` if KERNEL_FLOAT_FAST_MATH is enabled
+#if KERNEL_FLOAT_FAST_MATH
+    using apply_impl = detail::apply_fastmath_impl<F, E::value, Output, vector_value_type<Args>...>;
+#else
+    using apply_impl = detail::apply_impl<F, E::value, Output, vector_value_type<Args>...>;
+#endif
+
+    apply_impl::call(
+        fun,
+        result.data(),
+        (detail::broadcast_impl<vector_value_type<Args>, vector_extent_type<Args>, E>::call(
+             into_vector_storage(args))
+             .data())...);
+
+    return result;
+}
+
+/**
+ * Apply the function `fun` to each element of the input vectors `args` and return the results as a new vector.
+ * This uses fast-math if it is available for the given function type `F`; otherwise, this function behaves like `map`.
+ */
+template<typename F, typename... Args>
+KERNEL_FLOAT_INLINE map_type<F, Args...> fast_map(F fun, const Args&... args) {
+    using Output = result_t<F, vector_value_type<Args>...>;
+    using E = broadcast_vector_extent_type<Args...>;
+    vector_storage<Output, E::value> result;
+
+    detail::apply_fastmath_impl<F, E::value, Output, vector_value_type<Args>...>::call(
         fun,
         result.data(),
         (detail::broadcast_impl<vector_value_type<Args>, vector_extent_type<Args>, E>::call(
 
@@ -89,6 +89,20 @@ struct extent<N> {
     static constexpr size_t size = N;
 };
 
+namespace detail {
+// Indicates that elements of type `T` offer no more precision than `float`, thus operations
+// on elements of type `T` can be performed by upcasting them to `float`.
+template<typename T>
+struct allow_float_fallback {
+    static constexpr bool value = false;
+};
+
+template<>
+struct allow_float_fallback<float> {
+    static constexpr bool value = true;
+};
+}  // namespace detail
+
 template<typename T>
 struct into_vector_impl {
     using value_type = T;
 
@@ -72,11 +72,7 @@ KERNEL_FLOAT_BF16_UNARY_FUN(rsqrt, ::hrsqrt, ::h2rsqrt)
 KERNEL_FLOAT_BF16_UNARY_FUN(sin, ::hsin, ::h2sin)
 KERNEL_FLOAT_BF16_UNARY_FUN(sqrt, ::hsqrt, ::h2sqrt)
 KERNEL_FLOAT_BF16_UNARY_FUN(trunc, ::htrunc, ::h2trunc)
-
-KERNEL_FLOAT_BF16_UNARY_FUN(fast_exp, ::hexp, ::h2exp)
-KERNEL_FLOAT_BF16_UNARY_FUN(fast_log, ::hlog, ::h2log)
-KERNEL_FLOAT_BF16_UNARY_FUN(fast_cos, ::hcos, ::h2cos)
-KERNEL_FLOAT_BF16_UNARY_FUN(fast_sin, ::hsin, ::h2sin)
+KERNEL_FLOAT_BF16_UNARY_FUN(rcp, ::hrcp, ::h2rcp)
 #endif
 
 #if KERNEL_FLOAT_CUDA_ARCH >= 800
@@ -114,10 +110,8 @@ KERNEL_FLOAT_BF16_BINARY_FUN(divide, __hdiv, __h2div)
 KERNEL_FLOAT_BF16_BINARY_FUN(min, __hmin, __hmin2)
 KERNEL_FLOAT_BF16_BINARY_FUN(max, __hmax, __hmax2)
 
-KERNEL_FLOAT_BF16_BINARY_FUN(fast_div, __hdiv, __h2div)
-
 KERNEL_FLOAT_BF16_BINARY_FUN(equal_to, __heq, __heq2)
-KERNEL_FLOAT_BF16_BINARY_FUN(not_equal_to, __heq, __heq2)
+KERNEL_FLOAT_BF16_BINARY_FUN(not_equal_to, __hneu, __hneu2)
 KERNEL_FLOAT_BF16_BINARY_FUN(less, __hlt, __hlt2)
 KERNEL_FLOAT_BF16_BINARY_FUN(less_equal, __hle, __hle2)
 KERNEL_FLOAT_BF16_BINARY_FUN(greater, __hgt, __hgt2)
 
@@ -7,9 +7,7 @@
 namespace kernel_float {
 
 template<typename F, typename L, typename R>
-using zip_type = vector<
-    result_t<F, vector_value_type<L>, vector_value_type<R>>,
-    broadcast_vector_extent_type<L, R>>;
+using zip_type = map_type<F, L, R>;
 
 /**
  * Combines the elements from the two inputs (`left` and `right`)  element-wise, applying a provided binary
@@ -25,20 +23,7 @@ using zip_type = vector<
  */
 template<typename F, typename L, typename R>
 KERNEL_FLOAT_INLINE zip_type<F, L, R> zip(F fun, const L& left, const R& right) {
-    using A = vector_value_type<L>;
-    using B = vector_value_type<R>;
-    using O = result_t<F, A, B>;
-    using E = broadcast_vector_extent_type<L, R>;
-    vector_storage<O, E::value> result;
-
-    detail::apply_impl<F, E::value, O, A, B>::call(
-        fun,
-        result.data(),
-        detail::broadcast_impl<A, vector_extent_type<L>, E>::call(into_vector_storage(left)).data(),
-        detail::broadcast_impl<B, vector_extent_type<R>, E>::call(into_vector_storage(right))
-            .data());
-
-    return result;
+    return ::kernel_float::map(fun, left, right);
 }
 
 template<typename F, typename L, typename R>
@@ -67,7 +52,14 @@ KERNEL_FLOAT_INLINE zip_common_type<F, L, R> zip_common(F fun, const L& left, co
 
     vector_storage<O, E::value> result;
 
-    detail::apply_impl<F, E::value, O, T, T>::call(
+// Use the `apply_fastmath_impl` if KERNEL_FLOAT_FAST_MATH is enabled
+#if KERNEL_FLOAT_FAST_MATH
+    using apply_impl = detail::apply_fastmath_impl<F, E::value, O, T, T>;
+#else
+    using apply_impl = detail::apply_impl<F, E::value, O, T, T>;
+#endif
+
+    apply_impl::call(
         fun,
         result.data(),
         detail::convert_impl<vector_value_type<L>, vector_extent_type<L>, T, E>::call(
@@ -277,36 +269,17 @@ KERNEL_FLOAT_DEFINE_BINARY(
 #if KERNEL_FLOAT_IS_DEVICE
 KERNEL_FLOAT_DEFINE_BINARY(
     rhypot,
-    (T(1) / ops::hypot<T>()(left, right)),
+    (ops::rcp<T>(ops::hypot<T>()(left, right))),
     ::rhypot(left, right),
     ::rhypotf(left, right))
 #else
 KERNEL_FLOAT_DEFINE_BINARY(
     rhypot,
-    (T(1) / ops::hypot<T>()(left, right)),
+    (ops::rcp<T>(ops::hypot<T>()(left, right))),
     (double(1) / ::hypot(left, right)),
     (float(1) / ::hypotf(left, right)))
 #endif
 
-#if KERNEL_FLOAT_IS_DEVICE
-#define KERNEL_FLOAT_DEFINE_BINARY_FAST(FUN_NAME, OP_NAME, FLOAT_FUN) \
-    KERNEL_FLOAT_DEFINE_BINARY(                                       \
-        FUN_NAME,                                                     \
-        ops::OP_NAME<T> {}(left, right),                              \
-        ops::OP_NAME<double> {}(left, right),                         \
-        ops::OP_NAME<float> {}(left, right))
-#else
-#define KERNEL_FLOAT_DEFINE_BINARY_FAST(FUN_NAME, OP_NAME, FLOAT_FUN) \
-    KERNEL_FLOAT_DEFINE_BINARY(                                       \
-        FUN_NAME,                                                     \
-        ops::OP_NAME<T> {}(left, right),                              \
-        ops::OP_NAME<double> {}(left, right),                         \
-        ops::OP_NAME<float> {}(left, right))
-#endif
-
-KERNEL_FLOAT_DEFINE_BINARY_FAST(fast_div, divide, __fdividef)
-KERNEL_FLOAT_DEFINE_BINARY_FAST(fast_pow, pow, __powf)
-
 namespace ops {
 template<>
 struct add<bool> {
@@ -323,6 +296,52 @@ struct multiply<bool> {
 };
 };  // namespace ops
 
+namespace detail {
+/**
+ * Fast-math specialization of `apply_fastmath_impl` for element-wise division of `N` elements of type `T`.
+ *
+ * Rather than dividing, the reciprocal of every element of `rhs` is computed into a temporary array and
+ * then multiplied element-wise with `lhs`; the products are written to `result`.
+ *
+ * The template arguments of `apply_fastmath_impl` are `<F, N, Output, Args...>`: the unary `ops::rcp<T>`
+ * therefore takes a single argument type after the output type, the binary `ops::multiply<T>` takes two.
+ */
+template<typename T, size_t N>
+struct apply_fastmath_impl<ops::divide<T>, N, T, T, T> {
+    KERNEL_FLOAT_INLINE static void
+    call(ops::divide<T> fun, T* result, const T* lhs, const T* rhs) {
+        // Temporary storage for the reciprocals of the divisors
+        T rhs_rcp[N];
+
+        // Fast way to perform division is to multiply by the reciprocal
+        apply_fastmath_impl<ops::rcp<T>, N, T, T>::call({}, rhs_rcp, rhs);
+        apply_fastmath_impl<ops::multiply<T>, N, T, T, T>::call({}, result, lhs, rhs_rcp);
+    }
+};
+
+#if KERNEL_FLOAT_IS_DEVICE
+template<size_t N>
+struct apply_fastmath_impl<ops::divide<float>, N, float, float, float> {
+    KERNEL_FLOAT_INLINE static void
+    call(ops::divide<float> fun, float* result, const float* lhs, const float* rhs) {
+#pragma unroll
+        for (size_t i = 0; i < N; i++) {
+            result[i] = __fdividef(lhs[i], rhs[i]);
+        }
+    }
+};
+#endif
+}  // namespace detail
+
+template<typename L, typename R, typename T = promoted_vector_value_type<L, R>>
+KERNEL_FLOAT_INLINE zip_common_type<ops::divide<T>, T, T>
+fast_divide(const L& left, const R& right) {
+    using E = broadcast_vector_extent_type<L, R>;
+    vector_storage<T, E::value> result;
+
+    detail::apply_fastmath_impl<ops::divide<T>, E::value, T, T, T>::call(
+        ops::divide<T> {},
+        result.data(),
+        detail::convert_impl<vector_value_type<L>, vector_extent_type<L>, T, E>::call(
+            into_vector_storage(left))
+            .data(),
+        detail::convert_impl<vector_value_type<R>, vector_extent_type<R>, T, E>::call(
+            into_vector_storage(right))
+            .data());
+
+    return result;
+}
+
 namespace detail {
 template<typename T>
 struct cross_impl {