Skip to content

Commit 534c4cd

Browse files
committed
Merge branch 'main' into dev
2 parents 86adf93 + 5208898 commit 534c4cd

File tree

4 files changed

+77
-103
lines changed

4 files changed

+77
-103
lines changed

kernel_tuner/example.cu

Lines changed: 0 additions & 12 deletions
This file was deleted.

kernel_tuner/example.py

Lines changed: 0 additions & 91 deletions
This file was deleted.

kernel_tuner/vector_add.cu

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
#include "kernel_float.h"
2+
namespace kf = kernel_float;
3+
4+
/**
 * Element-wise vector addition: c = a + b.
 *
 * Each thread processes one `kf::vec` holding `elements_per_thread`
 * consecutive values of type `float_type`. Neither `float_type` nor
 * `elements_per_thread` is defined in this file; they are presumably injected
 * at compile time by the tuner -- TODO confirm.
 *
 * Launch layout: 1D grid of 1D blocks. Threads whose first element lies at or
 * beyond `n` do nothing, so over-provisioned grids are safe.
 *
 * NOTE(review): the guard only tests the FIRST element of a thread's vector.
 * If `n` is not a multiple of `elements_per_thread`, the last active thread
 * still loads and stores a full vector that extends past `n`. Callers must
 * pass an `n` that is a multiple of `elements_per_thread` or pad the buffers.
 *
 * @param c Output buffer, viewed as vectors of `elements_per_thread` values.
 * @param a First input buffer, same layout as `c`.
 * @param b Second input buffer, same layout as `c`.
 * @param n Number of scalar elements (not vectors) to process.
 */
__global__ void vector_add(
    kf::vec<float_type, elements_per_thread>* c,
    const kf::vec<float_type, elements_per_thread>* a,
    const kf::vec<float_type, elements_per_thread>* b,
    int n
) {
    // Index arithmetic is done in 64 bits: with roughly one thread launched
    // per element, `i * elements_per_thread` can exceed INT_MAX for large
    // inputs, and a wrapped (negative) value would wrongly pass the bounds
    // check below and cause an out-of-bounds access.
    const long long i =
        static_cast<long long>(blockIdx.x) * blockDim.x + threadIdx.x;
    const long long first_element =
        i * static_cast<long long>(elements_per_thread);

    if (first_element < n) {
        c[i] = a[i] + b[i];
    }
}

kernel_tuner/vector_add.py

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
#!/usr/bin/env python
2+
import os
3+
4+
import numpy
5+
from kernel_tuner import tune_kernel
6+
from kernel_tuner.accuracy import TunablePrecision, AccuracyObserver
7+
8+
# Compiler flags handed to Kernel Tuner when it builds the kernel: the include
# path is resolved relative to the parent of this script's directory.
_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
ROOT_DIR = _SCRIPT_DIR + "/../"
flags = [
    f"-I{ROOT_DIR}/include",
    "-std=c++17",
]
11+
12+
def tune():
    """Tune the ``vector_add`` kernel over block size, precision and vector width.

    Builds three float64 arrays of 100,000,000 elements (two random inputs and
    a zeroed output), wraps them in ``TunablePrecision`` keyed on the
    ``float_type`` tunable parameter, and runs ``tune_kernel`` with RMSE and
    MRE accuracy observers against the reference answer ``a + b``.

    Returns:
        The ``(results, env)`` pair returned by ``tune_kernel``.
    """
    # Prepare input data
    size = 100000000
    n = numpy.int32(size)
    a = numpy.random.randn(size).astype(numpy.float64)
    b = numpy.random.randn(size).astype(numpy.float64)
    c = numpy.zeros_like(b)

    # Prepare the argument list of the kernel
    args = [
        TunablePrecision("float_type", c),
        TunablePrecision("float_type", a),
        TunablePrecision("float_type", b),
        n,
    ]

    # Define the reference answer to compute the kernel output against
    # (only the first argument, c, is an output and gets checked)
    answer = [a + b, None, None, None]

    # Define the tunable parameters, in this case thread block size,
    # the type to use for the input and output data of our kernel,
    # and how many elements each thread processes
    tune_params = dict()
    tune_params["block_size_x"] = [64, 128, 256, 512]
    tune_params["float_type"] = ["half", "float", "double"]
    tune_params["elements_per_thread"] = [1, 2, 4, 8]

    # Observers will measure the error using either RMSE or MRE as error metric
    observers = [
        AccuracyObserver("RMSE", "error_rmse"),
        AccuracyObserver("MRE", "error_relative"),
    ]

    # The metrics here are only to ensure Kernel Tuner prints them to the console
    metrics = dict(RMSE=lambda p: p["error_rmse"], MRE=lambda p: p["error_relative"])

    results, env = tune_kernel(
        "vector_add",
        "vector_add.cu",
        size,
        args,
        tune_params,
        # Each thread handles elements_per_thread values, so the grid must be
        # divided by that as well as by the block size. With the default
        # divisor (block_size_x only) one thread per element is launched and
        # most of them exit immediately, which distorts the measured times.
        grid_div_x=["block_size_x", "elements_per_thread"],
        answer=answer,
        observers=observers,
        metrics=metrics,
        lang="cupy",
        compiler_options=flags,
    )

    return results, env
60+
61+
62+
# Run the tuning session only when this file is executed as a script.
if __name__ == "__main__":
    tune()

0 commit comments

Comments
 (0)