Skip to content

Commit beec70f

Browse files
Merge pull request #1 from daisytuner/initial-examples
adds initial examples
2 parents 1aecfa7 + b1b2916 commit beec70f

File tree

11 files changed

+357
-0
lines changed

11 files changed

+357
-0
lines changed

.daisy/c_docc.yml

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
on:
2+
push:
3+
branches:
4+
- main
5+
pull_request:
6+
types: [opened, reopened, synchronize, ready_for_review]
7+
8+
parameters:
9+
timeout: 20
10+
partitions:
11+
- bellis5
12+
13+
steps:
14+
build: |
15+
docc -Xclang -no-opaque-pointers -g -O1 -mllvm -hotspot -o c/matmul.out c/matmul.c -ldaisy_rtl
16+
17+
run:
18+
matmul:
19+
command: ./c/matmul.out
20+
measurements: 5
21+
profiler: perf
22+
loops: true
23+
metrics:
24+
- flops_dp
25+
- memory_bandwidth

.daisy/c_gcc.yml

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
on:
2+
push:
3+
branches:
4+
- main
5+
pull_request:
6+
types: [opened, reopened, synchronize, ready_for_review]
7+
8+
parameters:
9+
timeout: 20
10+
partitions:
11+
- bellis5
12+
13+
steps:
14+
build: |
15+
gcc -g -O1 -o c/matmul.out c/matmul.c
16+
17+
run:
18+
matmul:
19+
command: ./c/matmul.out
20+
measurements: 5
21+
profiler: perf
22+
metrics:
23+
- flops_dp
24+
- memory_bandwidth

.daisy/cuda.yml

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
on:
2+
push:
3+
branches:
4+
- main
5+
pull_request:
6+
types: [opened, reopened, synchronize, ready_for_review]
7+
8+
parameters:
9+
timeout: 20
10+
partitions:
11+
- zinnia
12+
13+
steps:
14+
build: |
15+
nvcc -O1 -g -o cuda/matmul.out cuda/matmul.cu
16+
17+
run:
18+
matmul_cu:
19+
command: ./cuda/matmul.out
20+
measurements: 5
21+
profiler: nsys
22+
kernels: true

.daisy/onnx.yml

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
on:
2+
push:
3+
branches:
4+
- main
5+
pull_request:
6+
types: [opened, reopened, synchronize, ready_for_review]
7+
8+
parameters:
9+
timeout: 20
10+
partitions:
11+
- tansy
12+
13+
steps:
14+
build: |
15+
16+
run:
17+
squeezenet:
18+
model: /data/squeezenet1.onnx
19+
measurements: 1

.daisy/python_apt.yml

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
on:
2+
push:
3+
branches:
4+
- main
5+
pull_request:
6+
types: [opened, reopened, synchronize, ready_for_review]
7+
8+
parameters:
9+
timeout: 20
10+
partitions:
11+
- bellis5
12+
13+
steps:
14+
build: |
15+
sudo apt-get install -y python3-numpy
16+
17+
run:
18+
matmul:
19+
command: python3 python/matmul_np.py
20+
measurements: 5
21+
profiler: py-spy

.daisy/python_conda.yml

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
on:
2+
push:
3+
branches:
4+
- main
5+
pull_request:
6+
types: [opened, reopened, synchronize, ready_for_review]
7+
8+
parameters:
9+
timeout: 20
10+
conda: "3.12"
11+
partitions:
12+
- bellis5
13+
14+
steps:
15+
build: |
16+
pip install numpy
17+
18+
run:
19+
matmul:
20+
command: python python/matmul_np.py
21+
measurements: 5
22+
profiler: perf

.daisy/python_venv.yml

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
on:
2+
push:
3+
branches:
4+
- main
5+
pull_request:
6+
types: [opened, reopened, synchronize, ready_for_review]
7+
8+
parameters:
9+
timeout: 20
10+
partitions:
11+
- bellis5
12+
13+
steps:
14+
build: |
15+
python3 -m venv venv
16+
. venv/bin/activate
17+
18+
pip install numpy
19+
20+
run:
21+
matmul:
22+
command: venv/bin/python3 python/matmul_np.py
23+
measurements: 5
24+
profiler: py-spy

.daisy/tflite_coral.yml

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
on:
2+
push:
3+
branches:
4+
- main
5+
pull_request:
6+
types: [opened, reopened, synchronize, ready_for_review]
7+
8+
parameters:
9+
timeout: 60
10+
conda: "3.8"
11+
partitions:
12+
- bellis4
13+
14+
steps:
15+
build: |
16+
sudo add-apt-repository 'deb https://packages.cloud.google.com/apt coral-edgetpu-stable main'
17+
curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | sudo apt-key add -
18+
sudo apt-get update
19+
sudo apt-get install -y libedgetpu1-std
20+
sudo apt-get install -y usbutils
21+
22+
23+
sudo apt install -y libjpeg-dev zlib1g-dev
24+
25+
wget https://github.com/google-coral/pycoral/releases/download/v2.0.0/tflite_runtime-2.5.0.post1-cp38-cp38-linux_aarch64.whl
26+
wget https://github.com/google-coral/pycoral/releases/download/v2.0.0/pycoral-2.0.0-cp38-cp38-linux_aarch64.whl
27+
28+
pip install Pillow==9.5.0
29+
pip install tflite_runtime-2.5.0.post1-cp38-cp38-linux_aarch64.whl
30+
pip install pycoral-2.0.0-cp38-cp38-linux_aarch64.whl
31+
32+
git clone --recurse-submodules --branch v2.0.0 --depth 1 https://github.com/google-coral/pycoral
33+
cd pycoral
34+
35+
bash examples/install_requirements.sh classify_image.py
36+
37+
run:
38+
list_usb:
39+
command: lsusb | grep Google
40+
classify_image:
41+
command: python pycoral/examples/classify_image.py --model pycoral/test_data/mobilenet_v2_1.0_224_inat_bird_quant_edgetpu.tflite --labels pycoral/test_data/inat_bird_labels.txt --input pycoral/test_data/parrot.jpg

c/matmul.c

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
#include <stdio.h>
2+
3+
#define N 64
4+
#define M 64
5+
#define K 64
6+
7+
int main() {
8+
int A[N][K];
9+
int B[K][M];
10+
int C[N][M];
11+
12+
// Init
13+
for (int i = 0; i < N; i++) {
14+
for (int j = 0; j < K; j++) {
15+
A[i][j] = i + j;
16+
}
17+
}
18+
for (int i = 0; i < K; i++) {
19+
for (int j = 0; j < M; j++) {
20+
B[i][j] = i + j;
21+
}
22+
}
23+
for (int i = 0; i < N; i++) {
24+
for (int j = 0; j < M; j++) {
25+
C[i][j] = 0;
26+
}
27+
}
28+
29+
// Matmul
30+
for (int i = 0; i < N; i++) {
31+
for (int j = 0; j < M; j++) {
32+
for (int k = 0; k < K; k++) {
33+
C[i][j] += A[i][k] * B[k][j];
34+
}
35+
}
36+
}
37+
38+
// Print
39+
for (int i = 0; i < N; i++) {
40+
for (int j = 0; j < M; j++) {
41+
printf("%d ", C[i][j]);
42+
}
43+
printf("\n");
44+
}
45+
46+
return 0;
47+
}

cuda/matmul.cu

Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
#include <cuda_runtime.h>
2+
#include <stdio.h>
3+
#include <stdlib.h>
4+
5+
#define TILE_SIZE 16
6+
7+
/**
 * Reports a failed CUDA runtime call and optionally terminates the process.
 *
 * Does nothing when `code` is cudaSuccess. Otherwise prints the CUDA error
 * string followed by the given file name and line number to stderr and, when
 * `abort` is true (the default), exits using the error code as exit status.
 *
 * @param code  Status returned by a CUDA runtime call.
 * @param file  Source file name to include in the message.
 * @param line  Source line number to include in the message.
 * @param abort Whether to terminate the process on error.
 */
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true) {
    if (code == cudaSuccess) {
        return;
    }

    fprintf(stderr, "CUDA error: %s %s %d\n", cudaGetErrorString(code), file, line);

    if (abort) {
        exit(code);
    }
}
13+
14+
/**
 * Tiled matrix multiplication kernel: C = A * B for square N x N matrices
 * stored row-major in flat float arrays.
 *
 * Each thread produces at most one element of C. The block cooperatively
 * stages one TILE_SIZE x TILE_SIZE tile of A and one of B in shared memory per
 * iteration; out-of-range tile entries are filled with 0.0f so N does not have
 * to be a multiple of TILE_SIZE.
 *
 * Launch layout: the shared tiles are indexed directly by threadIdx, so the
 * block is expected to be TILE_SIZE x TILE_SIZE threads, with a 2D grid that
 * covers ceil(N / TILE_SIZE) blocks in each dimension.
 */
__global__ void matrixMulKernel(float *A, float *B, float *C, int N) {
    __shared__ float tileA[TILE_SIZE][TILE_SIZE];
    __shared__ float tileB[TILE_SIZE][TILE_SIZE];

    const int localCol = threadIdx.x;
    const int localRow = threadIdx.y;
    const int globalRow = blockIdx.y * TILE_SIZE + localRow;
    const int globalCol = blockIdx.x * TILE_SIZE + localCol;
    const int numTiles = (N + TILE_SIZE - 1) / TILE_SIZE;

    float acc = 0.0f;

    for (int tile = 0; tile < numTiles; ++tile) {
        // Column of A / row of B that this thread stages for the current tile.
        const int aCol = tile * TILE_SIZE + localCol;
        const int bRow = tile * TILE_SIZE + localRow;

        // Zero-pad anything that falls outside the matrix.
        tileA[localRow][localCol] =
            (globalRow < N && aCol < N) ? A[globalRow * N + aCol] : 0.0f;
        tileB[localRow][localCol] =
            (globalCol < N && bRow < N) ? B[bRow * N + globalCol] : 0.0f;

        // Both tiles must be fully written before any thread reads them.
        __syncthreads();

        for (int k = 0; k < TILE_SIZE; ++k) {
            acc += tileA[localRow][k] * tileB[k][localCol];
        }

        // Do not overwrite the tiles while other threads are still reading.
        __syncthreads();
    }

    // Threads outside the matrix only helped with loading; they store nothing.
    if (globalRow >= N || globalCol >= N) {
        return;
    }
    C[globalRow * N + globalCol] = acc;
}
46+
47+
/**
 * Host wrapper that computes h_C = h_A * h_B on the GPU.
 *
 * All three matrices are square N x N, stored row-major in host memory as
 * flat float arrays of N * N elements. The function allocates device buffers,
 * copies the inputs over, launches matrixMulKernel with TILE_SIZE x TILE_SIZE
 * blocks, waits for completion, copies the result back and releases the
 * device buffers.
 *
 * Every CUDA runtime call is checked through gpuAssert, which prints the
 * error and exits the process on failure (its default behaviour), so a failed
 * allocation or copy can no longer go unnoticed and yield garbage in h_C.
 *
 * @param h_A Host pointer to the left operand (read).
 * @param h_B Host pointer to the right operand (read).
 * @param h_C Host pointer receiving the product (written).
 * @param N   Matrix dimension; for N <= 0 the function returns immediately.
 */
void matrixMultiply(float *h_A, float *h_B, float *h_C, int N) {
    // Nothing to compute; this also avoids launching with a zero-sized grid,
    // which is an invalid launch configuration.
    if (N <= 0) {
        return;
    }

    float *d_A = NULL, *d_B = NULL, *d_C = NULL;

    // Widen before multiplying so N * N is not evaluated in (overflowing) int.
    size_t size = (size_t)N * (size_t)N * sizeof(float);

    gpuAssert(cudaMalloc(&d_A, size), __FILE__, __LINE__);
    gpuAssert(cudaMalloc(&d_B, size), __FILE__, __LINE__);
    gpuAssert(cudaMalloc(&d_C, size), __FILE__, __LINE__);

    gpuAssert(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice), __FILE__, __LINE__);
    gpuAssert(cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice), __FILE__, __LINE__);

    // One thread per output element; ceil-div so partial tiles are covered.
    dim3 blockSize(TILE_SIZE, TILE_SIZE);
    dim3 gridSize((N + TILE_SIZE - 1) / TILE_SIZE, (N + TILE_SIZE - 1) / TILE_SIZE);

    matrixMulKernel<<<gridSize, blockSize>>>(d_A, d_B, d_C, N);

    // Launch-configuration errors are reported by cudaGetLastError right after
    // the launch; errors raised while the kernel runs surface at the sync.
    gpuAssert(cudaGetLastError(), __FILE__, __LINE__);
    gpuAssert(cudaDeviceSynchronize(), __FILE__, __LINE__);

    gpuAssert(cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost), __FILE__, __LINE__);

    gpuAssert(cudaFree(d_A), __FILE__, __LINE__);
    gpuAssert(cudaFree(d_B), __FILE__, __LINE__);
    gpuAssert(cudaFree(d_C), __FILE__, __LINE__);
}
72+
73+
/**
 * Example driver: multiplies two 64 x 64 matrices of pseudo-random single
 * digit values on the GPU and prints the product.
 *
 * rand() is used without seeding, so the inputs (and therefore the output)
 * are the same on every run with a given C library.
 *
 * @return 0 on success, EXIT_FAILURE if a host buffer cannot be allocated.
 */
int main() {
    int N = 64; // Matrix size

    // Widen before multiplying so N * N is not evaluated in int.
    size_t size = (size_t)N * (size_t)N * sizeof(float);

    float *h_A = (float *)malloc(size);
    float *h_B = (float *)malloc(size);
    float *h_C = (float *)malloc(size);

    // Bail out instead of dereferencing a null pointer below.
    if (h_A == NULL || h_B == NULL || h_C == NULL) {
        fprintf(stderr, "Host allocation of %zu bytes failed\n", size);
        free(h_A); // free(NULL) is a no-op, so this is safe for any subset
        free(h_B);
        free(h_C);
        return EXIT_FAILURE;
    }

    // Fill both inputs with values in [0, 9]; A and B draw alternately from
    // the same rand() sequence.
    for (int i = 0; i < N * N; i++) {
        h_A[i] = rand() % 10;
        h_B[i] = rand() % 10;
    }

    matrixMultiply(h_A, h_B, h_C, N);

    printf("Result matrix:\n");
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            printf("%0.1f ", h_C[i * N + j]);
        }
        printf("\n");
    }

    free(h_A);
    free(h_B);
    free(h_C);

    return 0;
}

0 commit comments

Comments
 (0)