
Commit ae31357

Fixes to GPU routines, improved tests and results
1 parent ad873e6 commit ae31357

13 files changed (+193, -43 lines)

README.md

Lines changed: 139 additions & 1 deletion
Large diffs are not rendered by default.

experiments/config.yaml

Lines changed: 3 additions & 3 deletions
@@ -1,6 +1,6 @@
 device: cpu
-size: 512
+size: 4096
 function:
-  routine: matmul_numba_block_serial
-  block_size: 24
+  routine: matmul_numba_serial
+  block_size: 32
 print: False
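Only a fragment of scripts/run.py appears later in this commit, so the loader below is a sketch: the PyYAML call, the ROUTINES lookup table, and the nested function keys are assumptions, while the (A, B, C, block_size) call signature mirrors the calls in test/test_shared.py.

import numpy as np
import yaml

# Hypothetical loader: the real config handling in scripts/run.py is not part
# of this diff. Routine names map to callables exported by the matmul package.
from matmul import matmul_numba_serial, matmul_numba_block_serial

ROUTINES = {
    "matmul_numba_serial": matmul_numba_serial,
    "matmul_numba_block_serial": matmul_numba_block_serial,
}

with open("experiments/config.yaml") as f:
    params = yaml.safe_load(f)

size = params["size"]                              # 4096 after this commit
routine = ROUTINES[params["function"]["routine"]]  # matmul_numba_serial
bs = params["function"]["block_size"]              # 32

A = np.random.rand(size, size)
B = np.random.rand(size, size)
C = np.zeros((size, size))
routine(A, B, C, bs)   # same (A, B, C, block_size) signature as in the tests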

figures/pie_1node_CPU.png (56.9 KB)
figures/pie_1node_GPU.png (65.2 KB)
figures/pie_4nodes_CPU.png (45.8 KB)
figures/pie_4nodes_GPU.png (70.7 KB)
figures/scaling_nodes.png (30.7 KB)
figures/scaling_size.png (44.8 KB)
figures/speedup.png (68 KB)

scripts/run.py

Lines changed: 4 additions & 1 deletion
@@ -135,8 +135,10 @@ def main_gpu(params: dict):
     a_d = cuda.to_device(A)
     c_d = cuda.to_device(C)
 
+    # each process at each step computes a block of C of size n_loc x ncols
+    # we set parameters for the kernel accordingly
     nthreads = bs
-    blocks_per_grid = ((n_loc + nthreads-1)//nthreads,(SIZE + nthreads-1)//nthreads)
+    blocks_per_grid = ((n_loc + nthreads-1)//nthreads,(ncols + nthreads-1)//nthreads)
     threads_per_block = (nthreads, nthreads)
 
     t_tot = 0
@@ -150,6 +152,7 @@ def main_gpu(params: dict):
 
     B_block = np.empty((n_loc,ncols), dtype=np.float64)
     B_col = np.empty((SIZE,ncols), dtype=np.float64)
+    blocks_per_grid = ((n_loc + nthreads-1)//nthreads,(ncols + nthreads-1)//nthreads)
 
     # create a contiguous block from B to communicate
     create_block(B, B_block, start, ncols)
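The grid fix above sizes the second launch dimension on ncols, the width of the C slice each step actually writes, instead of the full SIZE. The arithmetic below is a plain-Python sketch with illustrative numbers (SIZE, n_loc, ncols, nthreads are not taken from any run); assuming the kernel bounds-checks against the shape of the slice it receives, the old grid launched blocks that did no useful work.

# Ceil-division launch geometry, illustrative numbers only
SIZE, n_loc, ncols, nthreads = 4096, 1024, 1024, 32

def ceil_div(a, b):
    return (a + b - 1) // b

threads_per_block = (nthreads, nthreads)
old_grid = (ceil_div(n_loc, nthreads), ceil_div(SIZE, nthreads))   # (32, 128)
new_grid = (ceil_div(n_loc, nthreads), ceil_div(ncols, nthreads))  # (32, 32)

# The slice c_d[:, start:start+ncols] is only ncols columns wide, so the old
# grid over-provisioned the column dimension by a factor of SIZE // ncols.
print(old_grid, new_grid)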

shell/submit.sh

Lines changed: 2 additions & 2 deletions
@@ -4,5 +4,5 @@ rank=$OMPI_COMM_WORLD_RANK
 
 export NUMBA_NUM_THREADS=1
 
-# kernprof -lz -o "logs/time/gpu/256_rank_$rank.lprof" scripts/run.py --config experiments/config
-valgrind --tool=cachegrind --cache-sim=yes --cachegrind-out-file="logs/memory/512_naive_rank_$rank.log" python scripts/run.py --config experiments/config
+kernprof -lz -o "out_big_$rank.lprof" scripts/run.py --config experiments/config
+# valgrind --tool=cachegrind --cache-sim=yes --cachegrind-out-file="logs/memory/512_block_rank_$rank.log" python scripts/run.py --config experiments/config
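The submit script now runs each MPI rank under kernprof (line_profiler), writing one out_big_$rank.lprof per rank; the results can be inspected afterwards with python -m line_profiler out_big_0.lprof. kernprof -l only times functions decorated with @profile, a name it injects into builtins at run time. How scripts/run.py handles that decorator is not shown in this commit; a common guard, sketched below, keeps the script runnable when the profiler is absent.

# Sketch only: run.py's actual handling of @profile is not part of this diff.
try:
    profile  # injected into builtins by `kernprof -l`
except NameError:
    def profile(func):       # no-op fallback for normal, unprofiled runs
        return func

@profile
def main_gpu(params: dict):
    ...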

test/test_distributed.py

Lines changed: 17 additions & 15 deletions
@@ -3,7 +3,7 @@
 import numpy as np
 from numba import cuda
 
-from matmul import matmul, matmul_numba_gpu
+from matmul import matmul, matmul_numba_block_gpu
 from matmul.utils import create_block
 
 import mpi4py
@@ -40,11 +40,12 @@ def test_parallel_cpu():
     row_offset = np.cumsum(workloads)[rank-1] if rank > 0 else 0
 
     # initialise matrices somehow
-    A = np.arange(1, SIZE*n_loc + 1, dtype=np.float64).reshape((n_loc,SIZE)) + (row_offset * SIZE)
-    B = np.zeros((n_loc,SIZE), dtype=np.float64)
+    np.random.seed(0)
+    atot = np.random.rand(SIZE,SIZE)
+    btot = np.linalg.inv(atot)
+    A = atot[row_offset:row_offset+n_loc,:]
+    B = btot[row_offset:row_offset+n_loc,:]
     C = np.zeros((n_loc,SIZE), dtype=np.float64)
-    for i in range(n_loc):
-        B[i, i+row_offset] = 1
 
     # Compute quantities for Allgatherv and allocate required memory
     ncols = workloads[0]
@@ -80,12 +81,12 @@ def test_parallel_cpu():
     rcvcounts = workloads*SIZE
     displacements = np.cumsum(rcvcounts) - rcvcounts
     if rank == 0:
-        A_tot = np.arange(1, SIZE*SIZE + 1, dtype=np.float64).reshape((SIZE,SIZE))
+        target = np.eye(SIZE)
         C_tot = np.zeros((SIZE,SIZE))
     comm.Gatherv([C, MPI.DOUBLE], [C_tot, rcvcounts, displacements, MPI.DOUBLE])
 
     if rank == 0:
-        assert np.allclose(A_tot,C_tot)
+        assert np.allclose(target,C_tot)
 
     comm.Barrier()
 
@@ -117,11 +118,12 @@ def test_parallel_gpu():
     row_offset = np.cumsum(workloads)[rank-1] if rank > 0 else 0
 
    # initialise matrices somehow
-    A = np.arange(1, SIZE*n_loc + 1, dtype=np.float64).reshape((n_loc,SIZE)) + (row_offset * SIZE)
-    B = np.zeros((n_loc,SIZE), dtype=np.float64)
+    np.random.seed(0)
+    atot = np.random.rand(SIZE,SIZE)
+    btot = np.linalg.inv(atot)
+    A = atot[row_offset:row_offset+n_loc,:]
+    B = btot[row_offset:row_offset+n_loc,:]
     C = np.zeros((n_loc,SIZE), dtype=np.float64)
-    for i in range(n_loc):
-        B[i, i+row_offset] = 1
 
     # Compute quantities for Allgatherv and allocate required memory
     ncols = workloads[0]
@@ -151,9 +153,9 @@ def test_parallel_gpu():
 
     B_block = np.empty((n_loc,ncols), dtype=np.float64)
     B_col = np.empty((SIZE,ncols), dtype=np.float64)
-
     blocks_per_grid = ((n_loc + nthreads-1)//nthreads,(ncols + nthreads-1)//nthreads)
 
+
     # create a contiguous block from B to communicate
     create_block(B, B_block, start, ncols)
     # gather all pieces of B from other processes
@@ -163,7 +165,7 @@ def test_parallel_gpu():
     b_d = cuda.to_device(B_col)
 
     # multiply
-    matmul_numba_gpu[blocks_per_grid, threads_per_block](a_d,b_d,c_d[:,start:start+ncols])
+    matmul_numba_block_gpu[blocks_per_grid, threads_per_block](a_d,b_d,c_d[:,start:start+ncols])
 
     start += ncols
 
@@ -172,12 +174,12 @@ def test_parallel_gpu():
     rcvcounts = workloads*SIZE
     displacements = np.cumsum(rcvcounts) - rcvcounts
     if rank == 0:
-        A_tot = np.arange(1, SIZE*SIZE + 1, dtype=np.float64).reshape((SIZE,SIZE))
+        target = np.eye(SIZE)
         C_tot = np.zeros((SIZE,SIZE))
     comm.Gatherv([C, MPI.DOUBLE], [C_tot, rcvcounts, displacements, MPI.DOUBLE])
 
     if rank == 0:
-        assert np.allclose(A_tot,C_tot)
+        assert np.allclose(target,C_tot)
 
     comm.Barrier()
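Both distributed tests now seed NumPy, take A as a random SIZE x SIZE matrix and B as its inverse, give each rank a contiguous block of rows of each, and compare the gathered product against the identity; unlike the old identity-like B, this exercises every term of the row-times-column dot products. The MPI-free sketch below reproduces the idea: the even four-way row split is an illustrative assumption, while the row_offset formula is the one used in the tests.

import numpy as np

# Serial sketch of the distributed check: each "rank" owns n_loc rows of A
# and computes its rows of C = A @ A^-1; the assembled result must be I.
np.random.seed(0)
SIZE, procs = 16, 4
atot = np.random.rand(SIZE, SIZE)
btot = np.linalg.inv(atot)

workloads = np.full(procs, SIZE // procs)   # illustrative even split
C_tot = np.zeros((SIZE, SIZE))
for rank in range(procs):
    n_loc = workloads[rank]
    row_offset = np.cumsum(workloads)[rank-1] if rank > 0 else 0
    A = atot[row_offset:row_offset+n_loc, :]        # this rank's rows of A
    C_tot[row_offset:row_offset+n_loc, :] = A @ btot

assert np.allclose(np.eye(SIZE), C_tot)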

test/test_shared.py

Lines changed: 28 additions & 21 deletions
@@ -6,59 +6,65 @@
 
 def test_matmul():
     size = 20
-    A = np.arange(1,size*size+1,1,dtype=np.float64).reshape((size,size))
-    B = np.eye(size,dtype=np.float64)
+    np.random.seed(0)
+    A = np.random.rand(size,size)
+    B = np.linalg.inv(A)
     C = np.zeros((size,size),dtype=np.float64)
 
     matmul(A,B,C,None)
 
-    assert np.allclose(A,C)
+    assert np.allclose(np.eye(size),C)
 
 def test_matmul_numba_cpu():
     size = 20
-    A = np.arange(1,size*size+1,1,dtype=np.float64).reshape((size,size))
-    B = np.eye(size,dtype=np.float64)
+    np.random.seed(0)
+    A = np.random.rand(size,size)
+    B = np.linalg.inv(A)
     C = np.zeros((size,size),dtype=np.float64)
 
     matmul_numba_cpu(A,B,C,None)
 
-    assert np.allclose(A,C)
+    assert np.allclose(np.eye(size),C)
 
 def test_matmul_numba_serial():
     size = 20
-    A = np.arange(1,size*size+1,1,dtype=np.float64).reshape((size,size))
-    B = np.eye(size,dtype=np.float64)
+    np.random.seed(0)
+    A = np.random.rand(size,size)
+    B = np.linalg.inv(A)
     C = np.zeros((size,size),dtype=np.float64)
 
     matmul_numba_serial(A,B,C,None)
 
-    assert np.allclose(A,C)
+    assert np.allclose(np.eye(size),C)
 
 def test_matmul_numba_block_cpu():
     size = 20
-    A = np.arange(1,size*size+1,1,dtype=np.float64).reshape((size,size))
-    B = np.eye(size,dtype=np.float64)
+    np.random.seed(0)
+    A = np.random.rand(size,size)
+    B = np.linalg.inv(A)
     C = np.zeros((size,size),dtype=np.float64)
 
     matmul_numba_block_cpu(A,B,C,6)
 
-    assert np.allclose(A,C)
+    assert np.allclose(np.eye(size),C)
 
 def test_matmul_numba_block_serial():
     size = 20
-    A = np.arange(1,size*size+1,1,dtype=np.float64).reshape((size,size))
-    B = np.eye(size,dtype=np.float64)
+    np.random.seed(0)
+    A = np.random.rand(size,size)
+    B = np.linalg.inv(A)
     C = np.zeros((size,size),dtype=np.float64)
 
     matmul_numba_block_serial(A,B,C,6)
 
-    assert np.allclose(A,C)
+    assert np.allclose(np.eye(size),C)
 
 @pytest.mark.skipif((not numba.cuda.is_available()), reason='Could not find any CUDA GPU')
 def test_matmul_numba_gpu():
     size = 20
-    A = np.arange(1,size*size+1,1,dtype=np.float64).reshape((size,size))
-    B = np.eye(size,dtype=np.float64)
+    np.random.seed(0)
+    A = np.random.rand(size,size)
+    B = np.linalg.inv(A)
     C = np.zeros((size,size),dtype=np.float64)
 
     a_d = numba.cuda.to_device(A)
@@ -73,13 +79,14 @@ def test_matmul_numba_gpu():
 
     C = c_d.copy_to_host()
 
-    assert np.allclose(A,C)
+    assert np.allclose(np.eye(size),C)
 
 @pytest.mark.skipif((not numba.cuda.is_available()), reason='Could not find any CUDA GPU')
 def test_matmul_numba_block_gpu():
     size = 20
-    A = np.arange(1,size*size+1,1,dtype=np.float64).reshape((size,size))
-    B = np.eye(size,dtype=np.float64)
+    np.random.seed(0)
+    A = np.random.rand(size,size)
+    B = np.linalg.inv(A)
     C = np.zeros((size,size),dtype=np.float64)
 
     a_d = numba.cuda.to_device(A)
@@ -94,6 +101,6 @@ def test_matmul_numba_block_gpu():
 
     C = c_d.copy_to_host()
 
-    assert np.allclose(A,C)
+    assert np.allclose(np.eye(size),C)
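The GPU tests follow the usual Numba CUDA host pattern: copy inputs with numba.cuda.to_device, launch with an explicit [blocks_per_grid, threads_per_block] configuration, and read the result back with copy_to_host. The matmul_numba_gpu and matmul_numba_block_gpu kernels themselves are not part of this diff; the kernel below is a minimal stand-in of the same shape (one thread per element of C, guarded by a bounds check), not the repository's implementation.

import numpy as np
from numba import cuda

@cuda.jit
def matmul_gpu_sketch(A, B, C):
    # one thread per output element; the bounds check lets the grid be
    # rounded up to whole blocks without writing outside C
    i, j = cuda.grid(2)
    if i < C.shape[0] and j < C.shape[1]:
        acc = 0.0
        for k in range(A.shape[1]):
            acc += A[i, k] * B[k, j]
        C[i, j] = acc

if cuda.is_available():
    size, nthreads = 20, 16
    np.random.seed(0)
    A = np.random.rand(size, size)
    B = np.linalg.inv(A)

    a_d = cuda.to_device(A)
    b_d = cuda.to_device(B)
    c_d = cuda.to_device(np.zeros((size, size)))

    threads_per_block = (nthreads, nthreads)
    blocks_per_grid = ((size + nthreads - 1) // nthreads,
                       (size + nthreads - 1) // nthreads)
    matmul_gpu_sketch[blocks_per_grid, threads_per_block](a_d, b_d, c_d)

    assert np.allclose(np.eye(size), c_d.copy_to_host())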