
Commit ae31357

Fixes to GPU routines, improved tests and results
1 parent ad873e6 commit ae31357

13 files changed (+193, -43 lines)

README.md

Lines changed: 139 additions & 1 deletion
Large diffs are not rendered by default.

experiments/config.yaml

Lines changed: 3 additions & 3 deletions
@@ -1,6 +1,6 @@
 device: cpu
-size: 512
+size: 4096
 function:
-  routine: matmul_numba_block_serial
-  block_size: 24
+  routine: matmul_numba_serial
+  block_size: 32
 print: False
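Only a fragment of scripts/run.py appears later in this commit, so the loader below is a sketch: the PyYAML call, the ROUTINES lookup table, and the nested function keys are assumptions, while the (A, B, C, block_size) call signature mirrors the calls in test/test_shared.py.

import numpy as np
import yaml

# Hypothetical loader: the real config handling in scripts/run.py is not part
# of this diff. Routine names map to callables exported by the matmul package.
from matmul import matmul_numba_serial, matmul_numba_block_serial

ROUTINES = {
    "matmul_numba_serial": matmul_numba_serial,
    "matmul_numba_block_serial": matmul_numba_block_serial,
}

with open("experiments/config.yaml") as f:
    params = yaml.safe_load(f)

size = params["size"]                              # 4096 after this commit
routine = ROUTINES[params["function"]["routine"]]  # matmul_numba_serial
bs = params["function"]["block_size"]              # 32

A = np.random.rand(size, size)
B = np.random.rand(size, size)
C = np.zeros((size, size))
routine(A, B, C, bs)   # same (A, B, C, block_size) signature as in the tests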

figures/pie_1node_CPU.png (56.9 KB)
figures/pie_1node_GPU.png (65.2 KB)
figures/pie_4nodes_CPU.png (45.8 KB)
figures/pie_4nodes_GPU.png (70.7 KB)
figures/scaling_nodes.png (30.7 KB)
figures/scaling_size.png (44.8 KB)
figures/speedup.png (68 KB)

scripts/run.py

Lines changed: 4 additions & 1 deletion
@@ -135,8 +135,10 @@ def main_gpu(params: dict):
     a_d = cuda.to_device(A)
     c_d = cuda.to_device(C)
 
+    # each process at each step computes a block of C of size n_loc x ncols
+    # we set parameters for the kernel accordingly
     nthreads = bs
-    blocks_per_grid = ((n_loc + nthreads-1)//nthreads,(SIZE + nthreads-1)//nthreads)
+    blocks_per_grid = ((n_loc + nthreads-1)//nthreads,(ncols + nthreads-1)//nthreads)
     threads_per_block = (nthreads, nthreads)
 
     t_tot = 0
@@ -150,6 +152,7 @@ def main_gpu(params: dict):
 
     B_block = np.empty((n_loc,ncols), dtype=np.float64)
     B_col = np.empty((SIZE,ncols), dtype=np.float64)
+    blocks_per_grid = ((n_loc + nthreads-1)//nthreads,(ncols + nthreads-1)//nthreads)
 
     # create a contiguous block from B to communicate
     create_block(B, B_block, start, ncols)
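The grid fix above sizes the second launch dimension on ncols, the width of the C slice each step actually writes, instead of the full SIZE. The arithmetic below is a plain-Python sketch with illustrative numbers (SIZE, n_loc, ncols, nthreads are not taken from any run); assuming the kernel bounds-checks against the shape of the slice it receives, the old grid launched blocks that did no useful work.

# Ceil-division launch geometry, illustrative numbers only
SIZE, n_loc, ncols, nthreads = 4096, 1024, 1024, 32

def ceil_div(a, b):
    return (a + b - 1) // b

threads_per_block = (nthreads, nthreads)
old_grid = (ceil_div(n_loc, nthreads), ceil_div(SIZE, nthreads))   # (32, 128)
new_grid = (ceil_div(n_loc, nthreads), ceil_div(ncols, nthreads))  # (32, 32)

# The slice c_d[:, start:start+ncols] is only ncols columns wide, so the old
# grid over-provisioned the column dimension by a factor of SIZE // ncols.
print(old_grid, new_grid)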

shell/submit.sh

Lines changed: 2 additions & 2 deletions
@@ -4,5 +4,5 @@ rank=$OMPI_COMM_WORLD_RANK
 
 export NUMBA_NUM_THREADS=1
 
-# kernprof -lz -o "logs/time/gpu/256_rank_$rank.lprof" scripts/run.py --config experiments/config
-valgrind --tool=cachegrind --cache-sim=yes --cachegrind-out-file="logs/memory/512_naive_rank_$rank.log" python scripts/run.py --config experiments/config
+kernprof -lz -o "out_big_$rank.lprof" scripts/run.py --config experiments/config
+# valgrind --tool=cachegrind --cache-sim=yes --cachegrind-out-file="logs/memory/512_block_rank_$rank.log" python scripts/run.py --config experiments/config
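The submit script now runs each MPI rank under kernprof (line_profiler), writing one out_big_$rank.lprof per rank; the results can be inspected afterwards with python -m line_profiler out_big_0.lprof. kernprof -l only times functions decorated with @profile, a name it injects into builtins at run time. How scripts/run.py handles that decorator is not shown in this commit; a common guard, sketched below, keeps the script runnable when the profiler is absent.

# Sketch only: run.py's actual handling of @profile is not part of this diff.
try:
    profile  # injected into builtins by `kernprof -l`
except NameError:
    def profile(func):       # no-op fallback for normal, unprofiled runs
        return func

@profile
def main_gpu(params: dict):
    ...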

test/test_distributed.py

Lines changed: 17 additions & 15 deletions
@@ -3,7 +3,7 @@
 import numpy as np
 from numba import cuda
 
-from matmul import matmul, matmul_numba_gpu
+from matmul import matmul, matmul_numba_block_gpu
 from matmul.utils import create_block
 
 import mpi4py
@@ -40,11 +40,12 @@ def test_parallel_cpu():
     row_offset = np.cumsum(workloads)[rank-1] if rank > 0 else 0
 
     # initialise matrices somehow
-    A = np.arange(1, SIZE*n_loc + 1, dtype=np.float64).reshape((n_loc,SIZE)) + (row_offset * SIZE)
-    B = np.zeros((n_loc,SIZE), dtype=np.float64)
+    np.random.seed(0)
+    atot = np.random.rand(SIZE,SIZE)
+    btot = np.linalg.inv(atot)
+    A = atot[row_offset:row_offset+n_loc,:]
+    B = btot[row_offset:row_offset+n_loc,:]
     C = np.zeros((n_loc,SIZE), dtype=np.float64)
-    for i in range(n_loc):
-        B[i, i+row_offset] = 1
 
     # Compute quantities for Allgatherv and allocate required memory
     ncols = workloads[0]
@@ -80,12 +81,12 @@ def test_parallel_cpu():
     rcvcounts = workloads*SIZE
     displacements = np.cumsum(rcvcounts) - rcvcounts
     if rank == 0:
-        A_tot = np.arange(1, SIZE*SIZE + 1, dtype=np.float64).reshape((SIZE,SIZE))
+        target = np.eye(SIZE)
         C_tot = np.zeros((SIZE,SIZE))
     comm.Gatherv([C, MPI.DOUBLE], [C_tot, rcvcounts, displacements, MPI.DOUBLE])
 
     if rank == 0:
-        assert np.allclose(A_tot,C_tot)
+        assert np.allclose(target,C_tot)
 
     comm.Barrier()
 
@@ -117,11 +118,12 @@ def test_parallel_gpu():
     row_offset = np.cumsum(workloads)[rank-1] if rank > 0 else 0
 
    # initialise matrices somehow
-    A = np.arange(1, SIZE*n_loc + 1, dtype=np.float64).reshape((n_loc,SIZE)) + (row_offset * SIZE)
-    B = np.zeros((n_loc,SIZE), dtype=np.float64)
+    np.random.seed(0)
+    atot = np.random.rand(SIZE,SIZE)
+    btot = np.linalg.inv(atot)
+    A = atot[row_offset:row_offset+n_loc,:]
+    B = btot[row_offset:row_offset+n_loc,:]
     C = np.zeros((n_loc,SIZE), dtype=np.float64)
-    for i in range(n_loc):
-        B[i, i+row_offset] = 1
 
     # Compute quantities for Allgatherv and allocate required memory
     ncols = workloads[0]
@@ -151,9 +153,9 @@ def test_parallel_gpu():
 
     B_block = np.empty((n_loc,ncols), dtype=np.float64)
     B_col = np.empty((SIZE,ncols), dtype=np.float64)
-
     blocks_per_grid = ((n_loc + nthreads-1)//nthreads,(ncols + nthreads-1)//nthreads)
 
+
     # create a contiguous block from B to communicate
     create_block(B, B_block, start, ncols)
     # gather all pieces of B from other processes
@@ -163,7 +165,7 @@ def test_parallel_gpu():
     b_d = cuda.to_device(B_col)
 
     # multiply
-    matmul_numba_gpu[blocks_per_grid, threads_per_block](a_d,b_d,c_d[:,start:start+ncols])
+    matmul_numba_block_gpu[blocks_per_grid, threads_per_block](a_d,b_d,c_d[:,start:start+ncols])
 
     start += ncols
 
@@ -172,12 +174,12 @@ def test_parallel_gpu():
     rcvcounts = workloads*SIZE
     displacements = np.cumsum(rcvcounts) - rcvcounts
     if rank == 0:
-        A_tot = np.arange(1, SIZE*SIZE + 1, dtype=np.float64).reshape((SIZE,SIZE))
+        target = np.eye(SIZE)
         C_tot = np.zeros((SIZE,SIZE))
     comm.Gatherv([C, MPI.DOUBLE], [C_tot, rcvcounts, displacements, MPI.DOUBLE])
 
     if rank == 0:
-        assert np.allclose(A_tot,C_tot)
+        assert np.allclose(target,C_tot)
 
     comm.Barrier()
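Both distributed tests now seed NumPy, take A as a random SIZE x SIZE matrix and B as its inverse, give each rank a contiguous block of rows of each, and compare the gathered product against the identity; unlike the old identity-like B, this exercises every term of the row-times-column dot products. The MPI-free sketch below reproduces the idea: the even four-way row split is an illustrative assumption, while the row_offset formula is the one used in the tests.

import numpy as np

# Serial sketch of the distributed check: each "rank" owns n_loc rows of A
# and computes its rows of C = A @ A^-1; the assembled result must be I.
np.random.seed(0)
SIZE, procs = 16, 4
atot = np.random.rand(SIZE, SIZE)
btot = np.linalg.inv(atot)

workloads = np.full(procs, SIZE // procs)   # illustrative even split
C_tot = np.zeros((SIZE, SIZE))
for rank in range(procs):
    n_loc = workloads[rank]
    row_offset = np.cumsum(workloads)[rank-1] if rank > 0 else 0
    A = atot[row_offset:row_offset+n_loc, :]        # this rank's rows of A
    C_tot[row_offset:row_offset+n_loc, :] = A @ btot

assert np.allclose(np.eye(SIZE), C_tot)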

test/test_shared.py

Lines changed: 28 additions & 21 deletions
@@ -6,59 +6,65 @@
 
 def test_matmul():
     size = 20
-    A = np.arange(1,size*size+1,1,dtype=np.float64).reshape((size,size))
-    B = np.eye(size,dtype=np.float64)
+    np.random.seed(0)
+    A = np.random.rand(size,size)
+    B = np.linalg.inv(A)
     C = np.zeros((size,size),dtype=np.float64)
 
     matmul(A,B,C,None)
 
-    assert np.allclose(A,C)
+    assert np.allclose(np.eye(size),C)
 
 def test_matmul_numba_cpu():
     size = 20
-    A = np.arange(1,size*size+1,1,dtype=np.float64).reshape((size,size))
-    B = np.eye(size,dtype=np.float64)
+    np.random.seed(0)
+    A = np.random.rand(size,size)
+    B = np.linalg.inv(A)
     C = np.zeros((size,size),dtype=np.float64)
 
     matmul_numba_cpu(A,B,C,None)
 
-    assert np.allclose(A,C)
+    assert np.allclose(np.eye(size),C)
 
 def test_matmul_numba_serial():
     size = 20
-    A = np.arange(1,size*size+1,1,dtype=np.float64).reshape((size,size))
-    B = np.eye(size,dtype=np.float64)
+    np.random.seed(0)
+    A = np.random.rand(size,size)
+    B = np.linalg.inv(A)
     C = np.zeros((size,size),dtype=np.float64)
 
     matmul_numba_serial(A,B,C,None)
 
-    assert np.allclose(A,C)
+    assert np.allclose(np.eye(size),C)
 
 def test_matmul_numba_block_cpu():
     size = 20
-    A = np.arange(1,size*size+1,1,dtype=np.float64).reshape((size,size))
-    B = np.eye(size,dtype=np.float64)
+    np.random.seed(0)
+    A = np.random.rand(size,size)
+    B = np.linalg.inv(A)
     C = np.zeros((size,size),dtype=np.float64)
 
     matmul_numba_block_cpu(A,B,C,6)
 
-    assert np.allclose(A,C)
+    assert np.allclose(np.eye(size),C)
 
 def test_matmul_numba_block_serial():
     size = 20
-    A = np.arange(1,size*size+1,1,dtype=np.float64).reshape((size,size))
-    B = np.eye(size,dtype=np.float64)
+    np.random.seed(0)
+    A = np.random.rand(size,size)
+    B = np.linalg.inv(A)
     C = np.zeros((size,size),dtype=np.float64)
 
     matmul_numba_block_serial(A,B,C,6)
 
-    assert np.allclose(A,C)
+    assert np.allclose(np.eye(size),C)
 
 @pytest.mark.skipif((not numba.cuda.is_available()), reason='Could not find any CUDA GPU')
 def test_matmul_numba_gpu():
     size = 20
-    A = np.arange(1,size*size+1,1,dtype=np.float64).reshape((size,size))
-    B = np.eye(size,dtype=np.float64)
+    np.random.seed(0)
+    A = np.random.rand(size,size)
+    B = np.linalg.inv(A)
     C = np.zeros((size,size),dtype=np.float64)
 
     a_d = numba.cuda.to_device(A)
@@ -73,13 +79,14 @@ def test_matmul_numba_gpu():
 
     C = c_d.copy_to_host()
 
-    assert np.allclose(A,C)
+    assert np.allclose(np.eye(size),C)
 
 @pytest.mark.skipif((not numba.cuda.is_available()), reason='Could not find any CUDA GPU')
 def test_matmul_numba_block_gpu():
     size = 20
-    A = np.arange(1,size*size+1,1,dtype=np.float64).reshape((size,size))
-    B = np.eye(size,dtype=np.float64)
+    np.random.seed(0)
+    A = np.random.rand(size,size)
+    B = np.linalg.inv(A)
     C = np.zeros((size,size),dtype=np.float64)
 
     a_d = numba.cuda.to_device(A)
@@ -94,6 +101,6 @@ def test_matmul_numba_block_gpu():
 
     C = c_d.copy_to_host()
 
-    assert np.allclose(A,C)
+    assert np.allclose(np.eye(size),C)
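The GPU tests follow the usual Numba CUDA host pattern: copy inputs with numba.cuda.to_device, launch with an explicit [blocks_per_grid, threads_per_block] configuration, and read the result back with copy_to_host. The matmul_numba_gpu and matmul_numba_block_gpu kernels themselves are not part of this diff; the kernel below is a minimal stand-in of the same shape (one thread per element of C, guarded by a bounds check), not the repository's implementation.

import numpy as np
from numba import cuda

@cuda.jit
def matmul_gpu_sketch(A, B, C):
    # one thread per output element; the bounds check lets the grid be
    # rounded up to whole blocks without writing outside C
    i, j = cuda.grid(2)
    if i < C.shape[0] and j < C.shape[1]:
        acc = 0.0
        for k in range(A.shape[1]):
            acc += A[i, k] * B[k, j]
        C[i, j] = acc

if cuda.is_available():
    size, nthreads = 20, 16
    np.random.seed(0)
    A = np.random.rand(size, size)
    B = np.linalg.inv(A)

    a_d = cuda.to_device(A)
    b_d = cuda.to_device(B)
    c_d = cuda.to_device(np.zeros((size, size)))

    threads_per_block = (nthreads, nthreads)
    blocks_per_grid = ((size + nthreads - 1) // nthreads,
                       (size + nthreads - 1) // nthreads)
    matmul_gpu_sketch[blocks_per_grid, threads_per_block](a_d, b_d, c_d)

    assert np.allclose(np.eye(size), c_d.copy_to_host())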