
Commit b9cfece

Fixes
- readme: typos and content
- requirements: slimmer
- tests: assertions
- run: also run without line_profiler
- routines: dimension check
1 parent ae31357 commit b9cfece

File tree

7 files changed: +125 -50 lines

- README.md
- pyproject.toml
- requirements.txt
- scripts/run.py
- src/matmul/routines.py
- src/matmul/utils.py
- test/test_shared.py

README.md

Lines changed: 20 additions & 13 deletions
Large diffs are not rendered by default.

pyproject.toml

Lines changed: 4 additions & 1 deletion
@@ -1,12 +1,13 @@
 # Choosing a build backend:
+[build-system]
 requires = ["setuptools"]
 build-backend = "setuptools.build_meta"


 [project]
 name = "matmul"
 version = "0.0.1"
-description = "Gabriele Codega"
+description = "Distributed matrix multiplication."
 readme = "README.md"
 requires-python = ">=3.11"
 license = { file = "LICENSE" }
@@ -22,3 +23,5 @@ dependencies = { file = ["requirements.txt"] }

 [project.optional-dependencies]
 test = ["pytest"]
+profile = ["line_profiler"]
+dev = ["pytest", "line_profiler"]
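Usage note (not part of the diff): with these optional dependency groups declared, the extras can be pulled in at install time from the repository root, e.g. pip install -e ".[dev]" for pytest plus line_profiler, or pip install -e ".[profile]" for line_profiler alone.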

requirements.txt

Lines changed: 1 addition & 31 deletions
@@ -1,35 +1,5 @@
-filelock==3.17.0
-fsspec==2025.2.0
-iniconfig==2.0.0
-Jinja2==3.1.5
-line_profiler==4.2.0
 llvmlite==0.44.0
-MarkupSafe==3.0.2
-mpi4py==4.0.3
-mpmath==1.3.0
-networkx==3.4.2
+mpi4py==4.0.3 --no-binary=mpi4py
 numba==0.61.0
 numpy==2.1.3
-nvidia-cublas-cu12==12.4.5.8
-nvidia-cuda-cupti-cu12==12.4.127
-nvidia-cuda-nvrtc-cu12==12.4.127
-nvidia-cuda-runtime-cu12==12.4.127
-nvidia-cudnn-cu12==9.1.0.70
-nvidia-cufft-cu12==11.2.1.3
-nvidia-curand-cu12==10.3.5.147
-nvidia-cusolver-cu12==11.6.1.9
-nvidia-cusparse-cu12==12.3.1.170
-nvidia-cusparselt-cu12==0.6.2
-nvidia-ml-py==12.570.86
-nvidia-nccl-cu12==2.21.5
-nvidia-nvjitlink-cu12==12.4.127
-nvidia-nvtx-cu12==12.4.127
-packaging==24.2
-pluggy==1.5.0
-pytest==8.3.5
 PyYAML==6.0.2
-scipy==1.15.2
-sympy==1.13.1
-torch==2.6.0
-triton==3.2.0
-typing_extensions==4.12.2

scripts/run.py

Lines changed: 10 additions & 1 deletion
@@ -1,3 +1,5 @@
+from functools import wraps
+from warnings import warn
 import numpy as np
 from numba import cuda

@@ -11,7 +13,14 @@
 import argparse
 import importlib

-from line_profiler import profile
+try:
+    from line_profiler import profile
+except ModuleNotFoundError:
+    warn("Did not find line_profiler. Please install it to access profiling information.")
+    def profile(f,*args,**kwargs):
+        def wrapper(*args,**kwargs):
+            f(*args,**kwargs)
+        return wrapper

 @profile
 def main_cpu(params: dict):
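The fallback above keeps the script runnable when line_profiler is missing, though the inner wrapper discards the wrapped function's return value and the newly imported functools.wraps is not used in this hunk. A minimal sketch of a fuller no-op stand-in (an illustration, not the committed code) could look like this:

from functools import wraps
from warnings import warn

try:
    from line_profiler import profile
except ModuleNotFoundError:
    warn("Did not find line_profiler. Please install it to access profiling information.")

    # No-op replacement: behaves like an identity decorator while preserving
    # the wrapped function's name, docstring, and return value.
    def profile(f):
        @wraps(f)
        def wrapper(*args, **kwargs):
            return f(*args, **kwargs)
        return wrapper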

src/matmul/routines.py

Lines changed: 11 additions & 2 deletions
@@ -2,6 +2,7 @@
 import numba

 def matmul(A,B,C,_):
+    assert (A.shape[0] == C.shape[0]) and (A.shape[1] == B.shape[0]) and (B.shape[1] == C.shape[1]), f"Matrices have incompatible shapes: {A.shape}, {B.shape}, {C.shape}"
     for i in range(A.shape[0]):
         for j in range(B.shape[1]):
             tmp = 0.
@@ -11,13 +12,15 @@ def matmul(A,B,C,_):

 @njit(void(float64[:,::1],float64[:,::1],float64[:,:],numba.optional(int32)), cache=True)
 def matmul_numba_serial(A,B,C,_):
+    assert (A.shape[0] == C.shape[0]) and (A.shape[1] == B.shape[0]) and (B.shape[1] == C.shape[1]), f"Matrices have incompatible shapes: {A.shape}, {B.shape}, {C.shape}"
     for i in range(A.shape[0]):
         for k in range(A.shape[-1]):
             for j in range(B.shape[1]):
                 C[i,j] += A[i,k] * B[k,j]

 @njit(void(float64[:,::1],float64[:,::1],float64[:,:],numba.optional(int32)), parallel=True, nogil=True, cache=True)
 def matmul_numba_cpu(A,B,C,_):
+    assert (A.shape[0] == C.shape[0]) and (A.shape[1] == B.shape[0]) and (B.shape[1] == C.shape[1]), f"Matrices have incompatible shapes: {A.shape}, {B.shape}, {C.shape}"
     for i in prange(A.shape[0]):
         for k in range(A.shape[1]):
             for j in range(B.shape[1]):
@@ -27,6 +30,7 @@ def matmul_numba_cpu(A,B,C,_):

 @njit(void(float64[:,::1],float64[:,::1],float64[:,:],int32), parallel=True, nogil=True, cache=True)
 def matmul_numba_block_cpu(A,B,C, bs=64):
+    assert (A.shape[0] == C.shape[0]) and (A.shape[1] == B.shape[0]) and (B.shape[1] == C.shape[1]), f"Matrices have incompatible shapes: {A.shape}, {B.shape}, {C.shape}"
     N = A.shape[0]
     M = B.shape[1]
     K = A.shape[1]
@@ -45,6 +49,7 @@ def matmul_numba_block_cpu(A,B,C, bs=64):

 @njit(void(float64[:,::1],float64[:,::1],float64[:,:],int32), parallel=False, nogil=True, cache=True)
 def matmul_numba_block_serial(A,B,C, bs=64):
+    assert (A.shape[0] == C.shape[0]) and (A.shape[1] == B.shape[0]) and (B.shape[1] == C.shape[1]), f"Matrices have incompatible shapes: {A.shape}, {B.shape}, {C.shape}"
     N = A.shape[0]
     M = B.shape[1]
     K = A.shape[1]
@@ -61,8 +66,10 @@ def matmul_numba_block_serial(A,B,C, bs=64):
                     for j in range(jj,jmax):
                         C[i,j] += A[i,k] * B[k,j]

-@cuda.jit(void(float64[:,::1],float64[:,::1],float64[:,:]), cache=True)
+@cuda.jit(void(float64[:,::1],float64[:,::1],float64[:,:]), cache=True, debug=False)
 def matmul_numba_gpu(A,B,C):
+    # this only has effect if function is compiled with debug = True
+    assert (A.shape[0] == C.shape[0]) and (A.shape[1] == B.shape[0]) and (B.shape[1] == C.shape[1]), "Matrices have incompatible shapes"
     i, j = cuda.grid(ndim=2)
     if i < C.shape[0] and j < C.shape[1]:
         tmp = 0.
@@ -71,8 +78,10 @@ def matmul_numba_gpu(A,B,C):
         C[i,j] = tmp

 BLOCK_SIZE = 16
-@cuda.jit(void(float64[:,::1],float64[:,::1],float64[:,:]), cache=True)
+@cuda.jit(void(float64[:,::1],float64[:,::1],float64[:,:]), cache=True, debug=False)
 def matmul_numba_block_gpu(A,B,C):
+    # this only has effect if function is compiled with debug = True
+    assert (A.shape[0] == C.shape[0]) and (A.shape[1] == B.shape[0]) and (B.shape[1] == C.shape[1]), "Matrices have incompatible shapes"

     bi = cuda.blockIdx.y
     bj = cuda.blockIdx.x
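The new assertions make shape mismatches fail fast on the CPU paths; in the CUDA kernels the same check only runs when the kernel is compiled with debug=True, hence debug=False is spelled out in the decorators. A quick sketch of how the check behaves (hypothetical shapes, assuming the routines are importable as matmul.routines):

import numpy as np
from matmul.routines import matmul

A = np.ones((4, 3))
B = np.ones((3, 5))
C = np.zeros((4, 5))
matmul(A, B, C, None)      # compatible shapes: (4,3) x (3,5) -> (4,5)

C_bad = np.zeros((4, 4))
matmul(A, B, C_bad, None)  # AssertionError: Matrices have incompatible shapes: (4, 3), (3, 5), (4, 4)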

src/matmul/utils.py

Lines changed: 1 addition & 1 deletion
@@ -10,7 +10,7 @@ def read_config(config_path: str):

     Params:
     - config_path : str
-        Path to the config file, withot the extension.
+        Path to the config file, without the extension.
     Returns:
     - params:
         Dictionary with parameters.

test/test_shared.py

Lines changed: 78 additions & 1 deletion
@@ -15,6 +15,22 @@ def test_matmul():

     assert np.allclose(np.eye(size),C)

+    with pytest.raises(AssertionError):
+        A = np.empty((size,size+1))
+        B = np.empty((size,size))
+        C = np.empty((size,size))
+        matmul(A,B,C,None)
+        A = np.empty((size+1,size))
+        B = np.empty((size,size))
+        C = np.empty((size,size))
+        matmul(A,B,C,None)
+        A = np.empty((size,size))
+        B = np.empty((size,size+1))
+        C = np.empty((size,size))
+        matmul(A,B,C,None)
+
+
+
 def test_matmul_numba_cpu():
     size = 20
     np.random.seed(0)
@@ -26,6 +42,21 @@ def test_matmul_numba_cpu():

     assert np.allclose(np.eye(size),C)

+    with pytest.raises(AssertionError):
+        A = np.empty((size,size+1),dtype=np.float64)
+        B = np.empty((size,size),dtype=np.float64)
+        C = np.empty((size,size),dtype=np.float64)
+        matmul_numba_cpu(A,B,C,None)
+        A = np.empty((size+1,size),dtype=np.float64)
+        B = np.empty((size,size),dtype=np.float64)
+        C = np.empty((size,size),dtype=np.float64)
+        matmul_numba_cpu(A,B,C,None)
+        A = np.empty((size,size),dtype=np.float64)
+        B = np.empty((size,size+1),dtype=np.float64)
+        C = np.empty((size,size),dtype=np.float64)
+        matmul_numba_cpu(A,B,C,None)
+
+
 def test_matmul_numba_serial():
     size = 20
     np.random.seed(0)
@@ -37,6 +68,20 @@ def test_matmul_numba_serial():

     assert np.allclose(np.eye(size),C)

+    with pytest.raises(AssertionError):
+        A = np.empty((size,size+1))
+        B = np.empty((size,size))
+        C = np.empty((size,size))
+        matmul_numba_serial(A,B,C,None)
+        A = np.empty((size+1,size))
+        B = np.empty((size,size))
+        C = np.empty((size,size))
+        matmul_numba_serial(A,B,C,None)
+        A = np.empty((size,size))
+        B = np.empty((size,size+1))
+        C = np.empty((size,size))
+        matmul_numba_serial(A,B,C,None)
+
 def test_matmul_numba_block_cpu():
     size = 20
     np.random.seed(0)
@@ -48,6 +93,20 @@ def test_matmul_numba_block_cpu():

     assert np.allclose(np.eye(size),C)

+    with pytest.raises(AssertionError):
+        A = np.empty((size,size+1))
+        B = np.empty((size,size))
+        C = np.empty((size,size))
+        matmul_numba_block_cpu(A,B,C,6)
+        A = np.empty((size+1,size))
+        B = np.empty((size,size))
+        C = np.empty((size,size))
+        matmul_numba_block_cpu(A,B,C,6)
+        A = np.empty((size,size))
+        B = np.empty((size,size+1))
+        C = np.empty((size,size))
+        matmul_numba_block_cpu(A,B,C,6)
+
 def test_matmul_numba_block_serial():
     size = 20
     np.random.seed(0)
@@ -59,6 +118,20 @@ def test_matmul_numba_block_serial():

     assert np.allclose(np.eye(size),C)

+    with pytest.raises(AssertionError):
+        A = np.empty((size,size+1))
+        B = np.empty((size,size))
+        C = np.empty((size,size))
+        matmul_numba_block_serial(A,B,C,6)
+        A = np.empty((size+1,size))
+        B = np.empty((size,size))
+        C = np.empty((size,size))
+        matmul_numba_block_serial(A,B,C,6)
+        A = np.empty((size,size))
+        B = np.empty((size,size+1))
+        C = np.empty((size,size))
+        matmul_numba_block_serial(A,B,C,6)
+
 @pytest.mark.skipif((not numba.cuda.is_available()), reason='Could not find any CUDA GPU')
 def test_matmul_numba_gpu():
     size = 20
@@ -81,6 +154,9 @@ def test_matmul_numba_gpu():

     assert np.allclose(np.eye(size),C)

+    # No tests for assertion errors on matrix shape since they are only available
+    # for debug=True
+
 @pytest.mark.skipif((not numba.cuda.is_available()), reason='Could not find any CUDA GPU')
 def test_matmul_numba_block_gpu():
     size = 20
@@ -103,4 +179,5 @@ def test_matmul_numba_block_gpu():

     assert np.allclose(np.eye(size),C)

-
+    # No tests for assertion errors on matrix shape since they are only available
+    # for debug=True
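One caveat about the pattern above: inside a single pytest.raises block, execution stops at the first call that raises, so the second and third shape combinations in each test never actually run. A possible follow-up (a sketch only, not part of this commit) would parametrize the incompatible shapes so each case gets its own pytest.raises block:

import numpy as np
import pytest
from matmul.routines import matmul

# Hypothetical parametrized test: each incompatible shape triple is checked
# independently, so every case exercises the assertion in matmul.
@pytest.mark.parametrize("shape_A, shape_B, shape_C", [
    ((20, 21), (20, 20), (20, 20)),  # inner dimensions of A and B disagree
    ((21, 20), (20, 20), (20, 20)),  # rows of A != rows of C
    ((20, 20), (20, 21), (20, 20)),  # columns of B != columns of C
])
def test_matmul_incompatible_shapes(shape_A, shape_B, shape_C):
    A, B, C = np.empty(shape_A), np.empty(shape_B), np.empty(shape_C)
    with pytest.raises(AssertionError):
        matmul(A, B, C, None)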
