|
7 | 7 | import sklearn.cluster
|
8 | 8 | import sklearn.preprocessing
|
9 | 9 | from tqdm import tqdm
|
| 10 | +import psutil |
10 | 11 |
|
11 | 12 | from . import linear
|
12 | 13 |
|
@@ -135,13 +136,28 @@ def train_tree(
|
135 | 136 | root = _build_tree(label_representation, np.arange(y.shape[1]), 0, K, dmax)
|
136 | 137 |
|
137 | 138 | num_nodes = 0
|
| 139 | + # Both type(x) and type(y) are sparse.csr_matrix |
| 140 | + # However, type((x != 0).T) becomes sparse.csc_matrix |
| 141 | + # So type((x != 0).T * y) results in sparse.csc_matrix |
| 142 | + features_used_perlabel = (x != 0).T * y |
138 | 143 |
|
139 | 144 | def count(node):
|
140 | 145 | nonlocal num_nodes
|
141 | 146 | num_nodes += 1
|
| 147 | + node.num_features_used = np.count_nonzero(features_used_perlabel[:, node.label_map].sum(axis=1)) |
142 | 148 |
|
143 | 149 | root.dfs(count)
|
144 | 150 |
|
| 151 | + model_size = get_estimated_model_size(root) |
| 152 | + print(f'The estimated tree model size is: {model_size / (1024**3):.3f} GB') |
| 153 | + |
| 154 | + # Calculate the total memory (excluding swap) on the local machine |
| 155 | + total_memory = psutil.virtual_memory().total |
| 156 | + print(f'Your system memory is: {total_memory / (1024**3):.3f} GB') |
| 157 | + |
| 158 | + if (total_memory <= model_size): |
| 159 | + raise MemoryError(f'Not enough memory to train the model.') |
| 160 | + |
145 | 161 | pbar = tqdm(total=num_nodes, disable=not verbose)
|
146 | 162 |
|
147 | 163 | def visit(node):
|
@@ -195,6 +211,23 @@ def _build_tree(label_representation: sparse.csr_matrix, label_map: np.ndarray,
|
195 | 211 | return Node(label_map=label_map, children=children)
|
196 | 212 |
|
197 | 213 |
|
def get_estimated_model_size(root):
    """Estimate the size in bytes of the tree model's sparse weight matrices.

    Walks the tree once, summing one weight vector per classifier trained at
    each node (per label at a leaf, per child metalabel at an internal node),
    restricted to the features actually used by that node's labels.
    """
    total_weights = 0

    def accumulate(node: "Node"):
        nonlocal total_weights
        # A leaf trains one classifier per label; an internal node trains one
        # classifier per child metalabel.
        if node.isLeaf():
            classifiers = len(node.label_map)
        else:
            classifiers = len(node.children)
        total_weights += classifiers * node.num_features_used

    root.dfs(accumulate)

    # Sparse storage costs 16 bytes per nonzero weight: an int64 index
    # (8 bytes) plus a float64 value (8 bytes).
    return total_weights * 16
| 229 | + |
| 230 | + |
198 | 231 | def _train_node(y: sparse.csr_matrix, x: sparse.csr_matrix, options: str, node: Node):
|
199 | 232 | """If node is internal, computes the metalabels representing each child and trains
|
200 | 233 | on the metalabels. Otherwise, train on y.
|
|
0 commit comments