Commit c5218b1

Options for Stagger model loading for low memory systems
* `--stagger_load` : (default: `0` off) Stagger model loading to avoid OOM issues on the host
* `--stagger_update_lazyhandle` : (default: `0` off) Stagger update_lazyhandle to avoid OOM issues on the host

Signed-off-by: Joshua Hursey <jhursey@us.ibm.com>
1 parent b882a64 commit c5218b1
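
Both options take an integer count of ranks per set. When the value is greater than 0 and different from the world size, the ranks are split into ceil(world_size / N) sets, and each set only begins the expensive step (model load or update_lazyhandle) once the previous set has finished, trading startup time for lower peak host memory. A quick worked example of the set arithmetic (plain Python; the values are chosen purely for illustration):

    import math

    world_size = 8    # assumed 8 ranks, purely for illustration
    stagger_load = 2  # i.e. the value passed to --stagger_load

    num_sets = math.ceil(world_size / float(stagger_load))  # -> 4 sets of 2 ranks
    for rank in range(world_size):
        print(f"rank {rank} loads the model in set {rank // stagger_load + 1} of {num_sets}")

With these numbers, ranks 0-1 load first, then 2-3, and so on, matching the `Set: X of Y` messages printed by the patch.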

File tree

2 files changed (+45, −3 lines)


aiu_fms_testing_utils/utils/__init__.py

Lines changed: 17 additions & 2 deletions
@@ -3,14 +3,15 @@
 import time
 from fms.utils.tokenizers import BaseTokenizer
 from fms.utils.generation import generate
-from aiu_fms_testing_utils.utils.aiu_setup import dprint
+from aiu_fms_testing_utils.utils.aiu_setup import dprint, rank, local_rank, world_size
 from typing import Optional, List, Tuple
 import os
 import requests
 import json
 import random
+import math

-def warmup_model(model: nn.Module, input_ids: torch.Tensor, max_new_tokens: int, compile_dynamic_sendnn = False, **padding_kwargs):
+def warmup_model(model: nn.Module, input_ids: torch.Tensor, max_new_tokens: int, compile_dynamic_sendnn = False, stagger_update_lazyhandle = 0, **padding_kwargs):
     from torch_sendnn import torch_sendnn
     dprint("AIU warmup")
     pt_compile_model_time = time.time()
@@ -22,12 +23,26 @@ def warmup_model(model: nn.Module, input_ids: torch.Tensor, max_new_tokens: int,
     pt_compile_model_time = time.time() - pt_compile_model_time
     dprint(f"PT compile complete, took {pt_compile_model_time:.3f}s")

+    if stagger_update_lazyhandle > 0 and stagger_update_lazyhandle != world_size:
+        for _set in range( math.ceil(world_size / float(stagger_update_lazyhandle)) ):
+            if rank < (_set+1)*stagger_update_lazyhandle:
+                break
+            torch.distributed.barrier()
+        dprint(f"Stagger update_lazyhandle: Begin (Set: {_set+1} of {math.ceil(world_size / float(stagger_update_lazyhandle))})")
+
     dprint("executing update_lazyhandle and performing validation")
     update_lh_time = time.time()
     torch_sendnn.update_lazyhandle()
     update_lh_time = time.time() - update_lh_time
     dprint(f"update_lazyhandle complete, took {update_lh_time:.3f}s")

+    if stagger_update_lazyhandle > 0 and stagger_update_lazyhandle != world_size:
+        for _set in range( math.ceil(world_size / float(stagger_update_lazyhandle)) ):
+            if rank >= (_set+1)*stagger_update_lazyhandle:
+                continue
+            torch.distributed.barrier()
+        dprint(f"Stagger update_lazyhandle: All Complete")
+
 def ids_for_prompt(prompt, tokenizer):
     tokens = tokenizer.tokenize(prompt)
     ids = tokenizer.convert_tokens_to_ids(tokens)
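
The staggering in `warmup_model` is a pair of loops around the expensive call: before it, a rank waits on one `torch.distributed.barrier()` per set scheduled ahead of its own; after it, the rank joins the barrier for its own set and for every later set, so each barrier completes only when the preceding sets are done and the next set may start. As a non-authoritative sketch, the same pattern could be factored into a context manager (the name `stagger_region` is invented here; `torch.distributed` is assumed to be initialized):

    import math
    from contextlib import contextmanager

    import torch.distributed as dist


    @contextmanager
    def stagger_region(rank: int, world_size: int, per_set: int):
        """Run the enclosed block in sets of at most `per_set` ranks, one set at a time."""
        active = per_set > 0 and per_set != world_size
        if active:
            num_sets = math.ceil(world_size / float(per_set))
            my_set = rank // per_set  # index of the set this rank belongs to
            # wait behind one barrier per set scheduled before ours
            for _ in range(my_set):
                dist.barrier()
        yield
        if active:
            # join the barrier for our own set and every later set,
            # releasing the next set to start its work
            for _ in range(my_set, num_sets):
                dist.barrier()

A hypothetical call site would read `with stagger_region(rank, world_size, args.stagger_load): model = get_model(...)`; the commit itself keeps the two loops inline at each call site.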

scripts/inference.py

Lines changed: 28 additions & 1 deletion
@@ -7,6 +7,7 @@
 from pathlib import Path
 import random
 import time
+import math

 # Third Party
 from aiu_fms_testing_utils.utils import aiu_setup, warmup_model
@@ -217,6 +218,18 @@
     default=0,
     help="Set verbosity level (pass flag as `-v`, `-vv`, `-vvv`)"
 )
+parser.add_argument(
+    "--stagger_load",
+    type=int,
+    default=0,
+    help="Stagger model loading to avoid OOM issues on the host"
+)
+parser.add_argument(
+    "--stagger_update_lazyhandle",
+    type=int,
+    default=0,
+    help="Stagger update_lazyhandle to avoid OOM issues on the host"
+)
 args = parser.parse_args()

 if args.quantization == "gptq":
@@ -437,6 +450,13 @@ def select_int8_module(
 dprint(f"data_type={default_dtype}")
 dprint("="*60 + "\n")

+if args.stagger_load > 0 and args.stagger_load != world_size:
+    for _set in range( math.ceil(world_size / float(args.stagger_load)) ):
+        if rank < (_set+1)*args.stagger_load:
+            break
+        torch.distributed.barrier()
+    dprint(f"Stagger Model Load: Begin (Set: {_set+1} of {math.ceil(world_size / float(args.stagger_load))})")
+
 model = get_model(
     args.architecture,
     args.variant,
@@ -466,6 +486,13 @@ def select_int8_module(
 loading_model_time = time.time() - loading_model_time
 dprint(f"loading complete, took {loading_model_time:.3f}s")

+if args.stagger_load > 0 and args.stagger_load != world_size:
+    for _set in range( math.ceil(world_size / float(args.stagger_load)) ):
+        if rank >= (_set+1)*args.stagger_load:
+            continue
+        torch.distributed.barrier()
+    dprint(f"Stagger Model Load: All Complete")
+
 if args.compile:
     dprint("compiling model")
     if is_aiu_backend:
@@ -691,7 +718,7 @@ def infer(use_cache, do_sample, warmup):
 ] # True/False are identical with greedy iff `torch.use_deterministic_algorithms(True)`

 if args.compile:
-    warmup_model(model, ids, args.max_new_tokens, args.compile_dynamic_sendnn, **extra_generation_kwargs)
+    warmup_model(model, ids, args.max_new_tokens, args.compile_dynamic_sendnn, args.stagger_update_lazyhandle, **extra_generation_kwargs)

 if args.device_type == "aiu": # only run warmup for AIU, no need for senulator
     aiu_warmup_time = time.time()
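
In the last hunk, `args.stagger_update_lazyhandle` is passed positionally; this works because the new parameter sits just before `**padding_kwargs` in the updated `warmup_model` signature. The same call with each argument annotated (comments added here for readability only; the values are those already used in the script):

    if args.compile:
        warmup_model(
            model,                           # the compiled model to warm up
            ids,                             # prepared input_ids tensor
            args.max_new_tokens,
            args.compile_dynamic_sendnn,     # compile_dynamic_sendnn flag
            args.stagger_update_lazyhandle,  # new: ranks per set; 0 (default) disables staggering
            **extra_generation_kwargs,
        )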
