Commit 1af2202

Options to stagger model loading for low-memory systems
* `--stagger_load` : (default: `0`, off) Stagger model loading to avoid OOM issues on the host
* `--stagger_update_lazyhandle` : (default: `0`, off) Stagger update_lazyhandle to avoid OOM issues on the host

Signed-off-by: Joshua Hursey <jhursey@us.ibm.com>
1 parent 73bcb07 commit 1af2202
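
Both flags take an integer: the number of ranks allowed to run the staggered step concurrently, with `0` (the default) disabling the staggering entirely. As a quick illustration (example values, not defaults), a 16-rank job run with `--stagger_load 4` loads the model in ceil(16/4) == 4 consecutive sets of 4 ranks:

# Illustration of the flag semantics only; 16 and 4 are example values.
import math

world_size, stagger_load = 16, 4
print(math.ceil(world_size / float(stagger_load)))  # -> 4 sets of ranks load in turn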

1 file changed (+41, -0)

scripts/inference.py

Lines changed: 41 additions & 0 deletions
@@ -7,6 +7,7 @@
 from pathlib import Path
 import random
 import time
+import math
 
 # Third Party
 from aiu_fms_testing_utils.utils import aiu_setup
@@ -212,6 +213,18 @@
     default=0,
     help="Set verbosity level (pass flag as `-v`, `-vv`, `-vvv`)"
 )
+parser.add_argument(
+    "--stagger_load",
+    type=int,
+    default=0,
+    help="Stagger model loading to avoid OOM issues on the host"
+)
+parser.add_argument(
+    "--stagger_update_lazyhandle",
+    type=int,
+    default=0,
+    help="Stagger update_lazyhandle to avoid OOM issues on the host"
+)
 args = parser.parse_args()
 
 if args.quantization == "gptq":
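
The two options are plain integer arguments that default to off; a minimal parsing check outside the full script (illustration only, the real parser defines many more arguments) behaves as expected:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--stagger_load", type=int, default=0,
                    help="Stagger model loading to avoid OOM issues on the host")
parser.add_argument("--stagger_update_lazyhandle", type=int, default=0,
                    help="Stagger update_lazyhandle to avoid OOM issues on the host")

args = parser.parse_args(["--stagger_load", "4"])
assert args.stagger_load == 4               # four ranks load the model at a time
assert args.stagger_update_lazyhandle == 0  # default keeps staggering disabled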
@@ -432,6 +445,13 @@ def select_int8_module(
 dprint(f"data_type={default_dtype}")
 dprint("="*60 + "\n")
 
+if args.stagger_load > 0:
+    for _set in range( math.ceil(world_size / float(args.stagger_load)) ):
+        if rank < (_set+1)*args.stagger_load:
+            break
+        torch.distributed.barrier()
+    dprint(f"Stagger Model Load: Begin (Set: {_set+1} of {math.ceil(world_size / float(args.stagger_load))})")
+
 model = get_model(
     args.architecture,
     args.variant,
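
In this entry gate, a rank calls `torch.distributed.barrier()` once per set scheduled ahead of its own and breaks out as soon as its own set is reached, so set 1 begins loading immediately and set k begins only after k-1 barriers have released. A standalone trace of which set each rank joins (illustration only, mirroring the arithmetic above):

import math

def stagger_set(rank: int, stagger: int) -> int:
    # Ranks 0..stagger-1 form set 1, the next `stagger` ranks form set 2, and so on.
    return rank // stagger + 1

world_size, stagger = 8, 3  # example values
for r in range(world_size):
    print(f"rank {r}: set {stagger_set(r, stagger)} of {math.ceil(world_size / stagger)}")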
@@ -461,6 +481,13 @@ def select_int8_module(
 loading_model_time = time.time() - loading_model_time
 dprint(f"loading complete, took {loading_model_time:.3f}s")
 
+if args.stagger_load > 0:
+    for _set in range( math.ceil(world_size / float(args.stagger_load)) ):
+        if rank >= (_set+1)*args.stagger_load:
+            continue
+        torch.distributed.barrier()
+    dprint(f"Stagger Model Load: All Complete")
+
 if args.compile:
     dprint("compiling model")
     if is_aiu_backend:
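
The release gate mirrors the entry gate: a rank skips (`continue`) the barriers belonging to sets ahead of it and participates in the rest, so every rank calls `torch.distributed.barrier()` exactly ceil(world_size / stagger_load) times in total and no collective is left short of participants. A small counting sketch (illustration only, no process group required):

import math

def barrier_counts(world_size: int, stagger: int) -> None:
    # Entry-gate barriers come from the break-style loop, release-gate barriers
    # from the continue-style loop; their sum is the same for every rank.
    n_sets = math.ceil(world_size / float(stagger))
    for rank in range(world_size):
        entry = sum(1 for s in range(n_sets) if rank >= (s + 1) * stagger)
        release = sum(1 for s in range(n_sets) if rank < (s + 1) * stagger)
        print(f"rank {rank}: entry={entry}, release={release}, total={entry + release}")

barrier_counts(world_size=8, stagger=3)  # every total equals ceil(8/3) == 3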
@@ -693,13 +720,27 @@ def infer(use_cache, do_sample, warmup):
 pt_compile_model_time = time.time() - pt_compile_model_time
 dprint(f"PT compile complete, took {pt_compile_model_time:.3f}s")
 
+if args.stagger_update_lazyhandle > 0:
+    for _set in range( math.ceil(world_size / float(args.stagger_update_lazyhandle)) ):
+        if rank < (_set+1)*args.stagger_update_lazyhandle:
+            break
+        torch.distributed.barrier()
+    dprint(f"Stagger update_lazyhandle: Begin (Set: {_set+1} of {math.ceil(world_size / float(args.stagger_update_lazyhandle))})")
+
 if is_aiu_backend:
     dprint("executing update_lazyhandle and compiling for AIU")
     update_lh_time = time.time()
     torch_sendnn.update_lazyhandle()
     update_lh_time = time.time() - update_lh_time
     dprint(f"update_lazyhandle complete, took {update_lh_time:.3f}s")
 
+if args.stagger_update_lazyhandle > 0:
+    for _set in range( math.ceil(world_size / float(args.stagger_update_lazyhandle)) ):
+        if rank >= (_set+1)*args.stagger_update_lazyhandle:
+            continue
+        torch.distributed.barrier()
+    dprint(f"Stagger update_lazyhandle: All Complete")
+
 if args.device_type == "aiu": # only run warmup for AIU, no need for senulator
     aiu_warmup_time = time.time()
     for sample, cache in itertools.product(do_sample, use_cache):
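
The `update_lazyhandle` gates repeat the same entry/release shape used for model loading. A hypothetical, repository-external generalization of that pattern (the helper name `run_staggered` is not part of this commit, and an initialized `torch.distributed` process group is assumed) could look like:

import math
import torch.distributed as dist

def run_staggered(fn, rank: int, world_size: int, stagger: int):
    # Run `fn` in groups of `stagger` ranks; later groups wait at barriers
    # until earlier groups have finished. stagger <= 0 disables staggering.
    if stagger <= 0:
        return fn()
    n_sets = math.ceil(world_size / float(stagger))
    for _set in range(n_sets):          # entry gate: wait for earlier sets
        if rank < (_set + 1) * stagger:
            break
        dist.barrier()
    result = fn()
    for _set in range(n_sets):          # release gate: let later sets proceed
        if rank >= (_set + 1) * stagger:
            continue
        dist.barrier()
    return result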
