Commit 14dfb4a

Consolidate stagger code into two support functions: stagger_enter, stagger_leave
Signed-off-by: Joshua Hursey <jhursey@us.ibm.com>
1 parent 4bacd2c commit 14dfb4a

File tree

2 files changed: +21 −25 lines changed

aiu_fms_testing_utils/utils/__init__.py

Lines changed: 18 additions & 12 deletions
@@ -11,6 +11,22 @@
 import random
 import math
 
+def stagger_enter(limit: int):
+    if limit > 0 and limit != world_size:
+        for _set in range( math.ceil(world_size / float(limit)) ):
+            if rank < (_set+1)*limit:
+                break
+            torch.distributed.barrier()
+        dprint(f"Stagger: Enter (Set: {_set+1} of {math.ceil(world_size / float(limit))})")
+
+def stagger_leave(limit: int):
+    if limit > 0 and limit != world_size:
+        for _set in range( math.ceil(world_size / float(limit)) ):
+            if rank >= (_set+1)*limit:
+                continue
+            torch.distributed.barrier()
+        dprint(f"Stagger: All Complete")
+
 def warmup_model(model: nn.Module, input_ids: torch.Tensor, max_new_tokens: int, compile_dynamic_sendnn = False, stagger_update_lazyhandle = 0, **padding_kwargs):
     import torch_sendnn
     dprint("AIU warmup")
@@ -19,25 +35,15 @@ def warmup_model(model: nn.Module, input_ids: torch.Tensor, max_new_tokens: int,
     if compile_dynamic_sendnn:
         max_new_tokens_warmup = 2
 
-    if stagger_update_lazyhandle > 0 and stagger_update_lazyhandle != world_size:
-        for _set in range( math.ceil(world_size / float(stagger_update_lazyhandle)) ):
-            if rank < (_set+1)*stagger_update_lazyhandle:
-                break
-            torch.distributed.barrier()
-        dprint(f"Stagger update_lazyhandle: Begin (Set: {_set+1} of {math.ceil(world_size / float(stagger_update_lazyhandle))})")
+    stagger_enter(stagger_update_lazyhandle)
 
     pt_compile_model_time = time.time()
     with torch_sendnn.warmup_mode():
         generate(model, input_ids, max_new_tokens=max_new_tokens_warmup, max_seq_len=model.config.max_expected_seq_len, use_cache=True, do_sample=False, contiguous_cache=True, extra_kwargs=extra_kwargs)
     pt_compile_model_time = time.time() - pt_compile_model_time
     dprint(f"PT compile complete, took {pt_compile_model_time:.3f}s")
 
-    if stagger_update_lazyhandle > 0 and stagger_update_lazyhandle != world_size:
-        for _set in range( math.ceil(world_size / float(stagger_update_lazyhandle)) ):
-            if rank >= (_set+1)*stagger_update_lazyhandle:
-                continue
-            torch.distributed.barrier()
-        dprint(f"Stagger update_lazyhandle: All Complete")
+    stagger_leave(stagger_update_lazyhandle)
 
 def ids_for_prompt(prompt, tokenizer):
     tokens = tokenizer.tokenize(prompt)

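Together the two helpers implement one barrier schedule: the ranks are split into ceil(world_size / limit) sets of at most `limit` ranks, a rank in set s sits through s barriers in stagger_enter before starting, then joins the remaining barriers in stagger_leave on the way out, so barrier s releases set s+1 only after sets 0..s have left the bracketed region. The net effect is that at most `limit` ranks execute the bracketed work concurrently. A minimal runnable sketch of the pattern (assumptions not in the commit: gloo backend on CPU, rank/world_size passed explicitly rather than read from the aiu_setup globals, dprint reporting omitted):

import math
import os

import torch.distributed as dist
import torch.multiprocessing as mp

def stagger_enter(rank: int, world_size: int, limit: int):
    # A rank in set s waits on s barriers before entering, so set 0
    # starts immediately and set s starts only after s releases.
    if limit > 0 and limit != world_size:
        for _set in range(math.ceil(world_size / float(limit))):
            if rank < (_set + 1) * limit:
                break
            dist.barrier()

def stagger_leave(rank: int, world_size: int, limit: int):
    # A rank in set s joins every barrier from s onward; barrier s is the
    # same collective that releases set s+1 inside stagger_enter.
    if limit > 0 and limit != world_size:
        for _set in range(math.ceil(world_size / float(limit))):
            if rank >= (_set + 1) * limit:
                continue
            dist.barrier()

def worker(rank: int, world_size: int, limit: int):
    os.environ["MASTER_ADDR"] = "127.0.0.1"
    os.environ["MASTER_PORT"] = "29500"
    dist.init_process_group("gloo", rank=rank, world_size=world_size)
    stagger_enter(rank, world_size, limit)
    print(f"rank {rank} inside the staggered region")  # at most `limit` ranks here at once
    stagger_leave(rank, world_size, limit)
    dist.destroy_process_group()

if __name__ == "__main__":
    mp.spawn(worker, args=(4, 2), nprocs=4)  # world_size=4, 2 ranks per set

With world_size=4 and limit=2, every rank crosses exactly two barriers, and ranks 2-3 enter the region only after ranks 0-1 have left it.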
scripts/inference.py

Lines changed: 3 additions & 13 deletions
@@ -12,7 +12,7 @@
 import math
 
 # Third Party
-from aiu_fms_testing_utils.utils import aiu_setup, warmup_model
+from aiu_fms_testing_utils.utils import aiu_setup, warmup_model, stagger_enter, stagger_leave
 from aiu_fms_testing_utils.utils.aiu_setup import dprint, rank, local_rank, world_size
 import numpy as np
 import torch
@@ -464,12 +464,7 @@ def select_int8_module(
 dprint(f"data_type={default_dtype}")
 dprint("="*60 + "\n")
 
-if args.stagger_load > 0 and args.stagger_load != world_size:
-    for _set in range( math.ceil(world_size / float(args.stagger_load)) ):
-        if rank < (_set+1)*args.stagger_load:
-            break
-        torch.distributed.barrier()
-    dprint(f"Stagger Model Load: Begin (Set: {_set+1} of {math.ceil(world_size / float(args.stagger_load))})")
+stagger_enter(args.stagger_load)
 
 model = get_model(
     args.architecture,
@@ -500,12 +495,7 @@ def select_int8_module(
 loading_model_time = time.time() - loading_model_time
 dprint(f"loading complete, took {loading_model_time:.3f}s")
 
-if args.stagger_load > 0 and args.stagger_load != world_size:
-    for _set in range( math.ceil(world_size / float(args.stagger_load)) ):
-        if rank >= (_set+1)*args.stagger_load:
-            continue
-        torch.distributed.barrier()
-    dprint(f"Stagger Model Load: All Complete")
+stagger_leave(args.stagger_load)
 
 if args.compile:
     dprint("compiling model")

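After the refactor, every staggered critical section reads as a symmetric bracket around the protected work. A hypothetical new call site following the same shape as the model load above (load_checkpoint_shard is made up for illustration; args.stagger_load caps concurrency, and a value of 0 or world_size leaves the bracket as a no-op):

stagger_enter(args.stagger_load)
load_checkpoint_shard()  # hypothetical resource-heavy per-rank step
stagger_leave(args.stagger_load)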