
Commit b2ea5e3

add --compile_dynamic_sendnn option
Signed-off-by: Jiamin Ni <jiamin.ni@ibm.com>
1 parent: 0e1d496

3 files changed: +17, -8 lines

aiu_fms_testing_utils/utils/__init__.py

Lines changed: 4 additions & 5 deletions
@@ -10,15 +10,14 @@
 import json
 import random
 
-def warmup_model(model: nn.Module, input_ids: torch.Tensor, max_new_tokens: int, **padding_kwargs):
+def warmup_model(model: nn.Module, input_ids: torch.Tensor, max_new_tokens: int, compile_dynamic_sendnn = False, **padding_kwargs):
     from torch_sendnn import torch_sendnn
     dprint("AIU warmup")
     pt_compile_model_time = time.time()
     extra_kwargs = {**padding_kwargs, "only_last_token": True}
-    max_new_tokens_warmup = 2
-    is_dynamic_value = os.getenv("TORCH_SENDNN_DYNAMIC")
-    if is_dynamic_value is None or is_dynamic_value.lower() in {"0", "false"}:
-        max_new_tokens_warmup = max_new_tokens
+    max_new_tokens_warmup = max_new_tokens
+    if compile_dynamic_sendnn:
+        max_new_tokens_warmup = 2
     generate(model, input_ids, max_new_tokens=max_new_tokens_warmup, max_seq_len=model.config.max_expected_seq_len, use_cache=True, do_sample=False, contiguous_cache=True, extra_kwargs=extra_kwargs)
     pt_compile_model_time = time.time() - pt_compile_model_time
     dprint(f"PT compile complete, took {pt_compile_model_time:.3f}s")

scripts/inference.py

Lines changed: 7 additions & 2 deletions
@@ -129,6 +129,11 @@
     action="store_true",
     help="Use dynamic shapes with torch.compile",
 )
+parser.add_argument(
+    "--compile_dynamic_sendnn",
+    action="store_true",
+    help="Use dynamic shapes with aiu compile",
+)
 parser.add_argument(
     "--deterministic",
     action="store_true",
@@ -464,7 +469,7 @@ def select_int8_module(
 if args.compile:
     dprint("compiling model")
     if is_aiu_backend:
-        model.compile(backend="sendnn_decoder")
+        model.compile(backend="sendnn_decoder", options={'sendnn.dynamic': args.compile_dynamic_sendnn})
     else:
         # compiling can make first inference pass slow
         model.compile(mode=args.compile_mode, backend=args.compile_backend)
@@ -686,7 +691,7 @@ def infer(use_cache, do_sample, warmup):
 ] # True/False are identical with greedy iff `torch.use_deterministic_algorithms(True)`
 
 if args.compile:
-    warmup_model(model, ids, args.max_new_tokens, **extra_generation_kwargs)
+    warmup_model(model, ids, args.max_new_tokens, args.compile_dynamic_sendnn, **extra_generation_kwargs)
 
 if args.device_type == "aiu": # only run warmup for AIU, no need for senulator
     aiu_warmup_time = time.time()
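With this wiring, the flag flows from argparse through model.compile (as the sendnn.dynamic backend option) and into warmup_model. An illustrative invocation, assuming the --device_type and --compile flag names match the args attributes referenced in this diff, and omitting the other required arguments (model path, tokenizer, etc.), which vary by setup:

python scripts/inference.py --device_type aiu --compile --compile_dynamic_sendnn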

scripts/validation.py

Lines changed: 6 additions & 1 deletion
@@ -106,6 +106,11 @@
     action="store_true",
     help="Use dynamic shapes with torch.compile",
 )
+parser.add_argument(
+    "--compile_dynamic_sendnn",
+    action="store_true",
+    help="Use dynamic shapes with aiu compile",
+)
 parser.add_argument(
     "--deterministic",
     action="store_true",
@@ -680,7 +685,7 @@ def print_result(result, result_idx: int = 0, file_prefix: str = ""):
     **padding_kwargs
 )
 
-warmup_model(model, ids, args.max_new_tokens, **padding_kwargs)
+warmup_model(model, ids, args.max_new_tokens, args.compile_dynamic_sendnn, **padding_kwargs)
 
 ### AIU generation loop
 static_tokens = validation_info.get_info("tokens")
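scripts/validation.py gains the identical flag. Because action="store_true" defaults to False, args.compile_dynamic_sendnn stays False unless the flag is passed, so existing invocations of both scripts keep the full-length static-shape warmup. An illustrative run, again omitting the other required arguments:

python scripts/validation.py --compile --compile_dynamic_sendnn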
