foundation-model-stack · avery-blanchard · Apr 8, 2025 · JRosenkranz · Apr 14, 2025
diff --git a/tests/models/test_decoders.py b/tests/models/test_decoders.py
@@ -148,6 +148,8 @@
     os.environ["VLLM_DT_MAX_CONTEXT_LEN"] = str((((max(common_seq_lengths) + max(common_max_new_tokens)) // 64) + 1) * 64)
     os.environ["VLLM_DT_MAX_BATCH_SIZE"] = str(max(common_batch_sizes))
 
+cache_params = list(itertools.product([common_model_paths[0]], [common_batch_sizes[0]], [common_seq_lengths[0]], [common_max_new_tokens[0]], ["miss", "hit"]))
+
 # thresholds are chosen based on 1024 tokens per sequence
 # 1% error threshold rate between cpu fp32 and cuda fp16
 # if a models failure thresholds do not exist in this dict, default to the default_metrics_threshold defined above
@@ -214,7 +216,7 @@ def reset_compiler():
         torch.compiler.reset()
         torch._dynamo.reset()
         os.environ.pop("COMPILATION_MODE", None)
-
+        os.environ.pop('TORCH_SENDNN_CACHE_ENABLE', None)
 
 # TODO: Currently, gptq does not have the same level of support as non-gptq models for get_model. This method provides the extra requirements for gptq for get_model,
 #  however ideally, these fixes should be done in foundation-model-stack.
@@ -260,7 +262,6 @@ def __maybe_get_gptq_kwargs(model_path):
         pass
     return gptq_kwargs_aiu, gptq_kwargs_cpu
 
-
 def __prepare_inputs(batch_size, seq_length, tokenizer, seed=0):
     prompts_and_sizes = sample_sharegpt_requests(
         SHARE_GPT_DATASET_PATH,
@@ -620,3 +621,57 @@ def _metric_calculator(r: torch.Tensor, t: torch.Tensor):
         print("passed validation level 1")
     else:
         print("passed validation level 0")
+
+@pytest.mark.parametrize("model_path,batch_size,seq_length,max_new_tokens,cache_status", cache_params)
+def test_cache(model_path, batch_size, seq_length, max_new_tokens, cache_status):
+    torch.manual_seed(42)
+    os.environ["TORCH_SENDNN_CACHE_ENABLE"] = "1"
+    os.environ["COMPILATION_MODE"] = "offline_decoder"
+
+    dprint(f"testing with cache: model={model_path}, batch_size={batch_size}, seq_length={seq_length}, max_new_tokens={max_new_tokens}, micro_model={USE_MICRO_MODELS}, cache={cache_status}")
+
+    if USE_MICRO_MODELS:
+        micro_model_kwargs = {"architecture": "hf_configured", "nlayers": 3}
+    else:
+        micro_model_kwargs  = {"architecture": "hf_pretrained"}
+
+    if not USE_MICRO_MODELS and os.path.exists(model_path):
+        model_path_kwargs = {"model_path": model_path}
+    else:
+        model_path_kwargs = {"variant": model_path}
+
+    distributed_kwargs = {}
+    if USE_DISTRIBUTED:
+        distributed_kwargs["distr_param"] = "tp"
+        distributed_kwargs["group"] = dist.group.WORLD
+    get_model_kwargs = {**model_path_kwargs, **micro_model_kwargs, **distributed_kwargs}
+
+    tokenizer = tokenizers.get_tokenizer(model_path)
+
+    # prepare the AIU model
+    model = get_model(
+        device_type="cpu",
+        fused_weights=False,
+        **get_model_kwargs
+    )
+
+    model.eval()
+    torch.set_grad_enabled(False)
+    model.compile(backend="sendnn_decoder")
+
+
+    # prepare input_ids
+    input_ids, padding_kwargs = __prepare_inputs(batch_size, seq_length, tokenizer)
+
+    # warmup aiu model
+    warmup_model(model, input_ids, max_new_tokens, **padding_kwargs)
+
+    # aiu validatation 
+    aiu_validation_info = extract_validation_information(
+        model,
+        input_ids,
+        max_new_tokens,
+        None,
+        only_last_token=True,
+        **padding_kwargs
+)