
Commit 0cd0d7b

Add 16b forced casting
Signed-off-by: Andrea Fasoli <andrea.fasoli@ibm.com>
1 parent e989a23 commit 0cd0d7b

5 files changed (+90 -3 lines changed)

aiu_fms_testing_utils/utils/args_parsing.py

Lines changed: 17 additions & 0 deletions

@@ -50,6 +50,22 @@ def get_args(parser: argparse.ArgumentParser) -> argparse.Namespace:
             "weight format by setting the default pytorch format"
         ),
     )
+    parser.add_argument(
+        "--cast_bf16_to_fp16",
+        action="store_true",
+        help=(
+            "If set, cast any bf16 weights in the model to fp16 for AIU compiler. "
+            "Doesn't touch fp32 or quantized"
+        )
+    )
+    parser.add_argument(
+        "--cast_fp16_to_bf16",
+        action="store_true",
+        help=(
+            "If set, cast any fp16 weights in the model to bf16 for GPU. "
+            "Doesn't touch fp32 or quantized"
+        )
+    )
 
     # Quantization arguments
     args_quantization = parser.add_argument_group("Model quantization")

@@ -260,6 +276,7 @@ def get_args(parser: argparse.ArgumentParser) -> argparse.Namespace:
     args.is_aiu_backend = "aiu" in args.device_type
     args.dynamo_backend = "sendnn" if args.is_aiu_backend else "inductor"
     args.fused_weights = not args.unfuse_weights
+    args.force_16b_dtype = args.cast_bf16_to_fp16 or args.cast_fp16_to_bf16
 
     if args.verbose:
         dprint("=" * 60)

aiu_fms_testing_utils/utils/encoders_utils.py

Lines changed: 2 additions & 2 deletions

@@ -597,8 +597,8 @@ def run_evaluation(self) -> None:
             batch = self.convert_batch_to_fms_style(batch)
             batch = move_to_device(batch, args.device)
             start_logits, end_logits = self.model(**batch)
-            all_start_logits.append(start_logits.cpu().numpy())
-            all_end_logits.append(end_logits.cpu().numpy())
+            all_start_logits.append(start_logits.to(torch.float16).cpu().numpy())
+            all_end_logits.append(end_logits.to(torch.float16).cpu().numpy())
         eval_duration = time.time() - start_time
         dprint(
             f"Runtime: {eval_duration:.0f} s | "

aiu_fms_testing_utils/utils/model_setup.py

Lines changed: 27 additions & 0 deletions

@@ -120,6 +120,33 @@ def setup_model(args: argparse.Namespace) -> tuple[str | None, torch.device, str
     return default_dtype, device, dist_strat
 
 
+def recast_16b(model: nn.Module, args: argparse.Namespace) -> None:
+    """Cast 16-bit model parameters to selected datatype."""
+
+    if args.cast_bf16_to_fp16:
+        dprint(
+            "Casting all BF16 model parameters to FP16 "
+            "(--cast_bf16_to_fp16 flag is enabled)"
+        )
+        for name, param in model.named_parameters():
+            if param.dtype == torch.bfloat16:
+                if param.max() > torch.finfo(torch.float16).max:
+                    dprint(
+                        f"[WARNING] Casting param {name} to fp16 will truncate the "
+                        "tensor. This may cause accuracy loss. Ignore this warning if "
+                        "this is intended."
+                    )
+                param.data = param.data.to(dtype=torch.float16)
+    elif args.cast_fp16_to_bf16:
+        dprint(
+            "Casting all FP16 model parameters to BF16 "
+            "(--cast_fp16_to_bf16 flag is enabled)"
+        )
+        for param in model.parameters():
+            if param.dtype == torch.float16:
+                param.data = param.data.to(dtype=torch.bfloat16)
+
+
 def print_model_params(model: nn.Module, args: argparse.Namespace) -> None:
     """Printout model and list of model parameters with related statistics."""
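The overflow check matters because bf16 and fp16 trade range for precision in opposite directions: bf16 represents magnitudes up to roughly 3.4e38, while fp16 tops out at 65504, so bf16 values above that become inf after the cast. A self-contained rehearsal of the bf16-to-fp16 branch on a toy layer (plain torch, not the project's real setup path):

import torch
from torch import nn

model = nn.Linear(4, 4).to(torch.bfloat16)
fp16_max = torch.finfo(torch.float16).max  # 65504.0

with torch.no_grad():
    model.weight.fill_(1e5)  # representable in bf16, above the fp16 range

for name, param in model.named_parameters():
    if param.dtype == torch.bfloat16:
        if param.max() > fp16_max:
            print(f"{name}: values above fp16 max, cast will overflow to inf")
        param.data = param.data.to(dtype=torch.float16)

print(model.weight.dtype)   # torch.float16
print(model.weight.max())   # inf

Note that only param.max() is checked, so a large negative bf16 value would overflow to -inf without triggering the warning.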

aiu_fms_testing_utils/utils/quantization_setup.py

Lines changed: 33 additions & 0 deletions

@@ -6,6 +6,7 @@
 import os
 
 # Third Party
+import torch
 from torch import nn
 
 # Local Packages

@@ -133,3 +134,35 @@ def select_int8_module(
     else:
         linear_config = {"linear_type": "torch_linear"}
     return linear_config
+
+
+def validate_quantization(model: nn.Module, args: argparse.Namespace) -> None:
+    """Ensure compatibility of FP8 models with device-specific operations."""
+
+    has_fp8_weights = False
+    has_bf16_weights = False
+    has_fp16_weights = False
+    for param in model.parameters():
+        if param.dtype == torch.float8_e4m3fn:
+            has_fp8_weights = True
+        elif param.dtype == torch.bfloat16:
+            has_bf16_weights = True
+        elif param.dtype == torch.float16:
+            has_fp16_weights = True
+
+    if has_fp8_weights:
+        if args.is_aiu_backend and has_bf16_weights and not args.cast_bf16_to_fp16:
+            raise ValueError(
+                "FP8 checkpoints on AIU with bf16 weights require casting to fp16 "
+                "using --cast_bf16_to_fp16. Do not use --default_dtype!"
+            )
+        elif (
+            args.device.type == "cuda"
+            and has_fp16_weights
+            and not args.cast_fp16_to_bf16
+        ):
+            raise ValueError(
+                "FP8 checkpoints on GPU with fp16 weights require casting to bf16 "
+                "using --cast_fp16_to_bf16. Do not use --default_dtype!"
+            )
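A hypothetical failure-mode sketch: a toy module carrying one fp8 weight and one leftover bf16 parameter, with a hand-built args namespace standing in for the script's parsed arguments. Assuming aiu_fms_testing_utils and its dependencies are importable, validate_quantization should raise when such a checkpoint targets AIU without --cast_bf16_to_fp16:

import argparse
import torch
from torch import nn

from aiu_fms_testing_utils.utils.quantization_setup import validate_quantization


class ToyFP8(nn.Module):
    def __init__(self):
        super().__init__()
        # fp8 "quantized" weight plus a bf16 parameter left unconverted
        self.w8 = nn.Parameter(
            torch.zeros(2, 2).to(torch.float8_e4m3fn), requires_grad=False
        )
        self.head = nn.Parameter(torch.zeros(2, dtype=torch.bfloat16))


args = argparse.Namespace(
    is_aiu_backend=True,
    device=torch.device("cpu"),
    cast_bf16_to_fp16=False,
    cast_fp16_to_bf16=False,
)

try:
    validate_quantization(ToyFP8(), args)
except ValueError as err:
    print(err)  # "FP8 checkpoints on AIU with bf16 weights require casting to fp16 ..."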

scripts/run_encoders.py

Lines changed: 11 additions & 1 deletion

@@ -16,10 +16,15 @@
     run_encoder_eval_qa,
     run_encoder_eval_mlm,
 )
-from aiu_fms_testing_utils.utils.model_setup import setup_model, print_model_params
+from aiu_fms_testing_utils.utils.model_setup import (
+    setup_model,
+    print_model_params,
+    recast_16b
+)
 from aiu_fms_testing_utils.utils.quantization_setup import (
     import_addons,
     get_linear_config,
+    validate_quantization,
 )
 
 parser = argparse.ArgumentParser(

@@ -61,9 +66,14 @@
     group=distributed.group.WORLD,
     linear_config=linear_config,
     fused_weights=args.fused_weights,
+    attn_name="math_fp8",
 )
 
+if args.force_16b_dtype:
+    recast_16b(model, args)
+
 if args.is_quantized:
+    validate_quantization(model, args)
     print_model_params(model, args)
 
 tokenizer = tokenizers.get_tokenizer(args.tokenizer)
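Ordering note: recast_16b runs before validate_quantization, so with --cast_bf16_to_fp16 an FP8 checkpoint's leftover bf16 weights have already been converted by the time the validation scan looks at dtypes. A toy stand-in (plain nn.Linear, no actual fp8 weights; names and shapes are illustrative only) showing why the scan then passes:

import torch
from torch import nn

# Toy stand-in for a checkpoint that still carries bf16 parameters.
model = nn.Linear(4, 4).to(torch.bfloat16)

# Step 1 -- what recast_16b does under --cast_bf16_to_fp16:
for param in model.parameters():
    if param.dtype == torch.bfloat16:
        param.data = param.data.to(dtype=torch.float16)

# Step 2 -- the dtype scan validate_quantization performs afterwards:
has_bf16 = any(p.dtype == torch.bfloat16 for p in model.parameters())
print(has_bf16)  # False -> the AIU/bf16 error path is never reached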

0 commit comments
