
Commit 1a77f63

Merge pull request #75 from andrea-fasoli/fp8_fixes
Fix mask creation for QA
2 parents 8d703be + 3564da2

4 files changed: +21 −20 lines

aiu_fms_testing_utils/utils/args_parsing.py

Lines changed: 7 additions & 12 deletions
@@ -12,21 +12,18 @@ def get_args(parser: argparse.ArgumentParser) -> argparse.Namespace:
     args_model_loading.add_argument(
         "--architecture",
         type=str,
-        help="The model architecture to benchmark",
+        help="The model architecture to benchmark.",
     )
     args_model_loading.add_argument(
         "--variant",
         type=str,
         default=None,
-        help="The model variant (configuration) to benchmark. E.g. 7b, 13b, 70b.",
+        help="The model variant (configuration) to benchmark (e.g., 7b, 13b, 70b).",
     )
     args_model_loading.add_argument(
         "--model_path",
         type=str,
-        help=(
-            "Path to the directory containing LLaMa weights "
-            "(.pth files sharded by tensor parallel rank, not HF weights)"
-        ),
+        help="Path to the directory containing the model checkpoint(s).",
     )
     args_model_loading.add_argument(
         "--model_source",
@@ -36,9 +33,7 @@ def get_args(parser: argparse.ArgumentParser) -> argparse.Namespace:
     args_model_loading.add_argument(
         "--unfuse_weights",
         action="store_true",
-        help=(
-            "If set to True, this will unfuse any fused weight modules"
-        ),
+        help="If True, this will unfuse any fused weight modules.",
     )
     args_model_loading.add_argument(
         "--default_dtype",
@@ -47,23 +42,23 @@ def get_args(parser: argparse.ArgumentParser) -> argparse.Namespace:
         choices=["bf16", "fp16", "fp32"],
         help=(
             "If set to one of the choices, overrides the model checkpoint "
-            "weight format by setting the default pytorch format"
+            "weight format by setting the default pytorch format."
         ),
     )
     parser.add_argument(
         "--cast_bf16_to_fp16",
         action="store_true",
         help=(
             "If set, cast any bf16 weights in the model to fp16 for AIU compiler. "
-            "Doesn't touch fp32 or quantized"
+            "Doesn't touch fp32 or quantized."
         )
     )
     parser.add_argument(
         "--cast_fp16_to_bf16",
         action="store_true",
         help=(
             "If set, cast any fp16 weights in the model to bf16 for GPU. "
-            "Doesn't touch fp32 or quantized"
+            "Doesn't touch fp32 or quantized."
         )
     )
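
Note on usage: get_args only attaches these options to a caller-supplied parser and returns the parsed namespace. A minimal sketch, assuming the import path from the file header above; the parser description and the example invocation values are illustrative, not from this repository:

# Hedged sketch: wiring get_args into a caller script. Only get_args and the
# flag names come from the diff; the description and example values are made up.
import argparse

from aiu_fms_testing_utils.utils.args_parsing import get_args

parser = argparse.ArgumentParser(description="Encoder benchmarking")
args = get_args(parser)  # registers the flags above and parses them into a Namespace

# Example invocation (illustrative values):
#   python my_script.py --architecture roberta --variant base \
#       --model_path /path/to/checkpoints --unfuse_weights --default_dtype fp32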

aiu_fms_testing_utils/utils/encoders_utils.py

Lines changed: 9 additions & 4 deletions
@@ -160,9 +160,13 @@ def convert_batch_to_fms_style(
         self,
         batch: dict[str, torch.Tensor],
     ) -> dict[str, torch.Tensor]:
-        """FMS uses a different standard than HF for encoder inputs."""
+        """FMS uses a different standard than HF for encoder inputs.
 
-        return {'x': batch['input_ids'], 'mask': batch['attention_mask']}
+        The mask is also handled differently in FMS: it is correctly processed by SDPA
+        only if provided as boolean. A floating binary mask would not be converted.
+        """
+
+        return {'x': batch['input_ids'], 'mask': batch['attention_mask'].to(torch.bool)}
 
     def process_eval_set(self) -> None:
         """Pre-process evaluation dataset for QuestionAnswering task."""
@@ -210,7 +214,7 @@ def process_eval_set(self) -> None:
                 f"Using max_prompt_length={model_max_length} instead."
             )
         self.max_prompt_length = min(
-            args.max_seq_length,
+            args.max_prompt_length,
             model_max_length,
         )
@@ -593,7 +597,8 @@ def run_evaluation(self) -> None:
         all_end_logits = []
         for step, batch in enumerate(eval_dataloader):
             with torch.no_grad():
-                dprint(f"Step {step + 1} / {len(eval_dataloader)}")
+                if args.verbose:
+                    dprint(f"Step {step + 1} / {len(eval_dataloader)}")
                 batch = self.convert_batch_to_fms_style(batch)
                 batch = move_to_device(batch, args.device)
                 start_logits, end_logits = self.model(**batch)
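
The docstring added above is the core of this fix: torch's SDPA (scaled_dot_product_attention) treats a boolean mask as attend/ignore, but simply adds a floating-point mask to the attention scores, so an HF-style 0/1 float mask would leave padded positions visible. A minimal sketch of the difference, using synthetic tensors (not repository code):

# Illustrative only: why the attention_mask must be cast to bool before SDPA.
import torch
import torch.nn.functional as F

q = k = v = torch.randn(1, 2, 4, 8)      # (batch, heads, seq_len, head_dim)
hf_mask = torch.tensor([[1, 1, 1, 0]])   # HF convention: 1 = token, 0 = padding

bool_mask = hf_mask.to(torch.bool)[:, None, None, :]  # True = attend, False = mask out
float_mask = hf_mask.float()[:, None, None, :]        # added to the attention scores

out_bool = F.scaled_dot_product_attention(q, k, v, attn_mask=bool_mask)
out_float = F.scaled_dot_product_attention(q, k, v, attn_mask=float_mask)

# With the float mask the padded position only gets +0/+1 on its score, so it
# still contributes to the output; the two results differ.
print(torch.allclose(out_bool, out_float))  # expected: False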

aiu_fms_testing_utils/utils/model_setup.py

Lines changed: 2 additions & 2 deletions
@@ -152,8 +152,8 @@ def print_model_params(model: nn.Module, args: argparse.Namespace) -> None:
 
     if args.verbose:
         dprint("="*60 + "\n")
-        dprint("\n".join(
-            f"{k:80} {str(list(v.size())):15} {str(v.dtype):18} {str(v.device):10} "
+        dprint("\n" + "\n".join(
+            f"{k:70} {str(list(v.size())):15} {str(v.dtype):20} {str(v.device):10} "
             f"{v.float().min().item():12.4f} {v.float().max().item():12.4f}"
             for k,v in model.state_dict().items()
         ))
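
For reference, this is the per-parameter line the adjusted format string produces, with the new 70/20 column widths; the parameter name and tensor below are synthetic:

# Illustrative reproduction of one line printed by print_model_params.
import torch

k = "roberta.encoder.layer.0.attention.self.query.weight"
v = torch.randn(768, 768)
print(
    f"{k:70} {str(list(v.size())):15} {str(v.dtype):20} {str(v.device):10} "
    f"{v.float().min().item():12.4f} {v.float().max().item():12.4f}"
)
# -> name padded to 70 chars, then shape, dtype, device, and min/max values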

scripts/run_encoders.py

Lines changed: 3 additions & 2 deletions
@@ -5,6 +5,7 @@
 # Third Party
 from fms.models import get_model
 from fms.models.roberta import RoBERTaForQuestionAnswering, RoBERTa
+from fms.models.hf.roberta.modeling_roberta_hf import HFAdaptedRoBERTaForMaskedLM
 from fms.utils import tokenizers
 from torch import distributed, set_grad_enabled
 
@@ -66,7 +67,6 @@
         group=distributed.group.WORLD,
         linear_config=linear_config,
         fused_weights=args.fused_weights,
-        attn_name="math_fp8",
     )
 
     if args.force_16b_dtype:
@@ -100,7 +100,8 @@
 
     if isinstance(model, RoBERTaForQuestionAnswering):
         run_encoder_eval_qa(model, tokenizer, args)
-    elif isinstance(model, RoBERTa): # basic MaskedLM downstream task
+    elif isinstance(model, RoBERTa) or isinstance(model, HFAdaptedRoBERTaForMaskedLM):
+        # basic MaskedLM downstream task
         run_encoder_eval_mlm(model, tokenizer, args)
 
     if args.distributed:
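
The two-way check in the last hunk can equivalently use isinstance with a class tuple; a hedged sketch (run_encoder_eval_qa and run_encoder_eval_mlm are the helpers already used by scripts/run_encoders.py, assumed to be in scope):

# Sketch only: equivalent dispatch using the tuple form of isinstance().
from fms.models.roberta import RoBERTa, RoBERTaForQuestionAnswering
from fms.models.hf.roberta.modeling_roberta_hf import HFAdaptedRoBERTaForMaskedLM

def dispatch_encoder_eval(model, tokenizer, args) -> None:
    if isinstance(model, RoBERTaForQuestionAnswering):
        run_encoder_eval_qa(model, tokenizer, args)
    elif isinstance(model, (RoBERTa, HFAdaptedRoBERTaForMaskedLM)):
        # basic MaskedLM downstream task
        run_encoder_eval_mlm(model, tokenizer, args)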
