From 6e5c0d2c6ccd6520068991ad9304fa35b20df610 Mon Sep 17 00:00:00 2001 From: Andrea Fasoli Date: Tue, 22 Apr 2025 21:13:40 -0400 Subject: [PATCH 01/30] Refactor argument parsing Signed-off-by: Andrea Fasoli --- aiu_fms_testing_utils/utils/args_parsing.py | 357 ++++++++++++++++++++ 1 file changed, 357 insertions(+) create mode 100644 aiu_fms_testing_utils/utils/args_parsing.py diff --git a/aiu_fms_testing_utils/utils/args_parsing.py b/aiu_fms_testing_utils/utils/args_parsing.py new file mode 100644 index 0000000..ce0892e --- /dev/null +++ b/aiu_fms_testing_utils/utils/args_parsing.py @@ -0,0 +1,357 @@ +# Standard +import argparse + +# Local Packages +from aiu_fms_testing_utils.utils.aiu_setup import dprint + + +def get_args(parser: argparse.ArgumentParser) -> argparse.Namespace: + + # FMS model loading arguments + parser.add_argument( + "--architecture", + type=str, + help="The model architecture to benchmark", + ) + parser.add_argument( + "--variant", + type=str, + default=None, + help="The model variant (configuration) to benchmark. E.g. 7b, 13b, 70b.", + ) + parser.add_argument( + "--model_path", + type=str, + help=( + "Path to the directory containing LLaMa weights " + "(.pth files sharded by tensor parallel rank, not HF weights)" + ), + ) + parser.add_argument( + "--model_source", + type=str, + help="Source of the checkpoint. E.g. 'meta', 'hf', None", + ) + parser.add_argument( + "--unfuse_weights", + action="store_true", + help=( + "If set to True, this will unfuse any fused weight modules that " + "support the unfuse_weights method" + ), + ) + parser.add_argument( + "--default_dtype", + type=str, + default=None, + choices=["bf16", "fp16", "fp32"], + help=( + "If set to one of the choices, overrides the model checkpoint " + "weight format by setting the default pytorch format" + ), + ) + + # Quantization arguments + parser.add_argument( + "--quantization", + type=str, + choices=["gptq", "int8"], + default=None, + help="Type of quantization of the model checkpoint", + ) + parser.add_argument( + "--int8_weight_per_channel", + action="store_true", + help="Enable per-channel weight quantization in INT8 quantized model", + ) + parser.add_argument( + "--int8_activ_quant_type", + default="per_token", + choices=["per_token", "per_tensor_symm", "per_tensor_asymm"], + type=str, + help="Define strategy for activation quantization in INT8 quantized model", + ) + parser.add_argument( + "--int8_smoothquant", + action="store_true", + help="Enable smoothquant in INT8 quantized model", + ) + parser.add_argument( # NOTE: roberta only so far but should expand to LLM + "--direct_quantization", + action="store_true", + help="Train INT8 model with Direct Quantization", + ) + parser.add_argument( + "--num_dq_samples", + type=int, + default=128, + help="number of samples used for Direct Quantization", + ) + + # General settings + parser.add_argument( + "--device_type", + type=str, + choices=["cuda", "cpu", "aiu", "aiu-senulator"], + default="cuda", + help="The device to run the model on" + ) + parser.add_argument( + "--seed", + type=int, + default=81072, + help="Run seed (only needed if eval dataset is shuffled)", + ) + parser.add_argument( + "--output_path", + type=str, + default="", + help="path of folder to save outputs to, if empty don't save", + ) + parser.add_argument( + "--tokenizer", + type=str, + required=True, + help="Path to the tokenizer (e.g. 
~/tokenizer.model)", + ) + parser.add_argument( + "--no_use_cache", + action="store_false", + help="Disable the kv-cache (on by default)", + ) + parser.add_argument( + "--deterministic", + action="store_true", + help="`deterministic` requires env variable `CUBLAS_WORKSPACE_CONFIG=:4096:8`", + ) + parser.add_argument( + "--distributed", + action="store_true", + help="This is a distributed job (multiple instances run with RANK+WORLD_SIZE)", + ) + parser.add_argument( # could be a bool / flag + '-v', '--verbose', + action='count', + default=0, + help="Set verbosity level (pass flag as `-v`, `-vv`, `-vvv`)" + ) + + # Compiling arguments + parser.add_argument( + "--compile", + action="store_true", + help="Use torch.compile (slow for first inference pass)", + ) + parser.add_argument( + "--compile_mode", + type=str, + help="Mode for compilation (only valid for inductor backend)", + default="default", + choices=["default", "reduce-overhead"], + ) + parser.add_argument( + "--compile_backend", + type=str, + help="Backend for compilation (only when not running on AIU)", + default="inductor", + choices=["inductor", "eager", "aot_eager"], + ) + parser.add_argument( + "--compile_dynamic", + action="store_true", + help="Use dynamic shapes with torch.compile", + ) + + # LLM-specific inference arguments + parser.add_argument( + "--batch_size", + type=int, + default=1, + help="size of input batch", + ) + parser.add_argument( + "--max_prompt_length", + type=int, + default=None, + help=( + "Cap the number of tokens per prompt to a maximum length prior to padding. " + "If None, there will be no cap." + ), + ) + parser.add_argument( + "--min_pad_length", + type=int, + default=0, + help=( + "Pad inputs to a minimum specified length. If any prompt is larger than " + "the specified length, padding will be determined by the largest prompt" + ), + ) + parser.add_argument( + "--fixed_prompt_length", + type=int, + default=0, + help=( + "If defined, overrides both min_pad_length and max_prompt_length. " + "Pads input to fixed_prompt_length, fails if any input needs truncation." + ), + ) + parser.add_argument( + "--max_new_tokens", + type=int, + help="max number of generated tokens", + default=100, + ) + parser.add_argument( + "--no_early_termination", + action="store_true", + help="disable early termination on generation", + ) + parser.add_argument( + "--prompt_type", + type=str, + choices=["chat", "code"], + default="chat", + help="type of prompts to be used, either chat or code", + ) + parser.add_argument( + "--prompt_path", + type=str, + default="", + help=( + "If set, load the prompts from file(s) instead of the local examples. " + "Supports glob-style patterns" + ), + ) + parser.add_argument( + "--timing", + type=str, + choices=["e2e", "per-token"], + default="", + help="if set, how to time the generation of tokens, e2e or per-token", + ) + parser.add_argument( + "--iters", + type=int, + default=1, + help=( + "Number of iterations of inference to perform. Used for variance " + "performance capture." 
+ ), + ) + + # RoBERTa-specific evaluation arguments + parser.add_argument( + "--dataset_name", + type=str, + default="squad_v2", + help="The name of the dataset to use (via the datasets library).", + ) + parser.add_argument( + "--dataset_config_name", + type=str, + default=None, + help="The configuration name of the dataset to use (via the datasets library).", + ) + parser.add_argument( + "--n_best_size", + type=int, + default=20, + help="Total number of n-best predictions to generate.", + ) + parser.add_argument( + "--null_score_diff_threshold", + type=float, + default=0.0, + help=( + "The threshold used to select the null answer: if the best answer has a " + "score that is less than the score of the null answer minus this threshold, " + "the null answer is selected for this example. Only useful when " + "`version_2_with_negative=True`." + ), + ) + parser.add_argument( + "--version_2_with_negative", + type=bool, + default=True, + help="If true, some of the examples do not have an answer.", + ) + parser.add_argument( + "--max_answer_length", + type=int, + default=30, + help=( + "The maximum length of an answer that can be generated. This is needed " + "because the start and end predictions are not conditioned on one another." + ), + ) + parser.add_argument( + "--validation_file", + type=str, + default=None, + help="A csv or a json file containing the validation data.", + ) + parser.add_argument( + "--max_seq_length", + type=int, + default=384, + help=( + "The maximum total input sequence length after tokenization. " + "Sequences longer than this will be truncated, " + "sequences shorter will be padded if `--pad_to_max_length` is passed." + ), + ) + parser.add_argument( + "--pad_to_max_length", + action="store_true", + help=( + "If passed, pad all samples to `max_seq_length`. " + "Otherwise, dynamic padding is used." + ), + ) + parser.add_argument( + "--max_eval_samples", + type=int, + default=None, + help=( + "For debugging purposes or quicker training, truncate the number of " + "evaluation examples to this value if set." + ), + ) + parser.add_argument( + "--preprocessing_num_workers", type=int, default=1, help="" + ) + parser.add_argument( + "--overwrite_cache", + action="store_true", + help="Overwrite the cached training and evaluation sets", + ) + parser.add_argument( + "--doc_stride", + type=int, + default=128, + help=( + "When splitting up a long document into chunks how much stride " + "to take between chunks." + ), + ) + parser.add_argument( # NOTE: consider replacing in code with batch_size (DQ vs eval?) 
+ "--per_device_eval_batch_size", + type=int, + default=1, + help="Batch size (per device) for the evaluation dataloader.", + ) + args = parser.parse_args() + + # Add convenient arguments to parser + args.is_encoder = "bert" in args.architecture.lower() # TODO: improve this check + args.is_quantized = args.quantization is not None + args.is_aiu_backend = "aiu" in args.device_type + args.dynamo_backend = "sendnn" if args.is_aiu_backend else "inductor" + args.fused_weights = not args.unfuse_weights + + if args.verbose: + dprint("=" * 60) + dprint(args) + dprint("=" * 60) + return args From 5c0d9eceaa8d1233269fff3d277222cd7dabab4a Mon Sep 17 00:00:00 2001 From: Andrea Fasoli Date: Tue, 22 Apr 2025 21:14:02 -0400 Subject: [PATCH 02/30] Refactor model setup Signed-off-by: Andrea Fasoli --- aiu_fms_testing_utils/utils/model_setup.py | 114 +++++++++++++++++++++ 1 file changed, 114 insertions(+) create mode 100644 aiu_fms_testing_utils/utils/model_setup.py diff --git a/aiu_fms_testing_utils/utils/model_setup.py b/aiu_fms_testing_utils/utils/model_setup.py new file mode 100644 index 0000000..64cffb2 --- /dev/null +++ b/aiu_fms_testing_utils/utils/model_setup.py @@ -0,0 +1,114 @@ +# Standard +import argparse +import os +import sys + +# Third party +import numpy as np +import random +import torch +from torch import distributed + +# Local +from aiu_fms_testing_utils.utils.aiu_setup import dprint, rank, local_rank, world_size +from aiu_fms_testing_utils.utils import aiu_setup + + +def get_default_dtype(args: argparse.Namespace) -> torch.dtype | None: + """Return default_dtype for non-quantized models, otherwise None. + If default_dtype is provided, it is set as torch default for non-quantized models. + """ + + default_dtype = None + if not args.is_quantized: + dtypes_map = { + "fp16": torch.float16, + "bf16": torch.bfloat16, + "fp32": torch.float32, + } + if args.default_dtype is not None: + default_dtype = dtypes_map[args.default_dtype] + if default_dtype is not None: + torch.set_default_dtype(default_dtype) + return default_dtype + + +def get_device(args: argparse.Namespace) -> torch.device: + """Return torch device and, if needed, set up AIU and its env variables. + NOTE: args.device_type is str, but this function returns torch.device. + """ + + if args.device_type == "cuda": + device = torch.device(args.device_type, local_rank) + torch.cuda.set_device(device) + elif args.is_aiu_backend: + from torch_sendnn import torch_sendnn + + if args.distributed: + aiu_setup.aiu_dist_setup( + distributed.get_rank(), + distributed.get_world_size(), + ) + else: + aiu_setup.aiu_setup(rank, world_size) + aiu_setup.set_aiu_env_vars(args) + device = torch.device("cpu") + else: + device = torch.device(args.device_type) + return device + + +def print_system_setup(args: argparse.Namespace) -> None: + """Display system info (rank 0 only).""" + + if rank == 0 and args.verbose: + dprint("-"*60) + dprint( + f"Python Version : {sys.version_info.major}." + f"{sys.version_info.minor}.{sys.version_info.micro}" + ) + dprint(f"PyTorch Version : {torch.__version__}") + dprint(f"Dynamo Backend : {args.device_type} -> {args.dynamo_backend}") + dprint(f"Distributed : {args.distributed}") + if args.device_type == 'aiu': + for peer_rank in range(aiu_setup.world_size): + pcie_env_str="AIU_WORLD_RANK_"+str(peer_rank) + dprint(f"PCI Addr. for Rank {peer_rank} : {os.environ[pcie_env_str]}") + print("-"*60) + + +def set_determinism(args: argparse.Namespace) -> None: + """Set determinism. 
+ NOTE: torch determinism requires env variable: `CUBLAS_WORKSPACE_CONFIG=:4096:8` + """ + + if args.deterministic: + random.seed(args.seed) + torch.manual_seed(args.seed) + np.random.seed(args.seed) + torch.use_deterministic_algorithms(True) + + +def get_distributed_strategy(args: argparse.Namespace) -> str | None: + """Return distributed strategy.""" + + if args.distributed: + dist_strat = "tp" + else: + if torch.cuda.device_count() > 1 and world_size == 1: + dist_strat = "mp" + else: + dist_strat = None + return dist_strat + + +def setup_model(args: argparse.Namespace) -> tuple[str | None, torch.device, str]: + """Entry point for model setup.""" + + default_dtype = get_default_dtype(args) + device = get_device(args) + print_system_setup(args) + set_determinism(args) + dist_strat = get_distributed_strategy(args) + + return default_dtype, device, dist_strat From d7730c88069996da43e7a08a84bdfc4064d8b808 Mon Sep 17 00:00:00 2001 From: Andrea Fasoli Date: Tue, 22 Apr 2025 21:14:58 -0400 Subject: [PATCH 03/30] Refactor setup of quantization (addons, linear_config) Signed-off-by: Andrea Fasoli --- .../utils/quantization_setup.py | 147 ++++++++++++++++++ 1 file changed, 147 insertions(+) create mode 100644 aiu_fms_testing_utils/utils/quantization_setup.py diff --git a/aiu_fms_testing_utils/utils/quantization_setup.py b/aiu_fms_testing_utils/utils/quantization_setup.py new file mode 100644 index 0000000..d602853 --- /dev/null +++ b/aiu_fms_testing_utils/utils/quantization_setup.py @@ -0,0 +1,147 @@ +# Standard +from functools import partial +from typing import Any +import argparse +import json +import os + +# Third Party +from transformers import PreTrainedModel + +# Local Packages +from aiu_fms_testing_utils.utils.aiu_setup import dprint, rank + + +def import_addons(args: argparse.Namespace) -> None: + """Import addons from FMS-MO. The import operation will register the selected + quantization addon (comprising adapter, linear module, and custom op) with FMS. + """ + + try: + if args.quantization == "gptq" and "aiu" in args.device_type: + from fms_mo.aiu_addons.gptq import gptq_aiu_adapter, gptq_aiu_linear + elif args.quantization == "int8": + from fms_mo.aiu_addons.i8i8 import i8i8_aiu_adapter, i8i8_aiu_linear + dprint("Loaded `aiu_addons` functionalities") + except: + raise ImportError(f"Failed to import {args.quantization} addons from FMS-MO.") + + +def get_linear_config(args: argparse.Namespace) -> dict[str, Any]: + """Return a linear_config dictionary to be used to instantiate quantized modules + by FMS get_model + """ + + fused_weights = not args.unfuse_weights + if args.quantization == "gptq": + if fused_weights and args.is_aiu_backend: + raise ValueError( + "GPTQ checkpoints on AIU must always run with --unfuse_weights" + ) + if args.default_dtype is not None: + raise ValueError( + "GPTQ default_dtype must be None to preserve the checkpoint data types." 
+ ) + + if "aiu" in args.device_type: + linear_type = "gptq_aiu" + elif args.device_type == "cpu": + linear_type = "gptq_cpu" + elif args.device_type == "cuda": + linear_type = "gptq" # GPTQ support on GPU is FMS-native + else: + raise ValueError(f"Unsupported device {args.device} for GPTQ") + + qconfig_path = args.model_path + "/quantize_config.json" + if os.path.exists(qconfig_path): + with open(qconfig_path, 'r') as f: + dprint(f"loading quantization config from {qconfig_path}") + qconfig = json.load(f) + group_size = qconfig["group_size"] + desc_act = qconfig["desc_act"] + if desc_act: + raise NotImplementedError( + "Activation reordering not supported at this time." + ) + else: + dprint( + "[WARNING] Could not locate quantization config file. " + "Default configuration will be used." + ) + group_size = 128 + desc_act = False + + linear_config = { + "linear_type": linear_type, + "group_size": group_size, + "desc_act": desc_act, + } + elif args.quantization == "int8": + if fused_weights and args.is_aiu_backend: + raise ValueError("INT8 checkpoints on AIU must always run with --unfuse_weights") + if args.default_dtype is not None: + raise ValueError( + "INT8 default_dtype must be None to preserve the checkpoint data types." + ) + + def select_int8_module( + module_name: str | None = None, + smoothquant: bool = True, + smoothquant_layers: list[str] | None = None, + ): + if module_name is None: + return "int8_aiu" + smoothquant_on_module = ( + any([m in module_name for m in smoothquant_layers]) + if smoothquant_layers is not None + else True + ) + use_smoothquant = smoothquant and smoothquant_on_module + return "int8_smoothquant_aiu" if use_smoothquant else "int8_aiu" + + if args.int8_smoothquant: + # TODO: load info from config saved during quantization + if any("granite" in p.lower() for p in [args.model_path, args.architecture]): + smoothquant_layers = ["key", "value", "w1", "wg"] + elif any("roberta" in p.lower() for p in [args.model_path, args.architecture]): + smoothquant_layers = ["query", "key", "value", "w1"] + else: + raise NotImplementedError( + "INT8 architecture does not support smoothquant." + ) + else: + smoothquant_layers = [] + + linear_config = { + "linear_type": partial( + select_int8_module, + smoothquant = args.int8_smoothquant, + smoothquant_layers = smoothquant_layers, + ), + "weight_per_channel": args.int8_weight_per_channel, + "activ_quant_type": args.int8_activ_quant_type, + } + else: + linear_config = {"linear_type": "torch_linear"} + return linear_config + + +def print_model_params(model: PreTrainedModel, args: argparse.Namespace) -> None: + """Printout model and list of model parameters with related statistics.""" + + if rank == 0 and args.verbose > 0: + dprint("="*60 + "\n") + dprint("\n".join( + f"{k:80} {str(list(v.size())):15} {str(v.dtype):18} {str(v.device):10} " + f"{v.min().item():12.4f} {v.max().item():12.4f}" + for k,v in model.state_dict().items() + )) + dprint("="*60 + "\n") + if args.architecture == "llama": + # TODO: unused keys behavior in FMS may change to return ERRORS + dprint( + "[NOTE] In Llama models, it's OK for bias and rotary embeddings to be " + "marked as unused keys." 
+ ) + dprint(model) + dprint("="*60 + "\n") From 819b147d8b3480d318f1a9056b05520bf351fe0b Mon Sep 17 00:00:00 2001 From: Andrea Fasoli Date: Tue, 22 Apr 2025 21:15:33 -0400 Subject: [PATCH 04/30] Refactor LLM handling Signed-off-by: Andrea Fasoli --- aiu_fms_testing_utils/utils/decoders.py | 333 ++++++++++++++++++++++++ 1 file changed, 333 insertions(+) create mode 100644 aiu_fms_testing_utils/utils/decoders.py diff --git a/aiu_fms_testing_utils/utils/decoders.py b/aiu_fms_testing_utils/utils/decoders.py new file mode 100644 index 0000000..f0b6aa2 --- /dev/null +++ b/aiu_fms_testing_utils/utils/decoders.py @@ -0,0 +1,333 @@ +# Standard +from pathlib import Path +import argparse +import itertools +import os +import sys +import time + +# Third Party +from fms.utils import generation +from fms.utils.generation import generate, pad_input_ids +from transformers import PreTrainedModel, PreTrainedTokenizerBase +import numpy as np +import torch + +# Local Packages +from aiu_fms_testing_utils.utils.aiu_setup import dprint, local_rank + + +class DecoderInfer(): + """Run inference (generation) with LLM decoder models.""" + + def __init__( + self, + model: PreTrainedModel, + tokenizer: PreTrainedTokenizerBase, + args: argparse.Namespace, + device: torch.device, + ): + self.model = model + self.tokenizer = tokenizer + self.args = args + self.device = device + + self.add_special_tokens = False + self.has_padding = True + self.max_len = 0 + self.extra_generation_kwargs = {} + + # !!! Inference arguments (hardcoded, as in the original script) + self.do_sample = [False] + self.use_cache = [args.no_use_cache] # True/False identical with greedy iff `torch.use_deterministic_algorithms(True)` + + def ids_for_prompt(self, prompt): + """Process textual prompt and return tokenized ids.""" + + tokens = self.tokenizer.tokenize(prompt) + ids = self.tokenizer.convert_tokens_to_ids(tokens) + if self.add_special_tokens: + ids = [self.tokenizer.bos_token_id] + ids + ids = torch.tensor(ids, dtype=torch.long, device=self.device) + return ids + + def truncate_prompts_to_max_length(self, prompts, max_len, max_allowed_length): + """Truncate a series of prompts to a selected max length. + This function ensures prompt truncation prior to padding the input ids.""" + + if max_allowed_length is not None and max_len > max_allowed_length: + dprint(f"max prompt length is {max_len}, truncate to {max_allowed_length}") + prompts = [prompt[:max_allowed_length] for prompt in prompts] + return prompts + + def process_eval_set(self): + """Load textual prompts from file or use defaults prompts, then convert them + to ids. 
+ """ + + args = self.args + self.add_special_tokens = ( + self.tokenizer.bos_token_id != self.tokenizer.eos_token_id + ) + + if args.prompt_path != "": + # Before creating the Path object, check if prompt_path has a glob pattern + if isinstance(args.prompt_path, str): + prompt_path, sep, glob_pattern = args.prompt_path.partition("*") + else: + sep = "" + glob_pattern = "" + glob_pattern = sep + glob_pattern + + prompt_path = Path(os.path.expanduser(prompt_path)) + prompt_file_paths = [] + + if prompt_path.is_dir(): + if glob_pattern != "": + glob_pattern_list = [glob_pattern] + else: + glob_pattern_list = ["*.txt"] + for glob_pattern_possibility in glob_pattern_list: + file_list = list(prompt_path.glob(glob_pattern_possibility)) + if len(file_list) > 0: + prompt_file_paths = sorted(file_list) + break + + if prompt_path.is_file(): + prompt_file_paths = [prompt_path] + + # Check if we found some files + assert len(prompt_file_paths) > 0, f"Can't find any prompt files at {prompt_path}" + + # Check if we have enough files + assert ( + len(prompt_file_paths) >= args.batch_size + ), f"Not enough prompt files at {prompt_path} for a batch size of {args.batch_size}" + + prompts = [] + for i, prompt_file_path in enumerate(prompt_file_paths): + if i == args.batch_size: + break + prompts.append(self.ids_for_prompt(prompt_file_path.read_text(encoding="utf-8"))) + else: + if args.prompt_type == "chat": + template = "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n{}\n\n### Response:" + + prompt1 = template.format( + "Provide a list of instructions for preparing chicken soup." + ) + prompt2 = template.format("Explain some popular greetings in Spanish.") + prompt3 = template.format("Explain to me why ignorance is bliss.") + prompt4 = template.format( + "I have just come into a very large sum of money. Provide me a list of things that I can do with my new found wealth." + ) + elif args.prompt_type == "code": + template = "[INST] Write code to solve the following coding problem that obeys the constraints and passes the example test cases. Please wrap your code answer using ```:\n{}\n[/INST]" + prompt1 = template.format("Write a bubble sort function in python.") + prompt2 = template.format( + "Using the Java streams API, write a simple function which will get the cumulative sum of a list of integers." + ) + prompt3 = template.format( + "In bash, how do I list all directories and sub-directories which contain a .py file." + ) + prompt4 = template.format( + "Write a simple decorator in python which will modify all string inputs to ints if possible." + ) + else: + dprint("prompt_type must be one of chat or code") + exit() + + prompt1 = self.ids_for_prompt(prompt1) + prompt2 = self.ids_for_prompt(prompt2) + prompt3 = self.ids_for_prompt(prompt3) + prompt4 = self.ids_for_prompt(prompt4) + prompts = [prompt1, prompt2, prompt3, prompt4] + prompts = prompts * ((args.batch_size // 4) + 1) + prompts = prompts[: args.batch_size] + + if args.fixed_prompt_length != 0: + padding_length = args.fixed_prompt_length + max_allowed_length = args.fixed_prompt_length + else: + padding_length = args.min_pad_length + max_allowed_length = args.max_prompt_length + + self.has_padding = args.batch_size > 1 or padding_length != 0 + self.max_len = max([len(prompt) for prompt in prompts]) + + if args.fixed_prompt_length != 0 and args.fixed_prompt_length < self.max_len: + dprint( + "One or more prompts require truncation. 
Truncation has been disabled " + "because fixed_prompt_length was set." + ) + sys.exit(1) + prompts = self.truncate_prompts_to_max_length( + prompts, + self.max_len, + max_allowed_length, + ) + if self.has_padding: + ids, extra_generation_kwargs = pad_input_ids( + prompts, + min_pad_length=padding_length, + ) + else: + ids = prompts + if isinstance(ids, list) and len(ids) == 1: + ids = ids[0].unsqueeze(0) + extra_generation_kwargs = None + + self.extra_generation_kwargs = extra_generation_kwargs + + return ids + + def print_result(self, result, result_idx: int): + """Printout generation output.""" + + args = self.args + + if local_rank != 0: + return + if self.has_padding: + result = generation.trim_prefix(result) + + result = generation.trim_prefix(result, self.tokenizer.bos_token_id) + + # stop at EOS token if present and remove padding + if not args.no_early_termination: + result = generation.truncate_after_eos(result, self.tokenizer.eos_token_id) + + output_str = self.tokenizer.convert_tokens_to_string( + self.tokenizer.convert_ids_to_tokens(result) + ) + + if args.output_path != "": + output_path = Path(args.output_path) + output_path.mkdir(parents=True, exist_ok=True) + if output_path.is_dir(): + file_path = output_path / f"{result_idx}.txt" + with file_path.open("w", encoding="utf-8") as file: + file.write(output_str + "\n") + dprint(output_str) + print() + + def infer(self, ids, warmup): + """Run generation inference (warmup compiled model or per-warmed generation). + + NOTE: with greedy generation (do_sample=False) we _should_ always get the same + results. However, there is currently a bug in start_pos for batched rotary + embeddings that can lead varying results for the same prompt. + """ + + args = self.args + + for sample, cache in itertools.product(self.do_sample, self.use_cache): + if local_rank == 0 and not warmup: + dprint(f"use_cache {cache} | do_sample {sample}") + dprint("==================") + if ( + hasattr(self.model.config, "ntk_scaling") + and self.model.config.ntk_scaling + ): + max_seq_len = max(self.max_len, self.model.config.max_expected_seq_len) + else: + # w/o ntk scaling, extending the seq length too far gives bogus results + max_seq_len = self.model.config.max_expected_seq_len + + # Add only_last_token optimization + extra_generation_kwargs = ( + {} + if self.extra_generation_kwargs is None + else self.extra_generation_kwargs + ) + extra_generation_kwargs["only_last_token"] = True + + if args.device_type == "cpu": + # Bug in 2.3.1 fixed in 2.4.1 for SDPA flash cpu impl when pad too much + extra_generation_kwargs["attn_algorithm"] = "math" + + if not args.no_early_termination and not warmup: + eos_token_id = self.tokenizer.eos_token_id + else: + eos_token_id = None + + result = generate( + self.model, + ids, + max_new_tokens=args.max_new_tokens, + use_cache=cache, + do_sample=sample, + max_seq_len=max_seq_len, + timing=args.timing, + eos_token_id=eos_token_id, + contiguous_cache=True, + extra_kwargs=extra_generation_kwargs, + ) + if args.timing != "": + result, timings = result + if args.timing == "e2e": + dprint(f"E2E timing information: {timings[0]:.3f}s") + elif args.timing == "per-token": + if not warmup: + dprint(f"First-token latency: {timings[0]*1000:.3f} ms") + dprint(f"Average next-token latency: {np.mean(timings[1:])*1000:.3f} ms") + dprint(f"Average next-token latency (including first token): {np.mean(timings)*1000:.3f} ms") + dprint(f"Max next-token latency: {np.max(timings[1:])*1000:.3f} ms (token #{np.argmax(timings[1:]) + 2})") + dprint(f"Min 
next-token latency: {np.min(timings[1:])*1000:.3f} ms (token #{np.argmin(timings[1:]) + 2})") + dprint(f"Std deviation of next-token latencies: {np.std(timings[1:])*1000:.3f} ms") + timings = [f"{t*1000:.3f}" for t in timings] + dprint(f"Per-token timing information: {', '.join(timings)} ms") + if len(result.shape) == 1: + result = result.unsqueeze(0) + + if not warmup: + for i in range(result.shape[0]): + self.print_result(result[i], i) + + def run_warmup(self, ids): + """Run warmup cycle of compiled model.""" + + dprint(f"Start compilation warmup...") + pt_compile_model_start = time.time() + self.infer(ids, warmup=True) + dprint( + "PyTorch compile completed, " + f"took {time.time() - pt_compile_model_start:.2f} s." + ) + + if self.args.is_aiu_backend: + from torch_sendnn import torch_sendnn + + dprint("Executing update_lazyhandle and compiling for AIU") + update_lh_time = time.time() + torch_sendnn.update_lazyhandle() + update_lh_time = time.time() - update_lh_time + dprint(f"Update_lazyhandle completed, took {update_lh_time:.3f}s") + + if self.args.device_type == "aiu": # only run warmup for AIU, not senulator + aiu_warmup_time = time.time() + self.infer(ids, warmup=True) + aiu_warmup_time = time.time() - aiu_warmup_time + dprint(f"AIU warmup completed, took {aiu_warmup_time:.3f}s") + + def run_generation(self, ids): + """Run inference generation (not a warmup).""" + + dprint(f"Start generating output...") + for _ in range(self.args.iters): + self.infer(ids, warmup=False) + + +def run_decoder_eval( + model: PreTrainedModel, + tokenizer: PreTrainedTokenizerBase, + args: argparse.Namespace, + device: torch.device, + ): + """Entry point to run evaluation of LLM decoder models.""" + + decoder_infer = DecoderInfer(model, tokenizer, args, device) + ids = decoder_infer.process_eval_set() + if args.compile: + decoder_infer.run_warmup(ids) + decoder_infer.run_generation(ids) From 13c0917303e601b04eeb6170b5553148a0fcb993 Mon Sep 17 00:00:00 2001 From: Andrea Fasoli Date: Tue, 22 Apr 2025 21:15:59 -0400 Subject: [PATCH 05/30] Refactor RoBERTa handling Signed-off-by: Andrea Fasoli --- aiu_fms_testing_utils/utils/encoders.py | 646 ++++++++++++++++++++++++ 1 file changed, 646 insertions(+) create mode 100644 aiu_fms_testing_utils/utils/encoders.py diff --git a/aiu_fms_testing_utils/utils/encoders.py b/aiu_fms_testing_utils/utils/encoders.py new file mode 100644 index 0000000..80b3a52 --- /dev/null +++ b/aiu_fms_testing_utils/utils/encoders.py @@ -0,0 +1,646 @@ +# Standard +from tqdm import tqdm +import argparse +import collections +import json +import os +import time + +# Third Party +from datasets import load_dataset +from fms.models.hf import to_hf_api +from torch.utils.data import DataLoader +from transformers import ( + default_data_collator, + DataCollatorWithPadding, + EvalPrediction, + pipeline, + PreTrainedModel, + PreTrainedTokenizerBase, + RobertaTokenizerFast, +) +import evaluate +import numpy as np +import torch + +# Local Packages +from aiu_fms_testing_utils.utils.aiu_setup import dprint, rank + + +def get_roberta_tokenizer(tokenizer_path): + return RobertaTokenizerFast.from_pretrained(tokenizer_path) + + +def wrap_encoder(model): + """Add config info and wrapper to run pipeline for RoBERTa MaskedLM.""" + + model.config.linear_config.pop("linear_type", None) + return to_hf_api(model, task_specific_params=None) + + +class EncoderQAInfer(): + """Run QuestionAnswering task with encoder models.""" + + def __init__( + self, + model: PreTrainedModel, + tokenizer: PreTrainedTokenizerBase, + 
args: argparse.Namespace, + ): + self.model = model + self.tokenizer = tokenizer + self.args = args + + self.question_column_name = "" + self.context_column_name = "" + self.answer_column_name = "" + self.pad_on_right = True + + + def prepare_validation_features(self, examples): + """Validation preprocessing""" + + args = self.args + q_col_name = self.question_column_name + c_col_name = self.context_column_name + pad_on_right = self.pad_on_right + max_seq_length = self.max_seq_length + + # Some of the questions have lots of whitespace on the left, which is not useful + # and will make the truncation of the context fail (the tokenized question will + # take a lots of space). So we remove that left whitespace + examples[q_col_name] = [ + q.lstrip() for q in examples[q_col_name] + ] + + # Tokenize our examples with truncation and maybe padding, but keep the overflows + # using a stride. This results in one example possible giving several features + # when a context is long, each of those features having a context that overlaps + # a bit the context of the previous feature. + tokenized_examples = self.tokenizer( + examples[q_col_name if pad_on_right else c_col_name], + examples[c_col_name if pad_on_right else q_col_name], + truncation="only_second" if pad_on_right else "only_first", + max_length=max_seq_length, + stride=min(args.doc_stride, max_seq_length // 2), + return_overflowing_tokens=True, + return_offsets_mapping=True, + padding="max_length" if args.pad_to_max_length else False, + ) + + # Since one example might give us several features if it has a long context, we + # need a map from a feature to its corresponding example. + sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping") + + # For evaluation, we will need to convert our predictions to substrings of the + # context, so we keep the corresponding example_id and we will store the offset + # mappings. + tokenized_examples["example_id"] = [] + + for i in range(len(tokenized_examples["input_ids"])): + # Grab the sequence corresponding to that example (to know what is the + # context and what is the question). + sequence_ids = tokenized_examples.sequence_ids(i) + context_index = 1 if pad_on_right else 0 + + # One example can give several spans, this is the index of the example + # containing this span of text. + sample_index = sample_mapping[i] + tokenized_examples["example_id"].append(examples["id"][sample_index]) + + # Set to None the offset_mapping that are not part of the context so + # it's easy to determine if a token position is part of the context or not. + tokenized_examples["offset_mapping"][i] = [ + (o if sequence_ids[k] == context_index else None) + for k, o in enumerate(tokenized_examples["offset_mapping"][i]) + ] + + return tokenized_examples + + def convert_batch_to_fms_style(self, batch): + """FMS uses a different standard than HF for encoder inputs.""" + + return {'x': batch['input_ids'], 'mask': batch['attention_mask']} + + def process_eval_set(self): + """Pre-process evaluation dataset for QuestionAnswering task.""" + + args = self.args + if args.dataset_name is not None: + # Downloading and loading a dataset from the hub + raw_datasets = load_dataset( + args.dataset_name, + args.dataset_config_name, + trust_remote_code=False, + ) + else: + data_files = {} + if args.validation_file is not None: + data_files["validation"] = args.validation_file + extension = args.validation_file.split(".")[-1] + else: + raise ValueError( + "Could not determine evaluation dataset to load. 
Pass `dataset_name` " + "or `validation_file` argument." + ) + raw_datasets = load_dataset(extension, data_files=data_files, field="data") + + column_names = raw_datasets["train"].column_names + + self.question_column_name = "question" if "question" in column_names else column_names[0] + self.context_column_name = "context" if "context" in column_names else column_names[1] + self.answer_column_name = "answers" if "answers" in column_names else column_names[2] + + # Padding side determines if we do (question|context) or (context|question) + self.pad_on_right = self.tokenizer.padding_side == "right" + + if args.max_seq_length > self.tokenizer.model_max_length: + dprint( + f"The max_seq_length passed ({args.max_seq_length}) is larger than the " + f"maximum length for the model ({self.tokenizer.model_max_length}). " + f"Using max_seq_length={self.tokenizer.model_max_length}." + ) + + self.max_seq_length = min(args.max_seq_length, self.tokenizer.model_max_length) + + eval_examples = raw_datasets["validation"] + if args.max_eval_samples is not None: + # We will select sample from whole data + eval_examples = eval_examples.select(range(args.max_eval_samples)) + self.eval_examples = eval_examples + + eval_dataset = eval_examples.map( + self.prepare_validation_features, + batched=True, + num_proc=args.preprocessing_num_workers, + remove_columns=column_names, + load_from_cache_file=not args.overwrite_cache, + desc="Running tokenizer on validation dataset", + ) + + if args.max_eval_samples is not None: + # During Feature creation dataset samples might increase, we will select + # required samples again + eval_dataset = eval_dataset.select(range(args.max_eval_samples)) + + # store evaluation dataset prior dropping + self.eval_dataset = eval_dataset + + # DataLoaders creation: + if args.pad_to_max_length: + # If padding was already done ot max length, we use the default data collator + # that will just convert everything to tensors. + self.data_collator = default_data_collator + else: + # Otherwise, `DataCollatorWithPadding` will apply dynamic padding for us + # (by padding to the maximum length of the samples passed). + pad_to_multiple_of = None + self.data_collator = DataCollatorWithPadding( + self.tokenizer, + pad_to_multiple_of=pad_to_multiple_of, + ) + + self.eval_dataset_for_model = eval_dataset.remove_columns( + ["example_id", "offset_mapping"] + ) + self.eval_dataloader = DataLoader( + self.eval_dataset_for_model, + shuffle=False, + collate_fn=self.data_collator, + batch_size=args.batch_size, + ) + dprint("Dataloader initialized.") + + self.metric = evaluate.load( + "squad_v2" if args.version_2_with_negative else "squad" + ) + dprint("Evaluation metric initialized.") + + def postprocess_qa_predictions( + self, + examples, + features, + predictions: tuple[np.ndarray, np.ndarray], + version_2_with_negative: bool = False, + n_best_size: int = 20, + max_answer_length: int = 30, + null_score_diff_threshold: float = 0.0, + output_dir: str | None = None, + prefix: str | None = None, + ): + """ + Post-processes the predictions of a question-answering model to convert them to answers that are substrings of the + original contexts. This is the base postprocessing functions for models that only return start and end logits. + + Args: + examples: The non-preprocessed dataset (see the main script for more information). + features: The processed dataset (see the main script for more information). 
+ predictions (:obj:`Tuple[np.ndarray, np.ndarray]`): + The predictions of the model: two arrays containing the start logits and the end logits respectively. Its + first dimension must match the number of elements of :obj:`features`. + version_2_with_negative (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not the underlying dataset contains examples with no answers. + n_best_size (:obj:`int`, `optional`, defaults to 20): + The total number of n-best predictions to generate when looking for an answer. + max_answer_length (:obj:`int`, `optional`, defaults to 30): + The maximum length of an answer that can be generated. This is needed because the start and end predictions + are not conditioned on one another. + null_score_diff_threshold (:obj:`float`, `optional`, defaults to 0): + The threshold used to select the null answer: if the best answer has a score that is less than the score of + the null answer minus this threshold, the null answer is selected for this example (note that the score of + the null answer for an example giving several features is the minimum of the scores for the null answer on + each feature: all features must be aligned on the fact they `want` to predict a null answer). + + Only useful when :obj:`version_2_with_negative` is :obj:`True`. + output_dir (:obj:`str`, `optional`): + If provided, the dictionaries of predictions, n_best predictions (with their scores and logits) and, if + :obj:`version_2_with_negative=True`, the dictionary of the scores differences between best and null + answers, are saved in `output_dir`. + prefix (:obj:`str`, `optional`): + If provided, the dictionaries mentioned above are saved with `prefix` added to their names. + log_level (:obj:`int`, `optional`, defaults to ``logging.WARNING``): + ``logging`` log level (e.g., ``logging.WARNING``) + """ + + if len(predictions) != 2: + raise ValueError("`predictions` should be a tuple with two elements (start_logits, end_logits).") + all_start_logits, all_end_logits = predictions + + if len(predictions[0]) != len(features): + raise ValueError(f"Got {len(predictions[0])} predictions and {len(features)} features.") + + # Build a map example to its corresponding features. + example_id_to_index = {k: i for i, k in enumerate(examples["id"])} + features_per_example = collections.defaultdict(list) + for i, feature in enumerate(features): + features_per_example[example_id_to_index[feature["example_id"]]].append(i) + + # The dictionaries we have to fill. + all_predictions = collections.OrderedDict() + all_nbest_json = collections.OrderedDict() + if version_2_with_negative: + scores_diff_json = collections.OrderedDict() + + # Logging. + dprint(f"Post-processing {len(examples)} example predictions split into {len(features)} features.") + + # Let's loop over all the examples! + for example_index, example in enumerate(tqdm(examples)): + # Those are the indices of the features associated to the current example. + feature_indices = features_per_example[example_index] + + min_null_prediction = None + prelim_predictions = [] + + # Looping through all the features associated to the current example. + for feature_index in feature_indices: + # We grab the predictions of the model for this feature. + start_logits = all_start_logits[feature_index] + end_logits = all_end_logits[feature_index] + # This is what will allow us to map some the positions in our logits to span of texts in the original + # context. 
+ offset_mapping = features[feature_index]["offset_mapping"] + # Optional `token_is_max_context`, if provided we will remove answers that do not have the maximum context + # available in the current feature. + token_is_max_context = features[feature_index].get("token_is_max_context", None) + + # Update minimum null prediction. + feature_null_score = start_logits[0] + end_logits[0] + if min_null_prediction is None or min_null_prediction["score"] > feature_null_score: + min_null_prediction = { + "offsets": (0, 0), + "score": feature_null_score, + "start_logit": start_logits[0], + "end_logit": end_logits[0], + } + + # Go through all possibilities for the `n_best_size` greater start and end logits. + start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist() + end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist() + for start_index in start_indexes: + for end_index in end_indexes: + # Don't consider out-of-scope answers, either because the indices are out of bounds or correspond + # to part of the input_ids that are not in the context. + if ( + start_index >= len(offset_mapping) + or end_index >= len(offset_mapping) + or offset_mapping[start_index] is None + or len(offset_mapping[start_index]) < 2 + or offset_mapping[end_index] is None + or len(offset_mapping[end_index]) < 2 + ): + continue + # Don't consider answers with a length that is either < 0 or > max_answer_length. + if end_index < start_index or end_index - start_index + 1 > max_answer_length: + continue + # Don't consider answer that don't have the maximum context available (if such information is + # provided). + if token_is_max_context is not None and not token_is_max_context.get(str(start_index), False): + continue + + prelim_predictions.append( + { + "offsets": (offset_mapping[start_index][0], offset_mapping[end_index][1]), + "score": start_logits[start_index] + end_logits[end_index], + "start_logit": start_logits[start_index], + "end_logit": end_logits[end_index], + } + ) + if version_2_with_negative and min_null_prediction is not None: + # Add the minimum null prediction + prelim_predictions.append(min_null_prediction) + null_score = min_null_prediction["score"] + + # Only keep the best `n_best_size` predictions. + predictions = sorted(prelim_predictions, key=lambda x: x["score"], reverse=True)[:n_best_size] + + # Add back the minimum null prediction if it was removed because of its low score. + if ( + version_2_with_negative + and min_null_prediction is not None + and not any(p["offsets"] == (0, 0) for p in predictions) + ): + predictions.append(min_null_prediction) + + # Use the offsets to gather the answer text in the original context. + context = example["context"] + for pred in predictions: + offsets = pred.pop("offsets") + pred["text"] = context[offsets[0] : offsets[1]] + + # In the very rare edge case we have not a single non-null prediction, we create a fake prediction to avoid + # failure. + if len(predictions) == 0 or (len(predictions) == 1 and predictions[0]["text"] == ""): + predictions.insert(0, {"text": "empty", "start_logit": 0.0, "end_logit": 0.0, "score": 0.0}) + + # Compute the softmax of all scores (we do it with numpy to stay independent from torch/tf in this file, using + # the LogSumExp trick). + scores = np.array([pred.pop("score") for pred in predictions]) + exp_scores = np.exp(scores - np.max(scores)) + probs = exp_scores / exp_scores.sum() + + # Include the probabilities in our predictions. 
+ for prob, pred in zip(probs, predictions): + pred["probability"] = prob + + # Pick the best prediction. If the null answer is not possible, this is easy. + if not version_2_with_negative: + all_predictions[example["id"]] = predictions[0]["text"] + else: + # Otherwise we first need to find the best non-empty prediction. + i = 0 + while predictions[i]["text"] == "": + i += 1 + best_non_null_pred = predictions[i] + + # Then we compare to the null prediction using the threshold. + score_diff = null_score - best_non_null_pred["start_logit"] - best_non_null_pred["end_logit"] + scores_diff_json[example["id"]] = float(score_diff) # To be JSON-serializable. + if score_diff > null_score_diff_threshold: + all_predictions[example["id"]] = "" + else: + all_predictions[example["id"]] = best_non_null_pred["text"] + + # Make `predictions` JSON-serializable by casting np.float back to float. + all_nbest_json[example["id"]] = [ + {k: (float(v) if isinstance(v, (np.float16, np.float32, np.float64)) else v) for k, v in pred.items()} + for pred in predictions + ] + + # If we have an output_dir, let's save all those dicts. + if output_dir is not None: + if not os.path.isdir(output_dir): + raise EnvironmentError(f"{output_dir} is not a directory.") + + prediction_file = os.path.join( + output_dir, "predictions.json" if prefix is None else f"{prefix}_predictions.json" + ) + nbest_file = os.path.join( + output_dir, "nbest_predictions.json" if prefix is None else f"{prefix}_nbest_predictions.json" + ) + if version_2_with_negative: + null_odds_file = os.path.join( + output_dir, "null_odds.json" if prefix is None else f"{prefix}_null_odds.json" + ) + + dprint(f"Saving predictions to {prediction_file}.") + with open(prediction_file, "w") as writer: + writer.write(json.dumps(all_predictions, indent=4) + "\n") + dprint(f"Saving nbest_preds to {nbest_file}.") + with open(nbest_file, "w") as writer: + writer.write(json.dumps(all_nbest_json, indent=4) + "\n") + if version_2_with_negative: + dprint(f"Saving null_odds to {null_odds_file}.") + with open(null_odds_file, "w") as writer: + writer.write(json.dumps(scores_diff_json, indent=4) + "\n") + + return all_predictions + + def post_processing_function(self, examples, features, predictions, stage="eval"): + """Post-processing: we match the start logits and end logits to answers in + the original context.""" + + args = self.args + predictions = self.postprocess_qa_predictions( + examples=examples, + features=features, + predictions=predictions, + version_2_with_negative=args.version_2_with_negative, + n_best_size=args.n_best_size, + max_answer_length=args.max_answer_length, + null_score_diff_threshold=args.null_score_diff_threshold, + output_dir=None, + prefix=stage, + ) + # Format the result to the format the metric expects. + if args.version_2_with_negative: + formatted_predictions = [ + {"id": k, "prediction_text": v, "no_answer_probability": 0.0} + for k, v in predictions.items() + ] + else: + formatted_predictions = [ + {"id": k, "prediction_text": v} for k, v in predictions.items() + ] + + references = [ + {"id": ex["id"], "answers": ex[self.answer_column_name]} for ex in examples + ] + return EvalPrediction(predictions=formatted_predictions, label_ids=references) + + def create_and_fill_np_array(self, start_or_end_logits, dataset, max_len): + """ + Create and fill numpy array of size + len_of_validation_data * max_length_of_output_tensor + + Args: + start_or_end_logits(:obj:`tensor`): + This is the output predictions of the model. 
We can only enter either + start or end logits. + eval_dataset: Evaluation dataset + max_len(:obj:`int`): + The maximum length of the output tensor. ( See the model.eval() part + for more details ) + """ + + step = 0 + # create a numpy array and fill it with -100. + logits_concat = np.full((len(dataset), max_len), -100, dtype=np.float64) + # Now since we have create an array now we will populate it with the outputs gathered using accelerator.gather_for_metrics + for i, output_logit in enumerate(start_or_end_logits): # populate columns + # We have to fill it such that we have to take the whole tensor and replace it on the newly created array + # And after every iteration we have to change the step + + batch_size = output_logit.shape[0] + cols = output_logit.shape[1] + + if step + batch_size < len(dataset): + logits_concat[step : step + batch_size, :cols] = output_logit + else: + logits_concat[step:, :cols] = output_logit[: len(dataset) - step] + + step += batch_size + + return logits_concat + + def run_warmup(self): + """Run warmup cycle of compiled encoder model set for QuestionAnswering task.""" + + dprint(f"Starting warm-up...") + warmup_start_time = time.time() + dataloader_for_compile = DataLoader( + self.eval_dataset_for_model, + shuffle=False, + collate_fn=self.data_collator, + batch_size=1, + ) + first_batch = self.convert_batch_to_fms_style(next(iter(dataloader_for_compile))) + self.model(**first_batch) + if rank == 0: + dprint(f"Warmup completed in {time.time() - warmup_start_time:.1f} s\n---") + + def run_evaluation(self): + """Run QuestionAnswering evaluation.""" + + args = self.args + eval_dataloader = self.eval_dataloader + + if rank == 0: + dprint(f"Running evaluation ({len(eval_dataloader)} samples)...") + start_time = time.time() + + all_start_logits = [] + all_end_logits = [] + for step, batch in enumerate(eval_dataloader): + with torch.no_grad(): + dprint(f"Step {step + 1} / {len(eval_dataloader)}") + batch = self.convert_batch_to_fms_style(batch) + start_logits, end_logits = self.model(**batch) + all_start_logits.append(start_logits.cpu().numpy()) + all_end_logits.append(end_logits.cpu().numpy()) + eval_duration = time.time() - start_time + dprint( + f"Runtime: {eval_duration:.0f} s | " + f"{eval_duration / len(eval_dataloader):.2f} s/batch | " + f"{eval_duration / (len(eval_dataloader) * args.per_device_eval_batch_size):.2f}" + " s/sample " + f"(tot = {len(eval_dataloader) * args.per_device_eval_batch_size}, " + f"bs = {args.per_device_eval_batch_size})" + ) + + # concatenate the numpy array + max_len = max([x.shape[1] for x in all_start_logits]) + start_logits_concat = self.create_and_fill_np_array( + all_start_logits, + self.eval_dataset, + max_len, + ) + end_logits_concat = self.create_and_fill_np_array( + all_end_logits, + self.eval_dataset, + max_len, + ) + + del all_start_logits + del all_end_logits + + outputs_numpy = (start_logits_concat, end_logits_concat) + prediction = self.post_processing_function( + self.eval_examples, + self.eval_dataset, + outputs_numpy, + ) + eval_metric = self.metric.compute( + predictions=prediction.predictions, + references=prediction.label_ids, + ) + dprint(f"Evaluation metrics: {eval_metric}") + + +class EncoderMLMInfer(): + """Run MaskedLM task with encoder models.""" + + def __init__( + self, + model: PreTrainedModel, + tokenizer: PreTrainedTokenizerBase, + args: argparse.Namespace, + ): + self.model = model + self.tokenizer = tokenizer + self.args = args + + + def process_eval_set(self): + """Barebone function that sets up a 
single example prompt (for now).""" + + self.prompt = "the dog chased the cat while aggressively" + + def run_evaluation(self, warmup=False): + """Run evaluation cycle of compiled encoder model set for MaskedLM task. + No output printout if warmup is True. + """ + + dprint(f"Starting evaluation ({warmup=})...") + warmup_start_time = time.time() + unmasker = pipeline("fill-mask", model=self.model, tokenizer=self.tokenizer) + output = unmasker(self.prompt) + if rank == 0: + dprint(f"Run completed in {time.time() - warmup_start_time:.1f} s\n---") + if not warmup: + dprint(f"{self.prompt}\nAnswers:") + for ans in output: + dprint(f"{ans['token_str']:10} | {ans['score']:6.4f}") + + +def run_encoder_eval_qa( + model: PreTrainedModel, + tokenizer: PreTrainedTokenizerBase, + args: argparse.Namespace, +): + """Entry point to run QuestionAnswering Evaluation of encoder model. + + Processing based on pytorch example: + https://github.com/huggingface/transformers/blob/main/examples/pytorch/... + ...question-answering/run_qa_no_trainer.py + """ + + encoder_qa_infer = EncoderQAInfer(model, tokenizer, args) + encoder_qa_infer.process_eval_set() + if args.compile: + encoder_qa_infer.run_warmup() + encoder_qa_infer.run_evaluation() + + +def run_encoder_eval_mlm( + model: PreTrainedModel, + tokenizer: PreTrainedTokenizerBase, + args: argparse.Namespace, +): + """Entry point to run evaluation of encoder models.""" + + encoder_mlm_infer = EncoderMLMInfer(model, tokenizer, args) + encoder_mlm_infer.process_eval_set() + if args.compile: + encoder_mlm_infer.run_evaluation(warmup=True) + encoder_mlm_infer.run_evaluation() From 5895831e35a8b6a650dbe2339ad03110eca07133 Mon Sep 17 00:00:00 2001 From: Andrea Fasoli Date: Tue, 22 Apr 2025 21:16:38 -0400 Subject: [PATCH 06/30] Refactor Direct Quantization (wip) Signed-off-by: Andrea Fasoli --- .../utils/direct_quantization.py | 260 ++++++++++++++++++ 1 file changed, 260 insertions(+) create mode 100644 aiu_fms_testing_utils/utils/direct_quantization.py diff --git a/aiu_fms_testing_utils/utils/direct_quantization.py b/aiu_fms_testing_utils/utils/direct_quantization.py new file mode 100644 index 0000000..4f7c898 --- /dev/null +++ b/aiu_fms_testing_utils/utils/direct_quantization.py @@ -0,0 +1,260 @@ +# Standard +from pathlib import Path +from tqdm import tqdm +import argparse +import os +import time + +# Third Party +from torch.utils.data import DataLoader # [R] +from transformers import ( # [R] + default_data_collator, + DataCollatorWithPadding, + EvalPrediction, + RobertaForQuestionAnswering, + RobertaForMaskedLM, + RobertaTokenizerFast, + pipeline, +) +import torch + +# Local Packages +from fms_mo import qconfig_init, qmodel_prep # [R] +from fms_mo.quant.ptq import dq_llm, get_act_scales # [R] +from fms_mo.utils.utils import prepare_input # [R] +from utils.roberta_int8_utils import ( # [R] change this + validate_arguments, + get_wikitext2, + use_default_qcfg, + process_state_dict, + mask_examples, + dequantize_int8_weights, +) + + +QUANTIZED_LAYERS_ROBERTA = [ + "attention.self.query", + "attention.self.key", + "attention.self.value", + "attention.output.dense", + "intermediate.dense", + "output.dense", +] + +# TODO: change print to dprint +# TODO: add LLM DQ +# TODO: load wikitext using FMS-MO instead of custom function + +def run_dq_roberta(args: argparse.Namespace): + """Run INT8 Direct Quantization for RoBERTa. 
+ """ + + #------------- + # Instantiate HF RoBERTa FP16 + #------------- + print("* Begin Direct Quantization (DQ) process.") + torch.set_default_dtype(torch.float16) + tokenizer = RobertaTokenizerFast.from_pretrained(args.tokenizer) + fp16_model_path = args.fp16_ckpt_path if args.fp16_ckpt_path else 'roberta-base' + if args.architecture == "roberta": + model = RobertaForMaskedLM.from_pretrained( + fp16_model_path, + torch_dtype=torch.float16, + ) + elif args.architecture == "roberta_question_answering": + model = RobertaForQuestionAnswering.from_pretrained( + fp16_model_path, + torch_dtype=torch.float16, + ) + else: + raise NotImplementedError( + f"Variant {args.architecture} is not supported for Direct Quantization" + ) + model.to("cpu") + print("* FP16 model loaded to CPU.") + + train_dataset, test_dataset = get_wikitext2(tokenizer) + dq_dataloader = DataLoader( + train_dataset[:args.num_dq_samples], + shuffle=True, + collate_fn=default_data_collator, + batch_size=1, + ) + print(f"* Dataset for DQ loaded (samples = {len(dq_dataloader.dataset)}).") + + #------------- + # Set fms_mo configuration + #------------- + qcfg = qconfig_init(recipe=args.int8_qcfg_path, args=args) + + # preferred method is to update qconfig from recipe, providing --int8_qcfg_path + # but the following will set some defaults if config json is not passed + if not args.int8_qcfg_path: + print("* Using a default quantization configuration for missing parameters.") + qcfg = use_default_qcfg(qcfg) + qcfg["logger"] = print + qcfg["qw_mode"] = "maxperCh" if args.weight_per_channel else "max" + if args.activ_quant_type == "per_token": + qcfg["qa_mode"] = "pertokenmax" + elif args.activ_quant_type == "per_tensor_symm": + qcfg["qa_mode"] = "maxsym" + else: + qcfg["qa_mode"] = "max" + qcfg["a_init_method"] = "max" + qcfg["qw_mode_calib"] = "max" + qcfg["qa_mode_calib"] = "max" + + if args.verbose: + print("=" * 60) + print("QUANTIZATION CONFIGURATION") + print("\n".join(f"{k:60} {v}" for k,v in qcfg.items() if not isinstance(v, dict))) + + #------------- + # Prepare inputs as list and generate quantized model with fms_mo + # This is not an FMS model. fms_mo model can run Direct Quantization + #------------- + examples = None + examples_for_prep = None + if qcfg["qmodel_calibration"]: + if args.activ_quant_type == "per_tensor_asymm": + print("=" * 60) + print(f"qmodel_calibration = {qcfg['qmodel_calibration']}") + print(f"qmodel_calibration_new = {qcfg['qmodel_calibration_new']}") + raise NotImplementedError( + "Direct Quantization (DQ) using `qmodel_calibration` is not compatible " + "with INT8 asymmetric quantization of activations in fms-mo. " + "Please pass `qmodel_calibration_new` argument instead." 
+ ) + examples = qcfg["qmodel_calibration"] + elif qcfg["qmodel_calibration_new"]: + examples = qcfg["qmodel_calibration_new"] + if examples: + examples_for_prep = [next(iter(dq_dataloader)) for _ in range(examples)] + + #------------- + # Prepare quantized model using fms_mo + #------------- + print("=" * 60) + print(f"* Begin preparation of quantized model.") + if qcfg["qmodel_calibration"]: + print("* Calibration to be applied during this preparation step.") + prep_time_start = time.time() + qmodel_prep( + model, + examples_for_prep, + qcfg, + dev="cpu", # always run Direct Quantization on CPU, not AIU + use_layer_name_pattern_matching=False, + save_fname='roberta-base-w8a8', + ) + if qcfg["qmodel_calibration"]: + print( + "* Quantized model has been instantiated and pre-calibrated " + f"(took {time.time() - prep_time_start:.1f} s)." + ) + else: + print( + "* Quantized model has been instantiated and needs calibration " + f"(took {time.time() - prep_time_start:.1f} s)." + ) + + #------------- + # Apply smoothquant + #------------- + if qcfg['smoothq']: + sq_time_start = time.time() + print("* Being applying SmoothQuant scales.") + assert qcfg['smoothq'] == True, "doing smoothq" + if not os.path.exists(qcfg['act_scale_path']): + print( + "generate new smoothq activation scales " + f"at {qcfg['act_scale_path']}" + ) + smoothq_alpha_requested = None + if qcfg["smoothq_alpha"] != 0: + smoothq_alpha_requested = qcfg["smoothq_alpha"] + qcfg["smoothq_alpha"] = 0 + print("[WARNNG] using smoothq_alpha = 0 for scale generation") + act_scales = get_act_scales(model, dq_dataloader, qcfg, device="cpu") + torch.save(act_scales, qcfg['act_scale_path']) + if smoothq_alpha_requested: + qcfg["smoothq_alpha"] = smoothq_alpha_requested + print(f"smoothq_alpha set back to {qcfg['smoothq_alpha']}") + else: + print( + f"using smoothq activation scales from {qcfg['act_scale_path']}" + ) + act_scales = torch.load(qcfg['act_scale_path'], map_location='cpu') + + dq_llm(model, act_scales, qcfg) + print(f"* SmoothQuant scales applied (took = {time.time() - sq_time_start:.1f} s).") + print("=="*20) + else: + print("* SmoothQuant is DISABLED.") + + #------------- + # Run calibration = Direct Quantization DQ + #------------- + if qcfg['qmodel_calibration_new'] > 0: + calib_time_start = time.time() + print("* Begin calibration of activation quantized parameters.") + pbar = tqdm( + dq_dataloader, + desc="* Calibration progress", + total = qcfg['qmodel_calibration_new'] + ) + for data_mb, _ in zip(pbar, range(qcfg['qmodel_calibration_new'])): + data_mb = prepare_input( + device=model.device, + data=data_mb, + ) + with torch.no_grad(): + model(**data_mb) + print(f"* Calibration completed (took = {time.time() - calib_time_start:.1f} s).") + + if args.verbose: + print("=" * 60) + print("* PARAMETERS") + print("\n".join( + f"{k:80} {str(list(v.size())):15} {v.dtype}" + for k,v in model.named_parameters() + )) + print("* BUFFERS") + print("\n".join( + f"{k:80} {str(list(v.size())):15} {v.dtype}" + for k,v in model.named_buffers() + )) + + #------------- + # Save checkpoint with integer weights (AIU requirement) + #------------- + keys_to_ignore = [ + "num_module_called", + "smoothq_act_scale", + "smoothq_alpha", + "calib_counter", + "obsrv_clipval", + "obsrv_clipvaln", + "obsrv_w_clipval", + ] + + print(f"Begin processing model state dictionary for saving.") + new_sd = process_state_dict( + model=model, + quantized_layers=QUANTIZED_LAYERS_ROBERTA, + keys_to_ignore=keys_to_ignore, + verbose=args.verbose, + ) + + task = "mlm" 
if args.architecture == "roberta" else "qa" + smoothq_str = qcfg['smoothq_alpha'] if qcfg['smoothq'] else "no" + save_path = str( + Path(args.output_path) / + f"roberta-base_{task}_w8-{qcfg['qw_mode']}_a8-{qcfg['qa_mode']}" + f"_bmm32_smoothq-{smoothq_str}_dq.pt" + ) + torch.save(new_sd, save_path) + print(f"Model saved to {save_path}") + + tokenizer.save_pretrained(args.output_path) + print(f"Tokenizer saved to {args.output_path}") \ No newline at end of file From 8b1d37ee7fd9c823f3aefb42bd2915818a162c84 Mon Sep 17 00:00:00 2001 From: Andrea Fasoli Date: Tue, 22 Apr 2025 21:17:09 -0400 Subject: [PATCH 07/30] Refactor inference entry point for LLM and RoBERTa Signed-off-by: Andrea Fasoli --- scripts/inference.py | 749 +++++-------------------------------------- 1 file changed, 75 insertions(+), 674 deletions(-) diff --git a/scripts/inference.py b/scripts/inference.py index 36ef744..baddd29 100644 --- a/scripts/inference.py +++ b/scripts/inference.py @@ -1,714 +1,115 @@ # Standard import argparse -from functools import partial -import itertools -import json -import os -from pathlib import Path -import random import time # Third Party -from aiu_fms_testing_utils.utils import aiu_setup -from aiu_fms_testing_utils.utils.aiu_setup import dprint, rank, local_rank, world_size -import numpy as np -import torch -from torch import distributed as dist -from fms.models import get_model, register_model -from fms.models.llama import LLaMAConfig, _llama_factory_factory -from fms.utils import generation, tokenizers -from fms.utils.generation import generate, pad_input_ids +from fms.models import get_model +from fms.utils import tokenizers +from torch import distributed, set_grad_enabled - -# This example script validates the LLaMA implementation by running inference on a couple of prompts. -# -# Example usage with single-GPU 7B model on slurm, with torch.compile and determinstic behavior: -# CUBLAS_WORKSPACE_CONFIG=:4096:8 srun -N 1 --gres=gpu:1 python scripts/inference.py --model_path=~/models/7B-F/ --tokenizer=~/models/tokenizer.model --compile --deterministic -# Example usage of 13B model on 2 GPUs with Tensor Parallel: -# srun -N 1 --gres=gpu:2 torchrun --nproc_per_node=2 scripts/inference.py --model_path=~/models/13B-F --tokenizer=~/models/tokenizer.model --distributed - -parser = argparse.ArgumentParser( - description="Script to run inference on a causal model" -) -parser.add_argument( - "--device_type", - type=str, - choices=["cuda", "cpu", "aiu", "aiu-senulator"], - default="cuda", - help="The device to run the model on" -) -parser.add_argument( - "--architecture", - type=str, - help="The model architecture to benchmark", -) -parser.add_argument( - "--variant", - type=str, - default=None, - help="The model variant (configuration) to benchmark. E.g. 7b, 13b, 70b.", -) -parser.add_argument( - "--model_path", - type=str, - help="Path to the directory containing LLaMa weights (.pth files sharded by tensor parallel rank, not HF weights)", -) -parser.add_argument( - "--model_source", - type=str, - help="Source of the checkpoint. E.g. 
'meta', 'hf', None", -) -parser.add_argument( - "--quantization", - type=str, - choices=["gptq", "int8"], - default=None, - help="Type of quantization of the model checkpoint", -) -parser.add_argument( - "--int8_weight_per_channel", - action="store_true", - help="Enable per-channel weight quantization in INT8 quantized model", -) -parser.add_argument( - "--int8_activ_quant_type", - default="per_token", - choices=["per_token", "per_tensor_symm", "per_tensor_asymm"], - type=str, - help="Define strategy for activation quantization in INT8 quantized model", -) -parser.add_argument( - "--int8_smoothquant", - action="store_true", - help="Enable smoothquant in INT8 quantized model", -) -parser.add_argument( - "--tokenizer", - type=str, - required=True, - help="Path to the tokenizer (e.g. ~/tokenizer.model)", -) -parser.add_argument( - "--no_use_cache", - action="store_false", - help="Disable the kv-cache (on by default)", -) -parser.add_argument( - "--unfuse_weights", - action="store_true", - help="If set to True, this will unfuse any fused weight modules that support the unfuse_weights method", -) -parser.add_argument( - "--default_dtype", - type=str, - default=None, - choices=["bf16", "fp16", "fp32"], - help="If set to one of the choices, overrides the model checkpoint weight format by setting the default pytorch format", -) -parser.add_argument( - "--compile", - action="store_true", - help="Use torch.compile (slow for first inference pass)", -) -parser.add_argument( - "--compile_mode", - type=str, - help="Mode for compilation (only valid for inductor backend)", - default="default", - choices=["default", "reduce-overhead"], -) -parser.add_argument( - "--compile_backend", - type=str, - help="Backend for compilation (only when not running on AIU)", - default="inductor", - choices=["inductor", "eager", "aot_eager"], -) -parser.add_argument( - "--compile_dynamic", - action="store_true", - help="Use dynamic shapes with torch.compile", -) -parser.add_argument( - "--deterministic", - action="store_true", - help="Set torch.use_deterministic_algorithms? Requires env variable `CUBLAS_WORKSPACE_CONFIG=:4096:8`", -) -parser.add_argument( - "--distributed", - action="store_true", - help="This is a distributed job (multiple instances run with RANK+WORLD_SIZE)", -) -parser.add_argument( - "--batch_size", - type=int, - default=1, - help="size of input batch", -) -parser.add_argument( - "--max_prompt_length", - type=int, - default=None, - help="cap the number of tokens per prompt to a maximum length prior to padding. If None, there will be no cap.", -) -parser.add_argument( - "--min_pad_length", - type=int, - help="Pad inputs to a minimum specified length. If any prompt is larger than the specified length, padding will be determined by the largest prompt", - default=0, -) -parser.add_argument( - "--fixed_prompt_length", - type=int, - help="If defined, overrides both min_pad_length and max_prompt_length. 
Pads input to fixed_prompt_length, fails if any input needs truncation.", - default=0, -) -parser.add_argument( - "--max_new_tokens", - type=int, - help="max number of generated tokens", - default=100, -) -parser.add_argument( - "--no_early_termination", - action="store_true", - help="disable early termination on generation", +# Local Packages +from aiu_fms_testing_utils.utils.aiu_setup import dprint, rank +from aiu_fms_testing_utils.utils.args_parsing import get_args +from aiu_fms_testing_utils.utils.decoders import run_decoder_eval +from aiu_fms_testing_utils.utils.encoders import ( + get_roberta_tokenizer, + wrap_encoder, + run_encoder_eval_qa, + run_encoder_eval_mlm, ) -parser.add_argument( - "--prompt_type", - type=str, - choices=["chat", "code"], - default="chat", - help="type of prompts to be used, either chat or code", +from aiu_fms_testing_utils.utils.model_setup import setup_model +from aiu_fms_testing_utils.utils.quantization_setup import ( + import_addons, + get_linear_config, + print_model_params, ) -parser.add_argument( - "--prompt_path", - type=str, - default="", - help="if set, load the prompts from file(s) instead of the local examples. Supports glob-style patterns", -) -parser.add_argument( - "--output_path", - type=str, - default="", - help="path of folder to save outputs to, if empty don't save", -) -parser.add_argument( - "--timing", - type=str, - choices=["e2e", "per-token"], - default="", - help="if set, how to time the generation of tokens, e2e or per-token", -) -parser.add_argument( - "--iters", - type=int, - default=1, - help="Number of iterations of inference to perform. Used for variance performance capture.", -) -parser.add_argument( - '-v', '--verbose', - action='count', - default=0, - help="Set verbosity level (pass flag as `-v`, `-vv`, `-vvv`)" -) -args = parser.parse_args() - -if args.quantization == "gptq": - if "aiu" in args.device_type: - try: - from fms_mo.aiu_addons.gptq import gptq_aiu_adapter, gptq_aiu_linear - print("Loaded `aiu_addons` functionalities") - except: - raise ImportError("Failed to import GPTQ addons from fms-mo.") -elif args.quantization == "int8": - try: - from fms_mo.aiu_addons.i8i8 import i8i8_aiu_adapter, i8i8_aiu_linear - print("Loaded `aiu_addons` functionalities") - except: - raise ImportError("Failed to import INT8 addons from fms-mo.") - -# this is a test model config -config = LLaMAConfig( - emb_dim=1024, - nheads=8, - nlayers=10, - src_vocab_size=128256, -) -register_model("llama", "194m", _llama_factory_factory(config)) - -default_dtype = None -dtypes_map = { - "fp16": torch.float16, - "bf16": torch.bfloat16, - "fp32": torch.float32, -} -if args.default_dtype is not None: - default_dtype = dtypes_map[args.default_dtype] -if default_dtype is not None: - torch.set_default_dtype(default_dtype) -dprint(f"{args}") +parser = argparse.ArgumentParser(description="Entry point for AIU inference") +args = get_args(parser) -is_aiu_backend = "aiu" in args.device_type +if args.is_quantized: + import_addons(args) if args.distributed: - dist.init_process_group() - # Fix until PT 2.3 - torch._C._distributed_c10d._register_process_group("default", dist.group.WORLD) - aiu_setup.aiu_dist_setup(dist.get_rank(), dist.get_world_size()) - -if args.device_type == "cuda": - device = torch.device(args.device_type, local_rank) - torch.cuda.set_device(device) -elif is_aiu_backend: - from torch_sendnn import torch_sendnn - - if not args.distributed: - aiu_setup.aiu_setup(rank, world_size) - - _target_cache_size = max( - int(args.max_new_tokens * 2), - 
int(args.min_pad_length * 2.5), - int(args.fixed_prompt_length * 2.5), - ) - _prompt_size = max(int(args.min_pad_length), int(args.fixed_prompt_length)) - if hasattr(torch._dynamo.config, "accumulated_cache_size_limit"): - if _target_cache_size > torch._dynamo.config.accumulated_cache_size_limit: - _prev = torch._dynamo.config.accumulated_cache_size_limit - torch._dynamo.config.accumulated_cache_size_limit = _target_cache_size - dprint( - f"NOTICE: Adjusting torch._dynamo.config.accumulated_cache_size_limit from {_prev} to {torch._dynamo.config.accumulated_cache_size_limit} to accomodate prompt size of {_prompt_size} and decode tokens of {args.max_new_tokens}" - ) - - if _target_cache_size > torch._dynamo.config.cache_size_limit: - _prev = torch._dynamo.config.cache_size_limit - torch._dynamo.config.cache_size_limit = _target_cache_size - dprint( - f"NOTICE: Adjusting torch._dynamo.config.cache_size_limit from {_prev} to {torch._dynamo.config.cache_size_limit} to accomodate prompt size of {_prompt_size} and decode tokens of {args.max_new_tokens}" - ) - - if not args.compile_dynamic: - torch._dynamo.config.assume_static_by_default = True - torch._dynamo.config.dynamic_shapes = False - torch._dynamo.config.automatic_dynamic_shapes = False - - # This should be set outside!!! - os.environ.setdefault("SENCORES", "32") - os.environ.setdefault("SENCORELETS", "2") - os.environ.setdefault("DATA_PREC", "fp16") - os.environ.setdefault("FLEX_OVERWRITE_NMB_FRAME", "1") - os.environ.setdefault("DTCOMPILER_KEEP_EXPORT", "true") - - os.environ.setdefault("COMPILATION_MODE", "offline_decoder") - - if args.device_type == "aiu-senulator": - os.environ["FLEX_COMPUTE"] = "SENULATOR" - os.environ["FLEX_DEVICE"] = "MOCK" - else: - if "AIU_WORLD_RANK_0" not in os.environ: - print("must set AIU_WORLD_RANK_0") - exit() - os.environ.setdefault("FLEX_COMPUTE", "SENTIENT") - os.environ.setdefault("FLEX_DEVICE", "VFIO") - - device = torch.device("cpu") -else: - device = torch.device(args.device_type) - -# requires setting environment variable: `CUBLAS_WORKSPACE_CONFIG=:4096:8` -if args.deterministic: - SEED = 42 - random.seed(SEED) - torch.manual_seed(SEED) # pytorch random seed - np.random.seed(SEED) # numpy random seed - torch.use_deterministic_algorithms(True) - -dprint("loading model") -loading_model_time = time.time() -if args.distributed: - distr_param = "tp" -else: - if torch.cuda.device_count() > 1 and world_size == 1: - distr_param = "mp" - else: - distr_param = None - -fused_weights = not args.unfuse_weights -if args.quantization == "gptq": - if fused_weights and is_aiu_backend: - raise ValueError("GPTQ checkpoints on AIU must always run with --unfuse_weights") - if default_dtype is not None: - raise ValueError( - "GPTQ default_dtype must be None to preserve the checkpoint data types." 
- ) - - if "aiu" in args.device_type: - linear_type = "gptq_aiu" - elif args.device_type == "cpu": - linear_type = "gptq_cpu" - elif args.device_type == "cuda": - linear_type = "gptq" # GPTQ support on GPU is FMS-native - else: - raise ValueError(f"Unsupported device {args.device} for GPTQ") + distributed.init_process_group() # used by inference.py + # distributed.init_process_group(backend="gloo", rank=local_rank, world_size=world_size) - qconfig_path = args.model_path + "/quantize_config.json" - if os.path.exists(qconfig_path): - with open(qconfig_path, 'r') as f: - dprint(f"loading quantization config from {qconfig_path}") - qconfig = json.load(f) - group_size = qconfig["group_size"] - desc_act = qconfig["desc_act"] - if desc_act: - raise NotImplementedError( - "Activation reordering not supported at this time." - ) - else: - dprint( - "[WARNING] Could not locate quantization config file. " - "Default configuration will be used." - ) - group_size = 128 - desc_act = False +# Main model setup +default_dtype, device, dist_strat = setup_model(args) - linear_config = { - "linear_type": linear_type, - "group_size": group_size, - "desc_act": desc_act, - } -elif args.quantization == "int8": - if fused_weights and is_aiu_backend: - raise ValueError("INT8 checkpoints on AIU must always run with --unfuse_weights") - if default_dtype is not None: - raise ValueError( - "INT8 default_dtype must be None to preserve the checkpoint data types." - ) +model_path = args.model_path +if args.direct_quantization: + save_path = None - def select_int8_module( - module_name: str | None = None, - smoothquant: bool = True, - smoothquant_layers: list[str] | None = None, - ): - if module_name is None: - return "int8_aiu" - smoothquant_on_module = ( - any([m in module_name for m in smoothquant_layers]) - if smoothquant_layers is not None - else True - ) - use_smoothquant = smoothquant and smoothquant_on_module - return "int8_smoothquant_aiu" if use_smoothquant else "int8_aiu" + # !!! insert DQ here (for RoBERTa first, then add for LLM) - if args.int8_smoothquant: - # TODO: consider saving this info into config during quantization - if any("granite" in p.lower() for p in [args.model_path, args.architecture]): - smoothquant_layers = ["key", "value", "w1", "wg"] - elif any("roberta" in p.lower() for p in [args.model_path, args.architecture]): - smoothquant_layers = ["query", "key", "value", "w1"] - else: - raise NotImplementedError( - "INT8 architecture does not support smoothquant." - ) - else: - smoothquant_layers = [] + # if DQ is used, args.model_path represent FP16 ckpt but we need to load the + # newly-created INT8 ckpt. Without DQ, args.model_path is the INT8 ckpt already. 
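+    # NOTE: until the DQ step above is implemented, save_path stays None, so the
+    # get_model call below would receive model_path=None whenever
+    # --direct_quantization is passed.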
+ model_path = save_path - linear_config = { - "linear_type": partial( - select_int8_module, - smoothquant = args.int8_smoothquant, - smoothquant_layers = smoothquant_layers, - ), - "weight_per_channel": args.int8_weight_per_channel, - "activ_quant_type": args.int8_activ_quant_type, - } -else: - linear_config = {"linear_type": "torch_linear"} +# Retrieve linear configuration (quantized or not) to instantiate FMS model +linear_config = get_linear_config(args) -dprint("="*60) -dprint(f"model_path={args.model_path}") -dprint(f"{linear_config=}") -dprint(f"{fused_weights=}") -dprint(f"data_type={default_dtype}") -dprint("="*60 + "\n") +if rank == 0: + dprint("="*60) + dprint(f"model_path={args.model_path}") + dprint(f"{linear_config=}") + dprint(f"fused_weights={args.fused_weights}") + dprint(f"data_type={default_dtype}") + dprint("="*60 + "\n") +dprint("Loading model...") +loading_model_start = time.time() model = get_model( args.architecture, args.variant, - model_path=args.model_path, - device_type="cpu" if is_aiu_backend else args.device_type, + model_path=model_path, + device_type="cpu" if args.is_aiu_backend else args.device_type, data_type=default_dtype, source=args.model_source, - distributed_strategy=distr_param, - group=dist.group.WORLD, + distributed_strategy=dist_strat, + group=distributed.group.WORLD, linear_config=linear_config, - fused_weights=fused_weights, + fused_weights=args.fused_weights, ) -if args.quantization in ["gptq", "int8"]: - if rank == 0 and args.verbose > 0: - dprint("PARAMS:\n" + "\n".join(f"{k:60} {str(v.dtype):15} {str(v.device):10} {list(v.size())}" for k,v in model.named_parameters())) - dprint("BUFFERS:\n" + "\n".join(f"{k:60} {str(v.dtype):15} {str(v.device):10} {list(v.size())}" for k,v in model.named_buffers())) - dprint("="*60 + "\n") - if args.architecture == "llama": - dprint("[NOTE] In Llama models, it's OK for bias and rotary embeddings to be marked as unused keys.") - dprint(model) - dprint("="*60 + "\n") +if args.is_quantized: + print_model_params(model, args) + +if "roberta" in args.architecture: + tokenizer = get_roberta_tokenizer(args.tokenizer) +else: + tokenizer = tokenizers.get_tokenizer(args.tokenizer) -tokenizer = tokenizers.get_tokenizer(args.tokenizer) model.eval() -torch.set_grad_enabled(False) -loading_model_time = time.time() - loading_model_time -dprint(f"loading complete, took {loading_model_time:.3f}s") +set_grad_enabled(False) +if args.distributed: + distributed.barrier() +dprint(f"Loading model completed in {time.time() - loading_model_start:.2f} s.") + +if args.architecture == "roberta": + model = wrap_encoder(model) if args.compile: - dprint("compiling model") - if is_aiu_backend: + dprint("Compiling model...") + if args.is_aiu_backend: model.compile(backend="sendnn_decoder") else: # compiling can make first inference pass slow model.compile(mode=args.compile_mode, backend=args.compile_backend) - -add_special_tokens = tokenizer.bos_token_id != tokenizer.eos_token_id - - -def ids_for_prompt(prompt): - tokens = tokenizer.tokenize(prompt) - ids = tokenizer.convert_tokens_to_ids(tokens) - if add_special_tokens: - ids = [tokenizer.bos_token_id] + ids - ids = torch.tensor(ids, dtype=torch.long, device=device) - return ids - - -def truncate_prompts_to_max_length(prompts, max_len, max_allowed_length): - # we may want the prompt length to be fixed to some max length - # this will ensure that prior to padding the input ids - if max_allowed_length is not None and max_len > max_allowed_length: - dprint(f"max prompt length is {max_len}, 
truncating to {max_allowed_length}") - prompts = [prompt[:max_allowed_length] for prompt in prompts] - return prompts - - -if args.prompt_path != "": - # Before creating the Path object, check if prompt_path has a glob pattern - if isinstance(args.prompt_path, str): - prompt_path, sep, glob_pattern = args.prompt_path.partition("*") - else: - sep = "" - glob_pattern = "" - glob_pattern = sep + glob_pattern - - prompt_path = Path(os.path.expanduser(prompt_path)) - prompt_file_paths = [] - - if prompt_path.is_dir(): - if glob_pattern != "": - glob_pattern_list = [glob_pattern] - else: - glob_pattern_list = ["*.txt"] - for glob_pattern_possibility in glob_pattern_list: - file_list = list(prompt_path.glob(glob_pattern_possibility)) - if len(file_list) > 0: - prompt_file_paths = sorted(file_list) - break - - if prompt_path.is_file(): - prompt_file_paths = [prompt_path] - - # Check if we found some files - assert len(prompt_file_paths) > 0, f"Can't find any prompt files at {prompt_path}" - - # Check if we have enough files - assert ( - len(prompt_file_paths) >= args.batch_size - ), f"Not enough prompt files at {prompt_path} for a batch size of {args.batch_size}" - - prompts = [] - for i, prompt_file_path in enumerate(prompt_file_paths): - if i == args.batch_size: - break - prompts.append(ids_for_prompt(prompt_file_path.read_text(encoding="utf-8"))) - -else: - if args.prompt_type == "chat": - template = "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n{}\n\n### Response:" - - prompt1 = template.format( - "Provide a list of instructions for preparing chicken soup." - ) - prompt2 = template.format("Explain some popular greetings in Spanish.") - prompt3 = template.format("Explain to me why ignorance is bliss.") - prompt4 = template.format( - "I have just come into a very large sum of money. Provide me a list of things that I can do with my new found wealth." - ) - elif args.prompt_type == "code": - template = "[INST] Write code to solve the following coding problem that obeys the constraints and passes the example test cases. Please wrap your code answer using ```:\n{}\n[/INST]" - prompt1 = template.format("Write a bubble sort function in python.") - prompt2 = template.format( - "Using the Java streams API, write a simple function which will get the cumulative sum of a list of integers." - ) - prompt3 = template.format( - "In bash, how do I list all directories and sub-directories which contain a .py file." - ) - prompt4 = template.format( - "Write a simple decorator in python which will modify all string inputs to ints if possible." - ) - else: - dprint("prompt_type must be one of chat or code") - exit() - - prompt1 = ids_for_prompt(prompt1) - prompt2 = ids_for_prompt(prompt2) - prompt3 = ids_for_prompt(prompt3) - prompt4 = ids_for_prompt(prompt4) - prompts = [prompt1, prompt2, prompt3, prompt4] - prompts = prompts * ((args.batch_size // 4) + 1) - prompts = prompts[: args.batch_size] - -if args.fixed_prompt_length != 0: - padding_length = args.fixed_prompt_length - max_allowed_length = args.fixed_prompt_length + dprint("Model compiled.") else: - padding_length = args.min_pad_length - max_allowed_length = args.max_prompt_length - -has_padding = args.batch_size > 1 or padding_length != 0 -max_len = max([len(prompt) for prompt in prompts]) + dprint("[WARNING] SKIP COMPILE.") -if args.fixed_prompt_length != 0 and args.fixed_prompt_length < max_len: - dprint( - f"One or more prompts require truncation. 
Truncation has been disabled as fixed_prompt_length has been set." - ) - exit(1) -prompts = truncate_prompts_to_max_length(prompts, max_len, max_allowed_length) -if has_padding: - ids, extra_generation_kwargs = pad_input_ids(prompts, min_pad_length=padding_length) +if args.is_encoder: + if args.architecture == "roberta_question_answering": + run_encoder_eval_qa(model, tokenizer, args) + elif args.architecture == "roberta": # basic MaskedLM downstream task + run_encoder_eval_mlm(model, tokenizer, args) else: - ids = prompts - if isinstance(ids, list) and len(ids) == 1: - ids = ids[0].unsqueeze(0) - extra_generation_kwargs = None - - -def print_result(result, result_idx: int): - if local_rank != 0: - return - if has_padding: - result = generation.trim_prefix(result) - - result = generation.trim_prefix(result, tokenizer.bos_token_id) - - # stop at EOS token if present and remove padding - if not args.no_early_termination: - result = generation.truncate_after_eos(result, tokenizer.eos_token_id) - - output_str = tokenizer.convert_tokens_to_string( - tokenizer.convert_ids_to_tokens(result) - ) - - if args.output_path != "": - output_path = Path(args.output_path) - output_path.mkdir(parents=True, exist_ok=True) - if output_path.is_dir(): - file_path = output_path / f"{result_idx}.txt" - with file_path.open("w", encoding="utf-8") as file: - file.write(output_str + "\n") - dprint(output_str) - print() - - -def infer(use_cache, do_sample, warmup): - # With greedy generation (do_sample=False) we _should_ always get the same results. - # There is currently a bug in start_pos for batched rotary embeddings that can lead - # varying results for the same prompt. - if local_rank == 0 and not warmup: - dprint(f"use_cache {use_cache};; do_sample {do_sample}") - dprint("==================") - if hasattr(model.config, "ntk_scaling") and model.config.ntk_scaling: - max_seq_len = max(max_len, model.config.max_expected_seq_len) - else: - # without ntk scaling, extending the seq length too far gives bogus results. 
- max_seq_len = model.config.max_expected_seq_len - - # Add only_last_token optimization - global extra_generation_kwargs - if extra_generation_kwargs is None: - extra_generation_kwargs = {} - extra_generation_kwargs["only_last_token"] = True + run_decoder_eval(model, tokenizer, args, device) - if args.device_type == "cpu": - # Bug in 2.3.1 fixed in 2.4.1 for SDPA flash cpu impl when padding too much - extra_generation_kwargs["attn_algorithm"] = "math" - - if not args.no_early_termination and not warmup: - eos_token_id = tokenizer.eos_token_id - else: - eos_token_id = None - - result = generate( - model, - ids, - max_new_tokens=args.max_new_tokens, - use_cache=use_cache, - do_sample=do_sample, - max_seq_len=max_seq_len, - timing=args.timing, - eos_token_id=eos_token_id, - contiguous_cache=True, - extra_kwargs=extra_generation_kwargs, - ) - if args.timing != "": - result, timings = result - if args.timing == "e2e": - dprint(f"E2E timing information: {timings[0]:.3f}s") - elif args.timing == "per-token": - if not warmup: - dprint(f"First-token latency: {timings[0]*1000:.3f} ms") - dprint(f"Average next-token latency: {np.mean(timings[1:])*1000:.3f} ms") - dprint(f"Average next-token latency (including first token): {np.mean(timings)*1000:.3f} ms") - dprint(f"Max next-token latency: {np.max(timings[1:])*1000:.3f} ms (token #{np.argmax(timings[1:]) + 2})") - dprint(f"Min next-token latency: {np.min(timings[1:])*1000:.3f} ms (token #{np.argmin(timings[1:]) + 2})") - dprint(f"Std deviation of next-token latencies: {np.std(timings[1:])*1000:.3f} ms") - timings = [f"{t*1000:.3f}" for t in timings] - dprint(f"Per-token timing information: {', '.join(timings)} ms") - if len(result.shape) == 1: - result = result.unsqueeze(0) - - if not warmup: - for i in range(result.shape[0]): - print_result(result[i], i) - - -do_sample = [False] -use_cache = [ - args.no_use_cache -] # True/False are identical with greedy iff `torch.use_deterministic_algorithms(True)` - -if args.compile: - dprint(f"compilation warmup") - pt_compile_model_time = time.time() - for sample, cache in itertools.product(do_sample, use_cache): - infer(cache, sample, True) - pt_compile_model_time = time.time() - pt_compile_model_time - dprint(f"PT compile complete, took {pt_compile_model_time:.3f}s") - - if is_aiu_backend: - dprint("executing update_lazyhandle and compiling for AIU") - update_lh_time = time.time() - torch_sendnn.update_lazyhandle() - update_lh_time = time.time() - update_lh_time - dprint(f"update_lazyhandle complete, took {update_lh_time:.3f}s") - - if args.device_type == "aiu": # only run warmup for AIU, no need for senulator - aiu_warmup_time = time.time() - for sample, cache in itertools.product(do_sample, use_cache): - infer(cache, sample, True) - aiu_warmup_time = time.time() - aiu_warmup_time - dprint(f"AIU warmup complete, took {aiu_warmup_time:.3f}s") - -dprint(f"generating output") - -for sample, cache in itertools.product(do_sample, use_cache): - for _ in range(args.iters): - infer(cache, sample, False) +if args.distributed: + distributed.barrier() + distributed.destroy_process_group() From 238b05d0eee4d4bb23f8455952dfd290d718bd36 Mon Sep 17 00:00:00 2001 From: Andrea Fasoli Date: Tue, 22 Apr 2025 21:20:06 -0400 Subject: [PATCH 08/30] Refactor AIU setup (relocate env vars setup) Signed-off-by: Andrea Fasoli --- aiu_fms_testing_utils/utils/aiu_setup.py | 61 ++++++++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/aiu_fms_testing_utils/utils/aiu_setup.py b/aiu_fms_testing_utils/utils/aiu_setup.py index 
fb9a3df..a53ec76 100644 --- a/aiu_fms_testing_utils/utils/aiu_setup.py +++ b/aiu_fms_testing_utils/utils/aiu_setup.py @@ -1,4 +1,6 @@ +import argparse import os +import torch # ============================================================== # Common utilities @@ -57,6 +59,8 @@ def aiu_setup(rank=0, world_size=1, local_rank=0, local_size=1, verbose=False): def aiu_dist_setup(rank, world_size, local_rank=-0, local_size=-1, verbose=False): if local_rank < 0: local_rank = rank + + # FIXME: local_size not in use ? if local_size < 0: local_size = world_size @@ -67,3 +71,60 @@ def aiu_dist_setup(rank, world_size, local_rank=-0, local_size=-1, verbose=False dprint(f"Detected running via torchrun") aiu_setup(rank, world_size) + + +# ============================================================== +# Environment variables utilities +# ============================================================== +def set_aiu_env_vars(args: argparse.Namespace) -> None: + """Set necessary environment variables for AIU""" + + _target_cache_size = max( + int(args.max_new_tokens * 2), + int(args.min_pad_length * 2.5), + int(args.fixed_prompt_length * 2.5), + ) + _prompt_size = max(int(args.min_pad_length), int(args.fixed_prompt_length)) + if hasattr(torch._dynamo.config, "accumulated_cache_size_limit"): + if _target_cache_size > torch._dynamo.config.accumulated_cache_size_limit: + _prev = torch._dynamo.config.accumulated_cache_size_limit + torch._dynamo.config.accumulated_cache_size_limit = _target_cache_size + dprint( + "NOTICE: Adjusting torch._dynamo.config.accumulated_cache_size_limit " + f"from {_prev} to {torch._dynamo.config.accumulated_cache_size_limit} " + f"to accomodate prompt size of {_prompt_size} and decode tokens of " + f"{args.max_new_tokens}" + ) + + if _target_cache_size > torch._dynamo.config.cache_size_limit: + _prev = torch._dynamo.config.cache_size_limit + torch._dynamo.config.cache_size_limit = _target_cache_size + dprint( + f"NOTICE: Adjusting torch._dynamo.config.cache_size_limit from {_prev} to " + f"{torch._dynamo.config.cache_size_limit} to accomodate prompt size of " + f"{_prompt_size} and decode tokens of {args.max_new_tokens}" + ) + + if not args.compile_dynamic: + torch._dynamo.config.assume_static_by_default = True + torch._dynamo.config.dynamic_shapes = False + torch._dynamo.config.automatic_dynamic_shapes = False + + # This should be set outside!!! 
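+    # Note: os.environ.setdefault only fills in variables that are not already
+    # defined, so values exported in the environment before launch take precedence.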
+ os.environ.setdefault("SENCORES", "32") + os.environ.setdefault("SENCORELETS", "2") + os.environ.setdefault("DATA_PREC", "fp16") + os.environ.setdefault("FLEX_OVERWRITE_NMB_FRAME", "1") + os.environ.setdefault("DTCOMPILER_KEEP_EXPORT", "true") + + os.environ.setdefault("COMPILATION_MODE", "offline_decoder") + + if args.device_type == "aiu-senulator": + os.environ["FLEX_COMPUTE"] = "SENULATOR" + os.environ["FLEX_DEVICE"] = "MOCK" + else: + if "AIU_WORLD_RANK_0" not in os.environ: + print("must set AIU_WORLD_RANK_0") + exit() + os.environ.setdefault("FLEX_COMPUTE", "SENTIENT") + os.environ.setdefault("FLEX_DEVICE", "VFIO") From 38fef7ad16a354beb9d6a998bbe715c357b1fc60 Mon Sep 17 00:00:00 2001 From: Andrea Fasoli Date: Wed, 18 Jun 2025 14:00:00 -0400 Subject: [PATCH 09/30] Remove deprecated local_size Signed-off-by: Andrea Fasoli --- aiu_fms_testing_utils/utils/aiu_setup.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/aiu_fms_testing_utils/utils/aiu_setup.py b/aiu_fms_testing_utils/utils/aiu_setup.py index a53ec76..0fc4bd0 100644 --- a/aiu_fms_testing_utils/utils/aiu_setup.py +++ b/aiu_fms_testing_utils/utils/aiu_setup.py @@ -23,7 +23,7 @@ def dprint(text): # ============================================================== # Common setup # ============================================================== -def aiu_setup(rank=0, world_size=1, local_rank=0, local_size=1, verbose=False): +def aiu_setup(rank=0, world_size=1, local_rank=0, verbose=False): # ------------- # Envar setup for Sentient backend # ------------- @@ -56,14 +56,10 @@ def aiu_setup(rank=0, world_size=1, local_rank=0, local_size=1, verbose=False): # ============================================================== # Distributed setup # ============================================================== -def aiu_dist_setup(rank, world_size, local_rank=-0, local_size=-1, verbose=False): +def aiu_dist_setup(rank, world_size, local_rank=-0, verbose=False): if local_rank < 0: local_rank = rank - # FIXME: local_size not in use ? - if local_size < 0: - local_size = world_size - if os.getenv("TORCHELASTIC_RUN_ID") is None: os.environ["MASTER_ADDR"] = "localhost" os.environ["MASTER_PORT"] = "12355" From 3b94a3613062ceaa2bfca31951248d8c13a7efd4 Mon Sep 17 00:00:00 2001 From: Andrea Fasoli Date: Wed, 18 Jun 2025 14:19:05 -0400 Subject: [PATCH 10/30] Remove env vars already set in e2e_stable image Signed-off-by: Andrea Fasoli --- aiu_fms_testing_utils/utils/aiu_setup.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/aiu_fms_testing_utils/utils/aiu_setup.py b/aiu_fms_testing_utils/utils/aiu_setup.py index 0fc4bd0..2865027 100644 --- a/aiu_fms_testing_utils/utils/aiu_setup.py +++ b/aiu_fms_testing_utils/utils/aiu_setup.py @@ -107,10 +107,6 @@ def set_aiu_env_vars(args: argparse.Namespace) -> None: torch._dynamo.config.automatic_dynamic_shapes = False # This should be set outside!!! 
- os.environ.setdefault("SENCORES", "32") - os.environ.setdefault("SENCORELETS", "2") - os.environ.setdefault("DATA_PREC", "fp16") - os.environ.setdefault("FLEX_OVERWRITE_NMB_FRAME", "1") os.environ.setdefault("DTCOMPILER_KEEP_EXPORT", "true") os.environ.setdefault("COMPILATION_MODE", "offline_decoder") From effb27bfa5899c5c5cbd5f22bdcd92f58ddbca9c Mon Sep 17 00:00:00 2001 From: Andrea Fasoli Date: Wed, 18 Jun 2025 17:46:31 -0400 Subject: [PATCH 11/30] Group and update parser arguments Signed-off-by: Andrea Fasoli --- aiu_fms_testing_utils/utils/args_parsing.py | 144 ++++++++++---------- 1 file changed, 70 insertions(+), 74 deletions(-) diff --git a/aiu_fms_testing_utils/utils/args_parsing.py b/aiu_fms_testing_utils/utils/args_parsing.py index ce0892e..7d2992d 100644 --- a/aiu_fms_testing_utils/utils/args_parsing.py +++ b/aiu_fms_testing_utils/utils/args_parsing.py @@ -7,19 +7,20 @@ def get_args(parser: argparse.ArgumentParser) -> argparse.Namespace: - # FMS model loading arguments - parser.add_argument( + # Arguments for FMS model loading + args_model_loading = parser.add_argument_group("FMS model loading") + args_model_loading.add_argument( "--architecture", type=str, help="The model architecture to benchmark", ) - parser.add_argument( + args_model_loading.add_argument( "--variant", type=str, default=None, help="The model variant (configuration) to benchmark. E.g. 7b, 13b, 70b.", ) - parser.add_argument( + args_model_loading.add_argument( "--model_path", type=str, help=( @@ -27,20 +28,19 @@ def get_args(parser: argparse.ArgumentParser) -> argparse.Namespace: "(.pth files sharded by tensor parallel rank, not HF weights)" ), ) - parser.add_argument( + args_model_loading.add_argument( "--model_source", type=str, help="Source of the checkpoint. E.g. 'meta', 'hf', None", ) - parser.add_argument( + args_model_loading.add_argument( "--unfuse_weights", action="store_true", help=( - "If set to True, this will unfuse any fused weight modules that " - "support the unfuse_weights method" + "If set to True, this will unfuse any fused weight modules" ), ) - parser.add_argument( + args_model_loading.add_argument( "--default_dtype", type=str, default=None, @@ -52,133 +52,144 @@ def get_args(parser: argparse.ArgumentParser) -> argparse.Namespace: ) # Quantization arguments - parser.add_argument( + args_quantization = parser.add_argument_group("Model quantization") + args_quantization.add_argument( "--quantization", type=str, choices=["gptq", "int8"], default=None, help="Type of quantization of the model checkpoint", ) - parser.add_argument( + args_quantization.add_argument( "--int8_weight_per_channel", action="store_true", help="Enable per-channel weight quantization in INT8 quantized model", ) - parser.add_argument( + args_quantization.add_argument( "--int8_activ_quant_type", default="per_token", choices=["per_token", "per_tensor_symm", "per_tensor_asymm"], type=str, help="Define strategy for activation quantization in INT8 quantized model", ) - parser.add_argument( + args_quantization.add_argument( "--int8_smoothquant", action="store_true", help="Enable smoothquant in INT8 quantized model", ) - parser.add_argument( # NOTE: roberta only so far but should expand to LLM - "--direct_quantization", + args_quantization.add_argument( + "--int8_direct_quantization", action="store_true", help="Train INT8 model with Direct Quantization", ) - parser.add_argument( - "--num_dq_samples", + args_quantization.add_argument( + "--int8_num_dq_samples", type=int, default=128, help="number of samples used for Direct 
Quantization", ) - # General settings - parser.add_argument( + # General run settings + args_run_settings = parser.add_argument_group("Run settings") + args_run_settings.add_argument( "--device_type", type=str, choices=["cuda", "cpu", "aiu", "aiu-senulator"], default="cuda", help="The device to run the model on" ) - parser.add_argument( + args_run_settings.add_argument( "--seed", type=int, default=81072, help="Run seed (only needed if eval dataset is shuffled)", ) - parser.add_argument( + args_run_settings.add_argument( "--output_path", type=str, default="", help="path of folder to save outputs to, if empty don't save", ) - parser.add_argument( + args_run_settings.add_argument( "--tokenizer", type=str, required=True, help="Path to the tokenizer (e.g. ~/tokenizer.model)", ) - parser.add_argument( + args_run_settings.add_argument( "--no_use_cache", action="store_false", help="Disable the kv-cache (on by default)", ) - parser.add_argument( + args_run_settings.add_argument( "--deterministic", action="store_true", - help="`deterministic` requires env variable `CUBLAS_WORKSPACE_CONFIG=:4096:8`", + help=( + "`deterministic` requires env variable `CUBLAS_WORKSPACE_CONFIG=:4096:8`" + " when running on CPU or GPU. This flag is ignored on AIU." + ), ) - parser.add_argument( + args_run_settings.add_argument( "--distributed", action="store_true", help="This is a distributed job (multiple instances run with RANK+WORLD_SIZE)", ) - parser.add_argument( # could be a bool / flag + args_run_settings.add_argument( # could be a bool / flag '-v', '--verbose', action='count', default=0, help="Set verbosity level (pass flag as `-v`, `-vv`, `-vvv`)" ) - # Compiling arguments - parser.add_argument( + # Arguments for compilation + args_compile = parser.add_argument_group("Compiler") + args_compile.add_argument( "--compile", action="store_true", help="Use torch.compile (slow for first inference pass)", ) - parser.add_argument( + args_compile.add_argument( "--compile_mode", type=str, help="Mode for compilation (only valid for inductor backend)", default="default", choices=["default", "reduce-overhead"], ) - parser.add_argument( + args_compile.add_argument( "--compile_backend", type=str, help="Backend for compilation (only when not running on AIU)", default="inductor", choices=["inductor", "eager", "aot_eager"], ) - parser.add_argument( + args_compile.add_argument( "--compile_dynamic", action="store_true", help="Use dynamic shapes with torch.compile", ) - # LLM-specific inference arguments - parser.add_argument( + # Arguments shared between Decoder and Encoder models + args_dec_enc = parser.add_argument_group("Decoders or Encoders (shared args)") + args_dec_enc.add_argument( "--batch_size", type=int, default=1, help="size of input batch", ) - parser.add_argument( + args_dec_enc.add_argument( "--max_prompt_length", type=int, default=None, help=( "Cap the number of tokens per prompt to a maximum length prior to padding. " - "If None, there will be no cap." + "If None, prompts to decoder models will have no cap, while prompts to " + "encoder models will be capped to a default of 384 tokens." 
), ) - parser.add_argument( + + # Decoder model arguments + args_decoder = parser.add_argument_group("Decoders") + args_decoder.add_argument( "--min_pad_length", type=int, default=0, @@ -187,7 +198,7 @@ def get_args(parser: argparse.ArgumentParser) -> argparse.Namespace: "the specified length, padding will be determined by the largest prompt" ), ) - parser.add_argument( + args_decoder.add_argument( "--fixed_prompt_length", type=int, default=0, @@ -196,25 +207,25 @@ def get_args(parser: argparse.ArgumentParser) -> argparse.Namespace: "Pads input to fixed_prompt_length, fails if any input needs truncation." ), ) - parser.add_argument( + args_decoder.add_argument( "--max_new_tokens", type=int, help="max number of generated tokens", default=100, ) - parser.add_argument( + args_decoder.add_argument( "--no_early_termination", action="store_true", help="disable early termination on generation", ) - parser.add_argument( + args_decoder.add_argument( "--prompt_type", type=str, choices=["chat", "code"], default="chat", help="type of prompts to be used, either chat or code", ) - parser.add_argument( + args_decoder.add_argument( "--prompt_path", type=str, default="", @@ -223,14 +234,14 @@ def get_args(parser: argparse.ArgumentParser) -> argparse.Namespace: "Supports glob-style patterns" ), ) - parser.add_argument( + args_decoder.add_argument( "--timing", type=str, choices=["e2e", "per-token"], default="", help="if set, how to time the generation of tokens, e2e or per-token", ) - parser.add_argument( + args_decoder.add_argument( "--iters", type=int, default=1, @@ -240,26 +251,27 @@ def get_args(parser: argparse.ArgumentParser) -> argparse.Namespace: ), ) - # RoBERTa-specific evaluation arguments - parser.add_argument( + # Encoder model arguments + args_encoder = parser.add_argument_group("Encoders") + args_encoder.add_argument( "--dataset_name", type=str, default="squad_v2", help="The name of the dataset to use (via the datasets library).", ) - parser.add_argument( + args_encoder.add_argument( "--dataset_config_name", type=str, default=None, help="The configuration name of the dataset to use (via the datasets library).", ) - parser.add_argument( + args_encoder.add_argument( "--n_best_size", type=int, default=20, help="Total number of n-best predictions to generate.", ) - parser.add_argument( + args_encoder.add_argument( "--null_score_diff_threshold", type=float, default=0.0, @@ -270,13 +282,13 @@ def get_args(parser: argparse.ArgumentParser) -> argparse.Namespace: "`version_2_with_negative=True`." ), ) - parser.add_argument( + args_encoder.add_argument( "--version_2_with_negative", type=bool, default=True, help="If true, some of the examples do not have an answer.", ) - parser.add_argument( + args_encoder.add_argument( "--max_answer_length", type=int, default=30, @@ -285,23 +297,13 @@ def get_args(parser: argparse.ArgumentParser) -> argparse.Namespace: "because the start and end predictions are not conditioned on one another." ), ) - parser.add_argument( + args_encoder.add_argument( "--validation_file", type=str, default=None, help="A csv or a json file containing the validation data.", ) - parser.add_argument( - "--max_seq_length", - type=int, - default=384, - help=( - "The maximum total input sequence length after tokenization. " - "Sequences longer than this will be truncated, " - "sequences shorter will be padded if `--pad_to_max_length` is passed." 
- ), - ) - parser.add_argument( + args_encoder.add_argument( "--pad_to_max_length", action="store_true", help=( @@ -309,7 +311,7 @@ def get_args(parser: argparse.ArgumentParser) -> argparse.Namespace: "Otherwise, dynamic padding is used." ), ) - parser.add_argument( + args_encoder.add_argument( "--max_eval_samples", type=int, default=None, @@ -318,15 +320,15 @@ def get_args(parser: argparse.ArgumentParser) -> argparse.Namespace: "evaluation examples to this value if set." ), ) - parser.add_argument( + args_encoder.add_argument( "--preprocessing_num_workers", type=int, default=1, help="" ) - parser.add_argument( + args_encoder.add_argument( "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets", ) - parser.add_argument( + args_encoder.add_argument( "--doc_stride", type=int, default=128, @@ -335,12 +337,6 @@ def get_args(parser: argparse.ArgumentParser) -> argparse.Namespace: "to take between chunks." ), ) - parser.add_argument( # NOTE: consider replacing in code with batch_size (DQ vs eval?) - "--per_device_eval_batch_size", - type=int, - default=1, - help="Batch size (per device) for the evaluation dataloader.", - ) args = parser.parse_args() # Add convenient arguments to parser @@ -350,7 +346,7 @@ def get_args(parser: argparse.ArgumentParser) -> argparse.Namespace: args.dynamo_backend = "sendnn" if args.is_aiu_backend else "inductor" args.fused_weights = not args.unfuse_weights - if args.verbose: + if args.verbose > 0: dprint("=" * 60) dprint(args) dprint("=" * 60) From 600ba6719eb10c187fbe74589d9160cafdc5c0fd Mon Sep 17 00:00:00 2001 From: Andrea Fasoli Date: Wed, 18 Jun 2025 17:53:05 -0400 Subject: [PATCH 12/30] Rename enc/dec utils Signed-off-by: Andrea Fasoli --- .../utils/{decoders.py => decoders_utils.py} | 0 .../utils/{encoders.py => encoders_utils.py} | 54 ++++++++++++++----- 2 files changed, 42 insertions(+), 12 deletions(-) rename aiu_fms_testing_utils/utils/{decoders.py => decoders_utils.py} (100%) rename aiu_fms_testing_utils/utils/{encoders.py => encoders_utils.py} (93%) diff --git a/aiu_fms_testing_utils/utils/decoders.py b/aiu_fms_testing_utils/utils/decoders_utils.py similarity index 100% rename from aiu_fms_testing_utils/utils/decoders.py rename to aiu_fms_testing_utils/utils/decoders_utils.py diff --git a/aiu_fms_testing_utils/utils/encoders.py b/aiu_fms_testing_utils/utils/encoders_utils.py similarity index 93% rename from aiu_fms_testing_utils/utils/encoders.py rename to aiu_fms_testing_utils/utils/encoders_utils.py index 80b3a52..fe1a6a0 100644 --- a/aiu_fms_testing_utils/utils/encoders.py +++ b/aiu_fms_testing_utils/utils/encoders_utils.py @@ -56,6 +56,30 @@ def __init__( self.answer_column_name = "" self.pad_on_right = True + self.validate_arguments() + + + def validate_arguments(self): + """Ensure arguments compatibility with Encoder models.""" + + args = self.args + if args.min_pad_length != 0: + raise ValueError( + "Argument min_pad_length should not be provided to encoders. " + "To pad the input sequence, use --pad_to_max_length flag instead." + ) + if args.fixed_prompt_length != 0: + raise ValueError( + "Argument fixed_prompt_length should not be provided to encoders. " + "To pad the input sequence, use --pad_to_max_length flag instead." + ) + if args.max_new_tokens != 100: + raise ValueError( + "Argument max_new_token should not be provided to encoders. " + "To define the max length of a generated answer in QuestionAnswering " + "use --max_answer_length instead." 
+ ) + def prepare_validation_features(self, examples): """Validation preprocessing""" @@ -64,7 +88,11 @@ def prepare_validation_features(self, examples): q_col_name = self.question_column_name c_col_name = self.context_column_name pad_on_right = self.pad_on_right - max_seq_length = self.max_seq_length + max_prompt_length = ( + args.max_prompt_length + if args.max_prompt_length is not None + else 384 + ) # Some of the questions have lots of whitespace on the left, which is not useful # and will make the truncation of the context fail (the tokenized question will @@ -81,8 +109,8 @@ def prepare_validation_features(self, examples): examples[q_col_name if pad_on_right else c_col_name], examples[c_col_name if pad_on_right else q_col_name], truncation="only_second" if pad_on_right else "only_first", - max_length=max_seq_length, - stride=min(args.doc_stride, max_seq_length // 2), + max_length=max_prompt_length, + stride=min(args.doc_stride, max_prompt_length // 2), return_overflowing_tokens=True, return_offsets_mapping=True, padding="max_length" if args.pad_to_max_length else False, @@ -154,14 +182,16 @@ def process_eval_set(self): # Padding side determines if we do (question|context) or (context|question) self.pad_on_right = self.tokenizer.padding_side == "right" - if args.max_seq_length > self.tokenizer.model_max_length: + if args.max_prompt_length > self.tokenizer.model_max_length: dprint( - f"The max_seq_length passed ({args.max_seq_length}) is larger than the " - f"maximum length for the model ({self.tokenizer.model_max_length}). " - f"Using max_seq_length={self.tokenizer.model_max_length}." + f"max_prompt_length ({args.max_prompt_length}) is larger than the " + f"maximum length supported ({self.tokenizer.model_max_length}). " + f"Using max_prompt_length={self.tokenizer.model_max_length} instead." 
+ ) + self.max_prompt_length = min( + args.max_seq_length, + self.tokenizer.model_max_length, ) - - self.max_seq_length = min(args.max_seq_length, self.tokenizer.model_max_length) eval_examples = raw_datasets["validation"] if args.max_eval_samples is not None: @@ -542,10 +572,10 @@ def run_evaluation(self): dprint( f"Runtime: {eval_duration:.0f} s | " f"{eval_duration / len(eval_dataloader):.2f} s/batch | " - f"{eval_duration / (len(eval_dataloader) * args.per_device_eval_batch_size):.2f}" + f"{eval_duration / (len(eval_dataloader) * args.batch_size):.2f}" " s/sample " - f"(tot = {len(eval_dataloader) * args.per_device_eval_batch_size}, " - f"bs = {args.per_device_eval_batch_size})" + f"(tot = {len(eval_dataloader) * args.batch_size}, " + f"bs = {args.batch_size})" ) # concatenate the numpy array From 031abde0310c9ae4fcf0fe3853ec0731d3e98396 Mon Sep 17 00:00:00 2001 From: Andrea Fasoli Date: Wed, 18 Jun 2025 19:17:00 -0400 Subject: [PATCH 13/30] Gating some AIU settings Signed-off-by: Andrea Fasoli --- aiu_fms_testing_utils/utils/aiu_setup.py | 54 ++++++++++++------------ 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/aiu_fms_testing_utils/utils/aiu_setup.py b/aiu_fms_testing_utils/utils/aiu_setup.py index 2865027..ff870cd 100644 --- a/aiu_fms_testing_utils/utils/aiu_setup.py +++ b/aiu_fms_testing_utils/utils/aiu_setup.py @@ -75,41 +75,41 @@ def aiu_dist_setup(rank, world_size, local_rank=-0, verbose=False): def set_aiu_env_vars(args: argparse.Namespace) -> None: """Set necessary environment variables for AIU""" - _target_cache_size = max( - int(args.max_new_tokens * 2), - int(args.min_pad_length * 2.5), - int(args.fixed_prompt_length * 2.5), - ) - _prompt_size = max(int(args.min_pad_length), int(args.fixed_prompt_length)) - if hasattr(torch._dynamo.config, "accumulated_cache_size_limit"): - if _target_cache_size > torch._dynamo.config.accumulated_cache_size_limit: - _prev = torch._dynamo.config.accumulated_cache_size_limit - torch._dynamo.config.accumulated_cache_size_limit = _target_cache_size + if not args.compile_dynamic: + _target_cache_size = max( + int(args.max_new_tokens * 2), + int(args.min_pad_length * 2.5), + int(args.fixed_prompt_length * 2.5), + ) + _prompt_size = max(int(args.min_pad_length), int(args.fixed_prompt_length)) + if hasattr(torch._dynamo.config, "accumulated_cache_size_limit"): + if _target_cache_size > torch._dynamo.config.accumulated_cache_size_limit: + _prev = torch._dynamo.config.accumulated_cache_size_limit + torch._dynamo.config.accumulated_cache_size_limit = _target_cache_size + dprint( + "NOTICE: Adjusting torch._dynamo.config.accumulated_cache_size_limit " + f"from {_prev} to {torch._dynamo.config.accumulated_cache_size_limit} " + f"to accomodate prompt size of {_prompt_size} and decode tokens of " + f"{args.max_new_tokens}" + ) + + if _target_cache_size > torch._dynamo.config.cache_size_limit: + _prev = torch._dynamo.config.cache_size_limit + torch._dynamo.config.cache_size_limit = _target_cache_size dprint( - "NOTICE: Adjusting torch._dynamo.config.accumulated_cache_size_limit " - f"from {_prev} to {torch._dynamo.config.accumulated_cache_size_limit} " - f"to accomodate prompt size of {_prompt_size} and decode tokens of " - f"{args.max_new_tokens}" + f"NOTICE: Adjusting torch._dynamo.config.cache_size_limit from {_prev} to " + f"{torch._dynamo.config.cache_size_limit} to accomodate prompt size of " + f"{_prompt_size} and decode tokens of {args.max_new_tokens}" ) - if _target_cache_size > torch._dynamo.config.cache_size_limit: - _prev = 
torch._dynamo.config.cache_size_limit - torch._dynamo.config.cache_size_limit = _target_cache_size - dprint( - f"NOTICE: Adjusting torch._dynamo.config.cache_size_limit from {_prev} to " - f"{torch._dynamo.config.cache_size_limit} to accomodate prompt size of " - f"{_prompt_size} and decode tokens of {args.max_new_tokens}" - ) - - if not args.compile_dynamic: torch._dynamo.config.assume_static_by_default = True torch._dynamo.config.dynamic_shapes = False torch._dynamo.config.automatic_dynamic_shapes = False - # This should be set outside!!! - os.environ.setdefault("DTCOMPILER_KEEP_EXPORT", "true") + # os.environ.setdefault("DTCOMPILER_KEEP_EXPORT", "true") # CONFIRM IF THIS IS NEEDE - os.environ.setdefault("COMPILATION_MODE", "offline_decoder") + if not args.is_encoder: + os.environ.setdefault("COMPILATION_MODE", "offline_decoder") if args.device_type == "aiu-senulator": os.environ["FLEX_COMPUTE"] = "SENULATOR" From 8657a112ecfd23e0749c506c6501fc0007c4b355 Mon Sep 17 00:00:00 2001 From: Andrea Fasoli Date: Wed, 18 Jun 2025 19:23:19 -0400 Subject: [PATCH 14/30] Split inference into decoder/encoder scripts (wip) Signed-off-by: Andrea Fasoli --- scripts/run_decoder.py | 89 ++++++++++++++++++++++++++++++++ scripts/run_encoder.py | 113 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 202 insertions(+) create mode 100644 scripts/run_decoder.py create mode 100644 scripts/run_encoder.py diff --git a/scripts/run_decoder.py b/scripts/run_decoder.py new file mode 100644 index 0000000..b3dfb7b --- /dev/null +++ b/scripts/run_decoder.py @@ -0,0 +1,89 @@ +# Standard +import argparse +import time + +# Third Party +from fms.models import get_model +from fms.utils import tokenizers +from torch import distributed, set_grad_enabled + +# Local Packages +from aiu_fms_testing_utils.utils.aiu_setup import dprint, rank +from aiu_fms_testing_utils.utils.args_parsing import get_args +from aiu_fms_testing_utils.utils.decoders_utils import run_decoder_eval +from aiu_fms_testing_utils.utils.model_setup import setup_model +from aiu_fms_testing_utils.utils.quantization_setup import ( + import_addons, + get_linear_config, + print_model_params, +) + + +parser = argparse.ArgumentParser( + description="Entry point for AIU inference of decoder models." 
+) +args = get_args(parser) +args.is_encoder = False # add argument directly into Namespace + +if args.is_quantized: + import_addons(args) + +if args.distributed: + distributed.init_process_group() + +# Main model setup +default_dtype, device, dist_strat = setup_model(args) + +# Retrieve linear configuration (quantized or not) to instantiate FMS model +linear_config = get_linear_config(args) + +if rank == 0: + dprint("="*60) + dprint(f"model_path={args.model_path}") + dprint(f"{linear_config=}") + dprint(f"fused_weights={args.fused_weights}") + dprint(f"data_type={default_dtype}") + dprint("="*60 + "\n") + +dprint("Loading model...") +loading_model_start = time.time() +model = get_model( + args.architecture, + args.variant, + model_path=args.model_path, + device_type="cpu" if args.is_aiu_backend else args.device_type, + data_type=default_dtype, + source=args.model_source, + distributed_strategy=dist_strat, + group=distributed.group.WORLD, + linear_config=linear_config, + fused_weights=args.fused_weights, +) + +if args.is_quantized: + print_model_params(model, args) + +tokenizer = tokenizers.get_tokenizer(args.tokenizer) + +model.eval() +set_grad_enabled(False) +if args.distributed: + distributed.barrier() +dprint(f"Loading model completed in {time.time() - loading_model_start:.2f} s.") + +if args.compile: + dprint("Compiling model...") + if args.is_aiu_backend: + model.compile(backend="sendnn_decoder") + else: + # compiling can make first inference pass slow + model.compile(mode=args.compile_mode, backend=args.compile_backend) + dprint("Model compiled.") +else: + dprint("[WARNING] SKIP COMPILE.") + +run_decoder_eval(model, tokenizer, args, device) + +if args.distributed: + distributed.barrier() + distributed.destroy_process_group() diff --git a/scripts/run_encoder.py b/scripts/run_encoder.py new file mode 100644 index 0000000..c7e07f1 --- /dev/null +++ b/scripts/run_encoder.py @@ -0,0 +1,113 @@ +# Standard +import argparse +import time + +# Third Party +from fms.models import get_model +from fms.utils import tokenizers +from torch import distributed, set_grad_enabled + +# Local Packages +from aiu_fms_testing_utils.utils.aiu_setup import dprint, rank, world_size +from aiu_fms_testing_utils.utils.args_parsing import get_args +from aiu_fms_testing_utils.utils.encoders_utils import ( + get_roberta_tokenizer, + wrap_encoder, + run_encoder_eval_qa, + run_encoder_eval_mlm, +) +from aiu_fms_testing_utils.utils.model_setup import setup_model +from aiu_fms_testing_utils.utils.quantization_setup import ( + import_addons, + get_linear_config, + print_model_params, +) + + +parser = argparse.ArgumentParser( + description="Entry point for AIU inference of encoder models." +) +args = get_args(parser) +args.is_encoder = True # add argument directly into Namespace + +if args.is_quantized: + import_addons(args) + +if args.distributed: + distributed.init_process_group(backend="gloo", rank=rank, world_size=world_size) + +# Main model setup +default_dtype, device, dist_strat = setup_model(args) + +model_path = args.model_path +if args.int8_direct_quantization: + save_path = None + + # !!! insert DQ for encoders here + + # if DQ is used, args.model_path represent FP16 ckpt but we need to load the + # newly-created INT8 ckpt. Without DQ, args.model_path is the INT8 ckpt already. 
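# Illustrative sketch (not part of the patch): the split entry points tag the parsed
# Namespace explicitly, with run_decoder.py setting is_encoder=False and
# run_encoder.py setting it to True, and the *_utils validators added in the next
# patch read that tag back defensively with getattr() so a missing tag fails loudly.
import argparse

args = argparse.Namespace(architecture="llama")
args.is_encoder = False  # as done in run_decoder.py right after get_args()
if getattr(args, "is_encoder", True):  # decoder-side guard, see validate_decoder_arguments
    raise ValueError(
        "Running decoder model but is_encoder argument is either not set or True"
    )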
+ model_path = save_path + +# Retrieve linear configuration (quantized or not) to instantiate FMS model +linear_config = get_linear_config(args) + +if rank == 0: + dprint("="*60) + dprint(f"model_path={args.model_path}") + dprint(f"{linear_config=}") + dprint(f"fused_weights={args.fused_weights}") + dprint(f"data_type={default_dtype}") + dprint("="*60 + "\n") + +dprint("Loading model...") +loading_model_start = time.time() +model = get_model( + args.architecture, + args.variant, + model_path=model_path, + device_type="cpu" if args.is_aiu_backend else args.device_type, + data_type=default_dtype, + source=args.model_source, + distributed_strategy=dist_strat, + group=distributed.group.WORLD, + linear_config=linear_config, + fused_weights=args.fused_weights, +) + +if args.is_quantized: + print_model_params(model, args) + +if "roberta" in args.architecture: + tokenizer = get_roberta_tokenizer(args.tokenizer) +else: + tokenizer = tokenizers.get_tokenizer(args.tokenizer) + +model.eval() +set_grad_enabled(False) +if args.distributed: + distributed.barrier() +dprint(f"Loading model completed in {time.time() - loading_model_start:.2f} s.") + +if args.architecture == "roberta": + model = wrap_encoder(model) + +if args.compile: + dprint("Compiling model...") + if args.is_aiu_backend: + model.compile(backend="sendnn_decoder") + else: + # compiling can make first inference pass slow + model.compile(mode=args.compile_mode, backend=args.compile_backend) + dprint("Model compiled.") +else: + dprint("[WARNING] SKIP COMPILE.") + +if args.architecture == "roberta_question_answering": + run_encoder_eval_qa(model, tokenizer, args) +elif args.architecture == "roberta": # basic MaskedLM downstream task + run_encoder_eval_mlm(model, tokenizer, args) + +if args.distributed: + distributed.barrier() + distributed.destroy_process_group() From 0d042c63092f18e9fc8c16e9936e975de58b9d84 Mon Sep 17 00:00:00 2001 From: Andrea Fasoli Date: Wed, 18 Jun 2025 21:06:15 -0400 Subject: [PATCH 15/30] Fix tokenizer; add some dec/enc args validation Signed-off-by: Andrea Fasoli --- aiu_fms_testing_utils/utils/decoders_utils.py | 26 ++++++++-- aiu_fms_testing_utils/utils/encoders_utils.py | 52 ++++++++++--------- scripts/run_encoder.py | 6 +-- 3 files changed, 50 insertions(+), 34 deletions(-) diff --git a/aiu_fms_testing_utils/utils/decoders_utils.py b/aiu_fms_testing_utils/utils/decoders_utils.py index f0b6aa2..0af0c6a 100644 --- a/aiu_fms_testing_utils/utils/decoders_utils.py +++ b/aiu_fms_testing_utils/utils/decoders_utils.py @@ -9,7 +9,8 @@ # Third Party from fms.utils import generation from fms.utils.generation import generate, pad_input_ids -from transformers import PreTrainedModel, PreTrainedTokenizerBase +from fms.utils.tokenizers import BaseTokenizer +from torch import nn import numpy as np import torch @@ -22,8 +23,8 @@ class DecoderInfer(): def __init__( self, - model: PreTrainedModel, - tokenizer: PreTrainedTokenizerBase, + model: nn.Module, + tokenizer: BaseTokenizer, args: argparse.Namespace, device: torch.device, ): @@ -41,6 +42,21 @@ def __init__( self.do_sample = [False] self.use_cache = [args.no_use_cache] # True/False identical with greedy iff `torch.use_deterministic_algorithms(True)` + self.validate_decoder_arguments() + + def validate_decoder_arguments(self): + """Ensure arguments compatibility with Encoder models.""" + + args = self.args + if getattr(args, "is_encoder", True): + raise ValueError( + "Running decoder model but is_encoder argument is either not set or True" + ) + if "bert" in 
args.architecture.lower(): + raise ValueError( + f"Architecture {args.architecture} should be run as an encoder model." + ) + def ids_for_prompt(self, prompt): """Process textual prompt and return tokenized ids.""" @@ -319,8 +335,8 @@ def run_generation(self, ids): def run_decoder_eval( - model: PreTrainedModel, - tokenizer: PreTrainedTokenizerBase, + model: nn.Module, + tokenizer: BaseTokenizer, args: argparse.Namespace, device: torch.device, ): diff --git a/aiu_fms_testing_utils/utils/encoders_utils.py b/aiu_fms_testing_utils/utils/encoders_utils.py index fe1a6a0..7c6b9e0 100644 --- a/aiu_fms_testing_utils/utils/encoders_utils.py +++ b/aiu_fms_testing_utils/utils/encoders_utils.py @@ -9,15 +9,14 @@ # Third Party from datasets import load_dataset from fms.models.hf import to_hf_api +from fms.utils.tokenizers import BaseTokenizer +from torch import nn from torch.utils.data import DataLoader from transformers import ( default_data_collator, DataCollatorWithPadding, EvalPrediction, pipeline, - PreTrainedModel, - PreTrainedTokenizerBase, - RobertaTokenizerFast, ) import evaluate import numpy as np @@ -27,10 +26,6 @@ from aiu_fms_testing_utils.utils.aiu_setup import dprint, rank -def get_roberta_tokenizer(tokenizer_path): - return RobertaTokenizerFast.from_pretrained(tokenizer_path) - - def wrap_encoder(model): """Add config info and wrapper to run pipeline for RoBERTa MaskedLM.""" @@ -43,8 +38,8 @@ class EncoderQAInfer(): def __init__( self, - model: PreTrainedModel, - tokenizer: PreTrainedTokenizerBase, + model: nn.Module, + tokenizer: BaseTokenizer, args: argparse.Namespace, ): self.model = model @@ -56,13 +51,17 @@ def __init__( self.answer_column_name = "" self.pad_on_right = True - self.validate_arguments() + self.validate_encoder_arguments() - def validate_arguments(self): + def validate_encoder_arguments(self): """Ensure arguments compatibility with Encoder models.""" args = self.args + if not getattr(args, "is_encoder", False): + raise ValueError( + "Running encoder model but is_encoder argument is either not set or False." + ) if args.min_pad_length != 0: raise ValueError( "Argument min_pad_length should not be provided to encoders. " @@ -105,7 +104,7 @@ def prepare_validation_features(self, examples): # using a stride. This results in one example possible giving several features # when a context is long, each of those features having a context that overlaps # a bit the context of the previous feature. - tokenized_examples = self.tokenizer( + tokenized_examples = self.tokenizer.tokenize( examples[q_col_name if pad_on_right else c_col_name], examples[c_col_name if pad_on_right else q_col_name], truncation="only_second" if pad_on_right else "only_first", @@ -182,15 +181,16 @@ def process_eval_set(self): # Padding side determines if we do (question|context) or (context|question) self.pad_on_right = self.tokenizer.padding_side == "right" - if args.max_prompt_length > self.tokenizer.model_max_length: + model_max_length = self.tokenizer.tokenizer.model_max_length # TODO: add model_max_length to FMS _HFTokenizer + if args.max_prompt_length > model_max_length: dprint( f"max_prompt_length ({args.max_prompt_length}) is larger than the " - f"maximum length supported ({self.tokenizer.model_max_length}). " - f"Using max_prompt_length={self.tokenizer.model_max_length} instead." + f"maximum length supported ({model_max_length}). " + f"Using max_prompt_length={model_max_length} instead." 
) self.max_prompt_length = min( args.max_seq_length, - self.tokenizer.model_max_length, + model_max_length, ) eval_examples = raw_datasets["validation"] @@ -226,7 +226,7 @@ def process_eval_set(self): # (by padding to the maximum length of the samples passed). pad_to_multiple_of = None self.data_collator = DataCollatorWithPadding( - self.tokenizer, + self.tokenizer.tokenizer, pad_to_multiple_of=pad_to_multiple_of, ) @@ -612,8 +612,8 @@ class EncoderMLMInfer(): def __init__( self, - model: PreTrainedModel, - tokenizer: PreTrainedTokenizerBase, + model: nn.Module, + tokenizer: BaseTokenizer, args: argparse.Namespace, ): self.model = model @@ -633,7 +633,11 @@ def run_evaluation(self, warmup=False): dprint(f"Starting evaluation ({warmup=})...") warmup_start_time = time.time() - unmasker = pipeline("fill-mask", model=self.model, tokenizer=self.tokenizer) + unmasker = pipeline( + "fill-mask", + model=self.model, + tokenizer=self.tokenizer.tokenizer, + ) output = unmasker(self.prompt) if rank == 0: dprint(f"Run completed in {time.time() - warmup_start_time:.1f} s\n---") @@ -644,8 +648,8 @@ def run_evaluation(self, warmup=False): def run_encoder_eval_qa( - model: PreTrainedModel, - tokenizer: PreTrainedTokenizerBase, + model: nn.Module, + tokenizer: BaseTokenizer, args: argparse.Namespace, ): """Entry point to run QuestionAnswering Evaluation of encoder model. @@ -663,8 +667,8 @@ def run_encoder_eval_qa( def run_encoder_eval_mlm( - model: PreTrainedModel, - tokenizer: PreTrainedTokenizerBase, + model: nn.Module, + tokenizer: BaseTokenizer, args: argparse.Namespace, ): """Entry point to run evaluation of encoder models.""" diff --git a/scripts/run_encoder.py b/scripts/run_encoder.py index c7e07f1..a11c94c 100644 --- a/scripts/run_encoder.py +++ b/scripts/run_encoder.py @@ -11,7 +11,6 @@ from aiu_fms_testing_utils.utils.aiu_setup import dprint, rank, world_size from aiu_fms_testing_utils.utils.args_parsing import get_args from aiu_fms_testing_utils.utils.encoders_utils import ( - get_roberta_tokenizer, wrap_encoder, run_encoder_eval_qa, run_encoder_eval_mlm, @@ -78,10 +77,7 @@ if args.is_quantized: print_model_params(model, args) -if "roberta" in args.architecture: - tokenizer = get_roberta_tokenizer(args.tokenizer) -else: - tokenizer = tokenizers.get_tokenizer(args.tokenizer) +tokenizer = tokenizers.get_tokenizer(args.tokenizer) model.eval() set_grad_enabled(False) From 3f1372911f43098e1a159e05eb7686bdcbcba0b0 Mon Sep 17 00:00:00 2001 From: Andrea Fasoli Date: Wed, 18 Jun 2025 21:07:31 -0400 Subject: [PATCH 16/30] Update AIU env var Signed-off-by: Andrea Fasoli --- aiu_fms_testing_utils/utils/aiu_setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aiu_fms_testing_utils/utils/aiu_setup.py b/aiu_fms_testing_utils/utils/aiu_setup.py index ff870cd..5d4fe94 100644 --- a/aiu_fms_testing_utils/utils/aiu_setup.py +++ b/aiu_fms_testing_utils/utils/aiu_setup.py @@ -119,4 +119,4 @@ def set_aiu_env_vars(args: argparse.Namespace) -> None: print("must set AIU_WORLD_RANK_0") exit() os.environ.setdefault("FLEX_COMPUTE", "SENTIENT") - os.environ.setdefault("FLEX_DEVICE", "VFIO") + os.environ.setdefault("FLEX_DEVICE", "PF") # will use VF eventually From 4e731d881b6c921d19c50bd52ff2431e3e1ec98d Mon Sep 17 00:00:00 2001 From: Andrea Fasoli Date: Wed, 18 Jun 2025 21:08:51 -0400 Subject: [PATCH 17/30] Minor args update Signed-off-by: Andrea Fasoli --- aiu_fms_testing_utils/utils/args_parsing.py | 7 +++---- aiu_fms_testing_utils/utils/quantization_setup.py | 10 +++++----- 2 files changed, 8 
insertions(+), 9 deletions(-) diff --git a/aiu_fms_testing_utils/utils/args_parsing.py b/aiu_fms_testing_utils/utils/args_parsing.py index 7d2992d..90aea3c 100644 --- a/aiu_fms_testing_utils/utils/args_parsing.py +++ b/aiu_fms_testing_utils/utils/args_parsing.py @@ -56,7 +56,7 @@ def get_args(parser: argparse.ArgumentParser) -> argparse.Namespace: args_quantization.add_argument( "--quantization", type=str, - choices=["gptq", "int8"], + choices=["gptq", "int8"], # TODO: add "fp8" when available in FMS default=None, help="Type of quantization of the model checkpoint", ) @@ -102,7 +102,7 @@ def get_args(parser: argparse.ArgumentParser) -> argparse.Namespace: "--seed", type=int, default=81072, - help="Run seed (only needed if eval dataset is shuffled)", + help="Fix run seed for reproducibility", ) args_run_settings.add_argument( "--output_path", @@ -340,13 +340,12 @@ def get_args(parser: argparse.ArgumentParser) -> argparse.Namespace: args = parser.parse_args() # Add convenient arguments to parser - args.is_encoder = "bert" in args.architecture.lower() # TODO: improve this check args.is_quantized = args.quantization is not None args.is_aiu_backend = "aiu" in args.device_type args.dynamo_backend = "sendnn" if args.is_aiu_backend else "inductor" args.fused_weights = not args.unfuse_weights - if args.verbose > 0: + if args.verbose: dprint("=" * 60) dprint(args) dprint("=" * 60) diff --git a/aiu_fms_testing_utils/utils/quantization_setup.py b/aiu_fms_testing_utils/utils/quantization_setup.py index d602853..49297fa 100644 --- a/aiu_fms_testing_utils/utils/quantization_setup.py +++ b/aiu_fms_testing_utils/utils/quantization_setup.py @@ -6,7 +6,7 @@ import os # Third Party -from transformers import PreTrainedModel +from torch import nn # Local Packages from aiu_fms_testing_utils.utils.aiu_setup import dprint, rank @@ -126,10 +126,10 @@ def select_int8_module( return linear_config -def print_model_params(model: PreTrainedModel, args: argparse.Namespace) -> None: +def print_model_params(model: nn.Module, args: argparse.Namespace) -> None: """Printout model and list of model parameters with related statistics.""" - if rank == 0 and args.verbose > 0: + if rank == 0 and args.verbose: dprint("="*60 + "\n") dprint("\n".join( f"{k:80} {str(list(v.size())):15} {str(v.dtype):18} {str(v.device):10} " @@ -138,10 +138,10 @@ def print_model_params(model: PreTrainedModel, args: argparse.Namespace) -> None )) dprint("="*60 + "\n") if args.architecture == "llama": - # TODO: unused keys behavior in FMS may change to return ERRORS dprint( "[NOTE] In Llama models, it's OK for bias and rotary embeddings to be " - "marked as unused keys." + "marked as unused keys because of different architectural choices between " + "FMS and HF models (but model output is preserved)." 
) dprint(model) dprint("="*60 + "\n") From fd70377a699f97a5ab187adcb47d61278877cbfe Mon Sep 17 00:00:00 2001 From: Andrea Fasoli Date: Wed, 18 Jun 2025 21:42:13 -0400 Subject: [PATCH 18/30] Relocate print_model_params function Signed-off-by: Andrea Fasoli --- aiu_fms_testing_utils/utils/model_setup.py | 21 +++++++++++++++++++ .../utils/quantization_setup.py | 21 ------------------- scripts/run_decoder.py | 3 +-- scripts/run_encoder.py | 3 +-- 4 files changed, 23 insertions(+), 25 deletions(-) diff --git a/aiu_fms_testing_utils/utils/model_setup.py b/aiu_fms_testing_utils/utils/model_setup.py index 64cffb2..4ffa9b2 100644 --- a/aiu_fms_testing_utils/utils/model_setup.py +++ b/aiu_fms_testing_utils/utils/model_setup.py @@ -112,3 +112,24 @@ def setup_model(args: argparse.Namespace) -> tuple[str | None, torch.device, str dist_strat = get_distributed_strategy(args) return default_dtype, device, dist_strat + + +def print_model_params(model: nn.Module, args: argparse.Namespace) -> None: + """Printout model and list of model parameters with related statistics.""" + + if rank == 0 and args.verbose: + dprint("="*60 + "\n") + dprint("\n".join( + f"{k:80} {str(list(v.size())):15} {str(v.dtype):18} {str(v.device):10} " + f"{v.min().item():12.4f} {v.max().item():12.4f}" + for k,v in model.state_dict().items() + )) + dprint("="*60 + "\n") + if args.architecture == "llama": + dprint( + "[NOTE] In Llama models, it's OK for bias and rotary embeddings to be " + "marked as unused keys because of different architectural choices between " + "FMS and HF models (but model output is preserved)." + ) + dprint(model) + dprint("="*60 + "\n") diff --git a/aiu_fms_testing_utils/utils/quantization_setup.py b/aiu_fms_testing_utils/utils/quantization_setup.py index 49297fa..264d5f2 100644 --- a/aiu_fms_testing_utils/utils/quantization_setup.py +++ b/aiu_fms_testing_utils/utils/quantization_setup.py @@ -124,24 +124,3 @@ def select_int8_module( else: linear_config = {"linear_type": "torch_linear"} return linear_config - - -def print_model_params(model: nn.Module, args: argparse.Namespace) -> None: - """Printout model and list of model parameters with related statistics.""" - - if rank == 0 and args.verbose: - dprint("="*60 + "\n") - dprint("\n".join( - f"{k:80} {str(list(v.size())):15} {str(v.dtype):18} {str(v.device):10} " - f"{v.min().item():12.4f} {v.max().item():12.4f}" - for k,v in model.state_dict().items() - )) - dprint("="*60 + "\n") - if args.architecture == "llama": - dprint( - "[NOTE] In Llama models, it's OK for bias and rotary embeddings to be " - "marked as unused keys because of different architectural choices between " - "FMS and HF models (but model output is preserved)." 
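# Illustrative sketch of the simplified verbosity checks in the preceding patches:
# assuming the -v flag is declared with action="count" and default=0 (as in the
# shared args parser), args.verbose is 0 when the flag is absent and a positive int
# otherwise, so a plain truthiness test is equivalent to the old `args.verbose > 0`.
import argparse

p = argparse.ArgumentParser()
p.add_argument("-v", "--verbose", action="count", default=0)
assert not p.parse_args([]).verbose        # flag absent -> 0 -> falsy
assert p.parse_args(["-vv"]).verbose == 2  # occurrences are counted -> truthy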
- ) - dprint(model) - dprint("="*60 + "\n") diff --git a/scripts/run_decoder.py b/scripts/run_decoder.py index b3dfb7b..4550c39 100644 --- a/scripts/run_decoder.py +++ b/scripts/run_decoder.py @@ -11,11 +11,10 @@ from aiu_fms_testing_utils.utils.aiu_setup import dprint, rank from aiu_fms_testing_utils.utils.args_parsing import get_args from aiu_fms_testing_utils.utils.decoders_utils import run_decoder_eval -from aiu_fms_testing_utils.utils.model_setup import setup_model +from aiu_fms_testing_utils.utils.model_setup import setup_model, print_model_params from aiu_fms_testing_utils.utils.quantization_setup import ( import_addons, get_linear_config, - print_model_params, ) diff --git a/scripts/run_encoder.py b/scripts/run_encoder.py index a11c94c..0f9507c 100644 --- a/scripts/run_encoder.py +++ b/scripts/run_encoder.py @@ -15,11 +15,10 @@ run_encoder_eval_qa, run_encoder_eval_mlm, ) -from aiu_fms_testing_utils.utils.model_setup import setup_model +from aiu_fms_testing_utils.utils.model_setup import setup_model, print_model_params from aiu_fms_testing_utils.utils.quantization_setup import ( import_addons, get_linear_config, - print_model_params, ) From 7a5c9dfec709897b6bd4eb9cb11b0845598e47df Mon Sep 17 00:00:00 2001 From: Andrea Fasoli Date: Thu, 19 Jun 2025 13:02:52 -0400 Subject: [PATCH 19/30] Gate transformers import Signed-off-by: Andrea Fasoli --- aiu_fms_testing_utils/utils/encoders_utils.py | 28 +++++++++++++------ 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/aiu_fms_testing_utils/utils/encoders_utils.py b/aiu_fms_testing_utils/utils/encoders_utils.py index 7c6b9e0..433a851 100644 --- a/aiu_fms_testing_utils/utils/encoders_utils.py +++ b/aiu_fms_testing_utils/utils/encoders_utils.py @@ -9,15 +9,10 @@ # Third Party from datasets import load_dataset from fms.models.hf import to_hf_api +from fms.utils import has_package from fms.utils.tokenizers import BaseTokenizer from torch import nn from torch.utils.data import DataLoader -from transformers import ( - default_data_collator, - DataCollatorWithPadding, - EvalPrediction, - pipeline, -) import evaluate import numpy as np import torch @@ -26,6 +21,17 @@ from aiu_fms_testing_utils.utils.aiu_setup import dprint, rank +# Optional imports (required for QA) +has_hf = has_package("transformers") +if has_hf: + from transformers import ( + default_data_collator, + DataCollatorWithPadding, + EvalPrediction, + pipeline, + ) + + def wrap_encoder(model): """Add config info and wrapper to run pipeline for RoBERTa MaskedLM.""" @@ -53,7 +59,6 @@ def __init__( self.validate_encoder_arguments() - def validate_encoder_arguments(self): """Ensure arguments compatibility with Encoder models.""" @@ -152,6 +157,12 @@ def convert_batch_to_fms_style(self, batch): def process_eval_set(self): """Pre-process evaluation dataset for QuestionAnswering task.""" + if not has_hf: + raise ImportError( + "QuestionAnswering Encoder requires transformer package but import " + "was unsuccessful." + ) + args = self.args if args.dataset_name is not None: # Downloading and loading a dataset from the hub @@ -403,8 +414,7 @@ def postprocess_qa_predictions( if len(predictions) == 0 or (len(predictions) == 1 and predictions[0]["text"] == ""): predictions.insert(0, {"text": "empty", "start_logit": 0.0, "end_logit": 0.0, "score": 0.0}) - # Compute the softmax of all scores (we do it with numpy to stay independent from torch/tf in this file, using - # the LogSumExp trick). 
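# The rewritten comment below drops the "LogSumExp trick" wording, but the code it
# describes still relies on the max-subtraction form of that trick: shifting the
# scores by their maximum before exponentiating keeps np.exp from overflowing on
# large logits while leaving the normalized probabilities unchanged. Minimal sketch:
import numpy as np

scores = np.array([1000.0, 999.0, 998.0])     # logits this large would overflow a plain np.exp
exp_scores = np.exp(scores - np.max(scores))  # exp(0), exp(-1), exp(-2)
probs = exp_scores / exp_scores.sum()         # approx. [0.665, 0.245, 0.090]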
+ # Compute the softmax of all scores scores = np.array([pred.pop("score") for pred in predictions]) exp_scores = np.exp(scores - np.max(scores)) probs = exp_scores / exp_scores.sum() From 3ad70501c10725a26d09006bf8c87d9ce94259bd Mon Sep 17 00:00:00 2001 From: Andrea Fasoli Date: Thu, 19 Jun 2025 13:30:30 -0400 Subject: [PATCH 20/30] Bring recent updates to inference.py into run_decoder.py Signed-off-by: Andrea Fasoli --- aiu_fms_testing_utils/utils/decoders_utils.py | 37 +++++++++---------- scripts/run_decoder.py | 3 +- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/aiu_fms_testing_utils/utils/decoders_utils.py b/aiu_fms_testing_utils/utils/decoders_utils.py index 0af0c6a..112a6dc 100644 --- a/aiu_fms_testing_utils/utils/decoders_utils.py +++ b/aiu_fms_testing_utils/utils/decoders_utils.py @@ -15,6 +15,7 @@ import torch # Local Packages +from aiu_fms_testing_utils.utils import warmup_model from aiu_fms_testing_utils.utils.aiu_setup import dprint, local_rank @@ -286,11 +287,12 @@ def infer(self, ids, warmup): elif args.timing == "per-token": if not warmup: dprint(f"First-token latency: {timings[0]*1000:.3f} ms") - dprint(f"Average next-token latency: {np.mean(timings[1:])*1000:.3f} ms") dprint(f"Average next-token latency (including first token): {np.mean(timings)*1000:.3f} ms") - dprint(f"Max next-token latency: {np.max(timings[1:])*1000:.3f} ms (token #{np.argmax(timings[1:]) + 2})") - dprint(f"Min next-token latency: {np.min(timings[1:])*1000:.3f} ms (token #{np.argmin(timings[1:]) + 2})") - dprint(f"Std deviation of next-token latencies: {np.std(timings[1:])*1000:.3f} ms") + if len(timings) > 1: + dprint(f"Average next-token latency: {np.mean(timings[1:])*1000:.3f} ms") + dprint(f"Max next-token latency: {np.max(timings[1:])*1000:.3f} ms (token #{np.argmax(timings[1:]) + 2})") + dprint(f"Min next-token latency: {np.min(timings[1:])*1000:.3f} ms (token #{np.argmin(timings[1:]) + 2})") + dprint(f"Std deviation of next-token latencies: {np.std(timings[1:])*1000:.3f} ms") timings = [f"{t*1000:.3f}" for t in timings] dprint(f"Per-token timing information: {', '.join(timings)} ms") if len(result.shape) == 1: @@ -305,26 +307,23 @@ def run_warmup(self, ids): dprint(f"Start compilation warmup...") pt_compile_model_start = time.time() - self.infer(ids, warmup=True) - dprint( - "PyTorch compile completed, " - f"took {time.time() - pt_compile_model_start:.2f} s." 
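# Illustration of the new len(timings) > 1 guard introduced above: when generation
# produces a single token (e.g. --max_new_tokens 1), the per-token timing list holds
# only the first-token entry, so the next-token statistics would otherwise be taken
# over an empty slice. The timing value below is hypothetical, for demonstration only.
import numpy as np

timings = [0.912]  # seconds; only the first generated token was timed
print(f"First-token latency: {timings[0] * 1000:.3f} ms")
if len(timings) > 1:
    print(f"Average next-token latency: {np.mean(timings[1:]) * 1000:.3f} ms")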
- ) - - if self.args.is_aiu_backend: - from torch_sendnn import torch_sendnn - - dprint("Executing update_lazyhandle and compiling for AIU") - update_lh_time = time.time() - torch_sendnn.update_lazyhandle() - update_lh_time = time.time() - update_lh_time - dprint(f"Update_lazyhandle completed, took {update_lh_time:.3f}s") - if self.args.device_type == "aiu": # only run warmup for AIU, not senulator + warmup_model( + self.model, + ids, + self.args.max_new_tokens, + self.args.compile_dynamic_sendnn, + **self.extra_generation_kwargs, + ) aiu_warmup_time = time.time() self.infer(ids, warmup=True) aiu_warmup_time = time.time() - aiu_warmup_time dprint(f"AIU warmup completed, took {aiu_warmup_time:.3f}s") + else: + for sample, cache in itertools.product(self.do_sample, self.use_cache): + self.infer(cache, sample, True) + pt_compile_model_time = time.time() - pt_compile_model_time + dprint(f"PT compile complete, took {pt_compile_model_time:.3f}s") def run_generation(self, ids): """Run inference generation (not a warmup).""" diff --git a/scripts/run_decoder.py b/scripts/run_decoder.py index 4550c39..cdcd93c 100644 --- a/scripts/run_decoder.py +++ b/scripts/run_decoder.py @@ -8,6 +8,7 @@ from torch import distributed, set_grad_enabled # Local Packages +from aiu_fms_testing_utils.utils import aiu_setup, warmup_model from aiu_fms_testing_utils.utils.aiu_setup import dprint, rank from aiu_fms_testing_utils.utils.args_parsing import get_args from aiu_fms_testing_utils.utils.decoders_utils import run_decoder_eval @@ -73,7 +74,7 @@ if args.compile: dprint("Compiling model...") if args.is_aiu_backend: - model.compile(backend="sendnn_decoder") + model.compile(backend="sendnn", options={'sendnn.dynamic': args.compile_dynamic_sendnn}) else: # compiling can make first inference pass slow model.compile(mode=args.compile_mode, backend=args.compile_backend) From 92d05efdb024c2b1ad8f25b232936bce0e5a9165 Mon Sep 17 00:00:00 2001 From: Andrea Fasoli Date: Thu, 19 Jun 2025 13:31:30 -0400 Subject: [PATCH 21/30] Add new sendnn compile arg Signed-off-by: Andrea Fasoli --- aiu_fms_testing_utils/utils/args_parsing.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/aiu_fms_testing_utils/utils/args_parsing.py b/aiu_fms_testing_utils/utils/args_parsing.py index 90aea3c..dabcca4 100644 --- a/aiu_fms_testing_utils/utils/args_parsing.py +++ b/aiu_fms_testing_utils/utils/args_parsing.py @@ -167,6 +167,11 @@ def get_args(parser: argparse.ArgumentParser) -> argparse.Namespace: action="store_true", help="Use dynamic shapes with torch.compile", ) + args_compile.add_argument( + "--compile_dynamic_sendnn", + action="store_true", + help="Use dynamic shapes with aiu compile", + ) # Arguments shared between Decoder and Encoder models args_dec_enc = parser.add_argument_group("Decoders or Encoders (shared args)") From 011ec3331258574a50a29ec8905a479d0fd97512 Mon Sep 17 00:00:00 2001 From: Andrea Fasoli Date: Thu, 19 Jun 2025 13:31:53 -0400 Subject: [PATCH 22/30] Remove unified inference.py Signed-off-by: Andrea Fasoli --- scripts/inference.py | 115 ------------------------------------------- 1 file changed, 115 deletions(-) delete mode 100644 scripts/inference.py diff --git a/scripts/inference.py b/scripts/inference.py deleted file mode 100644 index baddd29..0000000 --- a/scripts/inference.py +++ /dev/null @@ -1,115 +0,0 @@ -# Standard -import argparse -import time - -# Third Party -from fms.models import get_model -from fms.utils import tokenizers -from torch import distributed, set_grad_enabled - -# Local Packages -from 
aiu_fms_testing_utils.utils.aiu_setup import dprint, rank -from aiu_fms_testing_utils.utils.args_parsing import get_args -from aiu_fms_testing_utils.utils.decoders import run_decoder_eval -from aiu_fms_testing_utils.utils.encoders import ( - get_roberta_tokenizer, - wrap_encoder, - run_encoder_eval_qa, - run_encoder_eval_mlm, -) -from aiu_fms_testing_utils.utils.model_setup import setup_model -from aiu_fms_testing_utils.utils.quantization_setup import ( - import_addons, - get_linear_config, - print_model_params, -) - - -parser = argparse.ArgumentParser(description="Entry point for AIU inference") -args = get_args(parser) - -if args.is_quantized: - import_addons(args) - -if args.distributed: - distributed.init_process_group() # used by inference.py - # distributed.init_process_group(backend="gloo", rank=local_rank, world_size=world_size) - -# Main model setup -default_dtype, device, dist_strat = setup_model(args) - -model_path = args.model_path -if args.direct_quantization: - save_path = None - - # !!! insert DQ here (for RoBERTa first, then add for LLM) - - # if DQ is used, args.model_path represent FP16 ckpt but we need to load the - # newly-created INT8 ckpt. Without DQ, args.model_path is the INT8 ckpt already. - model_path = save_path - -# Retrieve linear configuration (quantized or not) to instantiate FMS model -linear_config = get_linear_config(args) - -if rank == 0: - dprint("="*60) - dprint(f"model_path={args.model_path}") - dprint(f"{linear_config=}") - dprint(f"fused_weights={args.fused_weights}") - dprint(f"data_type={default_dtype}") - dprint("="*60 + "\n") - -dprint("Loading model...") -loading_model_start = time.time() -model = get_model( - args.architecture, - args.variant, - model_path=model_path, - device_type="cpu" if args.is_aiu_backend else args.device_type, - data_type=default_dtype, - source=args.model_source, - distributed_strategy=dist_strat, - group=distributed.group.WORLD, - linear_config=linear_config, - fused_weights=args.fused_weights, -) - -if args.is_quantized: - print_model_params(model, args) - -if "roberta" in args.architecture: - tokenizer = get_roberta_tokenizer(args.tokenizer) -else: - tokenizer = tokenizers.get_tokenizer(args.tokenizer) - -model.eval() -set_grad_enabled(False) -if args.distributed: - distributed.barrier() -dprint(f"Loading model completed in {time.time() - loading_model_start:.2f} s.") - -if args.architecture == "roberta": - model = wrap_encoder(model) - -if args.compile: - dprint("Compiling model...") - if args.is_aiu_backend: - model.compile(backend="sendnn_decoder") - else: - # compiling can make first inference pass slow - model.compile(mode=args.compile_mode, backend=args.compile_backend) - dprint("Model compiled.") -else: - dprint("[WARNING] SKIP COMPILE.") - -if args.is_encoder: - if args.architecture == "roberta_question_answering": - run_encoder_eval_qa(model, tokenizer, args) - elif args.architecture == "roberta": # basic MaskedLM downstream task - run_encoder_eval_mlm(model, tokenizer, args) -else: - run_decoder_eval(model, tokenizer, args, device) - -if args.distributed: - distributed.barrier() - distributed.destroy_process_group() From 529212833813f25e59df9ef49e55e248140a1f3f Mon Sep 17 00:00:00 2001 From: Andrea Fasoli Date: Thu, 19 Jun 2025 13:50:24 -0400 Subject: [PATCH 23/30] Small fixes Signed-off-by: Andrea Fasoli --- aiu_fms_testing_utils/utils/decoders_utils.py | 6 +++--- aiu_fms_testing_utils/utils/model_setup.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git 
a/aiu_fms_testing_utils/utils/decoders_utils.py b/aiu_fms_testing_utils/utils/decoders_utils.py index 112a6dc..7fdcac0 100644 --- a/aiu_fms_testing_utils/utils/decoders_utils.py +++ b/aiu_fms_testing_utils/utils/decoders_utils.py @@ -315,14 +315,14 @@ def run_warmup(self, ids): self.args.compile_dynamic_sendnn, **self.extra_generation_kwargs, ) - aiu_warmup_time = time.time() + aiu_warmup_start = time.time() self.infer(ids, warmup=True) - aiu_warmup_time = time.time() - aiu_warmup_time + aiu_warmup_time = time.time() - aiu_warmup_start dprint(f"AIU warmup completed, took {aiu_warmup_time:.3f}s") else: for sample, cache in itertools.product(self.do_sample, self.use_cache): self.infer(cache, sample, True) - pt_compile_model_time = time.time() - pt_compile_model_time + pt_compile_model_time = time.time() - pt_compile_model_start dprint(f"PT compile complete, took {pt_compile_model_time:.3f}s") def run_generation(self, ids): diff --git a/aiu_fms_testing_utils/utils/model_setup.py b/aiu_fms_testing_utils/utils/model_setup.py index 4ffa9b2..3816da8 100644 --- a/aiu_fms_testing_utils/utils/model_setup.py +++ b/aiu_fms_testing_utils/utils/model_setup.py @@ -7,7 +7,7 @@ import numpy as np import random import torch -from torch import distributed +from torch import nn, distributed # Local from aiu_fms_testing_utils.utils.aiu_setup import dprint, rank, local_rank, world_size From 17be9a7f94810a4abb9a18a850103c4db1a9f345 Mon Sep 17 00:00:00 2001 From: Andrea Fasoli Date: Thu, 19 Jun 2025 20:34:30 -0400 Subject: [PATCH 24/30] Remove deprecated torch dynamo config option Signed-off-by: Andrea Fasoli --- aiu_fms_testing_utils/utils/aiu_setup.py | 1 - 1 file changed, 1 deletion(-) diff --git a/aiu_fms_testing_utils/utils/aiu_setup.py b/aiu_fms_testing_utils/utils/aiu_setup.py index 5d4fe94..d7ee71f 100644 --- a/aiu_fms_testing_utils/utils/aiu_setup.py +++ b/aiu_fms_testing_utils/utils/aiu_setup.py @@ -103,7 +103,6 @@ def set_aiu_env_vars(args: argparse.Namespace) -> None: ) torch._dynamo.config.assume_static_by_default = True - torch._dynamo.config.dynamic_shapes = False torch._dynamo.config.automatic_dynamic_shapes = False # os.environ.setdefault("DTCOMPILER_KEEP_EXPORT", "true") # CONFIRM IF THIS IS NEEDE From 6780770da31f7b3569b5f4f8f784f6607f66c640 Mon Sep 17 00:00:00 2001 From: Andrea Fasoli Date: Thu, 19 Jun 2025 20:35:45 -0400 Subject: [PATCH 25/30] Update type hints Signed-off-by: Andrea Fasoli --- aiu_fms_testing_utils/utils/decoders_utils.py | 3 +- aiu_fms_testing_utils/utils/encoders_utils.py | 84 +++++++++++++------ 2 files changed, 59 insertions(+), 28 deletions(-) diff --git a/aiu_fms_testing_utils/utils/decoders_utils.py b/aiu_fms_testing_utils/utils/decoders_utils.py index 7fdcac0..37d7638 100644 --- a/aiu_fms_testing_utils/utils/decoders_utils.py +++ b/aiu_fms_testing_utils/utils/decoders_utils.py @@ -260,7 +260,6 @@ def infer(self, ids, warmup): extra_generation_kwargs["only_last_token"] = True if args.device_type == "cpu": - # Bug in 2.3.1 fixed in 2.4.1 for SDPA flash cpu impl when pad too much extra_generation_kwargs["attn_algorithm"] = "math" if not args.no_early_termination and not warmup: @@ -338,7 +337,7 @@ def run_decoder_eval( tokenizer: BaseTokenizer, args: argparse.Namespace, device: torch.device, - ): + ) -> None: """Entry point to run evaluation of LLM decoder models.""" decoder_infer = DecoderInfer(model, tokenizer, args, device) diff --git a/aiu_fms_testing_utils/utils/encoders_utils.py b/aiu_fms_testing_utils/utils/encoders_utils.py index 433a851..4ebeda7 100644 --- 
a/aiu_fms_testing_utils/utils/encoders_utils.py +++ b/aiu_fms_testing_utils/utils/encoders_utils.py @@ -7,8 +7,9 @@ import time # Third Party -from datasets import load_dataset +from datasets import Dataset, load_dataset from fms.models.hf import to_hf_api +from fms.models.hf.modeling_hf_adapter import HFModelArchitecture from fms.utils import has_package from fms.utils.tokenizers import BaseTokenizer from torch import nn @@ -32,9 +33,15 @@ ) -def wrap_encoder(model): +def wrap_encoder(model: nn.Module) -> HFModelArchitecture: """Add config info and wrapper to run pipeline for RoBERTa MaskedLM.""" + if not has_hf: + raise ImportError( + "MaskedLM Encoder requires transformer package but import " + "was unsuccessful." + ) + model.config.linear_config.pop("linear_type", None) return to_hf_api(model, task_specific_params=None) @@ -47,9 +54,9 @@ def __init__( model: nn.Module, tokenizer: BaseTokenizer, args: argparse.Namespace, - ): + ) -> None: self.model = model - self.tokenizer = tokenizer + self.tokenizer = tokenizer.tokenizer # extract original HF tokenizer self.args = args self.question_column_name = "" @@ -59,7 +66,7 @@ def __init__( self.validate_encoder_arguments() - def validate_encoder_arguments(self): + def validate_encoder_arguments(self) -> None: """Ensure arguments compatibility with Encoder models.""" args = self.args @@ -85,10 +92,14 @@ def validate_encoder_arguments(self): ) - def prepare_validation_features(self, examples): + def prepare_validation_features( + self, + examples: dict[str, list[str | dict]], + ) -> dict[str, list]: """Validation preprocessing""" args = self.args + q_col_name = self.question_column_name c_col_name = self.context_column_name pad_on_right = self.pad_on_right @@ -109,7 +120,7 @@ def prepare_validation_features(self, examples): # using a stride. This results in one example possible giving several features # when a context is long, each of those features having a context that overlaps # a bit the context of the previous feature. 
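# Sketch of the tokenizer handling assumed in this class: get_tokenizer() returns an
# FMS BaseTokenizer, and for HF-backed tokenizers the original Hugging Face object is
# reachable through its .tokenizer attribute. EncoderQAInfer now stores that inner
# object so the HF collator and pipeline APIs receive the type they expect.
from fms.utils import tokenizers

fms_tokenizer = tokenizers.get_tokenizer("/path/to/roberta-base")  # hypothetical path
hf_tokenizer = fms_tokenizer.tokenizer  # underlying transformers tokenizer
print(hf_tokenizer.model_max_length)    # e.g. 512 for a roberta-base tokenizer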
- tokenized_examples = self.tokenizer.tokenize( + tokenized_examples = self.tokenizer( examples[q_col_name if pad_on_right else c_col_name], examples[c_col_name if pad_on_right else q_col_name], truncation="only_second" if pad_on_right else "only_first", @@ -149,12 +160,15 @@ def prepare_validation_features(self, examples): return tokenized_examples - def convert_batch_to_fms_style(self, batch): + def convert_batch_to_fms_style( + self, + batch: dict[str, torch.Tensor], + ) -> dict[str, torch.Tensor]: """FMS uses a different standard than HF for encoder inputs.""" return {'x': batch['input_ids'], 'mask': batch['attention_mask']} - def process_eval_set(self): + def process_eval_set(self) -> None: """Pre-process evaluation dataset for QuestionAnswering task.""" if not has_hf: @@ -192,7 +206,7 @@ def process_eval_set(self): # Padding side determines if we do (question|context) or (context|question) self.pad_on_right = self.tokenizer.padding_side == "right" - model_max_length = self.tokenizer.tokenizer.model_max_length # TODO: add model_max_length to FMS _HFTokenizer + model_max_length = self.tokenizer.model_max_length if args.max_prompt_length > model_max_length: dprint( f"max_prompt_length ({args.max_prompt_length}) is larger than the " @@ -259,8 +273,8 @@ def process_eval_set(self): def postprocess_qa_predictions( self, - examples, - features, + examples: Dataset, + features: Dataset, predictions: tuple[np.ndarray, np.ndarray], version_2_with_negative: bool = False, n_best_size: int = 20, @@ -268,7 +282,7 @@ def postprocess_qa_predictions( null_score_diff_threshold: float = 0.0, output_dir: str | None = None, prefix: str | None = None, - ): + ) -> None: """ Post-processes the predictions of a question-answering model to convert them to answers that are substrings of the original contexts. This is the base postprocessing functions for models that only return start and end logits. @@ -476,7 +490,13 @@ def postprocess_qa_predictions( return all_predictions - def post_processing_function(self, examples, features, predictions, stage="eval"): + def post_processing_function( + self, + examples: Dataset, + features: Dataset, + predictions: list[np.ndarray], + stage: str = "eval", + ) -> dict[list[str, str]]: """Post-processing: we match the start logits and end logits to answers in the original context.""" @@ -492,6 +512,7 @@ def post_processing_function(self, examples, features, predictions, stage="eval" output_dir=None, prefix=stage, ) + breakpoint() # Format the result to the format the metric expects. 
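# Minimal sketch of the FMS input convention handled by convert_batch_to_fms_style
# above: HF tokenizers and collators emit input_ids and attention_mask, while the
# FMS encoder forward pass expects the same tensors under x and mask. The ids below
# are hypothetical placeholders.
import torch

hf_batch = {
    "input_ids": torch.tensor([[0, 31414, 232, 2]]),
    "attention_mask": torch.tensor([[1, 1, 1, 1]]),
}
fms_batch = {"x": hf_batch["input_ids"], "mask": hf_batch["attention_mask"]}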
if args.version_2_with_negative: formatted_predictions = [ @@ -508,7 +529,12 @@ def post_processing_function(self, examples, features, predictions, stage="eval" ] return EvalPrediction(predictions=formatted_predictions, label_ids=references) - def create_and_fill_np_array(self, start_or_end_logits, dataset, max_len): + def create_and_fill_np_array( + self, + start_or_end_logits: list[np.ndarray], + dataset: Dataset, + max_len: int, + ) -> np.ndarray: """ Create and fill numpy array of size len_of_validation_data * max_length_of_output_tensor @@ -543,7 +569,7 @@ def create_and_fill_np_array(self, start_or_end_logits, dataset, max_len): return logits_concat - def run_warmup(self): + def run_warmup(self) -> None: """Run warmup cycle of compiled encoder model set for QuestionAnswering task.""" dprint(f"Starting warm-up...") @@ -559,7 +585,7 @@ def run_warmup(self): if rank == 0: dprint(f"Warmup completed in {time.time() - warmup_start_time:.1f} s\n---") - def run_evaluation(self): + def run_evaluation(self) -> None: """Run QuestionAnswering evaluation.""" args = self.args @@ -587,7 +613,7 @@ def run_evaluation(self): f"(tot = {len(eval_dataloader) * args.batch_size}, " f"bs = {args.batch_size})" ) - + breakpoint() # concatenate the numpy array max_len = max([x.shape[1] for x in all_start_logits]) start_logits_concat = self.create_and_fill_np_array( @@ -622,21 +648,27 @@ class EncoderMLMInfer(): def __init__( self, - model: nn.Module, + model: HFModelArchitecture, tokenizer: BaseTokenizer, args: argparse.Namespace, - ): + ) -> None: self.model = model self.tokenizer = tokenizer self.args = args - def process_eval_set(self): + def process_eval_set(self) -> None: """Barebone function that sets up a single example prompt (for now).""" + if not has_hf: + raise ImportError( + "MaskedLM Encoder requires transformer package but import " + "was unsuccessful." + ) + self.prompt = "the dog chased the cat while aggressively" - def run_evaluation(self, warmup=False): + def run_evaluation(self, warmup: bool = False) -> None: """Run evaluation cycle of compiled encoder model set for MaskedLM task. No output printout if warmup is True. """ @@ -658,10 +690,10 @@ def run_evaluation(self, warmup=False): def run_encoder_eval_qa( - model: nn.Module, + model: nn.Module, # FMS-style model tokenizer: BaseTokenizer, args: argparse.Namespace, -): +) -> None: """Entry point to run QuestionAnswering Evaluation of encoder model. 
Processing based on pytorch example: @@ -677,10 +709,10 @@ def run_encoder_eval_qa( def run_encoder_eval_mlm( - model: nn.Module, + model: HFModelArchitecture, # model wrapped by to_hf_api tokenizer: BaseTokenizer, args: argparse.Namespace, -): +) -> None: """Entry point to run evaluation of encoder models.""" encoder_mlm_infer = EncoderMLMInfer(model, tokenizer, args) From 0437c504b4d5ac71e91d0789671c54f0357a3333 Mon Sep 17 00:00:00 2001 From: Andrea Fasoli Date: Thu, 19 Jun 2025 20:36:22 -0400 Subject: [PATCH 26/30] Update skip compile message Signed-off-by: Andrea Fasoli --- scripts/run_decoder.py | 3 +-- scripts/run_encoder.py | 3 ++- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/run_decoder.py b/scripts/run_decoder.py index cdcd93c..48aa4a4 100644 --- a/scripts/run_decoder.py +++ b/scripts/run_decoder.py @@ -8,7 +8,6 @@ from torch import distributed, set_grad_enabled # Local Packages -from aiu_fms_testing_utils.utils import aiu_setup, warmup_model from aiu_fms_testing_utils.utils.aiu_setup import dprint, rank from aiu_fms_testing_utils.utils.args_parsing import get_args from aiu_fms_testing_utils.utils.decoders_utils import run_decoder_eval @@ -80,7 +79,7 @@ model.compile(mode=args.compile_mode, backend=args.compile_backend) dprint("Model compiled.") else: - dprint("[WARNING] SKIP COMPILE.") + dprint("Skip model compiling. Only for debug purpose.") run_decoder_eval(model, tokenizer, args, device) diff --git a/scripts/run_encoder.py b/scripts/run_encoder.py index 0f9507c..77ac104 100644 --- a/scripts/run_encoder.py +++ b/scripts/run_encoder.py @@ -42,6 +42,7 @@ save_path = None # !!! insert DQ for encoders here + # pass default_dtype to DQ function # if DQ is used, args.model_path represent FP16 ckpt but we need to load the # newly-created INT8 ckpt. Without DQ, args.model_path is the INT8 ckpt already. @@ -96,7 +97,7 @@ model.compile(mode=args.compile_mode, backend=args.compile_backend) dprint("Model compiled.") else: - dprint("[WARNING] SKIP COMPILE.") + dprint("Skip model compiling. 
Only for debug purpose.") if args.architecture == "roberta_question_answering": run_encoder_eval_qa(model, tokenizer, args) From dfd6758ba0760f86acb51e6ae844932a79cf5afb Mon Sep 17 00:00:00 2001 From: Andrea Fasoli Date: Thu, 19 Jun 2025 20:41:47 -0400 Subject: [PATCH 27/30] Adjust extra_generation_kwargs handling Signed-off-by: Andrea Fasoli --- aiu_fms_testing_utils/utils/decoders_utils.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/aiu_fms_testing_utils/utils/decoders_utils.py b/aiu_fms_testing_utils/utils/decoders_utils.py index 37d7638..668c117 100644 --- a/aiu_fms_testing_utils/utils/decoders_utils.py +++ b/aiu_fms_testing_utils/utils/decoders_utils.py @@ -192,7 +192,7 @@ def process_eval_set(self): ids = prompts if isinstance(ids, list) and len(ids) == 1: ids = ids[0].unsqueeze(0) - extra_generation_kwargs = None + extra_generation_kwargs = {} self.extra_generation_kwargs = extra_generation_kwargs @@ -252,15 +252,10 @@ def infer(self, ids, warmup): max_seq_len = self.model.config.max_expected_seq_len # Add only_last_token optimization - extra_generation_kwargs = ( - {} - if self.extra_generation_kwargs is None - else self.extra_generation_kwargs - ) - extra_generation_kwargs["only_last_token"] = True + self.extra_generation_kwargs["only_last_token"] = True if args.device_type == "cpu": - extra_generation_kwargs["attn_algorithm"] = "math" + self.extra_generation_kwargs["attn_algorithm"] = "math" if not args.no_early_termination and not warmup: eos_token_id = self.tokenizer.eos_token_id @@ -277,7 +272,7 @@ def infer(self, ids, warmup): timing=args.timing, eos_token_id=eos_token_id, contiguous_cache=True, - extra_kwargs=extra_generation_kwargs, + extra_kwargs=self.extra_generation_kwargs, ) if args.timing != "": result, timings = result From f7c458e1a3da1219e45f009bdc0d442cd1cf01d3 Mon Sep 17 00:00:00 2001 From: Andrea Fasoli Date: Thu, 19 Jun 2025 20:50:36 -0400 Subject: [PATCH 28/30] Remove INT8 DQ Signed-off-by: Andrea Fasoli --- .../utils/direct_quantization.py | 260 ------------------ scripts/run_encoder.py | 13 +- 2 files changed, 1 insertion(+), 272 deletions(-) delete mode 100644 aiu_fms_testing_utils/utils/direct_quantization.py diff --git a/aiu_fms_testing_utils/utils/direct_quantization.py b/aiu_fms_testing_utils/utils/direct_quantization.py deleted file mode 100644 index 4f7c898..0000000 --- a/aiu_fms_testing_utils/utils/direct_quantization.py +++ /dev/null @@ -1,260 +0,0 @@ -# Standard -from pathlib import Path -from tqdm import tqdm -import argparse -import os -import time - -# Third Party -from torch.utils.data import DataLoader # [R] -from transformers import ( # [R] - default_data_collator, - DataCollatorWithPadding, - EvalPrediction, - RobertaForQuestionAnswering, - RobertaForMaskedLM, - RobertaTokenizerFast, - pipeline, -) -import torch - -# Local Packages -from fms_mo import qconfig_init, qmodel_prep # [R] -from fms_mo.quant.ptq import dq_llm, get_act_scales # [R] -from fms_mo.utils.utils import prepare_input # [R] -from utils.roberta_int8_utils import ( # [R] change this - validate_arguments, - get_wikitext2, - use_default_qcfg, - process_state_dict, - mask_examples, - dequantize_int8_weights, -) - - -QUANTIZED_LAYERS_ROBERTA = [ - "attention.self.query", - "attention.self.key", - "attention.self.value", - "attention.output.dense", - "intermediate.dense", - "output.dense", -] - -# TODO: change print to dprint -# TODO: add LLM DQ -# TODO: load wikitext using FMS-MO instead of custom function - -def run_dq_roberta(args: 
argparse.Namespace): - """Run INT8 Direct Quantization for RoBERTa. - """ - - #------------- - # Instantiate HF RoBERTa FP16 - #------------- - print("* Begin Direct Quantization (DQ) process.") - torch.set_default_dtype(torch.float16) - tokenizer = RobertaTokenizerFast.from_pretrained(args.tokenizer) - fp16_model_path = args.fp16_ckpt_path if args.fp16_ckpt_path else 'roberta-base' - if args.architecture == "roberta": - model = RobertaForMaskedLM.from_pretrained( - fp16_model_path, - torch_dtype=torch.float16, - ) - elif args.architecture == "roberta_question_answering": - model = RobertaForQuestionAnswering.from_pretrained( - fp16_model_path, - torch_dtype=torch.float16, - ) - else: - raise NotImplementedError( - f"Variant {args.architecture} is not supported for Direct Quantization" - ) - model.to("cpu") - print("* FP16 model loaded to CPU.") - - train_dataset, test_dataset = get_wikitext2(tokenizer) - dq_dataloader = DataLoader( - train_dataset[:args.num_dq_samples], - shuffle=True, - collate_fn=default_data_collator, - batch_size=1, - ) - print(f"* Dataset for DQ loaded (samples = {len(dq_dataloader.dataset)}).") - - #------------- - # Set fms_mo configuration - #------------- - qcfg = qconfig_init(recipe=args.int8_qcfg_path, args=args) - - # preferred method is to update qconfig from recipe, providing --int8_qcfg_path - # but the following will set some defaults if config json is not passed - if not args.int8_qcfg_path: - print("* Using a default quantization configuration for missing parameters.") - qcfg = use_default_qcfg(qcfg) - qcfg["logger"] = print - qcfg["qw_mode"] = "maxperCh" if args.weight_per_channel else "max" - if args.activ_quant_type == "per_token": - qcfg["qa_mode"] = "pertokenmax" - elif args.activ_quant_type == "per_tensor_symm": - qcfg["qa_mode"] = "maxsym" - else: - qcfg["qa_mode"] = "max" - qcfg["a_init_method"] = "max" - qcfg["qw_mode_calib"] = "max" - qcfg["qa_mode_calib"] = "max" - - if args.verbose: - print("=" * 60) - print("QUANTIZATION CONFIGURATION") - print("\n".join(f"{k:60} {v}" for k,v in qcfg.items() if not isinstance(v, dict))) - - #------------- - # Prepare inputs as list and generate quantized model with fms_mo - # This is not an FMS model. fms_mo model can run Direct Quantization - #------------- - examples = None - examples_for_prep = None - if qcfg["qmodel_calibration"]: - if args.activ_quant_type == "per_tensor_asymm": - print("=" * 60) - print(f"qmodel_calibration = {qcfg['qmodel_calibration']}") - print(f"qmodel_calibration_new = {qcfg['qmodel_calibration_new']}") - raise NotImplementedError( - "Direct Quantization (DQ) using `qmodel_calibration` is not compatible " - "with INT8 asymmetric quantization of activations in fms-mo. " - "Please pass `qmodel_calibration_new` argument instead." 
- ) - examples = qcfg["qmodel_calibration"] - elif qcfg["qmodel_calibration_new"]: - examples = qcfg["qmodel_calibration_new"] - if examples: - examples_for_prep = [next(iter(dq_dataloader)) for _ in range(examples)] - - #------------- - # Prepare quantized model using fms_mo - #------------- - print("=" * 60) - print(f"* Begin preparation of quantized model.") - if qcfg["qmodel_calibration"]: - print("* Calibration to be applied during this preparation step.") - prep_time_start = time.time() - qmodel_prep( - model, - examples_for_prep, - qcfg, - dev="cpu", # always run Direct Quantization on CPU, not AIU - use_layer_name_pattern_matching=False, - save_fname='roberta-base-w8a8', - ) - if qcfg["qmodel_calibration"]: - print( - "* Quantized model has been instantiated and pre-calibrated " - f"(took {time.time() - prep_time_start:.1f} s)." - ) - else: - print( - "* Quantized model has been instantiated and needs calibration " - f"(took {time.time() - prep_time_start:.1f} s)." - ) - - #------------- - # Apply smoothquant - #------------- - if qcfg['smoothq']: - sq_time_start = time.time() - print("* Being applying SmoothQuant scales.") - assert qcfg['smoothq'] == True, "doing smoothq" - if not os.path.exists(qcfg['act_scale_path']): - print( - "generate new smoothq activation scales " - f"at {qcfg['act_scale_path']}" - ) - smoothq_alpha_requested = None - if qcfg["smoothq_alpha"] != 0: - smoothq_alpha_requested = qcfg["smoothq_alpha"] - qcfg["smoothq_alpha"] = 0 - print("[WARNNG] using smoothq_alpha = 0 for scale generation") - act_scales = get_act_scales(model, dq_dataloader, qcfg, device="cpu") - torch.save(act_scales, qcfg['act_scale_path']) - if smoothq_alpha_requested: - qcfg["smoothq_alpha"] = smoothq_alpha_requested - print(f"smoothq_alpha set back to {qcfg['smoothq_alpha']}") - else: - print( - f"using smoothq activation scales from {qcfg['act_scale_path']}" - ) - act_scales = torch.load(qcfg['act_scale_path'], map_location='cpu') - - dq_llm(model, act_scales, qcfg) - print(f"* SmoothQuant scales applied (took = {time.time() - sq_time_start:.1f} s).") - print("=="*20) - else: - print("* SmoothQuant is DISABLED.") - - #------------- - # Run calibration = Direct Quantization DQ - #------------- - if qcfg['qmodel_calibration_new'] > 0: - calib_time_start = time.time() - print("* Begin calibration of activation quantized parameters.") - pbar = tqdm( - dq_dataloader, - desc="* Calibration progress", - total = qcfg['qmodel_calibration_new'] - ) - for data_mb, _ in zip(pbar, range(qcfg['qmodel_calibration_new'])): - data_mb = prepare_input( - device=model.device, - data=data_mb, - ) - with torch.no_grad(): - model(**data_mb) - print(f"* Calibration completed (took = {time.time() - calib_time_start:.1f} s).") - - if args.verbose: - print("=" * 60) - print("* PARAMETERS") - print("\n".join( - f"{k:80} {str(list(v.size())):15} {v.dtype}" - for k,v in model.named_parameters() - )) - print("* BUFFERS") - print("\n".join( - f"{k:80} {str(list(v.size())):15} {v.dtype}" - for k,v in model.named_buffers() - )) - - #------------- - # Save checkpoint with integer weights (AIU requirement) - #------------- - keys_to_ignore = [ - "num_module_called", - "smoothq_act_scale", - "smoothq_alpha", - "calib_counter", - "obsrv_clipval", - "obsrv_clipvaln", - "obsrv_w_clipval", - ] - - print(f"Begin processing model state dictionary for saving.") - new_sd = process_state_dict( - model=model, - quantized_layers=QUANTIZED_LAYERS_ROBERTA, - keys_to_ignore=keys_to_ignore, - verbose=args.verbose, - ) - - task = "mlm" 
if args.architecture == "roberta" else "qa" - smoothq_str = qcfg['smoothq_alpha'] if qcfg['smoothq'] else "no" - save_path = str( - Path(args.output_path) / - f"roberta-base_{task}_w8-{qcfg['qw_mode']}_a8-{qcfg['qa_mode']}" - f"_bmm32_smoothq-{smoothq_str}_dq.pt" - ) - torch.save(new_sd, save_path) - print(f"Model saved to {save_path}") - - tokenizer.save_pretrained(args.output_path) - print(f"Tokenizer saved to {args.output_path}") \ No newline at end of file diff --git a/scripts/run_encoder.py b/scripts/run_encoder.py index 77ac104..8001468 100644 --- a/scripts/run_encoder.py +++ b/scripts/run_encoder.py @@ -37,17 +37,6 @@ # Main model setup default_dtype, device, dist_strat = setup_model(args) -model_path = args.model_path -if args.int8_direct_quantization: - save_path = None - - # !!! insert DQ for encoders here - # pass default_dtype to DQ function - - # if DQ is used, args.model_path represent FP16 ckpt but we need to load the - # newly-created INT8 ckpt. Without DQ, args.model_path is the INT8 ckpt already. - model_path = save_path - # Retrieve linear configuration (quantized or not) to instantiate FMS model linear_config = get_linear_config(args) @@ -64,7 +53,7 @@ model = get_model( args.architecture, args.variant, - model_path=model_path, + model_path=args.model_path, device_type="cpu" if args.is_aiu_backend else args.device_type, data_type=default_dtype, source=args.model_source, From 3434641806043d9e80098c20edf55260d9939d99 Mon Sep 17 00:00:00 2001 From: Andrea Fasoli Date: Fri, 20 Jun 2025 13:37:32 -0400 Subject: [PATCH 29/30] Update import of ids_for_prompt and fix some formatting Signed-off-by: Andrea Fasoli --- aiu_fms_testing_utils/utils/__init__.py | 28 ++++---- aiu_fms_testing_utils/utils/decoders_utils.py | 69 ++++++++++--------- 2 files changed, 49 insertions(+), 48 deletions(-) diff --git a/aiu_fms_testing_utils/utils/__init__.py b/aiu_fms_testing_utils/utils/__init__.py index 1bc76ac..9e79d8b 100644 --- a/aiu_fms_testing_utils/utils/__init__.py +++ b/aiu_fms_testing_utils/utils/__init__.py @@ -37,17 +37,17 @@ def __download_file(url, filename): try: response = requests.get(url, stream=True) response.raise_for_status() - + with open(filename, 'wb') as file: for chunk in response.iter_content(chunk_size=8192): file.write(chunk) print(f"Successfully downloaded {filename}") - + except requests.exceptions.RequestException as e: print(f"An error occurred: {e}") def __sample_requests( - prompt_list: List[str], + prompt_list: List[str], num_requests: int, tokenizer: BaseTokenizer, prompt_length_min: int = 32, @@ -67,16 +67,14 @@ def __sample_requests( # Tokenize the prompts and completions. prompt = prompt_list[i] prompt_token_ids = ids_for_prompt(prompt, tokenizer) - + prompt_len = len(prompt_token_ids) if prompt_len < prompt_length_min or prompt_len > prompt_length_max: # Prune too short or too long sequences. continue filtered_dataset.append((prompt, prompt_len)) - - return filtered_dataset - + return filtered_dataset def sample_sharegpt_requests( dataset_path: str, @@ -96,15 +94,15 @@ def sample_sharegpt_requests( # Filter out the conversations with less than 2 turns. 
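# Sketch of the shared ids_for_prompt helper used in __sample_requests above and
# imported into decoders_utils.py later in this patch: it takes the raw prompt text
# plus an FMS tokenizer and returns the token ids whose length drives the
# prompt_length_min/prompt_length_max filtering. The tokenizer path is hypothetical.
from fms.utils import tokenizers
from aiu_fms_testing_utils.utils import ids_for_prompt

tokenizer = tokenizers.get_tokenizer("/path/to/tokenizer")
prompt_ids = ids_for_prompt("Explain some popular greetings in Spanish.", tokenizer)
keep = 32 <= len(prompt_ids) <= 64  # default prompt_length_min / prompt_length_max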
     dataset = [data for data in dataset if len(data["conversations"]) >= 2]
     dataset = [data["conversations"][0]["value"] for data in dataset]
-
+
     return __sample_requests(dataset, num_requests, tokenizer, prompt_length_min, prompt_length_max, seed)
 
 
 def sample_squad_v2_qa_requests(
     dataset_path: str,
-    num_requests: int,
-    tokenizer: BaseTokenizer,
-    prompt_length_min: int = 32,
-    prompt_length_max: int = 64,
+    num_requests: int,
+    tokenizer: BaseTokenizer,
+    prompt_length_min: int = 32,
+    prompt_length_max: int = 64,
     seed: Optional[int] = None
 ) -> List[Tuple[str, int]]:
     from datasets import load_dataset
@@ -113,10 +111,10 @@ def sample_squad_v2_qa_requests(
         ds = load_dataset(dataset_path)['train']
     else:
         ds = load_dataset("rajpurkar/squad_v2", cache_dir=dataset_path)['train']
-
-
+
+
     ds = [f"{data['context']}\n{data['question']}" for data in ds]
 
     return __sample_requests(ds, num_requests, tokenizer, prompt_length_min, prompt_length_max, seed)
-
+
diff --git a/aiu_fms_testing_utils/utils/decoders_utils.py b/aiu_fms_testing_utils/utils/decoders_utils.py
index 668c117..fe6316d 100644
--- a/aiu_fms_testing_utils/utils/decoders_utils.py
+++ b/aiu_fms_testing_utils/utils/decoders_utils.py
@@ -15,7 +15,7 @@
 import torch
 
 # Local Packages
-from aiu_fms_testing_utils.utils import warmup_model
+from aiu_fms_testing_utils.utils import ids_for_prompt, warmup_model
 from aiu_fms_testing_utils.utils.aiu_setup import dprint, local_rank
 
 
@@ -34,12 +34,10 @@ def __init__(
         self.args = args
         self.device = device
-        self.add_special_tokens = False
         self.has_padding = True
         self.max_len = 0
         self.extra_generation_kwargs = {}
 
-        # !!! Inference arguments (hardcoded, as in the original script)
         self.do_sample = [False]
         self.use_cache = [args.no_use_cache]  # True/False identical with greedy iff `torch.use_deterministic_algorithms(True)`
 
@@ -58,16 +56,6 @@ def validate_decoder_arguments(self):
                 f"Architecture {args.architecture} should be run as an encoder model."
             )
 
-    def ids_for_prompt(self, prompt):
-        """Process textual prompt and return tokenized ids."""
-
-        tokens = self.tokenizer.tokenize(prompt)
-        ids = self.tokenizer.convert_tokens_to_ids(tokens)
-        if self.add_special_tokens:
-            ids = [self.tokenizer.bos_token_id] + ids
-        ids = torch.tensor(ids, dtype=torch.long, device=self.device)
-        return ids
-
     def truncate_prompts_to_max_length(self, prompts, max_len, max_allowed_length):
         """Truncate a series of prompts to a selected max length.
         This function ensures prompt truncation prior to padding the input ids."""
 
@@ -83,10 +71,6 @@ def process_eval_set(self):
         """
         args = self.args
 
-        self.add_special_tokens = (
-            self.tokenizer.bos_token_id != self.tokenizer.eos_token_id
-        )
-
         if args.prompt_path != "":
             # Before creating the Path object, check if prompt_path has a glob pattern
             if isinstance(args.prompt_path, str):
@@ -114,50 +98,69 @@ def process_eval_set(self):
                 prompt_file_paths = [prompt_path]
 
             # Check if we found some files
-            assert len(prompt_file_paths) > 0, f"Can't find any prompt files at {prompt_path}"
+            assert len(prompt_file_paths) > 0, (
+                f"Can't find any prompt files at {prompt_path}"
+            )
 
             # Check if we have enough files
-            assert (
-                len(prompt_file_paths) >= args.batch_size
-            ), f"Not enough prompt files at {prompt_path} for a batch size of {args.batch_size}"
+            assert len(prompt_file_paths) >= args.batch_size, (
+                f"Not enough prompt files at {prompt_path} "
+                f"for a batch size of {args.batch_size}"
+            )
 
             prompts = []
             for i, prompt_file_path in enumerate(prompt_file_paths):
                 if i == args.batch_size:
                     break
-                prompts.append(self.ids_for_prompt(prompt_file_path.read_text(encoding="utf-8")))
+                prompts.append(
+                    ids_for_prompt(
+                        prompt_file_path.read_text(encoding="utf-8"),
+                        self.tokenizer,
+                    )
+                )
         else:
             if args.prompt_type == "chat":
-                template = "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n{}\n\n### Response:"
-
+                template = (
+                    "Below is an instruction that describes a task. Write a response "
+                    "that appropriately completes the request.\n\n### Instruction:"
+                    "\n{}\n\n### Response:"
+                )
                 prompt1 = template.format(
                     "Provide a list of instructions for preparing chicken soup."
                 )
                 prompt2 = template.format("Explain some popular greetings in Spanish.")
                 prompt3 = template.format("Explain to me why ignorance is bliss.")
                 prompt4 = template.format(
-                    "I have just come into a very large sum of money. Provide me a list of things that I can do with my new found wealth."
+                    "I have just come into a very large sum of money. Provide me a "
+                    "list of things that I can do with my new found wealth."
                 )
             elif args.prompt_type == "code":
-                template = "[INST] Write code to solve the following coding problem that obeys the constraints and passes the example test cases. Please wrap your code answer using ```:\n{}\n[/INST]"
+                template = (
+                    "[INST] Write code to solve the following coding problem that "
+                    "obeys the constraints and passes the example test cases. "
+                    "Please wrap your code answer using ```:\n{}\n[/INST]"
+                )
                 prompt1 = template.format("Write a bubble sort function in python.")
                 prompt2 = template.format(
-                    "Using the Java streams API, write a simple function which will get the cumulative sum of a list of integers."
+                    "Using the Java streams API, write a simple function which will "
+                    "get the cumulative sum of a list of integers."
                 )
                 prompt3 = template.format(
-                    "In bash, how do I list all directories and sub-directories which contain a .py file."
+                    "In bash, how do I list all directories and sub-directories which "
+                    "contain a .py file."
                 )
                 prompt4 = template.format(
-                    "Write a simple decorator in python which will modify all string inputs to ints if possible."
+                    "Write a simple decorator in python which will modify all string "
+                    "inputs to ints if possible."
                 )
             else:
                 dprint("prompt_type must be one of chat or code")
                 exit()
 
-            prompt1 = self.ids_for_prompt(prompt1)
-            prompt2 = self.ids_for_prompt(prompt2)
-            prompt3 = self.ids_for_prompt(prompt3)
-            prompt4 = self.ids_for_prompt(prompt4)
+            prompt1 = ids_for_prompt(prompt1, self.tokenizer)
+            prompt2 = ids_for_prompt(prompt2, self.tokenizer)
+            prompt3 = ids_for_prompt(prompt3, self.tokenizer)
+            prompt4 = ids_for_prompt(prompt4, self.tokenizer)
             prompts = [prompt1, prompt2, prompt3, prompt4]
             prompts = prompts * ((args.batch_size // 4) + 1)
             prompts = prompts[: args.batch_size]

From a543198f73c2815ed2f74e9270848deff3c77c86 Mon Sep 17 00:00:00 2001
From: Andrea Fasoli
Date: Fri, 20 Jun 2025 19:21:47 -0400
Subject: [PATCH 30/30] Minor changes

Signed-off-by: Andrea Fasoli
---
 aiu_fms_testing_utils/utils/encoders_utils.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/aiu_fms_testing_utils/utils/encoders_utils.py b/aiu_fms_testing_utils/utils/encoders_utils.py
index 4ebeda7..52c700d 100644
--- a/aiu_fms_testing_utils/utils/encoders_utils.py
+++ b/aiu_fms_testing_utils/utils/encoders_utils.py
@@ -72,7 +72,8 @@ def validate_encoder_arguments(self) -> None:
         args = self.args
         if not getattr(args, "is_encoder", False):
             raise ValueError(
-                "Running encoder model but is_encoder argument is either not set or False."
+                "Running encoder model but is_encoder argument is not set to True. "
+                "Verify your launch script."
             )
         if args.min_pad_length != 0:
             raise ValueError(
@@ -84,7 +85,7 @@ def validate_encoder_arguments(self) -> None:
                 "Argument fixed_prompt_length should not be provided to encoders. "
                 "To pad the input sequence, use --pad_to_max_length flag instead."
             )
-        if args.max_new_tokens != 100:
+        if args.max_new_tokens != 100:  # default value for decoder models
             raise ValueError(
                 "Argument max_new_token should not be provided to encoders. "
                 "To define the max length of a generated answer in QuestionAnswering "
@@ -512,7 +513,7 @@ def post_processing_function(
             output_dir=None,
             prefix=stage,
         )
-        breakpoint()
+
         # Format the result to the format the metric expects.
         if args.version_2_with_negative:
             formatted_predictions = [
@@ -613,7 +614,7 @@ def run_evaluation(self) -> None:
             f"(tot = {len(eval_dataloader) * args.batch_size}, "
             f"bs = {args.batch_size})"
         )
-        breakpoint()
+
         # concatenate the numpy array
         max_len = max([x.shape[1] for x in all_start_logits])
         start_logits_concat = self.create_and_fill_np_array(