From 50208479c654f70be32ec57a7f01edf85b29be51 Mon Sep 17 00:00:00 2001 From: kcirred <16872435+kcirred@users.noreply.github.com> Date: Tue, 24 Jun 2025 17:29:25 +0000 Subject: [PATCH] added lint.yml and ran ruff format Signed-off-by: kcirred <16872435+kcirred@users.noreply.github.com> --- .github/workflows/lint.yml | 15 ++ aiu_fms_testing_utils/testing/validation.py | 165 ++++++++++++++------ aiu_fms_testing_utils/utils/__init__.py | 85 ++++++---- aiu_fms_testing_utils/utils/aiu_setup.py | 7 +- aiu_fms_testing_utils/utils/paged.py | 24 ++- scripts/generate_metrics.py | 140 ++++++++++++----- scripts/inference.py | 102 ++++++++---- scripts/roberta.py | 71 +++++---- scripts/small-toy.py | 98 +++++++----- scripts/validation.py | 79 ++++++---- tests/models/conftest.py | 3 +- tests/models/test_decoders.py | 72 +++++++-- tests/models/test_encoders.py | 102 ++++++++---- tests/models/test_model_expectations.py | 14 +- tests/models/test_scripts.py | 99 ++++++++---- tests/resources/get_thresholds.py | 11 +- tests/testing/test_validation.py | 24 ++- tests/utils/test_paged.py | 6 +- 18 files changed, 774 insertions(+), 343 deletions(-) create mode 100644 .github/workflows/lint.yml diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml new file mode 100644 index 0000000..6a3f785 --- /dev/null +++ b/.github/workflows/lint.yml @@ -0,0 +1,15 @@ +name: Lint + +on: [pull_request] + +jobs: + lint: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: astral-sh/ruff-action@v3 + with: + src: "." + version: "~= 0.9.5" + - run: ruff check + - run: ruff format --check diff --git a/aiu_fms_testing_utils/testing/validation.py b/aiu_fms_testing_utils/testing/validation.py index a66a5fe..82caf59 100644 --- a/aiu_fms_testing_utils/testing/validation.py +++ b/aiu_fms_testing_utils/testing/validation.py @@ -6,44 +6,77 @@ from aiu_fms_testing_utils.utils.aiu_setup import dprint import os -class LogitsExtractorHook(Callable[[int, torch.Tensor, torch.Tensor, MutableMapping[str, Any]], Tuple[torch.Tensor, MutableMapping[str, Any]],]): +class LogitsExtractorHook( + Callable[ + [int, torch.Tensor, torch.Tensor, MutableMapping[str, Any]], + Tuple[torch.Tensor, MutableMapping[str, Any]], + ] +): def __init__(self): super().__init__() self.extracted_logits: Optional[torch.Tensor] = None - def __call__(self, token_position: torch.Tensor, logits: torch.Tensor, next_val: torch.Tensor, kwargs): + def __call__( + self, + token_position: torch.Tensor, + logits: torch.Tensor, + next_val: torch.Tensor, + kwargs, + ): if self.extracted_logits is None: self.extracted_logits = logits.unsqueeze(1) else: - self.extracted_logits = torch.cat((self.extracted_logits, logits.unsqueeze(1)), dim=1) + self.extracted_logits = torch.cat( + (self.extracted_logits, logits.unsqueeze(1)), dim=1 + ) return next_val, kwargs -class StaticTokenInjectorHook(Callable[[int, torch.Tensor, torch.Tensor, MutableMapping[str, Any]], Tuple[torch.Tensor, MutableMapping[str, Any]],]): - def __init__(self, static_tokens: List[torch.Tensor], device_type: str="cpu"): +class StaticTokenInjectorHook( + Callable[ + [int, torch.Tensor, torch.Tensor, MutableMapping[str, Any]], + Tuple[torch.Tensor, MutableMapping[str, Any]], + ] +): + def __init__(self, static_tokens: List[torch.Tensor], device_type: str = "cpu"): super().__init__() - self.static_tokens = torch.tensor(static_tokens, device=device_type).t() # transposing so batch tokens per token_position + self.static_tokens = torch.tensor( + static_tokens, device=device_type + ).t() # transposing so batch tokens per token_position - def __call__(self, token_position: int, logits: torch.Tensor, next_val: torch.Tensor, kwargs): + def __call__( + self, token_position: int, logits: torch.Tensor, next_val: torch.Tensor, kwargs + ): next_val.copy_(self.static_tokens[token_position].unsqueeze(1)) return next_val, kwargs -class GoldenTokenHook(Callable[[int, torch.Tensor, torch.Tensor, MutableMapping[str, Any]], Tuple[torch.Tensor, MutableMapping[str, Any]],]): - def __init__(self, static_tokens: torch.Tensor, device_type: str="cpu"): +class GoldenTokenHook( + Callable[ + [int, torch.Tensor, torch.Tensor, MutableMapping[str, Any]], + Tuple[torch.Tensor, MutableMapping[str, Any]], + ] +): + def __init__(self, static_tokens: torch.Tensor, device_type: str = "cpu"): super().__init__() self.logits_extractor = LogitsExtractorHook() self.extracted_logits = None - self.token_injector = StaticTokenInjectorHook(static_tokens, device_type=device_type) + self.token_injector = StaticTokenInjectorHook( + static_tokens, device_type=device_type + ) - def __call__(self, token_position: int, logits: torch.Tensor, next_val: torch.Tensor, kwargs): - next_val, kwargs = self.logits_extractor(token_position, logits, next_val, kwargs) + def __call__( + self, token_position: int, logits: torch.Tensor, next_val: torch.Tensor, kwargs + ): + next_val, kwargs = self.logits_extractor( + token_position, logits, next_val, kwargs + ) self.extracted_logits = self.logits_extractor.extracted_logits return self.token_injector(token_position, logits, next_val, kwargs) -class ValidationInfo: +class ValidationInfo: def __init__(self, validation_info_list): super().__init__() @@ -54,7 +87,10 @@ def __iter__(self): yield vi def get_info(self, info_name): - return [[t.unsqueeze(0) for t in sentence[info_name]] for sentence in self._validation_info_list] + return [ + [t.unsqueeze(0) for t in sentence[info_name]] + for sentence in self._validation_info_list + ] def save(self, save_dir_path: str): """Save the validation information into a directory. @@ -86,12 +122,17 @@ def save(self, save_dir_path: str): def __len__(self): return len(self._validation_info_list) - -def get_default_validation_prefix(model_id: str, max_new_tokens: int, batch_size: int, seq_length: int, dtype: str): + + +def get_default_validation_prefix( + model_id: str, max_new_tokens: int, batch_size: int, seq_length: int, dtype: str +): return f"{model_id.replace('/', '--')}_max-new-tokens-{max_new_tokens}_batch-size-{batch_size}_seq-length-{seq_length}_dtype-{dtype}" -def load_validation_information(validation_path, validation_files_type, batch_size, tokenizer=None): +def load_validation_information( + validation_path, validation_files_type, batch_size, tokenizer=None +): """Load the validation information from a directory The files will be assumed to be in the following structure: @@ -107,7 +148,7 @@ def load_validation_information(validation_path, validation_files_type, batch_si if containing only tokens - torch.tensor if containing tokens and logits - dict[tokens -> torch.tensor, logits -> torch.tensor] if containing text - str - + :param validation_path: path to validation info files :param validation_files_type: validation file type to load, one of text, tokens, or logits :param batch_size: the number of prompts to load @@ -115,9 +156,7 @@ def load_validation_information(validation_path, validation_files_type, batch_si :return: a new validation info """ if isinstance(validation_path, str): - validation_files_path, sep, glob_pattern = validation_path.partition( - "*" - ) + validation_files_path, sep, glob_pattern = validation_path.partition("*") else: sep = "" glob_pattern = "" @@ -146,14 +185,14 @@ def load_validation_information(validation_path, validation_files_type, batch_si validation_files_paths = [validation_files_path] # Check if we found some files - assert ( - len(validation_files_paths) > 0 - ), f"Can't find any validation files at {validation_files_path}" + assert len(validation_files_paths) > 0, ( + f"Can't find any validation files at {validation_files_path}" + ) # Check if we have enough files - assert ( - len(validation_files_paths) >= batch_size - ), f"Not enough validation files at {validation_files_path} for a batch size of {batch_size}" + assert len(validation_files_paths) >= batch_size, ( + f"Not enough validation files at {validation_files_path} for a batch size of {batch_size}" + ) validation_info = [] for i, validation_file_path in enumerate(validation_files_paths): @@ -161,7 +200,9 @@ def load_validation_information(validation_path, validation_files_type, batch_si break if validation_files_type == "text": if tokenizer is None: - raise ValueError("must provide a tokenizer when validation_files_type=text") + raise ValueError( + "must provide a tokenizer when validation_files_type=text" + ) # Text format will get tokenized validation_info.append( { @@ -187,7 +228,19 @@ def load_validation_information(validation_path, validation_files_type, batch_si return ValidationInfo(validation_info) -def extract_validation_information(model, input_ids, max_new_tokens, post_iteration_hook, attn_algorithm=None, eos_token_id = None, only_last_token=False, timing="", attn_type="sdpa", **padding_kwargs): + +def extract_validation_information( + model, + input_ids, + max_new_tokens, + post_iteration_hook, + attn_algorithm=None, + eos_token_id=None, + only_last_token=False, + timing="", + attn_type="sdpa", + **padding_kwargs, +): max_seq_len = model.config.max_expected_seq_len attention_specific_kwargs = {} if attn_type == "paged": @@ -195,6 +248,7 @@ def extract_validation_information(model, input_ids, max_new_tokens, post_iterat else: # TODO: Add a unified generation dependent on attn_type from fms.utils.generation import generate + attention_specific_kwargs["contiguous_cache"] = True attention_specific_kwargs["max_seq_len"] = max_seq_len @@ -215,7 +269,7 @@ def extract_validation_information(model, input_ids, max_new_tokens, post_iterat eos_token_id=eos_token_id, timing=timing, extra_kwargs=extra_generation_kwargs, - **attention_specific_kwargs + **attention_specific_kwargs, ) if timing != "": @@ -226,7 +280,7 @@ def extract_validation_information(model, input_ids, max_new_tokens, post_iterat if timing == "e2e": dprint(f"E2E timing information: {timings[0]:.3f}s") elif timing == "per-token": - timings = [f"{t*1000:.3f}" for t in timings] + timings = [f"{t * 1000:.3f}" for t in timings] dprint(f"Per-token timing information: {', '.join(timings)} ms") if len(result.shape) == 1: @@ -235,26 +289,32 @@ def extract_validation_information(model, input_ids, max_new_tokens, post_iterat if hasattr(post_iteration_hook, "extracted_logits"): validation_info = [ {"tokens": t.to("cpu"), "logits": l.to("cpu")} - for t, l in zip(torch.unbind(result), torch.unbind(post_iteration_hook.extracted_logits)) + for t, l in zip( + torch.unbind(result), torch.unbind(post_iteration_hook.extracted_logits) + ) ] else: validation_info = [{"tokens": t.to("cpu")} for t in torch.unbind(result)] return ValidationInfo(validation_info) + def validate_level_0(aiu_tokens_per_sentence, validation_tokens_per_sentence): failed_cases = [] for sentence_idx, (aiu_sentence, validation_sentence) in enumerate( - zip(aiu_tokens_per_sentence, validation_tokens_per_sentence) + zip(aiu_tokens_per_sentence, validation_tokens_per_sentence) ): for token_idx, (aiu_token, validation_token) in enumerate( - zip(aiu_sentence, validation_sentence) + zip(aiu_sentence, validation_sentence) ): if aiu_token != validation_token: failed_cases.append((sentence_idx, token_idx)) return failed_cases -def top_k_loss_calculator(top_k: int, loss_f: Callable[[torch.Tensor, torch.Tensor], float]): + +def top_k_loss_calculator( + top_k: int, loss_f: Callable[[torch.Tensor, torch.Tensor], float] +): """ Function which will take the top_k logits indexes / values from a reference validation info and retrieve the same indexes from the test validation info logits and perform a loss function over the 2 tensors @@ -262,32 +322,38 @@ def top_k_loss_calculator(top_k: int, loss_f: Callable[[torch.Tensor, torch.Tens :param top_k: number of values to take from reference :param loss_f: a loss function between the reference and test logits """ + def loss_func(reference_logits, test_logits): reference_logits_prob = reference_logits.to(dtype=torch.float32) test_logits_prob = test_logits.to(dtype=torch.float32) - reference_values, reference_indices = torch.topk(reference_logits_prob, top_k, dim=1) + reference_values, reference_indices = torch.topk( + reference_logits_prob, top_k, dim=1 + ) test_values = test_logits_prob[:, reference_indices.squeeze(0)] return loss_f(reference_values, test_values) + return loss_func -def capture_level_1_metrics(reference_logits_per_sentence, test_logits_per_sentence, metrics_calculator=None): +def capture_level_1_metrics( + reference_logits_per_sentence, test_logits_per_sentence, metrics_calculator=None +): loss_metrics = [] for sentence_idx, (reference_sentence, test_sentence) in enumerate( - zip(reference_logits_per_sentence, test_logits_per_sentence) + zip(reference_logits_per_sentence, test_logits_per_sentence) ): for token_idx, (reference_logits, test_logits) in enumerate( - zip(reference_sentence, test_sentence) + zip(reference_sentence, test_sentence) ): # computing cross entropy loss per token if metrics_calculator is None: loss_fn = torch.nn.CrossEntropyLoss() metrics_value = loss_fn( reference_logits.to(dtype=torch.float32), - test_logits.softmax(dim=1).to(dtype=torch.float32) + test_logits.softmax(dim=1).to(dtype=torch.float32), ) else: metrics_value = metrics_calculator(reference_logits, test_logits) @@ -295,15 +361,16 @@ def capture_level_1_metrics(reference_logits_per_sentence, test_logits_per_sente loss_metrics.append((sentence_idx, token_idx, metrics_value)) return loss_metrics - + + def filter_failed_level_1_cases(level_1_loss_metrics, fail_f, print_failed=False): failed_cases = [] - for (sentence_idx, token_idx, metrics_value) in level_1_loss_metrics: + for sentence_idx, token_idx, metrics_value in level_1_loss_metrics: if fail_f(metrics_value): failed_cases.append((sentence_idx, token_idx, metrics_value)) if print_failed: dprint( - f"In sentence {sentence_idx+1}, the metric for token {token_idx} is {metrics_value}" + f"In sentence {sentence_idx + 1}, the metric for token {token_idx} is {metrics_value}" ) return failed_cases @@ -313,6 +380,12 @@ def print_failed_cases(failed_cases, aiu_tokens, validation_tokens, tokenizer): aiu_token = aiu_tokens[sentence_index][token_index] validation_token = validation_tokens[sentence_index][token_index] - aiu_str = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(aiu_token)) - validation_str = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(validation_token)) - print(f"In sentence {sentence_index+1}/{len(aiu_tokens)}, token {token_index}, AIU outputs {aiu_token} instead of {validation_token} -- AIU val={aiu_str} -- CPU val={validation_str}") \ No newline at end of file + aiu_str = tokenizer.convert_tokens_to_string( + tokenizer.convert_ids_to_tokens(aiu_token) + ) + validation_str = tokenizer.convert_tokens_to_string( + tokenizer.convert_ids_to_tokens(validation_token) + ) + print( + f"In sentence {sentence_index + 1}/{len(aiu_tokens)}, token {token_index}, AIU outputs {aiu_token} instead of {validation_token} -- AIU val={aiu_str} -- CPU val={validation_str}" + ) diff --git a/aiu_fms_testing_utils/utils/__init__.py b/aiu_fms_testing_utils/utils/__init__.py index 99cac86..87f2259 100644 --- a/aiu_fms_testing_utils/utils/__init__.py +++ b/aiu_fms_testing_utils/utils/__init__.py @@ -9,16 +9,26 @@ import json import random -def warmup_model(model: nn.Module, input_ids: torch.Tensor, max_new_tokens: int, compile_dynamic_sendnn = False, attn_type="sdpa", **padding_kwargs): + +def warmup_model( + model: nn.Module, + input_ids: torch.Tensor, + max_new_tokens: int, + compile_dynamic_sendnn=False, + attn_type="sdpa", + **padding_kwargs, +): import torch_sendnn + attention_specific_kwargs = {} if attn_type == "paged": from aiu_fms_testing_utils.utils.paged import generate, adjust_inputs_to_batch else: # TODO: Add a unified generation dependent on attn_type from fms.utils.generation import generate + attention_specific_kwargs["contiguous_cache"] = True - + dprint("AIU warmup") pt_compile_model_time = time.time() @@ -30,15 +40,26 @@ def warmup_model(model: nn.Module, input_ids: torch.Tensor, max_new_tokens: int, _max_new_tokens = 2 # always warmup with batch size 2 when using attn_type=paged if attn_type == "paged": - _warmup_input_ids, _padding_kwargs = adjust_inputs_to_batch(input_ids, **padding_kwargs) + _warmup_input_ids, _padding_kwargs = adjust_inputs_to_batch( + input_ids, **padding_kwargs + ) extra_kwargs = {**_padding_kwargs, "only_last_token": attn_type != "paged"} with torch_sendnn.warmup_mode(): - generate(model, _warmup_input_ids, max_new_tokens=_max_new_tokens, use_cache=True, do_sample=False, extra_kwargs=extra_kwargs, **attention_specific_kwargs) + generate( + model, + _warmup_input_ids, + max_new_tokens=_max_new_tokens, + use_cache=True, + do_sample=False, + extra_kwargs=extra_kwargs, + **attention_specific_kwargs, + ) pt_compile_model_time = time.time() - pt_compile_model_time dprint(f"PT compile complete, took {pt_compile_model_time:.3f}s") + def ids_for_prompt(prompt, tokenizer): tokens = tokenizer.tokenize(prompt) ids = tokenizer.convert_tokens_to_ids(tokens) @@ -47,26 +68,28 @@ def ids_for_prompt(prompt, tokenizer): ids = torch.tensor(ids, dtype=torch.long, device="cpu") return ids + def __download_file(url, filename): try: response = requests.get(url, stream=True) response.raise_for_status() - - with open(filename, 'wb') as file: + + with open(filename, "wb") as file: for chunk in response.iter_content(chunk_size=8192): file.write(chunk) print(f"Successfully downloaded {filename}") - + except requests.exceptions.RequestException as e: print(f"An error occurred: {e}") + def __sample_requests( - prompt_list: List[str], + prompt_list: List[str], num_requests: int, tokenizer: BaseTokenizer, prompt_length_min: int = 32, prompt_length_max: int = 64, - seed: Optional[int] = None + seed: Optional[int] = None, ): # Shuffle the dataset. if seed is not None: @@ -81,15 +104,14 @@ def __sample_requests( # Tokenize the prompts and completions. prompt = prompt_list[i] prompt_token_ids = ids_for_prompt(prompt, tokenizer) - + prompt_len = len(prompt_token_ids) if prompt_len < prompt_length_min or prompt_len > prompt_length_max: # Prune too short or too long sequences. continue filtered_dataset.append((prompt, prompt_len)) - + return filtered_dataset - def sample_sharegpt_requests( @@ -98,39 +120,44 @@ def sample_sharegpt_requests( tokenizer: BaseTokenizer, prompt_length_min: int = 32, prompt_length_max: int = 64, - seed: Optional[int] = None + seed: Optional[int] = None, ) -> List[Tuple[str, int]]: if not os.path.exists(dataset_path): print("downloading share-gpt dataset as it does not exist") - __download_file("https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json", dataset_path) + __download_file( + "https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json", + dataset_path, + ) # Load the dataset. - with open(dataset_path, encoding='utf-8') as f: + with open(dataset_path, encoding="utf-8") as f: dataset = json.load(f) # Filter out the conversations with less than 2 turns. dataset = [data for data in dataset if len(data["conversations"]) >= 2] dataset = [data["conversations"][0]["value"] for data in dataset] - - return __sample_requests(dataset, num_requests, tokenizer, prompt_length_min, prompt_length_max, seed) + + return __sample_requests( + dataset, num_requests, tokenizer, prompt_length_min, prompt_length_max, seed + ) + def sample_squad_v2_qa_requests( dataset_path: str, - num_requests: int, - tokenizer: BaseTokenizer, - prompt_length_min: int = 32, - prompt_length_max: int = 64, - seed: Optional[int] = None + num_requests: int, + tokenizer: BaseTokenizer, + prompt_length_min: int = 32, + prompt_length_max: int = 64, + seed: Optional[int] = None, ) -> List[Tuple[str, int]]: from datasets import load_dataset if os.path.exists(dataset_path): - ds = load_dataset(dataset_path)['train'] + ds = load_dataset(dataset_path)["train"] else: - ds = load_dataset("rajpurkar/squad_v2", cache_dir=dataset_path)['train'] - - - ds = [f"{data['context']}\n{data['question']}" for data in ds] + ds = load_dataset("rajpurkar/squad_v2", cache_dir=dataset_path)["train"] - return __sample_requests(ds, num_requests, tokenizer, prompt_length_min, prompt_length_max, seed) - + ds = [f"{data['context']}\n{data['question']}" for data in ds] + return __sample_requests( + ds, num_requests, tokenizer, prompt_length_min, prompt_length_max, seed + ) diff --git a/aiu_fms_testing_utils/utils/aiu_setup.py b/aiu_fms_testing_utils/utils/aiu_setup.py index fb9a3df..bbd8848 100644 --- a/aiu_fms_testing_utils/utils/aiu_setup.py +++ b/aiu_fms_testing_utils/utils/aiu_setup.py @@ -3,21 +3,24 @@ # ============================================================== # Common utilities # ============================================================== -#------------- +# ------------- # Discover the world size and my rank (envars set by torchrun) # https://pytorch.org/docs/stable/elastic/run.html#environment-variables -#------------- +# ------------- local_rank = int(os.getenv("LOCAL_RANK", 0)) rank = int(os.getenv("RANK", 0)) world_rank = rank world_size = int(os.getenv("WORLD_SIZE", 1)) + def dprint_str(text): return f"[{rank:2d}/{world_size:2d}]: {text}" + def dprint(text): print(dprint_str(text)) + # ============================================================== # Common setup # ============================================================== diff --git a/aiu_fms_testing_utils/utils/paged.py b/aiu_fms_testing_utils/utils/paged.py index 239d229..7228677 100644 --- a/aiu_fms_testing_utils/utils/paged.py +++ b/aiu_fms_testing_utils/utils/paged.py @@ -5,11 +5,12 @@ import torch import fms.utils.spyre.paged + def adjust_inputs_to_batch(input_ids: torch.Tensor, **padding_kwargs): """ - Adjusts the inputs to a batch. Batch size 1 cannot be handled since we want a symbolic shape for the batch + Adjusts the inputs to a batch. Batch size 1 cannot be handled since we want a symbolic shape for the batch and pytorch automatically sets size 1 dimensions as static - + Note: This is fixed in pytorch 2.7 """ input_ids = input_ids[0].repeat(2, 1) @@ -23,6 +24,7 @@ def adjust_inputs_to_batch(input_ids: torch.Tensor, **padding_kwargs): kwargs["position_ids"] = position_ids[0].repeat(2, 1) return input_ids, kwargs + # FIXME: We should use default generate, but that will require a larger re-work of generate def generate( model: Union[Callable, torch.nn.Module], @@ -88,7 +90,7 @@ def generate( if isinstance(input_ids, torch.Tensor): if len(input_ids.shape) == 1: input_ids = input_ids.unsqueeze(0) - + is_batch = input_ids.shape[0] > 1 # our model requires batch dimension if not is_batch: @@ -106,8 +108,18 @@ def generate( result = input_ids next_input = input_ids BLOCK_SIZE = 64 - _MAX_BATCH = int(os.environ.setdefault("VLLM_DT_MAX_BATCH_SIZE", str(input_ids.size(0)))) - _MAX_CONTEXT_LENGTH = int(os.environ.setdefault("VLLM_DT_MAX_CONTEXT_LEN", str((((input_ids.size(1) + max_new_tokens - 1) // BLOCK_SIZE) + 1) * BLOCK_SIZE))) + _MAX_BATCH = int( + os.environ.setdefault("VLLM_DT_MAX_BATCH_SIZE", str(input_ids.size(0))) + ) + _MAX_CONTEXT_LENGTH = int( + os.environ.setdefault( + "VLLM_DT_MAX_CONTEXT_LEN", + str( + (((input_ids.size(1) + max_new_tokens - 1) // BLOCK_SIZE) + 1) + * BLOCK_SIZE + ), + ) + ) NUM_BLOCKS = (_MAX_BATCH * _MAX_CONTEXT_LENGTH) // BLOCK_SIZE max_seq_len = input_ids.size(1) + max_new_tokens if hasattr(model, "head"): @@ -332,4 +344,4 @@ def generate( if timing != "": return result, times - return result \ No newline at end of file + return result diff --git a/scripts/generate_metrics.py b/scripts/generate_metrics.py index f50ec59..9aae44d 100644 --- a/scripts/generate_metrics.py +++ b/scripts/generate_metrics.py @@ -7,8 +7,17 @@ import torch from torch import distributed as dist -from aiu_fms_testing_utils.testing.validation import capture_level_1_metrics, extract_validation_information, LogitsExtractorHook, get_default_validation_prefix, load_validation_information, print_failed_cases, \ - validate_level_0, GoldenTokenHook, top_k_loss_calculator +from aiu_fms_testing_utils.testing.validation import ( + capture_level_1_metrics, + extract_validation_information, + LogitsExtractorHook, + get_default_validation_prefix, + load_validation_information, + print_failed_cases, + validate_level_0, + GoldenTokenHook, + top_k_loss_calculator, +) from aiu_fms_testing_utils.utils import ids_for_prompt, sample_sharegpt_requests from fms.models import get_model from fms.utils import tokenizers @@ -83,19 +92,19 @@ "--topk_per_token", type=int, help="top k values per token to generate loss on", - default=20 + default=20, ) parser.add_argument( "--num_test_tokens_per_sequence", type=int, help="number of tokens in test. For instance, if max_new_tokens=128 and num_test_tokens_per_sequence=256, this means we will generate data over 2 sample prompts. If not set, will be set to max_new_tokens", - default=None + default=None, ) parser.add_argument( "--extra_get_model_kwargs", - nargs='*', + nargs="*", default={}, - help="Use this to override model configuration values to get model. Example: --extra_get_model_kwargs nlayers=2,..." + help="Use this to override model configuration values to get model. Example: --extra_get_model_kwargs nlayers=2,...", ) parser.add_argument( "--distributed", @@ -105,7 +114,7 @@ parser.add_argument( "--skip_computation", action="store_true", - help="Set this if the output is already assumed to be computed and would like to regenerate metrics without model loading or computation" + help="Set this if the output is already assumed to be computed and would like to regenerate metrics without model loading or computation", ) local_rank = int(os.getenv("LOCAL_RANK", 0)) world_size = int(os.getenv("WORLD_SIZE", 1)) @@ -120,14 +129,20 @@ extra_get_model_kwargs = {} for a in args.extra_get_model_kwargs: - a_split = a.split("=") - try: + a_split = a.split("=") + try: extra_get_model_kwargs[a_split[0]] = ast.literal_eval(a_split[1]) - except ValueError: + except ValueError: extra_get_model_kwargs[a_split[0]] = a_split[1] # this follows the same pattern of naming in test_shapes. This way we can save and re-use for quicker shape testing. -prefix = get_default_validation_prefix(args.variant, args.max_new_tokens, args.batch_size, args.min_pad_length, args.default_dtype) +prefix = get_default_validation_prefix( + args.variant, + args.max_new_tokens, + args.batch_size, + args.min_pad_length, + args.default_dtype, +) if os.path.exists(os.path.join(args.output_dir, f"{prefix}.prob_mean.csv")): print("skipping metric generation as it has already been done") exit(0) @@ -148,11 +163,12 @@ torch.set_grad_enabled(False) + def find_eos_index(reference_tokens, eos_token_id): result = [] for sentence in reference_tokens: found_eos = False - for token_idx, token in enumerate(sentence[args.min_pad_length:]): + for token_idx, token in enumerate(sentence[args.min_pad_length :]): if token.item() == eos_token_id: found_eos = True result.append(token_idx) @@ -161,13 +177,20 @@ def find_eos_index(reference_tokens, eos_token_id): result.append(args.max_new_tokens) return result + def filter_before_eos(l, filter_indexes): from itertools import groupby - filtered_results = [list(g)[:filter_indexes[k]] for k, g in groupby(l, key=lambda x: x[0])] + + filtered_results = [ + list(g)[: filter_indexes[k]] for k, g in groupby(l, key=lambda x: x[0]) + ] return [item for sublist in filtered_results for item in sublist] + def __prepare_inputs(batch_size, seq_length, tokenizer, seed=0): - prompts_and_sizes = sample_sharegpt_requests(args.sharegpt_path, batch_size, tokenizer, seq_length // 2, seq_length, seed) + prompts_and_sizes = sample_sharegpt_requests( + args.sharegpt_path, batch_size, tokenizer, seq_length // 2, seq_length, seed + ) prompt_list = [] for prompt, _ in prompts_and_sizes: prompt_list.append(ids_for_prompt(prompt, tokenizer)) @@ -175,13 +198,15 @@ def __prepare_inputs(batch_size, seq_length, tokenizer, seed=0): input_ids, padding_kwargs = pad_input_ids(prompt_list, min_pad_length=seq_length) return input_ids, padding_kwargs + def write_csv(l, path, metric): - with open(path, 'w') as f: - f.write(f'{metric}\n') + with open(path, "w") as f: + f.write(f"{metric}\n") for t in l: - f.write(f"{t[2].item()}\n") + f.write(f"{t[2].item()}\n") f.close() + # prepare the cuda model if not args.skip_computation: cuda_model = get_model( @@ -212,7 +237,9 @@ def write_csv(l, path, metric): cpu_model.eval() print("loaded cpu model") - ids, padding_kwargs = __prepare_inputs(args.batch_size, args.min_pad_length, tokenizer) + ids, padding_kwargs = __prepare_inputs( + args.batch_size, args.min_pad_length, tokenizer + ) # first test validation level 0 cpu_validation_info = extract_validation_information( @@ -221,7 +248,7 @@ def write_csv(l, path, metric): args.max_new_tokens, LogitsExtractorHook(), attn_algorithm="math", - **padding_kwargs + **padding_kwargs, ) cpu_static_tokens = cpu_validation_info.get_info("tokens") print("extracted cpu validation information") @@ -236,24 +263,41 @@ def write_csv(l, path, metric): args.max_new_tokens, None, only_last_token=True, - **{k: v.to("cuda") for k,v in padding_kwargs.items()} + **{k: v.to("cuda") for k, v in padding_kwargs.items()}, ) cuda_static_tokens = cuda_validation_info.get_info("tokens") failed_responses = validate_level_0(cpu_static_tokens, cuda_static_tokens) print("extracted cuda validation information level 0") if local_rank == 0: - if len(failed_responses) != 0: - print_failed_cases(failed_responses, cpu_static_tokens, cuda_static_tokens, tokenizer) + if len(failed_responses) != 0: + print_failed_cases( + failed_responses, cpu_static_tokens, cuda_static_tokens, tokenizer + ) num_test_tokens_per_sequence = args.num_test_tokens_per_sequence if num_test_tokens_per_sequence is None: num_test_tokens_per_sequence = args.max_new_tokens -cross_entropy = lambda r, t: torch.nn.CrossEntropyLoss()(r, t.softmax(dim=1).to(dtype=torch.float32)) -prob_mean = lambda r, t: torch.mean((r.softmax(dim=1).to(dtype=torch.float32) / t.softmax(dim=1).to(dtype=torch.float32)) - 1.0) -prob_std = lambda r, t: torch.std(r.softmax(dim=1).to(dtype=torch.float32) / t.softmax(dim=1).to(dtype=torch.float32)) -diff_mean = lambda r, t: torch.mean(torch.abs(r.softmax(dim=1).to(dtype=torch.float32) - t.softmax(dim=1).to(dtype=torch.float32))) +cross_entropy = lambda r, t: torch.nn.CrossEntropyLoss()( + r, t.softmax(dim=1).to(dtype=torch.float32) +) +prob_mean = lambda r, t: torch.mean( + ( + r.softmax(dim=1).to(dtype=torch.float32) + / t.softmax(dim=1).to(dtype=torch.float32) + ) + - 1.0 +) +prob_std = lambda r, t: torch.std( + r.softmax(dim=1).to(dtype=torch.float32) / t.softmax(dim=1).to(dtype=torch.float32) +) +diff_mean = lambda r, t: torch.mean( + torch.abs( + r.softmax(dim=1).to(dtype=torch.float32) + - t.softmax(dim=1).to(dtype=torch.float32) + ) +) prob_mean_metrics = [] prob_std_metrics = [] @@ -265,10 +309,16 @@ def write_csv(l, path, metric): cuda_path = os.path.join(args.output_dir, f"{prefix}.cuda_validation_info.{i}.out") if os.path.exists(cpu_path) and os.path.exists(cuda_path): print(f"found the logits at {cpu_path}, reusing") - cpu_validation_info = load_validation_information(cpu_path, "logits", args.batch_size, tokenizer) - cuda_validation_info = load_validation_information(cuda_path, "logits", args.batch_size, tokenizer) + cpu_validation_info = load_validation_information( + cpu_path, "logits", args.batch_size, tokenizer + ) + cuda_validation_info = load_validation_information( + cuda_path, "logits", args.batch_size, tokenizer + ) elif not args.skip_computation: - ids, padding_kwargs = __prepare_inputs(args.batch_size, args.min_pad_length, tokenizer, i) + ids, padding_kwargs = __prepare_inputs( + args.batch_size, args.min_pad_length, tokenizer, i + ) # only need to compute this once if we aren't generating more test data if num_test_tokens_per_sequence > args.max_new_tokens: @@ -278,7 +328,7 @@ def write_csv(l, path, metric): args.max_new_tokens, LogitsExtractorHook(), attn_algorithm="math", - **padding_kwargs + **padding_kwargs, ) # generate aiu validation info @@ -288,7 +338,7 @@ def write_csv(l, path, metric): args.max_new_tokens, GoldenTokenHook(cpu_validation_info.get_info("tokens"), "cuda"), only_last_token=True, - **{k: v.to("cuda") for k,v in padding_kwargs.items()} + **{k: v.to("cuda") for k, v in padding_kwargs.items()}, ) print("extracted cuda validation information level 1") @@ -296,8 +346,10 @@ def write_csv(l, path, metric): if local_rank == 0: cpu_validation_info.save(cpu_path) cuda_validation_info.save(cuda_path) - - eos_indexes = find_eos_index(cpu_validation_info.get_info("tokens"), tokenizer.eos_token_id) + + eos_indexes = find_eos_index( + cpu_validation_info.get_info("tokens"), tokenizer.eos_token_id + ) level_1_metrics = capture_level_1_metrics( cpu_validation_info.get_info("logits"), cuda_validation_info.get_info("logits"), @@ -327,7 +379,21 @@ def write_csv(l, path, metric): prob_diff_metrics.extend(filter_before_eos(level_1_metrics, eos_indexes)) if local_rank == 0: - write_csv(prob_mean_metrics, os.path.join(args.output_dir, f"{prefix}.prob_mean.csv"), "prob_mean") - write_csv(prob_std_metrics, os.path.join(args.output_dir, f"{prefix}.prob_std.csv"), "prob_std") - write_csv(prob_ce_loss_metrics, os.path.join(args.output_dir, f"{prefix}.ce.csv"), "ce") - write_csv(prob_diff_metrics, os.path.join(args.output_dir, f"{prefix}.diff_mean.csv"), "diff_mean") + write_csv( + prob_mean_metrics, + os.path.join(args.output_dir, f"{prefix}.prob_mean.csv"), + "prob_mean", + ) + write_csv( + prob_std_metrics, + os.path.join(args.output_dir, f"{prefix}.prob_std.csv"), + "prob_std", + ) + write_csv( + prob_ce_loss_metrics, os.path.join(args.output_dir, f"{prefix}.ce.csv"), "ce" + ) + write_csv( + prob_diff_metrics, + os.path.join(args.output_dir, f"{prefix}.diff_mean.csv"), + "diff_mean", + ) diff --git a/scripts/inference.py b/scripts/inference.py index f67754a..d77069d 100644 --- a/scripts/inference.py +++ b/scripts/inference.py @@ -36,7 +36,7 @@ type=str, choices=["cuda", "cpu", "aiu", "aiu-senulator"], default="cuda", - help="The device to run the model on" + help="The device to run the model on", ) parser.add_argument( "--architecture", @@ -213,10 +213,11 @@ help="Number of iterations of inference to perform. Used for variance performance capture.", ) parser.add_argument( - '-v', '--verbose', - action='count', + "-v", + "--verbose", + action="count", default=0, - help="Set verbosity level (pass flag as `-v`, `-vv`, `-vvv`)" + help="Set verbosity level (pass flag as `-v`, `-vv`, `-vvv`)", ) parser.add_argument( "--attention_type", @@ -236,12 +237,14 @@ if "aiu" in args.device_type: try: from fms_mo.aiu_addons.gptq import gptq_aiu_adapter, gptq_aiu_linear + print("Loaded `aiu_addons` functionalities") except: raise ImportError("Failed to import GPTQ addons from fms-mo.") elif args.quantization == "int8": try: from fms_mo.aiu_addons.i8i8 import i8i8_aiu_adapter, i8i8_aiu_linear + print("Loaded `aiu_addons` functionalities") except: raise ImportError("Failed to import INT8 addons from fms-mo.") @@ -356,7 +359,9 @@ fused_weights = not args.unfuse_weights if args.quantization == "gptq": if fused_weights and is_aiu_backend: - raise ValueError("GPTQ checkpoints on AIU must always run with --unfuse_weights") + raise ValueError( + "GPTQ checkpoints on AIU must always run with --unfuse_weights" + ) if default_dtype is not None: raise ValueError( "GPTQ default_dtype must be None to preserve the checkpoint data types." @@ -373,7 +378,7 @@ qconfig_path = args.model_path + "/quantize_config.json" if os.path.exists(qconfig_path): - with open(qconfig_path, 'r') as f: + with open(qconfig_path, "r") as f: dprint(f"loading quantization config from {qconfig_path}") qconfig = json.load(f) group_size = qconfig["group_size"] @@ -397,7 +402,9 @@ } elif args.quantization == "int8": if fused_weights and is_aiu_backend: - raise ValueError("INT8 checkpoints on AIU must always run with --unfuse_weights") + raise ValueError( + "INT8 checkpoints on AIU must always run with --unfuse_weights" + ) if default_dtype is not None: raise ValueError( "INT8 default_dtype must be None to preserve the checkpoint data types." @@ -425,17 +432,15 @@ def select_int8_module( elif any("roberta" in p.lower() for p in [args.model_path, args.architecture]): smoothquant_layers = ["query", "key", "value", "w1"] else: - raise NotImplementedError( - "INT8 architecture does not support smoothquant." - ) + raise NotImplementedError("INT8 architecture does not support smoothquant.") else: smoothquant_layers = [] linear_config = { "linear_type": partial( select_int8_module, - smoothquant = args.int8_smoothquant, - smoothquant_layers = smoothquant_layers, + smoothquant=args.int8_smoothquant, + smoothquant_layers=smoothquant_layers, ), "weight_per_channel": args.int8_weight_per_channel, "activ_quant_type": args.int8_activ_quant_type, @@ -443,12 +448,12 @@ def select_int8_module( else: linear_config = {"linear_type": "torch_linear"} -dprint("="*60) +dprint("=" * 60) dprint(f"model_path={args.model_path}") dprint(f"{linear_config=}") dprint(f"{fused_weights=}") dprint(f"data_type={default_dtype}") -dprint("="*60 + "\n") +dprint("=" * 60 + "\n") model = get_model( args.architecture, @@ -465,13 +470,27 @@ def select_int8_module( if args.quantization in ["gptq", "int8"]: if rank == 0 and args.verbose > 0: - dprint("PARAMS:\n" + "\n".join(f"{k:60} {str(v.dtype):15} {str(v.device):10} {list(v.size())}" for k,v in model.named_parameters())) - dprint("BUFFERS:\n" + "\n".join(f"{k:60} {str(v.dtype):15} {str(v.device):10} {list(v.size())}" for k,v in model.named_buffers())) - dprint("="*60 + "\n") + dprint( + "PARAMS:\n" + + "\n".join( + f"{k:60} {str(v.dtype):15} {str(v.device):10} {list(v.size())}" + for k, v in model.named_parameters() + ) + ) + dprint( + "BUFFERS:\n" + + "\n".join( + f"{k:60} {str(v.dtype):15} {str(v.device):10} {list(v.size())}" + for k, v in model.named_buffers() + ) + ) + dprint("=" * 60 + "\n") if args.architecture == "llama": - dprint("[NOTE] In Llama models, it's OK for bias and rotary embeddings to be marked as unused keys.") + dprint( + "[NOTE] In Llama models, it's OK for bias and rotary embeddings to be marked as unused keys." + ) dprint(model) - dprint("="*60 + "\n") + dprint("=" * 60 + "\n") tokenizer = tokenizers.get_tokenizer(args.tokenizer) model.eval() @@ -482,7 +501,9 @@ def select_int8_module( if args.compile: dprint("compiling model") if is_aiu_backend: - model.compile(backend="sendnn", options={'sendnn.dynamic': args.compile_dynamic_sendnn}) + model.compile( + backend="sendnn", options={"sendnn.dynamic": args.compile_dynamic_sendnn} + ) else: # compiling can make first inference pass slow model.compile(mode=args.compile_mode, backend=args.compile_backend) @@ -538,9 +559,9 @@ def truncate_prompts_to_max_length(prompts, max_len, max_allowed_length): assert len(prompt_file_paths) > 0, f"Can't find any prompt files at {prompt_path}" # Check if we have enough files - assert ( - len(prompt_file_paths) >= args.batch_size - ), f"Not enough prompt files at {prompt_path} for a batch size of {args.batch_size}" + assert len(prompt_file_paths) >= args.batch_size, ( + f"Not enough prompt files at {prompt_path} for a batch size of {args.batch_size}" + ) prompts = [] for i, prompt_file_path in enumerate(prompt_file_paths): @@ -672,7 +693,7 @@ def infer(use_cache, do_sample, warmup): timing=args.timing, eos_token_id=eos_token_id, extra_kwargs=extra_generation_kwargs, - **attention_specific_kwargs + **attention_specific_kwargs, ) if args.timing != "": result, timings = result @@ -680,14 +701,24 @@ def infer(use_cache, do_sample, warmup): dprint(f"E2E timing information: {timings[0]:.3f}s") elif args.timing == "per-token": if not warmup: - dprint(f"First-token latency: {timings[0]*1000:.3f} ms") - dprint(f"Average next-token latency (including first token): {np.mean(timings)*1000:.3f} ms") + dprint(f"First-token latency: {timings[0] * 1000:.3f} ms") + dprint( + f"Average next-token latency (including first token): {np.mean(timings) * 1000:.3f} ms" + ) if len(timings) > 1: - dprint(f"Average next-token latency: {np.mean(timings[1:])*1000:.3f} ms") - dprint(f"Max next-token latency: {np.max(timings[1:])*1000:.3f} ms (token #{np.argmax(timings[1:]) + 2})") - dprint(f"Min next-token latency: {np.min(timings[1:])*1000:.3f} ms (token #{np.argmin(timings[1:]) + 2})") - dprint(f"Std deviation of next-token latencies: {np.std(timings[1:])*1000:.3f} ms") - timings = [f"{t*1000:.3f}" for t in timings] + dprint( + f"Average next-token latency: {np.mean(timings[1:]) * 1000:.3f} ms" + ) + dprint( + f"Max next-token latency: {np.max(timings[1:]) * 1000:.3f} ms (token #{np.argmax(timings[1:]) + 2})" + ) + dprint( + f"Min next-token latency: {np.min(timings[1:]) * 1000:.3f} ms (token #{np.argmin(timings[1:]) + 2})" + ) + dprint( + f"Std deviation of next-token latencies: {np.std(timings[1:]) * 1000:.3f} ms" + ) + timings = [f"{t * 1000:.3f}" for t in timings] dprint(f"Per-token timing information: {', '.join(timings)} ms") if len(result.shape) == 1: result = result.unsqueeze(0) @@ -706,7 +737,14 @@ def infer(use_cache, do_sample, warmup): dprint(f"compilation warmup") pt_compile_model_time = time.time() if args.device_type == "aiu": # only run warmup for AIU, no need for senulator - warmup_model(model, ids, args.max_new_tokens, args.compile_dynamic_sendnn, attn_type=args.attention_type, **extra_generation_kwargs) + warmup_model( + model, + ids, + args.max_new_tokens, + args.compile_dynamic_sendnn, + attn_type=args.attention_type, + **extra_generation_kwargs, + ) aiu_warmup_time = time.time() for sample, cache in itertools.product(do_sample, use_cache): infer(cache, sample, True) diff --git a/scripts/roberta.py b/scripts/roberta.py index 124b09f..a7b9c18 100644 --- a/scripts/roberta.py +++ b/scripts/roberta.py @@ -36,55 +36,68 @@ # ============================================================== if __name__ == "__main__": # Number of batches to create - NUM_BATCHES=1 + NUM_BATCHES = 1 - #------------- + # ------------- # Command line argument parsing - #------------- - parser = argparse.ArgumentParser(description="PyTorch Small Toy Tensor Parallel Example") - parser.add_argument( "--backend", help="PyTorch Dynamo compiler backend", default='cpu', choices=['cpu', 'aiu']) + # ------------- + parser = argparse.ArgumentParser( + description="PyTorch Small Toy Tensor Parallel Example" + ) + parser.add_argument( + "--backend", + help="PyTorch Dynamo compiler backend", + default="cpu", + choices=["cpu", "aiu"], + ) pargs = parser.parse_args() - if pargs.backend == 'aiu': - dynamo_backend = 'sendnn' + if pargs.backend == "aiu": + dynamo_backend = "sendnn" else: - dynamo_backend = 'inductor' + dynamo_backend = "inductor" is_distributed = world_size > 1 if is_distributed: # Initialize the process group - torch.distributed.init_process_group(backend="gloo", rank=world_rank, world_size=world_size) + torch.distributed.init_process_group( + backend="gloo", rank=world_rank, world_size=world_size + ) # Looks like a string compare, but is actually comparing the components # https://github.com/pytorch/pytorch/blob/b5be4d8c053e22672719b9a33386b071daf9860d/torch/torch_version.py#L10-L16 - if torch.__version__ < '2.3.0': + if torch.__version__ < "2.3.0": # Fix until PyTorch 2.3 - torch._C._distributed_c10d._register_process_group("default", torch.distributed.group.WORLD) + torch._C._distributed_c10d._register_process_group( + "default", torch.distributed.group.WORLD + ) - #------------- + # ------------- # Setup AIU specific environment variables - #------------- + # ------------- if "sendnn" in dynamo_backend: aiu_setup.aiu_dist_setup(world_rank, world_size) - #------------- + # ------------- # Display some diagnostics - #------------- + # ------------- if 0 == world_rank: - dprint("-"*60) - dprint(f"Python Version : {sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro}") + dprint("-" * 60) + dprint( + f"Python Version : {sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro}" + ) dprint(f"PyTorch Version : {torch.__version__}") dprint(f"Dynamo Backend : {pargs.backend} -> {dynamo_backend}") - if pargs.backend == 'aiu': + if pargs.backend == "aiu": for peer_rank in range(world_size): - pcie_env_str="AIU_WORLD_RANK_"+str(peer_rank) + pcie_env_str = "AIU_WORLD_RANK_" + str(peer_rank) dprint(f"PCI Addr. for Rank {peer_rank} : {os.environ[pcie_env_str]}") - print("-"*60) + print("-" * 60) if is_distributed: torch.distributed.barrier() - #------------- + # ------------- # Create the model - #------------- + # ------------- if 0 == world_rank: dprint(f"Creating the model...") # model_name = "roberta-base" @@ -111,20 +124,20 @@ # variant=model_name # ) - #------------- + # ------------- # Compile the model - #------------- + # ------------- if 0 == world_rank: dprint(f"Compiling the model...") the_compiled_model = torch.compile(hf_model_fms, backend=dynamo_backend) - the_compiled_model.eval() # inference only mode + the_compiled_model.eval() # inference only mode torch.set_grad_enabled(False) - #------------- + # ------------- # Run the model # - First run the compiler will activate to create the artifacts # - Second run there is no compiler involved - #------------- + # ------------- if is_distributed: torch.distributed.barrier() @@ -150,9 +163,9 @@ if 0 == world_rank: dprint(f"Answer: ({the_output[0]['score']:6.5f}) {the_output[0]['sequence']}") - #------------- + # ------------- # Cleanup - #------------- + # ------------- if 0 == world_rank: dprint(f"Done") if is_distributed: diff --git a/scripts/small-toy.py b/scripts/small-toy.py index a6965e4..a7cf0d2 100644 --- a/scripts/small-toy.py +++ b/scripts/small-toy.py @@ -18,6 +18,7 @@ # Import AIU Libraries from torch_sendnn import torch_sendnn + # ============================================================== # Toy Encoder Model # ============================================================== @@ -33,21 +34,30 @@ def __init__(self): self._linear_nets = torch.nn.ModuleList() for n in range(self.LAYERS_N): torch.manual_seed(42) - block = FeedForwardBlock(self.INPUT_N, hidden_grow_factor=self.HIDDEN_FACTOR, activation_fn=torch.nn.ReLU(), p_dropout=0) + block = FeedForwardBlock( + self.INPUT_N, + hidden_grow_factor=self.HIDDEN_FACTOR, + activation_fn=torch.nn.ReLU(), + p_dropout=0, + ) self._linear_nets.append(block) self._linear_nets.append(torch.nn.ReLU()) def copy_weights(self, par_model, seq_model): self_parent_layer = self if par_model is None else par_model with torch.no_grad(): - for (seq_name, seq_layer), (self_name, self_layer) in zip(seq_model.named_children(), self_parent_layer.named_children()): + for (seq_name, seq_layer), (self_name, self_layer) in zip( + seq_model.named_children(), self_parent_layer.named_children() + ): if hasattr(self_layer, "load_weights"): - self_layer.load_weights( { - "w1.weight": seq_layer.w1.weight, - "w1.bias": seq_layer.w1.bias, - "w2.weight": seq_layer.w2.weight, - "w2.bias": seq_layer.w2.bias, - }) + self_layer.load_weights( + { + "w1.weight": seq_layer.w1.weight, + "w1.bias": seq_layer.w1.bias, + "w2.weight": seq_layer.w2.weight, + "w2.bias": seq_layer.w2.bias, + } + ) else: self.copy_weights(self_layer, seq_layer) @@ -57,60 +67,74 @@ def forward(self, x): _in = net(_in) return _in + # ============================================================== # Main # ============================================================== if __name__ == "__main__": # Number of batches to create - NUM_BATCHES=1 + NUM_BATCHES = 1 - #------------- + # ------------- # Command line argument parsing - #------------- - parser = argparse.ArgumentParser(description="PyTorch Small Toy Tensor Parallel Example") - parser.add_argument( "--backend", help="PyTorch Dynamo compiler backend", default='cpu', choices=['cpu', 'aiu']) + # ------------- + parser = argparse.ArgumentParser( + description="PyTorch Small Toy Tensor Parallel Example" + ) + parser.add_argument( + "--backend", + help="PyTorch Dynamo compiler backend", + default="cpu", + choices=["cpu", "aiu"], + ) pargs = parser.parse_args() - if pargs.backend == 'aiu': - dynamo_backend = 'sendnn' + if pargs.backend == "aiu": + dynamo_backend = "sendnn" else: - dynamo_backend = 'inductor' + dynamo_backend = "inductor" is_distributed = world_size > 1 if is_distributed: # Initialize the process group - torch.distributed.init_process_group(backend="gloo", rank=world_rank, world_size=world_size) + torch.distributed.init_process_group( + backend="gloo", rank=world_rank, world_size=world_size + ) # Looks like a string compare, but is actually comparing the components # https://github.com/pytorch/pytorch/blob/b5be4d8c053e22672719b9a33386b071daf9860d/torch/torch_version.py#L10-L16 - if torch.__version__ < '2.3.0': + if torch.__version__ < "2.3.0": # Fix until PyTorch 2.3 - torch._C._distributed_c10d._register_process_group("default", torch.distributed.group.WORLD) + torch._C._distributed_c10d._register_process_group( + "default", torch.distributed.group.WORLD + ) - #------------- + # ------------- # Setup AIU specific environment variables - #------------- + # ------------- if "sendnn" in dynamo_backend: aiu_setup.aiu_dist_setup(world_rank, world_size) - #------------- + # ------------- # Display some diagnostics - #------------- + # ------------- if 0 == world_rank: - dprint("-"*60) - dprint(f"Python Version : {sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro}") + dprint("-" * 60) + dprint( + f"Python Version : {sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro}" + ) dprint(f"PyTorch Version : {torch.__version__}") dprint(f"Dynamo Backend : {pargs.backend} -> {dynamo_backend}") - if pargs.backend == 'aiu': + if pargs.backend == "aiu": for peer_rank in range(world_size): - pcie_env_str="AIU_WORLD_RANK_"+str(peer_rank) + pcie_env_str = "AIU_WORLD_RANK_" + str(peer_rank) dprint(f"PCI Addr. for Rank {peer_rank} : {os.environ[pcie_env_str]}") - print("-"*60) + print("-" * 60) if is_distributed: torch.distributed.barrier() - #------------- + # ------------- # Create the model - #------------- + # ------------- if 0 == world_rank: dprint(f"Creating the model...") the_model = ToyModelFM() @@ -118,20 +142,20 @@ def forward(self, x): # Create a Tensor Parallel version of the model apply_tp(the_model, torch.distributed.group.WORLD) - #------------- + # ------------- # Compile the model - #------------- + # ------------- if 0 == world_rank: dprint(f"Compiling the model...") the_compiled_model = torch.compile(the_model, backend=dynamo_backend) - the_compiled_model.eval() # inference only mode + the_compiled_model.eval() # inference only mode torch.set_grad_enabled(False) - #------------- + # ------------- # Run the model # - First run the compiler will activate to create the artifacts # - Second run there is no compiler involved - #------------- + # ------------- if is_distributed: torch.distributed.barrier() @@ -148,9 +172,9 @@ def forward(self, x): dprint(f"Running model: Second Time...") the_outputs = the_compiled_model(the_inputs) - #------------- + # ------------- # Cleanup - #------------- + # ------------- if 0 == world_rank: dprint(f"Done") if is_distributed: diff --git a/scripts/validation.py b/scripts/validation.py index c5b1449..bdbc01f 100644 --- a/scripts/validation.py +++ b/scripts/validation.py @@ -15,7 +15,17 @@ from fms.utils.generation import pad_input_ids from torch import distributed as dist from aiu_fms_testing_utils.utils import warmup_model -from aiu_fms_testing_utils.testing.validation import LogitsExtractorHook, capture_level_1_metrics, extract_validation_information, StaticTokenInjectorHook, GoldenTokenHook, filter_failed_level_1_cases, validate_level_0, load_validation_information, print_failed_cases +from aiu_fms_testing_utils.testing.validation import ( + LogitsExtractorHook, + capture_level_1_metrics, + extract_validation_information, + StaticTokenInjectorHook, + GoldenTokenHook, + filter_failed_level_1_cases, + validate_level_0, + load_validation_information, + print_failed_cases, +) from aiu_fms_testing_utils.utils import aiu_setup from aiu_fms_testing_utils.utils.aiu_setup import dprint, rank, local_rank, world_size @@ -28,7 +38,7 @@ type=str, choices=["aiu", "aiu-senulator"], default="aiu", - help="The device to run the model on" + help="The device to run the model on", ) parser.add_argument("--validation_device", type=str, default="cpu") parser.add_argument( @@ -212,22 +222,22 @@ "--save_validation_info_path", type=str, default=None, - help="If set, will save the validation info into the path specified for later use" + help="If set, will save the validation info into the path specified for later use", ) parser.add_argument( "--extra_get_model_kwargs", - nargs='*', + nargs="*", default={}, - help="Use this to override model configuration values to get model. Example: --extra_get_model_kwargs nlayers=2,..." + help="Use this to override model configuration values to get model. Example: --extra_get_model_kwargs nlayers=2,...", ) args = parser.parse_args() extra_get_model_kwargs = {} for a in args.extra_get_model_kwargs: - a_split = a.split("=") - try: + a_split = a.split("=") + try: extra_get_model_kwargs[a_split[0]] = ast.literal_eval(a_split[1]) - except ValueError: + except ValueError: extra_get_model_kwargs[a_split[0]] = a_split[1] # this is a test model config @@ -243,7 +253,9 @@ needs_validation_generation = args.validation_files_path == "" needs_validation_forward = ( - not needs_validation_generation and args.validation_files_type in ["text", "tokens"] and args.validation_level == 1 + not needs_validation_generation + and args.validation_files_type in ["text", "tokens"] + and args.validation_level == 1 ) needs_validation_run = needs_validation_forward or needs_validation_generation @@ -251,11 +263,10 @@ if args.quantization == "gptq": try: - # validation script always loads AIU addon from fms_mo.aiu_addons.gptq import gptq_aiu_adapter, gptq_aiu_linear - print("Loaded `aiu_addons` functionalities") + print("Loaded `aiu_addons` functionalities") except ImportError: print("Failed to import addon packages") @@ -354,7 +365,7 @@ if args.quantization == "gptq": qconfig_path = args.model_path + "/quantize_config.json" if os.path.exists(qconfig_path): - with open(qconfig_path, 'r') as f: + with open(qconfig_path, "r") as f: dprint(f"loading quantization config from {qconfig_path}") qconfig = json.load(f) group_size = qconfig["group_size"] @@ -395,8 +406,10 @@ # model, the adapter will take care of converting key/values from # ckpt into the appropriate form for the model if fused_weights: - raise ValueError("GPTQ checkpoints on AIU must always run with --unfuse_weights") - default_dtype=None # GPTQ dtype always comes from ckpt, can't be enforced + raise ValueError( + "GPTQ checkpoints on AIU must always run with --unfuse_weights" + ) + default_dtype = None # GPTQ dtype always comes from ckpt, can't be enforced else: linear_config = {"linear_type": "torch_linear"} linear_config_validation = {"linear_type": "torch_linear"} @@ -412,7 +425,7 @@ group=dist.group.WORLD, linear_config=linear_config, fused_weights=fused_weights, - **extra_get_model_kwargs + **extra_get_model_kwargs, ) if args.quantization == "gptq": @@ -422,14 +435,12 @@ "and rotary embeddings, in GPTQ LLaMA models" ) dprint(model) - dprint("="*60 + "\n") + dprint("=" * 60 + "\n") if needs_validation_run: if args.quantization != "gptq": data_type_validation = ( - torch.float32 - if validation_device == aiu_device - else default_dtype + torch.float32 if validation_device == aiu_device else default_dtype ) else: data_type_validation = default_dtype @@ -444,7 +455,7 @@ group=dist.group.WORLD, linear_config=linear_config_validation, fused_weights=fused_weights, - **extra_get_model_kwargs + **extra_get_model_kwargs, ) validation_model.load_state_dict(model.state_dict()) if args.quantization == "gptq": @@ -454,7 +465,7 @@ "rotary embeddings, in GPTQ LLaMA models" ) dprint(validation_model) - dprint("="*60 + "\n") + dprint("=" * 60 + "\n") tokenizer = tokenizers.get_tokenizer(args.tokenizer) model.eval() @@ -526,9 +537,9 @@ def truncate_prompts_to_max_length(prompts, max_len, max_allowed_length): assert len(prompt_file_paths) > 0, f"Can't find any prompt files at {prompt_path}" # Check if we have enough files - assert ( - len(prompt_file_paths) >= args.batch_size - ), f"Not enough prompt files at {prompt_path} for a batch size of {args.batch_size}" + assert len(prompt_file_paths) >= args.batch_size, ( + f"Not enough prompt files at {prompt_path} for a batch size of {args.batch_size}" + ) prompts = [] for i, prompt_file_path in enumerate(prompt_file_paths): @@ -594,6 +605,7 @@ def truncate_prompts_to_max_length(prompts, max_len, max_allowed_length): ids = prompts padding_kwargs = {} + def print_result(result, result_idx: int = 0, file_prefix: str = ""): if local_rank != 0: return @@ -644,7 +656,7 @@ def print_result(result, result_idx: int = 0, file_prefix: str = ""): # Truncate each answer to its prompt length + max_new_tokens for i, prompt in enumerate(prompts): prompt_len = prompt.size(0) - val_tokens[i] = val_tokens[i][:prompt_len+val_num_gen_tokens] + val_tokens[i] = val_tokens[i][: prompt_len + val_num_gen_tokens] if has_padding: val_ids, padding_val_kwargs = pad_input_ids( @@ -683,10 +695,12 @@ def print_result(result, result_idx: int = 0, file_prefix: str = ""): args.max_new_tokens, LogitsExtractorHook(), attn_algorithm="math", - **padding_kwargs + **padding_kwargs, ) -warmup_model(model, ids, args.max_new_tokens, args.compile_dynamic_sendnn, **padding_kwargs) +warmup_model( + model, ids, args.max_new_tokens, args.compile_dynamic_sendnn, **padding_kwargs +) ### AIU generation loop static_tokens = validation_info.get_info("tokens") @@ -699,10 +713,10 @@ def print_result(result, result_idx: int = 0, file_prefix: str = ""): ids, args.max_new_tokens, post_iteration_hook, - eos_token_id = None if args.no_early_termination else tokenizer.eos_token_id, + eos_token_id=None if args.no_early_termination else tokenizer.eos_token_id, only_last_token=True, timing=args.timing, - **padding_kwargs + **padding_kwargs, ) if args.save_validation_info_path is not None: @@ -714,11 +728,12 @@ def print_result(result, result_idx: int = 0, file_prefix: str = ""): failed_cases = validate_level_0(aiu_static_tokens, static_tokens) else: level_1_metrics = capture_level_1_metrics( - validation_info.get_info("logits"), - aiu_validation_info.get_info("logits") + validation_info.get_info("logits"), aiu_validation_info.get_info("logits") ) - failed_cases = filter_failed_level_1_cases(level_1_metrics, lambda m: m >= args.logits_loss_threshold) + failed_cases = filter_failed_level_1_cases( + level_1_metrics, lambda m: m >= args.logits_loss_threshold + ) validation_passed = len(failed_cases) == 0 diff --git a/tests/models/conftest.py b/tests/models/conftest.py index e93db8f..5ede12b 100644 --- a/tests/models/conftest.py +++ b/tests/models/conftest.py @@ -4,6 +4,7 @@ import os import pytest + def pytest_sessionstart(session): """ Called after the Session object has been created and @@ -23,6 +24,7 @@ def pytest_sessionstart(session): os.environ.setdefault("DTLOG_LEVEL", "error") os.environ.setdefault("DT_DEEPRT_VERBOSE", "-1") + def pytest_addoption(parser): parser.addoption( "--runslow", action="store_true", default=False, help="run slow tests" @@ -43,4 +45,3 @@ def pytest_generate_tests(metafunc): option_value = metafunc.config.option.capture_expectation if "capture_expectation" in metafunc.fixturenames and option_value is not None: metafunc.parametrize("capture_expectation", [option_value]) - diff --git a/tests/models/test_decoders.py b/tests/models/test_decoders.py index 25ded59..56ffe7c 100644 --- a/tests/models/test_decoders.py +++ b/tests/models/test_decoders.py @@ -34,7 +34,9 @@ except ImportError: GPTQ_ENABLED = False -MICRO_MODELS_HOME = os.environ.get("FMS_TEST_SHAPES_MICRO_MODELS_HOME", "/mnt/home/models/tiny-models") +MICRO_MODELS_HOME = os.environ.get( + "FMS_TEST_SHAPES_MICRO_MODELS_HOME", "/mnt/home/models/tiny-models" +) # Add models to test here LLAMA_3p1_8B_INSTRUCT = "meta-llama/Llama-3.1-8B-Instruct" @@ -44,11 +46,19 @@ LLAMA_3p1_70B_INSTRUCT = "meta-llama/Llama-3.1-70B-Instruct" micro_model_mapping = { - LLAMA_3p1_8B_INSTRUCT: os.path.join(MICRO_MODELS_HOME, "llama-3.1-8b-layers-3-step-24000"), - GRANITE_3p2_8B_INSTRUCT: os.path.join(MICRO_MODELS_HOME, "granite-3.2-8b-layers-3-step-100000"), + LLAMA_3p1_8B_INSTRUCT: os.path.join( + MICRO_MODELS_HOME, "llama-3.1-8b-layers-3-step-24000" + ), + GRANITE_3p2_8B_INSTRUCT: os.path.join( + MICRO_MODELS_HOME, "granite-3.2-8b-layers-3-step-100000" + ), # FIXME: Because this uses the same config as 3.2, re-using here, but should update - GRANITE_3p3_8B_INSTRUCT: os.path.join(MICRO_MODELS_HOME, "granite-3.2-8b-layers-3-step-100000"), - LLAMA_3p1_70B_INSTRUCT: os.path.join(MICRO_MODELS_HOME, "llama-3.1-70b-layers-3-step-24000") + GRANITE_3p3_8B_INSTRUCT: os.path.join( + MICRO_MODELS_HOME, "granite-3.2-8b-layers-3-step-100000" + ), + LLAMA_3p1_70B_INSTRUCT: os.path.join( + MICRO_MODELS_HOME, "llama-3.1-70b-layers-3-step-24000" + ), } SHARE_GPT_DATASET_PATH = os.environ.get( @@ -127,14 +137,18 @@ for metric in skip_assertions.split(","): metric = metric.lower() if metric not in {"ce", "mean_diff"}: - pytest.fail("FMS_TEST_SHAPES_SKIP_ASSERTIONS can only accept metrics ce and mean_diff") + pytest.fail( + "FMS_TEST_SHAPES_SKIP_ASSERTIONS can only accept metrics ce and mean_diff" + ) _skip_assertions.append(metric) skip_assertions = set(_skip_assertions) compile_dynamic_sendnn = ATTN_TYPE == "paged" if compile_dynamic_sendnn: - os.environ["VLLM_DT_MAX_CONTEXT_LEN"] = str((((max(common_seq_lengths) + max(common_max_new_tokens)) // 64) + 1) * 64) + os.environ["VLLM_DT_MAX_CONTEXT_LEN"] = str( + (((max(common_seq_lengths) + max(common_max_new_tokens)) // 64) + 1) * 64 + ) os.environ["VLLM_DT_MAX_BATCH_SIZE"] = str(max(common_batch_sizes)) common_shapes = list( @@ -294,8 +308,10 @@ def __load_validation_info( else: return None + class PersistentModel: """This class will either get a model that is pre-compiled (if compile_dynamic_sendnn) or re-create the model for each test""" + def __init__(self): self.model = None @@ -310,15 +326,17 @@ def get_or_create(self, is_gptq, **kwargs): self.__maybe_reset_model(model, is_gptq) model.eval() - model.compile(backend="sendnn", options={'sendnn.dynamic': compile_dynamic_sendnn}) + model.compile( + backend="sendnn", options={"sendnn.dynamic": compile_dynamic_sendnn} + ) if compile_dynamic_sendnn: self.model = model - + return model else: return self.model - + # TODO: This was added as we require a special reset for gptq models. Ideally, we would be able to do something like this reset when calling reset_parameters() on the model # however the gptq modules are yet to support this @staticmethod @@ -344,6 +362,7 @@ def __maybe_reset_model(model, is_gptq): res /= 20.0 param.copy_(res) + @pytest.fixture def persistent_model(): return PersistentModel() @@ -352,7 +371,9 @@ def persistent_model(): @pytest.mark.parametrize( "model_path,batch_size,seq_length,max_new_tokens", common_shapes ) -def test_common_shapes(model_path, batch_size, seq_length, max_new_tokens, persistent_model): +def test_common_shapes( + model_path, batch_size, seq_length, max_new_tokens, persistent_model +): torch.manual_seed(42) torch.set_grad_enabled(False) os.environ["COMPILATION_MODE"] = "offline_decoder" @@ -396,7 +417,9 @@ def test_common_shapes(model_path, batch_size, seq_length, max_new_tokens, persi tokenizer = tokenizers.get_tokenizer(model_path) # prepare the AIU model - model = persistent_model.get_or_create(is_gptq, **gptq_kwargs_aiu, **get_model_kwargs) + model = persistent_model.get_or_create( + is_gptq, **gptq_kwargs_aiu, **get_model_kwargs + ) # prepare the cpu model validation_model = get_model( @@ -416,7 +439,14 @@ def test_common_shapes(model_path, batch_size, seq_length, max_new_tokens, persi input_ids, padding_kwargs = __prepare_inputs(batch_size, seq_length, tokenizer) # warmup aiu model - warmup_model(model, input_ids, max_new_tokens, compile_dynamic_sendnn, attn_type=ATTN_TYPE, **padding_kwargs) + warmup_model( + model, + input_ids, + max_new_tokens, + compile_dynamic_sendnn, + attn_type=ATTN_TYPE, + **padding_kwargs, + ) # generate cpu validation info cpu_validation_info = __load_validation_info( @@ -448,7 +478,13 @@ def test_common_shapes(model_path, batch_size, seq_length, max_new_tokens, persi # first test validation level 0 aiu_validation_info = extract_validation_information( - model, input_ids, max_new_tokens, None, only_last_token=ATTN_TYPE != "paged", attn_type=ATTN_TYPE, **padding_kwargs + model, + input_ids, + max_new_tokens, + None, + only_last_token=ATTN_TYPE != "paged", + attn_type=ATTN_TYPE, + **padding_kwargs, ) dprint("aiu validation info extracted for validation level 0") @@ -461,7 +497,6 @@ def test_common_shapes(model_path, batch_size, seq_length, max_new_tokens, persi # if level 0 fails validation, validate level 1 if FORCE_VALIDATION_LEVEL_1 or failed_validation_level_0: - if failed_validation_level_0: dprint("failed validation level 0, testing validation level 1") else: @@ -526,7 +561,7 @@ def _metric_calculator(r: torch.Tensor, t: torch.Tensor): max_new_tokens, GoldenTokenHook(cpu_static_tokens), only_last_token=ATTN_TYPE != "paged", - attn_type=ATTN_TYPE, + attn_type=ATTN_TYPE, **padding_kwargs, ) dprint(f"aiu validation info extracted for validation level 1 - iter={i}") @@ -554,7 +589,10 @@ def _metric_calculator(r: torch.Tensor, t: torch.Tensor): # if we have a micro model with real weights, but no real thresholds, default to the full model thresholds if USE_MICRO_MODELS: ce_threshold, diff_threshold = fail_thresholds.get( - (model_path, True), fail_thresholds.get((model_path, False), default_metrics_threshold) + (model_path, True), + fail_thresholds.get( + (model_path, False), default_metrics_threshold + ), ) else: ce_threshold, diff_threshold = fail_thresholds.get( diff --git a/tests/models/test_encoders.py b/tests/models/test_encoders.py index 67a032c..8a3eb06 100644 --- a/tests/models/test_encoders.py +++ b/tests/models/test_encoders.py @@ -1,4 +1,8 @@ -from fms.testing.comparison import ModelSignatureParams, compare_model_signatures, get_signature +from fms.testing.comparison import ( + ModelSignatureParams, + compare_model_signatures, + get_signature, +) from fms.utils import tokenizers import pytest from fms.models import get_model @@ -13,11 +17,17 @@ # Add models to test here ROBERTA_SQUAD_V2 = "deepset/roberta-base-squad2" -SQUAD_V2_DATASET_PATH = os.environ.get("SQUAD_V2_DATASET_PATH", os.path.expanduser("~/squad_v2")) -common_model_paths = os.environ.get("FMS_TEST_SHAPES_COMMON_MODEL_PATHS", [ROBERTA_SQUAD_V2]) +SQUAD_V2_DATASET_PATH = os.environ.get( + "SQUAD_V2_DATASET_PATH", os.path.expanduser("~/squad_v2") +) +common_model_paths = os.environ.get( + "FMS_TEST_SHAPES_COMMON_MODEL_PATHS", [ROBERTA_SQUAD_V2] +) common_batch_sizes = os.environ.get("FMS_TEST_SHAPES_COMMON_BATCH_SIZES", [1, 2, 4, 8]) common_seq_lengths = os.environ.get("FMS_TEST_SHAPES_COMMON_SEQ_LENGTHS", [64, 512]) -validation_diff_threshold = os.environ.get("FMS_TEST_SHAPES_VALIDATION_DIFF_THRESHOLD", .01) +validation_diff_threshold = os.environ.get( + "FMS_TEST_SHAPES_VALIDATION_DIFF_THRESHOLD", 0.01 +) # pass custom model path list for eg: EXPORT FMS_TESTING_COMMON_MODEL_PATHS="/tmp/models/roberta,/tmp/models/roberta-base-squad2" if isinstance(common_model_paths, str): @@ -36,18 +46,30 @@ if isinstance(validation_diff_threshold, str): validation_diff_threshold = float(validation_diff_threshold) -common_shapes = list(itertools.product(common_model_paths, common_batch_sizes, common_seq_lengths)) +common_shapes = list( + itertools.product(common_model_paths, common_batch_sizes, common_seq_lengths) +) def __prepare_inputs(batch_size, seq_length, tokenizer, seed=0): - prompts_and_sizes = sample_squad_v2_qa_requests(SQUAD_V2_DATASET_PATH, batch_size, tokenizer, int(seq_length / 2), seq_length, seed) + prompts_and_sizes = sample_squad_v2_qa_requests( + SQUAD_V2_DATASET_PATH, + batch_size, + tokenizer, + int(seq_length / 2), + seq_length, + seed, + ) prompt_list = [] for prompt, _ in prompts_and_sizes: prompt_list.append(ids_for_prompt(prompt, tokenizer)) - input_ids, padding_kwargs = pad_input_ids(prompt_list, min_pad_length=seq_length, is_causal_mask=False) + input_ids, padding_kwargs = pad_input_ids( + prompt_list, min_pad_length=seq_length, is_causal_mask=False + ) return input_ids, padding_kwargs + def __generate_diffs(model_params_1, model_params_2): model_params_1.model.eval() model_params_2.model.eval() @@ -57,7 +79,7 @@ def __generate_diffs(model_params_1, model_params_2): optional_params=model_params_1.other_params, logits_getter_fn=model_params_1.logits_getter_fn, inp=model_params_1.inp, - device=model_params_1.inp.device + device=model_params_1.inp.device, ) signature2 = get_signature( model_params_2.model, @@ -65,7 +87,7 @@ def __generate_diffs(model_params_1, model_params_2): optional_params=model_params_2.other_params, logits_getter_fn=model_params_2.logits_getter_fn, inp=model_params_2.inp, - device=model_params_2.inp.device + device=model_params_2.inp.device, ) signature = np.array(signature) @@ -73,21 +95,25 @@ def __generate_diffs(model_params_1, model_params_2): return np.mean(np.abs(signature2 - signature)) + @pytest.fixture(autouse=True) def reset_compiler(): - yield # run the test + yield # run the test torch.compiler.reset() torch._dynamo.reset() - os.environ.pop('COMPILATION_MODE', None) + os.environ.pop("COMPILATION_MODE", None) + @pytest.mark.parametrize("model_path,batch_size,seq_length", common_shapes) def test_common_shapes(model_path, batch_size, seq_length): os.environ["COMPILATION_MODE"] = "offline" - - dprint(f"testing model={model_path}, batch_size={batch_size}, seq_length={seq_length}") + + dprint( + f"testing model={model_path}, batch_size={batch_size}, seq_length={seq_length}" + ) tokenizer = tokenizers.get_tokenizer(model_path) - + if os.path.exists(model_path): model_path_kwargs = {"model_path": model_path} else: @@ -98,7 +124,7 @@ def test_common_shapes(model_path, batch_size, seq_length): architecture="hf_pretrained", device_type="cpu", fused_weights=False, - **model_path_kwargs + **model_path_kwargs, ) model.eval() @@ -111,34 +137,56 @@ def test_common_shapes(model_path, batch_size, seq_length): device_type="cpu", data_type=torch.float32, fused_weights=False, - **model_path_kwargs + **model_path_kwargs, ) # prepare input_ids input_ids, padding_kwargs = __prepare_inputs(batch_size, seq_length, tokenizer) # warmup model - logits_getter_fn = lambda x: x if isinstance(x, torch.Tensor) else torch.cat(list(x), dim=-1) - aiu_msp = ModelSignatureParams(model, ["x"], logits_getter_fn=logits_getter_fn, inp=input_ids, other_params=padding_kwargs) - get_signature(aiu_msp.model, aiu_msp.params, aiu_msp.inp, aiu_msp.other_params, aiu_msp.logits_getter_fn) + logits_getter_fn = ( + lambda x: x if isinstance(x, torch.Tensor) else torch.cat(list(x), dim=-1) + ) + aiu_msp = ModelSignatureParams( + model, + ["x"], + logits_getter_fn=logits_getter_fn, + inp=input_ids, + other_params=padding_kwargs, + ) + get_signature( + aiu_msp.model, + aiu_msp.params, + aiu_msp.inp, + aiu_msp.other_params, + aiu_msp.logits_getter_fn, + ) # get the average diff over multiple samples diffs = [] for i in range(20): # prepare input_ids - input_ids, padding_kwargs = __prepare_inputs(batch_size, seq_length, tokenizer, seed=i) + input_ids, padding_kwargs = __prepare_inputs( + batch_size, seq_length, tokenizer, seed=i + ) aiu_msp = ModelSignatureParams( - model, - ["x"], - logits_getter_fn=logits_getter_fn, - inp=input_ids, - other_params=padding_kwargs + model, + ["x"], + logits_getter_fn=logits_getter_fn, + inp=input_ids, + other_params=padding_kwargs, + ) + cpu_msp = ModelSignatureParams( + validation_model, + ["x"], + logits_getter_fn=logits_getter_fn, + inp=input_ids, + other_params=padding_kwargs, ) - cpu_msp = ModelSignatureParams(validation_model, ["x"], logits_getter_fn=logits_getter_fn, inp=input_ids, other_params=padding_kwargs) diffs.append(__generate_diffs(aiu_msp, cpu_msp)) abs_mean_diff = sum(diffs) / len(diffs) print(f"absolute mean diff: {abs_mean_diff}") - assert abs_mean_diff < validation_diff_threshold \ No newline at end of file + assert abs_mean_diff < validation_diff_threshold diff --git a/tests/models/test_model_expectations.py b/tests/models/test_model_expectations.py index 5cfcd57..64b24d4 100644 --- a/tests/models/test_model_expectations.py +++ b/tests/models/test_model_expectations.py @@ -20,7 +20,12 @@ MISTRAL_7B_INSTRUCT = "mistralai/Mistral-7B-Instruct-v0.3" ROBERTA_SQUAD_v2 = "deepset/roberta-base-squad2" -micro_models = {LLAMA_3p1_8B_INSTRUCT, GRANITE_3p2_8B_INSTRUCT, GRANITE_GUARDIAN_3p1_8B, MISTRAL_7B_INSTRUCT} +micro_models = { + LLAMA_3p1_8B_INSTRUCT, + GRANITE_3p2_8B_INSTRUCT, + GRANITE_GUARDIAN_3p1_8B, + MISTRAL_7B_INSTRUCT, +} class AIUModelFixtureMixin(ModelFixtureMixin): @@ -51,7 +56,12 @@ def model(self, uninitialized_model): return uninitialized_model -decoder_models = [LLAMA_3p1_8B_INSTRUCT, GRANITE_3p2_8B_INSTRUCT, GRANITE_GUARDIAN_3p1_8B, MISTRAL_7B_INSTRUCT] +decoder_models = [ + LLAMA_3p1_8B_INSTRUCT, + GRANITE_3p2_8B_INSTRUCT, + GRANITE_GUARDIAN_3p1_8B, + MISTRAL_7B_INSTRUCT, +] class TestAIUDecoderModels( diff --git a/tests/models/test_scripts.py b/tests/models/test_scripts.py index e0f2ab4..5601c5d 100644 --- a/tests/models/test_scripts.py +++ b/tests/models/test_scripts.py @@ -3,8 +3,9 @@ from pathlib import Path import itertools import math + FMS_DIR = Path(__file__).parent -AIU_FMS_DIR = os.path.join(FMS_DIR,"../../../aiu-fms-testing-utils/") +AIU_FMS_DIR = os.path.join(FMS_DIR, "../../../aiu-fms-testing-utils/") VALIDATION_FILE_PATH = os.path.join(AIU_FMS_DIR, "scripts", "validation.py") INFERENCE_FILE_PATH = os.path.join(AIU_FMS_DIR, "scripts", "inference.py") @@ -17,40 +18,68 @@ GRANITE_3_8B_CODE_BASE = f"{model_dir}/granite-3-8b-base" # pass custom model path list for eg: EXPORT FMS_TESTING_COMMON_MODEL_PATHS="/tmp/models/granite-3-8b-base,/tmp/models/granite-7b-base" -if os.environ.get("FMS_TESTING_COMMON_MODEL_PATHS") == None or os.environ.get("FMS_TESTING_COMMON_MODEL_PATHS") == "": +if ( + os.environ.get("FMS_TESTING_COMMON_MODEL_PATHS") == None + or os.environ.get("FMS_TESTING_COMMON_MODEL_PATHS") == "" +): common_model_paths = [LLAMA_194M] else: - common_model_paths = os.environ.get("FMS_TESTING_COMMON_MODEL_PATHS").split(',') + common_model_paths = os.environ.get("FMS_TESTING_COMMON_MODEL_PATHS").split(",") -common_batch_sizes = [1,8] +common_batch_sizes = [1, 8] common_seq_lengths = [64] common_max_new_tokens = [8] -common_params = list(itertools.product(common_model_paths, common_batch_sizes, common_seq_lengths, common_max_new_tokens)) +common_params = list( + itertools.product( + common_model_paths, + common_batch_sizes, + common_seq_lengths, + common_max_new_tokens, + ) +) common_asserts = [ - "### Response: Chicken soup is a popular soup that is", - "### Response: I am sorry, but I am not", - "### Response: I am ignorant of the fact that I", - "### Response: I have just come into a very large", - ] + "### Response: Chicken soup is a popular soup that is", + "### Response: I am sorry, but I am not", + "### Response: I am ignorant of the fact that I", + "### Response: I have just come into a very large", +] current_env = os.environ.copy() -current_env["DT_OPT"]="varsub=1,lxopt=1,opfusion=1,arithfold=1,dataopt=1,patchinit=1,patchprog=1,autopilot=1,weipreload=0,kvcacheopt=1,progshareopt=1" +current_env["DT_OPT"] = ( + "varsub=1,lxopt=1,opfusion=1,arithfold=1,dataopt=1,patchinit=1,patchprog=1,autopilot=1,weipreload=0,kvcacheopt=1,progshareopt=1" +) -def execute_script(execute_cmd): - current_env['MAX_SHAREDPROG_ITERS'] = f"{common_max_new_tokens[0]}" - with Popen(execute_cmd, stdin=PIPE, stdout=PIPE, stderr=PIPE, universal_newlines=True, env=current_env) as p: +def execute_script(execute_cmd): + current_env["MAX_SHAREDPROG_ITERS"] = f"{common_max_new_tokens[0]}" + + with Popen( + execute_cmd, + stdin=PIPE, + stdout=PIPE, + stderr=PIPE, + universal_newlines=True, + env=current_env, + ) as p: output, error = p.communicate() if p.returncode == 0: return output else: raise Exception(error) + # we are forcing the number of layers to be 2 to reduce the size of the model as we do not care about output, but just consistency between cpu and aiu -def execute_validation(validation_level, model_path, max_new_tokens, batch_size, seq_length, logits_loss_threshold=0.0): +def execute_validation( + validation_level, + model_path, + max_new_tokens, + batch_size, + seq_length, + logits_loss_threshold=0.0, +): execute_cmd = [ - 'python3', + "python3", VALIDATION_FILE_PATH, "--architecture=hf_pretrained", f"--model_path={model_path}", @@ -62,13 +91,14 @@ def execute_validation(validation_level, model_path, max_new_tokens, batch_size, "--no_early_termination", f"--validation_level={validation_level}", f"--logits_loss_threshold={logits_loss_threshold}", - "--compile_dynamic" + "--compile_dynamic", ] return execute_script(execute_cmd) + def execute_inference(model_path, max_new_tokens, batch_size, seq_length): execute_cmd = [ - 'python3', + "python3", INFERENCE_FILE_PATH, "--architecture=hf_pretrained", f"--model_path={model_path}", @@ -80,23 +110,24 @@ def execute_inference(model_path, max_new_tokens, batch_size, seq_length): "--no_early_termination", "--compile_dynamic", "--compile", - "--device_type=aiu" + "--device_type=aiu", ] return execute_script(execute_cmd) -@pytest.mark.parametrize("model_path,batch_size,seq_length,max_new_tokens", common_params) + +@pytest.mark.parametrize( + "model_path,batch_size,seq_length,max_new_tokens", common_params +) def test_level_1_validation_script(model_path, batch_size, seq_length, max_new_tokens): result_text = execute_validation( - 1, - model_path, - max_new_tokens, - batch_size, - seq_length, - 64.0 + 1, model_path, max_new_tokens, batch_size, seq_length, 64.0 ) assert "The validation has passed!" in result_text -@pytest.mark.parametrize("model_path,batch_size,seq_length,max_new_tokens", common_params) + +@pytest.mark.parametrize( + "model_path,batch_size,seq_length,max_new_tokens", common_params +) def test_level_0_validation_script(model_path, batch_size, seq_length, max_new_tokens): result_text = execute_validation( 0, @@ -107,6 +138,7 @@ def test_level_0_validation_script(model_path, batch_size, seq_length, max_new_t ) assert "The validation has passed!" in result_text + common_asserts = [ "### Response: Chicken soup is a popular soup that is", "### Response: I am sorry, but I am not", @@ -114,18 +146,25 @@ def test_level_0_validation_script(model_path, batch_size, seq_length, max_new_t "### Response: I have just come into a very large", ] + def __repeat_batch_asserts(bs: int) -> list[str]: n_repeats = int(math.ceil(bs / len(common_asserts))) return (common_asserts * n_repeats)[:bs] + # add the asserts based on batch size # for batches greater than common_asserts, repeat common_asserts since this follows inference behavior -common_inference_params = [common_param + (__repeat_batch_asserts(common_param[1]),) for common_param in common_params] +common_inference_params = [ + common_param + (__repeat_batch_asserts(common_param[1]),) + for common_param in common_params +] -@pytest.mark.parametrize("model_path,batch_size,seq_length,max_new_tokens,asserts", common_inference_params) +@pytest.mark.parametrize( + "model_path,batch_size,seq_length,max_new_tokens,asserts", common_inference_params +) def test_inference_script(model_path, max_new_tokens, seq_length, batch_size, asserts): result_text = execute_inference(model_path, max_new_tokens, batch_size, seq_length) for common_assert in asserts: - assert common_assert in result_text \ No newline at end of file + assert common_assert in result_text diff --git a/tests/resources/get_thresholds.py b/tests/resources/get_thresholds.py index 7dedb70..474dbd4 100644 --- a/tests/resources/get_thresholds.py +++ b/tests/resources/get_thresholds.py @@ -4,23 +4,21 @@ import argparse import os -parser = argparse.ArgumentParser( - description="Script to get thresholds metrics" -) +parser = argparse.ArgumentParser(description="Script to get thresholds metrics") parser.add_argument( "--models", type=str, default=[], - nargs='+', + nargs="+", required=True, - help="List of models id separated by space. Eg.: ibm-granite/granite-20b-code-instruct-8k /tmp/models/granite-20b-code-cobol-v1" + help="List of models id separated by space. Eg.: ibm-granite/granite-20b-code-instruct-8k /tmp/models/granite-20b-code-cobol-v1", ) parser.add_argument( "--metrics", type=str, default=[], - nargs='+', + nargs="+", required=True, help="List of metrics separated by space. Eg.: diff_mean ce", ) @@ -43,7 +41,6 @@ metric_list = [] for metric_file in metric_files: - with open(metric_file, "r") as file: next(file) for line in file: diff --git a/tests/testing/test_validation.py b/tests/testing/test_validation.py index 047590e..90cf2e7 100644 --- a/tests/testing/test_validation.py +++ b/tests/testing/test_validation.py @@ -1,11 +1,19 @@ import tempfile import pytest -from aiu_fms_testing_utils.testing.validation import LogitsExtractorHook, extract_validation_information, load_validation_information +from aiu_fms_testing_utils.testing.validation import ( + LogitsExtractorHook, + extract_validation_information, + load_validation_information, +) from fms.models import get_model from fms.utils.generation import pad_input_ids import torch -@pytest.mark.parametrize("validation_type,post_iteration_hook", [("logits", LogitsExtractorHook()), ("tokens", None)]) + +@pytest.mark.parametrize( + "validation_type,post_iteration_hook", + [("logits", LogitsExtractorHook()), ("tokens", None)], +) def test_validation_info_round_trip(validation_type, post_iteration_hook): # prepare a small cpu model model = get_model( @@ -22,7 +30,11 @@ def test_validation_info_round_trip(validation_type, post_iteration_hook): # prepare input_ids prompt_list = [] for i in range(batch_size): - prompt_list.append(torch.randint(0, model.config.src_vocab_size, (seq_length - 2 * i,), dtype=torch.long)) + prompt_list.append( + torch.randint( + 0, model.config.src_vocab_size, (seq_length - 2 * i,), dtype=torch.long + ) + ) input_ids, padding_kwargs = pad_input_ids(prompt_list, min_pad_length=seq_length) @@ -33,14 +45,16 @@ def test_validation_info_round_trip(validation_type, post_iteration_hook): max_new_tokens, post_iteration_hook, attn_algorithm="math", - **padding_kwargs + **padding_kwargs, ) with tempfile.TemporaryDirectory() as workdir: output_path = f"{workdir}/validation_info" generated_validation_info.save(output_path) - loaded_validation_info = load_validation_information(output_path, validation_type, batch_size) + loaded_validation_info = load_validation_information( + output_path, validation_type, batch_size + ) assert len(generated_validation_info) == len(loaded_validation_info) diff --git a/tests/utils/test_paged.py b/tests/utils/test_paged.py index 519042a..67cba9b 100644 --- a/tests/utils/test_paged.py +++ b/tests/utils/test_paged.py @@ -1,13 +1,11 @@ import torch from fms.models import get_model -from fms.utils.generation import ( - pad_input_ids, - generate -) +from fms.utils.generation import pad_input_ids, generate from aiu_fms_testing_utils.utils.paged import generate as paged_generate from fms.utils.tokenizers import get_tokenizer import pytest + def test_paged_equivalence(): torch.manual_seed(0) with torch.no_grad():