From 50208479c654f70be32ec57a7f01edf85b29be51 Mon Sep 17 00:00:00 2001
From: kcirred <16872435+kcirred@users.noreply.github.com>
Date: Tue, 24 Jun 2025 17:29:25 +0000
Subject: [PATCH] added lint.yml and ran ruff format

Signed-off-by: kcirred <16872435+kcirred@users.noreply.github.com>
---
 .github/workflows/lint.yml                  |  15 ++
 aiu_fms_testing_utils/testing/validation.py | 165 ++++++++++++++------
 aiu_fms_testing_utils/utils/__init__.py     |  85 ++++++----
 aiu_fms_testing_utils/utils/aiu_setup.py    |   7 +-
 aiu_fms_testing_utils/utils/paged.py        |  24 ++-
 scripts/generate_metrics.py                 | 140 ++++++++++++-----
 scripts/inference.py                        | 102 ++++++++----
 scripts/roberta.py                          |  71 +++++----
 scripts/small-toy.py                        |  98 +++++++-----
 scripts/validation.py                       |  79 ++++++----
 tests/models/conftest.py                    |   3 +-
 tests/models/test_decoders.py               |  72 +++++++--
 tests/models/test_encoders.py               | 102 ++++++++----
 tests/models/test_model_expectations.py     |  14 +-
 tests/models/test_scripts.py                |  99 ++++++++----
 tests/resources/get_thresholds.py           |  11 +-
 tests/testing/test_validation.py            |  24 ++-
 tests/utils/test_paged.py                   |   6 +-
 18 files changed, 774 insertions(+), 343 deletions(-)
 create mode 100644 .github/workflows/lint.yml

diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
new file mode 100644
index 0000000..6a3f785
--- /dev/null
+++ b/.github/workflows/lint.yml
@@ -0,0 +1,15 @@
+name: Lint
+
+on: [pull_request]
+
+jobs:
+  lint:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: astral-sh/ruff-action@v3
+        with:
+          src: "."
+          version: "~= 0.9.5"
+      - run: ruff check
+      - run: ruff format --check
diff --git a/aiu_fms_testing_utils/testing/validation.py b/aiu_fms_testing_utils/testing/validation.py
index a66a5fe..82caf59 100644
--- a/aiu_fms_testing_utils/testing/validation.py
+++ b/aiu_fms_testing_utils/testing/validation.py
@@ -6,44 +6,77 @@
 from aiu_fms_testing_utils.utils.aiu_setup import dprint
 import os
 
-class LogitsExtractorHook(Callable[[int, torch.Tensor, torch.Tensor, MutableMapping[str, Any]], Tuple[torch.Tensor, MutableMapping[str, Any]],]):
 
+class LogitsExtractorHook(
+    Callable[
+        [int, torch.Tensor, torch.Tensor, MutableMapping[str, Any]],
+        Tuple[torch.Tensor, MutableMapping[str, Any]],
+    ]
+):
     def __init__(self):
         super().__init__()
         self.extracted_logits: Optional[torch.Tensor] = None
 
-    def __call__(self, token_position: torch.Tensor, logits: torch.Tensor, next_val: torch.Tensor, kwargs):
+    def __call__(
+        self,
+        token_position: torch.Tensor,
+        logits: torch.Tensor,
+        next_val: torch.Tensor,
+        kwargs,
+    ):
         if self.extracted_logits is None:
             self.extracted_logits = logits.unsqueeze(1)
         else:
-            self.extracted_logits = torch.cat((self.extracted_logits, logits.unsqueeze(1)), dim=1)
+            self.extracted_logits = torch.cat(
+                (self.extracted_logits, logits.unsqueeze(1)), dim=1
+            )
         return next_val, kwargs
 
-class StaticTokenInjectorHook(Callable[[int, torch.Tensor, torch.Tensor, MutableMapping[str, Any]], Tuple[torch.Tensor, MutableMapping[str, Any]],]):
 
-    def __init__(self, static_tokens: List[torch.Tensor], device_type: str="cpu"):
+class StaticTokenInjectorHook(
+    Callable[
+        [int, torch.Tensor, torch.Tensor, MutableMapping[str, Any]],
+        Tuple[torch.Tensor, MutableMapping[str, Any]],
+    ]
+):
+    def __init__(self, static_tokens: List[torch.Tensor], device_type: str = "cpu"):
         super().__init__()
-        self.static_tokens = torch.tensor(static_tokens, device=device_type).t() # transposing so batch tokens per token_position
+        self.static_tokens = torch.tensor(
+            static_tokens, device=device_type
+        ).t()  # transposing so batch tokens per token_position
 
-    def __call__(self, token_position: int, logits: torch.Tensor, next_val: torch.Tensor, kwargs):
+    def __call__(
+        self, token_position: int, logits: torch.Tensor, next_val: torch.Tensor, kwargs
+    ):
         next_val.copy_(self.static_tokens[token_position].unsqueeze(1))
         return next_val, kwargs
 
-class GoldenTokenHook(Callable[[int, torch.Tensor, torch.Tensor, MutableMapping[str, Any]], Tuple[torch.Tensor, MutableMapping[str, Any]],]):
 
-    def __init__(self, static_tokens: torch.Tensor, device_type: str="cpu"):
+class GoldenTokenHook(
+    Callable[
+        [int, torch.Tensor, torch.Tensor, MutableMapping[str, Any]],
+        Tuple[torch.Tensor, MutableMapping[str, Any]],
+    ]
+):
+    def __init__(self, static_tokens: torch.Tensor, device_type: str = "cpu"):
         super().__init__()
         self.logits_extractor = LogitsExtractorHook()
         self.extracted_logits = None
-        self.token_injector = StaticTokenInjectorHook(static_tokens, device_type=device_type)
+        self.token_injector = StaticTokenInjectorHook(
+            static_tokens, device_type=device_type
+        )
 
-    def __call__(self, token_position: int, logits: torch.Tensor, next_val: torch.Tensor, kwargs):
-        next_val, kwargs = self.logits_extractor(token_position, logits, next_val, kwargs)
+    def __call__(
+        self, token_position: int, logits: torch.Tensor, next_val: torch.Tensor, kwargs
+    ):
+        next_val, kwargs = self.logits_extractor(
+            token_position, logits, next_val, kwargs
+        )
         self.extracted_logits = self.logits_extractor.extracted_logits
         return self.token_injector(token_position, logits, next_val, kwargs)
 
-class ValidationInfo:
 
+class ValidationInfo:
     def __init__(self, validation_info_list):
         super().__init__()
 
@@ -54,7 +87,10 @@ def __iter__(self):
             yield vi
 
     def get_info(self, info_name):
-        return [[t.unsqueeze(0) for t in sentence[info_name]] for sentence in self._validation_info_list]
+        return [
+            [t.unsqueeze(0) for t in sentence[info_name]]
+            for sentence in self._validation_info_list
+        ]
 
     def save(self, save_dir_path: str):
         """Save the validation information into a directory.
@@ -86,12 +122,17 @@ def save(self, save_dir_path: str):
 
     def __len__(self):
         return len(self._validation_info_list)
-    
-def get_default_validation_prefix(model_id: str, max_new_tokens: int, batch_size: int, seq_length: int, dtype: str):
+
+
+def get_default_validation_prefix(
+    model_id: str, max_new_tokens: int, batch_size: int, seq_length: int, dtype: str
+):
     return f"{model_id.replace('/', '--')}_max-new-tokens-{max_new_tokens}_batch-size-{batch_size}_seq-length-{seq_length}_dtype-{dtype}"
 
 
-def load_validation_information(validation_path, validation_files_type, batch_size, tokenizer=None):
+def load_validation_information(
+    validation_path, validation_files_type, batch_size, tokenizer=None
+):
     """Load the validation information from a directory
 
     The files will be assumed to be in the following structure:
@@ -107,7 +148,7 @@ def load_validation_information(validation_path, validation_files_type, batch_si
     if containing only tokens - torch.tensor
     if containing tokens and logits - dict[tokens -> torch.tensor, logits -> torch.tensor]
     if containing text - str
-    
+
     :param validation_path: path to validation info files
     :param validation_files_type: validation file type to load, one of text, tokens, or logits
     :param batch_size: the number of prompts to load
@@ -115,9 +156,7 @@ def load_validation_information(validation_path, validation_files_type, batch_si
     :return: a new validation info
     """
     if isinstance(validation_path, str):
-        validation_files_path, sep, glob_pattern = validation_path.partition(
-            "*"
-        )
+        validation_files_path, sep, glob_pattern = validation_path.partition("*")
     else:
         sep = ""
         glob_pattern = ""
@@ -146,14 +185,14 @@ def load_validation_information(validation_path, validation_files_type, batch_si
         validation_files_paths = [validation_files_path]
 
     # Check if we found some files
-    assert (
-        len(validation_files_paths) > 0
-    ), f"Can't find any validation files at {validation_files_path}"
+    assert len(validation_files_paths) > 0, (
+        f"Can't find any validation files at {validation_files_path}"
+    )
 
     # Check if we have enough files
-    assert (
-        len(validation_files_paths) >= batch_size
-    ), f"Not enough validation files at {validation_files_path} for a batch size of {batch_size}"
+    assert len(validation_files_paths) >= batch_size, (
+        f"Not enough validation files at {validation_files_path} for a batch size of {batch_size}"
+    )
 
     validation_info = []
     for i, validation_file_path in enumerate(validation_files_paths):
@@ -161,7 +200,9 @@ def load_validation_information(validation_path, validation_files_type, batch_si
             break
         if validation_files_type == "text":
             if tokenizer is None:
-                raise ValueError("must provide a tokenizer when validation_files_type=text")
+                raise ValueError(
+                    "must provide a tokenizer when validation_files_type=text"
+                )
             # Text format will get tokenized
             validation_info.append(
                 {
@@ -187,7 +228,19 @@ def load_validation_information(validation_path, validation_files_type, batch_si
 
     return ValidationInfo(validation_info)
 
-def extract_validation_information(model, input_ids, max_new_tokens, post_iteration_hook, attn_algorithm=None, eos_token_id = None, only_last_token=False, timing="", attn_type="sdpa", **padding_kwargs):
+
+def extract_validation_information(
+    model,
+    input_ids,
+    max_new_tokens,
+    post_iteration_hook,
+    attn_algorithm=None,
+    eos_token_id=None,
+    only_last_token=False,
+    timing="",
+    attn_type="sdpa",
+    **padding_kwargs,
+):
     max_seq_len = model.config.max_expected_seq_len
     attention_specific_kwargs = {}
     if attn_type == "paged":
@@ -195,6 +248,7 @@ def extract_validation_information(model, input_ids, max_new_tokens, post_iterat
     else:
         # TODO: Add a unified generation dependent on attn_type
         from fms.utils.generation import generate
+
         attention_specific_kwargs["contiguous_cache"] = True
         attention_specific_kwargs["max_seq_len"] = max_seq_len
 
@@ -215,7 +269,7 @@ def extract_validation_information(model, input_ids, max_new_tokens, post_iterat
         eos_token_id=eos_token_id,
         timing=timing,
         extra_kwargs=extra_generation_kwargs,
-        **attention_specific_kwargs
+        **attention_specific_kwargs,
     )
 
     if timing != "":
@@ -226,7 +280,7 @@ def extract_validation_information(model, input_ids, max_new_tokens, post_iterat
         if timing == "e2e":
             dprint(f"E2E timing information: {timings[0]:.3f}s")
         elif timing == "per-token":
-            timings = [f"{t*1000:.3f}" for t in timings]
+            timings = [f"{t * 1000:.3f}" for t in timings]
             dprint(f"Per-token timing information: {', '.join(timings)} ms")
 
     if len(result.shape) == 1:
@@ -235,26 +289,32 @@ def extract_validation_information(model, input_ids, max_new_tokens, post_iterat
     if hasattr(post_iteration_hook, "extracted_logits"):
         validation_info = [
             {"tokens": t.to("cpu"), "logits": l.to("cpu")}
-            for t, l in zip(torch.unbind(result), torch.unbind(post_iteration_hook.extracted_logits))
+            for t, l in zip(
+                torch.unbind(result), torch.unbind(post_iteration_hook.extracted_logits)
+            )
         ]
     else:
         validation_info = [{"tokens": t.to("cpu")} for t in torch.unbind(result)]
     return ValidationInfo(validation_info)
 
+
 def validate_level_0(aiu_tokens_per_sentence, validation_tokens_per_sentence):
     failed_cases = []
 
     for sentence_idx, (aiu_sentence, validation_sentence) in enumerate(
-            zip(aiu_tokens_per_sentence, validation_tokens_per_sentence)
+        zip(aiu_tokens_per_sentence, validation_tokens_per_sentence)
     ):
         for token_idx, (aiu_token, validation_token) in enumerate(
-                zip(aiu_sentence, validation_sentence)
+            zip(aiu_sentence, validation_sentence)
         ):
             if aiu_token != validation_token:
                 failed_cases.append((sentence_idx, token_idx))
     return failed_cases
 
-def top_k_loss_calculator(top_k: int, loss_f: Callable[[torch.Tensor, torch.Tensor], float]):
+
+def top_k_loss_calculator(
+    top_k: int, loss_f: Callable[[torch.Tensor, torch.Tensor], float]
+):
     """
     Function which will take the top_k logits indexes / values from a reference validation info and retrieve the same indexes from the test validation info logits
     and perform a loss function over the 2 tensors
@@ -262,32 +322,38 @@ def top_k_loss_calculator(top_k: int, loss_f: Callable[[torch.Tensor, torch.Tens
     :param top_k: number of values to take from reference
     :param loss_f: a loss function between the reference and test logits
     """
+
     def loss_func(reference_logits, test_logits):
         reference_logits_prob = reference_logits.to(dtype=torch.float32)
         test_logits_prob = test_logits.to(dtype=torch.float32)
 
-        reference_values, reference_indices = torch.topk(reference_logits_prob, top_k, dim=1)
+        reference_values, reference_indices = torch.topk(
+            reference_logits_prob, top_k, dim=1
+        )
         test_values = test_logits_prob[:, reference_indices.squeeze(0)]
 
         return loss_f(reference_values, test_values)
+
     return loss_func
 
 
-def capture_level_1_metrics(reference_logits_per_sentence, test_logits_per_sentence, metrics_calculator=None):
+def capture_level_1_metrics(
+    reference_logits_per_sentence, test_logits_per_sentence, metrics_calculator=None
+):
     loss_metrics = []
 
     for sentence_idx, (reference_sentence, test_sentence) in enumerate(
-            zip(reference_logits_per_sentence, test_logits_per_sentence)
+        zip(reference_logits_per_sentence, test_logits_per_sentence)
     ):
         for token_idx, (reference_logits, test_logits) in enumerate(
-                zip(reference_sentence, test_sentence)
+            zip(reference_sentence, test_sentence)
         ):
             # computing cross entropy loss per token
             if metrics_calculator is None:
                 loss_fn = torch.nn.CrossEntropyLoss()
                 metrics_value = loss_fn(
                     reference_logits.to(dtype=torch.float32),
-                    test_logits.softmax(dim=1).to(dtype=torch.float32)
+                    test_logits.softmax(dim=1).to(dtype=torch.float32),
                 )
             else:
                 metrics_value = metrics_calculator(reference_logits, test_logits)
@@ -295,15 +361,16 @@ def capture_level_1_metrics(reference_logits_per_sentence, test_logits_per_sente
             loss_metrics.append((sentence_idx, token_idx, metrics_value))
 
     return loss_metrics
-    
+
+
 def filter_failed_level_1_cases(level_1_loss_metrics, fail_f, print_failed=False):
     failed_cases = []
-    for (sentence_idx, token_idx, metrics_value) in level_1_loss_metrics:
+    for sentence_idx, token_idx, metrics_value in level_1_loss_metrics:
         if fail_f(metrics_value):
             failed_cases.append((sentence_idx, token_idx, metrics_value))
             if print_failed:
                 dprint(
-                    f"In sentence {sentence_idx+1}, the metric for token {token_idx} is {metrics_value}"
+                    f"In sentence {sentence_idx + 1}, the metric for token {token_idx} is {metrics_value}"
                 )
     return failed_cases
 
@@ -313,6 +380,12 @@ def print_failed_cases(failed_cases, aiu_tokens, validation_tokens, tokenizer):
         aiu_token = aiu_tokens[sentence_index][token_index]
         validation_token = validation_tokens[sentence_index][token_index]
 
-        aiu_str = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(aiu_token))
-        validation_str = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(validation_token))
-        print(f"In sentence {sentence_index+1}/{len(aiu_tokens)}, token {token_index}, AIU outputs {aiu_token} instead of {validation_token} -- AIU val={aiu_str} -- CPU val={validation_str}")
\ No newline at end of file
+        aiu_str = tokenizer.convert_tokens_to_string(
+            tokenizer.convert_ids_to_tokens(aiu_token)
+        )
+        validation_str = tokenizer.convert_tokens_to_string(
+            tokenizer.convert_ids_to_tokens(validation_token)
+        )
+        print(
+            f"In sentence {sentence_index + 1}/{len(aiu_tokens)}, token {token_index}, AIU outputs {aiu_token} instead of {validation_token} -- AIU val={aiu_str} -- CPU val={validation_str}"
+        )
diff --git a/aiu_fms_testing_utils/utils/__init__.py b/aiu_fms_testing_utils/utils/__init__.py
index 99cac86..87f2259 100644
--- a/aiu_fms_testing_utils/utils/__init__.py
+++ b/aiu_fms_testing_utils/utils/__init__.py
@@ -9,16 +9,26 @@
 import json
 import random
 
-def warmup_model(model: nn.Module, input_ids: torch.Tensor, max_new_tokens: int, compile_dynamic_sendnn = False, attn_type="sdpa", **padding_kwargs):
+
+def warmup_model(
+    model: nn.Module,
+    input_ids: torch.Tensor,
+    max_new_tokens: int,
+    compile_dynamic_sendnn=False,
+    attn_type="sdpa",
+    **padding_kwargs,
+):
     import torch_sendnn
+
     attention_specific_kwargs = {}
     if attn_type == "paged":
         from aiu_fms_testing_utils.utils.paged import generate, adjust_inputs_to_batch
     else:
         # TODO: Add a unified generation dependent on attn_type
         from fms.utils.generation import generate
+
         attention_specific_kwargs["contiguous_cache"] = True
-    
+
     dprint("AIU warmup")
     pt_compile_model_time = time.time()
 
@@ -30,15 +40,26 @@ def warmup_model(model: nn.Module, input_ids: torch.Tensor, max_new_tokens: int,
         _max_new_tokens = 2
         # always warmup with batch size 2 when using attn_type=paged
         if attn_type == "paged":
-            _warmup_input_ids, _padding_kwargs = adjust_inputs_to_batch(input_ids, **padding_kwargs)
+            _warmup_input_ids, _padding_kwargs = adjust_inputs_to_batch(
+                input_ids, **padding_kwargs
+            )
 
     extra_kwargs = {**_padding_kwargs, "only_last_token": attn_type != "paged"}
 
     with torch_sendnn.warmup_mode():
-        generate(model, _warmup_input_ids, max_new_tokens=_max_new_tokens, use_cache=True, do_sample=False, extra_kwargs=extra_kwargs, **attention_specific_kwargs)
+        generate(
+            model,
+            _warmup_input_ids,
+            max_new_tokens=_max_new_tokens,
+            use_cache=True,
+            do_sample=False,
+            extra_kwargs=extra_kwargs,
+            **attention_specific_kwargs,
+        )
     pt_compile_model_time = time.time() - pt_compile_model_time
     dprint(f"PT compile complete, took {pt_compile_model_time:.3f}s")
 
+
 def ids_for_prompt(prompt, tokenizer):
     tokens = tokenizer.tokenize(prompt)
     ids = tokenizer.convert_tokens_to_ids(tokens)
@@ -47,26 +68,28 @@ def ids_for_prompt(prompt, tokenizer):
     ids = torch.tensor(ids, dtype=torch.long, device="cpu")
     return ids
 
+
 def __download_file(url, filename):
     try:
         response = requests.get(url, stream=True)
         response.raise_for_status()
-        
-        with open(filename, 'wb') as file:
+
+        with open(filename, "wb") as file:
             for chunk in response.iter_content(chunk_size=8192):
                 file.write(chunk)
         print(f"Successfully downloaded {filename}")
-    
+
     except requests.exceptions.RequestException as e:
         print(f"An error occurred: {e}")
 
+
 def __sample_requests(
-    prompt_list: List[str], 
+    prompt_list: List[str],
     num_requests: int,
     tokenizer: BaseTokenizer,
     prompt_length_min: int = 32,
     prompt_length_max: int = 64,
-    seed: Optional[int] = None
+    seed: Optional[int] = None,
 ):
     # Shuffle the dataset.
     if seed is not None:
@@ -81,15 +104,14 @@ def __sample_requests(
         # Tokenize the prompts and completions.
         prompt = prompt_list[i]
         prompt_token_ids = ids_for_prompt(prompt, tokenizer)
-        
+
         prompt_len = len(prompt_token_ids)
         if prompt_len < prompt_length_min or prompt_len > prompt_length_max:
             # Prune too short or too long sequences.
             continue
         filtered_dataset.append((prompt, prompt_len))
-    
+
     return filtered_dataset
-    
 
 
 def sample_sharegpt_requests(
@@ -98,39 +120,44 @@ def sample_sharegpt_requests(
     tokenizer: BaseTokenizer,
     prompt_length_min: int = 32,
     prompt_length_max: int = 64,
-    seed: Optional[int] = None
+    seed: Optional[int] = None,
 ) -> List[Tuple[str, int]]:
     if not os.path.exists(dataset_path):
         print("downloading share-gpt dataset as it does not exist")
-        __download_file("https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json", dataset_path)
+        __download_file(
+            "https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json",
+            dataset_path,
+        )
 
     # Load the dataset.
-    with open(dataset_path, encoding='utf-8') as f:
+    with open(dataset_path, encoding="utf-8") as f:
         dataset = json.load(f)
     # Filter out the conversations with less than 2 turns.
     dataset = [data for data in dataset if len(data["conversations"]) >= 2]
     dataset = [data["conversations"][0]["value"] for data in dataset]
-    
-    return __sample_requests(dataset, num_requests, tokenizer, prompt_length_min, prompt_length_max, seed)
+
+    return __sample_requests(
+        dataset, num_requests, tokenizer, prompt_length_min, prompt_length_max, seed
+    )
+
 
 def sample_squad_v2_qa_requests(
     dataset_path: str,
-    num_requests: int, 
-    tokenizer: BaseTokenizer, 
-    prompt_length_min: int = 32, 
-    prompt_length_max: int = 64, 
-    seed: Optional[int] = None
+    num_requests: int,
+    tokenizer: BaseTokenizer,
+    prompt_length_min: int = 32,
+    prompt_length_max: int = 64,
+    seed: Optional[int] = None,
 ) -> List[Tuple[str, int]]:
     from datasets import load_dataset
 
     if os.path.exists(dataset_path):
-        ds = load_dataset(dataset_path)['train']
+        ds = load_dataset(dataset_path)["train"]
     else:
-        ds = load_dataset("rajpurkar/squad_v2", cache_dir=dataset_path)['train']
-        
-    
-    ds = [f"{data['context']}\n{data['question']}" for data in ds]
+        ds = load_dataset("rajpurkar/squad_v2", cache_dir=dataset_path)["train"]
 
-    return __sample_requests(ds, num_requests, tokenizer, prompt_length_min, prompt_length_max, seed)
-    
+    ds = [f"{data['context']}\n{data['question']}" for data in ds]
 
+    return __sample_requests(
+        ds, num_requests, tokenizer, prompt_length_min, prompt_length_max, seed
+    )
diff --git a/aiu_fms_testing_utils/utils/aiu_setup.py b/aiu_fms_testing_utils/utils/aiu_setup.py
index fb9a3df..bbd8848 100644
--- a/aiu_fms_testing_utils/utils/aiu_setup.py
+++ b/aiu_fms_testing_utils/utils/aiu_setup.py
@@ -3,21 +3,24 @@
 # ==============================================================
 # Common utilities
 # ==============================================================
-#-------------
+# -------------
 # Discover the world size and my rank (envars set by torchrun)
 # https://pytorch.org/docs/stable/elastic/run.html#environment-variables
-#-------------
+# -------------
 local_rank = int(os.getenv("LOCAL_RANK", 0))
 rank = int(os.getenv("RANK", 0))
 world_rank = rank
 world_size = int(os.getenv("WORLD_SIZE", 1))
 
+
 def dprint_str(text):
     return f"[{rank:2d}/{world_size:2d}]: {text}"
 
+
 def dprint(text):
     print(dprint_str(text))
 
+
 # ==============================================================
 # Common setup
 # ==============================================================
diff --git a/aiu_fms_testing_utils/utils/paged.py b/aiu_fms_testing_utils/utils/paged.py
index 239d229..7228677 100644
--- a/aiu_fms_testing_utils/utils/paged.py
+++ b/aiu_fms_testing_utils/utils/paged.py
@@ -5,11 +5,12 @@
 import torch
 import fms.utils.spyre.paged
 
+
 def adjust_inputs_to_batch(input_ids: torch.Tensor, **padding_kwargs):
     """
-    Adjusts the inputs to a batch. Batch size 1 cannot be handled since we want a symbolic shape for the batch 
+    Adjusts the inputs to a batch. Batch size 1 cannot be handled since we want a symbolic shape for the batch
     and pytorch automatically sets size 1 dimensions as static
-    
+
     Note: This is fixed in pytorch 2.7
     """
     input_ids = input_ids[0].repeat(2, 1)
@@ -23,6 +24,7 @@ def adjust_inputs_to_batch(input_ids: torch.Tensor, **padding_kwargs):
         kwargs["position_ids"] = position_ids[0].repeat(2, 1)
     return input_ids, kwargs
 
+
 # FIXME: We should use default generate, but that will require a larger re-work of generate
 def generate(
     model: Union[Callable, torch.nn.Module],
@@ -88,7 +90,7 @@ def generate(
     if isinstance(input_ids, torch.Tensor):
         if len(input_ids.shape) == 1:
             input_ids = input_ids.unsqueeze(0)
-        
+
         is_batch = input_ids.shape[0] > 1
         # our model requires batch dimension
         if not is_batch:
@@ -106,8 +108,18 @@ def generate(
     result = input_ids
     next_input = input_ids
     BLOCK_SIZE = 64
-    _MAX_BATCH = int(os.environ.setdefault("VLLM_DT_MAX_BATCH_SIZE", str(input_ids.size(0))))
-    _MAX_CONTEXT_LENGTH = int(os.environ.setdefault("VLLM_DT_MAX_CONTEXT_LEN", str((((input_ids.size(1) + max_new_tokens - 1) // BLOCK_SIZE) + 1) * BLOCK_SIZE)))
+    _MAX_BATCH = int(
+        os.environ.setdefault("VLLM_DT_MAX_BATCH_SIZE", str(input_ids.size(0)))
+    )
+    _MAX_CONTEXT_LENGTH = int(
+        os.environ.setdefault(
+            "VLLM_DT_MAX_CONTEXT_LEN",
+            str(
+                (((input_ids.size(1) + max_new_tokens - 1) // BLOCK_SIZE) + 1)
+                * BLOCK_SIZE
+            ),
+        )
+    )
     NUM_BLOCKS = (_MAX_BATCH * _MAX_CONTEXT_LENGTH) // BLOCK_SIZE
     max_seq_len = input_ids.size(1) + max_new_tokens
     if hasattr(model, "head"):
@@ -332,4 +344,4 @@ def generate(
 
     if timing != "":
         return result, times
-    return result
\ No newline at end of file
+    return result
diff --git a/scripts/generate_metrics.py b/scripts/generate_metrics.py
index f50ec59..9aae44d 100644
--- a/scripts/generate_metrics.py
+++ b/scripts/generate_metrics.py
@@ -7,8 +7,17 @@
 
 import torch
 from torch import distributed as dist
-from aiu_fms_testing_utils.testing.validation import capture_level_1_metrics, extract_validation_information, LogitsExtractorHook, get_default_validation_prefix, load_validation_information, print_failed_cases, \
-    validate_level_0, GoldenTokenHook, top_k_loss_calculator
+from aiu_fms_testing_utils.testing.validation import (
+    capture_level_1_metrics,
+    extract_validation_information,
+    LogitsExtractorHook,
+    get_default_validation_prefix,
+    load_validation_information,
+    print_failed_cases,
+    validate_level_0,
+    GoldenTokenHook,
+    top_k_loss_calculator,
+)
 from aiu_fms_testing_utils.utils import ids_for_prompt, sample_sharegpt_requests
 from fms.models import get_model
 from fms.utils import tokenizers
@@ -83,19 +92,19 @@
     "--topk_per_token",
     type=int,
     help="top k values per token to generate loss on",
-    default=20
+    default=20,
 )
 parser.add_argument(
     "--num_test_tokens_per_sequence",
     type=int,
     help="number of tokens in test. For instance, if max_new_tokens=128 and num_test_tokens_per_sequence=256, this means we will generate data over 2 sample prompts. If not set, will be set to max_new_tokens",
-    default=None
+    default=None,
 )
 parser.add_argument(
     "--extra_get_model_kwargs",
-    nargs='*',
+    nargs="*",
     default={},
-    help="Use this to override model configuration values to get model. Example: --extra_get_model_kwargs nlayers=2,..."
+    help="Use this to override model configuration values to get model. Example: --extra_get_model_kwargs nlayers=2,...",
 )
 parser.add_argument(
     "--distributed",
@@ -105,7 +114,7 @@
 parser.add_argument(
     "--skip_computation",
     action="store_true",
-    help="Set this if the output is already assumed to be computed and would like to regenerate metrics without model loading or computation"
+    help="Set this if the output is already assumed to be computed and would like to regenerate metrics without model loading or computation",
 )
 local_rank = int(os.getenv("LOCAL_RANK", 0))
 world_size = int(os.getenv("WORLD_SIZE", 1))
@@ -120,14 +129,20 @@
 
 extra_get_model_kwargs = {}
 for a in args.extra_get_model_kwargs:
-     a_split = a.split("=")
-     try:
+    a_split = a.split("=")
+    try:
         extra_get_model_kwargs[a_split[0]] = ast.literal_eval(a_split[1])
-     except ValueError:
+    except ValueError:
         extra_get_model_kwargs[a_split[0]] = a_split[1]
 
 # this follows the same pattern of naming in test_shapes. This way we can save and re-use for quicker shape testing.
-prefix = get_default_validation_prefix(args.variant, args.max_new_tokens, args.batch_size, args.min_pad_length, args.default_dtype)
+prefix = get_default_validation_prefix(
+    args.variant,
+    args.max_new_tokens,
+    args.batch_size,
+    args.min_pad_length,
+    args.default_dtype,
+)
 if os.path.exists(os.path.join(args.output_dir, f"{prefix}.prob_mean.csv")):
     print("skipping metric generation as it has already been done")
     exit(0)
@@ -148,11 +163,12 @@
 
 torch.set_grad_enabled(False)
 
+
 def find_eos_index(reference_tokens, eos_token_id):
     result = []
     for sentence in reference_tokens:
         found_eos = False
-        for token_idx, token in enumerate(sentence[args.min_pad_length:]):
+        for token_idx, token in enumerate(sentence[args.min_pad_length :]):
             if token.item() == eos_token_id:
                 found_eos = True
                 result.append(token_idx)
@@ -161,13 +177,20 @@ def find_eos_index(reference_tokens, eos_token_id):
             result.append(args.max_new_tokens)
     return result
 
+
 def filter_before_eos(l, filter_indexes):
     from itertools import groupby
-    filtered_results = [list(g)[:filter_indexes[k]] for k, g in groupby(l, key=lambda x: x[0])]
+
+    filtered_results = [
+        list(g)[: filter_indexes[k]] for k, g in groupby(l, key=lambda x: x[0])
+    ]
     return [item for sublist in filtered_results for item in sublist]
 
+
 def __prepare_inputs(batch_size, seq_length, tokenizer, seed=0):
-    prompts_and_sizes = sample_sharegpt_requests(args.sharegpt_path, batch_size, tokenizer, seq_length // 2, seq_length, seed)
+    prompts_and_sizes = sample_sharegpt_requests(
+        args.sharegpt_path, batch_size, tokenizer, seq_length // 2, seq_length, seed
+    )
     prompt_list = []
     for prompt, _ in prompts_and_sizes:
         prompt_list.append(ids_for_prompt(prompt, tokenizer))
@@ -175,13 +198,15 @@ def __prepare_inputs(batch_size, seq_length, tokenizer, seed=0):
     input_ids, padding_kwargs = pad_input_ids(prompt_list, min_pad_length=seq_length)
     return input_ids, padding_kwargs
 
+
 def write_csv(l, path, metric):
-    with open(path, 'w') as f:
-        f.write(f'{metric}\n')
+    with open(path, "w") as f:
+        f.write(f"{metric}\n")
         for t in l:
-            f.write(f"{t[2].item()}\n") 
+            f.write(f"{t[2].item()}\n")
         f.close()
 
+
 # prepare the cuda model
 if not args.skip_computation:
     cuda_model = get_model(
@@ -212,7 +237,9 @@ def write_csv(l, path, metric):
     cpu_model.eval()
     print("loaded cpu model")
 
-    ids, padding_kwargs = __prepare_inputs(args.batch_size, args.min_pad_length, tokenizer)
+    ids, padding_kwargs = __prepare_inputs(
+        args.batch_size, args.min_pad_length, tokenizer
+    )
 
     # first test validation level 0
     cpu_validation_info = extract_validation_information(
@@ -221,7 +248,7 @@ def write_csv(l, path, metric):
         args.max_new_tokens,
         LogitsExtractorHook(),
         attn_algorithm="math",
-        **padding_kwargs
+        **padding_kwargs,
     )
     cpu_static_tokens = cpu_validation_info.get_info("tokens")
     print("extracted cpu validation information")
@@ -236,24 +263,41 @@ def write_csv(l, path, metric):
         args.max_new_tokens,
         None,
         only_last_token=True,
-        **{k: v.to("cuda") for k,v in padding_kwargs.items()}
+        **{k: v.to("cuda") for k, v in padding_kwargs.items()},
     )
     cuda_static_tokens = cuda_validation_info.get_info("tokens")
     failed_responses = validate_level_0(cpu_static_tokens, cuda_static_tokens)
 
     print("extracted cuda validation information level 0")
     if local_rank == 0:
-        if len(failed_responses) != 0:    
-            print_failed_cases(failed_responses, cpu_static_tokens, cuda_static_tokens, tokenizer)
+        if len(failed_responses) != 0:
+            print_failed_cases(
+                failed_responses, cpu_static_tokens, cuda_static_tokens, tokenizer
+            )
 
 num_test_tokens_per_sequence = args.num_test_tokens_per_sequence
 if num_test_tokens_per_sequence is None:
     num_test_tokens_per_sequence = args.max_new_tokens
 
-cross_entropy = lambda r, t: torch.nn.CrossEntropyLoss()(r, t.softmax(dim=1).to(dtype=torch.float32))
-prob_mean = lambda r, t: torch.mean((r.softmax(dim=1).to(dtype=torch.float32) / t.softmax(dim=1).to(dtype=torch.float32)) - 1.0)
-prob_std = lambda r, t: torch.std(r.softmax(dim=1).to(dtype=torch.float32) / t.softmax(dim=1).to(dtype=torch.float32))
-diff_mean = lambda r, t: torch.mean(torch.abs(r.softmax(dim=1).to(dtype=torch.float32) - t.softmax(dim=1).to(dtype=torch.float32)))
+cross_entropy = lambda r, t: torch.nn.CrossEntropyLoss()(
+    r, t.softmax(dim=1).to(dtype=torch.float32)
+)
+prob_mean = lambda r, t: torch.mean(
+    (
+        r.softmax(dim=1).to(dtype=torch.float32)
+        / t.softmax(dim=1).to(dtype=torch.float32)
+    )
+    - 1.0
+)
+prob_std = lambda r, t: torch.std(
+    r.softmax(dim=1).to(dtype=torch.float32) / t.softmax(dim=1).to(dtype=torch.float32)
+)
+diff_mean = lambda r, t: torch.mean(
+    torch.abs(
+        r.softmax(dim=1).to(dtype=torch.float32)
+        - t.softmax(dim=1).to(dtype=torch.float32)
+    )
+)
 
 prob_mean_metrics = []
 prob_std_metrics = []
@@ -265,10 +309,16 @@ def write_csv(l, path, metric):
     cuda_path = os.path.join(args.output_dir, f"{prefix}.cuda_validation_info.{i}.out")
     if os.path.exists(cpu_path) and os.path.exists(cuda_path):
         print(f"found the logits at {cpu_path}, reusing")
-        cpu_validation_info = load_validation_information(cpu_path, "logits", args.batch_size, tokenizer)
-        cuda_validation_info = load_validation_information(cuda_path, "logits", args.batch_size, tokenizer)
+        cpu_validation_info = load_validation_information(
+            cpu_path, "logits", args.batch_size, tokenizer
+        )
+        cuda_validation_info = load_validation_information(
+            cuda_path, "logits", args.batch_size, tokenizer
+        )
     elif not args.skip_computation:
-        ids, padding_kwargs = __prepare_inputs(args.batch_size, args.min_pad_length, tokenizer, i)
+        ids, padding_kwargs = __prepare_inputs(
+            args.batch_size, args.min_pad_length, tokenizer, i
+        )
 
         # only need to compute this once if we aren't generating more test data
         if num_test_tokens_per_sequence > args.max_new_tokens:
@@ -278,7 +328,7 @@ def write_csv(l, path, metric):
                 args.max_new_tokens,
                 LogitsExtractorHook(),
                 attn_algorithm="math",
-                **padding_kwargs
+                **padding_kwargs,
             )
 
         # generate aiu validation info
@@ -288,7 +338,7 @@ def write_csv(l, path, metric):
             args.max_new_tokens,
             GoldenTokenHook(cpu_validation_info.get_info("tokens"), "cuda"),
             only_last_token=True,
-            **{k: v.to("cuda") for k,v in padding_kwargs.items()}
+            **{k: v.to("cuda") for k, v in padding_kwargs.items()},
         )
 
         print("extracted cuda validation information level 1")
@@ -296,8 +346,10 @@ def write_csv(l, path, metric):
         if local_rank == 0:
             cpu_validation_info.save(cpu_path)
             cuda_validation_info.save(cuda_path)
-    
-    eos_indexes = find_eos_index(cpu_validation_info.get_info("tokens"), tokenizer.eos_token_id)
+
+    eos_indexes = find_eos_index(
+        cpu_validation_info.get_info("tokens"), tokenizer.eos_token_id
+    )
     level_1_metrics = capture_level_1_metrics(
         cpu_validation_info.get_info("logits"),
         cuda_validation_info.get_info("logits"),
@@ -327,7 +379,21 @@ def write_csv(l, path, metric):
     prob_diff_metrics.extend(filter_before_eos(level_1_metrics, eos_indexes))
 
 if local_rank == 0:
-    write_csv(prob_mean_metrics, os.path.join(args.output_dir, f"{prefix}.prob_mean.csv"), "prob_mean")
-    write_csv(prob_std_metrics, os.path.join(args.output_dir, f"{prefix}.prob_std.csv"), "prob_std")
-    write_csv(prob_ce_loss_metrics, os.path.join(args.output_dir, f"{prefix}.ce.csv"), "ce")
-    write_csv(prob_diff_metrics, os.path.join(args.output_dir, f"{prefix}.diff_mean.csv"), "diff_mean")
+    write_csv(
+        prob_mean_metrics,
+        os.path.join(args.output_dir, f"{prefix}.prob_mean.csv"),
+        "prob_mean",
+    )
+    write_csv(
+        prob_std_metrics,
+        os.path.join(args.output_dir, f"{prefix}.prob_std.csv"),
+        "prob_std",
+    )
+    write_csv(
+        prob_ce_loss_metrics, os.path.join(args.output_dir, f"{prefix}.ce.csv"), "ce"
+    )
+    write_csv(
+        prob_diff_metrics,
+        os.path.join(args.output_dir, f"{prefix}.diff_mean.csv"),
+        "diff_mean",
+    )
diff --git a/scripts/inference.py b/scripts/inference.py
index f67754a..d77069d 100644
--- a/scripts/inference.py
+++ b/scripts/inference.py
@@ -36,7 +36,7 @@
     type=str,
     choices=["cuda", "cpu", "aiu", "aiu-senulator"],
     default="cuda",
-    help="The device to run the model on"
+    help="The device to run the model on",
 )
 parser.add_argument(
     "--architecture",
@@ -213,10 +213,11 @@
     help="Number of iterations of inference to perform. Used for variance performance capture.",
 )
 parser.add_argument(
-    '-v', '--verbose',
-    action='count',
+    "-v",
+    "--verbose",
+    action="count",
     default=0,
-    help="Set verbosity level (pass flag as `-v`, `-vv`, `-vvv`)"
+    help="Set verbosity level (pass flag as `-v`, `-vv`, `-vvv`)",
 )
 parser.add_argument(
     "--attention_type",
@@ -236,12 +237,14 @@
     if "aiu" in args.device_type:
         try:
             from fms_mo.aiu_addons.gptq import gptq_aiu_adapter, gptq_aiu_linear
+
             print("Loaded `aiu_addons` functionalities")
         except:
             raise ImportError("Failed to import GPTQ addons from fms-mo.")
 elif args.quantization == "int8":
     try:
         from fms_mo.aiu_addons.i8i8 import i8i8_aiu_adapter, i8i8_aiu_linear
+
         print("Loaded `aiu_addons` functionalities")
     except:
         raise ImportError("Failed to import INT8 addons from fms-mo.")
@@ -356,7 +359,9 @@
 fused_weights = not args.unfuse_weights
 if args.quantization == "gptq":
     if fused_weights and is_aiu_backend:
-        raise ValueError("GPTQ checkpoints on AIU must always run with --unfuse_weights")
+        raise ValueError(
+            "GPTQ checkpoints on AIU must always run with --unfuse_weights"
+        )
     if default_dtype is not None:
         raise ValueError(
             "GPTQ default_dtype must be None to preserve the checkpoint data types."
@@ -373,7 +378,7 @@
 
     qconfig_path = args.model_path + "/quantize_config.json"
     if os.path.exists(qconfig_path):
-        with open(qconfig_path, 'r') as f:
+        with open(qconfig_path, "r") as f:
             dprint(f"loading quantization config from {qconfig_path}")
             qconfig = json.load(f)
             group_size = qconfig["group_size"]
@@ -397,7 +402,9 @@
     }
 elif args.quantization == "int8":
     if fused_weights and is_aiu_backend:
-        raise ValueError("INT8 checkpoints on AIU must always run with --unfuse_weights")
+        raise ValueError(
+            "INT8 checkpoints on AIU must always run with --unfuse_weights"
+        )
     if default_dtype is not None:
         raise ValueError(
             "INT8 default_dtype must be None to preserve the checkpoint data types."
@@ -425,17 +432,15 @@ def select_int8_module(
         elif any("roberta" in p.lower() for p in [args.model_path, args.architecture]):
             smoothquant_layers = ["query", "key", "value", "w1"]
         else:
-            raise NotImplementedError(
-                "INT8 architecture does not support smoothquant."
-            )
+            raise NotImplementedError("INT8 architecture does not support smoothquant.")
     else:
         smoothquant_layers = []
 
     linear_config = {
         "linear_type": partial(
             select_int8_module,
-            smoothquant = args.int8_smoothquant,
-            smoothquant_layers = smoothquant_layers,
+            smoothquant=args.int8_smoothquant,
+            smoothquant_layers=smoothquant_layers,
         ),
         "weight_per_channel": args.int8_weight_per_channel,
         "activ_quant_type": args.int8_activ_quant_type,
@@ -443,12 +448,12 @@ def select_int8_module(
 else:
     linear_config = {"linear_type": "torch_linear"}
 
-dprint("="*60)
+dprint("=" * 60)
 dprint(f"model_path={args.model_path}")
 dprint(f"{linear_config=}")
 dprint(f"{fused_weights=}")
 dprint(f"data_type={default_dtype}")
-dprint("="*60 + "\n")
+dprint("=" * 60 + "\n")
 
 model = get_model(
     args.architecture,
@@ -465,13 +470,27 @@ def select_int8_module(
 
 if args.quantization in ["gptq", "int8"]:
     if rank == 0 and args.verbose > 0:
-        dprint("PARAMS:\n" + "\n".join(f"{k:60} {str(v.dtype):15} {str(v.device):10} {list(v.size())}" for k,v in model.named_parameters()))
-        dprint("BUFFERS:\n" + "\n".join(f"{k:60} {str(v.dtype):15} {str(v.device):10} {list(v.size())}" for k,v in model.named_buffers()))
-        dprint("="*60 + "\n")
+        dprint(
+            "PARAMS:\n"
+            + "\n".join(
+                f"{k:60} {str(v.dtype):15} {str(v.device):10} {list(v.size())}"
+                for k, v in model.named_parameters()
+            )
+        )
+        dprint(
+            "BUFFERS:\n"
+            + "\n".join(
+                f"{k:60} {str(v.dtype):15} {str(v.device):10} {list(v.size())}"
+                for k, v in model.named_buffers()
+            )
+        )
+        dprint("=" * 60 + "\n")
     if args.architecture == "llama":
-        dprint("[NOTE] In Llama models, it's OK for bias and rotary embeddings to be marked as unused keys.")
+        dprint(
+            "[NOTE] In Llama models, it's OK for bias and rotary embeddings to be marked as unused keys."
+        )
     dprint(model)
-    dprint("="*60 + "\n")
+    dprint("=" * 60 + "\n")
 
 tokenizer = tokenizers.get_tokenizer(args.tokenizer)
 model.eval()
@@ -482,7 +501,9 @@ def select_int8_module(
 if args.compile:
     dprint("compiling model")
     if is_aiu_backend:
-        model.compile(backend="sendnn", options={'sendnn.dynamic': args.compile_dynamic_sendnn})
+        model.compile(
+            backend="sendnn", options={"sendnn.dynamic": args.compile_dynamic_sendnn}
+        )
     else:
         # compiling can make first inference pass slow
         model.compile(mode=args.compile_mode, backend=args.compile_backend)
@@ -538,9 +559,9 @@ def truncate_prompts_to_max_length(prompts, max_len, max_allowed_length):
     assert len(prompt_file_paths) > 0, f"Can't find any prompt files at {prompt_path}"
 
     # Check if we have enough files
-    assert (
-        len(prompt_file_paths) >= args.batch_size
-    ), f"Not enough prompt files at {prompt_path} for a batch size of {args.batch_size}"
+    assert len(prompt_file_paths) >= args.batch_size, (
+        f"Not enough prompt files at {prompt_path} for a batch size of {args.batch_size}"
+    )
 
     prompts = []
     for i, prompt_file_path in enumerate(prompt_file_paths):
@@ -672,7 +693,7 @@ def infer(use_cache, do_sample, warmup):
         timing=args.timing,
         eos_token_id=eos_token_id,
         extra_kwargs=extra_generation_kwargs,
-        **attention_specific_kwargs
+        **attention_specific_kwargs,
     )
     if args.timing != "":
         result, timings = result
@@ -680,14 +701,24 @@ def infer(use_cache, do_sample, warmup):
             dprint(f"E2E timing information: {timings[0]:.3f}s")
         elif args.timing == "per-token":
             if not warmup:
-                dprint(f"First-token latency: {timings[0]*1000:.3f} ms")
-                dprint(f"Average next-token latency (including first token): {np.mean(timings)*1000:.3f} ms")
+                dprint(f"First-token latency: {timings[0] * 1000:.3f} ms")
+                dprint(
+                    f"Average next-token latency (including first token): {np.mean(timings) * 1000:.3f} ms"
+                )
                 if len(timings) > 1:
-                    dprint(f"Average next-token latency: {np.mean(timings[1:])*1000:.3f} ms")
-                    dprint(f"Max next-token latency: {np.max(timings[1:])*1000:.3f} ms (token #{np.argmax(timings[1:]) + 2})")
-                    dprint(f"Min next-token latency: {np.min(timings[1:])*1000:.3f} ms (token #{np.argmin(timings[1:]) + 2})")
-                    dprint(f"Std deviation of next-token latencies: {np.std(timings[1:])*1000:.3f} ms")
-            timings = [f"{t*1000:.3f}" for t in timings]
+                    dprint(
+                        f"Average next-token latency: {np.mean(timings[1:]) * 1000:.3f} ms"
+                    )
+                    dprint(
+                        f"Max next-token latency: {np.max(timings[1:]) * 1000:.3f} ms (token #{np.argmax(timings[1:]) + 2})"
+                    )
+                    dprint(
+                        f"Min next-token latency: {np.min(timings[1:]) * 1000:.3f} ms (token #{np.argmin(timings[1:]) + 2})"
+                    )
+                    dprint(
+                        f"Std deviation of next-token latencies: {np.std(timings[1:]) * 1000:.3f} ms"
+                    )
+            timings = [f"{t * 1000:.3f}" for t in timings]
             dprint(f"Per-token timing information: {', '.join(timings)} ms")
     if len(result.shape) == 1:
         result = result.unsqueeze(0)
@@ -706,7 +737,14 @@ def infer(use_cache, do_sample, warmup):
     dprint(f"compilation warmup")
     pt_compile_model_time = time.time()
     if args.device_type == "aiu":  # only run warmup for AIU, no need for senulator
-        warmup_model(model, ids, args.max_new_tokens, args.compile_dynamic_sendnn, attn_type=args.attention_type, **extra_generation_kwargs)
+        warmup_model(
+            model,
+            ids,
+            args.max_new_tokens,
+            args.compile_dynamic_sendnn,
+            attn_type=args.attention_type,
+            **extra_generation_kwargs,
+        )
         aiu_warmup_time = time.time()
         for sample, cache in itertools.product(do_sample, use_cache):
             infer(cache, sample, True)
diff --git a/scripts/roberta.py b/scripts/roberta.py
index 124b09f..a7b9c18 100644
--- a/scripts/roberta.py
+++ b/scripts/roberta.py
@@ -36,55 +36,68 @@
 # ==============================================================
 if __name__ == "__main__":
     # Number of batches to create
-    NUM_BATCHES=1
+    NUM_BATCHES = 1
 
-    #-------------
+    # -------------
     # Command line argument parsing
-    #-------------
-    parser = argparse.ArgumentParser(description="PyTorch Small Toy Tensor Parallel Example")
-    parser.add_argument(      "--backend",       help="PyTorch Dynamo compiler backend", default='cpu', choices=['cpu', 'aiu'])
+    # -------------
+    parser = argparse.ArgumentParser(
+        description="PyTorch Small Toy Tensor Parallel Example"
+    )
+    parser.add_argument(
+        "--backend",
+        help="PyTorch Dynamo compiler backend",
+        default="cpu",
+        choices=["cpu", "aiu"],
+    )
     pargs = parser.parse_args()
 
-    if pargs.backend == 'aiu':
-        dynamo_backend = 'sendnn'
+    if pargs.backend == "aiu":
+        dynamo_backend = "sendnn"
     else:
-        dynamo_backend = 'inductor'
+        dynamo_backend = "inductor"
 
     is_distributed = world_size > 1
     if is_distributed:
         # Initialize the process group
-        torch.distributed.init_process_group(backend="gloo", rank=world_rank, world_size=world_size)
+        torch.distributed.init_process_group(
+            backend="gloo", rank=world_rank, world_size=world_size
+        )
         # Looks like a string compare, but is actually comparing the components
         # https://github.com/pytorch/pytorch/blob/b5be4d8c053e22672719b9a33386b071daf9860d/torch/torch_version.py#L10-L16
-        if torch.__version__ < '2.3.0':
+        if torch.__version__ < "2.3.0":
             # Fix until PyTorch 2.3
-            torch._C._distributed_c10d._register_process_group("default", torch.distributed.group.WORLD)
+            torch._C._distributed_c10d._register_process_group(
+                "default", torch.distributed.group.WORLD
+            )
 
-    #-------------
+    # -------------
     # Setup AIU specific environment variables
-    #-------------
+    # -------------
     if "sendnn" in dynamo_backend:
         aiu_setup.aiu_dist_setup(world_rank, world_size)
 
-    #-------------
+    # -------------
     # Display some diagnostics
-    #-------------
+    # -------------
     if 0 == world_rank:
-        dprint("-"*60)
-        dprint(f"Python Version  : {sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro}")
+        dprint("-" * 60)
+        dprint(
+            f"Python Version  : {sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro}"
+        )
         dprint(f"PyTorch Version : {torch.__version__}")
         dprint(f"Dynamo Backend  : {pargs.backend} -> {dynamo_backend}")
-        if pargs.backend == 'aiu':
+        if pargs.backend == "aiu":
             for peer_rank in range(world_size):
-                pcie_env_str="AIU_WORLD_RANK_"+str(peer_rank)
+                pcie_env_str = "AIU_WORLD_RANK_" + str(peer_rank)
                 dprint(f"PCI Addr. for Rank {peer_rank} : {os.environ[pcie_env_str]}")
-        print("-"*60)
+        print("-" * 60)
     if is_distributed:
         torch.distributed.barrier()
 
-    #-------------
+    # -------------
     # Create the model
-    #-------------
+    # -------------
     if 0 == world_rank:
         dprint(f"Creating the model...")
     # model_name = "roberta-base"
@@ -111,20 +124,20 @@
     #     variant=model_name
     # )
 
-    #-------------
+    # -------------
     # Compile the model
-    #-------------
+    # -------------
     if 0 == world_rank:
         dprint(f"Compiling the model...")
     the_compiled_model = torch.compile(hf_model_fms, backend=dynamo_backend)
-    the_compiled_model.eval() # inference only mode
+    the_compiled_model.eval()  # inference only mode
     torch.set_grad_enabled(False)
 
-    #-------------
+    # -------------
     # Run the model
     # - First run the compiler will activate to create the artifacts
     # - Second run there is no compiler involved
-    #-------------
+    # -------------
     if is_distributed:
         torch.distributed.barrier()
 
@@ -150,9 +163,9 @@
     if 0 == world_rank:
         dprint(f"Answer: ({the_output[0]['score']:6.5f}) {the_output[0]['sequence']}")
 
-    #-------------
+    # -------------
     # Cleanup
-    #-------------
+    # -------------
     if 0 == world_rank:
         dprint(f"Done")
     if is_distributed:
diff --git a/scripts/small-toy.py b/scripts/small-toy.py
index a6965e4..a7cf0d2 100644
--- a/scripts/small-toy.py
+++ b/scripts/small-toy.py
@@ -18,6 +18,7 @@
 # Import AIU Libraries
 from torch_sendnn import torch_sendnn
 
+
 # ==============================================================
 # Toy Encoder Model
 # ==============================================================
@@ -33,21 +34,30 @@ def __init__(self):
         self._linear_nets = torch.nn.ModuleList()
         for n in range(self.LAYERS_N):
             torch.manual_seed(42)
-            block = FeedForwardBlock(self.INPUT_N, hidden_grow_factor=self.HIDDEN_FACTOR, activation_fn=torch.nn.ReLU(), p_dropout=0)
+            block = FeedForwardBlock(
+                self.INPUT_N,
+                hidden_grow_factor=self.HIDDEN_FACTOR,
+                activation_fn=torch.nn.ReLU(),
+                p_dropout=0,
+            )
             self._linear_nets.append(block)
         self._linear_nets.append(torch.nn.ReLU())
 
     def copy_weights(self, par_model, seq_model):
         self_parent_layer = self if par_model is None else par_model
         with torch.no_grad():
-            for (seq_name, seq_layer), (self_name, self_layer) in zip(seq_model.named_children(), self_parent_layer.named_children()):
+            for (seq_name, seq_layer), (self_name, self_layer) in zip(
+                seq_model.named_children(), self_parent_layer.named_children()
+            ):
                 if hasattr(self_layer, "load_weights"):
-                    self_layer.load_weights( {
-                        "w1.weight": seq_layer.w1.weight,
-                        "w1.bias": seq_layer.w1.bias,
-                        "w2.weight": seq_layer.w2.weight,
-                        "w2.bias": seq_layer.w2.bias,
-                        })
+                    self_layer.load_weights(
+                        {
+                            "w1.weight": seq_layer.w1.weight,
+                            "w1.bias": seq_layer.w1.bias,
+                            "w2.weight": seq_layer.w2.weight,
+                            "w2.bias": seq_layer.w2.bias,
+                        }
+                    )
                 else:
                     self.copy_weights(self_layer, seq_layer)
 
@@ -57,60 +67,74 @@ def forward(self, x):
             _in = net(_in)
         return _in
 
+
 # ==============================================================
 # Main
 # ==============================================================
 if __name__ == "__main__":
     # Number of batches to create
-    NUM_BATCHES=1
+    NUM_BATCHES = 1
 
-    #-------------
+    # -------------
     # Command line argument parsing
-    #-------------
-    parser = argparse.ArgumentParser(description="PyTorch Small Toy Tensor Parallel Example")
-    parser.add_argument(      "--backend",       help="PyTorch Dynamo compiler backend", default='cpu', choices=['cpu', 'aiu'])
+    # -------------
+    parser = argparse.ArgumentParser(
+        description="PyTorch Small Toy Tensor Parallel Example"
+    )
+    parser.add_argument(
+        "--backend",
+        help="PyTorch Dynamo compiler backend",
+        default="cpu",
+        choices=["cpu", "aiu"],
+    )
     pargs = parser.parse_args()
 
-    if pargs.backend == 'aiu':
-        dynamo_backend = 'sendnn'
+    if pargs.backend == "aiu":
+        dynamo_backend = "sendnn"
     else:
-        dynamo_backend = 'inductor'
+        dynamo_backend = "inductor"
 
     is_distributed = world_size > 1
     if is_distributed:
         # Initialize the process group
-        torch.distributed.init_process_group(backend="gloo", rank=world_rank, world_size=world_size)
+        torch.distributed.init_process_group(
+            backend="gloo", rank=world_rank, world_size=world_size
+        )
         # Looks like a string compare, but is actually comparing the components
         # https://github.com/pytorch/pytorch/blob/b5be4d8c053e22672719b9a33386b071daf9860d/torch/torch_version.py#L10-L16
-        if torch.__version__ < '2.3.0':
+        if torch.__version__ < "2.3.0":
             # Fix until PyTorch 2.3
-            torch._C._distributed_c10d._register_process_group("default", torch.distributed.group.WORLD)
+            torch._C._distributed_c10d._register_process_group(
+                "default", torch.distributed.group.WORLD
+            )
 
-    #-------------
+    # -------------
     # Setup AIU specific environment variables
-    #-------------
+    # -------------
     if "sendnn" in dynamo_backend:
         aiu_setup.aiu_dist_setup(world_rank, world_size)
 
-    #-------------
+    # -------------
     # Display some diagnostics
-    #-------------
+    # -------------
     if 0 == world_rank:
-        dprint("-"*60)
-        dprint(f"Python Version  : {sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro}")
+        dprint("-" * 60)
+        dprint(
+            f"Python Version  : {sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro}"
+        )
         dprint(f"PyTorch Version : {torch.__version__}")
         dprint(f"Dynamo Backend  : {pargs.backend} -> {dynamo_backend}")
-        if pargs.backend == 'aiu':
+        if pargs.backend == "aiu":
             for peer_rank in range(world_size):
-                pcie_env_str="AIU_WORLD_RANK_"+str(peer_rank)
+                pcie_env_str = "AIU_WORLD_RANK_" + str(peer_rank)
                 dprint(f"PCI Addr. for Rank {peer_rank} : {os.environ[pcie_env_str]}")
-        print("-"*60)
+        print("-" * 60)
     if is_distributed:
         torch.distributed.barrier()
 
-    #-------------
+    # -------------
     # Create the model
-    #-------------
+    # -------------
     if 0 == world_rank:
         dprint(f"Creating the model...")
     the_model = ToyModelFM()
@@ -118,20 +142,20 @@ def forward(self, x):
         # Create a Tensor Parallel version of the model
         apply_tp(the_model, torch.distributed.group.WORLD)
 
-    #-------------
+    # -------------
     # Compile the model
-    #-------------
+    # -------------
     if 0 == world_rank:
         dprint(f"Compiling the model...")
     the_compiled_model = torch.compile(the_model, backend=dynamo_backend)
-    the_compiled_model.eval() # inference only mode
+    the_compiled_model.eval()  # inference only mode
     torch.set_grad_enabled(False)
 
-    #-------------
+    # -------------
     # Run the model
     # - First run the compiler will activate to create the artifacts
     # - Second run there is no compiler involved
-    #-------------
+    # -------------
     if is_distributed:
         torch.distributed.barrier()
 
@@ -148,9 +172,9 @@ def forward(self, x):
         dprint(f"Running model: Second Time...")
     the_outputs = the_compiled_model(the_inputs)
 
-    #-------------
+    # -------------
     # Cleanup
-    #-------------
+    # -------------
     if 0 == world_rank:
         dprint(f"Done")
     if is_distributed:
diff --git a/scripts/validation.py b/scripts/validation.py
index c5b1449..bdbc01f 100644
--- a/scripts/validation.py
+++ b/scripts/validation.py
@@ -15,7 +15,17 @@
 from fms.utils.generation import pad_input_ids
 from torch import distributed as dist
 from aiu_fms_testing_utils.utils import warmup_model
-from aiu_fms_testing_utils.testing.validation import LogitsExtractorHook, capture_level_1_metrics, extract_validation_information, StaticTokenInjectorHook, GoldenTokenHook, filter_failed_level_1_cases, validate_level_0, load_validation_information, print_failed_cases
+from aiu_fms_testing_utils.testing.validation import (
+    LogitsExtractorHook,
+    capture_level_1_metrics,
+    extract_validation_information,
+    StaticTokenInjectorHook,
+    GoldenTokenHook,
+    filter_failed_level_1_cases,
+    validate_level_0,
+    load_validation_information,
+    print_failed_cases,
+)
 from aiu_fms_testing_utils.utils import aiu_setup
 from aiu_fms_testing_utils.utils.aiu_setup import dprint, rank, local_rank, world_size
 
@@ -28,7 +38,7 @@
     type=str,
     choices=["aiu", "aiu-senulator"],
     default="aiu",
-    help="The device to run the model on"
+    help="The device to run the model on",
 )
 parser.add_argument("--validation_device", type=str, default="cpu")
 parser.add_argument(
@@ -212,22 +222,22 @@
     "--save_validation_info_path",
     type=str,
     default=None,
-    help="If set, will save the validation info into the path specified for later use"
+    help="If set, will save the validation info into the path specified for later use",
 )
 parser.add_argument(
     "--extra_get_model_kwargs",
-    nargs='*',
+    nargs="*",
     default={},
-    help="Use this to override model configuration values to get model. Example: --extra_get_model_kwargs nlayers=2,..."
+    help="Use this to override model configuration values to get model. Example: --extra_get_model_kwargs nlayers=2,...",
 )
 args = parser.parse_args()
 
 extra_get_model_kwargs = {}
 for a in args.extra_get_model_kwargs:
-     a_split = a.split("=")
-     try:
+    a_split = a.split("=")
+    try:
         extra_get_model_kwargs[a_split[0]] = ast.literal_eval(a_split[1])
-     except ValueError:
+    except ValueError:
         extra_get_model_kwargs[a_split[0]] = a_split[1]
 
 # this is a test model config
@@ -243,7 +253,9 @@
 
 needs_validation_generation = args.validation_files_path == ""
 needs_validation_forward = (
-    not needs_validation_generation and args.validation_files_type in ["text", "tokens"] and args.validation_level == 1
+    not needs_validation_generation
+    and args.validation_files_type in ["text", "tokens"]
+    and args.validation_level == 1
 )
 needs_validation_run = needs_validation_forward or needs_validation_generation
 
@@ -251,11 +263,10 @@
 
 if args.quantization == "gptq":
     try:
-
         # validation script always loads AIU addon
         from fms_mo.aiu_addons.gptq import gptq_aiu_adapter, gptq_aiu_linear
-        print("Loaded `aiu_addons` functionalities")
 
+        print("Loaded `aiu_addons` functionalities")
 
     except ImportError:
         print("Failed to import addon packages")
@@ -354,7 +365,7 @@
 if args.quantization == "gptq":
     qconfig_path = args.model_path + "/quantize_config.json"
     if os.path.exists(qconfig_path):
-        with open(qconfig_path, 'r') as f:
+        with open(qconfig_path, "r") as f:
             dprint(f"loading quantization config from {qconfig_path}")
             qconfig = json.load(f)
             group_size = qconfig["group_size"]
@@ -395,8 +406,10 @@
     # model, the adapter will take care of converting key/values from
     # ckpt into the appropriate form for the model
     if fused_weights:
-        raise ValueError("GPTQ checkpoints on AIU must always run with --unfuse_weights")
-    default_dtype=None  # GPTQ dtype always comes from ckpt, can't be enforced
+        raise ValueError(
+            "GPTQ checkpoints on AIU must always run with --unfuse_weights"
+        )
+    default_dtype = None  # GPTQ dtype always comes from ckpt, can't be enforced
 else:
     linear_config = {"linear_type": "torch_linear"}
     linear_config_validation = {"linear_type": "torch_linear"}
@@ -412,7 +425,7 @@
     group=dist.group.WORLD,
     linear_config=linear_config,
     fused_weights=fused_weights,
-    **extra_get_model_kwargs
+    **extra_get_model_kwargs,
 )
 
 if args.quantization == "gptq":
@@ -422,14 +435,12 @@
             "and rotary embeddings, in GPTQ LLaMA models"
         )
     dprint(model)
-    dprint("="*60 + "\n")
+    dprint("=" * 60 + "\n")
 
 if needs_validation_run:
     if args.quantization != "gptq":
         data_type_validation = (
-            torch.float32
-            if validation_device == aiu_device
-            else default_dtype
+            torch.float32 if validation_device == aiu_device else default_dtype
         )
     else:
         data_type_validation = default_dtype
@@ -444,7 +455,7 @@
         group=dist.group.WORLD,
         linear_config=linear_config_validation,
         fused_weights=fused_weights,
-        **extra_get_model_kwargs
+        **extra_get_model_kwargs,
     )
     validation_model.load_state_dict(model.state_dict())
     if args.quantization == "gptq":
@@ -454,7 +465,7 @@
                 "rotary embeddings, in GPTQ LLaMA models"
             )
         dprint(validation_model)
-        dprint("="*60 + "\n")
+        dprint("=" * 60 + "\n")
 
 tokenizer = tokenizers.get_tokenizer(args.tokenizer)
 model.eval()
@@ -526,9 +537,9 @@ def truncate_prompts_to_max_length(prompts, max_len, max_allowed_length):
     assert len(prompt_file_paths) > 0, f"Can't find any prompt files at {prompt_path}"
 
     # Check if we have enough files
-    assert (
-        len(prompt_file_paths) >= args.batch_size
-    ), f"Not enough prompt files at {prompt_path} for a batch size of {args.batch_size}"
+    assert len(prompt_file_paths) >= args.batch_size, (
+        f"Not enough prompt files at {prompt_path} for a batch size of {args.batch_size}"
+    )
 
     prompts = []
     for i, prompt_file_path in enumerate(prompt_file_paths):
@@ -594,6 +605,7 @@ def truncate_prompts_to_max_length(prompts, max_len, max_allowed_length):
     ids = prompts
     padding_kwargs = {}
 
+
 def print_result(result, result_idx: int = 0, file_prefix: str = ""):
     if local_rank != 0:
         return
@@ -644,7 +656,7 @@ def print_result(result, result_idx: int = 0, file_prefix: str = ""):
     # Truncate each answer to its prompt length + max_new_tokens
     for i, prompt in enumerate(prompts):
         prompt_len = prompt.size(0)
-        val_tokens[i] = val_tokens[i][:prompt_len+val_num_gen_tokens]
+        val_tokens[i] = val_tokens[i][: prompt_len + val_num_gen_tokens]
 
     if has_padding:
         val_ids, padding_val_kwargs = pad_input_ids(
@@ -683,10 +695,12 @@ def print_result(result, result_idx: int = 0, file_prefix: str = ""):
         args.max_new_tokens,
         LogitsExtractorHook(),
         attn_algorithm="math",
-        **padding_kwargs
+        **padding_kwargs,
     )
 
-warmup_model(model, ids, args.max_new_tokens, args.compile_dynamic_sendnn, **padding_kwargs)
+warmup_model(
+    model, ids, args.max_new_tokens, args.compile_dynamic_sendnn, **padding_kwargs
+)
 
 ### AIU generation loop
 static_tokens = validation_info.get_info("tokens")
@@ -699,10 +713,10 @@ def print_result(result, result_idx: int = 0, file_prefix: str = ""):
     ids,
     args.max_new_tokens,
     post_iteration_hook,
-    eos_token_id = None if args.no_early_termination else tokenizer.eos_token_id,
+    eos_token_id=None if args.no_early_termination else tokenizer.eos_token_id,
     only_last_token=True,
     timing=args.timing,
-    **padding_kwargs
+    **padding_kwargs,
 )
 
 if args.save_validation_info_path is not None:
@@ -714,11 +728,12 @@ def print_result(result, result_idx: int = 0, file_prefix: str = ""):
     failed_cases = validate_level_0(aiu_static_tokens, static_tokens)
 else:
     level_1_metrics = capture_level_1_metrics(
-        validation_info.get_info("logits"),
-        aiu_validation_info.get_info("logits")
+        validation_info.get_info("logits"), aiu_validation_info.get_info("logits")
     )
 
-    failed_cases = filter_failed_level_1_cases(level_1_metrics, lambda m: m >= args.logits_loss_threshold)
+    failed_cases = filter_failed_level_1_cases(
+        level_1_metrics, lambda m: m >= args.logits_loss_threshold
+    )
 
 validation_passed = len(failed_cases) == 0
 
diff --git a/tests/models/conftest.py b/tests/models/conftest.py
index e93db8f..5ede12b 100644
--- a/tests/models/conftest.py
+++ b/tests/models/conftest.py
@@ -4,6 +4,7 @@
 import os
 import pytest
 
+
 def pytest_sessionstart(session):
     """
     Called after the Session object has been created and
@@ -23,6 +24,7 @@ def pytest_sessionstart(session):
     os.environ.setdefault("DTLOG_LEVEL", "error")
     os.environ.setdefault("DT_DEEPRT_VERBOSE", "-1")
 
+
 def pytest_addoption(parser):
     parser.addoption(
         "--runslow", action="store_true", default=False, help="run slow tests"
@@ -43,4 +45,3 @@ def pytest_generate_tests(metafunc):
     option_value = metafunc.config.option.capture_expectation
     if "capture_expectation" in metafunc.fixturenames and option_value is not None:
         metafunc.parametrize("capture_expectation", [option_value])
-
diff --git a/tests/models/test_decoders.py b/tests/models/test_decoders.py
index 25ded59..56ffe7c 100644
--- a/tests/models/test_decoders.py
+++ b/tests/models/test_decoders.py
@@ -34,7 +34,9 @@
 except ImportError:
     GPTQ_ENABLED = False
 
-MICRO_MODELS_HOME = os.environ.get("FMS_TEST_SHAPES_MICRO_MODELS_HOME", "/mnt/home/models/tiny-models")
+MICRO_MODELS_HOME = os.environ.get(
+    "FMS_TEST_SHAPES_MICRO_MODELS_HOME", "/mnt/home/models/tiny-models"
+)
 
 # Add models to test here
 LLAMA_3p1_8B_INSTRUCT = "meta-llama/Llama-3.1-8B-Instruct"
@@ -44,11 +46,19 @@
 LLAMA_3p1_70B_INSTRUCT = "meta-llama/Llama-3.1-70B-Instruct"
 
 micro_model_mapping = {
-    LLAMA_3p1_8B_INSTRUCT: os.path.join(MICRO_MODELS_HOME, "llama-3.1-8b-layers-3-step-24000"),
-    GRANITE_3p2_8B_INSTRUCT: os.path.join(MICRO_MODELS_HOME, "granite-3.2-8b-layers-3-step-100000"),
+    LLAMA_3p1_8B_INSTRUCT: os.path.join(
+        MICRO_MODELS_HOME, "llama-3.1-8b-layers-3-step-24000"
+    ),
+    GRANITE_3p2_8B_INSTRUCT: os.path.join(
+        MICRO_MODELS_HOME, "granite-3.2-8b-layers-3-step-100000"
+    ),
     # FIXME: Because this uses the same config as 3.2, re-using here, but should update
-    GRANITE_3p3_8B_INSTRUCT: os.path.join(MICRO_MODELS_HOME, "granite-3.2-8b-layers-3-step-100000"),
-    LLAMA_3p1_70B_INSTRUCT: os.path.join(MICRO_MODELS_HOME, "llama-3.1-70b-layers-3-step-24000")
+    GRANITE_3p3_8B_INSTRUCT: os.path.join(
+        MICRO_MODELS_HOME, "granite-3.2-8b-layers-3-step-100000"
+    ),
+    LLAMA_3p1_70B_INSTRUCT: os.path.join(
+        MICRO_MODELS_HOME, "llama-3.1-70b-layers-3-step-24000"
+    ),
 }
 
 SHARE_GPT_DATASET_PATH = os.environ.get(
@@ -127,14 +137,18 @@
     for metric in skip_assertions.split(","):
         metric = metric.lower()
         if metric not in {"ce", "mean_diff"}:
-            pytest.fail("FMS_TEST_SHAPES_SKIP_ASSERTIONS can only accept metrics ce and mean_diff")
+            pytest.fail(
+                "FMS_TEST_SHAPES_SKIP_ASSERTIONS can only accept metrics ce and mean_diff"
+            )
         _skip_assertions.append(metric)
     skip_assertions = set(_skip_assertions)
 
 compile_dynamic_sendnn = ATTN_TYPE == "paged"
 
 if compile_dynamic_sendnn:
-    os.environ["VLLM_DT_MAX_CONTEXT_LEN"] = str((((max(common_seq_lengths) + max(common_max_new_tokens)) // 64) + 1) * 64)
+    os.environ["VLLM_DT_MAX_CONTEXT_LEN"] = str(
+        (((max(common_seq_lengths) + max(common_max_new_tokens)) // 64) + 1) * 64
+    )
     os.environ["VLLM_DT_MAX_BATCH_SIZE"] = str(max(common_batch_sizes))
 
 common_shapes = list(
@@ -294,8 +308,10 @@ def __load_validation_info(
     else:
         return None
 
+
 class PersistentModel:
     """This class will either get a model that is pre-compiled (if compile_dynamic_sendnn) or re-create the model for each test"""
+
     def __init__(self):
         self.model = None
 
@@ -310,15 +326,17 @@ def get_or_create(self, is_gptq, **kwargs):
             self.__maybe_reset_model(model, is_gptq)
 
             model.eval()
-            model.compile(backend="sendnn", options={'sendnn.dynamic': compile_dynamic_sendnn})
+            model.compile(
+                backend="sendnn", options={"sendnn.dynamic": compile_dynamic_sendnn}
+            )
 
             if compile_dynamic_sendnn:
                 self.model = model
-            
+
             return model
         else:
             return self.model
-    
+
     # TODO: This was added as we require a special reset for gptq models. Ideally, we would be able to do something like this reset when calling reset_parameters() on the model
     #  however the gptq modules are yet to support this
     @staticmethod
@@ -344,6 +362,7 @@ def __maybe_reset_model(model, is_gptq):
                     res /= 20.0
                 param.copy_(res)
 
+
 @pytest.fixture
 def persistent_model():
     return PersistentModel()
@@ -352,7 +371,9 @@ def persistent_model():
 @pytest.mark.parametrize(
     "model_path,batch_size,seq_length,max_new_tokens", common_shapes
 )
-def test_common_shapes(model_path, batch_size, seq_length, max_new_tokens, persistent_model):
+def test_common_shapes(
+    model_path, batch_size, seq_length, max_new_tokens, persistent_model
+):
     torch.manual_seed(42)
     torch.set_grad_enabled(False)
     os.environ["COMPILATION_MODE"] = "offline_decoder"
@@ -396,7 +417,9 @@ def test_common_shapes(model_path, batch_size, seq_length, max_new_tokens, persi
     tokenizer = tokenizers.get_tokenizer(model_path)
 
     # prepare the AIU model
-    model = persistent_model.get_or_create(is_gptq, **gptq_kwargs_aiu, **get_model_kwargs)
+    model = persistent_model.get_or_create(
+        is_gptq, **gptq_kwargs_aiu, **get_model_kwargs
+    )
 
     # prepare the cpu model
     validation_model = get_model(
@@ -416,7 +439,14 @@ def test_common_shapes(model_path, batch_size, seq_length, max_new_tokens, persi
     input_ids, padding_kwargs = __prepare_inputs(batch_size, seq_length, tokenizer)
 
     # warmup aiu model
-    warmup_model(model, input_ids, max_new_tokens, compile_dynamic_sendnn, attn_type=ATTN_TYPE, **padding_kwargs)
+    warmup_model(
+        model,
+        input_ids,
+        max_new_tokens,
+        compile_dynamic_sendnn,
+        attn_type=ATTN_TYPE,
+        **padding_kwargs,
+    )
 
     # generate cpu validation info
     cpu_validation_info = __load_validation_info(
@@ -448,7 +478,13 @@ def test_common_shapes(model_path, batch_size, seq_length, max_new_tokens, persi
 
     # first test validation level 0
     aiu_validation_info = extract_validation_information(
-        model, input_ids, max_new_tokens, None, only_last_token=ATTN_TYPE != "paged", attn_type=ATTN_TYPE, **padding_kwargs
+        model,
+        input_ids,
+        max_new_tokens,
+        None,
+        only_last_token=ATTN_TYPE != "paged",
+        attn_type=ATTN_TYPE,
+        **padding_kwargs,
     )
     dprint("aiu validation info extracted for validation level 0")
 
@@ -461,7 +497,6 @@ def test_common_shapes(model_path, batch_size, seq_length, max_new_tokens, persi
 
     # if level 0 fails validation, validate level 1
     if FORCE_VALIDATION_LEVEL_1 or failed_validation_level_0:
-
         if failed_validation_level_0:
             dprint("failed validation level 0, testing validation level 1")
         else:
@@ -526,7 +561,7 @@ def _metric_calculator(r: torch.Tensor, t: torch.Tensor):
                 max_new_tokens,
                 GoldenTokenHook(cpu_static_tokens),
                 only_last_token=ATTN_TYPE != "paged",
-                attn_type=ATTN_TYPE, 
+                attn_type=ATTN_TYPE,
                 **padding_kwargs,
             )
             dprint(f"aiu validation info extracted for validation level 1 - iter={i}")
@@ -554,7 +589,10 @@ def _metric_calculator(r: torch.Tensor, t: torch.Tensor):
                 # if we have a micro model with real weights, but no real thresholds, default to the full model thresholds
                 if USE_MICRO_MODELS:
                     ce_threshold, diff_threshold = fail_thresholds.get(
-                        (model_path, True), fail_thresholds.get((model_path, False), default_metrics_threshold)
+                        (model_path, True),
+                        fail_thresholds.get(
+                            (model_path, False), default_metrics_threshold
+                        ),
                     )
                 else:
                     ce_threshold, diff_threshold = fail_thresholds.get(
diff --git a/tests/models/test_encoders.py b/tests/models/test_encoders.py
index 67a032c..8a3eb06 100644
--- a/tests/models/test_encoders.py
+++ b/tests/models/test_encoders.py
@@ -1,4 +1,8 @@
-from fms.testing.comparison import ModelSignatureParams, compare_model_signatures, get_signature
+from fms.testing.comparison import (
+    ModelSignatureParams,
+    compare_model_signatures,
+    get_signature,
+)
 from fms.utils import tokenizers
 import pytest
 from fms.models import get_model
@@ -13,11 +17,17 @@
 # Add models to test here
 ROBERTA_SQUAD_V2 = "deepset/roberta-base-squad2"
 
-SQUAD_V2_DATASET_PATH = os.environ.get("SQUAD_V2_DATASET_PATH", os.path.expanduser("~/squad_v2"))
-common_model_paths = os.environ.get("FMS_TEST_SHAPES_COMMON_MODEL_PATHS", [ROBERTA_SQUAD_V2])
+SQUAD_V2_DATASET_PATH = os.environ.get(
+    "SQUAD_V2_DATASET_PATH", os.path.expanduser("~/squad_v2")
+)
+common_model_paths = os.environ.get(
+    "FMS_TEST_SHAPES_COMMON_MODEL_PATHS", [ROBERTA_SQUAD_V2]
+)
 common_batch_sizes = os.environ.get("FMS_TEST_SHAPES_COMMON_BATCH_SIZES", [1, 2, 4, 8])
 common_seq_lengths = os.environ.get("FMS_TEST_SHAPES_COMMON_SEQ_LENGTHS", [64, 512])
-validation_diff_threshold = os.environ.get("FMS_TEST_SHAPES_VALIDATION_DIFF_THRESHOLD", .01)
+validation_diff_threshold = os.environ.get(
+    "FMS_TEST_SHAPES_VALIDATION_DIFF_THRESHOLD", 0.01
+)
 
 # pass custom model path list for eg: EXPORT FMS_TESTING_COMMON_MODEL_PATHS="/tmp/models/roberta,/tmp/models/roberta-base-squad2"
 if isinstance(common_model_paths, str):
@@ -36,18 +46,30 @@
 if isinstance(validation_diff_threshold, str):
     validation_diff_threshold = float(validation_diff_threshold)
 
-common_shapes = list(itertools.product(common_model_paths, common_batch_sizes, common_seq_lengths))
+common_shapes = list(
+    itertools.product(common_model_paths, common_batch_sizes, common_seq_lengths)
+)
 
 
 def __prepare_inputs(batch_size, seq_length, tokenizer, seed=0):
-    prompts_and_sizes = sample_squad_v2_qa_requests(SQUAD_V2_DATASET_PATH, batch_size, tokenizer, int(seq_length / 2), seq_length, seed)
+    prompts_and_sizes = sample_squad_v2_qa_requests(
+        SQUAD_V2_DATASET_PATH,
+        batch_size,
+        tokenizer,
+        int(seq_length / 2),
+        seq_length,
+        seed,
+    )
     prompt_list = []
     for prompt, _ in prompts_and_sizes:
         prompt_list.append(ids_for_prompt(prompt, tokenizer))
 
-    input_ids, padding_kwargs = pad_input_ids(prompt_list, min_pad_length=seq_length, is_causal_mask=False)
+    input_ids, padding_kwargs = pad_input_ids(
+        prompt_list, min_pad_length=seq_length, is_causal_mask=False
+    )
     return input_ids, padding_kwargs
 
+
 def __generate_diffs(model_params_1, model_params_2):
     model_params_1.model.eval()
     model_params_2.model.eval()
@@ -57,7 +79,7 @@ def __generate_diffs(model_params_1, model_params_2):
         optional_params=model_params_1.other_params,
         logits_getter_fn=model_params_1.logits_getter_fn,
         inp=model_params_1.inp,
-        device=model_params_1.inp.device
+        device=model_params_1.inp.device,
     )
     signature2 = get_signature(
         model_params_2.model,
@@ -65,7 +87,7 @@ def __generate_diffs(model_params_1, model_params_2):
         optional_params=model_params_2.other_params,
         logits_getter_fn=model_params_2.logits_getter_fn,
         inp=model_params_2.inp,
-        device=model_params_2.inp.device
+        device=model_params_2.inp.device,
     )
 
     signature = np.array(signature)
@@ -73,21 +95,25 @@ def __generate_diffs(model_params_1, model_params_2):
 
     return np.mean(np.abs(signature2 - signature))
 
+
 @pytest.fixture(autouse=True)
 def reset_compiler():
-    yield # run the test
+    yield  # run the test
     torch.compiler.reset()
     torch._dynamo.reset()
-    os.environ.pop('COMPILATION_MODE', None)
+    os.environ.pop("COMPILATION_MODE", None)
+
 
 @pytest.mark.parametrize("model_path,batch_size,seq_length", common_shapes)
 def test_common_shapes(model_path, batch_size, seq_length):
     os.environ["COMPILATION_MODE"] = "offline"
-    
-    dprint(f"testing model={model_path}, batch_size={batch_size}, seq_length={seq_length}")
+
+    dprint(
+        f"testing model={model_path}, batch_size={batch_size}, seq_length={seq_length}"
+    )
 
     tokenizer = tokenizers.get_tokenizer(model_path)
-    
+
     if os.path.exists(model_path):
         model_path_kwargs = {"model_path": model_path}
     else:
@@ -98,7 +124,7 @@ def test_common_shapes(model_path, batch_size, seq_length):
         architecture="hf_pretrained",
         device_type="cpu",
         fused_weights=False,
-        **model_path_kwargs
+        **model_path_kwargs,
     )
 
     model.eval()
@@ -111,34 +137,56 @@ def test_common_shapes(model_path, batch_size, seq_length):
         device_type="cpu",
         data_type=torch.float32,
         fused_weights=False,
-        **model_path_kwargs
+        **model_path_kwargs,
     )
 
     # prepare input_ids
     input_ids, padding_kwargs = __prepare_inputs(batch_size, seq_length, tokenizer)
 
     # warmup model
-    logits_getter_fn = lambda x: x if isinstance(x, torch.Tensor) else torch.cat(list(x), dim=-1)
-    aiu_msp = ModelSignatureParams(model, ["x"], logits_getter_fn=logits_getter_fn, inp=input_ids, other_params=padding_kwargs)
-    get_signature(aiu_msp.model, aiu_msp.params, aiu_msp.inp, aiu_msp.other_params, aiu_msp.logits_getter_fn)
+    def logits_getter_fn(x):  # concatenates non-tensor outputs on the last dim
+        return x if isinstance(x, torch.Tensor) else torch.cat(list(x), dim=-1)
+
+    aiu_msp = ModelSignatureParams(
+        model,
+        ["x"],
+        logits_getter_fn=logits_getter_fn,
+        inp=input_ids,
+        other_params=padding_kwargs,
+    )
+    get_signature(
+        aiu_msp.model,
+        aiu_msp.params,
+        aiu_msp.inp,
+        aiu_msp.other_params,
+        aiu_msp.logits_getter_fn,
+    )
 
     # get the average diff over multiple samples
     diffs = []
     for i in range(20):
         # prepare input_ids
-        input_ids, padding_kwargs = __prepare_inputs(batch_size, seq_length, tokenizer, seed=i)
+        input_ids, padding_kwargs = __prepare_inputs(
+            batch_size, seq_length, tokenizer, seed=i
+        )
 
         aiu_msp = ModelSignatureParams(
-            model, 
-            ["x"], 
-            logits_getter_fn=logits_getter_fn, 
-            inp=input_ids, 
-            other_params=padding_kwargs
+            model,
+            ["x"],
+            logits_getter_fn=logits_getter_fn,
+            inp=input_ids,
+            other_params=padding_kwargs,
+        )
+        cpu_msp = ModelSignatureParams(
+            validation_model,
+            ["x"],
+            logits_getter_fn=logits_getter_fn,
+            inp=input_ids,
+            other_params=padding_kwargs,
         )
-        cpu_msp = ModelSignatureParams(validation_model, ["x"], logits_getter_fn=logits_getter_fn, inp=input_ids, other_params=padding_kwargs)
         diffs.append(__generate_diffs(aiu_msp, cpu_msp))
 
     abs_mean_diff = sum(diffs) / len(diffs)
     print(f"absolute mean diff: {abs_mean_diff}")
 
-    assert abs_mean_diff < validation_diff_threshold
\ No newline at end of file
+    assert abs_mean_diff < validation_diff_threshold
diff --git a/tests/models/test_model_expectations.py b/tests/models/test_model_expectations.py
index 5cfcd57..64b24d4 100644
--- a/tests/models/test_model_expectations.py
+++ b/tests/models/test_model_expectations.py
@@ -20,7 +20,12 @@
 MISTRAL_7B_INSTRUCT = "mistralai/Mistral-7B-Instruct-v0.3"
 ROBERTA_SQUAD_v2 = "deepset/roberta-base-squad2"
 
-micro_models = {LLAMA_3p1_8B_INSTRUCT, GRANITE_3p2_8B_INSTRUCT, GRANITE_GUARDIAN_3p1_8B, MISTRAL_7B_INSTRUCT}
+micro_models = {
+    LLAMA_3p1_8B_INSTRUCT,
+    GRANITE_3p2_8B_INSTRUCT,
+    GRANITE_GUARDIAN_3p1_8B,
+    MISTRAL_7B_INSTRUCT,
+}
 
 
 class AIUModelFixtureMixin(ModelFixtureMixin):
@@ -51,7 +56,12 @@ def model(self, uninitialized_model):
         return uninitialized_model
 
 
-decoder_models = [LLAMA_3p1_8B_INSTRUCT, GRANITE_3p2_8B_INSTRUCT, GRANITE_GUARDIAN_3p1_8B, MISTRAL_7B_INSTRUCT]
+decoder_models = [
+    LLAMA_3p1_8B_INSTRUCT,
+    GRANITE_3p2_8B_INSTRUCT,
+    GRANITE_GUARDIAN_3p1_8B,
+    MISTRAL_7B_INSTRUCT,
+]
 
 
 class TestAIUDecoderModels(
diff --git a/tests/models/test_scripts.py b/tests/models/test_scripts.py
index e0f2ab4..5601c5d 100644
--- a/tests/models/test_scripts.py
+++ b/tests/models/test_scripts.py
@@ -3,8 +3,9 @@
 from pathlib import Path
 import itertools
 import math
+
 FMS_DIR = Path(__file__).parent
-AIU_FMS_DIR = os.path.join(FMS_DIR,"../../../aiu-fms-testing-utils/")
+AIU_FMS_DIR = os.path.join(FMS_DIR, "../../../aiu-fms-testing-utils/")
 VALIDATION_FILE_PATH = os.path.join(AIU_FMS_DIR, "scripts", "validation.py")
 INFERENCE_FILE_PATH = os.path.join(AIU_FMS_DIR, "scripts", "inference.py")
 
@@ -17,40 +18,68 @@
 GRANITE_3_8B_CODE_BASE = f"{model_dir}/granite-3-8b-base"
 
 # pass custom model path list for eg: EXPORT FMS_TESTING_COMMON_MODEL_PATHS="/tmp/models/granite-3-8b-base,/tmp/models/granite-7b-base"
-if os.environ.get("FMS_TESTING_COMMON_MODEL_PATHS") == None or os.environ.get("FMS_TESTING_COMMON_MODEL_PATHS") == "":
+if (
+    os.environ.get("FMS_TESTING_COMMON_MODEL_PATHS") is None  # variable unset
+    or os.environ.get("FMS_TESTING_COMMON_MODEL_PATHS") == ""  # set but empty
+):
     common_model_paths = [LLAMA_194M]
 else:
-    common_model_paths = os.environ.get("FMS_TESTING_COMMON_MODEL_PATHS").split(',')
+    common_model_paths = os.environ.get("FMS_TESTING_COMMON_MODEL_PATHS").split(",")
 
-common_batch_sizes = [1,8]
+common_batch_sizes = [1, 8]
 common_seq_lengths = [64]
 common_max_new_tokens = [8]
 
-common_params = list(itertools.product(common_model_paths, common_batch_sizes, common_seq_lengths, common_max_new_tokens))
+common_params = list(
+    itertools.product(
+        common_model_paths,
+        common_batch_sizes,
+        common_seq_lengths,
+        common_max_new_tokens,
+    )
+)
 common_asserts = [
-                        "### Response: Chicken soup is a popular soup that is",
-                        "### Response: I am sorry, but I am not",
-                        "### Response: I am ignorant of the fact that I",
-                        "### Response: I have just come into a very large",
-                  ]    
+    "### Response: Chicken soup is a popular soup that is",
+    "### Response: I am sorry, but I am not",
+    "### Response: I am ignorant of the fact that I",
+    "### Response: I have just come into a very large",
+]
 
 current_env = os.environ.copy()
-current_env["DT_OPT"]="varsub=1,lxopt=1,opfusion=1,arithfold=1,dataopt=1,patchinit=1,patchprog=1,autopilot=1,weipreload=0,kvcacheopt=1,progshareopt=1"
+current_env["DT_OPT"] = (
+    "varsub=1,lxopt=1,opfusion=1,arithfold=1,dataopt=1,patchinit=1,patchprog=1,autopilot=1,weipreload=0,kvcacheopt=1,progshareopt=1"
+)
 
-def execute_script(execute_cmd):
-    current_env['MAX_SHAREDPROG_ITERS'] = f"{common_max_new_tokens[0]}"
 
-    with Popen(execute_cmd, stdin=PIPE, stdout=PIPE, stderr=PIPE, universal_newlines=True, env=current_env) as p:
+def execute_script(execute_cmd):
+    current_env["MAX_SHAREDPROG_ITERS"] = f"{common_max_new_tokens[0]}"
+
+    with Popen(
+        execute_cmd,
+        stdin=PIPE,
+        stdout=PIPE,
+        stderr=PIPE,
+        universal_newlines=True,
+        env=current_env,
+    ) as p:
         output, error = p.communicate()
         if p.returncode == 0:
             return output
         else:
             raise Exception(error)
 
+
 # we are forcing the number of layers to be 2 to reduce the size of the model as we do not care about output, but just consistency between cpu and aiu
-def execute_validation(validation_level, model_path, max_new_tokens, batch_size, seq_length, logits_loss_threshold=0.0):
+def execute_validation(
+    validation_level,
+    model_path,
+    max_new_tokens,
+    batch_size,
+    seq_length,
+    logits_loss_threshold=0.0,
+):
     execute_cmd = [
-        'python3',
+        "python3",
         VALIDATION_FILE_PATH,
         "--architecture=hf_pretrained",
         f"--model_path={model_path}",
@@ -62,13 +91,14 @@ def execute_validation(validation_level, model_path, max_new_tokens, batch_size,
         "--no_early_termination",
         f"--validation_level={validation_level}",
         f"--logits_loss_threshold={logits_loss_threshold}",
-        "--compile_dynamic"
+        "--compile_dynamic",
     ]
     return execute_script(execute_cmd)
 
+
 def execute_inference(model_path, max_new_tokens, batch_size, seq_length):
     execute_cmd = [
-        'python3',
+        "python3",
         INFERENCE_FILE_PATH,
         "--architecture=hf_pretrained",
         f"--model_path={model_path}",
@@ -80,23 +110,24 @@ def execute_inference(model_path, max_new_tokens, batch_size, seq_length):
         "--no_early_termination",
         "--compile_dynamic",
         "--compile",
-        "--device_type=aiu"
+        "--device_type=aiu",
     ]
     return execute_script(execute_cmd)
 
-@pytest.mark.parametrize("model_path,batch_size,seq_length,max_new_tokens", common_params)
+
+@pytest.mark.parametrize(
+    "model_path,batch_size,seq_length,max_new_tokens", common_params
+)
 def test_level_1_validation_script(model_path, batch_size, seq_length, max_new_tokens):
     result_text = execute_validation(
-        1,
-        model_path,
-        max_new_tokens,
-        batch_size,
-        seq_length,
-        64.0
+        1, model_path, max_new_tokens, batch_size, seq_length, 64.0
     )
     assert "The validation has passed!" in result_text
 
-@pytest.mark.parametrize("model_path,batch_size,seq_length,max_new_tokens", common_params)
+
+@pytest.mark.parametrize(
+    "model_path,batch_size,seq_length,max_new_tokens", common_params
+)
 def test_level_0_validation_script(model_path, batch_size, seq_length, max_new_tokens):
     result_text = execute_validation(
         0,
@@ -107,6 +138,7 @@ def test_level_0_validation_script(model_path, batch_size, seq_length, max_new_t
     )
     assert "The validation has passed!" in result_text
 
+
 common_asserts = [
     "### Response: Chicken soup is a popular soup that is",
     "### Response: I am sorry, but I am not",
@@ -114,18 +146,25 @@ def test_level_0_validation_script(model_path, batch_size, seq_length, max_new_t
     "### Response: I have just come into a very large",
 ]
 
+
 def __repeat_batch_asserts(bs: int) -> list[str]:
     n_repeats = int(math.ceil(bs / len(common_asserts)))
     return (common_asserts * n_repeats)[:bs]
 
+
 # add the asserts based on batch size
 # for batches greater than common_asserts, repeat common_asserts since this follows inference behavior
-common_inference_params = [common_param + (__repeat_batch_asserts(common_param[1]),) for common_param in common_params]
+common_inference_params = [
+    common_param + (__repeat_batch_asserts(common_param[1]),)
+    for common_param in common_params
+]
 
 
-@pytest.mark.parametrize("model_path,batch_size,seq_length,max_new_tokens,asserts", common_inference_params)
+@pytest.mark.parametrize(
+    "model_path,batch_size,seq_length,max_new_tokens,asserts", common_inference_params
+)
 def test_inference_script(model_path, max_new_tokens, seq_length, batch_size, asserts):
     result_text = execute_inference(model_path, max_new_tokens, batch_size, seq_length)
 
     for common_assert in asserts:
-        assert common_assert in result_text
\ No newline at end of file
+        assert common_assert in result_text
diff --git a/tests/resources/get_thresholds.py b/tests/resources/get_thresholds.py
index 7dedb70..474dbd4 100644
--- a/tests/resources/get_thresholds.py
+++ b/tests/resources/get_thresholds.py
@@ -4,23 +4,21 @@
 import argparse
 import os
 
-parser = argparse.ArgumentParser(
-    description="Script to get thresholds metrics"
-)
+parser = argparse.ArgumentParser(description="Script to get thresholds metrics")
 
 parser.add_argument(
     "--models",
     type=str,
     default=[],
-    nargs='+',
+    nargs="+",
     required=True,
-    help="List of models id separated by space. Eg.: ibm-granite/granite-20b-code-instruct-8k /tmp/models/granite-20b-code-cobol-v1"
+    help="List of models id separated by space. Eg.: ibm-granite/granite-20b-code-instruct-8k /tmp/models/granite-20b-code-cobol-v1",
 )
 parser.add_argument(
     "--metrics",
     type=str,
     default=[],
-    nargs='+',
+    nargs="+",
     required=True,
     help="List of metrics separated by space. Eg.: diff_mean ce",
 )
@@ -43,7 +41,6 @@
 
         metric_list = []
         for metric_file in metric_files:
-
             with open(metric_file, "r") as file:
                 next(file)
                 for line in file:
diff --git a/tests/testing/test_validation.py b/tests/testing/test_validation.py
index 047590e..90cf2e7 100644
--- a/tests/testing/test_validation.py
+++ b/tests/testing/test_validation.py
@@ -1,11 +1,19 @@
 import tempfile
 import pytest
-from aiu_fms_testing_utils.testing.validation import LogitsExtractorHook, extract_validation_information, load_validation_information
+from aiu_fms_testing_utils.testing.validation import (
+    LogitsExtractorHook,
+    extract_validation_information,
+    load_validation_information,
+)
 from fms.models import get_model
 from fms.utils.generation import pad_input_ids
 import torch
 
-@pytest.mark.parametrize("validation_type,post_iteration_hook", [("logits", LogitsExtractorHook()), ("tokens", None)])
+
+@pytest.mark.parametrize(
+    "validation_type,post_iteration_hook",
+    [("logits", LogitsExtractorHook()), ("tokens", None)],
+)
 def test_validation_info_round_trip(validation_type, post_iteration_hook):
     # prepare a small cpu model
     model = get_model(
@@ -22,7 +30,11 @@ def test_validation_info_round_trip(validation_type, post_iteration_hook):
     # prepare input_ids
     prompt_list = []
     for i in range(batch_size):
-        prompt_list.append(torch.randint(0, model.config.src_vocab_size, (seq_length - 2 * i,), dtype=torch.long))
+        prompt_list.append(
+            torch.randint(
+                0, model.config.src_vocab_size, (seq_length - 2 * i,), dtype=torch.long
+            )
+        )
 
     input_ids, padding_kwargs = pad_input_ids(prompt_list, min_pad_length=seq_length)
 
@@ -33,14 +45,16 @@ def test_validation_info_round_trip(validation_type, post_iteration_hook):
         max_new_tokens,
         post_iteration_hook,
         attn_algorithm="math",
-        **padding_kwargs
+        **padding_kwargs,
     )
 
     with tempfile.TemporaryDirectory() as workdir:
         output_path = f"{workdir}/validation_info"
         generated_validation_info.save(output_path)
 
-        loaded_validation_info = load_validation_information(output_path, validation_type, batch_size)
+        loaded_validation_info = load_validation_information(
+            output_path, validation_type, batch_size
+        )
 
         assert len(generated_validation_info) == len(loaded_validation_info)
 
diff --git a/tests/utils/test_paged.py b/tests/utils/test_paged.py
index 519042a..67cba9b 100644
--- a/tests/utils/test_paged.py
+++ b/tests/utils/test_paged.py
@@ -1,13 +1,11 @@
 import torch
 from fms.models import get_model
-from fms.utils.generation import (
-    pad_input_ids,
-    generate
-)
+from fms.utils.generation import pad_input_ids, generate
 from aiu_fms_testing_utils.utils.paged import generate as paged_generate
 from fms.utils.tokenizers import get_tokenizer
 import pytest
 
+
 def test_paged_equivalence():
     torch.manual_seed(0)
     with torch.no_grad():