From 2a5010db5051e000bdfa9671e52547fa96258cba Mon Sep 17 00:00:00 2001
From: riccardofelluga <11768013+riccardofelluga@users.noreply.github.com>
Date: Tue, 14 Oct 2025 14:26:07 +0300
Subject: [PATCH 1/3] added dtype based tolerances

---
 .../tests/test_transformer_engine_executor.py | 44 ++++++++++++-------
 1 file changed, 29 insertions(+), 15 deletions(-)

diff --git a/thunder/tests/test_transformer_engine_executor.py b/thunder/tests/test_transformer_engine_executor.py
index 4b17c6e877..aa256676e3 100644
--- a/thunder/tests/test_transformer_engine_executor.py
+++ b/thunder/tests/test_transformer_engine_executor.py
@@ -52,6 +52,16 @@
 recipes += [nvfp4_e2m1_recipe]
 recipe_ids += ["nvfp4_e2m1"]
 
+# Returns the estimated numerical error for a given dtype as per TE spec here:
+# https://github.com/NVIDIA/TransformerEngine/blob/7ad130efd52c3aa4a386d25f1d42b28d5aa20090/tests/pytorch/test_numerics.py#L155-L167
+def dtype_tolerance(dtype):
+    if dtype == torch.float32:
+        return dict(rtol=1.3e-6, atol=1e-5)
+    if dtype == torch.float16:
+        return dict(rtol=1e-3, atol=1e-5)
+    if dtype == torch.bfloat16:
+        return dict(rtol=1.6e-2, atol=1e-5)
+    raise ValueError(f"Unsupported dtype ({dtype})")
 
 @requiresCUDA
 @pytest.mark.parametrize("fp8_recipe", recipes, ids=recipe_ids)
@@ -69,6 +79,7 @@ def test_te_linear_forward_backward(fp8_recipe: recipe.Recipe):
     # Verify that `torch.nn.functional.linear` is replaced with `te_linear_*`
     # and the output as well as the gradients match for thunder compiled code.
     dtype = torch.bfloat16
+    tolerances = dtype_tolerance(dtype)
     device = "cuda"
 
     # TE inputs (3D input)
@@ -106,9 +117,9 @@ def fn(x, w1, w2):
     te_result.backward(grad_output)
     thunder_result.backward(grad_output)
 
-    assert_close(x.grad, x_te.grad)
-    assert_close(w1.grad, te_linear1.weight.grad)
-    assert_close(w2.grad, te_linear2.weight.grad)
+    assert_close(x.grad, x_te.grad, **tolerances)
+    assert_close(w1.grad, te_linear1.weight.grad, **tolerances)
+    assert_close(w2.grad, te_linear2.weight.grad, **tolerances)
 
     # Verifies te_linear was called
     forward_trace = thunder.last_traces(cfn)
@@ -143,6 +154,7 @@ def test_te_linear_forward_backward_multiple_iteration(fp8_recipe: recipe.Recipe
     # Since, the FP8 operations are stateful, we want to verify that
     # our output matches over multiple iterations (where state handling comes into picture)
     dtype = torch.bfloat16
+    tolerances = dtype_tolerance(dtype)
     device = "cuda"
     # Running more iterations leads to `nan` for both eager and thunder
     # with BlockScaling.
@@ -200,10 +212,10 @@ def thunder_model(x):
     train_model(thunder_model, thunder_sgd_optimizer)
 
     # Verify that the weights and biases converge to same value after few iterations.
-    assert_close(w1, te_linear1.weight)
-    assert_close(w2, te_linear2.weight)
-    assert_close(b1, te_linear1.bias)
-    assert_close(b2, te_linear2.bias)
+    assert_close(w1, te_linear1.weight, **tolerances)
+    assert_close(w2, te_linear2.weight, **tolerances)
+    assert_close(b1, te_linear1.bias, **tolerances)
+    assert_close(b2, te_linear2.bias, **tolerances)
 
 
 @requiresCUDA
@@ -220,6 +232,7 @@ def test_te_linear_forward_backward_multiple_iteration_multiple_recipes():
         pytest.skip("platform does not support two different recipes")
 
     dtype = torch.bfloat16
+    tolerances = dtype_tolerance(dtype)
     device = "cuda"
     # Running more iterations leads to `nan` for both eager and thunder
     # with BlockScaling.
@@ -278,10 +291,10 @@ def thunder_model(x, fp8_recipe):
     train_model(thunder_model, thunder_sgd_optimizer)
 
     # Verify that the weights and biases converge to same value after few iterations.
-    assert_close(w1, te_linear1.weight)
-    assert_close(w2, te_linear2.weight)
-    assert_close(b1, te_linear1.bias)
-    assert_close(b2, te_linear2.bias)
+    assert_close(w1, te_linear1.weight, **tolerances)
+    assert_close(w2, te_linear2.weight, **tolerances)
+    assert_close(b1, te_linear1.bias, **tolerances)
+    assert_close(b2, te_linear2.bias, **tolerances)
 
 
 @requiresCUDA
@@ -562,6 +575,7 @@ def test_te_activation_checkpointing_correctness(fp8_recipe: recipe.Recipe, comp
         pytest.skip(msg_nvfp4)
 
     dtype = torch.bfloat16
+    tolerances = dtype_tolerance(dtype)
     device = "cuda"
 
     iterations = 6
@@ -650,10 +664,10 @@ def thunder_model(x):
     for loss, te_loss in zip(thunder_loss_hist, te_loss_hist):
         assert_close(loss, te_loss)
 
-    assert_close(w1, te_linear1.weight)
-    assert_close(w2, te_linear2.weight)
-    assert_close(b1, te_linear1.bias)
-    assert_close(b2, te_linear2.bias)
+    assert_close(w1, te_linear1.weight, **tolerances)
+    assert_close(w2, te_linear2.weight, **tolerances)
+    assert_close(b1, te_linear1.bias, **tolerances)
+    assert_close(b2, te_linear2.bias, **tolerances)
 
     # TE does not expose the scales for MXFP8
     if fp8_recipe.delayed():

From 5b07c96e0f214ee7f8d9ed79e50c47ca7d3a34ea Mon Sep 17 00:00:00 2001
From: riccardofelluga <11768013+riccardofelluga@users.noreply.github.com>
Date: Tue, 14 Oct 2025 14:36:37 +0300
Subject: [PATCH 2/3] grouped dtype and wrapped assert_close

---
 .../tests/test_transformer_engine_executor.py | 69 ++++++++++++---------
 1 file changed, 38 insertions(+), 31 deletions(-)

diff --git a/thunder/tests/test_transformer_engine_executor.py b/thunder/tests/test_transformer_engine_executor.py
index aa256676e3..11564b8336 100644
--- a/thunder/tests/test_transformer_engine_executor.py
+++ b/thunder/tests/test_transformer_engine_executor.py
@@ -52,16 +52,23 @@
 recipes += [nvfp4_e2m1_recipe]
 recipe_ids += ["nvfp4_e2m1"]
 
+
 # Returns the estimated numerical error for a given dtype as per TE spec here:
 # https://github.com/NVIDIA/TransformerEngine/blob/7ad130efd52c3aa4a386d25f1d42b28d5aa20090/tests/pytorch/test_numerics.py#L155-L167
-def dtype_tolerance(dtype):
-    if dtype == torch.float32:
-        return dict(rtol=1.3e-6, atol=1e-5)
-    if dtype == torch.float16:
-        return dict(rtol=1e-3, atol=1e-5)
-    if dtype == torch.bfloat16:
-        return dict(rtol=1.6e-2, atol=1e-5)
-    raise ValueError(f"Unsupported dtype ({dtype})")
+def te_assert_close(actual, expected, **kwargs):
+    tolerances = {}
+
+    if actual.dtype == torch.float32:
+        tolerances = dict(rtol=1.3e-6, atol=1e-5)
+    if actual.dtype == torch.float16:
+        tolerances = dict(rtol=1e-3, atol=1e-5)
+    if actual.dtype == torch.bfloat16:
+        tolerances = dict(rtol=1.6e-2, atol=1e-5)
+
+    kwargs.update(tolerances)
+
+    assert_close(actual, expected, **kwargs)
+
 
 @requiresCUDA
 @pytest.mark.parametrize("fp8_recipe", recipes, ids=recipe_ids)
@@ -79,7 +86,7 @@ def test_te_linear_forward_backward(fp8_recipe: recipe.Recipe):
     # Verify that `torch.nn.functional.linear` is replaced with `te_linear_*`
     # and the output as well as the gradients match for thunder compiled code.
     dtype = torch.bfloat16
-    tolerances = dtype_tolerance(dtype)
+
     device = "cuda"
 
     # TE inputs (3D input)
@@ -111,15 +118,15 @@ def fn(x, w1, w2):
         te_result = te_linear2(inter_result + x_te)
 
     # Verifies the result is close to TE
-    assert_close(thunder_result, te_result)
+    te_assert_close(thunder_result, te_result)
 
     grad_output = torch.randn_like(te_result)
     te_result.backward(grad_output)
     thunder_result.backward(grad_output)
 
-    assert_close(x.grad, x_te.grad, **tolerances)
-    assert_close(w1.grad, te_linear1.weight.grad, **tolerances)
-    assert_close(w2.grad, te_linear2.weight.grad, **tolerances)
+    te_assert_close(x.grad, x_te.grad)
+    te_assert_close(w1.grad, te_linear1.weight.grad)
+    te_assert_close(w2.grad, te_linear2.weight.grad)
 
     # Verifies te_linear was called
     forward_trace = thunder.last_traces(cfn)
@@ -154,7 +161,7 @@ def test_te_linear_forward_backward_multiple_iteration(fp8_recipe: recipe.Recipe
     # Since, the FP8 operations are stateful, we want to verify that
     # our output matches over multiple iterations (where state handling comes into picture)
     dtype = torch.bfloat16
-    tolerances = dtype_tolerance(dtype)
+
     device = "cuda"
     # Running more iterations leads to `nan` for both eager and thunder
     # with BlockScaling.
@@ -212,10 +219,10 @@ def thunder_model(x):
     train_model(thunder_model, thunder_sgd_optimizer)
 
     # Verify that the weights and biases converge to same value after few iterations.
-    assert_close(w1, te_linear1.weight, **tolerances)
-    assert_close(w2, te_linear2.weight, **tolerances)
-    assert_close(b1, te_linear1.bias, **tolerances)
-    assert_close(b2, te_linear2.bias, **tolerances)
+    te_assert_close(w1, te_linear1.weight)
+    te_assert_close(w2, te_linear2.weight)
+    te_assert_close(b1, te_linear1.bias)
+    te_assert_close(b2, te_linear2.bias)
 
 
 @requiresCUDA
@@ -232,7 +239,7 @@ def test_te_linear_forward_backward_multiple_iteration_multiple_recipes():
        pytest.skip("platform does not support two different recipes")
 
     dtype = torch.bfloat16
-    tolerances = dtype_tolerance(dtype)
+
     device = "cuda"
     # Running more iterations leads to `nan` for both eager and thunder
     # with BlockScaling.
@@ -291,10 +298,10 @@ def thunder_model(x, fp8_recipe):
     train_model(thunder_model, thunder_sgd_optimizer)
 
     # Verify that the weights and biases converge to same value after few iterations.
-    assert_close(w1, te_linear1.weight, **tolerances)
-    assert_close(w2, te_linear2.weight, **tolerances)
-    assert_close(b1, te_linear1.bias, **tolerances)
-    assert_close(b2, te_linear2.bias, **tolerances)
+    te_assert_close(w1, te_linear1.weight)
+    te_assert_close(w2, te_linear2.weight)
+    te_assert_close(b1, te_linear1.bias)
+    te_assert_close(b2, te_linear2.bias)
 
 
 @requiresCUDA
@@ -575,7 +582,7 @@ def test_te_activation_checkpointing_correctness(fp8_recipe: recipe.Recipe, comp
        pytest.skip(msg_nvfp4)
 
     dtype = torch.bfloat16
-    tolerances = dtype_tolerance(dtype)
+
     device = "cuda"
 
     iterations = 6
@@ -662,12 +669,12 @@ def thunder_model(x):
     train_model(thunder_model, thunder_sgd_optimizer, thunder_loss_hist)
 
     for loss, te_loss in zip(thunder_loss_hist, te_loss_hist):
-        assert_close(loss, te_loss)
+        te_assert_close(loss, te_loss)
 
-    assert_close(w1, te_linear1.weight, **tolerances)
-    assert_close(w2, te_linear2.weight, **tolerances)
-    assert_close(b1, te_linear1.bias, **tolerances)
-    assert_close(b2, te_linear2.bias, **tolerances)
+    te_assert_close(w1, te_linear1.weight)
+    te_assert_close(w2, te_linear2.weight)
+    te_assert_close(b1, te_linear1.bias)
+    te_assert_close(b2, te_linear2.bias)
 
     # TE does not expose the scales for MXFP8
     if fp8_recipe.delayed():
@@ -701,8 +708,8 @@ def thunder_model(x):
 
         # check the scales are the same but for last dimension which is always on in TE
         for te_scale, th_scale in zip(te_scales, th_scales):
-            assert_close(te_scale[:-1], th_scale)
+            te_assert_close(te_scale[:-1], th_scale)
 
         # check that amax history is the same as TE
         for te_amax, th_amax in zip(te_amax_hist, th_amax_hist):
-            assert_close(te_amax[:, :-1], th_amax)
+            te_assert_close(te_amax[:, :-1], th_amax)

From ae5ab96efe0de2e84cc3830889227ea6d0c3508f Mon Sep 17 00:00:00 2001
From: riccardofelluga <11768013+riccardofelluga@users.noreply.github.com>
Date: Tue, 14 Oct 2025 14:39:15 +0300
Subject: [PATCH 3/3] include float scalar input

---
 thunder/tests/test_transformer_engine_executor.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/thunder/tests/test_transformer_engine_executor.py b/thunder/tests/test_transformer_engine_executor.py
index 11564b8336..b8a7f9cb9e 100644
--- a/thunder/tests/test_transformer_engine_executor.py
+++ b/thunder/tests/test_transformer_engine_executor.py
@@ -58,11 +58,13 @@ def te_assert_close(actual, expected, **kwargs):
     tolerances = {}
 
-    if actual.dtype == torch.float32:
+    if not isinstance(actual, torch.Tensor) and isinstance(actual, float):
         tolerances = dict(rtol=1.3e-6, atol=1e-5)
-    if actual.dtype == torch.float16:
+    elif actual.dtype == torch.float32:
+        tolerances = dict(rtol=1.3e-6, atol=1e-5)
+    elif actual.dtype == torch.float16:
         tolerances = dict(rtol=1e-3, atol=1e-5)
-    if actual.dtype == torch.bfloat16:
+    elif actual.dtype == torch.bfloat16:
         tolerances = dict(rtol=1.6e-2, atol=1e-5)
 
     kwargs.update(tolerances)