From 2a5010db5051e000bdfa9671e52547fa96258cba Mon Sep 17 00:00:00 2001
From: riccardofelluga <11768013+riccardofelluga@users.noreply.github.com>
Date: Tue, 14 Oct 2025 14:26:07 +0300
Subject: [PATCH 1/3] added dtype based tolerances

---
 .../tests/test_transformer_engine_executor.py | 44 ++++++++++++-------
 1 file changed, 29 insertions(+), 15 deletions(-)

diff --git a/thunder/tests/test_transformer_engine_executor.py b/thunder/tests/test_transformer_engine_executor.py
index 4b17c6e877..aa256676e3 100644
--- a/thunder/tests/test_transformer_engine_executor.py
+++ b/thunder/tests/test_transformer_engine_executor.py
@@ -52,6 +52,16 @@
 recipes += [nvfp4_e2m1_recipe]
 recipe_ids += ["nvfp4_e2m1"]
 
+# Returns the estimated numerical error for a given dtype as per TE spec here:
+# https://github.com/NVIDIA/TransformerEngine/blob/7ad130efd52c3aa4a386d25f1d42b28d5aa20090/tests/pytorch/test_numerics.py#L155-L167
+def dtype_tolerance(dtype):
+    if dtype == torch.float32:
+        return dict(rtol=1.3e-6, atol=1e-5)
+    if dtype == torch.float16:
+        return dict(rtol=1e-3, atol=1e-5)
+    if dtype == torch.bfloat16:
+        return dict(rtol=1.6e-2, atol=1e-5)
+    raise ValueError(f"Unsupported dtype ({dtype})")
 
 @requiresCUDA
 @pytest.mark.parametrize("fp8_recipe", recipes, ids=recipe_ids)
@@ -69,6 +79,7 @@ def test_te_linear_forward_backward(fp8_recipe: recipe.Recipe):
     # Verify that `torch.nn.functional.linear` is replaced with `te_linear_*`
     # and the output as well as the gradients match for thunder compiled code.
     dtype = torch.bfloat16
+    tolerances = dtype_tolerance(dtype)
     device = "cuda"
 
     # TE inputs (3D input)
@@ -106,9 +117,9 @@ def fn(x, w1, w2):
     te_result.backward(grad_output)
     thunder_result.backward(grad_output)
 
-    assert_close(x.grad, x_te.grad)
-    assert_close(w1.grad, te_linear1.weight.grad)
-    assert_close(w2.grad, te_linear2.weight.grad)
+    assert_close(x.grad, x_te.grad, **tolerances)
+    assert_close(w1.grad, te_linear1.weight.grad, **tolerances)
+    assert_close(w2.grad, te_linear2.weight.grad, **tolerances)
 
     # Verifies te_linear was called
     forward_trace = thunder.last_traces(cfn)
@@ -143,6 +154,7 @@ def test_te_linear_forward_backward_multiple_iteration(fp8_recipe: recipe.Recipe
     # Since, the FP8 operations are stateful, we want to verify that
     # our output matches over multiple iterations (where state handling comes into picture)
     dtype = torch.bfloat16
+    tolerances = dtype_tolerance(dtype)
     device = "cuda"
     # Running more iterations leads to `nan` for both eager and thunder
     # with BlockScaling.
@@ -200,10 +212,10 @@ def thunder_model(x):
     train_model(thunder_model, thunder_sgd_optimizer)
 
     # Verify that the weights and biases converge to same value after few iterations.
-    assert_close(w1, te_linear1.weight)
-    assert_close(w2, te_linear2.weight)
-    assert_close(b1, te_linear1.bias)
-    assert_close(b2, te_linear2.bias)
+    assert_close(w1, te_linear1.weight, **tolerances)
+    assert_close(w2, te_linear2.weight, **tolerances)
+    assert_close(b1, te_linear1.bias, **tolerances)
+    assert_close(b2, te_linear2.bias, **tolerances)
 
 
 @requiresCUDA
@@ -220,6 +232,7 @@ def test_te_linear_forward_backward_multiple_iteration_multiple_recipes():
         pytest.skip("platform does not support two different recipes")
 
     dtype = torch.bfloat16
+    tolerances = dtype_tolerance(dtype)
     device = "cuda"
     # Running more iterations leads to `nan` for both eager and thunder
     # with BlockScaling.
@@ -278,10 +291,10 @@ def thunder_model(x, fp8_recipe):
     train_model(thunder_model, thunder_sgd_optimizer)
 
     # Verify that the weights and biases converge to same value after few iterations.
-    assert_close(w1, te_linear1.weight)
-    assert_close(w2, te_linear2.weight)
-    assert_close(b1, te_linear1.bias)
-    assert_close(b2, te_linear2.bias)
+    assert_close(w1, te_linear1.weight, **tolerances)
+    assert_close(w2, te_linear2.weight, **tolerances)
+    assert_close(b1, te_linear1.bias, **tolerances)
+    assert_close(b2, te_linear2.bias, **tolerances)
 
 
 @requiresCUDA
@@ -562,6 +575,7 @@ def test_te_activation_checkpointing_correctness(fp8_recipe: recipe.Recipe, comp
         pytest.skip(msg_nvfp4)
 
     dtype = torch.bfloat16
+    tolerances = dtype_tolerance(dtype)
     device = "cuda"
 
     iterations = 6
@@ -650,10 +664,10 @@ def thunder_model(x):
     for loss, te_loss in zip(thunder_loss_hist, te_loss_hist):
         assert_close(loss, te_loss)
 
-    assert_close(w1, te_linear1.weight)
-    assert_close(w2, te_linear2.weight)
-    assert_close(b1, te_linear1.bias)
-    assert_close(b2, te_linear2.bias)
+    assert_close(w1, te_linear1.weight, **tolerances)
+    assert_close(w2, te_linear2.weight, **tolerances)
+    assert_close(b1, te_linear1.bias, **tolerances)
+    assert_close(b2, te_linear2.bias, **tolerances)
 
     # TE does not expose the scales for MXFP8
     if fp8_recipe.delayed():

From 5b07c96e0f214ee7f8d9ed79e50c47ca7d3a34ea Mon Sep 17 00:00:00 2001
From: riccardofelluga <11768013+riccardofelluga@users.noreply.github.com>
Date: Tue, 14 Oct 2025 14:36:37 +0300
Subject: [PATCH 2/3] grouped dtype and wrapped assert_close

---
 .../tests/test_transformer_engine_executor.py | 69 ++++++++++++---------
 1 file changed, 38 insertions(+), 31 deletions(-)

diff --git a/thunder/tests/test_transformer_engine_executor.py b/thunder/tests/test_transformer_engine_executor.py
index aa256676e3..11564b8336 100644
--- a/thunder/tests/test_transformer_engine_executor.py
+++ b/thunder/tests/test_transformer_engine_executor.py
@@ -52,16 +52,23 @@
 recipes += [nvfp4_e2m1_recipe]
 recipe_ids += ["nvfp4_e2m1"]
 
+
 # Returns the estimated numerical error for a given dtype as per TE spec here:
 # https://github.com/NVIDIA/TransformerEngine/blob/7ad130efd52c3aa4a386d25f1d42b28d5aa20090/tests/pytorch/test_numerics.py#L155-L167
-def dtype_tolerance(dtype):
-    if dtype == torch.float32:
-        return dict(rtol=1.3e-6, atol=1e-5)
-    if dtype == torch.float16:
-        return dict(rtol=1e-3, atol=1e-5)
-    if dtype == torch.bfloat16:
-        return dict(rtol=1.6e-2, atol=1e-5)
-    raise ValueError(f"Unsupported dtype ({dtype})")
+def te_assert_close(actual, expected, **kwargs):
+    tolerances = {}
+
+    if actual.dtype == torch.float32:
+        tolerances = dict(rtol=1.3e-6, atol=1e-5)
+    if actual.dtype == torch.float16:
+        tolerances = dict(rtol=1e-3, atol=1e-5)
+    if actual.dtype == torch.bfloat16:
+        tolerances = dict(rtol=1.6e-2, atol=1e-5)
+
+    kwargs.update(tolerances)
+
+    assert_close(actual, expected, **kwargs)
+
 
 @requiresCUDA
 @pytest.mark.parametrize("fp8_recipe", recipes, ids=recipe_ids)
@@ -79,7 +86,7 @@ def test_te_linear_forward_backward(fp8_recipe: recipe.Recipe):
     # Verify that `torch.nn.functional.linear` is replaced with `te_linear_*`
     # and the output as well as the gradients match for thunder compiled code.
     dtype = torch.bfloat16
-    tolerances = dtype_tolerance(dtype)
+
     device = "cuda"
 
     # TE inputs (3D input)
@@ -111,15 +118,15 @@ def fn(x, w1, w2):
         te_result = te_linear2(inter_result + x_te)
 
     # Verifies the result is close to TE
-    assert_close(thunder_result, te_result)
+    te_assert_close(thunder_result, te_result)
 
     grad_output = torch.randn_like(te_result)
     te_result.backward(grad_output)
     thunder_result.backward(grad_output)
 
-    assert_close(x.grad, x_te.grad, **tolerances)
-    assert_close(w1.grad, te_linear1.weight.grad, **tolerances)
-    assert_close(w2.grad, te_linear2.weight.grad, **tolerances)
+    te_assert_close(x.grad, x_te.grad)
+    te_assert_close(w1.grad, te_linear1.weight.grad)
+    te_assert_close(w2.grad, te_linear2.weight.grad)
 
     # Verifies te_linear was called
     forward_trace = thunder.last_traces(cfn)
@@ -154,7 +161,7 @@ def test_te_linear_forward_backward_multiple_iteration(fp8_recipe: recipe.Recipe
     # Since, the FP8 operations are stateful, we want to verify that
     # our output matches over multiple iterations (where state handling comes into picture)
     dtype = torch.bfloat16
-    tolerances = dtype_tolerance(dtype)
+
     device = "cuda"
     # Running more iterations leads to `nan` for both eager and thunder
     # with BlockScaling.
@@ -212,10 +219,10 @@ def thunder_model(x):
     train_model(thunder_model, thunder_sgd_optimizer)
 
     # Verify that the weights and biases converge to same value after few iterations.
-    assert_close(w1, te_linear1.weight, **tolerances)
-    assert_close(w2, te_linear2.weight, **tolerances)
-    assert_close(b1, te_linear1.bias, **tolerances)
-    assert_close(b2, te_linear2.bias, **tolerances)
+    te_assert_close(w1, te_linear1.weight)
+    te_assert_close(w2, te_linear2.weight)
+    te_assert_close(b1, te_linear1.bias)
+    te_assert_close(b2, te_linear2.bias)
 
 
 @requiresCUDA
@@ -232,7 +239,7 @@ def test_te_linear_forward_backward_multiple_iteration_multiple_recipes():
        pytest.skip("platform does not support two different recipes")
 
     dtype = torch.bfloat16
-    tolerances = dtype_tolerance(dtype)
+
     device = "cuda"
     # Running more iterations leads to `nan` for both eager and thunder
     # with BlockScaling.
@@ -291,10 +298,10 @@ def thunder_model(x, fp8_recipe):
     train_model(thunder_model, thunder_sgd_optimizer)
 
     # Verify that the weights and biases converge to same value after few iterations.
-    assert_close(w1, te_linear1.weight, **tolerances)
-    assert_close(w2, te_linear2.weight, **tolerances)
-    assert_close(b1, te_linear1.bias, **tolerances)
-    assert_close(b2, te_linear2.bias, **tolerances)
+    te_assert_close(w1, te_linear1.weight)
+    te_assert_close(w2, te_linear2.weight)
+    te_assert_close(b1, te_linear1.bias)
+    te_assert_close(b2, te_linear2.bias)
 
 
 @requiresCUDA
@@ -575,7 +582,7 @@ def test_te_activation_checkpointing_correctness(fp8_recipe: recipe.Recipe, comp
        pytest.skip(msg_nvfp4)
 
     dtype = torch.bfloat16
-    tolerances = dtype_tolerance(dtype)
+
     device = "cuda"
 
     iterations = 6
@@ -662,12 +669,12 @@ def thunder_model(x):
     train_model(thunder_model, thunder_sgd_optimizer, thunder_loss_hist)
 
     for loss, te_loss in zip(thunder_loss_hist, te_loss_hist):
-        assert_close(loss, te_loss)
+        te_assert_close(loss, te_loss)
 
-    assert_close(w1, te_linear1.weight, **tolerances)
-    assert_close(w2, te_linear2.weight, **tolerances)
-    assert_close(b1, te_linear1.bias, **tolerances)
-    assert_close(b2, te_linear2.bias, **tolerances)
+    te_assert_close(w1, te_linear1.weight)
+    te_assert_close(w2, te_linear2.weight)
+    te_assert_close(b1, te_linear1.bias)
+    te_assert_close(b2, te_linear2.bias)
 
     # TE does not expose the scales for MXFP8
     if fp8_recipe.delayed():
@@ -701,8 +708,8 @@ def thunder_model(x):
 
         # check the scales are the same but for last dimension which is always on in TE
         for te_scale, th_scale in zip(te_scales, th_scales):
-            assert_close(te_scale[:-1], th_scale)
+            te_assert_close(te_scale[:-1], th_scale)
 
         # check that amax history is the same as TE
         for te_amax, th_amax in zip(te_amax_hist, th_amax_hist):
-            assert_close(te_amax[:, :-1], th_amax)
+            te_assert_close(te_amax[:, :-1], th_amax)

From ae5ab96efe0de2e84cc3830889227ea6d0c3508f Mon Sep 17 00:00:00 2001
From: riccardofelluga <11768013+riccardofelluga@users.noreply.github.com>
Date: Tue, 14 Oct 2025 14:39:15 +0300
Subject: [PATCH 3/3] include float scalar input

---
 thunder/tests/test_transformer_engine_executor.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/thunder/tests/test_transformer_engine_executor.py b/thunder/tests/test_transformer_engine_executor.py
index 11564b8336..b8a7f9cb9e 100644
--- a/thunder/tests/test_transformer_engine_executor.py
+++ b/thunder/tests/test_transformer_engine_executor.py
@@ -58,11 +58,13 @@ def te_assert_close(actual, expected, **kwargs):
     tolerances = {}
 
-    if actual.dtype == torch.float32:
+    if not isinstance(actual, torch.Tensor) and isinstance(actual, float):
         tolerances = dict(rtol=1.3e-6, atol=1e-5)
-    if actual.dtype == torch.float16:
+    elif actual.dtype == torch.float32:
+        tolerances = dict(rtol=1.3e-6, atol=1e-5)
+    elif actual.dtype == torch.float16:
         tolerances = dict(rtol=1e-3, atol=1e-5)
-    if actual.dtype == torch.bfloat16:
+    elif actual.dtype == torch.bfloat16:
         tolerances = dict(rtol=1.6e-2, atol=1e-5)
 
     kwargs.update(tolerances)