
Commit 5c059bf

Store NVFP4 block scales in swizzled layout on tensor

stack-info: PR: #2438, branch: drisspg/stack/80

1 parent: faf788a
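For context: the "swizzled" layout is the tiled block-scale layout that block-scaled GEMMs consume (cuBLAS-style 128x4 scale tiles). Below is a minimal sketch of the roundtrip the new tests exercise; the padding-to-whole-tiles detail is an assumption about to_blocked's behavior, not something stated in this diff.

import torch
from torchao.prototype.mx_formats.utils import from_blocked, to_blocked

# NVFP4 keeps one scale per 16-element block, so a (32, 64) tensor has a
# (32, 64 // 16) = (32, 4) scale matrix.
scales = torch.randn(32, 4)

# to_blocked rearranges into the swizzled layout (assumed here to pad up to
# whole 128x4 tiles first); from_blocked inverts it given the original dims.
swizzled = to_blocked(scales, use_triton_kernel=False)
roundtrip = from_blocked(swizzled, *scales.shape)
torch.testing.assert_close(scales, roundtrip)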

File tree

4 files changed, +313 -20 lines changed

test/prototype/mx_formats/test_mx_tensor.py

Lines changed: 193 additions & 0 deletions

@@ -657,3 +657,196 @@ def assert_sqnr_gt_threshold(orig, new, threshold):
     assert x.t().dtype == x_reconstructed_t.dtype, (
         f"Transpose dtype mismatch: {x.t().dtype} vs {x_reconstructed_t.dtype}"
     )
+
+
+@pytest.mark.parametrize(
+    "shape",
+    [
+        (128, 4),
+        (256, 8),
+        (100, 3),
+        (4, 4),
+        (50, 10),
+        (384, 12),
+    ],
+)
+@pytest.mark.parametrize("use_triton_kernel", [False, True])
+@pytest.mark.skipif(
+    not TORCH_VERSION_AT_LEAST_2_8, reason="torch.compile requires PyTorch 2.8+"
+)
+def test_to_blocked_from_blocked_roundtrip(shape, use_triton_kernel: bool):
+    """
+    Test that to_blocked and from_blocked are proper inverses of each other
+    for various input shapes that may require padding.
+    """
+    from torchao.prototype.mx_formats.utils import from_blocked, to_blocked
+
+    rows, cols = shape
+
+    # Use CUDA if available, otherwise CPU
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+
+    # Test with random data
+    original = torch.randn(rows, cols, device=device, dtype=torch.float32)
+
+    # Test both triton and PyTorch implementations
+    # (triton is only exercised on torch 2.8+ per the skipif above)
+    blocked = to_blocked(original, use_triton_kernel=use_triton_kernel)
+    reconstructed = from_blocked(blocked, rows, cols)
+
+    torch.testing.assert_close(
+        original,
+        reconstructed,
+        atol=1e-6,
+        rtol=1e-6,
+        msg=f"Roundtrip failed for shape {shape} with use_triton_kernel={use_triton_kernel}",
+    )
+
+    ones = torch.ones(rows, cols, device=device, dtype=torch.float32)
+    blocked_ones = to_blocked(ones, use_triton_kernel=False)
+    reconstructed_ones = from_blocked(blocked_ones, rows, cols)
+    torch.testing.assert_close(ones, reconstructed_ones, atol=1e-6, rtol=1e-6)
+
+
+@pytest.mark.parametrize("store_swizzled", [False, True])
+@pytest.mark.parametrize(
+    "shape",
+    [
+        (32, 64),
+        (16, 32),
+        (64, 128),
+        (384, 128),
+    ],
+)
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+def test_nvfp4_swizzled_scales_construction(store_swizzled, shape):
+    """
+    Test that NVFP4Tensor can be constructed with swizzled scales and
+    that the _swizzled_scales flag is set correctly.
+    """
+    from torchao.prototype.mx_formats.nvfp4_tensor import NVFP4Tensor
+
+    M, K = shape
+    data = torch.randn(M, K, device="cuda", dtype=torch.bfloat16)
+
+    # Create tensor with the specified scale storage layout
+    tensor = NVFP4Tensor.to_nvfp4(data, store_swizzled=store_swizzled)
+
+    # Verify the flag is set correctly
+    assert tensor._swizzled_scales == store_swizzled
+
+    # Verify the tensor can be dequantized correctly
+    reconstructed = tensor.to_dtype(torch.bfloat16)
+    assert reconstructed.shape == data.shape
+
+
+@pytest.mark.parametrize(
+    "slice_dim,slice_spec",
+    [
+        pytest.param(0, slice(0, 16), id="slice_rows[0:16]"),
+        pytest.param(0, slice(8, 24), id="slice_rows[8:24]"),
+        pytest.param(1, slice(0, 32), id="slice_cols[0:32]"),
+        pytest.param(1, slice(16, 48), id="slice_cols[16:48]"),
+    ],
+)
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+def test_nvfp4_swizzled_scales_slicing(slice_dim, slice_spec):
+    """
+    Test that slicing works correctly with swizzled scales and maintains
+    the swizzled state in the output tensor.
+    """
+    from torchao.prototype.mx_formats.nvfp4_tensor import NVFP4Tensor
+
+    M, K = 32, 64
+    data = torch.randn(M, K, device="cuda", dtype=torch.bfloat16)
+
+    # Create tensor with swizzled scales
+    tensor = NVFP4Tensor.to_nvfp4(data, store_swizzled=True)
+    assert tensor._swizzled_scales
+
+    # Perform slice operation
+    if slice_dim == 0:
+        sliced_tensor = tensor[slice_spec, :]
+    else:
+        sliced_tensor = tensor[:, slice_spec]
+
+    # Verify sliced tensor maintains swizzled state
+    assert sliced_tensor._swizzled_scales
+
+    # Verify sliced tensor can be dequantized
+    sliced_reconstructed = sliced_tensor.to_dtype(torch.bfloat16)
+
+    # Compare with the same slice of the dequantized original
+    original_reconstructed = tensor.to_dtype(torch.bfloat16)
+    if slice_dim == 0:
+        expected = original_reconstructed[slice_spec, :]
+    else:
+        expected = original_reconstructed[:, slice_spec]
+
+    torch.testing.assert_close(sliced_reconstructed, expected, atol=1e-6, rtol=1e-6)
+
+
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+def test_nvfp4_swizzled_scales_serialization():
+    """
+    Test that tensor flatten/unflatten preserves the swizzled scales state.
+    """
+    from torchao.prototype.mx_formats.nvfp4_tensor import NVFP4Tensor
+
+    M, K = 32, 64
+    data = torch.randn(M, K, device="cuda", dtype=torch.bfloat16)
+
+    # Create tensor with swizzled scales
+    original_tensor = NVFP4Tensor.to_nvfp4(data, store_swizzled=True)
+
+    # Test serialization
+    tensor_list, ctx = original_tensor.__tensor_flatten__()
+
+    # Verify swizzled flag is preserved in context
+    assert "_swizzled_scales" in ctx
+    assert ctx["_swizzled_scales"]
+
+    # Test deserialization
+    inner_tensors = {}
+    for name in tensor_list:
+        inner_tensors[name] = getattr(original_tensor, name)
+
+    reconstructed_tensor = NVFP4Tensor.__tensor_unflatten__(
+        inner_tensors, ctx, None, None
+    )
+
+    # Verify the swizzled state is preserved
+    assert reconstructed_tensor._swizzled_scales
+
+    # Verify functionality is preserved
+    original_dq = original_tensor.to_dtype(torch.bfloat16)
+    reconstructed_dq = reconstructed_tensor.to_dtype(torch.bfloat16)
+
+    torch.testing.assert_close(original_dq, reconstructed_dq, atol=1e-6, rtol=1e-6)
+
+
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+def test_nvfp4_swizzled_scales_get_scales_method():
+    """
+    Test that get_hp_scales() correctly unswizzles scales when needed.
+    """
+    from torchao.prototype.mx_formats.nvfp4_tensor import NVFP4Tensor
+
+    M, K = 32, 64
+    data = torch.randn(M, K, device="cuda", dtype=torch.bfloat16)
+
+    # Create tensors with both storage layouts
+    regular_tensor = NVFP4Tensor.to_nvfp4(data, store_swizzled=False)
+    swizzled_tensor = NVFP4Tensor.to_nvfp4(data, store_swizzled=True)
+
+    # Get scales from both tensors
+    regular_scales = regular_tensor.get_hp_scales()
+    swizzled_scales = swizzled_tensor.get_hp_scales()
+
+    # Scales should be equivalent (within quantization error)
+    torch.testing.assert_close(regular_scales, swizzled_scales, atol=1e-6, rtol=1e-6)
+
+    # Verify scales have the expected shape: one scale per 16-element block
+    expected_shape = (M, K // 16)
+    assert regular_scales.shape == expected_shape
+    assert swizzled_scales.shape == expected_shape
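A quick usage sketch (not part of the diff) of the property these tests pin down: both storage layouts should dequantize to the same values, since only how the scales are laid out on the tensor differs.

import torch
from torchao.prototype.mx_formats.nvfp4_tensor import NVFP4Tensor

x = torch.randn(32, 64, device="cuda", dtype=torch.bfloat16)

t_linear = NVFP4Tensor.to_nvfp4(x, store_swizzled=False)
t_swizzled = NVFP4Tensor.to_nvfp4(x, store_swizzled=True)

# Same logical values either way; only the scale storage layout differs.
torch.testing.assert_close(
    t_linear.to_dtype(torch.bfloat16), t_swizzled.to_dtype(torch.bfloat16)
)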

torchao/prototype/mx_formats/mx_subclass.py

Lines changed: 6 additions & 1 deletion

@@ -184,6 +184,11 @@ def _nvfp4_inference_linear_transform(
 
     weight = module.weight
 
+    if weight.shape[0] % 16 != 0 or weight.shape[1] % 16 != 0:
+        raise RuntimeError(
+            f"NVFP4 only supports weight shape divisible by 16, got {weight.shape}"
+        )
+
     if module.bias is not None and weight.dtype == torch.float32:
         raise RuntimeError(
             "Bias is not supported when module weight is in fp32 (out_dtype=Float32). "
@@ -193,8 +198,8 @@ def _nvfp4_inference_linear_transform(
     quantized_weight = NVFP4Tensor.to_nvfp4(
        weight,
        mm_config=config.mm_config,
+       store_swizzled=True,
    )
-
    module.weight = torch.nn.Parameter(quantized_weight, requires_grad=False)
    module.extra_repr = types.MethodType(_linear_extra_repr, module)
    return module
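A hedged sketch of what the new guard means for callers; the shapes are illustrative and the error text comes from the diff above.

import torch
from torchao.prototype.mx_formats.nvfp4_tensor import NVFP4Tensor

# Weights whose dims are both divisible by 16 quantize fine, and the
# inference transform above now stores their scales swizzled:
w_ok = torch.randn(64, 128, device="cuda", dtype=torch.bfloat16)
qw = NVFP4Tensor.to_nvfp4(w_ok, store_swizzled=True)
assert qw._swizzled_scales

# A module whose weight is e.g. (100, 3) is now rejected up front:
# RuntimeError: NVFP4 only supports weight shape divisible by 16, got torch.Size([100, 3])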
