
Commit 52dbbc6

cccclai authored and facebook-github-bot committed
1 parent: e4f2715

Add inplace quantizer examples (pytorch#2345)

Summary: Pull Request resolved: pytorch#2345. Adds a quantizer example for in-place ops, and patches the constant-fold pass so that mutable buffers are not folded.

Differential Revision: D76312488
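For context before the diffs, here is a minimal sketch of the flow this commit exercises, condensed from the new test below. `quantizer` stands in for a `Quantizer` implementation such as the `TestQuantizer` defined in the test (it is not constructed here); everything else uses the same APIs the test calls.

```python
import torch
from torch.export import export_for_training
from torchao.quantization.pt2e.quantize_pt2e import convert_pt2e, prepare_pt2e

class M(torch.nn.Module):
    def __init__(self):
        super().__init__()
        # Mutable buffer: forward() updates it in place.
        self.register_buffer("buf", torch.randn(1, 2, 3, 3))

    def forward(self, x):
        self.buf.add_(x)  # in-place op on the buffer
        return self.buf

example_inputs = (torch.randn(1, 2, 3, 3),)
m = export_for_training(M().eval(), example_inputs, strict=True).module()
m = prepare_pt2e(m, quantizer)  # `quantizer` as described above (not defined here)
m(*example_inputs)              # calibrate observers
m = convert_pt2e(m, fold_quantize=True)  # the buffer must survive constant folding
```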

File tree

2 files changed: +118 -14 lines

test/quantization/pt2e/test_quantize_pt2e.py

Lines changed: 93 additions & 13 deletions
```diff
@@ -11,32 +11,32 @@
 import unittest
 
 import torch
+
+import torchao
 from torch import Tensor
 from torch.ao.quantization import QConfigMapping
 from torch.ao.quantization.qconfig import (
-    QConfig,
     default_per_channel_symmetric_qnnpack_qconfig,
     per_channel_weight_observer_range_neg_127_to_127,
+    QConfig,
     weight_observer_range_neg_127_to_127,
 )
+from torch.export import ExportedProgram
 from torch.fx import Node
+from torch.fx.graph_module import GraphModule
 from torch.testing._internal.common_quantization import (
     NodeSpec as ns,
-)
-from torch.testing._internal.common_quantization import (
-    TestHelperModules,
     skipIfNoQNNPACK,
+    TestHelperModules,
 )
 from torch.testing._internal.common_utils import (
-    TEST_CUDA,
-    TemporaryFileName,
     instantiate_parametrized_tests,
     parametrize,
     run_tests,
+    TemporaryFileName,
+    TEST_CUDA,
 )
-
-import torchao
-from torchao.quantization.pt2e import ObserverOrFakeQuantize, observer
+from torchao.quantization.pt2e import observer, ObserverOrFakeQuantize
 from torchao.quantization.pt2e.quantize_pt2e import (
     convert_pt2e,
     prepare_pt2e,
@@ -58,8 +58,8 @@
     EmbeddingQuantizer,
 )
 from torchao.testing.pt2e._xnnpack_quantizer import (
-    XNNPACKQuantizer,
     get_symmetric_quantization_config,
+    XNNPACKQuantizer,
 )
 from torchao.testing.pt2e._xnnpack_quantizer_utils import (
     OP_TO_ANNOTATOR,
@@ -75,9 +75,7 @@
 DEVICE_LIST = ["cpu"] + (["cuda"] if TEST_CUDA else [])
 
 if TORCH_VERSION_AT_LEAST_2_7:
-    from torch.testing._internal.common_utils import (
-        TEST_HPU,
-    )
+    from torch.testing._internal.common_utils import TEST_HPU
 
     DEVICE_LIST += ["hpu"] if TEST_HPU else []
 
@@ -2826,6 +2824,88 @@ def check_nn_module(node):
         if node.name == "mul":
             check_nn_module(node)
 
+    def test_quantize_in_place_ops(self):
+        class TestQuantizer(Quantizer):
+            example_inputs = None
+
+            def set_example_inputs(self, example_inputs):
+                self.example_inputs = example_inputs
+
+            def transform_for_annotation(
+                self, model: torch.fx.GraphModule
+            ) -> torch.fx.GraphModule:
+                # Make a copy of the graph to ensure that we are using the
+                # return value of this function.
+                ep = torch.export.export(model, self.example_inputs)
+                ep = ep.run_decompositions({})
+                return ep.module()
+
+            def annotate(self, model: torch.fx.GraphModule) -> torch.fx.GraphModule:
+                act_qspec = QuantizationSpec(
+                    dtype=torch.uint8,
+                    quant_min=0,
+                    quant_max=255,
+                    qscheme=torch.per_tensor_affine,
+                    is_dynamic=False,
+                    observer_or_fake_quant_ctr=observer.default_observer,
+                )
+                for node in model.graph.nodes:
+                    if (
+                        node.op == "call_function"
+                        and node.target == torch.ops.aten.add.Tensor
+                    ):
+                        input_act0 = node.args[0]
+                        assert isinstance(input_act0, torch.fx.Node)
+                        input_act1 = node.args[1]
+                        assert isinstance(input_act1, torch.fx.Node)
+                        node.meta["quantization_annotation"] = QuantizationAnnotation(
+                            input_qspec_map={
+                                input_act0: act_qspec,
+                                input_act1: act_qspec,
+                            },
+                            output_qspec=act_qspec,
+                            _annotated=True,
+                        )
+
+            def validate(self, model: torch.fx.GraphModule) -> None:
+                pass
+
+        class M(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.register_buffer("buf", torch.randn(1, 2, 3, 3))
+
+            def forward(self, x):
+                self.buf.add_(x)
+                return self.buf
+
+        def has_inplace_ops(graph_module: GraphModule) -> bool:
+            return any(
+                n.op == "call_function"
+                and n.name.endswith("_")
+                and n.name != "copy_"
+                for n in graph_module.graph.nodes
+            )
+
+        m = M().eval()
+        quantizer = TestQuantizer()
+        example_inputs = (torch.randn(1, 2, 3, 3),)
+        quantizer.set_example_inputs(example_inputs)
+        m = export_for_training(m, example_inputs, strict=True).module()
+        # Check that the model has in-place ops
+        self.assertTrue(has_inplace_ops(m))
+        m = prepare_pt2e(m, quantizer)
+        # The graph is functionalized during transform_for_annotation, so the
+        # prepared model should no longer contain in-place ops
+        self.assertFalse(has_inplace_ops(m))
+        m(*example_inputs)
+        m = convert_pt2e(m, fold_quantize=True)
+        for node in m.graph.nodes:
+            if node.name == "quantize_per_tensor_default":
+                # Ensure the quant node was not folded into the mutable buffer
+                self.assertTrue(node.op == "call_function")
+
+        # Verify the quantized model runs
+        result = m(*example_inputs)
+        self.assertIsNotNone(result)
+
 
 @skipIfNoQNNPACK
 @unittest.skipIf(not TORCH_VERSION_AT_LEAST_2_7, "Requires torch 2.7+")
```
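A note on the assertions above (context, not part of the diff): export functionalizes the graph, rewriting the in-place `add_` as a functional `aten.add.Tensor` followed by an `aten.copy_` that writes the result back into the buffer. That is why `has_inplace_ops` deliberately ignores `copy_`, and why the constant-fold patch below uses `copy_` ops to locate mutable buffers. Roughly, with `m` as in the test (node names and layout illustrative, not the exact printout):

```python
# After export_for_training(...).module(), the traced graph looks roughly like:
#
#   buf   = get_attr[target=buf]                          # the mutable buffer
#   x     = placeholder[target=x]
#   add   = call_function[target=aten.add.Tensor](buf, x)
#   copy_ = call_function[target=aten.copy_.default](buf, add)
#   output(add)
#
# Inspect the actual nodes with:
m.graph.print_tabular()
```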

torchao/quantization/pt2e/constant_fold.py

Lines changed: 25 additions & 1 deletion
```diff
@@ -92,6 +92,24 @@ def __init__(
         self.lifted_constant_names = lifted_constant_names
         self.deferred_value = object()
         self.skip_folding_node_fn = skip_folding_node_fn
+
+        # Identify mutable buffers by finding copy_ operations
+        self.mutable_buffers = self._find_mutable_buffers()
+
+    def _find_mutable_buffers(self) -> set[torch.fx.Node]:
+        """Find mutable buffers by identifying copy_ operations.
+        The first argument of a copy_ op is the mutable buffer."""
+        mutable_buffers = set()
+        for node in self.module.graph.nodes:
+            if (
+                node.op == "call_function"
+                and hasattr(node.target, "_schema")
+                and "copy_" in str(node.target)
+            ):
+                # The first argument of copy_ is the mutable buffer
+                if len(node.args) > 0 and isinstance(node.args[0], torch.fx.Node):
+                    mutable_buffers.add(node.args[0])
+        return mutable_buffers
 
     def _support_dynamic_shape(self) -> bool:
         # ConstantFolder does not support dynamic shape yet
@@ -156,6 +174,13 @@ def is_woq_int8_pattern(node: torch.fx.node.Node) -> bool:
             # We only fold fp32_weight -> q int8_weight,
             # and leave dq in the graph to be fused
             return True
+
+        # Check whether any input to this node is a mutable buffer; if so, prevent
+        # constant folding to avoid freezing the input of quantize_per_tensor_default.
+        for arg in node.args:
+            if isinstance(arg, torch.fx.Node) and arg in self.mutable_buffers:
+                return True
+
         return False
 
     def node_to_last_non_output_use(self) -> dict[torch.fx.Node, list[torch.fx.Node]]:
@@ -261,7 +286,6 @@ def set_env(arg: torch.fx.Node) -> None:
 
         if self.is_impure(node):
             return self.unknown_value
-
         self.add_node_replacement(node, out)
 
         flattened_node_inps = pytree.arg_tree_leaves(*node.args, **node.kwargs)
```
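Taken on its own, the new guard marks any node that reads a mutable buffer as impure, so the folder leaves it in the graph instead of freezing a runtime-varying value (such as `quantize_per_tensor_default` applied to the buffer) into a constant. A standalone sketch of the same two-step logic, assuming a functionalized FX `GraphModule` named `gm`:

```python
import torch

def find_mutable_buffers(gm: torch.fx.GraphModule) -> set[torch.fx.Node]:
    """Mirrors _find_mutable_buffers: after functionalization, a buffer that
    is mutated in place appears as the first argument of a copy_ op."""
    mutable: set[torch.fx.Node] = set()
    for node in gm.graph.nodes:
        if (
            node.op == "call_function"
            and hasattr(node.target, "_schema")
            and "copy_" in str(node.target)
        ):
            if node.args and isinstance(node.args[0], torch.fx.Node):
                mutable.add(node.args[0])
    return mutable

def reads_mutable_buffer(node: torch.fx.Node, mutable: set) -> bool:
    """Mirrors the new is_impure check: a node fed by a mutable buffer
    must not be constant-folded."""
    return any(isinstance(a, torch.fx.Node) and a in mutable for a in node.args)
```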
