Arm backend: Added decomposition for MaxPool2d with dilation > 1. (#11724)

wwwind · web-flow · commit 6af28c97c093 · 2025-06-17T09:19:35.000+02:00
Arm backend: Added decomposition for MaxPool2D operator with dilation > 1

Signed-off-by: Elena Zhelezina <elena.zhelezina@arm.com>
diff --git a/backends/arm/_passes/__init__.py b/backends/arm/_passes/__init__.py
@@ -29,6 +29,7 @@
 from .decompose_leaky_relu_pass import DecomposeLeakyReLUPass  # noqa
 from .decompose_linalg_vector_norm_pass import DecomposeLinearVectorNormPass  # noqa
 from .decompose_linear_pass import DecomposeLinearPass  # noqa
+from .decompose_maxpool2d_with_dilation import DecomposeMaxPool2DPass  # noqa
 from .decompose_meandim_pass import DecomposeMeanDimPass  # noqa
 from .decompose_ne_pass import DecomposeNotEqualPass  # noqa
 from .decompose_select import DecomposeSelectPass  # noqa
diff --git a/backends/arm/_passes/arm_pass_manager.py b/backends/arm/_passes/arm_pass_manager.py
@@ -32,6 +32,7 @@
     DecomposeLeakyReLUPass,
     DecomposeLinearPass,
     DecomposeLinearVectorNormPass,
+    DecomposeMaxPool2DPass,
     DecomposeMeanDimPass,
     DecomposeNotEqualPass,
     DecomposeSelectPass,
@@ -123,6 +124,7 @@ def _tosa_080_BI_pipeline(self, exported_program: ExportedProgram) -> GraphModul
         self.add_pass(CastInt64BuffersToInt32Pass(exported_program))
         self.add_pass(DecomposeSumPass())
         self.add_pass(Conv1dUnsqueezePass())
+        self.add_pass(DecomposeMaxPool2DPass())
         self.add_pass(DecomposeSelectPass())
         self.add_pass(ConvertSqueezesToViewPass())
 
@@ -179,6 +181,7 @@ def _tosa_080_MI_pipeline(self, exported_program: ExportedProgram) -> GraphModul
         self.add_pass(CastInt64BuffersToInt32Pass(exported_program))
         self.add_pass(DecomposeSumPass())
         self.add_pass(Conv1dUnsqueezePass())
+        self.add_pass(DecomposeMaxPool2DPass())
         self.add_pass(DecomposeSelectPass())
         self.add_pass(ConvertSqueezesToViewPass())
 
diff --git a/backends/arm/_passes/decompose_maxpool2d_with_dilation.py b/backends/arm/_passes/decompose_maxpool2d_with_dilation.py
@@ -0,0 +1,199 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-unsafe
+
+import operator
+
+from executorch.backends.arm._passes import ArmPass
+from executorch.exir.dialects._ops import ops as exir_ops
+
+# We'll decompose only the EXIR edge max_pool2d ops when dilation > 1
+EDGE_MAXPOOL2D = (
+    exir_ops.edge.aten.max_pool2d.default,
+    exir_ops.edge.aten.max_pool2d_with_indices.default,
+)
+
+
+class DecomposeMaxPool2DPass(ArmPass):
+    """
+    Decompose dilated max_pool2d (EXIR edge ops) into
+    pad -> space-to-batch -> max_pool2d (no dilation) -> batch-to-space -> crop.
+
+    Taking every ``dilation``-th element of the padded input splits it into
+    ``d_h * d_w`` interleaved sub-grids ("phases"). Within one phase the taps
+    of a dilated window are adjacent, so an undilated, stride-1 pool over each
+    phase gives - once the phases are interleaved again - the dilated pool at
+    every window position. The requested stride is applied last by keeping
+    every ``stride``-th position.
+
+    Ops with dilation == 1 are re-emitted with trimmed arguments only.
+    ceil_mode is never read by this pass; output sizes assume floor mode.
+    """
+
+    @staticmethod
+    def _pair(value, default=None):
+        """
+        Expand an int, [v] or [h, w] argument to an (h, w) tuple.
+
+        An empty sequence yields ``default`` when one is given (the aten schema
+        declares ``stride=[]``, meaning "same as kernel_size").
+        """
+        if isinstance(value, int):
+            return value, value
+        values = tuple(value)
+        if not values and default is not None:
+            return default
+        if len(values) == 1:
+            return values[0], values[0]
+        h, w = values
+        return h, w
+
+    @staticmethod
+    def _lowest_value(dtype):
+        """Return the smallest value of ``dtype``: the neutral element of max."""
+        if dtype.is_floating_point:
+            return float("-inf")
+        import torch  # local import: only needed for integer inputs
+
+        return torch.iinfo(dtype).min
+
+    @staticmethod
+    def _plan_axis(size, k, s, p, d):
+        """
+        Size one spatial axis of the decomposition.
+
+        Returns ``(n_out, packed, pad_after)``: the floor-mode output length of
+        the dilated pool, the per-phase length after space-to-batch, and the
+        trailing padding that makes ``p + size + pad_after == packed * d``.
+        """
+        padded = size + 2 * p
+        n_out = (padded - d * (k - 1) - 1) // s + 1
+        # After batch-to-space the stride-1 result has (packed - k + 1) * d
+        # positions. It needs at least n_out * s of them so that it can be
+        # viewed as (n_out, s) when the stride is applied.
+        packed = max(-(-padded // d), -(-(n_out * s) // d) + k - 1)
+        return n_out, packed, packed * d - size - p
+
+    def _emit_op(self, op, args, meta):
+        """Insert ``op(*args)`` through the parent pass, bypassing this override."""
+        return super().call_operator(op, args, {}, meta)
+
+    def _batch_to_space(self, t, batch, pooled_hw, out_hw, dil, strides, meta):
+        """
+        Interleave the phases of ``t`` again, crop and apply the stride.
+
+        ``t`` has shape (d_h * d_w * N, C, *pooled_hw); the result has shape
+        (N, C, *out_hw). ``batch`` is (N, C).
+        """
+        view = exir_ops.edge.aten.view_copy.default
+        permute = exir_ops.edge.aten.permute_copy.default
+        slice_ = exir_ops.edge.aten.slice_copy.Tensor
+        n, c = batch
+        (h_p, w_p), (h_o, w_o) = pooled_hw, out_hw
+        (d_h, d_w), (s_h, s_w) = dil, strides
+
+        t = self._emit_op(view, (t, [d_h, d_w, n, c, h_p, w_p]), meta)
+        t = self._emit_op(permute, (t, [2, 3, 4, 0, 5, 1]), meta)
+        t = self._emit_op(view, (t, [n, c, h_p * d_h, w_p * d_w]), meta)
+
+        # Drop the positions that only exist because of the rounding padding.
+        t = self._emit_op(slice_, (t, 2, 0, h_o * s_h), meta)
+        t = self._emit_op(slice_, (t, 3, 0, w_o * s_w), meta)
+        if s_h == 1 and s_w == 1:
+            return t
+
+        # Keep every stride-th position: split each axis into (n_out, stride)
+        # and take element 0 of the stride axis (only unit-step slices used).
+        t = self._emit_op(view, (t, [n, c, h_o, s_h, w_o, s_w]), meta)
+        t = self._emit_op(slice_, (t, 3, 0, 1), meta)
+        t = self._emit_op(slice_, (t, 5, 0, 1), meta)
+        return self._emit_op(view, (t, [n, c, h_o, w_o]), meta)
+
+    def call_operator(self, op, args, kwargs, meta):
+        """
+        Rewrite an edge max_pool2d op whose dilation is not 1.
+
+        Args:
+            op: Operator being visited; anything outside EDGE_MAXPOOL2D is
+                forwarded to the parent pass untouched.
+            args: (x, kernel_size[, stride[, padding[, dilation[, ...]]]]).
+            kwargs: Forwarded for non-max_pool2d ops, otherwise unused.
+            meta: Node metadata reused for every emitted op.
+
+        Returns:
+            The pooled value, or ``(values, indices)`` for the
+            max_pool2d_with_indices variant.
+        """
+        # Only intercept EXIR edge max_pool2d ops
+        if op not in EDGE_MAXPOOL2D:
+            return super().call_operator(op, args, kwargs, meta)
+
+        is_with_indices = op is exir_ops.edge.aten.max_pool2d_with_indices.default
+
+        # Normalize missing trailing args to their schema defaults
+        x = args[0]
+        kernel_size = args[1]
+        stride = args[2] if len(args) >= 3 else []
+        padding = args[3] if len(args) >= 4 else 0
+        dilation = args[4] if len(args) >= 5 else 1
+
+        k_h, k_w = self._pair(kernel_size)
+        s_h, s_w = self._pair(stride, default=(k_h, k_w))
+        pad_h, pad_w = self._pair(padding, default=(0, 0))
+        d_h, d_w = self._pair(dilation, default=(1, 1))
+
+        # If no dilation: call EXIR edge op with only supported args (x, kernel, stride[, padding])
+        if d_h == 1 and d_w == 1:
+            stride_given = isinstance(stride, int) or len(stride) > 0
+            minimal_args = [x, kernel_size, stride if stride_given else (s_h, s_w)]
+            # only include padding if non-zero
+            if (pad_h, pad_w) != (0, 0):
+                minimal_args.append((pad_h, pad_w))
+            return super().call_operator(op, tuple(minimal_args), {}, meta)
+
+        N, C, H, W = x.data.size()
+        H_o, H_pack, ph_after = self._plan_axis(H, k_h, s_h, pad_h, d_h)
+        W_o, W_pack, pw_after = self._plan_axis(W, k_w, s_w, pad_w, d_w)
+
+        # 1) Pad with the lowest value of the dtype: max_pool2d pads with
+        #    -inf, so padding must never win over a (negative) input element.
+        x_pad = self._emit_op(
+            exir_ops.edge.aten.constant_pad_nd.default,
+            (
+                x,
+                [pad_w, pw_after, pad_h, ph_after, 0, 0, 0, 0],
+                self._lowest_value(x.data.dtype),
+            ),
+            meta,
+        )
+
+        # 2) Space-to-batch: move the d_h * d_w phases into the batch dim
+        view = exir_ops.edge.aten.view_copy.default
+        x2 = self._emit_op(view, (x_pad, [N, C, H_pack, d_h, W_pack, d_w]), meta)
+        x2 = self._emit_op(
+            exir_ops.edge.aten.permute_copy.default, (x2, [3, 5, 0, 1, 2, 4]), meta
+        )
+        x2 = self._emit_op(view, (x2, [N * d_h * d_w, C, H_pack, W_pack]), meta)
+
+        # 3) Core pooling on the packed tensor. Stride 1 evaluates every
+        #    window position; the requested stride is applied in step 4.
+        pool_out = self._emit_op(op, (x2, (k_h, k_w), (1, 1), (0, 0)), meta)
+        pooled_hw = (H_pack - k_h + 1, W_pack - k_w + 1)
+
+        # 4) Batch-to-space, crop and stride
+        geometry = ((N, C), pooled_hw, (H_o, W_o), (d_h, d_w), (s_h, s_w))
+        if not is_with_indices:
+            return self._batch_to_space(pool_out, *geometry, meta)
+
+        values = self._emit_op(operator.getitem, (pool_out, 0), meta)
+        indices = self._emit_op(operator.getitem, (pool_out, 1), meta)
+        # NOTE(review): the indices come from pooling the packed tensor and are
+        # only rearranged here, not remapped to positions in the original
+        # input - confirm that no consumer depends on their values.
+        return (
+            self._batch_to_space(values, *geometry, meta),
+            self._batch_to_space(indices, *geometry, meta),
+        )
diff --git a/backends/arm/test/ops/test_max_pool.py b/backends/arm/test/ops/test_max_pool.py
@@ -19,7 +19,6 @@
     TosaPipelineMI,
 )
 
-
 test_data_suite = {
     # (test_name, test_data, [kernel_size, stride, padding])
     "zeros": lambda: (torch.zeros(1, 1, 4, 8), [2, 2, 1]),
@@ -34,6 +33,20 @@
     "randn": lambda: (torch.randn(5, 16, 50, 32), [4, 2, 0]),
 }
 
+test_data_suite_dilation = [
+    # Each entry: (test_name, test_data, [kernel_size, stride, padding, dilation])
+    # Simple dilation=2 on 8x8 input, kernel=3, stride=1, no padding
+    ("dilation2", torch.rand(1, 1, 8, 8), [3, 1, 0, 2]),
+    # Input is 6x6, kernel=3, stride=1, dilation=2.
+    # Padding=1 expands the effective input to 8x8.
+    ("pad_then_dil2", torch.rand(1, 1, 6, 6), [3, 1, 1, 2]),
+    # Input is 16x16, kernel=2x2, stride=2x2, padding=1, dilation=1 (no dilation);
+    # the padded size (18x18) is still divisible by the stride.
+    ("even_kernel_fast", torch.rand(1, 3, 16, 16), [(2, 2), (2, 2), (1, 1), 1]),
+    # Multi-batch, multi-channel input (N=4, C=3), kernel=3x3,
+    # stride=3x3, no padding, dilation=1.
+    ("mb_ch_dil1", torch.rand(4, 3, 12, 12), [(3, 3), (3, 3), 0, 1]),
+]
 
 aten_op = "torch.ops.aten.max_pool2d.default"
 exir_op = "executorch_exir_dialects_edge__ops_aten_max_pool2d_default"
@@ -47,10 +60,14 @@ def __init__(
         kernel_size: int | Tuple[int, int],
         stride: int | Tuple[int, int],
         padding: int | Tuple[int, int],
+        dilation: int | Tuple[int, int] = 1,
     ):
         super().__init__()
         self.max_pool_2d = torch.nn.MaxPool2d(
-            kernel_size=kernel_size, stride=stride, padding=padding
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
         )
 
     def forward(self, x):
@@ -180,3 +197,41 @@ def test_max_pool2d_u55_BI_failure_set(test_data: Tuple):
     )
     pipeline.pop_stage("check_count.exir")
     pipeline.run()
+
+
+# Wrap each (name, tensor, params) entry in a zero-argument factory, keyed by name
+dilation_test_data = {
+    case_name: (lambda data=tensor, params=pool_args: (data, params))
+    for case_name, tensor, pool_args in test_data_suite_dilation
+}
+
+
+@common.parametrize("test_data", dilation_test_data)
+def test_max_pool2d_tosa_MI_dilation(test_data):
+    """
+    Lower MaxPool2d through the TOSA MI pipeline for every entry of
+    dilation_test_data (dilated cases plus dilation=1 sanity cases).
+    """
+    # Each case supplies (input tensor, MaxPool2d constructor args).
+    input_tensor, pool_args = test_data()
+    module = MaxPool2d(*pool_args)
+    mi_pipeline = TosaPipelineMI[input_t1](
+        module, (input_tensor,), aten_op, exir_op
+    )
+    mi_pipeline.run()
+
+
+@common.parametrize("test_data", dilation_test_data)
+def test_max_pool2d_tosa_BI_dilation(test_data):
+    """
+    Lower MaxPool2d through the TOSA BI pipeline for every entry of
+    dilation_test_data (dilated cases plus dilation=1 sanity cases).
+    """
+    # Each case supplies (input tensor, MaxPool2d constructor args).
+    input_tensor, pool_args = test_data()
+    module = MaxPool2d(*pool_args)
+    pipeline_kwargs = {"symmetric_io_quantization": True}
+    bi_pipeline = TosaPipelineBI[input_t1](
+        module, (input_tensor,), aten_op, exir_op, **pipeline_kwargs
+    )
+    bi_pipeline.run()