Commit 0fea20c

sarahtranfb authored and facebook-github-bot committed
Update occlusion to new method of constructing ablated batches (#1616)
Summary: `FeaturePermutation` and `Occlusion` are subclasses of `FeatureAblation`, which contains the bulk of the logic for iterating through and processing perturbed inputs. Previously, `FeatureAblation` constructed each perturbed input by looking at each input tensor individually, since there was no explicit use case for cross-tensor feature grouping. That behavior has since been extended to support cross-tensor masking: different sparse features at Meta are represented by different tensors by the time they reach Captum, and several Ads workstreams need grouping across features and feature types. The new behavior is mostly rolled out internally and is controlled by the `enable_cross_tensor_attribution` flag; we do not want to support both behaviors indefinitely. `Occlusion` does not use the custom feature-mask parameter exposed via `FeatureAblation`; it constructs its masks internally. No use case requires cross-tensor masking for occlusion, so it is not a requirement that `Occlusion` follow the `FeatureAblation` logic. This diff adapts it anyway, purely to reuse the `FeatureAblation` code path, since the logic specific to `enable_cross_tensor_attribution=False` is going away.

Reviewed By: cyrjano

Differential Revision: D76483214
1 parent 4be90f2 commit 0fea20c
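
To make the flag concrete: below is a minimal sketch of a cross-tensor feature mask with `FeatureAblation`. The model, inputs, and mask values are hypothetical; the flag is the `enable_cross_tensor_attribution` kwarg named in the summary and passed through in this diff.

import torch
from captum.attr import FeatureAblation

def forward(x1: torch.Tensor, x2: torch.Tensor) -> torch.Tensor:
    # Toy model: one scalar output per example.
    return x1.sum(dim=1) + x2.sum(dim=1)

fa = FeatureAblation(forward)
x1 = torch.ones(4, 3)
x2 = torch.ones(4, 2)
# Feature id 0 appears in both masks, so with cross-tensor attribution the
# corresponding entries of x1 and x2 are ablated together in one perturbation.
mask1 = torch.tensor([[0, 0, 1]])
mask2 = torch.tensor([[0, 2]])
attr1, attr2 = fa.attribute(
    (x1, x2),
    feature_mask=(mask1, mask2),
    enable_cross_tensor_attribution=True,
)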

File tree

5 files changed: +152, −24 lines changed


captum/_utils/common.py

Lines changed: 0 additions & 16 deletions
@@ -908,22 +908,6 @@ def _get_max_feature_index(feature_mask: Tuple[Tensor, ...]) -> int:
     return int(max(torch.max(mask).item() for mask in feature_mask if mask.numel()))
 
 
-def _get_feature_idx_to_tensor_idx(
-    formatted_feature_mask: Tuple[Tensor, ...],
-) -> Dict[int, List[int]]:
-    """
-    For a given tuple of tensors, return dict of tensor values to list of tensor indices
-    they appear in.
-    """
-    feature_idx_to_tensor_idx: Dict[int, List[int]] = {}
-    for i, mask in enumerate(formatted_feature_mask):
-        for feature_idx in torch.unique(mask):
-            if feature_idx.item() not in feature_idx_to_tensor_idx:
-                feature_idx_to_tensor_idx[feature_idx.item()] = []
-            feature_idx_to_tensor_idx[feature_idx.item()].append(i)
-    return feature_idx_to_tensor_idx
-
-
 def _maybe_expand_parameters(
     perturbations_per_eval: int,
     formatted_inputs: Tuple[Tensor, ...],
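
This helper is not gone for good: it reappears as a `FeatureAblation` method in the next file so that subclasses such as `Occlusion` can override it. For reference, here is a standalone sketch of the mapping it computes, using hypothetical masks:

import torch

# Feature id 0 appears in both masks (a cross-tensor feature group);
# ids 1 and 2 each live in a single tensor.
masks = (torch.tensor([[0, 1]]), torch.tensor([[0, 2]]))

feature_idx_to_tensor_idx = {}
for i, mask in enumerate(masks):
    for feature_idx in torch.unique(mask):
        feature_idx_to_tensor_idx.setdefault(int(feature_idx.item()), []).append(i)

print(feature_idx_to_tensor_idx)  # {0: [0, 1], 1: [0], 2: [1]}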

captum/attr/_core/feature_ablation.py

Lines changed: 22 additions & 5 deletions
@@ -24,7 +24,6 @@
     _format_additional_forward_args,
     _format_feature_mask,
     _format_output,
-    _get_feature_idx_to_tensor_idx,
     _is_tuple,
     _maybe_expand_parameters,
     _run_forward,
@@ -507,8 +506,8 @@ def _attribute_with_cross_tensor_feature_masks(
         perturbations_per_eval: int,
         **kwargs: Any,
     ) -> Tuple[List[Tensor], List[Tensor]]:
-        feature_idx_to_tensor_idx = _get_feature_idx_to_tensor_idx(
-            formatted_feature_mask
+        feature_idx_to_tensor_idx = self._get_feature_idx_to_tensor_idx(
+            formatted_feature_mask, **kwargs
         )
         all_feature_idxs = list(feature_idx_to_tensor_idx.keys())
 
@@ -575,6 +574,7 @@ def _attribute_with_cross_tensor_feature_masks(
                     current_feature_idxs,
                     feature_idx_to_tensor_idx,
                     current_num_ablated_features,
+                    **kwargs,
                 )
             )
 
@@ -613,6 +613,21 @@ def _attribute_with_cross_tensor_feature_masks(
             )
         return total_attrib, weights
 
+    def _get_feature_idx_to_tensor_idx(
+        self, formatted_feature_mask: Tuple[Tensor, ...], **kwargs: Any
+    ) -> Dict[int, List[int]]:
+        """
+        For a given tuple of tensors, return dict of tensor values to list of tensor
+        indices they appear in.
+        """
+        feature_idx_to_tensor_idx: Dict[int, List[int]] = {}
+        for i, mask in enumerate(formatted_feature_mask):
+            for feature_idx in torch.unique(mask):
+                if feature_idx.item() not in feature_idx_to_tensor_idx:
+                    feature_idx_to_tensor_idx[feature_idx.item()] = []
+                feature_idx_to_tensor_idx[feature_idx.item()].append(i)
+        return feature_idx_to_tensor_idx
+
     def _should_skip_inputs_and_warn(
         self,
         current_feature_idxs: List[int],
@@ -656,6 +671,7 @@ def _construct_ablated_input_across_tensors(
         feature_idxs: List[int],
         feature_idx_to_tensor_idx: Dict[int, List[int]],
         current_num_ablated_features: int,
+        **kwargs: Any,
     ) -> Tuple[Tuple[Tensor, ...], Tuple[Optional[Tensor], ...]]:
         ablated_inputs = []
         current_masks: List[Optional[Tensor]] = []
@@ -946,8 +962,8 @@ def _attribute_with_cross_tensor_feature_masks_future(
         perturbations_per_eval: int,
         **kwargs: Any,
     ) -> Future[Union[Tensor, Tuple[Tensor, ...]]]:
-        feature_idx_to_tensor_idx = _get_feature_idx_to_tensor_idx(
-            formatted_feature_mask
+        feature_idx_to_tensor_idx = self._get_feature_idx_to_tensor_idx(
+            formatted_feature_mask, **kwargs
         )
         all_feature_idxs = list(feature_idx_to_tensor_idx.keys())
 
@@ -1016,6 +1032,7 @@ def _attribute_with_cross_tensor_feature_masks_future(
                     current_feature_idxs,
                     feature_idx_to_tensor_idx,
                     current_num_ablated_features,
+                    **kwargs,
                 )
            )
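
The `**kwargs` threading above lets subclass overrides of `_get_feature_idx_to_tensor_idx` and `_construct_ablated_input_across_tensors` consume perturbation-specific parameters such as Occlusion's `shift_counts`. Around these calls, `FeatureAblation` processes feature ids in groups of at most `perturbations_per_eval`; a minimal sketch of that chunking (the function name is hypothetical):

from typing import Iterator, List

def chunk_feature_idxs(
    all_feature_idxs: List[int], perturbations_per_eval: int
) -> Iterator[List[int]]:
    # Each chunk is ablated together in a single expanded forward pass.
    for start in range(0, len(all_feature_idxs), perturbations_per_eval):
        yield all_feature_idxs[start : start + perturbations_per_eval]

print(list(chunk_feature_idxs([0, 1, 2, 3, 4], 2)))  # [[0, 1], [2, 3], [4]]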

captum/attr/_core/feature_permutation.py

Lines changed: 1 addition & 0 deletions
@@ -377,6 +377,7 @@ def _construct_ablated_input_across_tensors(
         feature_idxs: List[int],
         feature_idx_to_tensor_idx: Dict[int, List[int]],
         current_num_ablated_features: int,
+        **kwargs: Any,
     ) -> Tuple[Tuple[Tensor, ...], Tuple[Optional[Tensor], ...]]:
         current_masks: List[Optional[Tensor]] = []
         tensor_idxs = {
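
This one-line change keeps `FeaturePermutation`'s override call-compatible: the base class now forwards `**kwargs` into `_construct_ablated_input_across_tensors`, so an override without `**kwargs` would raise a `TypeError` at call time. A minimal sketch of the pattern (class and method names hypothetical):

class Base:
    def hook(self, x: int, **kwargs: object) -> int:
        return x

class Sub(Base):
    # Must keep **kwargs even if unused, since Base's caller passes extras.
    def hook(self, x: int, **kwargs: object) -> int:
        return x + 1

print(Sub().hook(1, shift_counts=((2,),)))  # 2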

captum/attr/_core/occlusion.py

Lines changed: 92 additions & 3 deletions
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 
 # pyre-strict
-from typing import Any, Callable, Optional, Tuple, Union
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 
 import numpy as np
 import torch
@@ -267,6 +267,7 @@ def attribute( # type: ignore
             shift_counts=tuple(shift_counts),
             strides=strides,
             show_progress=show_progress,
+            enable_cross_tensor_attribution=True,
         )
 
     def attribute_future(self) -> None:
@@ -310,6 +311,7 @@ def _construct_ablated_input(
                     kwargs["sliding_window_tensors"],
                     kwargs["strides"],
                     kwargs["shift_counts"],
+                    is_expanded_input=True,
                 )
                 for j in range(start_feature, end_feature)
             ],
@@ -327,11 +329,12 @@
 
     def _occlusion_mask(
         self,
-        expanded_input: Tensor,
+        input: Tensor,
         ablated_feature_num: int,
         sliding_window_tsr: Tensor,
         strides: Union[int, Tuple[int, ...]],
         shift_counts: Tuple[int, ...],
+        is_expanded_input: bool,
     ) -> Tensor:
         """
         This constructs the current occlusion mask, which is the appropriate
@@ -365,8 +368,9 @@
             current_index.append((remaining_total % shift_count) * stride)
             remaining_total = remaining_total // shift_count
 
+        dim = 2 if is_expanded_input else 1
         remaining_padding = np.subtract(
-            expanded_input.shape[2:], np.add(current_index, sliding_window_tsr.shape)
+            input.shape[dim:], np.add(current_index, sliding_window_tsr.shape)
         )
         pad_values = [
             val for pair in zip(remaining_padding, current_index) for val in pair
@@ -391,3 +395,88 @@ def _get_feature_counts(
     ) -> Tuple[int, ...]:
         """return the numbers of possible input features"""
         return tuple(np.prod(counts).astype(int) for counts in kwargs["shift_counts"])
+
+    def _get_feature_idx_to_tensor_idx(
+        self, formatted_feature_mask: Tuple[Tensor, ...], **kwargs: Any
+    ) -> Dict[int, List[int]]:
+        feature_idx_to_tensor_idx = {}
+        curr_feature_idx = 0
+        for i, shift_count in enumerate(kwargs["shift_counts"]):
+            num_features = int(np.prod(shift_count))
+            for _ in range(num_features):
+                feature_idx_to_tensor_idx[curr_feature_idx] = [i]
+                curr_feature_idx += 1
+        return feature_idx_to_tensor_idx
+
+    def _get_accumulated_shift_count_products(
+        self,
+        shift_counts: Tuple[int, ...],
+    ) -> List[int]:
+        shift_count_prod = [np.prod(counts).astype(int) for counts in shift_counts]
+        curr_prod = 1
+        acc_prod = [0]
+        for i in range(1, len(shift_count_prod)):
+            curr_prod *= shift_count_prod[i - 1]
+            acc_prod.append(curr_prod)
+        return acc_prod
+
+    def _construct_ablated_input_across_tensors(
+        self,
+        inputs: Tuple[Tensor, ...],
+        input_mask: Tuple[Tensor, ...],
+        baselines: BaselineType,
+        feature_idxs: List[int],
+        feature_idx_to_tensor_idx: Dict[int, List[int]],
+        current_num_ablated_features: int,
+        **kwargs: Any,
+    ) -> Tuple[Tuple[Tensor, ...], Tuple[Optional[Tensor], ...]]:
+        ablated_inputs = []
+        current_masks: List[Optional[Tensor]] = []
+        tensor_idxs = {
+            tensor_idx
+            for sublist in (
+                feature_idx_to_tensor_idx[feature_idx] for feature_idx in feature_idxs
+            )
+            for tensor_idx in sublist
+        }
+
+        accumulated_shift_count_prods = self._get_accumulated_shift_count_products(
+            kwargs["shift_counts"]
+        )
+        for i, input_tensor in enumerate(inputs):
+            if i not in tensor_idxs:
+                ablated_inputs.append(input_tensor)
+                current_masks.append(None)
+                continue
+            tensor_mask: List[Tensor] = []
+            baseline = baselines[i] if isinstance(baselines, tuple) else baselines
+            for feature_idx in feature_idxs:
+
+                if feature_idx_to_tensor_idx[feature_idx][0] != i:
+                    tensor_mask.append(torch.zeros((1,) + input_tensor.shape[1:]))
+                    continue
+                ablated_feature_num = feature_idx - accumulated_shift_count_prods[i]
+                mask = self._occlusion_mask(
+                    input_tensor,
+                    ablated_feature_num,
+                    kwargs["sliding_window_tensors"][i],
+                    kwargs["strides"][i],
+                    kwargs["shift_counts"][i],
+                    is_expanded_input=False,
+                )
+                tensor_mask.append(mask)
+            assert baseline is not None, "baseline must be provided"
+            current_mask = torch.stack(tensor_mask, dim=0)
+            current_masks.append(current_mask)
+            ablated_input = input_tensor.clone().reshape(
+                (current_num_ablated_features, -1) + tuple(input_tensor.shape[1:])
+            )
+            ablated_input = (
+                ablated_input
+                * (
+                    torch.ones(1, dtype=torch.long, device=input_tensor.device)
+                    - current_mask
+                ).to(input_tensor.dtype)
+            ) + (baseline * current_mask.to(input_tensor.dtype))
+            ablated_inputs.append(ablated_input.reshape(input_tensor.shape))
+        return tuple(ablated_inputs), tuple(current_masks)
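
A worked illustration of the two new helpers above, using hypothetical shift counts: occlusion assigns globally unique feature ids across input tensors, and the accumulated products convert a global id back into a per-tensor window index.

import numpy as np

# Tensor 0 has 2 * 2 = 4 sliding-window positions, tensor 1 has 3.
shift_counts = ((2, 2), (3,))

# Mirrors Occlusion._get_feature_idx_to_tensor_idx: each global feature id
# belongs to exactly one tensor.
mapping, curr = {}, 0
for i, counts in enumerate(shift_counts):
    for _ in range(int(np.prod(counts))):
        mapping[curr] = [i]
        curr += 1
print(mapping)  # {0: [0], 1: [0], 2: [0], 3: [0], 4: [1], 5: [1], 6: [1]}

# Mirrors _get_accumulated_shift_count_products: offsets [0, 4], so global
# id 5 becomes per-tensor window index 5 - 4 = 1 on tensor 1.
offsets, prod = [0], 1
for counts in shift_counts[:-1]:
    prod *= int(np.prod(counts))
    offsets.append(prod)
print(offsets)  # [0, 4]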

tests/attr/test_occlusion.py

Lines changed: 37 additions & 0 deletions
@@ -144,6 +144,18 @@ def test_simple_multi_input(self) -> None:
             sliding_window_shapes=((1,), (1,)),
         )
 
+    def test_simple_multi_input_diff_shapes(self) -> None:
+        net = BasicModel3()
+        inp1 = torch.tensor([[-10.0, 3.0], [8.0, 4.0]])
+        inp2 = torch.tensor([[-5.0], [1.0]])
+        self._occlusion_test_assert(
+            net,
+            (inp1, inp2),
+            (torch.tensor([[0.0, 0.0], [6.0, 6.0]]), torch.tensor([[0.0], [-1.0]])),
+            sliding_window_shapes=((2,), (1,)),
+            perturbations_per_eval=(2,),
+        )
+
     def test_simple_multi_input_0d(self) -> None:
         net = BasicModel3()
         inp1 = torch.tensor([-10.0, 3.0])
@@ -282,6 +294,31 @@ def test_simple_multi_input_conv(self) -> None:
             strides=((1, 2, 1), (1, 1, 2)),
         )
 
+    def test_simple_multi_input_conv_diff_shapes(self) -> None:
+        net = BasicModel_ConvNet_One_Conv()
+        inp = torch.arange(16, dtype=torch.float).view(1, 1, 4, 4)
+        inp2 = torch.ones((1, 1, 4, 1))
+        self._occlusion_test_assert(
+            net,
+            (inp, inp2),
+            (
+                [
+                    [
+                        [
+                            [17.0, 17.0, 17.0, 10.0],
+                            [17.0, 17.0, 17.0, 10.0],
+                            [64.0, 64.0, 64.0, 13.0],
+                            [64.0, 64.0, 64.0, 13.0],
+                        ]
+                    ]
+                ],
+                [[[[13.0], [13.0], [13.0], [0.0]]]],
+            ),
+            perturbations_per_eval=(1, 2, 4, 8, 12, 16),
+            sliding_window_shapes=((1, 2, 3), (1, 3, 1)),
+            strides=((1, 2, 3), (1, 3, 1)),
+        )
+
     def test_futures_not_implemented(self) -> None:
         net = BasicModel_ConvNet_One_Conv()
         occ = Occlusion(net)
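
For readers outside the test suite, here is a self-contained sketch of what the first new test exercises: occlusion over two inputs with different shapes and different sliding windows. The toy `forward` stands in for `BasicModel3`, so the attribution values differ from the test's expectations.

import torch
from captum.attr import Occlusion

def forward(x1: torch.Tensor, x2: torch.Tensor) -> torch.Tensor:
    # Toy model: one scalar output per example.
    return (x1.sum(dim=1, keepdim=True) + x2).sum(dim=1)

occ = Occlusion(forward)
attr1, attr2 = occ.attribute(
    (torch.tensor([[-10.0, 3.0], [8.0, 4.0]]), torch.tensor([[-5.0], [1.0]])),
    sliding_window_shapes=((2,), (1,)),  # one window spans both columns of x1
    perturbations_per_eval=2,
)
print(attr1.shape, attr2.shape)  # torch.Size([2, 2]) torch.Size([2, 1])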
