
Commit 2072a9d

Xiangxu-0103 authored and ZwwWayne committed

[Refactor] Refactor voxelization for faster speed (#2062)

* refactor voxelization for faster speed
* fix doc typo

1 parent 13ba0dc commit 2072a9d
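
In short, the speed-up comes from `voxelize`: each sample's batch index is now prepended to its voxel coordinates with a single `F.pad` call inside the main per-sample loop, removing the separate second loop over `coors` in both the 'hard' and 'dynamic' branches; `forward` also now returns a plain `dict` (or `List[dict]`) rather than a tuple.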

File tree

2 files changed: +56 -53 lines changed


mmdet3d/models/data_preprocessors/data_preprocessor.py

Lines changed: 50 additions & 47 deletions
@@ -1,7 +1,7 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 import math
 from numbers import Number
-from typing import Dict, List, Optional, Sequence, Tuple, Union
+from typing import Dict, List, Optional, Sequence, Union

 import numpy as np
 import torch
@@ -28,24 +28,25 @@ class Det3DDataPreprocessor(DetDataPreprocessor):
     - 1) For image data:
       - Pad images in inputs to the maximum size of current batch with defined
         ``pad_value``. The padding size can be divisible by a defined
-        ``pad_size_divisor``
+        ``pad_size_divisor``.
     - Stack images in inputs to batch_imgs.
     - Convert images in inputs from bgr to rgb if the shape of input is
-        (3, H, W).
+      (3, H, W).
     - Normalize images in inputs with defined std and mean.
     - Do batch augmentations during training.

     - 2) For point cloud data:
-    - if no voxelization, directly return list of point cloud data.
-    - if voxelization is applied, voxelize point cloud according to
+      - If no voxelization, directly return list of point cloud data.
+      - If voxelization is applied, voxelize point cloud according to
         ``voxel_type`` and obtain ``voxels``.

     Args:
-        voxel (bool): Whether to apply voxelziation to point cloud.
+        voxel (bool): Whether to apply voxelization to point cloud.
+            Defaults to False.
         voxel_type (str): Voxelization type. Two voxelization types are
             provided: 'hard' and 'dynamic', respectively for hard
             voxelization and dynamic voxelization. Defaults to 'hard'.
-        voxel_layer (:obj:`ConfigDict`, optional): Voxelization layer
+        voxel_layer (dict or :obj:`ConfigDict`, optional): Voxelization layer
            config. Defaults to None.
         mean (Sequence[Number], optional): The pixel mean of R, G, B channels.
             Defaults to None.
@@ -54,11 +55,21 @@ class Det3DDataPreprocessor(DetDataPreprocessor):
         pad_size_divisor (int): The size of padded image should be
             divisible by ``pad_size_divisor``. Defaults to 1.
         pad_value (Number): The padded pixel value. Defaults to 0.
-        bgr_to_rgb (bool): whether to convert image from BGR to RGB.
+        pad_mask (bool): Whether to pad instance masks. Defaults to False.
+        mask_pad_value (int): The padded pixel value for instance masks.
+            Defaults to 0.
+        pad_seg (bool): Whether to pad semantic segmentation maps.
+            Defaults to False.
+        seg_pad_value (int): The padded pixel value for semantic
+            segmentation maps. Defaults to 255.
+        bgr_to_rgb (bool): Whether to convert image from BGR to RGB.
             Defaults to False.
-        rgb_to_bgr (bool): whether to convert image from RGB to RGB.
+        rgb_to_bgr (bool): Whether to convert image from RGB to BGR.
             Defaults to False.
-        batch_augments (list[dict], optional): Batch-level augmentations
+        boxtype2tensor (bool): Whether to keep the ``BaseBoxes`` type of
+            bboxes data or not. Defaults to True.
+        batch_augments (List[dict], optional): Batch-level augmentations.
+            Defaults to None.
     """

     def __init__(self,
@@ -76,8 +87,8 @@ def __init__(self,
                  bgr_to_rgb: bool = False,
                  rgb_to_bgr: bool = False,
                  boxtype2tensor: bool = True,
-                 batch_augments: Optional[List[dict]] = None):
-        super().__init__(
+                 batch_augments: Optional[List[dict]] = None) -> None:
+        super(Det3DDataPreprocessor, self).__init__(
             mean=mean,
             std=std,
             pad_size_divisor=pad_size_divisor,
@@ -94,24 +105,21 @@ def __init__(self,
         if voxel:
             self.voxel_layer = Voxelization(**voxel_layer)

-    def forward(
-            self,
-            data: Union[dict, List[dict]],
-            training: bool = False
-    ) -> Tuple[Union[dict, List[dict]], Optional[list]]:
-        """Perform normalization、padding and bgr2rgb conversion based on
+    def forward(self,
+                data: Union[dict, List[dict]],
+                training: bool = False) -> Union[dict, List[dict]]:
+        """Perform normalization, padding and bgr2rgb conversion based on
         ``BaseDataPreprocessor``.

         Args:
-            data (dict | List[dict]): data from dataloader.
+            data (dict or List[dict]): Data from dataloader.
                 The dict contains the whole batch data, when it is
                 a list[dict], the list indicate test time augmentation.
-
             training (bool): Whether to enable training time augmentation.
                 Defaults to False.

         Returns:
-            Dict | List[Dict]: Data in the same format as the model input.
+            dict or List[dict]: Data in the same format as the model input.
         """
         if isinstance(data, list):
             num_augs = len(data)
@@ -126,7 +134,7 @@ def forward(
             return self.simple_process(data, training)

     def simple_process(self, data: dict, training: bool = False) -> dict:
-        """Perform normalization、padding and bgr2rgb conversion for img data
+        """Perform normalization, padding and bgr2rgb conversion for img data
         based on ``BaseDataPreprocessor``, and voxelize point cloud if `voxel`
         is set to be True.

@@ -188,7 +196,7 @@ def simple_process(self, data: dict, training: bool = False) -> dict:

         return {'inputs': batch_inputs, 'data_samples': data_samples}

-    def preprocess_img(self, _batch_img):
+    def preprocess_img(self, _batch_img: torch.Tensor) -> torch.Tensor:
         # channel transform
         if self._channel_conversion:
             _batch_img = _batch_img[[2, 1, 0], ...]
@@ -206,7 +214,7 @@ def preprocess_img(self, _batch_img):
         return _batch_img

     def collate_data(self, data: dict) -> dict:
-        """Copying data to the target device and Performs normalization
+        """Copying data to the target device and Performs normalization,
         padding and bgr2rgb conversion and stack based on
         ``BaseDataPreprocessor``.

@@ -273,7 +281,7 @@ def collate_data(self, data: dict) -> dict:
                 raise TypeError(
                     'Output of `cast_data` should be a list of dict '
                     'or a tuple with inputs and data_samples, but got'
-                    f'{type(data)} {data}')
+                    f'{type(data)}: {data}')

         data['inputs']['imgs'] = batch_imgs

@@ -284,14 +292,14 @@ def collate_data(self, data: dict) -> dict:
     def _get_pad_shape(self, data: dict) -> List[tuple]:
         """Get the pad_shape of each image based on data and
         pad_size_divisor."""
-        # rewrite `_get_pad_shape` for obaining image inputs.
+        # rewrite `_get_pad_shape` for obtaining image inputs.
         _batch_inputs = data['inputs']['img']
         # Process data with `pseudo_collate`.
         if is_list_of(_batch_inputs, torch.Tensor):
             batch_pad_shape = []
             for ori_input in _batch_inputs:
                 if ori_input.dim() == 4:
-                    # mean multiivew input, select ont of the
+                    # mean multiview input, select one of the
                     # image to calculate the pad shape
                     ori_input = ori_input[0]
                 pad_h = int(
@@ -316,24 +324,24 @@ def _get_pad_shape(self, data: dict) -> List[tuple]:
             batch_pad_shape = [(pad_h, pad_w)] * _batch_inputs.shape[0]
         else:
             raise TypeError('Output of `cast_data` should be a list of dict '
-                            'or a tuple with inputs and data_samples, but got'
+                            'or a tuple with inputs and data_samples, but got '
                             f'{type(data)}: {data}')
         return batch_pad_shape

     @torch.no_grad()
-    def voxelize(self, points: List[torch.Tensor]) -> Dict:
+    def voxelize(self, points: List[torch.Tensor]) -> Dict[str, torch.Tensor]:
         """Apply voxelization to point cloud.

         Args:
             points (List[Tensor]): Point cloud in one data batch.

         Returns:
-            dict[str, Tensor]: Voxelization information.
+            Dict[str, Tensor]: Voxelization information.

-            - voxels (Tensor): Features of voxels, shape is MXNxC for hard
-              voxelization, NXC for dynamic voxelization.
-            - coors (Tensor): Coordinates of voxels, shape is Nx(1+NDim),
-              where 1 represents the batch index.
+            - voxels (Tensor): Features of voxels, shape is MxNxC for hard
+              voxelization, NxC for dynamic voxelization.
+            - coors (Tensor): Coordinates of voxels, shape is Nx(1+NDim),
+              where 1 represents the batch index.
             - num_points (Tensor, optional): Number of points in each voxel.
             - voxel_centers (Tensor, optional): Centers of voxels.
         """
@@ -342,43 +350,38 @@ def voxelize(self, points: List[torch.Tensor]) -> Dict:

         if self.voxel_type == 'hard':
             voxels, coors, num_points, voxel_centers = [], [], [], []
-            for res in points:
+            for i, res in enumerate(points):
                 res_voxels, res_coors, res_num_points = self.voxel_layer(res)
                 res_voxel_centers = (
                     res_coors[:, [2, 1, 0]] + 0.5) * res_voxels.new_tensor(
                         self.voxel_layer.voxel_size) + res_voxels.new_tensor(
                             self.voxel_layer.point_cloud_range[0:3])
+                res_coors = F.pad(res_coors, (1, 0), mode='constant', value=i)
                 voxels.append(res_voxels)
                 coors.append(res_coors)
                 num_points.append(res_num_points)
                 voxel_centers.append(res_voxel_centers)

             voxels = torch.cat(voxels, dim=0)
+            coors = torch.cat(coors, dim=0)
             num_points = torch.cat(num_points, dim=0)
             voxel_centers = torch.cat(voxel_centers, dim=0)
-            coors_batch = []
-            for i, coor in enumerate(coors):
-                coor_pad = F.pad(coor, (1, 0), mode='constant', value=i)
-                coors_batch.append(coor_pad)
-            coors_batch = torch.cat(coors_batch, dim=0)
+
             voxel_dict['num_points'] = num_points
             voxel_dict['voxel_centers'] = voxel_centers
         elif self.voxel_type == 'dynamic':
             coors = []
             # dynamic voxelization only provide a coors mapping
-            for res in points:
+            for i, res in enumerate(points):
                 res_coors = self.voxel_layer(res)
+                res_coors = F.pad(res_coors, (1, 0), mode='constant', value=i)
                 coors.append(res_coors)
             voxels = torch.cat(points, dim=0)
-            coors_batch = []
-            for i, coor in enumerate(coors):
-                coor_pad = F.pad(coor, (1, 0), mode='constant', value=i)
-                coors_batch.append(coor_pad)
-            coors_batch = torch.cat(coors_batch, dim=0)
+            coors = torch.cat(coors, dim=0)
         else:
             raise ValueError(f'Invalid voxelization type {self.voxel_type}')

         voxel_dict['voxels'] = voxels
-        voxel_dict['coors'] = coors_batch
+        voxel_dict['coors'] = coors

         return voxel_dict
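
To make the batching change above concrete, here is a minimal standalone sketch (not mmdet3d code; the toy coordinate tensors are made up) of the pattern the refactored `voxelize` adopts: one `F.pad` call inside the main per-sample loop prepends the batch index to each sample's `(N, 3)` voxel coordinates, so the separate second loop over `coors` is no longer needed before `torch.cat`.

import torch
import torch.nn.functional as F

# Toy per-sample voxel coordinates, shape (num_voxels, 3) as (z, y, x).
points_coors = [
    torch.tensor([[0, 1, 2], [3, 4, 5]]),
    torch.tensor([[6, 7, 8]]),
]

coors = []
for i, res_coors in enumerate(points_coors):
    # F.pad with (1, 0) adds one column on the left filled with the batch
    # index i, turning (N, 3) coords into (N, 4) as (batch, z, y, x).
    coors.append(F.pad(res_coors, (1, 0), mode='constant', value=i))
coors = torch.cat(coors, dim=0)
print(coors)
# tensor([[0, 0, 1, 2],
#         [0, 3, 4, 5],
#         [1, 6, 7, 8]])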

mmdet3d/models/data_preprocessors/utils.py

Lines changed: 6 additions & 6 deletions
@@ -12,7 +12,7 @@ def multiview_img_stack_batch(
     """
     Compared to the stack_batch in mmengine.model.utils,
     multiview_img_stack_batch further handle the multiview images.
-    see diff of padded_sizes[:, :-2] = 0 vs padded_sizees[:, 0] = 0 in line 47
+    see diff of padded_sizes[:, :-2] = 0 vs padded_sizes[:, 0] = 0 in line 47
     Stack multiple tensors to form a batch and pad the tensor to the max
     shape use the right bottom padding mode in these images. If
     ``pad_size_divisor > 0``, add padding to ensure the shape of each dim is
@@ -23,20 +23,20 @@ def multiview_img_stack_batch(
         pad_size_divisor (int): If ``pad_size_divisor > 0``, add padding
             to ensure the shape of each dim is divisible by
             ``pad_size_divisor``. This depends on the model, and many
-            models need to be divisible by 32. Defaults to 1
-        pad_value (int, float): The padding value. Defaults to 0.
+            models need to be divisible by 32. Defaults to 1.
+        pad_value (int or float): The padding value. Defaults to 0.

     Returns:
         Tensor: The n dim tensor.
     """
     assert isinstance(
         tensor_list,
-        list), (f'Expected input type to be list, but got {type(tensor_list)}')
+        list), f'Expected input type to be list, but got {type(tensor_list)}'
     assert tensor_list, '`tensor_list` could not be an empty list'
     assert len({
         tensor.ndim
         for tensor in tensor_list
-    }) == 1, (f'Expected the dimensions of all tensors must be the same, '
+    }) == 1, ('Expected the dimensions of all tensors must be the same, '
               f'but got {[tensor.ndim for tensor in tensor_list]}')

     dim = tensor_list[0].dim()
@@ -46,7 +46,7 @@ def multiview_img_stack_batch(
     max_sizes = torch.ceil(
         torch.max(all_sizes, dim=0)[0] / pad_size_divisor) * pad_size_divisor
     padded_sizes = max_sizes - all_sizes
-    # The first dim normally means channel, which should not be padded.
+    # The first dim normally means channel, which should not be padded.
     padded_sizes[:, :-2] = 0
     if padded_sizes.sum() == 0:
         return torch.stack(tensor_list)
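
As a side note on the `padded_sizes[:, :-2] = 0` line referenced above, here is a minimal sketch (toy shapes, not mmdet3d code) of why zeroing every dim except the last two is the right masking for multiview inputs: for `(num_views, C, H, W)` tensors only H and W get padded up to the batch max before stacking, whereas `padded_sizes[:, 0] = 0` would only protect the view dim.

import torch
import torch.nn.functional as F

# Two multiview image tensors: (num_views, C, H, W) with different H/W.
tensor_list = [torch.zeros(6, 3, 30, 40), torch.zeros(6, 3, 32, 36)]
all_sizes = torch.Tensor([tensor.shape for tensor in tensor_list])
max_sizes = torch.max(all_sizes, dim=0)[0]
padded_sizes = max_sizes - all_sizes
padded_sizes[:, :-2] = 0  # pad only the last two dims (H, W)

padded = []
for tensor, pad in zip(tensor_list, padded_sizes):
    # F.pad takes pads as (w_left, w_right, h_top, h_bottom, ...);
    # right-bottom padding means zeros for the left/top entries.
    padded.append(F.pad(tensor, (0, int(pad[-1]), 0, int(pad[-2]))))
print(torch.stack(padded).shape)  # torch.Size([2, 6, 3, 32, 40])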
