@@ -1,3 +1,7 @@
+# Copyright (c) 2025 Amphion.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
 from cv2 import repeat
 import torch
 from einops import rearrange
@@ -6,8 +10,16 @@
 
 class Inference:
     def __init__(
-        self, model, tokenizer_obj, dualcodec_inference_obj, device="cuda", normalize=False,
-        half=False, split_paragraph=True, offset_sizes=[16384, 4096, 4096, 4096], **kwargs
+        self,
+        model,
+        tokenizer_obj,
+        dualcodec_inference_obj,
+        device="cuda",
+        normalize=False,
+        half=False,
+        split_paragraph=True,
+        offset_sizes=[16384, 4096, 4096, 4096],
+        **kwargs,
     ) -> None:
         self.model = model
         import safetensors.torch
@@ -21,9 +33,9 @@ def __init__(
         self.offset_sizes = offset_sizes
 
         self.model = self.model.half()
-
+
         self.split_paragraph = split_paragraph
-
+
     @torch.no_grad()
     def inference(
         self,
@@ -68,8 +80,9 @@ def inference(
         prompt_len_tmp = len(self.tokenizer.encode(prompt_text)) // 2
 
         if self.split_paragraph:
-            if prompt_language == 'zh':
+            if prompt_language == "zh":
                 from dualcodec.utils.frontend_utils import split_paragraph
+
                 texts = split_paragraph(
                     target_text,
                     None,
@@ -79,8 +92,9 @@ def inference(
                     merge_len=20,
                     comma_split=False,
                 )
-            elif prompt_language == 'ja':
+            elif prompt_language == "ja":
                 from dualcodec.utils.frontend_utils import split_paragraph
+
                 texts = split_paragraph(
                     target_text,
                     None,
@@ -90,8 +104,9 @@ def inference(
                     merge_len=20,
                     comma_split=False,
                 )
-            elif prompt_language == 'en':
+            elif prompt_language == "en":
                 from dualcodec.utils.frontend_utils import split_paragraph
+
                 texts = split_paragraph(
                     target_text,
                     self.tokenizer.encode,
@@ -103,8 +118,8 @@ def inference(
                 )
         else:
             texts = [target_text]
-        if prompt_language == 'en':
-            texts = [prompt_text + ' ' + t for t in texts]
+        if prompt_language == "en":
+            texts = [prompt_text + " " + t for t in texts]
         else:
             texts = [prompt_text + t for t in texts]
         print(texts)
@@ -115,12 +130,20 @@ def inference(
 
             if self.normalize:
                 from dualcodec.dataset.processor import normalize
-                text = list(normalize([{
-                    'language': prompt_language,
-                    'text': text,
-                }], en_punct=True, use_kana=False))[0]['text']
-                print(text)
 
+                text = list(
+                    normalize(
+                        [
+                            {
+                                "language": prompt_language,
+                                "text": text,
+                            }
+                        ],
+                        en_punct=True,
+                        use_kana=False,
+                    )
+                )[0]["text"]
+                print(text)
 
             prompt_text_tokens = torch.tensor(
                 [
@@ -143,13 +166,17 @@ def inference(
 
             # prompt semantic codes
             # semantic_code, _ = self._extract_semantic_code(input_features, attention_mask)
-            semantic_codes, acoustic_codes = self.dualcodec_inference_obj.encode(prompt_speech, n_quantizers=4)
-            semantic_codes = rearrange(semantic_codes, 'b t -> b t 1')
+            semantic_codes, acoustic_codes = self.dualcodec_inference_obj.encode(
+                prompt_speech, n_quantizers=4
+            )
+            semantic_codes = rearrange(semantic_codes, "b t -> b t 1")
             num_codec_layers = 4
-            semantic_code = torch.cat([semantic_codes, acoustic_codes], dim=-1)[..., :num_codec_layers]
+            semantic_code = torch.cat([semantic_codes, acoustic_codes], dim=-1)[
+                ..., :num_codec_layers
+            ]
 
             semantic_code = offset_codes(semantic_code, self.offset_sizes)
-            semantic_code = rearrange(semantic_code, ' b t q -> b (t q)')
+            semantic_code = rearrange(semantic_code, " b t q -> b (t q)")
 
             ret_semantic_code = semantic_code.clone().detach()
 
@@ -169,6 +196,6 @@ def inference(
 
             all_codes.append(out)
 
-        all_codes = torch.cat(all_codes, dim=1) # FIXME not tested
+        all_codes = torch.cat(all_codes, dim=1)  # FIXME not tested
         out = self.dualcodec_inference_obj.decode(all_codes)
         return out
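
A note on the codebook offsetting this commit reformats: `offset_codes` is called with `offset_sizes=[16384, 4096, 4096, 4096]` right before the `"b t q -> b (t q)"` flattening, but its body is not part of this diff. What follows is a hedged sketch of the usual scheme, not the repository's actual implementation: shift each quantizer layer's codes by the cumulative sizes of the preceding codebooks, so the four layers occupy disjoint token-ID ranges and the flattened stream stays unambiguous. The name `offset_codes_sketch` is hypothetical.

# Hypothetical sketch; the real offset_codes lives elsewhere in the
# repository and is not shown in this commit.
import torch
from einops import rearrange

def offset_codes_sketch(codes, offset_sizes):
    # codes: (b, t, q) integer indices, one column per quantizer layer.
    # With offset_sizes = [16384, 4096, 4096, 4096], layer 0 keeps
    # [0, 16384); layer 1 starts at 16384; layer 2 at 20480; layer 3
    # at 24576.
    starts = torch.cumsum(torch.tensor([0] + list(offset_sizes[:-1])), dim=0)
    return codes + starts.to(codes.device)  # broadcasts over the q axis

codes = torch.randint(0, 4096, (1, 10, 4))  # (b, t, q) toy input
flat = rearrange(
    offset_codes_sketch(codes, [16384, 4096, 4096, 4096]), "b t q -> b (t q)"
)  # (b, t * q): four interleaved tokens per codec frame

Under this reading, the model consumes a single interleaved stream over a vocabulary of sum(offset_sizes) = 28672 codec tokens, and decoding would subtract the same per-layer starts before handing the codes to `self.dualcodec_inference_obj.decode`.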