Skip to content

Commit 2eaa87d

Browse files
committed
add amphion license
1 parent 11a177b commit 2eaa87d

File tree

10 files changed

+264
-118
lines changed

10 files changed

+264
-118
lines changed

models/codec/dualcodec/LICENSE

Lines changed: 21 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,21 @@
1+
MIT License
2+
3+
Copyright (c) 2025 Jiaqi Li
4+
5+
Permission is hereby granted, free of charge, to any person obtaining a copy
6+
of this software and associated documentation files (the "Software"), to deal
7+
in the Software without restriction, including without limitation the rights
8+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9+
copies of the Software, and to permit persons to whom the Software is
10+
furnished to do so, subject to the following conditions:
11+
12+
The above copyright notice and this permission notice shall be included in all
13+
copies or substantial portions of the Software.
14+
15+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21+
SOFTWARE.

models/codec/dualcodec/dualcodec/infer/flattened_ar/flatten_patterns.py

Lines changed: 23 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,13 @@
1+
# Copyright (c) 2025 Amphion.
2+
#
3+
# This source code is licensed under the MIT license found in the
4+
# LICENSE file in the root directory of this source tree.
15
import torch
26
from einops import rearrange
37

48
import numpy as np
59

10+
611
def offset_codes(semantic_code, offset_sizes):
712
"""
813
Applies layer-specific offsets to each codec layer.
@@ -15,7 +20,9 @@ def offset_codes(semantic_code, offset_sizes):
1520
torch.Tensor: Offset-applied tensor of shape (batch_size, T, num_codec_layers).
1621
"""
1722
# Calculate cumulative offsets for each layer
18-
cumulative_offsets = np.cumsum([0] + offset_sizes[:-1]) # Start with 0 for the first layer
23+
cumulative_offsets = np.cumsum(
24+
[0] + offset_sizes[:-1]
25+
) # Start with 0 for the first layer
1926
# Apply offsets layer by layer
2027
offsetted_code = []
2128
for i, offset in enumerate(cumulative_offsets):
@@ -24,10 +31,13 @@ def offset_codes(semantic_code, offset_sizes):
2431
offsetted_code.append(current_layer_code)
2532

2633
# Stack all layers along the codec layer dimension
27-
offsetted_code = torch.stack(offsetted_code, dim=-1) # Shape: (batch_size, T, num_codec_layers)
34+
offsetted_code = torch.stack(
35+
offsetted_code, dim=-1
36+
) # Shape: (batch_size, T, num_codec_layers)
2837

2938
return offsetted_code
3039

40+
3141
def deoffset_codes(flattened_codes, offset_sizes):
3242
"""
3343
De-offsets a flattened tensor by subtracting the codebook size offsets for each codec layer.
@@ -40,7 +50,9 @@ def deoffset_codes(flattened_codes, offset_sizes):
4050
torch.Tensor: The de-offset tensor of shape (batch_size, T, num_codec_layers).
4151
"""
4252
# Calculate cumulative offsets for each layer
43-
cumulative_offsets = np.cumsum([0] + offset_sizes[:-1]) # Start with 0 for the first layer
53+
cumulative_offsets = np.cumsum(
54+
[0] + offset_sizes[:-1]
55+
) # Start with 0 for the first layer
4456

4557
# Determine dimensions for reshaping
4658
batch_size, flattened_dim = flattened_codes.shape
@@ -53,11 +65,15 @@ def deoffset_codes(flattened_codes, offset_sizes):
5365
# De-offset each layer by subtracting the respective cumulative offset
5466
deoffsetted_code = []
5567
for i, offset in enumerate(cumulative_offsets):
56-
current_layer_code = reshaped_codes[..., i].clone() # Clone to avoid in-place operation
68+
current_layer_code = reshaped_codes[
69+
..., i
70+
].clone() # Clone to avoid in-place operation
5771
current_layer_code = current_layer_code - offset # Remove the cumulative offset
5872
deoffsetted_code.append(current_layer_code)
59-
73+
6074
# Stack all layers along the codec layer dimension
61-
deoffsetted_code = torch.stack(deoffsetted_code, dim=-1) # Shape: (batch_size, T, num_codec_layers)
75+
deoffsetted_code = torch.stack(
76+
deoffsetted_code, dim=-1
77+
) # Shape: (batch_size, T, num_codec_layers)
6278

63-
return deoffsetted_code
79+
return deoffsetted_code

models/codec/dualcodec/dualcodec/infer/flattened_ar/inference_flattened.py

Lines changed: 46 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,7 @@
1+
# Copyright (c) 2025 Amphion.
2+
#
3+
# This source code is licensed under the MIT license found in the
4+
# LICENSE file in the root directory of this source tree.
15
from cv2 import repeat
26
import torch
37
from einops import rearrange
@@ -6,8 +10,16 @@
610

711
class Inference:
812
def __init__(
9-
self, model, tokenizer_obj, dualcodec_inference_obj, device="cuda", normalize=False,
10-
half=False, split_paragraph=True, offset_sizes=[16384, 4096, 4096, 4096], **kwargs
13+
self,
14+
model,
15+
tokenizer_obj,
16+
dualcodec_inference_obj,
17+
device="cuda",
18+
normalize=False,
19+
half=False,
20+
split_paragraph=True,
21+
offset_sizes=[16384, 4096, 4096, 4096],
22+
**kwargs,
1123
) -> None:
1224
self.model = model
1325
import safetensors.torch
@@ -21,9 +33,9 @@ def __init__(
2133
self.offset_sizes = offset_sizes
2234

2335
self.model = self.model.half()
24-
36+
2537
self.split_paragraph = split_paragraph
26-
38+
2739
@torch.no_grad()
2840
def inference(
2941
self,
@@ -68,8 +80,9 @@ def inference(
6880
prompt_len_tmp = len(self.tokenizer.encode(prompt_text)) // 2
6981

7082
if self.split_paragraph:
71-
if prompt_language == 'zh':
83+
if prompt_language == "zh":
7284
from dualcodec.utils.frontend_utils import split_paragraph
85+
7386
texts = split_paragraph(
7487
target_text,
7588
None,
@@ -79,8 +92,9 @@ def inference(
7992
merge_len=20,
8093
comma_split=False,
8194
)
82-
elif prompt_language == 'ja':
95+
elif prompt_language == "ja":
8396
from dualcodec.utils.frontend_utils import split_paragraph
97+
8498
texts = split_paragraph(
8599
target_text,
86100
None,
@@ -90,8 +104,9 @@ def inference(
90104
merge_len=20,
91105
comma_split=False,
92106
)
93-
elif prompt_language == 'en':
107+
elif prompt_language == "en":
94108
from dualcodec.utils.frontend_utils import split_paragraph
109+
95110
texts = split_paragraph(
96111
target_text,
97112
self.tokenizer.encode,
@@ -103,8 +118,8 @@ def inference(
103118
)
104119
else:
105120
texts = [target_text]
106-
if prompt_language == 'en':
107-
texts = [prompt_text + ' ' + t for t in texts]
121+
if prompt_language == "en":
122+
texts = [prompt_text + " " + t for t in texts]
108123
else:
109124
texts = [prompt_text + t for t in texts]
110125
print(texts)
@@ -115,12 +130,20 @@ def inference(
115130

116131
if self.normalize:
117132
from dualcodec.dataset.processor import normalize
118-
text = list(normalize([{
119-
'language': prompt_language,
120-
'text': text,
121-
}], en_punct=True, use_kana=False))[0]['text']
122-
print(text)
123133

134+
text = list(
135+
normalize(
136+
[
137+
{
138+
"language": prompt_language,
139+
"text": text,
140+
}
141+
],
142+
en_punct=True,
143+
use_kana=False,
144+
)
145+
)[0]["text"]
146+
print(text)
124147

125148
prompt_text_tokens = torch.tensor(
126149
[
@@ -143,13 +166,17 @@ def inference(
143166

144167
# prompt semantic codes
145168
# semantic_code, _ = self._extract_semantic_code(input_features, attention_mask)
146-
semantic_codes, acoustic_codes = self.dualcodec_inference_obj.encode(prompt_speech, n_quantizers=4)
147-
semantic_codes = rearrange(semantic_codes, 'b t -> b t 1')
169+
semantic_codes, acoustic_codes = self.dualcodec_inference_obj.encode(
170+
prompt_speech, n_quantizers=4
171+
)
172+
semantic_codes = rearrange(semantic_codes, "b t -> b t 1")
148173
num_codec_layers = 4
149-
semantic_code = torch.cat([semantic_codes, acoustic_codes], dim=-1)[..., :num_codec_layers]
174+
semantic_code = torch.cat([semantic_codes, acoustic_codes], dim=-1)[
175+
..., :num_codec_layers
176+
]
150177

151178
semantic_code = offset_codes(semantic_code, self.offset_sizes)
152-
semantic_code = rearrange(semantic_code, 'b t q -> b (t q)')
179+
semantic_code = rearrange(semantic_code, "b t q -> b (t q)")
153180

154181
ret_semantic_code = semantic_code.clone().detach()
155182

@@ -169,6 +196,6 @@ def inference(
169196

170197
all_codes.append(out)
171198

172-
all_codes = torch.cat(all_codes, dim=1) # FIXME not tested
199+
all_codes = torch.cat(all_codes, dim=1) # FIXME not tested
173200
out = self.dualcodec_inference_obj.decode(all_codes)
174201
return out

0 commit comments

Comments
 (0)