Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion .github/workflows/export-kitten.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ jobs:
fail-fast: false
matrix:
os: [ubuntu-latest]
version: ["nano_v0_1", "nano_v0_2"]
version: ["nano_v0_1", "nano_v0_2", "mini_v0_1"]
python-version: ["3.10"]

steps:
Expand Down Expand Up @@ -59,6 +59,8 @@ jobs:
d=kitten-nano-en-v0_1-fp16
elif [[ $version == "nano_v0_2" ]]; then
d=kitten-nano-en-v0_2-fp16
elif [[ $version == "mini_v0_1" ]]; then
d=kitten-mini-en-v0_1-fp16
else
echo "version $version"
exit 1
Expand Down Expand Up @@ -111,6 +113,7 @@ jobs:
dirs=(
kitten-nano-en-v0_1-fp16
kitten-nano-en-v0_2-fp16
kitten-mini-en-v0_1-fp16
)

export GIT_LFS_SKIP_SMUDGE=1
Expand Down
21 changes: 17 additions & 4 deletions .github/workflows/generate-tts-samples.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ jobs:
pip install "numpy<=1.26.4" sherpa-onnx soundfile

- name: kitten
if: false
if: true
shell: bash
env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
Expand All @@ -46,8 +46,9 @@ jobs:
export GIT_LFS_SKIP_SMUDGE=1
export GIT_CLONE_PROTECTION_ACTIVE=false
git clone https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/sherpa-onnx-tts-samples hf
mkdir -p ./hf/kitten/v0.1/mp3
mkdir -p ./hf/kitten/v0.2/mp3
mkdir -p ./hf/kitten/v0.1-nano/mp3
mkdir -p ./hf/kitten/v0.2-nano/mp3
mkdir -p ./hf/kitten/v0.1-mini/mp3

for v in 1 2; do
pushd nano_v0_$v
Expand All @@ -61,6 +62,18 @@ jobs:
popd
done

for v in 1; do
pushd mini_v0_$v
curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kitten-mini-en-v0_$v-fp16.tar.bz2
tar xf kitten-mini-en-v0_$v-fp16.tar.bz2
rm kitten-mini-en-v0_$v-fp16.tar.bz2

ln -s ../hf .
python3 ./generate_samples.py
rm -rf kitten-mini-en-v0_$v-fp16
popd
done

pushd hf
git pull
git add .
Expand All @@ -70,7 +83,7 @@ jobs:
rm -rf hf

- name: matcha en (ljspeech)
if: true
if: false
shell: bash
env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
Expand Down
5 changes: 5 additions & 0 deletions scripts/apk/generate-tts-apk-script.py
Original file line number Diff line number Diff line change
Expand Up @@ -524,6 +524,11 @@ def get_kitten_models() -> List[TtsModel]:
model_name="model.fp16.onnx",
lang="en",
),
TtsModel(
model_dir="kitten-mini-en-v0_1-fp16",
model_name="model.fp16.onnx",
lang="en",
),
]
for m in english_models:
m.data_dir = f"{m.model_dir}/espeak-ng-data"
Expand Down
76 changes: 76 additions & 0 deletions scripts/kitten-tts/mini_v0_1/add_meta_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
#!/usr/bin/env python3
# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)


import argparse

import numpy as np
import onnx

from generate_voices_bin import speaker2id


def get_args():
    """Parse the command line of this script.

    Returns:
      The parsed namespace. Its only attribute is ``model``, the required
      path given via ``--model``; the same file is used as input and output.
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        "--model",
        required=True,
        type=str,
        help="input and output onnx model",
    )

    parsed = arg_parser.parse_args()
    return parsed


def main():
    """Replace the metadata of the kitten-tts mini v0.1 ONNX model in place.

    The model given by ``--model`` is loaded, all of its existing
    ``metadata_props`` entries are removed, a fresh set of key/value pairs is
    attached, and the model is saved back to the same path.

    ``./voices.npz`` must exist in the current directory: the shape of the
    first array stored in it is recorded as ``style_dim``.
    """
    args = get_args()
    print(args.model)

    model = onnx.load(args.model)

    # Only the shape of one stored array is needed. The context manager
    # closes the archive as soon as the shape has been read.
    with np.load("./voices.npz") as style:
        style_shape = style[style.files[0]].shape

    # Both maps are serialized as comma-separated "a->b" pairs.
    speaker2id_str = ",".join(f"{s}->{i}" for s, i in speaker2id.items())
    id2speaker_str = ",".join(f"{i}->{s}" for s, i in speaker2id.items())

    meta_data = {
        "model_type": "kitten-tts",
        "language": "English",
        "has_espeak": 1,
        "sample_rate": 24000,
        "version": 1,
        "voice": "en-us",
        "style_dim": ",".join(map(str, style_shape)),
        "n_speakers": len(speaker2id),
        "speaker2id": speaker2id_str,
        "id2speaker": id2speaker_str,
        "speaker_names": ",".join(map(str, speaker2id.keys())),
        # This script converts the mini v0.1 model (see run.sh in this
        # directory), so the provenance fields must name that model rather
        # than nano-0.2.
        "model_url": "https://huggingface.co/KittenML/kitten-tts-mini-0.1",
        "see_also": "https://github.com/KittenML/KittenTTS",
        "maintainer": "k2-fsa",
        "comment": "This is kitten-tts-mini-0.1 and supports only English",
    }

    print(model.metadata_props)

    # Drop whatever metadata the upstream model shipped with.
    while model.metadata_props:
        model.metadata_props.pop()

    for key, value in meta_data.items():
        meta = model.metadata_props.add()
        meta.key = key
        # ONNX metadata values are strings, so ints are stringified here.
        meta.value = str(value)
    print("--------------------")

    print(model.metadata_props)

    onnx.save(model, args.model)

    print(f"Please see {args.model}")


if __name__ == "__main__":
main()
30 changes: 30 additions & 0 deletions scripts/kitten-tts/mini_v0_1/convert_opset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#!/usr/bin/env python3
# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)

"""
Change the model so that it can be run in onnxruntime 1.17.1
"""

import onnx


def main():
    """Rewrite the opset versions of the mini v0.1 model.

    Reads ``kitten_tts_mini_v0_1.onnx`` from the current directory, prints
    every imported opset, sets the default domain to version 19 and
    ``ai.onnx.ml`` to version 4, and writes the result to ``model.fp16.onnx``.
    Only the declared versions are edited; the graph itself is not converted.
    """
    onnx_model = onnx.load("kitten_tts_mini_v0_1.onnx")

    # Report what the model declares before anything is changed.
    for entry in onnx_model.opset_import:
        print(f"Domain: '{entry.domain}', Version: {entry.version}")

    # Target version per domain (be careful!). "" is the ai.onnx domain,
    # lowered from 20 to 19; ai.onnx.ml is lowered from 5 to 4.
    target_versions = {
        "": 19,
        "ai.onnx.ml": 4,
    }
    for entry in onnx_model.opset_import:
        if entry.domain in target_versions:
            entry.version = target_versions[entry.domain]

    onnx.save(onnx_model, "model.fp16.onnx")


if __name__ == "__main__":
main()
41 changes: 41 additions & 0 deletions scripts/kitten-tts/mini_v0_1/generate_samples.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
#!/usr/bin/env python3
# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)
"""
Generate samples for
https://k2-fsa.github.io/sherpa/onnx/tts/all/
"""


import sherpa_onnx
import soundfile as sf

from generate_voices_bin import speaker2id

# Build the TTS configuration from the inside out: kitten model files first,
# then the model-level settings, then the top-level TTS settings.
kitten_config = sherpa_onnx.OfflineTtsKittenModelConfig(
    model="kitten-mini-en-v0_1-fp16/model.fp16.onnx",
    voices="kitten-mini-en-v0_1-fp16/voices.bin",
    tokens="kitten-mini-en-v0_1-fp16/tokens.txt",
    data_dir="kitten-mini-en-v0_1-fp16/espeak-ng-data",
)
model_config = sherpa_onnx.OfflineTtsModelConfig(
    kitten=kitten_config,
    num_threads=2,
)
config = sherpa_onnx.OfflineTtsConfig(
    model=model_config,
    max_num_sentences=1,
)

if not config.validate():
    raise ValueError("Please check your config")

tts = sherpa_onnx.OfflineTts(config)
text = "Friends fell out often because life was changing so fast. The easiest thing in the world was to lose touch with someone."

# One sample per speaker, written below ./hf (expected to exist already).
num_speakers = len(speaker2id)
for s, i in speaker2id.items():
    print(s, i, num_speakers)
    audio = tts.generate(text, sid=i, speed=1.0)

    out_path = f"./hf/kitten/v0.1-mini/mp3/{i}-{s}.mp3"
    sf.write(out_path, audio.samples, samplerate=audio.sample_rate)
1 change: 1 addition & 0 deletions scripts/kitten-tts/mini_v0_1/generate_tokens.py
1 change: 1 addition & 0 deletions scripts/kitten-tts/mini_v0_1/generate_voices_bin.py
21 changes: 21 additions & 0 deletions scripts/kitten-tts/mini_v0_1/run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#!/usr/bin/env bash
# Copyright 2025 Xiaomi Corp. (authors: Fangjun Kuang)

set -ex

# Download the URL in $1 into the current directory unless a file with the
# same base name is already there.
#
# --fail makes curl exit non-zero on an HTTP error, so an error page is never
# saved under the model's name. The download goes to a temporary name and is
# renamed only on success, so an interrupted transfer is not mistaken for a
# complete file on the next run.
download_if_missing() {
  local url=$1
  local name=${url##*/}

  if [ ! -f "$name" ]; then
    curl --fail -SL -o "$name.tmp" "$url"
    mv "$name.tmp" "$name"
  fi
}

download_if_missing https://huggingface.co/KittenML/kitten-tts-mini-0.1/resolve/main/kitten_tts_mini_v0_1.onnx
download_if_missing https://huggingface.co/KittenML/kitten-tts-mini-0.1/resolve/main/voices.npz

./generate_voices_bin.py
./generate_tokens.py

# convert_opset.py writes model.fp16.onnx, which add_meta_data.py then
# updates in place.
./convert_opset.py
./show.py
./add_meta_data.py --model ./model.fp16.onnx
# ./test.py --model ./model.fp16.onnx --tokens ./tokens.txt --voice ./voices.bin
ls -lh
1 change: 1 addition & 0 deletions scripts/kitten-tts/mini_v0_1/show.py
1 change: 1 addition & 0 deletions scripts/kitten-tts/mini_v0_1/test.py
2 changes: 1 addition & 1 deletion scripts/kitten-tts/nano_v0_1/generate_samples.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
audio = tts.generate(text, sid=i, speed=1.0)

sf.write(
f"./hf/kitten/v0.1/mp3/{i}-{s}.mp3",
f"./hf/kitten/v0.1-nano/mp3/{i}-{s}.mp3",
audio.samples,
samplerate=audio.sample_rate,
)
2 changes: 1 addition & 1 deletion scripts/kitten-tts/nano_v0_2/generate_samples.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
audio = tts.generate(text, sid=i, speed=1.0)

sf.write(
f"./hf/kitten/v0.2/mp3/{i}-{s}.mp3",
f"./hf/kitten/v0.2-nano/mp3/{i}-{s}.mp3",
audio.samples,
samplerate=audio.sample_rate,
)