diff --git a/api_examples/pipelines/test_text_to_speech.py b/api_examples/pipelines/test_text_to_speech.py
new file mode 100644
index 000000000..7eeb47c45
--- /dev/null
+++ b/api_examples/pipelines/test_text_to_speech.py
@@ -0,0 +1,27 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddlex import create_pipeline
+
+pipeline = create_pipeline(pipeline="text_to_speech")  # instantiate the TTS pipeline by its registered name
+
+output = pipeline.predict(  # returns a generator yielding one result per synthesized sentence
+    "根据您的情况,建议低盐饮食配合轻度活动,已为您推荐了健康的食谱"
+)
+
+for res in output:
+    print(res)  # raw repr of the result object
+    res.print()  # formatted (JSON-style) result printout
+    res.save_to_audio("./output/test.wav")  # write the synthesized waveform as a WAV file
+    res.save_to_json("./output")  # save the result metadata as JSON
diff --git a/docs/pipeline_usage/tutorials/speech_pipelines/text_to_speech.en.md b/docs/pipeline_usage/tutorials/speech_pipelines/text_to_speech.en.md
new file mode 100644
index 000000000..14780d1c5
--- /dev/null
+++ b/docs/pipeline_usage/tutorials/speech_pipelines/text_to_speech.en.md
@@ -0,0 +1,245 @@
+---
+comments: true
+---
+
+# Text to Speech Pipeline Tutorial
+
+## 1. Introduction to the Text to Speech Pipeline
+Text to Speech is a cutting-edge technology capable of converting computer-generated text information into natural and fluent human speech signals in real-time. This technology has been deeply applied across multiple domains including virtual assistants, accessibility services, navigation announcements, and media entertainment, significantly enhancing human-computer interaction experiences and enabling highly natural voice output in cross-linguistic scenarios.
+
+
Text to Speech Model:
+
+
+ | Model |
+ Model Download Link |
+ Training Data |
+ Model Storage Size (MB) |
+ Introduction |
+
+
+ | fastspeech2_csmsc_pwgan_csmsc |
+ fastspeech2_csmsc / pwgan_csmsc |
+ / |
+ 768.1 |
+ FastSpeech2 is an end-to-end text-to-speech (TTS) model developed by Microsoft, featuring efficient and stable prosody control capabilities. Utilizing a non-autoregressive architecture, it enables fast and high-quality speech synthesis, making it suitable for various applications such as virtual assistants and audiobook production. |
+
+
+
+## 2. Quick Start
+PaddleX supports experiencing the text to speech pipeline locally using the command line or Python.
+
+Before using the text to speech pipeline locally, please ensure that you have completed the installation of the PaddleX wheel package according to the [PaddleX Local Installation Guide](../../../installation/installation.en.md). If you wish to selectively install dependencies, please refer to the relevant instructions in the installation guide. The dependency group corresponding to this pipeline is `speech`.
+
+### 2.1 Local Experience
+
+#### 2.1.1 Command Line Experience
+PaddleX supports experiencing the text to speech pipeline locally using the command line or Python.
+
+```bash
+paddlex --pipeline text_to_speech \
+ --input "今天天气真的很好"
+```
+
+The relevant parameter descriptions can be found in the parameter descriptions in [2.1.2 Integration with Python Script](#212-integration-with-python-script).
+
+After running, the result will be printed to the terminal, as follows:
+
+```plaintext
+{'res': {'result': array([-8.118157e-04, ..., 6.217696e-05], shape=(38700,), dtype=float32)}}
+```
+
+The explanation of the result parameters can refer to the result explanation in [2.1.2 Integration with Python Script](#212-integration-with-python-script).
+
+#### 2.1.2 Integration with Python Script
+
+The above command line is for quickly experiencing and viewing the effect. Generally speaking, in a project, it is often necessary to integrate through code. You can complete the rapid inference of the pipeline with just a few lines of code. The inference code is as follows:
+
+```python
+from paddlex import create_pipeline
+
+pipeline = create_pipeline(pipeline="text_to_speech")
+
+output = pipeline.predict(
+ "今天天气真的很好"
+)
+
+for res in output:
+ print(res)
+ res.print()
+ res.save_to_audio("./output/test.wav")
+ res.save_to_json("./output")
+```
+
+In the above Python script, the following steps are executed:
+
+(1)The text to speech pipeline object is instantiated through create_pipeline(). The specific parameter descriptions are as follows:
+
+
+
+
+| Parameter |
+Parameter Description |
+Parameter Type |
+Default |
+
+
+
+
+pipeline |
+The name of the pipeline or the path to the pipeline configuration file. If it is the pipeline name, it must be a pipeline supported by PaddleX. |
+str |
+None |
+
+
+device |
+The inference device for the pipeline. It supports specifying the specific card number of the GPU, such as "gpu:0", the specific card number of other hardware, such as "npu:0" |
+str |
+gpu:0 |
+
+
+use_hpip |
+Whether to enable the high-performance inference plugin. If set to None, the setting from the configuration file or config will be used. Not supported for now. |
+bool | None |
+None |
+
+
+hpi_config |
+High-performance inference configuration. Not supported for now. |
+dict | None |
+None |
+
+
+
+
+(2)The predict() method of the text to speech pipeline object is called to perform inference and prediction. This method will return a generator. Below are the parameters and their descriptions for the predict() method:
+
+
+
+
+| Parameter |
+Parameter Description |
+Parameter Type |
+Options |
+Default |
+
+
+
+input |
+Data to be predicted |
+str |
+
+
+ - File path, such as the local path of an text file:
/root/data/text.txt
+ - Text to be synthesized, such as
今天天气真不错
+
+ |
+None |
+
+
+
+
+(3)Process the prediction results. The prediction result for each sample is of the AudioResult type and supports operations such as printing, saving as an audio, and saving as a `json` file:
+
+
+
+
+| Method |
+Method Description |
+Parameter |
+Parameter type |
+Parameter Description |
+Default |
+
+
+
+print() |
+Print the result to the terminal |
+format_json |
+bool |
+Whether to format the output content using JSON indentation |
+True |
+
+
+indent |
+int |
+ | Specify the indentation level to beautify the output JSON data, making it more readable. Effective only when format_json is True |
+4 |
+
+
+ensure_ascii |
+bool |
+Control whether to escape non-ASCII characters to Unicode. When set to True, all non-ASCII characters will be escaped; False will retain the original characters. Effective only when format_json is True |
+False |
+
+
+save_to_json() |
+Save the result as a JSON file |
+save_path |
+str |
+Path to save the file. When it is a directory, the saved file name is consistent with the input file type naming |
+None |
+
+
+indent |
+int |
+Specify the indentation level to beautify the output JSON data, making it more readable. Effective only when format_json is True |
+4 |
+
+
+ensure_ascii |
+bool |
+Control whether to escape non-ASCII characters to Unicode. When set to True, all non-ASCII characters will be escaped; False will retain the original characters. Effective only when format_json is True |
+False |
+
+
+
+save_to_audio() |
+Save the result as a wav file |
+save_path |
+str |
+The saved file path. When it is a directory, the saved file name is consistent with the input file type name. |
+None |
+
+
+
+
+
+- Calling the `print()` method will print the result to the terminal.
+
+- Calling the `save_to_audio()` method will save the above content to the specified `save_path`.
+
+
+
+Once you have the configuration file, you can customize the text_to_speech pipeline configuration by modifying the `pipeline` parameter in the `create_pipeline` method to the path to the pipeline configuration file. An example is as follows:
+
+For example, if your configuration file is saved at `./my_path/text_to_speech.yaml`, you only need to execute:
+
+```python
+from paddlex import create_pipeline
+pipeline = create_pipeline(pipeline="./my_path/text_to_speech.yaml")
+output = pipeline.predict(input="今天天气真的很好")
+for res in output:
+ res.print()
+ res.save_to_json("./output/")
+ res.save_to_audio("./output/test.wav")
+```
+
+Note: The parameters in the configuration file are the initialization parameters for the pipeline. If you want to change the initialization parameters of the text to speech pipeline, you can directly modify the parameters in the configuration file and load the configuration file for prediction. Additionally, CLI prediction also supports passing in a configuration file, simply specify the path of the configuration file with --pipeline.
+
+## 3. Development Integration/Deployment
+
+If the pipeline meets your requirements for inference speed and accuracy, you can directly proceed with development integration/deployment.
+
+If you need to apply the pipeline directly in your Python project, you can refer to the example code in [2.1.2 Integration with Python Script](#212-integration-with-python-script).
+
+In addition, PaddleX also provides three other deployment methods, which are detailed as follows:
+
+🚀 High-Performance Inference: In actual production environments, many applications have strict performance requirements for deployment strategies, especially in terms of response speed, to ensure the efficient operation of the system and the smoothness of the user experience. To this end, PaddleX provides a high-performance inference plugin, which aims to deeply optimize the performance of model inference and pre/post-processing to achieve significant acceleration of the end-to-end process. For detailed high-performance inference procedures, please refer to the [PaddleX High-Performance Inference Guide](../../../pipeline_deploy/high_performance_inference.en.md).
+
+☁️ Serving Deployment: Serving Deployment is a common deployment form in actual production environments. By encapsulating inference functions as services, clients can access these services through network requests to obtain inference results. PaddleX supports multiple pipeline serving deployment solutions. For detailed pipeline serving deployment procedures, please refer to the [PaddleX Serving Deployment Guide](../../../pipeline_deploy/serving.en.md).
+
+📱 On-Device Deployment: Edge deployment is a method that places computational and data processing capabilities directly on user devices, allowing them to process data without relying on remote servers. PaddleX supports deploying models on edge devices such as Android. For detailed procedures, please refer to the [PaddleX On-Device Deployment Guide](../../../pipeline_deploy/on_device_deployment.en.md).
\ No newline at end of file
diff --git a/docs/pipeline_usage/tutorials/speech_pipelines/text_to_speech.md b/docs/pipeline_usage/tutorials/speech_pipelines/text_to_speech.md
new file mode 100644
index 000000000..e36fef052
--- /dev/null
+++ b/docs/pipeline_usage/tutorials/speech_pipelines/text_to_speech.md
@@ -0,0 +1,249 @@
+---
+comments: true
+---
+
+# 语音合成产线使用教程
+
+## 1. 语音合成产线介绍
+语音合成是一种前沿技术,能够将计算机生成的文本信息实时转换为自然流畅的人类语音信号。该技术已在智能助手、无障碍服务、导航播报、媒体娱乐等多个领域深度应用,显著提升人机交互体验,实现跨语言场景的高自然度语音输出。
+
+
+> 推理耗时仅包含模型推理耗时,不包含前后处理耗时。
+
+语音合成模型:
+
+
+ | 模型 |
+ 模型下载链接 |
+ 训练数据 |
+ 模型存储大小(MB) |
+ 介绍 |
+
+
+ | fastspeech2_csmsc_pwgan_csmsc |
+ fastspeech2_csmsc / pwgan_csmsc |
+ / |
+ 768.1 |
+ FastSpeech2 是微软开发的端到端文本转语音(TTS)模型,具备高效稳定的韵律控制能力。它采用非自回归架构,能实现快速高质量的语音合成,适用于虚拟助手、有声读物等多种场景。 |
+
+
+
+## 2. 快速开始
+PaddleX 支持在本地使用命令行或 Python 体验语音合成产线的效果。
+
+在本地使用语音合成产线前,请确保您已经按照[PaddleX本地安装教程](../../../installation/installation.md)完成了 PaddleX 的 wheel 包安装。如果您希望选择性安装依赖,请参考安装教程中的相关说明。该产线对应的依赖分组为 `speech`。
+
+### 2.1 本地体验
+
+#### 2.1.1 命令行方式体验
+一行命令即可快速体验语音合成产线效果
+
+```bash
+paddlex --pipeline text_to_speech \
+ --input "今天天气真的很好"
+```
+
+相关的参数说明可以参考[2.1.2 Python脚本方式集成](#212-python脚本方式集成)中的参数说明。
+
+运行后,会将结果打印到终端上,结果如下:
+
+```plaintext
+{'res': {'result': array([-8.118157e-04, ..., 6.217696e-05], shape=(38700,), dtype=float32)}}
+```
+
+运行结果参数说明可以参考[2.1.2 Python脚本方式集成](#212-python脚本方式集成)中的结果解释。
+
+#### 2.1.2 Python脚本方式集成
+
+上述命令行是为了快速体验查看效果,一般来说,在项目中,往往需要通过代码集成,您可以通过几行代码即可完成产线的快速推理,推理代码如下:
+
+```python
+from paddlex import create_pipeline
+
+pipeline = create_pipeline(pipeline="text_to_speech")
+
+output = pipeline.predict(
+ "今天天气真的很好"
+)
+
+for res in output:
+ print(res)
+ res.print()
+ res.save_to_audio("./output/test.wav")
+ res.save_to_json("./output")
+```
+
+在上述 Python 脚本中,执行了如下几个步骤:
+
+(1)通过 `create_pipeline()` 实例化 text_to_speech 产线对象:具体参数说明如下:
+
+
+
+
+| 参数 |
+参数说明 |
+参数类型 |
+默认值 |
+
+
+
+
+pipeline |
+产线名称或是产线配置文件路径。如为产线名称,则必须为 PaddleX 所支持的产线。 |
+str |
+None |
+
+
+device |
+产线推理设备。支持指定GPU具体卡号,如“gpu:0”,其他硬件具体卡号,如“npu:0”,CPU如“cpu”。 |
+str |
+gpu:0 |
+
+
+use_hpip |
+是否启用高性能推理插件。如果为 None,则使用配置文件或 config 中的配置。目前暂不支持。 |
+bool | None |
+无 |
+
+
+hpi_config |
+高性能推理配置。目前暂不支持。 |
+dict | None |
+无 |
+
+
+
+
+(2)调用 text_to_speech 产线对象的 `predict()` 方法进行推理预测。该方法将返回一个 `generator`。以下是 `predict()` 方法的参数及其说明:
+
+
+
+
+| 参数 |
+参数说明 |
+参数类型 |
+可选项 |
+默认值 |
+
+
+
+input |
+待预测数据 |
+str |
+
+
+ - 文件路径,如语音文件的本地路径:
/root/data/text.txt
+ - 合成的文字,如
今天天气真不错
+
+ |
+None |
+
+
+
+
+(3)对预测结果进行处理,每个样本的预测结果均为对应的Result对象,且支持打印、保存为音频、保存为`json`文件的操作:
+
+
+
+
+| 方法 |
+方法说明 |
+参数 |
+参数类型 |
+参数说明 |
+默认值 |
+
+
+
+print() |
+打印结果到终端 |
+format_json |
+bool |
+是否对输出内容进行使用 JSON 缩进格式化 |
+True |
+
+
+indent |
+int |
+指定缩进级别,以美化输出的 JSON 数据,使其更具可读性,仅当 format_json 为 True 时有效 |
+4 |
+
+
+ensure_ascii |
+bool |
+控制是否将非 ASCII 字符转义为 Unicode。设置为 True 时,所有非 ASCII 字符将被转义;False 则保留原始字符,仅当format_json为True时有效 |
+False |
+
+
+save_to_json() |
+将结果保存为json格式的文件 |
+save_path |
+str |
+保存的文件路径,当为目录时,保存文件命名与输入文件类型命名一致 |
+无 |
+
+
+indent |
+int |
+指定缩进级别,以美化输出的 JSON 数据,使其更具可读性,仅当 format_json 为 True 时有效 |
+4 |
+
+
+ensure_ascii |
+bool |
+控制是否将非 ASCII 字符转义为 Unicode。设置为 True 时,所有非 ASCII 字符将被转义;False 则保留原始字符,仅当format_json为True时有效 |
+False |
+
+
+
+save_to_audio() |
+将结果保存为wav格式的文件 |
+save_path |
+str |
+保存的文件路径,当为目录时,保存文件命名与输入文件类型命名一致 |
+无 |
+
+
+
+
+
+- 调用`print()` 方法会将结果打印到终端
+
+- 调用`save_to_audio()` 方法会将上述内容保存到指定的`save_path`中
+
+
+
+若您获取了配置文件,即可对 text_to_speech 产线各项配置进行自定义,只需要修改 `create_pipeline` 方法中的 `pipeline` 参数值为产线配置文件路径即可。示例如下:
+
+例如,若您的配置文件保存在 `./my_path/text_to_speech.yaml` ,则只需执行:
+
+```python
+from paddlex import create_pipeline
+pipeline = create_pipeline(pipeline="./my_path/text_to_speech.yaml")
+output = pipeline.predict(input="今天天气真的很好")
+for res in output:
+ res.print()
+ res.save_to_json("./output/")
+ res.save_to_audio("./output/test.wav")
+```
+
+注: 配置文件中的参数为产线初始化参数,如果希望更改 text_to_speech 产线初始化参数,可以直接修改配置文件中的参数,并加载配置文件进行预测。同时,CLI 预测也支持传入配置文件,`--pipeline` 指定配置文件的路径即可。
+
+## 3. 开发集成/部署
+
+如果产线可以达到您对产线推理速度和精度的要求,您可以直接进行开发集成/部署。
+
+若您需要将产线直接应用在您的Python项目中,可以参考 [2.1.2 Python脚本方式集成](#212-python脚本方式集成)中的示例代码。
+
+此外,PaddleX 也提供了其他三种部署方式,详细说明如下:
+
+🚀 高性能推理:在实际生产环境中,许多应用对部署策略的性能指标(尤其是响应速度)有着较严苛的标准,以确保系统的高效运行与用户体验的流畅性。为此,PaddleX 提供高性能推理插件,旨在对模型推理及前后处理进行深度性能优化,实现端到端流程的显著提速,详细的高性能推理流程请参考[PaddleX高性能推理指南](../../../pipeline_deploy/high_performance_inference.md)。
+
+☁️ 服务化部署:服务化部署是实际生产环境中常见的一种部署形式。通过将推理功能封装为服务,客户端可以通过网络请求来访问这些服务,以获取推理结果。PaddleX 支持多种产线服务化部署方案,详细的产线服务化部署流程请参考[PaddleX服务化部署指南](../../../pipeline_deploy/serving.md)。
+
+📱 端侧部署:端侧部署是一种将计算和数据处理功能放在用户设备本身上的方式,设备可以直接处理数据,而不需要依赖远程的服务器。PaddleX 支持将模型部署在 Android 等端侧设备上,详细的端侧部署流程请参考[PaddleX端侧部署指南](../../../pipeline_deploy/on_device_deployment.md)。
+您可以根据需要选择合适的方式部署模型产线,进而进行后续的 AI 应用集成。
diff --git a/paddlex/configs/pipelines/text_to_speech.yaml b/paddlex/configs/pipelines/text_to_speech.yaml
new file mode 100644
index 000000000..c0c8b1875
--- /dev/null
+++ b/paddlex/configs/pipelines/text_to_speech.yaml
@@ -0,0 +1,33 @@
+pipeline_name: text_to_speech
+
+SubModules:
+  TextToPinyin:  # grapheme-to-phoneme front end: raw text -> phone ids
+    module_name: text_to_pinyin
+    model_name: G2PWModel
+    model_dir: null  # null -> use the default (downloaded) pretrained weights
+    batch_size: 1
+    device: gpu:0
+    use_trt: False  # TensorRT acceleration disabled by default
+    use_mkldnn: False  # oneDNN (MKL-DNN) CPU acceleration disabled by default
+    cpu_threads: 1
+    precision: "fp32"
+  TextToSpeechAcoustic:  # acoustic model: phone ids -> acoustic features
+    module_name: text_to_speech_acoustic
+    model_name: fastspeech2_csmsc
+    model_dir: null
+    batch_size: 1
+    device: gpu:0
+    use_trt: False
+    use_mkldnn: False
+    cpu_threads: 1
+    precision: "fp32"
+  TextToSpeechVocoder:  # vocoder: acoustic features -> waveform
+    module_name: text_to_speech_vocoder
+    model_name: pwgan_csmsc
+    model_dir: null
+    batch_size: 1
+    device: gpu:0
+    use_trt: False
+    use_mkldnn: False
+    cpu_threads: 1
+    precision: "fp32"
\ No newline at end of file
diff --git a/paddlex/inference/pipelines/__init__.py b/paddlex/inference/pipelines/__init__.py
index 1ae875824..69aa48056 100644
--- a/paddlex/inference/pipelines/__init__.py
+++ b/paddlex/inference/pipelines/__init__.py
@@ -55,6 +55,7 @@
from .ts_forecasting import TSFcPipeline
from .video_classification import VideoClassificationPipeline
from .video_detection import VideoDetectionPipeline
+from .text_to_speech import TextToSpeechPipeline
def get_pipeline_path(pipeline_name: str) -> str:
diff --git a/paddlex/inference/pipelines/text_to_speech/__init__.py b/paddlex/inference/pipelines/text_to_speech/__init__.py
new file mode 100644
index 000000000..5ea98f433
--- /dev/null
+++ b/paddlex/inference/pipelines/text_to_speech/__init__.py
@@ -0,0 +1,15 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .pipeline import TextToSpeechPipeline
diff --git a/paddlex/inference/pipelines/text_to_speech/pipeline.py b/paddlex/inference/pipelines/text_to_speech/pipeline.py
new file mode 100644
index 000000000..33e773b0e
--- /dev/null
+++ b/paddlex/inference/pipelines/text_to_speech/pipeline.py
@@ -0,0 +1,149 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+from typing import Any, Dict, List, Optional, Union
+
+import numpy as np
+
+from ...models.text_to_speech_vocoder.result import PwganResult
+from ...models.text_to_speech_acoustic.result import Fastspeech2Result
+from ...models.text_to_pinyin.result import TextToPinyinResult
+from ...utils.benchmark import benchmark
+from ...utils.hpi import HPIConfig
+from ...utils.pp_option import PaddlePredictorOption
+from ..base import BasePipeline
+
+
+@benchmark.time_methods
+class TextToSpeechPipeline(BasePipeline):
+    """Text-to-speech pipeline: text -> pinyin (G2P) -> acoustic model -> vocoder waveform."""
+
+    entities = "text_to_speech"
+
+    def __init__(
+        self,
+        config: Dict,
+        device: Optional[str] = None,
+        pp_option: Optional[PaddlePredictorOption] = None,
+        use_hpip: bool = False,
+        hpi_config: Optional[Union[Dict[str, Any], HPIConfig]] = None,
+    ) -> None:
+        """
+        Initializes the class with given configurations and options.
+
+        Args:
+            config (Dict): Configuration dictionary containing model and other parameters.
+            device (Optional[str]): The device to run the prediction on. Default is None.
+            pp_option (Optional[PaddlePredictorOption]): Options for PaddlePaddle predictor. Default is None.
+            use_hpip (bool, optional): Whether to use the high-performance
+                inference plugin (HPIP) by default. Defaults to False.
+            hpi_config (Optional[Union[Dict[str, Any], HPIConfig]], optional):
+                The default high-performance inference configuration dictionary.
+                Defaults to None.
+        """
+        super().__init__(
+            device=device, pp_option=pp_option, use_hpip=use_hpip, hpi_config=hpi_config
+        )
+
+        text_to_pinyin_model_config = config["SubModules"][  # G2P front end: text -> phone ids
+            "TextToPinyin"
+        ]
+        self.text_to_pinyin_model = self.create_model(
+            text_to_pinyin_model_config
+        )
+        text_to_speech_acoustic_model_config = config["SubModules"][  # acoustic model: phone ids -> features
+            "TextToSpeechAcoustic"
+        ]
+        self.text_to_speech_acoustic_model = self.create_model(
+            text_to_speech_acoustic_model_config
+        )
+        text_to_speech_vocoder_model_config = config["SubModules"][  # vocoder: features -> waveform
+            "TextToSpeechVocoder"
+        ]
+        self.text_to_speech_vocoder_model = self.create_model(
+            text_to_speech_vocoder_model_config
+        )
+
+    def predict(
+        self, input: Union[str, List[str]], **kwargs  # ndarray inputs are not handled by the body below
+    ) -> PwganResult:
+        """Synthesizes speech for the given input text(s).
+
+        Args:
+            input (Union[str, list[str]]): Text to synthesize, a ".txt" file path (one sentence per line), or a list of these.
+            **kwargs: Additional keyword arguments (currently unused by this method).
+
+        Yields:
+            PwganResult: One waveform result per input sentence; supports print and str/json/audio saving.
+        """
+        sentences = []
+        if isinstance(input, str):
+            if input.endswith(".txt"):  # treat the string as a path to a sentence-per-line text file
+                if not os.path.exists(input):  # NOTE(review): requires "import os" at module top -- verify it is present
+                    raise FileNotFoundError(f"The specified text file does not exist: {input}")
+                try:
+                    with open(input, "r", encoding="utf-8") as f:
+                        sentences = [line.strip() for line in f.readlines()]
+                except IOError as e:
+                    raise IOError(f"An error occurred while reading the file {input}: {e}")
+            else:
+                sentences = [input]  # plain text: synthesize the string itself
+        elif isinstance(input, list):
+            for item in input:
+                if isinstance(item, str):
+                    if item.endswith(".txt"):  # list items may also be text-file paths
+                        if not os.path.exists(item):
+                            raise FileNotFoundError(f"The specified text file in the list does not exist: {item}")
+                        try:
+                            with open(item, "r", encoding="utf-8") as f:
+                                sentences.extend([line.strip() for line in f.readlines()])
+                        except IOError as e:
+                            raise IOError(f"An error occurred while reading the file {item}: {e}")
+                    else:
+                        sentences.append(item)
+                else:
+                    raise TypeError(f"Unsupported input type: {type(input)}. Expected str, list, or np.ndarray.")
+        if not sentences:  # also reached for unsupported top-level input types (there is no else branch above)
+            raise ValueError("The input resulted in an empty list of sentences to process.")
+
+        for sentence in sentences:
+            text_to_pinyin_res = [self.get_text_to_pinyin_result(sentence)['result']['phone_ids']]  # G2P: phone ids
+            text_to_speech_acoustic_res = [self.get_text_to_speech_acoustic_result(text_to_pinyin_res)['result']]  # acoustic features
+            yield from self.text_to_speech_vocoder_model(text_to_speech_acoustic_res)  # vocoder: yield waveform result(s)
+
+    def get_text_to_pinyin_result(self, input: Union[str, List[str]]
+    ) -> TextToPinyinResult:
+        """Get the result of text to pinyin conversion.
+
+        Args:
+            input (Union[str, list[str]]): The input text or list of texts.
+
+        Returns:
+            TextToPinyinResult: The result of text to pinyin conversion.
+        """
+        return next(self.text_to_pinyin_model(input))  # model returns a generator; take its first (only) result
+
+    def get_text_to_speech_acoustic_result(self, input: List[Any]
+    ) -> Fastspeech2Result:
+        """Get the result of text to speech acoustic conversion.
+
+        Args:
+            input (List[Any]): Batch of phone-id sequences produced by the G2P model.
+
+        Returns:
+            Fastspeech2Result: The result of text to speech acoustic conversion.
+        """
+        return next(self.text_to_speech_acoustic_model(input))  # generator -> first (only) result
+
diff --git a/paddlex/utils/pipeline_arguments.py b/paddlex/utils/pipeline_arguments.py
index ff31dc1ee..386cb02e0 100644
--- a/paddlex/utils/pipeline_arguments.py
+++ b/paddlex/utils/pipeline_arguments.py
@@ -805,4 +805,5 @@ def validator(cli_input: str) -> cli_expected_type:
],
"3d_bev_detection": None,
"multilingual_speech_recognition": None,
+ "text_to_speech": None,
}