k2-fsa
diff --git a/‎cmake/kaldi-native-fbank.cmake‎
Lines changed: 8 additions & 8 deletions b/‎cmake/kaldi-native-fbank.cmake‎
Lines changed: 8 additions & 8 deletions
diff --git a/‎cxx-api-examples/zipformer-transducer-simulate-streaming-microphone-cxx-api.cc‎
Lines changed: 2 additions & 2 deletions b/‎cxx-api-examples/zipformer-transducer-simulate-streaming-microphone-cxx-api.cc‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎python-api-examples/generate-subtitles.py‎
Lines changed: 47 additions & 15 deletions b/‎python-api-examples/generate-subtitles.py‎
Lines changed: 47 additions & 15 deletions
diff --git a/‎sherpa-onnx/csrc/CMakeLists.txt‎
Lines changed: 2 additions & 0 deletions b/‎sherpa-onnx/csrc/CMakeLists.txt‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎sherpa-onnx/csrc/silero-vad-model-config.cc‎
Lines changed: 1 addition & 1 deletion b/‎sherpa-onnx/csrc/silero-vad-model-config.cc‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎sherpa-onnx/csrc/silero-vad-model-config.h‎
Lines changed: 0 additions & 1 deletion b/‎sherpa-onnx/csrc/silero-vad-model-config.h‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎sherpa-onnx/csrc/ten-vad-model-config.cc‎
Lines changed: 111 additions & 0 deletions b/‎sherpa-onnx/csrc/ten-vad-model-config.cc‎
Lines changed: 111 additions & 0 deletions
diff --git a/‎sherpa-onnx/csrc/ten-vad-model-config.h‎
Lines changed: 45 additions & 0 deletions b/‎sherpa-onnx/csrc/ten-vad-model-config.h‎
Lines changed: 45 additions & 0 deletions
@@ -1,9 +1,9 @@
 function(download_kaldi_native_fbank)
   include(FetchContent)
 
-  set(kaldi_native_fbank_URL   "https://github.com/csukuangfj/kaldi-native-fbank/archive/refs/tags/v1.21.2.tar.gz")
-  set(kaldi_native_fbank_URL2  "https://hf-mirror.com/csukuangfj/sherpa-onnx-cmake-deps/resolve/main/kaldi-native-fbank-1.21.2.tar.gz")
-  set(kaldi_native_fbank_HASH "SHA256=f4bd7d53fe8aeaecc4eda9680c72696bb86bf74e86371d81aacacd6f4ca3914d")
+  # Primary download location: the tagged release archive on github.com.
+  set(kaldi_native_fbank_URL   "https://github.com/csukuangfj/kaldi-native-fbank/archive/refs/tags/v1.21.3.tar.gz")
+  # Alternative download location hosting the same archive version.
+  set(kaldi_native_fbank_URL2  "https://hf-mirror.com/csukuangfj/sherpa-onnx-cmake-deps/resolve/main/kaldi-native-fbank-1.21.3.tar.gz")
+  # Expected SHA256 checksum of the v1.21.3 archive.
+  set(kaldi_native_fbank_HASH "SHA256=d409eddae5a46dc796f0841880f489ff0728b96ae26218702cd438c28667c70e")
 
   set(KALDI_NATIVE_FBANK_BUILD_TESTS OFF CACHE BOOL "" FORCE)
   set(KALDI_NATIVE_FBANK_BUILD_PYTHON OFF CACHE BOOL "" FORCE)
@@ -12,11 +12,11 @@ function(download_kaldi_native_fbank)
   # If you don't have access to the Internet,
   # please pre-download kaldi-native-fbank
   set(possible_file_locations
-    $ENV{HOME}/Downloads/kaldi-native-fbank-1.21.2.tar.gz
-    ${CMAKE_SOURCE_DIR}/kaldi-native-fbank-1.21.2.tar.gz
-    ${CMAKE_BINARY_DIR}/kaldi-native-fbank-1.21.2.tar.gz
-    /tmp/kaldi-native-fbank-1.21.2.tar.gz
-    /star-fj/fangjun/download/github/kaldi-native-fbank-1.21.2.tar.gz
+    $ENV{HOME}/Downloads/kaldi-native-fbank-1.21.3.tar.gz
+    ${CMAKE_SOURCE_DIR}/kaldi-native-fbank-1.21.3.tar.gz
+    ${CMAKE_BINARY_DIR}/kaldi-native-fbank-1.21.3.tar.gz
+    /tmp/kaldi-native-fbank-1.21.3.tar.gz
+    /star-fj/fangjun/download/github/kaldi-native-fbank-1.21.3.tar.gz
   )
 
   foreach(f IN LISTS possible_file_locations)
 
@@ -1,8 +1,8 @@
 // cxx-api-examples/zipformer-transducer-simulate-streaming-microphone-cxx-api.cc
 // Copyright (c)  2025  Xiaomi Corporation
 //
-// This file demonstrates how to use Zipformer transducer with sherpa-onnx's C++ API
-// for streaming speech recognition from a microphone.
+// This file demonstrates how to use Zipformer transducer with sherpa-onnx's C++
+// API for streaming speech recognition from a microphone.
 //
 // clang-format off
 //
 
@@ -19,6 +19,12 @@
 
 wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
 
+or download ten-vad.onnx, for instance
+
+wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/ten-vad.onnx
+
+Please replace --silero-vad-model with --ten-vad-model below to use ten-vad.
+
 (1) For paraformer
 
     ./python-api-examples/generate-subtitles.py  \
@@ -124,8 +130,13 @@ def get_args():
     parser.add_argument(
         "--silero-vad-model",
         type=str,
-        required=True,
-        help="Path to silero_vad.onnx",
+        help="Path to silero_vad.onnx.",
+    )
+
+    parser.add_argument(
+        "--ten-vad-model",
+        type=str,
+        help="Path to ten-vad.onnx",
     )
 
     parser.add_argument(
@@ -499,7 +510,12 @@ def __str__(self):
 def main():
     args = get_args()
     assert_file_exists(args.tokens)
-    assert_file_exists(args.silero_vad_model)
+    if args.silero_vad_model:
+        assert_file_exists(args.silero_vad_model)
+    elif args.ten_vad_model:
+        assert_file_exists(args.ten_vad_model)
+    else:
+        raise ValueError("You need to supply one vad model")
 
     assert args.num_threads > 0, args.num_threads
 
@@ -536,18 +552,34 @@ def main():
     stream = recognizer.create_stream()
 
     config = sherpa_onnx.VadModelConfig()
-    config.silero_vad.model = args.silero_vad_model
-    config.silero_vad.threshold = 0.5
-    config.silero_vad.min_silence_duration = 0.25  # seconds
-    config.silero_vad.min_speech_duration = 0.25  # seconds
-
-    # If the current segment is larger than this value, then it increases
-    # the threshold to 0.9 internally. After detecting this segment,
-    # it resets the threshold to its original value.
-    config.silero_vad.max_speech_duration = 5  # seconds
-    config.sample_rate = args.sample_rate
-
-    window_size = config.silero_vad.window_size
+    if args.silero_vad_model:
+        config.silero_vad.model = args.silero_vad_model
+        config.silero_vad.threshold = 0.2
+        config.silero_vad.min_silence_duration = 0.25  # seconds
+        config.silero_vad.min_speech_duration = 0.25  # seconds
+
+        # If the current segment is larger than this value, then it increases
+        # the threshold to 0.9 internally. After detecting this segment,
+        # it resets the threshold to its original value.
+        config.silero_vad.max_speech_duration = 5  # seconds
+        config.sample_rate = args.sample_rate
+
+        window_size = config.silero_vad.window_size
+        print("use silero-vad")
+    else:
+        config.ten_vad.model = args.ten_vad_model
+        config.ten_vad.threshold = 0.2
+        config.ten_vad.min_silence_duration = 0.25  # seconds
+        config.ten_vad.min_speech_duration = 0.25  # seconds
+
+        # If the current segment is larger than this value, then it increases
+        # the threshold to 0.9 internally. After detecting this segment,
+        # it resets the threshold to its original value.
+        config.ten_vad.max_speech_duration = 5  # seconds
+        config.sample_rate = args.sample_rate
+
+        window_size = config.ten_vad.window_size
+        print("use ten-vad")
 
     buffer = []
     vad = sherpa_onnx.VoiceActivityDetector(config, buffer_size_in_seconds=100)
 
@@ -123,6 +123,8 @@ set(sources
   spoken-language-identification.cc
   stack.cc
   symbol-table.cc
+  ten-vad-model-config.cc
+  ten-vad-model.cc
   text-utils.cc
   transducer-keyword-decoder.cc
   transpose.cc
 
@@ -40,7 +40,7 @@ void SileroVadModelConfig::Register(ParseOptions *po) {
       "to the silero VAD model. WARNING! Silero VAD models were trained using "
       "512, 1024, 1536 samples for 16000 sample rate and 256, 512, 768 samples "
       "for 8000 sample rate. Values other than these may affect model "
-      "perfomance!");
+      "performance!");
 }
 
 bool SileroVadModelConfig::Validate() const {
 
@@ -24,7 +24,6 @@ struct SileroVadModelConfig {
   float min_speech_duration = 0.25;  // in seconds
 
   // 512, 1024, 1536 samples for 16000 Hz
-  // 256, 512, 768 samples for 800 Hz
   int32_t window_size = 512;  // in samples
 
   // If a speech segment is longer than this value, then we increase
 
@@ -0,0 +1,111 @@
+// sherpa-onnx/csrc/ten-vad-model-config.cc
+//
+// Copyright (c)  2025  Xiaomi Corporation
+
+#include "sherpa-onnx/csrc/ten-vad-model-config.h"
+
+#include <sstream>
+#include <string>
+
+#include "sherpa-onnx/csrc/file-utils.h"
+#include "sherpa-onnx/csrc/macros.h"
+
+namespace sherpa_onnx {
+
+// Registers all TEN VAD options with the given option parser. Each
+// --ten-vad-* flag is bound to the corresponding member of this config,
+// so parsing the command line fills in this object.
+//
+// po: the option parser on which the flags are registered.
+void TenVadModelConfig::Register(ParseOptions *po) {
+  po->Register("ten-vad-model", &model, "Path to TEN VAD ONNX model.");
+
+  po->Register("ten-vad-threshold", &threshold,
+               "Speech threshold. TEN VAD outputs speech probabilities for "
+               "each audio chunk, probabilities ABOVE this value are "
+               "considered as SPEECH. It is better to tune this parameter for "
+               "each dataset separately, but the default value of 0.5 is "
+               "pretty good for most datasets.");
+
+  po->Register("ten-vad-min-silence-duration", &min_silence_duration,
+               "In seconds.  In the end of each speech chunk wait for "
+               "--ten-vad-min-silence-duration seconds before separating it");
+
+  po->Register("ten-vad-min-speech-duration", &min_speech_duration,
+               "In seconds.  In the end of each silence chunk wait for "
+               "--ten-vad-min-speech-duration seconds before separating it");
+
+  po->Register(
+      "ten-vad-max-speech-duration", &max_speech_duration,
+      "In seconds. If a speech segment is longer than this value, then we "
+      "increase the threshold to 0.9. After finishing detecting the segment, "
+      "the threshold value is reset to its original value.");
+
+  // The help text used to end in a dangling fragment with a trailing space;
+  // it is now a complete sentence.
+  po->Register(
+      "ten-vad-window-size", &window_size,
+      "In samples. Audio chunks of --ten-vad-window-size samples are fed "
+      "to the ten VAD model. WARNING! Please use 160 or 256.");
+}
+
+// Checks that the configuration is usable.
+//
+// Returns false, after logging the reason, when:
+//  - no model path is given or the model file does not exist,
+//  - threshold is below 0.01 or is 1 or larger,
+//  - any of the duration settings is not strictly positive,
+//  - window_size is not strictly positive.
+// Returns true otherwise.
+bool TenVadModelConfig::Validate() const {
+  if (model.empty()) {
+    SHERPA_ONNX_LOGE("Please provide --ten-vad-model");
+    return false;
+  }
+
+  if (!FileExists(model)) {
+    SHERPA_ONNX_LOGE("TEN vad model file '%s' does not exist", model.c_str());
+    return false;
+  }
+
+  if (threshold < 0.01) {
+    SHERPA_ONNX_LOGE(
+        "Please use a larger value for --ten-vad-threshold. Given: %f",
+        threshold);
+    return false;
+  }
+
+  if (threshold >= 1) {
+    SHERPA_ONNX_LOGE(
+        "Please use a smaller value for --ten-vad-threshold. Given: %f",
+        threshold);
+    return false;
+  }
+
+  if (min_silence_duration <= 0) {
+    SHERPA_ONNX_LOGE(
+        "Please use a larger value for --ten-vad-min-silence-duration. "
+        "Given: "
+        "%f",
+        min_silence_duration);
+    return false;
+  }
+
+  if (min_speech_duration <= 0) {
+    SHERPA_ONNX_LOGE(
+        "Please use a larger value for --ten-vad-min-speech-duration. "
+        "Given: "
+        "%f",
+        min_speech_duration);
+    return false;
+  }
+
+  if (max_speech_duration <= 0) {
+    SHERPA_ONNX_LOGE(
+        "Please use a larger value for --ten-vad-max-speech-duration. "
+        "Given: "
+        "%f",
+        max_speech_duration);
+    return false;
+  }
+
+  // window_size is the number of samples per audio chunk, so zero or a
+  // negative value is never meaningful. Reject it here instead of letting
+  // it reach the model.
+  if (window_size <= 0) {
+    SHERPA_ONNX_LOGE(
+        "Please use a positive value for --ten-vad-window-size. Given: %d",
+        static_cast<int>(window_size));
+    return false;
+  }
+
+  return true;
+}
+
+// Builds a one-line, human-readable dump of every field in the form
+// TenVadModelConfig(model="...", threshold=..., ..., window_size=...).
+std::string TenVadModelConfig::ToString() const {
+  std::ostringstream oss;
+
+  oss << "TenVadModelConfig("
+      << "model=\"" << model << "\", "
+      << "threshold=" << threshold << ", "
+      << "min_silence_duration=" << min_silence_duration << ", "
+      << "min_speech_duration=" << min_speech_duration << ", "
+      << "max_speech_duration=" << max_speech_duration << ", "
+      << "window_size=" << window_size << ")";
+
+  return oss.str();
+}
+
+}  // namespace sherpa_onnx
@@ -0,0 +1,45 @@
+// sherpa-onnx/csrc/ten-vad-model-config.h
+//
+// Copyright (c)  2025  Xiaomi Corporation
+#ifndef SHERPA_ONNX_CSRC_TEN_VAD_MODEL_CONFIG_H_
+#define SHERPA_ONNX_CSRC_TEN_VAD_MODEL_CONFIG_H_
+
+#include <string>
+
+#include "sherpa-onnx/csrc/parse-options.h"
+
+namespace sherpa_onnx {
+
+// Settings for the TEN voice activity detection (VAD) model.
+struct TenVadModelConfig {
+  // Path to the TEN VAD ONNX model file.
+  std::string model;
+
+  // Speech/non-speech cut-off.
+  //
+  // A segment whose predicted probability is larger than this value
+  // is classified as speech.
+  float threshold = 0.5f;
+
+  float min_silence_duration = 0.5f;  // in seconds
+
+  float min_speech_duration = 0.25f;  // in seconds
+
+  // 160 or 256
+  int32_t window_size = 256;  // in samples
+
+  // Once a speech segment becomes longer than this value, the threshold
+  // is raised to 0.9. When detection of that segment is finished, the
+  // threshold goes back to its original value.
+  float max_speech_duration = 20.0f;  // in seconds
+
+  TenVadModelConfig() = default;
+
+  // Registers the options of this config with the given option parser.
+  void Register(ParseOptions *po);
+
+  // Returns true if the config is valid, false otherwise.
+  bool Validate() const;
+
+  // Returns a string representation of this config.
+  std::string ToString() const;
+};
+
+}  // namespace sherpa_onnx
+
+#endif  // SHERPA_ONNX_CSRC_TEN_VAD_MODEL_CONFIG_H_
Original file line number	Diff line number	Diff line change
`@@ -40,7 +40,7 @@ void SileroVadModelConfig::Register(ParseOptions *po) {`
`40`	`40`	`"to the silero VAD model. WARNING! Silero VAD models were trained using "`
`41`	`41`	`"512, 1024, 1536 samples for 16000 sample rate and 256, 512, 768 samples "`
`42`	`42`	`"for 8000 sample rate. Values other than these may affect model "`
`43`		`- "perfomance!");`
	`43`	`+ "performance!");`
`44`	`44`	`}`
`45`	`45`
`46`	`46`	`bool SileroVadModelConfig::Validate() const {`