lucasnewman
diff --git a/‎Package.resolved‎
Lines changed: 9 additions & 0 deletions b/‎Package.resolved‎
Lines changed: 9 additions & 0 deletions
diff --git a/‎Package.swift‎
Lines changed: 15 additions & 2 deletions b/‎Package.swift‎
Lines changed: 15 additions & 2 deletions
diff --git a/‎Sources/F5TTS/CFM.swift‎
Lines changed: 3 additions & 4 deletions b/‎Sources/F5TTS/CFM.swift‎
Lines changed: 3 additions & 4 deletions
diff --git a/‎Sources/F5TTS/test_en_1_ref_short.wav‎
250 KB b/‎Sources/F5TTS/test_en_1_ref_short.wav‎
250 KB
diff --git a/‎Sources/f5-tts-generate/GenerateCommand.swift‎
Lines changed: 111 additions & 0 deletions b/‎Sources/f5-tts-generate/GenerateCommand.swift‎
Lines changed: 111 additions & 0 deletions
@@ -10,11 +10,13 @@ let package = Package(
         .library(
             name: "F5TTS",
             targets: ["F5TTS"]
-        ),
+        )
     ],
     dependencies: [
         .package(url: "https://github.com/ml-explore/mlx-swift", from: "0.18.0"),
         .package(url: "https://github.com/huggingface/swift-transformers", from: "0.1.13"),
+        .package(url: "https://github.com/apple/swift-argument-parser.git", from: "1.3.0"),
+        .package(url: "https://github.com/lucasnewman/vocos-swift.git", from: "0.0.1")
     ],
     targets: [
         .target(
@@ -26,12 +28,23 @@ let package = Package(
                 .product(name: "MLXFFT", package: "mlx-swift"),
                 .product(name: "MLXLinalg", package: "mlx-swift"),
                 .product(name: "MLXRandom", package: "mlx-swift"),
-                .product(name: "Transformers", package: "swift-transformers"),
+                .product(name: "Transformers", package: "swift-transformers")
             ],
             path: "Sources/F5TTS",
             resources: [
                 .copy("mel_filters.npy"),
+                .copy("test_en_1_ref_short.wav")
             ]
         ),
+        .executableTarget(
+            name: "f5-tts-generate",
+            dependencies: [
+                "F5TTS",
+                .product(name: "Vocos", package: "vocos-swift"),
+                .product(name: "ArgumentParser", package: "swift-argument-parser"),
+                .product(name: "MLX", package: "mlx-swift"),
+            ],
+            path: "Sources/f5-tts-generate"
+        )
     ]
 )
@@ -41,7 +41,6 @@ func padToLength(_ t: MLXArray, length: Int, value: Float? = nil) -> MLXArray {
 /// Stacks `t` along a new leading axis and pads the result via `padToLength`.
 ///
 /// - Parameters:
 ///   - t: Arrays to combine into one batch.
 ///     NOTE(review): `MLX.stacked` runs before any padding is applied, so the inputs
 ///     presumably have to share a shape already — confirm against callers.
 ///   - paddingValue: Fill value forwarded to `padToLength`.
 /// - Returns: Whatever `padToLength` returns for the stacked array and the computed length.
 func padSequence(_ t: [MLXArray], paddingValue: Float = 0) -> MLXArray {
     // Largest last-axis size among the inputs; 0 when `t` is empty or an element has no axes.
     let maxLen = t.map { $0.shape.last ?? 0 }.max() ?? 0
     // Shadows the parameter with the stacked batch.
     let t = MLX.stacked(t, axis: 0)
-    let paddedArrays = padToLength(t, length: maxLen, value: paddingValue)
     return padToLength(t, length: maxLen, value: paddingValue)
 }
 
@@ -108,7 +107,7 @@ public class F5TTS: Module {
         lens: MLXArray? = nil,
         steps: Int = 32,
         cfgStrength: Float = 2.0,
-        swaySamplingCoef: Float? = -1.0,
+        swayCoef: Float? = -1.0,
         seed: Int? = nil,
         maxDuration: Int = 4096,
         vocoder: ((MLXArray) -> MLXArray)? = nil,
@@ -203,12 +202,12 @@ public class F5TTS: Module {
 
         var t = MLXArray.linspace(Float32(0.0), Float32(1.0), count: steps)
 
-        if let coef = swaySamplingCoef {
+        if let coef = swayCoef {
             t = t + coef * (MLX.cos(MLXArray(.pi) / 2 * t) - 1 + t)
         }
 
         let trajectory = self.odeint(fun: fn, y0: y0Padded, t: t)
-        var sampled = trajectory[-1]
+        let sampled = trajectory[-1]
         var out = MLX.where(condMask, cond, sampled)
 
         if let vocoder = vocoder {
 
@@ -0,0 +1,111 @@
+import ArgumentParser
+import MLX
+import F5TTS
+import Foundation
+import Vocos
+
+/// Command-line tool that synthesizes speech for `text` with an F5-TTS model.
+///
+/// The model is conditioned on a reference recording plus its transcript; only the
+/// audio that follows the reference portion is written to `outputPath`.
+@main
+struct GenerateAudio: AsyncParsableCommand {
+    /// Transcript paired with the bundled reference clip, and the fallback used when
+    /// `--ref-audio-text` is omitted.
+    private static let defaultReferenceText = "Some call me nature, others call me mother nature."
+
+    /// Full-width pause punctuation that receives extra weight in the duration heuristic.
+    private static let pausePunctuation: Set<Character> = Set("。，、；：？！")
+
+    @Argument(help: "Text to generate speech from")
+    var text: String
+
+    @Option(name: .long, help: "Duration of the generated audio in seconds")
+    var duration: Double?
+
+    @Option(name: .long, help: "Path to the reference audio file")
+    var refAudioPath: String?
+
+    @Option(name: .long, help: "Text spoken in the reference audio")
+    var refAudioText: String?
+
+    @Option(name: .long, help: "Model name to use")
+    var model: String = "lucasnewman/f5-tts-mlx"
+
+    @Option(name: .long, help: "Output path for the generated audio")
+    var outputPath: String = "output.wav"
+
+    @Option(name: .long, help: "Strength of classifier free guidance")
+    var cfg: Float = 2.0
+
+    @Option(name: .long, help: "Coefficient for sway sampling")
+    var sway: Float = -1.0
+
+    @Option(name: .long, help: "Speed factor for the duration heuristic")
+    var speed: Float = 1.0
+
+    @Option(name: .long, help: "Seed for noise generation")
+    var seed: Int?
+
+    /// Rejects option values that would break the duration arithmetic in `run()`.
+    ///
+    /// - Throws: `ValidationError` when `speed` or `duration` is not a positive, finite number.
+    func validate() throws {
+        // `speed` is a divisor in the heuristic; zero or NaN would reach an `Int(...)`
+        // conversion that traps at runtime.
+        guard speed.isFinite, speed > 0 else {
+            throw ValidationError("--speed must be a positive, finite number.")
+        }
+        if let duration {
+            guard duration.isFinite, duration > 0 else {
+                throw ValidationError("--duration must be a positive, finite number of seconds.")
+            }
+        }
+    }
+
+    /// UTF-8 byte length of `text`, plus three extra units for every pause-punctuation
+    /// character it contains.
+    private static func weightedLength(of text: String) -> Int {
+        let pauseCount = text.filter { pausePunctuation.contains($0) }.count
+        return text.utf8.count + 3 * pauseCount
+    }
+
+    /// Loads the models and reference audio, picks a target length, runs sampling and
+    /// saves the generated samples.
+    ///
+    /// - Throws: `ValidationError` when no reference audio can be located, and rethrows
+    ///   errors from model loading and audio file I/O.
+    func run() async throws {
+        let sampleRate = 24_000
+        let hopLength = 256
+        let framesPerSec = Double(sampleRate) / Double(hopLength)
+        let targetRMS: Float = 0.1
+
+        let f5tts = try await F5TTS.fromPretrained(repoId: model)
+        let vocos = try await Vocos.fromPretrained(repoId: "lucasnewman/vocos-mel-24khz-mlx")
+
+        var audio: MLXArray
+        let referenceText: String
+
+        if let refAudioPath {
+            audio = try AudioUtilities.loadAudioFile(url: URL(filePath: refAudioPath))
+            // Falls back to the default transcript when none is supplied for a custom clip.
+            referenceText = refAudioText ?? Self.defaultReferenceText
+        } else if let bundledURL = Bundle.main.url(forResource: "test_en_1_ref_short", withExtension: "wav") {
+            // NOTE(review): Package.swift copies this file into the F5TTS target's resources,
+            // so it may not be reachable through `Bundle.main` of this executable — confirm.
+            audio = try AudioUtilities.loadAudioFile(url: bundledURL)
+            referenceText = Self.defaultReferenceText
+        } else {
+            // Report a usage error instead of crashing the process.
+            throw ValidationError("No reference audio file specified.")
+        }
+
+        // Boost quiet reference audio up to the target RMS. A silent clip (rms == 0) is
+        // left untouched so the division cannot produce NaN samples.
+        let rms = audio.square().mean().sqrt().item(Float.self)
+        if rms > 0, rms < targetRMS {
+            audio = audio * targetRMS / rms
+        }
+
+        // Assumes `loadAudioFile` returns a 1-D array of samples at `sampleRate` — TODO confirm.
+        let refSampleCount = audio.shape[0]
+
+        // Seconds of reference audio. Dividing by the sample rate (not the frame rate)
+        // keeps the units consistent when this is converted to frames below.
+        let refAudioDuration = Double(refSampleCount) / Double(sampleRate)
+
+        // Use a heuristic to determine the duration if not provided: scale the reference
+        // clip's frames-per-text-unit ratio by the length of the text to generate.
+        let generatedDuration: Double
+        if let duration {
+            generatedDuration = duration
+        } else {
+            let refFrameCount = refSampleCount / hopLength
+            // Clamp to 1 so an empty reference transcript cannot cause a division by zero.
+            let refTextLength = max(1, Self.weightedLength(of: referenceText))
+            let genTextLength = Self.weightedLength(of: text)
+
+            let framesPerTextUnit = Double(refFrameCount) / Double(refTextLength)
+            let generatedFrames = Int(framesPerTextUnit * (Double(genTextLength) / Double(speed)))
+            generatedDuration = Double(generatedFrames) / framesPerSec
+
+            print("Using duration of \(generatedDuration) seconds for generated speech.")
+        }
+
+        let processedText = referenceText + " " + text
+        let frameDuration = Int((refAudioDuration + generatedDuration) * framesPerSec)
+        print("Generating \(frameDuration) frames of audio...")
+
+        let startTime = Date()
+
+        let (outputAudio, _) = f5tts.sample(
+            cond: audio.expandedDimensions(axis: 0),
+            text: [processedText],
+            duration: frameDuration,
+            steps: 32,
+            cfgStrength: cfg,
+            swayCoef: sway,
+            seed: seed,
+            vocoder: vocos.decode
+        )
+
+        // Drop the leading samples that correspond to the reference clip.
+        let generatedAudio = outputAudio[refSampleCount...]
+
+        let elapsedTime = Date().timeIntervalSince(startTime)
+        print("Generated \(Double(generatedAudio.count) / Double(sampleRate)) seconds of audio in \(elapsedTime) seconds.")
+
+        try AudioUtilities.saveAudioFile(url: URL(filePath: outputPath), samples: generatedAudio)
+    }
+}