|
| 1 | +import ArgumentParser |
| 2 | +import MLX |
| 3 | +import F5TTS |
| 4 | +import Foundation |
| 5 | +import Vocos |
| 6 | + |
/// Command-line tool that generates speech for `text` with an F5-TTS model,
/// conditioned on a reference audio clip and its transcript, and writes the
/// newly generated portion of the audio to `outputPath`.
@main
struct GenerateAudio: AsyncParsableCommand {
    @Argument(help: "Text to generate speech from")
    var text: String

    @Option(name: .long, help: "Duration of the generated audio in seconds")
    var duration: Double?

    @Option(name: .long, help: "Path to the reference audio file")
    var refAudioPath: String?

    @Option(name: .long, help: "Text spoken in the reference audio")
    var refAudioText: String?

    @Option(name: .long, help: "Model name to use")
    var model: String = "lucasnewman/f5-tts-mlx"

    @Option(name: .long, help: "Output path for the generated audio")
    var outputPath: String = "output.wav"

    @Option(name: .long, help: "Strength of classifier free guidance")
    var cfg: Float = 2.0

    @Option(name: .long, help: "Coefficient for sway sampling")
    var sway: Float = -1.0

    @Option(name: .long, help: "Speed factor for the duration heuristic")
    var speed: Float = 1.0

    @Option(name: .long, help: "Seed for noise generation")
    var seed: Int?

    /// Rejects option values that would otherwise trap or produce a
    /// meaningless frame count later in `run()`.
    ///
    /// - Throws: `ValidationError` when `speed` or `duration` is not a
    ///   finite, strictly positive number.
    func validate() throws {
        // `speed` is a divisor in the duration heuristic; zero would yield an
        // infinite value that cannot be converted to `Int`.
        guard speed.isFinite, speed > 0 else {
            throw ValidationError("--speed must be a finite number greater than zero.")
        }
        if let duration, !(duration.isFinite && duration > 0) {
            throw ValidationError("--duration must be a finite number greater than zero.")
        }
    }

    /// Loads the models and the reference audio, determines how many frames to
    /// generate, runs sampling, and saves the generated audio.
    ///
    /// - Throws: `ValidationError` when no reference audio is available, plus
    ///   any error thrown while loading the models, reading the reference
    ///   audio, or writing the output file.
    func run() async throws {
        let sampleRate = 24_000
        let hopLength = 256
        let framesPerSec = Double(sampleRate) / Double(hopLength)
        let targetRMS: Float = 0.1
        let defaultReferenceText = "Some call me nature, others call me mother nature."

        let f5tts = try await F5TTS.fromPretrained(repoId: model)
        let vocos = try await Vocos.fromPretrained(repoId: "lucasnewman/vocos-mel-24khz-mlx")

        var audio: MLXArray
        let referenceText: String

        if let refPath = refAudioPath {
            audio = try AudioUtilities.loadAudioFile(url: URL(filePath: refPath))
            // Falls back to the bundled sample's transcript when none is given.
            referenceText = refAudioText ?? defaultReferenceText
        } else if let refURL = Bundle.main.url(forResource: "test_en_1_ref_short", withExtension: "wav") {
            audio = try AudioUtilities.loadAudioFile(url: refURL)
            referenceText = defaultReferenceText
        } else {
            // A missing reference clip is a usage problem, not a programmer
            // error, so report it through ArgumentParser instead of crashing.
            throw ValidationError("No reference audio file specified.")
        }

        // Boost quiet reference audio up to the target RMS. Silent audio
        // (rms == 0) is left untouched to avoid dividing by zero.
        let rms = audio.square().mean().sqrt().item(Float.self)
        if rms > 0, rms < targetRMS {
            audio = audio * targetRMS / rms
        }

        // `audio.shape[0]` is used as the sample count throughout, so seconds
        // are obtained by dividing by the sample rate (not the frame rate).
        let refSampleCount = audio.shape[0]
        let refAudioDuration = Double(refSampleCount) / Double(sampleRate)

        // use a heuristic to determine the duration if not provided
        let generatedDuration: Double
        if let duration {
            generatedDuration = duration
        } else {
            let refAudioLength = refSampleCount / hopLength
            let pausePunctuation = "。,、;:?!"
            // NOTE(review): this adds the same constant to both lengths; it may
            // have been intended to count punctuation occurrences in each text
            // instead — confirm against the reference heuristic.
            let punctuationWeight = 3 * pausePunctuation.utf8.count
            let refTextLength = referenceText.utf8.count + punctuationWeight
            let genTextLength = text.utf8.count + punctuationWeight

            // Scale the reference's frames-per-byte rate by the new text length.
            let framesPerTextUnit = Double(refAudioLength) / Double(refTextLength)
            let generatedFrames = Int(framesPerTextUnit * (Double(genTextLength) / Double(speed)))
            let estimatedDuration = Double(generatedFrames) / framesPerSec

            print("Using duration of \(estimatedDuration) seconds for generated speech.")
            generatedDuration = estimatedDuration
        }

        let processedText = referenceText + " " + text
        let frameDuration = Int((refAudioDuration + generatedDuration) * framesPerSec)
        print("Generating \(frameDuration) frames of audio...")

        let startTime = Date()

        let (outputAudio, _) = f5tts.sample(
            cond: audio.expandedDimensions(axis: 0),
            text: [processedText],
            duration: frameDuration,
            steps: 32,
            cfgStrength: cfg,
            swayCoef: sway,
            seed: seed,
            vocoder: vocos.decode
        )

        // Drop the leading samples that correspond to the reference clip.
        let generatedAudio = outputAudio[refSampleCount...]

        let elapsedTime = Date().timeIntervalSince(startTime)
        print("Generated \(Double(generatedAudio.count) / Double(sampleRate)) seconds of audio in \(elapsedTime) seconds.")

        try AudioUtilities.saveAudioFile(url: URL(filePath: outputPath), samples: generatedAudio)
    }
}
0 commit comments