From 71f999fe66c8ec7e3a8289ef281331ee8103478c Mon Sep 17 00:00:00 2001 From: Pavlos Date: Sun, 9 Mar 2025 17:40:20 +0000 Subject: [PATCH 1/6] feat: Add cloud-based Whisper support for ARM64 architectures --- README.md | 21 +++++ docs/source/cloud_whisper.md | 92 +++++++++++++++++++++ examples/cloud_whisper_demo.py | 87 ++++++++++++++++++++ manim_voiceover/__init__.py | 11 +++ manim_voiceover/cli/__init__.py | 1 + manim_voiceover/cli/config.py | 36 ++++++++ manim_voiceover/cli/main.py | 67 +++++++++++++++ manim_voiceover/config.py | 6 ++ manim_voiceover/services/base.py | 127 +++++++++++++++++++++++++---- manim_voiceover/voiceover_scene.py | 4 + pyproject.toml | 5 +- 11 files changed, 439 insertions(+), 18 deletions(-) create mode 100644 docs/source/cloud_whisper.md create mode 100644 examples/cloud_whisper_demo.py create mode 100644 manim_voiceover/cli/__init__.py create mode 100644 manim_voiceover/cli/config.py create mode 100644 manim_voiceover/cli/main.py create mode 100644 manim_voiceover/config.py diff --git a/README.md b/README.md index 5d7e3ec..54e5f7a 100644 --- a/README.md +++ b/README.md @@ -15,6 +15,7 @@ Manim Voiceover is a [Manim](https://manim.community) plugin for all things voic - Record voiceovers with your microphone during rendering with a simple command line interface. - Develop animations with auto-generated AI voices from various free and proprietary services. - Per-word timing of animations, i.e. trigger animations at specific words in the voiceover, even for the recordings. This works thanks to [OpenAI Whisper](https://github.com/openai/whisper). +- **NEW**: Supports both local and cloud-based Whisper for ARM64 architectures (like Apple Silicon) where the local model may not work. Here is a demo: @@ -41,6 +42,26 @@ Currently supported TTS services (aside from the CLI that allows you to records [Check out the example gallery to get inspired.](https://voiceover.manim.community/en/latest/examples.html) +## Cloud Whisper Support + +For ARM64 architectures (like Apple Silicon Macs) or systems where installing the local Whisper model is problematic, you can now use OpenAI's cloud-based Whisper API for speech-to-text alignment: + +```bash +# Run with cloud-based Whisper +manim -pql --use-cloud-whisper examples/cloud_whisper_demo.py CloudWhisperDemo +``` + +Or enable it programmatically: + +```python +service = GTTSService( + transcription_model="base", + use_cloud_whisper=True # This enables cloud-based Whisper +) +``` + +[Learn more about cloud-based Whisper in the documentation.](https://voiceover.manim.community/en/latest/cloud_whisper.html) + ## Translate Manim Voiceover can use machine translation services like [DeepL](https://www.deepl.com/) to translate voiceovers into other languages. [Check out the docs for more details.](https://voiceover.manim.community/en/latest/translate.html) \ No newline at end of file diff --git a/docs/source/cloud_whisper.md b/docs/source/cloud_whisper.md new file mode 100644 index 0000000..78cf5d1 --- /dev/null +++ b/docs/source/cloud_whisper.md @@ -0,0 +1,92 @@ +# Cloud-based Whisper Transcription + +## Overview + +Manim-voiceover now supports cloud-based transcription using OpenAI's Whisper API. 
This is particularly useful for: + +- ARM64 architectures (like Apple Silicon Macs) where installing the local Whisper model might be problematic +- Systems where you don't want to install the large Whisper model +- When you need higher accuracy transcription than the local model provides + +## Setup + +To use cloud-based Whisper, you'll need: + +1. An OpenAI API key +2. The OpenAI Python package + +Install the necessary dependencies: + +```bash +pip install "manim-voiceover[openai]" +``` + +## Usage + +### Command Line Option + +You can enable cloud-based Whisper for any Manim render by adding the `--use-cloud-whisper` flag: + +```bash +manim -pql --use-cloud-whisper example.py MyScene +``` + +### Programmatic Usage + +You can also enable cloud-based Whisper programmatically when initializing any speech service: + +```python +from manim_voiceover.services.azure import AzureService +from manim_voiceover.voiceover_scene import VoiceoverScene + +class MyScene(VoiceoverScene): + def construct(self): + # Use cloud-based Whisper for transcription + service = AzureService( + voice="en-US-GuyNeural", + transcription_model="base", # Still specify a model name + use_cloud_whisper=True # This enables cloud-based Whisper + ) + self.set_speech_service(service) + + # Rest of your scene... +``` + +## How It Works + +When cloud-based Whisper is enabled: + +1. The speech service will use OpenAI's API to transcribe your audio files +2. Word-level alignment will still work for bookmarks and animations +3. Your audio files will be sent to OpenAI's servers for transcription +4. An OpenAI API key is required and you'll be prompted to enter one if not found + +## Pricing + +Using cloud-based Whisper incurs costs based on OpenAI's pricing model: + +- Audio transcription is billed per minute of audio +- Check [OpenAI's pricing page](https://openai.com/pricing) for the most up-to-date information + +## Switching Between Local and Cloud + +You can use both local and cloud-based Whisper in the same project: + +- Use the `--use-cloud-whisper` flag when you need cloud-based transcription +- Omit the flag to use the local Whisper model + +## Troubleshooting + +### API Key Issues + +If you encounter errors related to the API key: + +1. Check that you have set the `OPENAI_API_KEY` environment variable +2. Alternatively, create a `.env` file in your project directory with `OPENAI_API_KEY=your_key_here` + +### Response Format Issues + +The cloud API might return a different format than expected. If you encounter errors: + +1. Check that you're using the latest version of manim-voiceover +2. 
Try using a different transcription model \ No newline at end of file diff --git a/examples/cloud_whisper_demo.py b/examples/cloud_whisper_demo.py new file mode 100644 index 0000000..7192837 --- /dev/null +++ b/examples/cloud_whisper_demo.py @@ -0,0 +1,87 @@ +from manim import * +from manim_voiceover.voiceover_scene import VoiceoverScene +from manim_voiceover.services.gtts import GTTSService +from manim_voiceover.services.recorder import RecorderService + +class CloudWhisperDemo(VoiceoverScene): + def construct(self): + # Initialize speech service with cloud whisper option + # Note: You can also run this with --use-cloud-whisper flag + # instead of setting use_cloud_whisper=True here + service = GTTSService( + transcription_model="base", # Model name is still required + use_cloud_whisper=True # This enables cloud-based Whisper + ) + self.set_speech_service(service) + + # Create a title + title = Text("Cloud Whisper Demo", font_size=48) + self.play(Write(title)) + self.wait() + + # Demonstrate voiceover with bookmarks + with self.voiceover( + """This demonstration uses cloud-based Whisper + from OpenAI for speech-to-text alignment. + """ + ) as tracker: + # Wait until the first bookmark + self.wait_until_bookmark("cloud_point") + + # Create and animate the cloud text + cloud_text = Text("☁️ Cloud-based", color=BLUE, font_size=36) + cloud_text.next_to(title, DOWN, buff=1) + self.play(FadeIn(cloud_text)) + + # Wait until the second bookmark + self.wait_until_bookmark("alignment_point") + + # Create and animate the alignment text + alignment_text = Text("Word-level Alignment", color=GREEN, font_size=36) + alignment_text.next_to(cloud_text, DOWN, buff=0.5) + self.play(FadeIn(alignment_text)) + + # Continue with demonstration + self.wait(1) + + # Show ARM64 compatibility + arm_title = Text("Works on ARM64 Architectures!", color=YELLOW, font_size=36) + arm_title.next_to(alignment_text, DOWN, buff=1) + + with self.voiceover( + "This feature is especially useful for ARM64 architectures like Apple Silicon." + ): + self.play(FadeIn(arm_title)) + + # Show how it's used + self.wait(1) + + code_text = """ +# Run with CLI flag: +manim -pql --use-cloud-whisper example.py MyScene + +# Or enable programmatically: +service = GTTSService( + transcription_model="base", + use_cloud_whisper=True +) + """ + code = Code(code=code_text, language="python", font_size=24) + code.next_to(arm_title, DOWN, buff=1) + + with self.voiceover( + "You can enable cloud-based Whisper using either a command-line flag or programmatically in your code." + ): + self.play(Create(code)) + + self.wait(2) + + with self.voiceover( + "This means you can use word-level alignment on any system without installing large local models." 
+ ): + self.play(FadeOut(code, title, cloud_text, alignment_text, arm_title)) + + final_text = Text("No Local Models Required!", font_size=48, color=BLUE) + self.play(Write(final_text)) + + self.wait(2) \ No newline at end of file diff --git a/manim_voiceover/__init__.py b/manim_voiceover/__init__.py index d2bd2f1..4ece6e2 100644 --- a/manim_voiceover/__init__.py +++ b/manim_voiceover/__init__.py @@ -1,6 +1,17 @@ from manim_voiceover.tracker import VoiceoverTracker from manim_voiceover.voiceover_scene import VoiceoverScene +from manim import config import pkg_resources __version__: str = pkg_resources.get_distribution(__name__).version + +# Add our custom config attribute +if not hasattr(config, 'use_cloud_whisper'): + config.use_cloud_whisper = False + +# Try to load our CLI extension +try: + from manim_voiceover.cli import main +except ImportError: + pass # CLI module couldn't be loaded diff --git a/manim_voiceover/cli/__init__.py b/manim_voiceover/cli/__init__.py new file mode 100644 index 0000000..67b0a65 --- /dev/null +++ b/manim_voiceover/cli/__init__.py @@ -0,0 +1 @@ +# This file initializes the CLI module \ No newline at end of file diff --git a/manim_voiceover/cli/config.py b/manim_voiceover/cli/config.py new file mode 100644 index 0000000..dcf56d3 --- /dev/null +++ b/manim_voiceover/cli/config.py @@ -0,0 +1,36 @@ +""" +Configuration support for manim-voiceover CLI +""" + +import os +from manim.utils.file_ops import guarantee_existence +from manim._config import config_file, library_wide_cfg_file, ManimConfig +import manim.config as manim_config + +# The Manim config system doesn't provide an easy way to extend the CLI from plugins +# So instead, we'll monkey patch the ManimConfig class to add our custom flag +original_digest_args = manim_config.ManimConfig.digest_args + +def patched_digest_args(self, args, namespace=''): + # Call original method + original_digest_args(self, args, namespace) + + # Handle our custom CLI flags + if hasattr(args, 'use_cloud_whisper'): + self.use_cloud_whisper = args.use_cloud_whisper + +# Apply the monkey patch +manim_config.ManimConfig.digest_args = patched_digest_args + +# Make sure the config object has our flag +if not hasattr(manim_config.config, 'use_cloud_whisper'): + manim_config.config.use_cloud_whisper = False + +def add_voiceover_args(parser): + """Add manim-voiceover specific arguments to the parser.""" + whisper_group = parser.add_argument_group("Manim Voiceover") + whisper_group.add_argument( + "--use-cloud-whisper", + action="store_true", + help="Use OpenAI's cloud Whisper API instead of local model for transcription", + ) \ No newline at end of file diff --git a/manim_voiceover/cli/main.py b/manim_voiceover/cli/main.py new file mode 100644 index 0000000..5106a83 --- /dev/null +++ b/manim_voiceover/cli/main.py @@ -0,0 +1,67 @@ +""" +CLI entrypoint for manim-voiceover +""" + +import inspect +import os +import sys +from pathlib import Path + +from manim.cli.render.commands import main_command, render + +from manim_voiceover.cli.config import add_voiceover_args + +# Hook into the Manim CLI by monkey patching the main command +original_render_command = render.command + +def patched_render_command(function): + # Call the original decorated function + cmd = original_render_command(function) + + # Get the 'params' attribute from the decorated function + params = getattr(cmd, 'params', []) + + # Add our custom arguments to the command line + def param_callback(ctx, param, value): + # Store the use_cloud_whisper value in the context object + 
ctx.ensure_object(dict) + ctx.obj['use_cloud_whisper'] = value + return value + + # Add our parameter to the command + from click import option + cmd = option('--use-cloud-whisper', + is_flag=True, + help='Use OpenAI cloud API for Whisper instead of local model', + callback=param_callback)(cmd) + + return cmd + +# Apply our monkey patch +render.command = patched_render_command + +# Also hook into the argparse version of the CLI +original_main = main_command + +def patched_main(): + """Entry point for renderer.""" + # Find the render subcommand in the argument parser + import argparse + + # Create a dummy parser just to intercept the args + dummy_parser = argparse.ArgumentParser(add_help=False) + dummy_parser.add_argument("--use-cloud-whisper", action="store_true") + + # Parse known args to get our flags + args, unknown = dummy_parser.parse_known_args() + + # Set the global config value + from manim.config import config + if hasattr(args, 'use_cloud_whisper') and args.use_cloud_whisper: + config.use_cloud_whisper = True + + # Call the original main command + return original_main() + +# Apply our second monkey patch +main_command = patched_main \ No newline at end of file diff --git a/manim_voiceover/config.py b/manim_voiceover/config.py new file mode 100644 index 0000000..1227492 --- /dev/null +++ b/manim_voiceover/config.py @@ -0,0 +1,6 @@ +from manim import config +from pathlib import Path +import tempfile + +# Whether to use cloud-based Whisper API +config.use_cloud_whisper = False \ No newline at end of file diff --git a/manim_voiceover/services/base.py b/manim_voiceover/services/base.py index fc6c898..e212ac9 100644 --- a/manim_voiceover/services/base.py +++ b/manim_voiceover/services/base.py @@ -53,6 +53,7 @@ def __init__( cache_dir: t.Optional[str] = None, transcription_model: t.Optional[str] = None, transcription_kwargs: dict = {}, + use_cloud_whisper: bool = False, **kwargs, ): """ @@ -66,6 +67,9 @@ def __init__( to use for transcription. Defaults to None. transcription_kwargs (dict, optional): Keyword arguments to pass to the transcribe() function. Defaults to {}. + use_cloud_whisper (bool, optional): Whether to use OpenAI's cloud-based + Whisper API for transcription instead of the local model. Useful for + ARM64 architectures where local Whisper may not work. Defaults to False. 
""" self.global_speed = global_speed @@ -79,6 +83,7 @@ def __init__( self.transcription_model = None self._whisper_model = None + self.use_cloud_whisper = use_cloud_whisper self.set_transcription(model=transcription_model, kwargs=transcription_kwargs) self.additional_kwargs = kwargs @@ -92,9 +97,85 @@ def _wrap_generate_from_text(self, text: str, path: str = None, **kwargs) -> dic # Check whether word boundaries exist and if not run stt if "word_boundaries" not in dict_ and self._whisper_model is not None: - transcription_result = self._whisper_model.transcribe( - str(Path(self.cache_dir) / original_audio), **self.transcription_kwargs - ) + if self.use_cloud_whisper: + # Use OpenAI's cloud-based Whisper API + try: + import openai + from dotenv import find_dotenv, load_dotenv + load_dotenv(find_dotenv(usecwd=True)) + + if os.getenv("OPENAI_API_KEY") is None: + from manim_voiceover.services.openai import create_dotenv_openai + create_dotenv_openai() + + audio_file_path = str(Path(self.cache_dir) / original_audio) + with open(audio_file_path, "rb") as audio_file: + transcription_result = openai.audio.transcriptions.create( + model="whisper-1", + file=audio_file, + response_format="verbose_json", + **self.transcription_kwargs + ) + + # Convert OpenAI API response to the format expected by manim-voiceover + segments = [] + for segment in transcription_result.segments: + segment_dict = { + "id": segment.id, + "seek": segment.seek, + "start": segment.start, + "end": segment.end, + "text": segment.text, + "tokens": segment.tokens, + "temperature": segment.temperature, + "avg_logprob": segment.avg_logprob, + "compression_ratio": segment.compression_ratio, + "no_speech_prob": segment.no_speech_prob, + "words": [] + } + + # Process word-level timestamps if available + if hasattr(segment, "words"): + for word in segment.words: + segment_dict["words"].append({ + "word": word.word, + "start": word.start, + "end": word.end, + "probability": word.probability + }) + + segments.append(segment_dict) + + # Create a result object similar to what local Whisper would return + class CloudWhisperResult: + def __init__(self, text, segments): + self.text = text + self.segments = segments + + def segments_to_dicts(self): + return self.segments + + transcription_result = CloudWhisperResult( + transcription_result.text, + segments + ) + + logger.info("Cloud Transcription: " + transcription_result.text) + + except ImportError: + logger.error( + 'Missing packages. Run `pip install "manim-voiceover[openai]"` to use cloud-based Whisper.' + ) + return dict_ + except Exception as e: + logger.error(f"Error using cloud-based Whisper: {str(e)}") + return dict_ + else: + # Use local Whisper model + transcription_result = self._whisper_model.transcribe( + str(Path(self.cache_dir) / original_audio), **self.transcription_kwargs + ) + logger.info("Transcription: " + transcription_result.text) word_boundaries = timestamps_to_word_boundaries( transcription_result.segments_to_dicts() @@ -138,23 +219,37 @@ def set_transcription(self, model: str = None, kwargs: dict = {}): """ if model != self.transcription_model: if model is not None: - try: - import whisper as __tmp - import stable_whisper as whisper - except ImportError: - logger.error( - 'Missing packages. Run `pip install "manim-voiceover[transcribe]"` to be able to transcribe voiceovers.' 
- ) + if self.use_cloud_whisper: + # For cloud-based Whisper, we don't need to load a local model + # but we still need the OpenAI package + try: + import openai + self._whisper_model = True # Just a placeholder to indicate we have a model + except ImportError: + logger.error( + 'Missing packages. Run `pip install "manim-voiceover[openai]"` to use cloud-based Whisper.' + ) + self._whisper_model = None + else: + # Load local Whisper model + try: + import whisper as __tmp + import stable_whisper as whisper + except ImportError: + logger.error( + 'Missing packages. Run `pip install "manim-voiceover[transcribe]"` to be able to transcribe voiceovers.' + ) - prompt_ask_missing_extras( - ["whisper", "stable_whisper"], - "transcribe", - "SpeechService.set_transcription()", - ) - self._whisper_model = whisper.load_model(model) + prompt_ask_missing_extras( + ["whisper", "stable_whisper"], + "transcribe", + "SpeechService.set_transcription()", + ) + self._whisper_model = whisper.load_model(model) else: self._whisper_model = None + self.transcription_model = model self.transcription_kwargs = kwargs def get_audio_basename(self, data: dict) -> str: diff --git a/manim_voiceover/voiceover_scene.py b/manim_voiceover/voiceover_scene.py index 8030aa6..a42b0d1 100644 --- a/manim_voiceover/voiceover_scene.py +++ b/manim_voiceover/voiceover_scene.py @@ -35,6 +35,10 @@ def set_speech_service( create_subcaption (bool, optional): Whether to create subcaptions for the scene. Defaults to True. If `config.save_last_frame` is True, the argument is ignored and no subcaptions will be created. """ + # Set use_cloud_whisper from the config if it has the attribute + if hasattr(config, "use_cloud_whisper"): + speech_service.use_cloud_whisper = config.use_cloud_whisper + self.speech_service = speech_service self.current_tracker = None if config.save_last_frame: diff --git a/pyproject.toml b/pyproject.toml index 911f17c..fe21952 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "manim-voiceover" -version = "0.3.7" +version = "0.4.0" description = "Manim plugin for all things voiceover" authors = ["The Manim Community Developers "] license = "MIT" @@ -66,7 +66,7 @@ elevenlabs = {version = "^0.2.27", optional = true} [tool.poetry.extras] azure = ["azure-cognitiveservices-speech"] gtts = ["gTTS"] -openai = ["openai"] +openai = ["openai", "python-dotenv"] pyttsx3 = ["pyttsx3"] # coqui = ["torch", "TTS"] coqui = [] # Removed TTS as deps for now @@ -83,6 +83,7 @@ all = [ "PyAudio", "pynput", "openai", + "python-dotenv", "deepl", "openai-whisper", "stable-ts", From 3b51312fb1c47d40b58d2819ecd73db0d68b8b50 Mon Sep 17 00:00:00 2001 From: Pavlos Date: Sun, 9 Mar 2025 18:07:59 +0000 Subject: [PATCH 2/6] cloud whisper works --- README.md | 18 +- demo_openai_cloud_whisper.py | 110 +++++++++ direct_openai_test.py | 107 +++++++++ docs/source/cloud_whisper.md | 14 +- examples/cloud_whisper_demo.py | 48 ++-- manim_cloud_whisper.py | 30 +++ manim_voiceover/__init__.py | 6 - manim_voiceover/cli/__init__.py | 4 +- manim_voiceover/cli/main.py | 36 +-- manim_voiceover/services/base.py | 137 ++++++----- manim_voiceover/voiceover_scene.py | 8 +- speech-to-text.md | 369 +++++++++++++++++++++++++++++ standalone_openai_debug.py | 68 ++++++ standalone_openai_demo.py | 108 +++++++++ temp_direct_test/cache.json | 84 +++++++ temp_openai_demo/cache.json | 114 +++++++++ temp_service_test/cache.json | 190 +++++++++++++++ temp_test/cache.json | 13 + test_cli.py | 33 +++ test_cloud_whisper.py | 29 +++ 
test_cloud_whisper_simple.py | 56 +++++ test_scene.py | 24 ++ test_speech_service.py | 56 +++++ 23 files changed, 1542 insertions(+), 120 deletions(-) create mode 100644 demo_openai_cloud_whisper.py create mode 100644 direct_openai_test.py create mode 100755 manim_cloud_whisper.py create mode 100644 speech-to-text.md create mode 100644 standalone_openai_debug.py create mode 100644 standalone_openai_demo.py create mode 100644 temp_direct_test/cache.json create mode 100644 temp_openai_demo/cache.json create mode 100644 temp_service_test/cache.json create mode 100644 temp_test/cache.json create mode 100644 test_cli.py create mode 100644 test_cloud_whisper.py create mode 100644 test_cloud_whisper_simple.py create mode 100644 test_scene.py create mode 100644 test_speech_service.py diff --git a/README.md b/README.md index 54e5f7a..fb08980 100644 --- a/README.md +++ b/README.md @@ -47,19 +47,31 @@ Currently supported TTS services (aside from the CLI that allows you to records For ARM64 architectures (like Apple Silicon Macs) or systems where installing the local Whisper model is problematic, you can now use OpenAI's cloud-based Whisper API for speech-to-text alignment: ```bash -# Run with cloud-based Whisper -manim -pql --use-cloud-whisper examples/cloud_whisper_demo.py CloudWhisperDemo +# Run with the provided script +python manim_cloud_whisper.py -pql examples/cloud_whisper_demo.py CloudWhisperDemo ``` Or enable it programmatically: ```python -service = GTTSService( +service = OpenAIService( + voice="alloy", + model="tts-1", transcription_model="base", use_cloud_whisper=True # This enables cloud-based Whisper ) ``` +You can also set an environment variable to enable cloud-based Whisper: + +```bash +# Set the environment variable +export MANIM_VOICEOVER_USE_CLOUD_WHISPER=1 + +# Run Manim normally +manim -pql examples/cloud_whisper_demo.py CloudWhisperDemo +``` + [Learn more about cloud-based Whisper in the documentation.](https://voiceover.manim.community/en/latest/cloud_whisper.html) ## Translate diff --git a/demo_openai_cloud_whisper.py b/demo_openai_cloud_whisper.py new file mode 100644 index 0000000..cac1abe --- /dev/null +++ b/demo_openai_cloud_whisper.py @@ -0,0 +1,110 @@ +from manim import * +from manim_voiceover.voiceover_scene import VoiceoverScene +from manim_voiceover.services.openai import OpenAIService + +class OpenAICloudWhisperDemo(VoiceoverScene): + def construct(self): + # Print the cloud whisper setting + print(f"Cloud Whisper enabled: {config.use_cloud_whisper}") + + # Initialize OpenAI speech service with cloud whisper + service = OpenAIService( + voice="alloy", # Available voices: alloy, echo, fable, onyx, nova, shimmer + model="tts-1", # tts-1 or tts-1-hd + transcription_model="base", + use_cloud_whisper=True # Use cloud-based Whisper + ) + self.set_speech_service(service) + + # Create a title + title = Text("OpenAI TTS + Cloud Whisper Demo", font_size=48) + self.play(Write(title)) + self.wait(1) + + # Move title to top + self.play(title.animate.to_edge(UP)) + + # Create a subtitle + subtitle = Text("Word-level alignment on ARM64 architectures", + font_size=36, + color=BLUE) + subtitle.next_to(title, DOWN) + self.play(FadeIn(subtitle)) + + # Demonstrate voiceover with bookmarks + with self.voiceover( + """This demonstration uses OpenAI's text-to-speech service + with cloud-based Whisper for + word-level alignment.""" + ) as tracker: + # Wait until the first bookmark + self.wait_until_bookmark("cloud_point") + + # Create and animate the cloud text + cloud_text = Text("☁️ 
Cloud-based Whisper", color=BLUE, font_size=36) + cloud_text.next_to(subtitle, DOWN, buff=1) + self.play(FadeIn(cloud_text)) + + # Wait until the second bookmark + self.wait_until_bookmark("alignment_point") + + # Create and animate the alignment text + alignment_text = Text("Perfect Word Timing", color=GREEN, font_size=36) + alignment_text.next_to(cloud_text, DOWN, buff=0.5) + self.play(FadeIn(alignment_text)) + + # Continue with demonstration + self.wait(1) + + # Show ARM64 compatibility + arm_title = Text("Works on Apple Silicon!", color=RED, font_size=36) + arm_title.next_to(alignment_text, DOWN, buff=1) + + with self.voiceover( + "This feature is especially useful for ARM64 architectures like your M4 Pro." + ): + self.play(FadeIn(arm_title)) + + # Final animation + self.wait(1) + + with self.voiceover( + "No local Whisper model required. Everything happens in the cloud!" + ): + # Create a final animation + final_group = VGroup(title, subtitle, cloud_text, alignment_text, arm_title) + self.play( + final_group.animate.scale(0.8).to_edge(UP), + ) + + # Create a cloud icon + cloud = Text("☁️", font_size=120) + self.play(FadeIn(cloud)) + + # Add some particles around the cloud + particles = VGroup(*[ + Dot(radius=0.05, color=BLUE).move_to( + cloud.get_center() + np.array([ + np.random.uniform(-3, 3), + np.random.uniform(-2, 2), + 0 + ]) + ) + for _ in range(20) + ]) + self.play(FadeIn(particles)) + + # Animate the particles + self.play( + *[ + p.animate.shift(np.array([ + np.random.uniform(-1, 1), + np.random.uniform(-1, 1), + 0 + ])) + for p in particles + ], + run_time=2 + ) + + self.wait(2) \ No newline at end of file diff --git a/direct_openai_test.py b/direct_openai_test.py new file mode 100644 index 0000000..1f5e7ee --- /dev/null +++ b/direct_openai_test.py @@ -0,0 +1,107 @@ +import os +import json +from pathlib import Path +from dotenv import load_dotenv +import openai + +# Load environment variables from .env file +load_dotenv() + +# Create a temporary directory for audio files +temp_dir = Path("./temp_direct_test") +temp_dir.mkdir(exist_ok=True) + +# Constants for audio offset resolution (same as in manim-voiceover) +AUDIO_OFFSET_RESOLUTION = 1000 # 1000 = milliseconds + +print("=== Direct OpenAI API Test ===") + +# First, generate speech using OpenAI TTS +print("\nGenerating speech from text...") +text = "This is a test of the cloud-based Whisper feature." 
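+# Note: the module-level openai client used below picks up OPENAI_API_KEY from the
+# environment, which load_dotenv() above populates from a local .env file if present.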
+ +# Generate speech using OpenAI TTS +response = openai.audio.speech.create( + model="tts-1", + voice="alloy", + input=text +) + +audio_path = temp_dir / "direct_test.mp3" +response.stream_to_file(str(audio_path)) + +print(f"Speech generated and saved to {audio_path}") + +# Now, transcribe the audio using OpenAI Whisper API +print("\nTranscribing audio with word-level timestamps...") +with open(audio_path, "rb") as audio_file: + transcription = openai.audio.transcriptions.create( + model="whisper-1", + file=audio_file, + response_format="verbose_json", + timestamp_granularities=["word"] + ) + +# Print the raw response structure +print("\nRaw API Response Structure:") +print(f"Response type: {type(transcription)}") +print(f"Response attributes: {dir(transcription)}") +print(f"Has 'words' attribute: {hasattr(transcription, 'words')}") + +if hasattr(transcription, 'words'): + print(f"Words type: {type(transcription.words)}") + print(f"Words count: {len(transcription.words)}") + + # Try to access the first word + if len(transcription.words) > 0: + first_word = transcription.words[0] + print(f"First word type: {type(first_word)}") + print(f"First word attributes: {dir(first_word)}") + print(f"First word: {first_word.word if hasattr(first_word, 'word') else 'No word attribute'}") + print(f"First word start: {first_word.start if hasattr(first_word, 'start') else 'No start attribute'}") + +# Convert to word boundaries format used by manim-voiceover +print("\nConverting to word boundaries format...") +word_boundaries = [] +current_text_offset = 0 + +if hasattr(transcription, 'words'): + for word_obj in transcription.words: + try: + word = word_obj.word + start_time = word_obj.start + + # Create a word boundary entry + word_boundary = { + "audio_offset": int(start_time * AUDIO_OFFSET_RESOLUTION), + "text_offset": current_text_offset, + "word_length": len(word), + "text": word, + "boundary_type": "Word", + } + + word_boundaries.append(word_boundary) + current_text_offset += len(word) + 1 # +1 for space + + print(f"Added word boundary: {word} at {start_time}s") + except Exception as e: + print(f"Error processing word: {e}") + +print(f"\nCreated {len(word_boundaries)} word boundaries") + +# Create a cache file that manim-voiceover can use +cache_data = { + "input_text": text, + "input_data": {"input_text": text, "service": "openai"}, + "original_audio": audio_path.name, + "word_boundaries": word_boundaries, + "transcribed_text": transcription.text, + "final_audio": audio_path.name +} + +cache_file = temp_dir / "cache.json" +with open(cache_file, "w") as f: + json.dump([cache_data], f, indent=2) + +print(f"\nCreated cache file at {cache_file}") +print("\nTest completed!") \ No newline at end of file diff --git a/docs/source/cloud_whisper.md b/docs/source/cloud_whisper.md index 78cf5d1..ffe32de 100644 --- a/docs/source/cloud_whisper.md +++ b/docs/source/cloud_whisper.md @@ -25,10 +25,20 @@ pip install "manim-voiceover[openai]" ### Command Line Option -You can enable cloud-based Whisper for any Manim render by adding the `--use-cloud-whisper` flag: +You can enable cloud-based Whisper for any Manim render by using the provided script: ```bash -manim -pql --use-cloud-whisper example.py MyScene +python manim_cloud_whisper.py -pql examples/cloud_whisper_demo.py CloudWhisperDemo +``` + +Or by setting an environment variable: + +```bash +# Set the environment variable +export MANIM_VOICEOVER_USE_CLOUD_WHISPER=1 + +# Run Manim normally +manim -pql examples/cloud_whisper_demo.py CloudWhisperDemo ``` ### 
Programmatic Usage diff --git a/examples/cloud_whisper_demo.py b/examples/cloud_whisper_demo.py index 7192837..99db975 100644 --- a/examples/cloud_whisper_demo.py +++ b/examples/cloud_whisper_demo.py @@ -1,14 +1,16 @@ from manim import * from manim_voiceover.voiceover_scene import VoiceoverScene from manim_voiceover.services.gtts import GTTSService -from manim_voiceover.services.recorder import RecorderService +from manim_voiceover.services.openai import OpenAIService class CloudWhisperDemo(VoiceoverScene): def construct(self): # Initialize speech service with cloud whisper option # Note: You can also run this with --use-cloud-whisper flag # instead of setting use_cloud_whisper=True here - service = GTTSService( + service = OpenAIService( + voice="alloy", # Available voices: alloy, echo, fable, onyx, nova, shimmer + model="tts-1", # tts-1 or tts-1-hd transcription_model="base", # Model name is still required use_cloud_whisper=True # This enables cloud-based Whisper ) @@ -19,10 +21,13 @@ def construct(self): self.play(Write(title)) self.wait() + # Move title to top + self.play(title.animate.to_edge(UP)) + # Demonstrate voiceover with bookmarks with self.voiceover( """This demonstration uses cloud-based Whisper - from OpenAI for speech-to-text alignment. + for word-level alignment. """ ) as tracker: # Wait until the first bookmark @@ -45,7 +50,7 @@ def construct(self): self.wait(1) # Show ARM64 compatibility - arm_title = Text("Works on ARM64 Architectures!", color=YELLOW, font_size=36) + arm_title = Text("Works on ARM64 Architectures!", color=RED, font_size=36) arm_title.next_to(alignment_text, DOWN, buff=1) with self.voiceover( @@ -53,35 +58,20 @@ def construct(self): ): self.play(FadeIn(arm_title)) - # Show how it's used + # Final animation self.wait(1) - code_text = """ -# Run with CLI flag: -manim -pql --use-cloud-whisper example.py MyScene - -# Or enable programmatically: -service = GTTSService( - transcription_model="base", - use_cloud_whisper=True -) - """ - code = Code(code=code_text, language="python", font_size=24) - code.next_to(arm_title, DOWN, buff=1) - - with self.voiceover( - "You can enable cloud-based Whisper using either a command-line flag or programmatically in your code." - ): - self.play(Create(code)) - - self.wait(2) - with self.voiceover( - "This means you can use word-level alignment on any system without installing large local models." + "No local Whisper model required. Everything happens in the cloud!" ): - self.play(FadeOut(code, title, cloud_text, alignment_text, arm_title)) + # Create a final animation + final_group = VGroup(title, cloud_text, alignment_text, arm_title) + self.play( + final_group.animate.scale(0.8).to_edge(UP), + ) - final_text = Text("No Local Models Required!", font_size=48, color=BLUE) - self.play(Write(final_text)) + # Create a cloud icon + cloud = Text("☁️", font_size=120) + self.play(FadeIn(cloud)) self.wait(2) \ No newline at end of file diff --git a/manim_cloud_whisper.py b/manim_cloud_whisper.py new file mode 100755 index 0000000..3c451a5 --- /dev/null +++ b/manim_cloud_whisper.py @@ -0,0 +1,30 @@ +#!/usr/bin/env python3 +""" +Custom entry point for running Manim with cloud-based Whisper. 
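+
+Usage:
+    python manim_cloud_whisper.py -pql examples/cloud_whisper_demo.py CloudWhisperDemo
+
+This is equivalent to exporting MANIM_VOICEOVER_USE_CLOUD_WHISPER=1 and running `manim` directly.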
+""" + +import sys +import os +import subprocess + +def main(): + """Run Manim with cloud-based Whisper enabled.""" + # Set the environment variable to enable cloud-based Whisper + os.environ["MANIM_VOICEOVER_USE_CLOUD_WHISPER"] = "1" + + # Get the Manim command arguments + args = sys.argv[1:] + + # Run the Manim command + cmd = ["manim"] + args + print(f"Running: {' '.join(cmd)}") + print("Cloud-based Whisper enabled via environment variable.") + + # Execute the command + result = subprocess.run(cmd) + + # Return the exit code + return result.returncode + +if __name__ == "__main__": + sys.exit(main()) \ No newline at end of file diff --git a/manim_voiceover/__init__.py b/manim_voiceover/__init__.py index 4ece6e2..8103223 100644 --- a/manim_voiceover/__init__.py +++ b/manim_voiceover/__init__.py @@ -9,9 +9,3 @@ # Add our custom config attribute if not hasattr(config, 'use_cloud_whisper'): config.use_cloud_whisper = False - -# Try to load our CLI extension -try: - from manim_voiceover.cli import main -except ImportError: - pass # CLI module couldn't be loaded diff --git a/manim_voiceover/cli/__init__.py b/manim_voiceover/cli/__init__.py index 67b0a65..77a181f 100644 --- a/manim_voiceover/cli/__init__.py +++ b/manim_voiceover/cli/__init__.py @@ -1 +1,3 @@ -# This file initializes the CLI module \ No newline at end of file +# This file initializes the CLI module +from manim_voiceover.cli.config import add_voiceover_args +from manim_voiceover.cli.main import patched_render_command, patched_main \ No newline at end of file diff --git a/manim_voiceover/cli/main.py b/manim_voiceover/cli/main.py index 5106a83..4649c08 100644 --- a/manim_voiceover/cli/main.py +++ b/manim_voiceover/cli/main.py @@ -7,44 +7,29 @@ import sys from pathlib import Path -from manim.cli.render.commands import main_command, render +from manim.cli.render.commands import main_command as original_main_command +from manim.cli.render.commands import render from manim_voiceover.cli.config import add_voiceover_args -# Hook into the Manim CLI by monkey patching the main command +# Store the original command function original_render_command = render.command def patched_render_command(function): + """Patch the render command to add our custom arguments.""" # Call the original decorated function cmd = original_render_command(function) - # Get the 'params' attribute from the decorated function - params = getattr(cmd, 'params', []) - - # Add our custom arguments to the command line - def param_callback(ctx, param, value): - # Store the use_cloud_whisper value in the context object - ctx.ensure_object(dict) - ctx.obj['use_cloud_whisper'] = value - return value - # Add our parameter to the command from click import option cmd = option('--use-cloud-whisper', is_flag=True, - help='Use OpenAI cloud API for Whisper instead of local model', - callback=param_callback)(cmd) + help='Use OpenAI cloud API for Whisper instead of local model')(cmd) return cmd -# Apply our monkey patch -render.command = patched_render_command - -# Also hook into the argparse version of the CLI -original_main = main_command - def patched_main(): - """Entry point for renderer.""" + """Entry point for renderer with cloud whisper support.""" # Find the render subcommand in the argument parser import argparse @@ -61,7 +46,10 @@ def patched_main(): config.use_cloud_whisper = True # Call the original main command - return original_main() + return original_main_command() + +# Apply our monkey patch +render.command = patched_render_command -# Apply our second monkey patch 
-main_command = patched_main \ No newline at end of file +# No need for this line since we're directly importing and patching in __init__.py +# main_command = patched_main \ No newline at end of file diff --git a/manim_voiceover/services/base.py b/manim_voiceover/services/base.py index e212ac9..420d90c 100644 --- a/manim_voiceover/services/base.py +++ b/manim_voiceover/services/base.py @@ -23,13 +23,31 @@ def timestamps_to_word_boundaries(segments): word_boundaries = [] current_text_offset = 0 - for segment in segments: - for dict_ in segment["words"]: + + # Check if we have direct word-level timestamps (from OpenAI API) + if isinstance(segments, list) and len(segments) > 0 and "words" in segments[0]: + # Process segment-level timestamps + for segment in segments: + for dict_ in segment["words"]: + word = dict_["word"] + word_boundaries.append( + { + "audio_offset": int(dict_["start"] * AUDIO_OFFSET_RESOLUTION), + "text_offset": current_text_offset, + "word_length": len(word), + "text": word, + "boundary_type": "Word", + } + ) + current_text_offset += len(word) + # Check if we have direct word-level timestamps in a flat structure (from OpenAI API) + elif isinstance(segments, list) and len(segments) > 0 and isinstance(segments[0], dict) and "word" in segments[0]: + # Process word-level timestamps directly + for dict_ in segments: word = dict_["word"] word_boundaries.append( { "audio_offset": int(dict_["start"] * AUDIO_OFFSET_RESOLUTION), - # "duration_milliseconds": 0, "text_offset": current_text_offset, "word_length": len(word), "text": word, @@ -37,9 +55,21 @@ def timestamps_to_word_boundaries(segments): } ) current_text_offset += len(word) - # If word is not punctuation, add a space - # if word not in [".", ",", "!", "?", ";", ":", "(", ")"]: - # current_text_offset += 1 + else: + # Original implementation for local Whisper + for segment in segments: + for dict_ in segment["words"]: + word = dict_["word"] + word_boundaries.append( + { + "audio_offset": int(dict_["start"] * AUDIO_OFFSET_RESOLUTION), + "text_offset": current_text_offset, + "word_length": len(word), + "text": word, + "boundary_type": "Word", + } + ) + current_text_offset += len(word) return word_boundaries @@ -114,53 +144,45 @@ def _wrap_generate_from_text(self, text: str, path: str = None, **kwargs) -> dic model="whisper-1", file=audio_file, response_format="verbose_json", + timestamp_granularities=["word"], **self.transcription_kwargs ) - # Convert OpenAI API response to the format expected by manim-voiceover - segments = [] - for segment in transcription_result.segments: - segment_dict = { - "id": segment.id, - "seek": segment.seek, - "start": segment.start, - "end": segment.end, - "text": segment.text, - "tokens": segment.tokens, - "temperature": segment.temperature, - "avg_logprob": segment.avg_logprob, - "compression_ratio": segment.compression_ratio, - "no_speech_prob": segment.no_speech_prob, - "words": [] - } - - # Process word-level timestamps if available - if hasattr(segment, "words"): - for word in segment.words: - segment_dict["words"].append({ - "word": word.word, - "start": word.start, - "end": word.end, - "probability": word.probability - }) - - segments.append(segment_dict) + # Convert the word timestamps to word boundaries directly + logger.info("Cloud Transcription: " + transcription_result.text) + logger.info(f"Word count: {len(transcription_result.words) if hasattr(transcription_result, 'words') else 0}") - # Create a result object similar to what local Whisper would return - class CloudWhisperResult: 
- def __init__(self, text, segments): - self.text = text - self.segments = segments - - def segments_to_dicts(self): - return self.segments + word_boundaries = [] + current_text_offset = 0 - transcription_result = CloudWhisperResult( - transcription_result.text, - segments - ) + if hasattr(transcription_result, 'words') and transcription_result.words: + logger.info(f"Processing {len(transcription_result.words)} words") + for word_obj in transcription_result.words: + try: + word = word_obj.word + start_time = word_obj.start + + # Create a word boundary entry + word_boundary = { + "audio_offset": int(start_time * AUDIO_OFFSET_RESOLUTION), + "text_offset": current_text_offset, + "word_length": len(word), + "text": word, + "boundary_type": "Word", + } + + word_boundaries.append(word_boundary) + current_text_offset += len(word) + 1 # +1 for space + + logger.info(f"Added word boundary: {word} at {start_time}s") + except Exception as e: + logger.error(f"Error processing word: {e}") + else: + logger.warning("No words found in transcription result") - logger.info("Cloud Transcription: " + transcription_result.text) + logger.info(f"Created {len(word_boundaries)} word boundaries") + dict_["word_boundaries"] = word_boundaries + dict_["transcribed_text"] = transcription_result.text except ImportError: logger.error( @@ -176,12 +198,19 @@ def segments_to_dicts(self): str(Path(self.cache_dir) / original_audio), **self.transcription_kwargs ) - logger.info("Transcription: " + transcription_result.text) - word_boundaries = timestamps_to_word_boundaries( - transcription_result.segments_to_dicts() - ) - dict_["word_boundaries"] = word_boundaries - dict_["transcribed_text"] = transcription_result.text + logger.info("Transcription: " + transcription_result.text) + + # For local Whisper model, use segments_to_dicts + if hasattr(transcription_result, 'segments_to_dicts'): + word_boundaries = timestamps_to_word_boundaries( + transcription_result.segments_to_dicts() + ) + else: + # For OpenAI API response, we already have word boundaries + pass + + dict_["word_boundaries"] = word_boundaries + dict_["transcribed_text"] = transcription_result.text # Audio callback self.audio_callback(original_audio, dict_, **kwargs) @@ -279,7 +308,7 @@ def generate_from_text( raise NotImplementedError def get_cached_result(self, input_data, cache_dir): - json_path = os.path.join(cache_dir / DEFAULT_VOICEOVER_CACHE_JSON_FILENAME) + json_path = os.path.join(cache_dir, DEFAULT_VOICEOVER_CACHE_JSON_FILENAME) if os.path.exists(json_path): json_data = json.load(open(json_path, "r")) for entry in json_data: diff --git a/manim_voiceover/voiceover_scene.py b/manim_voiceover/voiceover_scene.py index a42b0d1..7d23af8 100644 --- a/manim_voiceover/voiceover_scene.py +++ b/manim_voiceover/voiceover_scene.py @@ -4,6 +4,7 @@ from typing import Optional, Generator import re import typing as t +import os from manim import Scene, config from manim_voiceover.services.base import SpeechService @@ -35,8 +36,13 @@ def set_speech_service( create_subcaption (bool, optional): Whether to create subcaptions for the scene. Defaults to True. If `config.save_last_frame` is True, the argument is ignored and no subcaptions will be created. 
""" + # Check for environment variable to enable cloud-based Whisper + if os.environ.get("MANIM_VOICEOVER_USE_CLOUD_WHISPER") == "1": + speech_service.use_cloud_whisper = True + print("Cloud-based Whisper enabled via environment variable.") + # Set use_cloud_whisper from the config if it has the attribute - if hasattr(config, "use_cloud_whisper"): + elif hasattr(config, "use_cloud_whisper"): speech_service.use_cloud_whisper = config.use_cloud_whisper self.speech_service = speech_service diff --git a/speech-to-text.md b/speech-to-text.md new file mode 100644 index 0000000..beda2e8 --- /dev/null +++ b/speech-to-text.md @@ -0,0 +1,369 @@ +Speech to text +============== + +Learn how to turn audio into text. + +Overview +-------- + +The Audio API provides two speech to text endpoints, `transcriptions` and `translations`, based on our state-of-the-art open source large-v2 [Whisper model](https://openai.com/blog/whisper/). They can be used to: + +* Transcribe audio into whatever language the audio is in. +* Translate and transcribe the audio into english. + +File uploads are currently limited to 25 MB and the following input file types are supported: `mp3`, `mp4`, `mpeg`, `mpga`, `m4a`, `wav`, and `webm`. + +Quickstart +---------- + +### Transcriptions + +The transcriptions API takes as input the audio file you want to transcribe and the desired output file format for the transcription of the audio. We currently support multiple input and output file formats. + +Transcribe audio + +```javascript +import fs from "fs"; +import OpenAI from "openai"; + +const openai = new OpenAI(); + +const transcription = await openai.audio.transcriptions.create({ + file: fs.createReadStream("/path/to/file/audio.mp3"), + model: "whisper-1", +}); + +console.log(transcription.text); +``` + +```python +from openai import OpenAI +client = OpenAI() + +audio_file= open("/path/to/file/audio.mp3", "rb") +transcription = client.audio.transcriptions.create( + model="whisper-1", + file=audio_file +) + +print(transcription.text) +``` + +```bash +curl --request POST \ + --url https://api.openai.com/v1/audio/transcriptions \ + --header "Authorization: Bearer $OPENAI_API_KEY" \ + --header 'Content-Type: multipart/form-data' \ + --form file=@/path/to/file/audio.mp3 \ + --form model=whisper-1 +``` + +By default, the response type will be json with the raw text included. + +{ "text": "Imagine the wildest idea that you've ever had, and you're curious about how it might scale to something that's a 100, a 1,000 times bigger. .... } + +The Audio API also allows you to set additional parameters in a request. 
For example, if you want to set the `response_format` as `text`, your request would look like the following: + +Additional options + +```javascript +import fs from "fs"; +import OpenAI from "openai"; + +const openai = new OpenAI(); + +const transcription = await openai.audio.transcriptions.create({ + file: fs.createReadStream("/path/to/file/speech.mp3"), + model: "whisper-1", + response_format: "text", +}); + +console.log(transcription.text); +``` + +```python +from openai import OpenAI +client = OpenAI() + +audio_file = open("/path/to/file/speech.mp3", "rb") +transcription = client.audio.transcriptions.create( + model="whisper-1", + file=audio_file, + response_format="text" +) + +print(transcription.text) +``` + +```bash +curl --request POST \ + --url https://api.openai.com/v1/audio/transcriptions \ + --header "Authorization: Bearer $OPENAI_API_KEY" \ + --header 'Content-Type: multipart/form-data' \ + --form file=@/path/to/file/speech.mp3 \ + --form model=whisper-1 \ + --form response_format=text +``` + +The [API Reference](/docs/api-reference/audio) includes the full list of available parameters. + +### Translations + +The translations API takes as input the audio file in any of the supported languages and transcribes, if necessary, the audio into English. This differs from our /Transcriptions endpoint since the output is not in the original input language and is instead translated to English text. + +Translate audio + +```javascript +import fs from "fs"; +import OpenAI from "openai"; + +const openai = new OpenAI(); + +const transcription = await openai.audio.translations.create({ + file: fs.createReadStream("/path/to/file/german.mp3"), + model: "whisper-1", +}); + +console.log(transcription.text); +``` + +```python +from openai import OpenAI +client = OpenAI() + +audio_file = open("/path/to/file/german.mp3", "rb") +transcription = client.audio.translations.create( + model="whisper-1", + file=audio_file, +) + +print(transcription.text) +``` + +```bash +curl --request POST \ + --url https://api.openai.com/v1/audio/translations \ + --header "Authorization: Bearer $OPENAI_API_KEY" \ + --header 'Content-Type: multipart/form-data' \ + --form file=@/path/to/file/german.mp3 \ + --form model=whisper-1 \ +``` + +In this case, the inputted audio was german and the outputted text looks like: + +Hello, my name is Wolfgang and I come from Germany. Where are you heading today? + +We only support translation into English at this time. + +Supported languages +------------------- + +We currently [support the following languages](https://github.com/openai/whisper#available-models-and-languages) through both the `transcriptions` and `translations` endpoint: + +Afrikaans, Arabic, Armenian, Azerbaijani, Belarusian, Bosnian, Bulgarian, Catalan, Chinese, Croatian, Czech, Danish, Dutch, English, Estonian, Finnish, French, Galician, German, Greek, Hebrew, Hindi, Hungarian, Icelandic, Indonesian, Italian, Japanese, Kannada, Kazakh, Korean, Latvian, Lithuanian, Macedonian, Malay, Marathi, Maori, Nepali, Norwegian, Persian, Polish, Portuguese, Romanian, Russian, Serbian, Slovak, Slovenian, Spanish, Swahili, Swedish, Tagalog, Tamil, Thai, Turkish, Ukrainian, Urdu, Vietnamese, and Welsh. + +While the underlying model was trained on 98 languages, we only list the languages that exceeded <50% [word error rate](https://en.wikipedia.org/wiki/Word_error_rate) (WER) which is an industry standard benchmark for speech to text model accuracy. 
The model will return results for languages not listed above but the quality will be low. + +Timestamps +---------- + +By default, the Whisper API will output a transcript of the provided audio in text. The [`timestamp_granularities[]` parameter](/docs/api-reference/audio/createTranscription#audio-createtranscription-timestamp_granularities) enables a more structured and timestamped json output format, with timestamps at the segment, word level, or both. This enables word-level precision for transcripts and video edits, which allows for the removal of specific frames tied to individual words. + +Timestamp options + +```javascript +import fs from "fs"; +import OpenAI from "openai"; + +const openai = new OpenAI(); + +const transcription = await openai.audio.transcriptions.create({ + file: fs.createReadStream("audio.mp3"), + model: "whisper-1", + response_format: "verbose_json", + timestamp_granularities: ["word"] +}); + +console.log(transcription.words); +``` + +```python +from openai import OpenAI +client = OpenAI() + +audio_file = open("/path/to/file/speech.mp3", "rb") +transcription = client.audio.transcriptions.create( + file=audio_file, + model="whisper-1", + response_format="verbose_json", + timestamp_granularities=["word"] +) + +print(transcription.words) +``` + +```bash +curl https://api.openai.com/v1/audio/transcriptions \ + -H "Authorization: Bearer $OPENAI_API_KEY" \ + -H "Content-Type: multipart/form-data" \ + -F file="@/path/to/file/audio.mp3" \ + -F "timestamp_granularities[]=word" \ + -F model="whisper-1" \ + -F response_format="verbose_json" +``` + +Longer inputs +------------- + +By default, the Whisper API only supports files that are less than 25 MB. If you have an audio file that is longer than that, you will need to break it up into chunks of 25 MB's or less or used a compressed audio format. To get the best performance, we suggest that you avoid breaking the audio up mid-sentence as this may cause some context to be lost. + +One way to handle this is to use the [PyDub open source Python package](https://github.com/jiaaro/pydub) to split the audio: + +```python +from pydub import AudioSegment + +song = AudioSegment.from_mp3("good_morning.mp3") + +# PyDub handles time in milliseconds +ten_minutes = 10 * 60 * 1000 + +first_10_minutes = song[:ten_minutes] + +first_10_minutes.export("good_morning_10.mp3", format="mp3") +``` + +_OpenAI makes no guarantees about the usability or security of 3rd party software like PyDub._ + +Prompting +--------- + +You can use a [prompt](/docs/api-reference/audio/createTranscription#audio/createTranscription-prompt) to improve the quality of the transcripts generated by the Whisper API. The model tries to match the style of the prompt, so it's more likely to use capitalization and punctuation if the prompt does too. However, the current prompting system is more limited than our other language models and provides limited control over the generated audio. + +Here are some examples of how prompting can help in different scenarios: + +1. Prompts can help correct specific words or acronyms that the model misrecognizes in the audio. For example, the following prompt improves the transcription of the words DALL·E and GPT-3, which were previously written as "GDP 3" and "DALI": "The transcript is about OpenAI which makes technology like DALL·E, GPT-3, and ChatGPT with the hope of one day building an AGI system that benefits all of humanity." +2. 
To preserve the context of a file that was split into segments, prompt the model with the transcript of the preceding segment. The model uses relevant information from the previous audio, improving transcription accuracy. The model only considers the final 224 tokens of the prompt and ignores anything earlier. For multilingual inputs, Whisper uses a custom tokenizer. For English-only inputs, it uses the standard GPT-2 tokenizer. Find both tokenizers in the open source [Whisper Python package](https://github.com/openai/whisper/blob/main/whisper/tokenizer.py#L361). +3. Sometimes the model skips punctuation in the transcript. To prevent this, use a simple prompt that includes punctuation: "Hello, welcome to my lecture." +4. The model may also leave out common filler words in the audio. If you want to keep the filler words in your transcript, use a prompt that contains them: "Umm, let me think like, hmm... Okay, here's what I'm, like, thinking." +5. Some languages can be written in different ways, such as simplified or traditional Chinese. The model might not always use the writing style that you want for your transcript by default. You can improve this by using a prompt in your preferred writing style. + +Improving reliability +--------------------- + +One of the most common challenges faced when using Whisper is the model often does not recognize uncommon words or acronyms. Here are some different techniques to improve the reliability of Whisper in these cases: + +Using the prompt parameter + +The first method involves using the optional prompt parameter to pass a dictionary of the correct spellings. + +Because it wasn't trained with instruction-following techniques, Whisper operates more like a base GPT model. Keep in mind that Whisper only considers the first 224 tokens of the prompt. + +Prompt parameter + +```javascript +import fs from "fs"; +import OpenAI from "openai"; + +const openai = new OpenAI(); + +const transcription = await openai.audio.transcriptions.create({ + file: fs.createReadStream("/path/to/file/speech.mp3"), + model: "whisper-1", + response_format: "text", + prompt:"ZyntriQix, Digique Plus, CynapseFive, VortiQore V8, EchoNix Array, OrbitalLink Seven, DigiFractal Matrix, PULSE, RAPT, B.R.I.C.K., Q.U.A.R.T.Z., F.L.I.N.T.", +}); + +console.log(transcription.text); +``` + +```python +from openai import OpenAI +client = OpenAI() + +audio_file = open("/path/to/file/speech.mp3", "rb") +transcription = client.audio.transcriptions.create( + model="whisper-1", + file=audio_file, + response_format="text", + prompt="ZyntriQix, Digique Plus, CynapseFive, VortiQore V8, EchoNix Array, OrbitalLink Seven, DigiFractal Matrix, PULSE, RAPT, B.R.I.C.K., Q.U.A.R.T.Z., F.L.I.N.T." +) + +print(transcription.text) +``` + +While it increases reliability, this technique is limited to 224 tokens, so your list of SKUs needs to be relatively small for this to be a scalable solution. + +Post-processing with GPT-4 + +The second method involves a post-processing step using GPT-4 or GPT-3.5-Turbo. + +We start by providing instructions for GPT-4 through the `system_prompt` variable. Similar to what we did with the prompt parameter earlier, we can define our company and product names. + +Post-processing + +```javascript +const systemPrompt = ` +You are a helpful assistant for the company ZyntriQix. Your task is +to correct any spelling discrepancies in the transcribed text. 
+sure that the names of the following products are spelled correctly:
+ZyntriQix, Digique Plus, CynapseFive, VortiQore V8, EchoNix Array,
+OrbitalLink Seven, DigiFractal Matrix, PULSE, RAPT, B.R.I.C.K.,
+Q.U.A.R.T.Z., F.L.I.N.T. Only add necessary punctuation such as
+periods, commas, and capitalization, and use only the context provided.
+`;
+
+const transcript = await transcribe(audioFile);
+const completion = await openai.chat.completions.create({
+  model: "gpt-4o",
+  temperature: temperature,
+  messages: [
+    {
+      role: "system",
+      content: systemPrompt
+    },
+    {
+      role: "user",
+      content: transcript
+    }
+  ],
+  store: true,
+});
+
+console.log(completion.choices[0].message.content);
+```
+
+```python
+system_prompt = """
+You are a helpful assistant for the company ZyntriQix. Your task is to correct
+any spelling discrepancies in the transcribed text. Make sure that the names of
+the following products are spelled correctly: ZyntriQix, Digique Plus,
+CynapseFive, VortiQore V8, EchoNix Array, OrbitalLink Seven, DigiFractal
+Matrix, PULSE, RAPT, B.R.I.C.K., Q.U.A.R.T.Z., F.L.I.N.T. Only add necessary
+punctuation such as periods, commas, and capitalization, and use only the
+context provided.
+"""
+
+# Assumes `client = OpenAI()` from the earlier snippets and a transcribe() helper
+# that returns the Whisper transcript for the given audio file.
+def generate_corrected_transcript(temperature, system_prompt, audio_file):
+    response = client.chat.completions.create(
+        model="gpt-4o",
+        temperature=temperature,
+        messages=[
+            {
+                "role": "system",
+                "content": system_prompt
+            },
+            {
+                "role": "user",
+                "content": transcribe(audio_file, "")
+            }
+        ]
+    )
+    return response.choices[0].message.content
+
+corrected_text = generate_corrected_transcript(
+    0, system_prompt, fake_company_filepath
+)
+```
+
+If you try this on your own audio file, you'll see that GPT-4 corrects many misspellings in the transcript. Due to its larger context window, this method might be more scalable than using Whisper's prompt parameter. It's also more reliable, as GPT-4 can be instructed and guided in ways that aren't possible with Whisper due to its lack of instruction following.
\ No newline at end of file
diff --git a/standalone_openai_debug.py b/standalone_openai_debug.py
new file mode 100644
index 0000000..2b0fbdf
--- /dev/null
+++ b/standalone_openai_debug.py
@@ -0,0 +1,68 @@
+import os
+import sys
+import json
+from pathlib import Path
+from dotenv import load_dotenv
+
+# Load environment variables from .env file
+load_dotenv()
+
+# Add the current directory to the path so we can import our modules
+sys.path.append(os.path.dirname(os.path.abspath(__file__)))
+
+# Import OpenAI directly
+import openai
+
+print("=== OpenAI API Debug ===")
+
+# Check if OpenAI API key is set
+openai_api_key = os.getenv("OPENAI_API_KEY")
+if not openai_api_key or openai_api_key == "your_openai_api_key_here":
+    print("\n⚠️ Warning: OPENAI_API_KEY is not set or is using the default value.")
+    print("Please edit the .env file with your actual OpenAI API key.")
+    sys.exit(1)
+
+# Create a temporary directory for audio files
+temp_dir = Path("./temp_debug")
+temp_dir.mkdir(exist_ok=True)
+
+# First, generate speech using OpenAI TTS
+print("\nGenerating speech from text...")
+text = "This is a test of the cloud-based Whisper feature."
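+
+# The steps below round-trip the text through OpenAI TTS and then the Whisper API
+# with timestamp_granularities=["word"]. With response_format="verbose_json", the
+# response is expected to expose a `words` list whose entries carry `word`, `start`
+# and `end` fields in seconds; the checks at the end of this script rely on that shape.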
+ +# Generate speech using OpenAI TTS +response = openai.audio.speech.create( + model="tts-1", + voice="alloy", + input=text +) + +audio_path = temp_dir / "test_speech.mp3" +response.stream_to_file(str(audio_path)) + +print(f"Speech generated and saved to {audio_path}") + +# Now, transcribe the audio using OpenAI Whisper API +print("\nTranscribing audio with word-level timestamps...") +with open(audio_path, "rb") as audio_file: + transcription = openai.audio.transcriptions.create( + model="whisper-1", + file=audio_file, + response_format="verbose_json", + timestamp_granularities=["word"] + ) + +# Print the raw response +print("\nRaw API Response:") +print(json.dumps(transcription.model_dump(), indent=2)) + +# Check if word-level timestamps are available +print("\nChecking for word-level timestamps:") +if hasattr(transcription, "words"): + print(f"Found {len(transcription.words)} words with timestamps:") + for i, word in enumerate(transcription.words): + print(f" {i+1}. '{word.word}' from {word.start:.2f}s to {word.end:.2f}s") +else: + print("No word-level timestamps found in the response.") + +print("\nDebug completed!") \ No newline at end of file diff --git a/standalone_openai_demo.py b/standalone_openai_demo.py new file mode 100644 index 0000000..5771fca --- /dev/null +++ b/standalone_openai_demo.py @@ -0,0 +1,108 @@ +import os +import sys +from pathlib import Path +from dotenv import load_dotenv + +# Load environment variables from .env file +load_dotenv() + +# Add the current directory to the path so we can import our modules +sys.path.append(os.path.dirname(os.path.abspath(__file__))) + +# Import our modules +from manim_voiceover.services.openai import OpenAIService +from manim_voiceover.config import config +from manim_voiceover.helper import remove_bookmarks + +# Set the cloud whisper flag manually +config.use_cloud_whisper = True + +print("=== OpenAI TTS + Cloud-based Whisper Demo ===") +print(f"Cloud Whisper enabled: {config.use_cloud_whisper}") + +# Create a temporary directory for audio files +temp_dir = Path("./temp_openai_demo") +temp_dir.mkdir(exist_ok=True) + +# Create an OpenAIService with cloud whisper enabled +service = OpenAIService( + voice="alloy", # Available voices: alloy, echo, fable, onyx, nova, shimmer + model="tts-1", # tts-1 or tts-1-hd + transcription_model="base", # Model name is still required + use_cloud_whisper=True, # This enables cloud-based Whisper + cache_dir=str(temp_dir) +) + +print(f"\nOpenAIService created with use_cloud_whisper={service.use_cloud_whisper}") + +# Check if OpenAI API key is set +openai_api_key = os.getenv("OPENAI_API_KEY") +if not openai_api_key or openai_api_key == "your_openai_api_key_here": + print("\n⚠️ Warning: OPENAI_API_KEY is not set or is using the default value.") + print("Please edit the .env file with your actual OpenAI API key.") + sys.exit(1) + +# Generate speech from text with bookmarks +print("\nGenerating speech from text with bookmarks...") +text = """This demonstration uses OpenAI's text-to-speech service +with cloud-based Whisper for +word-level alignment.""" + +print("\nText to synthesize:") +print(text) + +# Generate the speech +result = service._wrap_generate_from_text(text) + +print(f"\nSpeech generated successfully!") +print(f"Audio file: {result.get('final_audio')}") +print(f"Audio path: {temp_dir / result.get('final_audio')}") +print(f"Word boundaries available: {'word_boundaries' in result}") +print(f"Word boundaries count: {len(result.get('word_boundaries', []))}") +print(f"Transcribed text: 
{result.get('transcribed_text', 'Not available')}") + +# Print the raw result for debugging +print("\nRaw result keys:", result.keys()) +for key, value in result.items(): + if key == 'word_boundaries': + print(f"Word boundaries type: {type(value)}") + print(f"Word boundaries length: {len(value)}") + if value and len(value) > 0: + print(f"First word boundary: {value[0]}") + elif key == 'input_data': + print(f"Input data: {value}") + else: + print(f"{key}: {value}") + +print(f"\nWord boundaries:") +if 'word_boundaries' in result and result['word_boundaries']: + for i, boundary in enumerate(result['word_boundaries']): + print(f" {i+1}. '{boundary['text']}' at {boundary['audio_offset']/1000:.2f} seconds") + + # Find the bookmarks + print("\nBookmarks:") + text_without_bookmarks = remove_bookmarks(text).lower() + text_with_bookmarks = text.lower() + + # Find 'cloud_point' bookmark + cloud_index = text_with_bookmarks.find("") + if cloud_index >= 0: + # Find the closest word boundary after the bookmark + cloud_word_index = len(remove_bookmarks(text[:cloud_index]).split()) + if cloud_word_index < len(result['word_boundaries']): + cloud_word = result['word_boundaries'][cloud_word_index] + print(f" - 'cloud_point' bookmark would trigger at word '{cloud_word['text']}' at time {cloud_word['audio_offset']/1000:.2f} seconds") + + # Find 'alignment_point' bookmark + alignment_index = text_with_bookmarks.find("") + if alignment_index >= 0: + # Find the closest word boundary after the bookmark + alignment_word_index = len(remove_bookmarks(text[:alignment_index]).split()) + if alignment_word_index < len(result['word_boundaries']): + alignment_word = result['word_boundaries'][alignment_word_index] + print(f" - 'alignment_point' bookmark would trigger at word '{alignment_word['text']}' at time {alignment_word['audio_offset']/1000:.2f} seconds") +else: + print(" No word boundaries found in the result.") + +print("\nDemo completed!") +print(f"You can listen to the generated audio file at: {temp_dir / result.get('final_audio')}") \ No newline at end of file diff --git a/temp_direct_test/cache.json b/temp_direct_test/cache.json new file mode 100644 index 0000000..3c85419 --- /dev/null +++ b/temp_direct_test/cache.json @@ -0,0 +1,84 @@ +[ + { + "input_text": "This is a test of the cloud-based Whisper feature.", + "input_data": { + "input_text": "This is a test of the cloud-based Whisper feature.", + "service": "openai" + }, + "original_audio": "direct_test.mp3", + "word_boundaries": [ + { + "audio_offset": 0, + "text_offset": 0, + "word_length": 4, + "text": "This", + "boundary_type": "Word" + }, + { + "audio_offset": 219, + "text_offset": 5, + "word_length": 2, + "text": "is", + "boundary_type": "Word" + }, + { + "audio_offset": 379, + "text_offset": 8, + "word_length": 1, + "text": "a", + "boundary_type": "Word" + }, + { + "audio_offset": 519, + "text_offset": 10, + "word_length": 4, + "text": "test", + "boundary_type": "Word" + }, + { + "audio_offset": 879, + "text_offset": 15, + "word_length": 2, + "text": "of", + "boundary_type": "Word" + }, + { + "audio_offset": 1159, + "text_offset": 18, + "word_length": 3, + "text": "the", + "boundary_type": "Word" + }, + { + "audio_offset": 1320, + "text_offset": 22, + "word_length": 5, + "text": "cloud", + "boundary_type": "Word" + }, + { + "audio_offset": 1600, + "text_offset": 28, + "word_length": 5, + "text": "based", + "boundary_type": "Word" + }, + { + "audio_offset": 1919, + "text_offset": 34, + "word_length": 7, + "text": "whisper", + "boundary_type": "Word" + }, 
+ { + "audio_offset": 2299, + "text_offset": 42, + "word_length": 7, + "text": "feature", + "boundary_type": "Word" + } + ], + "transcribed_text": "This is a test of the cloud-based whisper feature.", + "final_audio": "direct_test.mp3" + } +] \ No newline at end of file diff --git a/temp_openai_demo/cache.json b/temp_openai_demo/cache.json new file mode 100644 index 0000000..6ef0070 --- /dev/null +++ b/temp_openai_demo/cache.json @@ -0,0 +1,114 @@ +[ + { + "input_text": "This demonstration uses OpenAI's text-to-speech service with cloud-based Whisper for word-level alignment.", + "input_data": { + "input_text": "This demonstration uses OpenAI's text-to-speech service with cloud-based Whisper for word-level alignment.", + "service": "openai", + "config": { + "voice": "alloy", + "model": "tts-1", + "speed": 1.0 + } + }, + "original_audio": "this-demonstration-uses-openai-s-text-to-speech-15df98ee.mp3", + "word_boundaries": [], + "transcribed_text": "This demonstration uses OpenAI's text-to-speech service with cloud-based Whisper for word level alignment.", + "final_audio": "this-demonstration-uses-openai-s-text-to-speech-15df98ee.mp3" + }, + { + "input_text": "This demonstration uses OpenAI's text-to-speech service with cloud-based Whisper for word-level alignment.", + "input_data": { + "input_text": "This demonstration uses OpenAI's text-to-speech service with cloud-based Whisper for word-level alignment.", + "service": "openai", + "config": { + "voice": "alloy", + "model": "tts-1", + "speed": 1.0 + } + }, + "original_audio": "this-demonstration-uses-openai-s-text-to-speech-15df98ee.mp3", + "word_boundaries": [], + "transcribed_text": "This demonstration uses OpenAI's text-to-speech service with cloud-based Whisper for word level alignment.", + "final_audio": "this-demonstration-uses-openai-s-text-to-speech-15df98ee.mp3" + }, + { + "input_text": "This demonstration uses OpenAI's text-to-speech service with cloud-based Whisper for word-level alignment.", + "input_data": { + "input_text": "This demonstration uses OpenAI's text-to-speech service with cloud-based Whisper for word-level alignment.", + "service": "openai", + "config": { + "voice": "alloy", + "model": "tts-1", + "speed": 1.0 + } + }, + "original_audio": "this-demonstration-uses-openai-s-text-to-speech-15df98ee.mp3", + "word_boundaries": [], + "transcribed_text": "This demonstration uses OpenAI's text-to-speech service with cloud-based Whisper for word level alignment.", + "final_audio": "this-demonstration-uses-openai-s-text-to-speech-15df98ee.mp3" + }, + { + "input_text": "This demonstration uses OpenAI's text-to-speech service with cloud-based Whisper for word-level alignment.", + "input_data": { + "input_text": "This demonstration uses OpenAI's text-to-speech service with cloud-based Whisper for word-level alignment.", + "service": "openai", + "config": { + "voice": "alloy", + "model": "tts-1", + "speed": 1.0 + } + }, + "original_audio": "this-demonstration-uses-openai-s-text-to-speech-15df98ee.mp3", + "word_boundaries": [], + "transcribed_text": "This demonstration uses OpenAI's text-to-speech service with cloud-based Whisper for word level alignment.", + "final_audio": "this-demonstration-uses-openai-s-text-to-speech-15df98ee.mp3" + }, + { + "input_text": "This demonstration uses OpenAI's text-to-speech service with cloud-based Whisper for word-level alignment.", + "input_data": { + "input_text": "This demonstration uses OpenAI's text-to-speech service with cloud-based Whisper for word-level alignment.", + "service": 
"openai", + "config": { + "voice": "alloy", + "model": "tts-1", + "speed": 1.0 + } + }, + "original_audio": "this-demonstration-uses-openai-s-text-to-speech-15df98ee.mp3", + "word_boundaries": [], + "transcribed_text": "This demonstration uses OpenAI's text-to-speech service with cloud-based Whisper for word level alignment.", + "final_audio": "this-demonstration-uses-openai-s-text-to-speech-15df98ee.mp3" + }, + { + "input_text": "This demonstration uses OpenAI's text-to-speech service with cloud-based Whisper for word-level alignment.", + "input_data": { + "input_text": "This demonstration uses OpenAI's text-to-speech service with cloud-based Whisper for word-level alignment.", + "service": "openai", + "config": { + "voice": "alloy", + "model": "tts-1", + "speed": 1.0 + } + }, + "original_audio": "this-demonstration-uses-openai-s-text-to-speech-15df98ee.mp3", + "word_boundaries": [], + "transcribed_text": "This demonstration uses OpenAI's text-to-speech service with cloud-based Whisper for word level alignment.", + "final_audio": "this-demonstration-uses-openai-s-text-to-speech-15df98ee.mp3" + }, + { + "input_text": "This demonstration uses OpenAI's text-to-speech service with cloud-based Whisper for word-level alignment.", + "input_data": { + "input_text": "This demonstration uses OpenAI's text-to-speech service with cloud-based Whisper for word-level alignment.", + "service": "openai", + "config": { + "voice": "alloy", + "model": "tts-1", + "speed": 1.0 + } + }, + "original_audio": "this-demonstration-uses-openai-s-text-to-speech-15df98ee.mp3", + "word_boundaries": [], + "transcribed_text": "This demonstration uses OpenAI's text-to-speech service with cloud-based Whisper for word level alignment.", + "final_audio": "this-demonstration-uses-openai-s-text-to-speech-15df98ee.mp3" + } +] \ No newline at end of file diff --git a/temp_service_test/cache.json b/temp_service_test/cache.json new file mode 100644 index 0000000..01b8c02 --- /dev/null +++ b/temp_service_test/cache.json @@ -0,0 +1,190 @@ +[ + { + "input_text": "This is a direct test of the cloud-based Whisper feature.", + "input_data": { + "input_text": "This is a direct test of the cloud-based Whisper feature.", + "service": "openai", + "config": { + "voice": "alloy", + "model": "tts-1", + "speed": 1.0 + } + }, + "original_audio": "this-is-a-direct-test-of-the-cloud-based-whisper-841ab5cd.mp3", + "word_boundaries": [ + { + "audio_offset": 0, + "text_offset": 0, + "word_length": 4, + "text": "This", + "boundary_type": "Word" + }, + { + "audio_offset": 2000000, + "text_offset": 5, + "word_length": 2, + "text": "is", + "boundary_type": "Word" + }, + { + "audio_offset": 3600000, + "text_offset": 8, + "word_length": 1, + "text": "a", + "boundary_type": "Word" + }, + { + "audio_offset": 5000000, + "text_offset": 10, + "word_length": 6, + "text": "direct", + "boundary_type": "Word" + }, + { + "audio_offset": 7799999, + "text_offset": 17, + "word_length": 4, + "text": "test", + "boundary_type": "Word" + }, + { + "audio_offset": 12599999, + "text_offset": 22, + "word_length": 2, + "text": "of", + "boundary_type": "Word" + }, + { + "audio_offset": 15800000, + "text_offset": 25, + "word_length": 3, + "text": "the", + "boundary_type": "Word" + }, + { + "audio_offset": 17200000, + "text_offset": 29, + "word_length": 5, + "text": "cloud", + "boundary_type": "Word" + }, + { + "audio_offset": 19800000, + "text_offset": 35, + "word_length": 5, + "text": "based", + "boundary_type": "Word" + }, + { + "audio_offset": 22599999, + "text_offset": 41, + 
"word_length": 7, + "text": "whisper", + "boundary_type": "Word" + }, + { + "audio_offset": 26199998, + "text_offset": 49, + "word_length": 7, + "text": "feature", + "boundary_type": "Word" + } + ], + "transcribed_text": "This is a direct test of the cloud-based whisper feature.", + "final_audio": "this-is-a-direct-test-of-the-cloud-based-whisper-841ab5cd.mp3" + }, + { + "input_text": "This is a direct test of the cloud-based Whisper feature.", + "input_data": { + "input_text": "This is a direct test of the cloud-based Whisper feature.", + "service": "openai", + "config": { + "voice": "alloy", + "model": "tts-1", + "speed": 1.0 + } + }, + "original_audio": "this-is-a-direct-test-of-the-cloud-based-whisper-841ab5cd.mp3", + "word_boundaries": [ + { + "audio_offset": 0, + "text_offset": 0, + "word_length": 4, + "text": "This", + "boundary_type": "Word" + }, + { + "audio_offset": 2000000, + "text_offset": 5, + "word_length": 2, + "text": "is", + "boundary_type": "Word" + }, + { + "audio_offset": 3600000, + "text_offset": 8, + "word_length": 1, + "text": "a", + "boundary_type": "Word" + }, + { + "audio_offset": 5000000, + "text_offset": 10, + "word_length": 6, + "text": "direct", + "boundary_type": "Word" + }, + { + "audio_offset": 7799999, + "text_offset": 17, + "word_length": 4, + "text": "test", + "boundary_type": "Word" + }, + { + "audio_offset": 12599999, + "text_offset": 22, + "word_length": 2, + "text": "of", + "boundary_type": "Word" + }, + { + "audio_offset": 15800000, + "text_offset": 25, + "word_length": 3, + "text": "the", + "boundary_type": "Word" + }, + { + "audio_offset": 17200000, + "text_offset": 29, + "word_length": 5, + "text": "cloud", + "boundary_type": "Word" + }, + { + "audio_offset": 19800000, + "text_offset": 35, + "word_length": 5, + "text": "based", + "boundary_type": "Word" + }, + { + "audio_offset": 22599999, + "text_offset": 41, + "word_length": 7, + "text": "whisper", + "boundary_type": "Word" + }, + { + "audio_offset": 26199998, + "text_offset": 49, + "word_length": 7, + "text": "feature", + "boundary_type": "Word" + } + ], + "transcribed_text": "This is a direct test of the cloud-based whisper feature.", + "final_audio": "this-is-a-direct-test-of-the-cloud-based-whisper-841ab5cd.mp3" + } +] \ No newline at end of file diff --git a/temp_test/cache.json b/temp_test/cache.json new file mode 100644 index 0000000..2410ad4 --- /dev/null +++ b/temp_test/cache.json @@ -0,0 +1,13 @@ +[ + { + "input_text": "This is a test of the cloud-based Whisper feature.", + "input_data": { + "input_text": "This is a test of the cloud-based Whisper feature.", + "service": "gtts" + }, + "original_audio": "this-is-a-test-of-the-cloud-based-whisper-feature-c4598c9b.mp3", + "word_boundaries": [], + "transcribed_text": "This is a test of the cloud-based Whisper feature.", + "final_audio": "this-is-a-test-of-the-cloud-based-whisper-feature-c4598c9b.mp3" + } +] \ No newline at end of file diff --git a/test_cli.py b/test_cli.py new file mode 100644 index 0000000..ba6b680 --- /dev/null +++ b/test_cli.py @@ -0,0 +1,33 @@ +import sys +import subprocess + +print("Testing Manim CLI with --use-cloud-whisper flag") + +# Run the manim command with our custom flag +cmd = ["manim", "--help"] +print(f"Running command: {' '.join(cmd)}") +result = subprocess.run(cmd, capture_output=True, text=True) + +# Check if our flag is in the help output +if "--use-cloud-whisper" in result.stdout: + print("✅ Success: --use-cloud-whisper flag is available in the help output") +else: + print("❌ Error: 
--use-cloud-whisper flag is not available in the help output") + print("Help output:") + print(result.stdout) + +# Try running the command with our flag +cmd = ["manim", "-pql", "--use-cloud-whisper", "examples/cloud_whisper_demo.py", "CloudWhisperDemo"] +print(f"\nRunning command: {' '.join(cmd)}") +print("(This will not actually run the command, just checking if the flag is recognized)") + +# Just check if the flag is recognized, don't actually run the command +cmd = ["manim", "--use-cloud-whisper", "--help"] +result = subprocess.run(cmd, capture_output=True, text=True) + +if result.returncode == 0: + print("✅ Success: --use-cloud-whisper flag is recognized") +else: + print("❌ Error: --use-cloud-whisper flag is not recognized") + print("Error output:") + print(result.stderr) \ No newline at end of file diff --git a/test_cloud_whisper.py b/test_cloud_whisper.py new file mode 100644 index 0000000..6712e85 --- /dev/null +++ b/test_cloud_whisper.py @@ -0,0 +1,29 @@ +from manim import config +from manim_voiceover.services.base import SpeechService +from manim_voiceover.services.gtts import GTTSService + +# Test 1: Check if the use_cloud_whisper attribute exists +print("Test 1: Checking if use_cloud_whisper attribute exists in config") +if hasattr(config, 'use_cloud_whisper'): + print("✅ Success: config.use_cloud_whisper attribute exists") + print(f"Current value: {config.use_cloud_whisper}") +else: + print("❌ Error: config.use_cloud_whisper attribute does not exist") + +# Test 2: Create a SpeechService with use_cloud_whisper=True +print("\nTest 2: Creating SpeechService with use_cloud_whisper=True") +try: + service = SpeechService(use_cloud_whisper=True, transcription_model='base') + print(f"✅ Success: SpeechService created with use_cloud_whisper={service.use_cloud_whisper}") +except Exception as e: + print(f"❌ Error: Failed to create SpeechService: {str(e)}") + +# Test 3: Create a GTTSService with use_cloud_whisper=True +print("\nTest 3: Creating GTTSService with use_cloud_whisper=True") +try: + service = GTTSService(use_cloud_whisper=True, transcription_model='base') + print(f"✅ Success: GTTSService created with use_cloud_whisper={service.use_cloud_whisper}") +except Exception as e: + print(f"❌ Error: Failed to create GTTSService: {str(e)}") + +print("\nAll tests completed!") \ No newline at end of file diff --git a/test_cloud_whisper_simple.py b/test_cloud_whisper_simple.py new file mode 100644 index 0000000..5942a2f --- /dev/null +++ b/test_cloud_whisper_simple.py @@ -0,0 +1,56 @@ +import os +import sys +from pathlib import Path +from dotenv import load_dotenv + +# Load environment variables from .env file +load_dotenv() + +# Add the current directory to the path so we can import our modules +sys.path.append(os.path.dirname(os.path.abspath(__file__))) + +# Import our modules +from manim_voiceover.services.gtts import GTTSService +from manim_voiceover.config import config + +# Set the cloud whisper flag manually +config.use_cloud_whisper = True + +print("=== Testing Cloud-based Whisper Implementation ===") +print(f"Cloud Whisper enabled: {config.use_cloud_whisper}") + +# Create a temporary directory for audio files +temp_dir = Path("./temp_test") +temp_dir.mkdir(exist_ok=True) + +# Create a GTTSService with cloud whisper enabled +service = GTTSService( + transcription_model="base", # Model name is still required + use_cloud_whisper=True, # This enables cloud-based Whisper + cache_dir=str(temp_dir) +) + +print(f"\nGTTSService created with use_cloud_whisper={service.use_cloud_whisper}") + +# 
Check if OpenAI API key is set +openai_api_key = os.getenv("OPENAI_API_KEY") +if not openai_api_key or openai_api_key == "your_openai_api_key_here": + print("\n⚠️ Warning: OPENAI_API_KEY is not set or is using the default value.") + print("Please edit the .env file with your actual OpenAI API key.") + print("Skipping the actual API call test.") +else: + # Generate speech from text + print("\nGenerating speech from text...") + text = "This is a test of the cloud-based Whisper feature." + result = service._wrap_generate_from_text(text) + + print(f"\nSpeech generated successfully!") + print(f"Audio file: {result.get('final_audio')}") + print(f"Word boundaries available: {'word_boundaries' in result}") + + if 'word_boundaries' in result: + print(f"\nWord boundaries:") + for boundary in result['word_boundaries'][:5]: # Show first 5 boundaries + print(f" - {boundary['text']} at {boundary['audio_offset']}") + +print("\nTest completed!") \ No newline at end of file diff --git a/test_scene.py b/test_scene.py new file mode 100644 index 0000000..3804a9c --- /dev/null +++ b/test_scene.py @@ -0,0 +1,24 @@ +from manim import * +from manim_voiceover.voiceover_scene import VoiceoverScene +from manim_voiceover.services.gtts import GTTSService + +class TestScene(VoiceoverScene): + def construct(self): + # Print the cloud whisper setting + print(f"Cloud Whisper enabled: {self.config.use_cloud_whisper}") + + # Initialize speech service + service = GTTSService(transcription_model="base") + self.set_speech_service(service) + + # Create a simple circle + circle = Circle() + + # Add voiceover with a bookmark + with self.voiceover( + """This is a test of the cloud-based Whisper feature.""" + ): + self.wait_until_bookmark("circle_appears") + self.play(Create(circle)) + + self.wait(1) \ No newline at end of file diff --git a/test_speech_service.py b/test_speech_service.py new file mode 100644 index 0000000..b594a8e --- /dev/null +++ b/test_speech_service.py @@ -0,0 +1,56 @@ +import os +import sys +from pathlib import Path +from dotenv import load_dotenv + +# Load environment variables from .env file +load_dotenv() + +# Import our modules +from manim_voiceover.services.openai import OpenAIService +from manim_voiceover.config import config + +# Set the cloud whisper flag manually +config.use_cloud_whisper = True + +print("=== Testing SpeechService with Cloud Whisper ===") + +# Create a temporary directory for audio files +temp_dir = Path("./temp_service_test") +temp_dir.mkdir(exist_ok=True) + +# Create an OpenAIService with cloud whisper enabled +service = OpenAIService( + voice="alloy", + model="tts-1", + transcription_model="base", + use_cloud_whisper=True, + cache_dir=str(temp_dir) +) + +print(f"\nOpenAIService created with use_cloud_whisper={service.use_cloud_whisper}") + +# Generate speech from text +print("\nGenerating speech from text...") +text = "This is a direct test of the cloud-based Whisper feature." 
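+
+# _wrap_generate_from_text() is the internal SpeechService helper that synthesizes the
+# audio and attaches word boundaries; calling it directly exercises TTS plus cloud-based
+# Whisper alignment without rendering a Manim scene. The printout below assumes the
+# audio_offset values are in milliseconds, matching the conversion used later in this script.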
+ +# Call the _wrap_generate_from_text method directly +result = service._wrap_generate_from_text(text) + +print(f"\nSpeech generated successfully!") +print(f"Audio file: {result.get('final_audio')}") +print(f"Word boundaries available: {'word_boundaries' in result}") +print(f"Word boundaries count: {len(result.get('word_boundaries', []))}") +print(f"Transcribed text: {result.get('transcribed_text', 'Not available')}") + +# Print the word boundaries +if 'word_boundaries' in result and result['word_boundaries']: + print("\nWord boundaries:") + for i, boundary in enumerate(result['word_boundaries']): + # Convert from milliseconds to seconds + time_in_seconds = boundary['audio_offset'] / 1000 + print(f" {i+1}. '{boundary['text']}' at {time_in_seconds:.2f} seconds") +else: + print("\nNo word boundaries found in the result.") + +print("\nTest completed!") \ No newline at end of file From dfe2ad0049f2715b139a5673d9dd6fdd84509190 Mon Sep 17 00:00:00 2001 From: Pavlos Date: Sun, 9 Mar 2025 18:25:11 +0000 Subject: [PATCH 3/6] example --- linear_regression_voiceover.py | 223 +++++++++++++++++++++++++++++++++ 1 file changed, 223 insertions(+) create mode 100644 linear_regression_voiceover.py diff --git a/linear_regression_voiceover.py b/linear_regression_voiceover.py new file mode 100644 index 0000000..2c4b0b6 --- /dev/null +++ b/linear_regression_voiceover.py @@ -0,0 +1,223 @@ +from manim import * +from manim_voiceover.voiceover_scene import VoiceoverScene +from manim_voiceover.services.openai import OpenAIService + +# Import the SimpleLinearRegression class from the example +import numpy as np + +class LinearRegressionWithVoiceover(VoiceoverScene): + def construct(self): + # Initialize OpenAI speech service with cloud whisper + service = OpenAIService( + voice="alloy", # Available voices: alloy, echo, fable, onyx, nova, shimmer + model="tts-1", # tts-1 or tts-1-hd + transcription_model="base", + use_cloud_whisper=True # This enables cloud-based Whisper + ) + self.set_speech_service(service) + + # Add title with voiceover introduction + with self.voiceover( + """Welcome to this demonstration of linear regression using Manim. + Linear regression is one of the most fundamental + machine learning algorithms.""" + ): + self.wait(1) + # Add title + title = Text("Linear Regression", font_size=36) + title.to_edge(UP) + self.wait_until_bookmark("title_appears") + self.play(FadeIn(title)) + + # Set up axes with voiceover + with self.voiceover( + """Let's start by setting up our coordinate system. + We'll use this to plot our data points and regression line.""" + ): + # Set up axes + axes = Axes( + x_range=(-1, 12), + y_range=(-1, 10), + x_length=10, + y_length=6, + axis_config={"include_numbers": True} + ) + axes.to_edge(DOWN) + self.wait_until_bookmark("axes_appear") + self.play(Create(axes)) + + # Add data points with voiceover + with self.voiceover( + """Now, let's generate some random data points that follow a linear pattern + with some added noise. 
These yellow dots represent + our training data.""" + ): + # Add data points + n_data_points = 30 + m = 0.75 # slope + y0 = 1 # intercept + + np.random.seed(42) # For reproducibility + points = [] + for _ in range(n_data_points): + x = np.random.uniform(2, 10) + y = y0 + m * x + 0.75 * np.random.normal(0, 1) + points.append(axes.c2p(x, y)) + + dots = VGroup(*[Dot(point, color=YELLOW) for point in points]) + self.wait_until_bookmark("dots_appear") + self.play(FadeIn(dots)) + + # Create line with voiceover + with self.voiceover( + """In linear regression, we want to find a line that best fits our data. + The equation of this line is y = mx + b, where m is the slope + and b is the y-intercept.""" + ): + # Create line + m_tracker = ValueTracker(m) + y0_tracker = ValueTracker(y0) + + def get_line(): + curr_m = m_tracker.get_value() + curr_y0 = y0_tracker.get_value() + + # Create a line manually + x_min, x_max = axes.x_range[0], axes.x_range[1] + line = Line( + start=axes.coords_to_point(x_min, curr_y0 + curr_m * x_min), + end=axes.coords_to_point(x_max, curr_y0 + curr_m * x_max), + color=BLUE + ) + return line + + line = get_line() + self.wait_until_bookmark("line_appear") + self.play(Create(line)) + + # Show slope with voiceover + with self.voiceover( + """Let's look at the slope parameter. The slope determines + how steep our line is. If we increase the slope, + the line becomes steeper.""" + ): + # Show slope + slope_label = MathTex(r"slope = ").next_to(title, DOWN) + slope_value = DecimalNumber(m) + slope_value.next_to(slope_label, RIGHT) + slope_value.add_updater(lambda d: d.set_value(m_tracker.get_value())) + + self.wait_until_bookmark("slope_appear") + self.play(Write(slope_label), Write(slope_value)) + + # Adjust slope + self.wait_until_bookmark("slope_change") + new_m = 1.5 + new_line = get_line() # Get current line + self.remove(line) # Remove old line + self.play( + m_tracker.animate.set_value(new_m), + Transform(line, new_line), # Transform to new line + run_time=2, + ) + line = new_line # Update line reference + + # Show intercept with voiceover + with self.voiceover( + """The y-intercept is where our line crosses the y-axis. + If we decrease the y-intercept, + the entire line shifts downward.""" + ): + # Show intercept + intercept_label = MathTex(r"y\text{-intercept} = ").next_to(slope_label, DOWN) + intercept_value = DecimalNumber(y0) + intercept_value.next_to(intercept_label, RIGHT) + intercept_value.add_updater(lambda d: d.set_value(y0_tracker.get_value())) + + self.wait_until_bookmark("intercept_appear") + self.play(Write(intercept_label), Write(intercept_value)) + + # Adjust intercept + self.wait_until_bookmark("intercept_change") + new_y0 = -2 + new_line = get_line() # Get current line + self.remove(line) # Remove old line + self.play( + y0_tracker.animate.set_value(new_y0), + Transform(line, new_line), # Transform to new line + run_time=2 + ) + line = new_line # Update line reference + + # Try different values with voiceover + with self.voiceover( + """In linear regression, we use an optimization algorithm to find + the values of slope and intercept that best fit our data. 
+ Let's try a few different combinations + to see how well they fit.""" + ): + # Try different values to show the fitting process + self.wait_until_bookmark("fit1") + new_m, new_y0 = 0.5, 0 + new_line = get_line() # Get current line + self.remove(line) # Remove old line + self.play( + m_tracker.animate.set_value(new_m), + y0_tracker.animate.set_value(new_y0), + Transform(line, new_line), # Transform to new line + run_time=1.5, + ) + line = new_line # Update line reference + + self.wait_until_bookmark("fit2") + new_m, new_y0 = 0.7, 0.8 + new_line = get_line() # Get current line + self.remove(line) # Remove old line + self.play( + m_tracker.animate.set_value(new_m), + y0_tracker.animate.set_value(new_y0), + Transform(line, new_line), # Transform to new line + run_time=1.5, + ) + line = new_line # Update line reference + + self.wait_until_bookmark("fit3") + new_m, new_y0 = 0.75, 1 + new_line = get_line() # Get current line + self.remove(line) # Remove old line + self.play( + m_tracker.animate.set_value(new_m), + y0_tracker.animate.set_value(new_y0), + Transform(line, new_line), # Transform to new line + run_time=1.5, + ) + line = new_line # Update line reference + + # Add prediction point with voiceover + with self.voiceover( + """Once we have our regression line, we can use it to make predictions. + For example, if x equals 8, + our model predicts that y will be approximately 7.""" + ): + # Add a prediction point + test_x = 8 + test_point = Dot(axes.c2p(test_x, y0 + m * test_x), color=RED, radius=0.1) + prediction_line = DashedLine( + axes.c2p(test_x, 0), + axes.c2p(test_x, y0 + m * test_x), + color=RED + ) + + self.wait_until_bookmark("prediction_appear") + self.play( + Create(test_point), + Create(prediction_line) + ) + + # Conclusion with voiceover + with self.voiceover( + """And that's a basic demonstration of linear regression. 
+ This simple model forms the foundation for many more complex + machine learning algorithms.""" + ): + self.wait(2) \ No newline at end of file From d1d97e0997f2a45139197c62abe550a10e6c8361 Mon Sep 17 00:00:00 2001 From: Pavlos Date: Sun, 9 Mar 2025 19:45:59 +0000 Subject: [PATCH 4/6] whisper cloud on by default --- linear_regression_voiceover.py | 2 -- manim_voiceover/services/base.py | 12 ++++++------ manim_voiceover/services/openai.py | 12 +++++++++++- pyproject.toml | 2 +- 4 files changed, 18 insertions(+), 10 deletions(-) diff --git a/linear_regression_voiceover.py b/linear_regression_voiceover.py index 2c4b0b6..3d0f770 100644 --- a/linear_regression_voiceover.py +++ b/linear_regression_voiceover.py @@ -11,8 +11,6 @@ def construct(self): service = OpenAIService( voice="alloy", # Available voices: alloy, echo, fable, onyx, nova, shimmer model="tts-1", # tts-1 or tts-1-hd - transcription_model="base", - use_cloud_whisper=True # This enables cloud-based Whisper ) self.set_speech_service(service) diff --git a/manim_voiceover/services/base.py b/manim_voiceover/services/base.py index 420d90c..dbece86 100644 --- a/manim_voiceover/services/base.py +++ b/manim_voiceover/services/base.py @@ -126,7 +126,7 @@ def _wrap_generate_from_text(self, text: str, path: str = None, **kwargs) -> dic original_audio = dict_["original_audio"] # Check whether word boundaries exist and if not run stt - if "word_boundaries" not in dict_ and self._whisper_model is not None: + if "word_boundaries" not in dict_ and (self._whisper_model is not None or self.use_cloud_whisper): if self.use_cloud_whisper: # Use OpenAI's cloud-based Whisper API try: @@ -246,14 +246,17 @@ def set_transcription(self, model: str = None, kwargs: dict = {}): model (str, optional): The Whisper model to use for transcription. Defaults to None. kwargs (dict, optional): Keyword arguments to pass to the transcribe() function. Defaults to {}. """ - if model != self.transcription_model: + self.transcription_model = model + self.transcription_kwargs = kwargs + + if model != self.transcription_model or self._whisper_model is None: if model is not None: if self.use_cloud_whisper: # For cloud-based Whisper, we don't need to load a local model # but we still need the OpenAI package try: import openai - self._whisper_model = True # Just a placeholder to indicate we have a model + self._whisper_model = True # Just a placeholder to indicate we can use cloud whisper except ImportError: logger.error( 'Missing packages. Run `pip install "manim-voiceover[openai]"` to use cloud-based Whisper.' @@ -278,9 +281,6 @@ def set_transcription(self, model: str = None, kwargs: dict = {}): else: self._whisper_model = None - self.transcription_model = model - self.transcription_kwargs = kwargs - def get_audio_basename(self, data: dict) -> str: dumped_data = json.dumps(data) data_hash = hashlib.sha256(dumped_data.encode("utf-8")).hexdigest() diff --git a/manim_voiceover/services/openai.py b/manim_voiceover/services/openai.py index 025fc40..00da069 100644 --- a/manim_voiceover/services/openai.py +++ b/manim_voiceover/services/openai.py @@ -50,6 +50,7 @@ def __init__( voice: str = "alloy", model: str = "tts-1-hd", transcription_model="base", + use_cloud_whisper: bool = True, **kwargs ): """ @@ -60,12 +61,21 @@ def __init__( model (str, optional): The TTS model to use. See the `API page `__ for all the available options. Defaults to ``"tts-1-hd"``. + transcription_model (str, optional): The Whisper model to use for transcription. + Defaults to ``"base"``. 
+ use_cloud_whisper (bool, optional): Whether to use OpenAI's cloud-based + Whisper API for transcription instead of the local model. Defaults to True. """ prompt_ask_missing_extras("openai", "openai", "OpenAIService") self.voice = voice self.model = model - SpeechService.__init__(self, transcription_model=transcription_model, **kwargs) + SpeechService.__init__( + self, + transcription_model=transcription_model, + use_cloud_whisper=use_cloud_whisper, + **kwargs + ) def generate_from_text( self, text: str, cache_dir: str = None, path: str = None, **kwargs diff --git a/pyproject.toml b/pyproject.toml index fe21952..4329b14 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "manim-voiceover" -version = "0.4.0" +version = "0.5.0" description = "Manim plugin for all things voiceover" authors = ["The Manim Community Developers "] license = "MIT" From af082268f3ba69cad82e4c34055b5148cca2814e Mon Sep 17 00:00:00 2001 From: Pavlos Date: Sun, 9 Mar 2025 20:09:50 +0000 Subject: [PATCH 5/6] robust whisper check --- manim_voiceover/services/base.py | 48 +++++++++++++++++++------------- pyproject.toml | 2 +- 2 files changed, 29 insertions(+), 21 deletions(-) diff --git a/manim_voiceover/services/base.py b/manim_voiceover/services/base.py index dbece86..2bf75a8 100644 --- a/manim_voiceover/services/base.py +++ b/manim_voiceover/services/base.py @@ -119,7 +119,7 @@ def __init__( self.additional_kwargs = kwargs def _wrap_generate_from_text(self, text: str, path: str = None, **kwargs) -> dict: - # Replace newlines with lines, reduce multiple consecutive spaces to single + # Replace newlines with spaces, reduce multiple consecutive spaces to single text = " ".join(text.split()) dict_ = self.generate_from_text(text, cache_dir=None, path=path, **kwargs) @@ -162,7 +162,6 @@ def _wrap_generate_from_text(self, text: str, path: str = None, **kwargs) -> dic word = word_obj.word start_time = word_obj.start - # Create a word boundary entry word_boundary = { "audio_offset": int(start_time * AUDIO_OFFSET_RESOLUTION), "text_offset": current_text_offset, @@ -193,24 +192,33 @@ def _wrap_generate_from_text(self, text: str, path: str = None, **kwargs) -> dic logger.error(f"Error using cloud-based Whisper: {str(e)}") return dict_ else: - # Use local Whisper model - transcription_result = self._whisper_model.transcribe( - str(Path(self.cache_dir) / original_audio), **self.transcription_kwargs - ) - - logger.info("Transcription: " + transcription_result.text) - - # For local Whisper model, use segments_to_dicts - if hasattr(transcription_result, 'segments_to_dicts'): - word_boundaries = timestamps_to_word_boundaries( - transcription_result.segments_to_dicts() - ) + # Use local Whisper model only if it's properly loaded + if self._whisper_model is not None and self._whisper_model is not False: + try: + transcription_result = self._whisper_model.transcribe( + str(Path(self.cache_dir) / original_audio), **self.transcription_kwargs + ) + + logger.info("Transcription: " + transcription_result.text) + + # For local Whisper model, use segments_to_dicts + if hasattr(transcription_result, 'segments_to_dicts'): + word_boundaries = timestamps_to_word_boundaries( + transcription_result.segments_to_dicts() + ) + dict_["word_boundaries"] = word_boundaries + dict_["transcribed_text"] = transcription_result.text + else: + logger.error("Local Whisper model returned unexpected result format.") + return dict_ + except Exception as e: + logger.error(f"Error using local Whisper model: {str(e)}") + return dict_ 
else: - # For OpenAI API response, we already have word boundaries - pass - - dict_["word_boundaries"] = word_boundaries - dict_["transcribed_text"] = transcription_result.text + logger.error( + "Local Whisper model is not available. Please set use_cloud_whisper=True or install the local model with `pip install \"manim-voiceover[transcribe]\"`." + ) + return dict_ # Audio callback self.audio_callback(original_audio, dict_, **kwargs) @@ -237,7 +245,7 @@ def _wrap_generate_from_text(self, text: str, path: str = None, **kwargs) -> dic Path(self.cache_dir) / DEFAULT_VOICEOVER_CACHE_JSON_FILENAME, dict_ ) return dict_ - + def set_transcription(self, model: str = None, kwargs: dict = {}): """Set the transcription model and keyword arguments to be passed to the transcribe() function. diff --git a/pyproject.toml b/pyproject.toml index 4329b14..7aa510d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "manim-voiceover" -version = "0.5.0" +version = "0.5.1" description = "Manim plugin for all things voiceover" authors = ["The Manim Community Developers "] license = "MIT" From 5c78d8287dd0242c19b6c0be5e2353840805d729 Mon Sep 17 00:00:00 2001 From: Pavlos Date: Sun, 9 Mar 2025 20:53:25 +0000 Subject: [PATCH 6/6] whisper cloud default --- linear_regression_voiceover.py | 2 +- manim_voiceover/__init__.py | 2 +- manim_voiceover/config.py | 2 +- manim_voiceover/services/base.py | 35 ++++++++++++++++-------------- manim_voiceover/services/openai.py | 1 + pyproject.toml | 2 +- 6 files changed, 24 insertions(+), 20 deletions(-) diff --git a/linear_regression_voiceover.py b/linear_regression_voiceover.py index 3d0f770..620069b 100644 --- a/linear_regression_voiceover.py +++ b/linear_regression_voiceover.py @@ -10,7 +10,7 @@ def construct(self): # Initialize OpenAI speech service with cloud whisper service = OpenAIService( voice="alloy", # Available voices: alloy, echo, fable, onyx, nova, shimmer - model="tts-1", # tts-1 or tts-1-hd + model="tts-1" ) self.set_speech_service(service) diff --git a/manim_voiceover/__init__.py b/manim_voiceover/__init__.py index 8103223..b7a63e5 100644 --- a/manim_voiceover/__init__.py +++ b/manim_voiceover/__init__.py @@ -8,4 +8,4 @@ # Add our custom config attribute if not hasattr(config, 'use_cloud_whisper'): - config.use_cloud_whisper = False + config.use_cloud_whisper = True diff --git a/manim_voiceover/config.py b/manim_voiceover/config.py index 1227492..c7b2533 100644 --- a/manim_voiceover/config.py +++ b/manim_voiceover/config.py @@ -3,4 +3,4 @@ import tempfile # Whether to use cloud-based Whisper API -config.use_cloud_whisper = False \ No newline at end of file +config.use_cloud_whisper = True \ No newline at end of file diff --git a/manim_voiceover/services/base.py b/manim_voiceover/services/base.py index 2bf75a8..0263380 100644 --- a/manim_voiceover/services/base.py +++ b/manim_voiceover/services/base.py @@ -81,25 +81,24 @@ def __init__( self, global_speed: float = 1.00, cache_dir: t.Optional[str] = None, - transcription_model: t.Optional[str] = None, + transcription_model: t.Optional[str] = "whisper-1", transcription_kwargs: dict = {}, - use_cloud_whisper: bool = False, + use_cloud_whisper: bool = True, **kwargs, ): - """ + """Initialize the speech service. + Args: - global_speed (float, optional): The speed at which to play the audio. - Defaults to 1.00. - cache_dir (str, optional): The directory to save the audio - files to. Defaults to ``voiceovers/``. 
- transcription_model (str, optional): The - `OpenAI Whisper model `_ - to use for transcription. Defaults to None. - transcription_kwargs (dict, optional): Keyword arguments to - pass to the transcribe() function. Defaults to {}. + global_speed (float, optional): The global speed factor for the + generated audio. Defaults to 1.00. + cache_dir (t.Optional[str], optional): The directory where the + generated audio will be cached. Defaults to None. + transcription_model (t.Optional[str], optional): The Whisper model + to use for transcription. Defaults to "whisper-1". + transcription_kwargs (dict, optional): Keyword arguments to pass + to the transcribe() function. Defaults to {}. use_cloud_whisper (bool, optional): Whether to use OpenAI's cloud-based - Whisper API for transcription instead of the local model. Useful for - ARM64 architectures where local Whisper may not work. Defaults to False. + Whisper API for transcription instead of the local model. Defaults to True. """ self.global_speed = global_speed @@ -159,9 +158,13 @@ def _wrap_generate_from_text(self, text: str, path: str = None, **kwargs) -> dic logger.info(f"Processing {len(transcription_result.words)} words") for word_obj in transcription_result.words: try: - word = word_obj.word + word = word_obj.word.strip() # Remove any leading/trailing whitespace start_time = word_obj.start + # Skip words that are just punctuation or empty + if not word or word.isspace() or (len(word) == 1 and not word.isalnum()): + continue + word_boundary = { "audio_offset": int(start_time * AUDIO_OFFSET_RESOLUTION), "text_offset": current_text_offset, @@ -193,7 +196,7 @@ def _wrap_generate_from_text(self, text: str, path: str = None, **kwargs) -> dic return dict_ else: # Use local Whisper model only if it's properly loaded - if self._whisper_model is not None and self._whisper_model is not False: + if self._whisper_model is not None and not isinstance(self._whisper_model, bool): try: transcription_result = self._whisper_model.transcribe( str(Path(self.cache_dir) / original_audio), **self.transcription_kwargs diff --git a/manim_voiceover/services/openai.py b/manim_voiceover/services/openai.py index 00da069..16b2e12 100644 --- a/manim_voiceover/services/openai.py +++ b/manim_voiceover/services/openai.py @@ -124,6 +124,7 @@ def generate_from_text( "input_text": text, "input_data": input_data, "original_audio": audio_path, + "final_audio": audio_path, } return json_dict diff --git a/pyproject.toml b/pyproject.toml index 7aa510d..ee47e59 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "manim-voiceover" -version = "0.5.1" +version = "0.5.5" description = "Manim plugin for all things voiceover" authors = ["The Manim Community Developers "] license = "MIT"