diff --git a/README.md b/README.md index 5d7e3ec..fb08980 100644 --- a/README.md +++ b/README.md @@ -15,6 +15,7 @@ Manim Voiceover is a [Manim](https://manim.community) plugin for all things voic - Record voiceovers with your microphone during rendering with a simple command line interface. - Develop animations with auto-generated AI voices from various free and proprietary services. - Per-word timing of animations, i.e. trigger animations at specific words in the voiceover, even for the recordings. This works thanks to [OpenAI Whisper](https://github.com/openai/whisper). +- **NEW**: Supports both local and cloud-based Whisper for ARM64 architectures (like Apple Silicon) where the local model may not work. Here is a demo: @@ -41,6 +42,38 @@ Currently supported TTS services (aside from the CLI that allows you to records [Check out the example gallery to get inspired.](https://voiceover.manim.community/en/latest/examples.html) +## Cloud Whisper Support + +For ARM64 architectures (like Apple Silicon Macs) or systems where installing the local Whisper model is problematic, you can now use OpenAI's cloud-based Whisper API for speech-to-text alignment: + +```bash +# Run with the provided script +python manim_cloud_whisper.py -pql examples/cloud_whisper_demo.py CloudWhisperDemo +``` + +Or enable it programmatically: + +```python +service = OpenAIService( + voice="alloy", + model="tts-1", + transcription_model="base", + use_cloud_whisper=True # This enables cloud-based Whisper +) +``` + +You can also set an environment variable to enable cloud-based Whisper: + +```bash +# Set the environment variable +export MANIM_VOICEOVER_USE_CLOUD_WHISPER=1 + +# Run Manim normally +manim -pql examples/cloud_whisper_demo.py CloudWhisperDemo +``` + +[Learn more about cloud-based Whisper in the documentation.](https://voiceover.manim.community/en/latest/cloud_whisper.html) + ## Translate Manim Voiceover can use machine translation services like [DeepL](https://www.deepl.com/) to translate voiceovers into other languages. 
[Check out the docs for more details.](https://voiceover.manim.community/en/latest/translate.html) \ No newline at end of file diff --git a/demo_openai_cloud_whisper.py b/demo_openai_cloud_whisper.py new file mode 100644 index 0000000..cac1abe --- /dev/null +++ b/demo_openai_cloud_whisper.py @@ -0,0 +1,110 @@ +from manim import * +from manim_voiceover.voiceover_scene import VoiceoverScene +from manim_voiceover.services.openai import OpenAIService + +class OpenAICloudWhisperDemo(VoiceoverScene): + def construct(self): + # Print the cloud whisper setting + print(f"Cloud Whisper enabled: {config.use_cloud_whisper}") + + # Initialize OpenAI speech service with cloud whisper + service = OpenAIService( + voice="alloy", # Available voices: alloy, echo, fable, onyx, nova, shimmer + model="tts-1", # tts-1 or tts-1-hd + transcription_model="base", + use_cloud_whisper=True # Use cloud-based Whisper + ) + self.set_speech_service(service) + + # Create a title + title = Text("OpenAI TTS + Cloud Whisper Demo", font_size=48) + self.play(Write(title)) + self.wait(1) + + # Move title to top + self.play(title.animate.to_edge(UP)) + + # Create a subtitle + subtitle = Text("Word-level alignment on ARM64 architectures", + font_size=36, + color=BLUE) + subtitle.next_to(title, DOWN) + self.play(FadeIn(subtitle)) + + # Demonstrate voiceover with bookmarks + with self.voiceover( + """This demonstration uses OpenAI's text-to-speech service + with cloud-based Whisper for + word-level alignment.""" + ) as tracker: + # Wait until the first bookmark + self.wait_until_bookmark("cloud_point") + + # Create and animate the cloud text + cloud_text = Text("☁️ Cloud-based Whisper", color=BLUE, font_size=36) + cloud_text.next_to(subtitle, DOWN, buff=1) + self.play(FadeIn(cloud_text)) + + # Wait until the second bookmark + self.wait_until_bookmark("alignment_point") + + # Create and animate the alignment text + alignment_text = Text("Perfect Word Timing", color=GREEN, font_size=36) + alignment_text.next_to(cloud_text, DOWN, buff=0.5) + self.play(FadeIn(alignment_text)) + + # Continue with demonstration + self.wait(1) + + # Show ARM64 compatibility + arm_title = Text("Works on Apple Silicon!", color=RED, font_size=36) + arm_title.next_to(alignment_text, DOWN, buff=1) + + with self.voiceover( + "This feature is especially useful for ARM64 architectures like your M4 Pro." + ): + self.play(FadeIn(arm_title)) + + # Final animation + self.wait(1) + + with self.voiceover( + "No local Whisper model required. Everything happens in the cloud!" 
+ ): + # Create a final animation + final_group = VGroup(title, subtitle, cloud_text, alignment_text, arm_title) + self.play( + final_group.animate.scale(0.8).to_edge(UP), + ) + + # Create a cloud icon + cloud = Text("☁️", font_size=120) + self.play(FadeIn(cloud)) + + # Add some particles around the cloud + particles = VGroup(*[ + Dot(radius=0.05, color=BLUE).move_to( + cloud.get_center() + np.array([ + np.random.uniform(-3, 3), + np.random.uniform(-2, 2), + 0 + ]) + ) + for _ in range(20) + ]) + self.play(FadeIn(particles)) + + # Animate the particles + self.play( + *[ + p.animate.shift(np.array([ + np.random.uniform(-1, 1), + np.random.uniform(-1, 1), + 0 + ])) + for p in particles + ], + run_time=2 + ) + + self.wait(2) \ No newline at end of file diff --git a/direct_openai_test.py b/direct_openai_test.py new file mode 100644 index 0000000..1f5e7ee --- /dev/null +++ b/direct_openai_test.py @@ -0,0 +1,107 @@ +import os +import json +from pathlib import Path +from dotenv import load_dotenv +import openai + +# Load environment variables from .env file +load_dotenv() + +# Create a temporary directory for audio files +temp_dir = Path("./temp_direct_test") +temp_dir.mkdir(exist_ok=True) + +# Constants for audio offset resolution (same as in manim-voiceover) +AUDIO_OFFSET_RESOLUTION = 1000 # 1000 = milliseconds + +print("=== Direct OpenAI API Test ===") + +# First, generate speech using OpenAI TTS +print("\nGenerating speech from text...") +text = "This is a test of the cloud-based Whisper feature." + +# Generate speech using OpenAI TTS +response = openai.audio.speech.create( + model="tts-1", + voice="alloy", + input=text +) + +audio_path = temp_dir / "direct_test.mp3" +response.stream_to_file(str(audio_path)) + +print(f"Speech generated and saved to {audio_path}") + +# Now, transcribe the audio using OpenAI Whisper API +print("\nTranscribing audio with word-level timestamps...") +with open(audio_path, "rb") as audio_file: + transcription = openai.audio.transcriptions.create( + model="whisper-1", + file=audio_file, + response_format="verbose_json", + timestamp_granularities=["word"] + ) + +# Print the raw response structure +print("\nRaw API Response Structure:") +print(f"Response type: {type(transcription)}") +print(f"Response attributes: {dir(transcription)}") +print(f"Has 'words' attribute: {hasattr(transcription, 'words')}") + +if hasattr(transcription, 'words'): + print(f"Words type: {type(transcription.words)}") + print(f"Words count: {len(transcription.words)}") + + # Try to access the first word + if len(transcription.words) > 0: + first_word = transcription.words[0] + print(f"First word type: {type(first_word)}") + print(f"First word attributes: {dir(first_word)}") + print(f"First word: {first_word.word if hasattr(first_word, 'word') else 'No word attribute'}") + print(f"First word start: {first_word.start if hasattr(first_word, 'start') else 'No start attribute'}") + +# Convert to word boundaries format used by manim-voiceover +print("\nConverting to word boundaries format...") +word_boundaries = [] +current_text_offset = 0 + +if hasattr(transcription, 'words'): + for word_obj in transcription.words: + try: + word = word_obj.word + start_time = word_obj.start + + # Create a word boundary entry + word_boundary = { + "audio_offset": int(start_time * AUDIO_OFFSET_RESOLUTION), + "text_offset": current_text_offset, + "word_length": len(word), + "text": word, + "boundary_type": "Word", + } + + word_boundaries.append(word_boundary) + current_text_offset += len(word) + 1 # +1 for space + + 
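+            # Note: text_offset assumes words separated by single spaces, which
+            # matches how manim-voiceover normalizes input text before caching
+            # (whitespace is collapsed via " ".join(text.split()) in
+            # SpeechService._wrap_generate_from_text).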
print(f"Added word boundary: {word} at {start_time}s") + except Exception as e: + print(f"Error processing word: {e}") + +print(f"\nCreated {len(word_boundaries)} word boundaries") + +# Create a cache file that manim-voiceover can use +cache_data = { + "input_text": text, + "input_data": {"input_text": text, "service": "openai"}, + "original_audio": audio_path.name, + "word_boundaries": word_boundaries, + "transcribed_text": transcription.text, + "final_audio": audio_path.name +} + +cache_file = temp_dir / "cache.json" +with open(cache_file, "w") as f: + json.dump([cache_data], f, indent=2) + +print(f"\nCreated cache file at {cache_file}") +print("\nTest completed!") \ No newline at end of file diff --git a/docs/source/cloud_whisper.md b/docs/source/cloud_whisper.md new file mode 100644 index 0000000..ffe32de --- /dev/null +++ b/docs/source/cloud_whisper.md @@ -0,0 +1,102 @@ +# Cloud-based Whisper Transcription + +## Overview + +Manim-voiceover now supports cloud-based transcription using OpenAI's Whisper API. This is particularly useful for: + +- ARM64 architectures (like Apple Silicon Macs) where installing the local Whisper model might be problematic +- Systems where you don't want to install the large Whisper model +- When you need higher accuracy transcription than the local model provides + +## Setup + +To use cloud-based Whisper, you'll need: + +1. An OpenAI API key +2. The OpenAI Python package + +Install the necessary dependencies: + +```bash +pip install "manim-voiceover[openai]" +``` + +## Usage + +### Command Line Option + +You can enable cloud-based Whisper for any Manim render by using the provided script: + +```bash +python manim_cloud_whisper.py -pql examples/cloud_whisper_demo.py CloudWhisperDemo +``` + +Or by setting an environment variable: + +```bash +# Set the environment variable +export MANIM_VOICEOVER_USE_CLOUD_WHISPER=1 + +# Run Manim normally +manim -pql examples/cloud_whisper_demo.py CloudWhisperDemo +``` + +### Programmatic Usage + +You can also enable cloud-based Whisper programmatically when initializing any speech service: + +```python +from manim_voiceover.services.azure import AzureService +from manim_voiceover.voiceover_scene import VoiceoverScene + +class MyScene(VoiceoverScene): + def construct(self): + # Use cloud-based Whisper for transcription + service = AzureService( + voice="en-US-GuyNeural", + transcription_model="base", # Still specify a model name + use_cloud_whisper=True # This enables cloud-based Whisper + ) + self.set_speech_service(service) + + # Rest of your scene... +``` + +## How It Works + +When cloud-based Whisper is enabled: + +1. The speech service will use OpenAI's API to transcribe your audio files +2. Word-level alignment will still work for bookmarks and animations +3. Your audio files will be sent to OpenAI's servers for transcription +4. An OpenAI API key is required and you'll be prompted to enter one if not found + +## Pricing + +Using cloud-based Whisper incurs costs based on OpenAI's pricing model: + +- Audio transcription is billed per minute of audio +- Check [OpenAI's pricing page](https://openai.com/pricing) for the most up-to-date information + +## Switching Between Local and Cloud + +You can use both local and cloud-based Whisper in the same project: + +- Use the `--use-cloud-whisper` flag when you need cloud-based transcription +- Omit the flag to use the local Whisper model + +## Troubleshooting + +### API Key Issues + +If you encounter errors related to the API key: + +1. 
Check that you have set the `OPENAI_API_KEY` environment variable +2. Alternatively, create a `.env` file in your project directory with `OPENAI_API_KEY=your_key_here` + +### Response Format Issues + +The cloud API might return a different format than expected. If you encounter errors: + +1. Check that you're using the latest version of manim-voiceover +2. Try using a different transcription model \ No newline at end of file diff --git a/examples/cloud_whisper_demo.py b/examples/cloud_whisper_demo.py new file mode 100644 index 0000000..99db975 --- /dev/null +++ b/examples/cloud_whisper_demo.py @@ -0,0 +1,77 @@ +from manim import * +from manim_voiceover.voiceover_scene import VoiceoverScene +from manim_voiceover.services.gtts import GTTSService +from manim_voiceover.services.openai import OpenAIService + +class CloudWhisperDemo(VoiceoverScene): + def construct(self): + # Initialize speech service with cloud whisper option + # Note: You can also run this with --use-cloud-whisper flag + # instead of setting use_cloud_whisper=True here + service = OpenAIService( + voice="alloy", # Available voices: alloy, echo, fable, onyx, nova, shimmer + model="tts-1", # tts-1 or tts-1-hd + transcription_model="base", # Model name is still required + use_cloud_whisper=True # This enables cloud-based Whisper + ) + self.set_speech_service(service) + + # Create a title + title = Text("Cloud Whisper Demo", font_size=48) + self.play(Write(title)) + self.wait() + + # Move title to top + self.play(title.animate.to_edge(UP)) + + # Demonstrate voiceover with bookmarks + with self.voiceover( + """This demonstration uses cloud-based Whisper + for word-level alignment. + """ + ) as tracker: + # Wait until the first bookmark + self.wait_until_bookmark("cloud_point") + + # Create and animate the cloud text + cloud_text = Text("☁️ Cloud-based", color=BLUE, font_size=36) + cloud_text.next_to(title, DOWN, buff=1) + self.play(FadeIn(cloud_text)) + + # Wait until the second bookmark + self.wait_until_bookmark("alignment_point") + + # Create and animate the alignment text + alignment_text = Text("Word-level Alignment", color=GREEN, font_size=36) + alignment_text.next_to(cloud_text, DOWN, buff=0.5) + self.play(FadeIn(alignment_text)) + + # Continue with demonstration + self.wait(1) + + # Show ARM64 compatibility + arm_title = Text("Works on ARM64 Architectures!", color=RED, font_size=36) + arm_title.next_to(alignment_text, DOWN, buff=1) + + with self.voiceover( + "This feature is especially useful for ARM64 architectures like Apple Silicon." + ): + self.play(FadeIn(arm_title)) + + # Final animation + self.wait(1) + + with self.voiceover( + "No local Whisper model required. Everything happens in the cloud!" 
+ ): + # Create a final animation + final_group = VGroup(title, cloud_text, alignment_text, arm_title) + self.play( + final_group.animate.scale(0.8).to_edge(UP), + ) + + # Create a cloud icon + cloud = Text("☁️", font_size=120) + self.play(FadeIn(cloud)) + + self.wait(2) \ No newline at end of file diff --git a/linear_regression_voiceover.py b/linear_regression_voiceover.py new file mode 100644 index 0000000..620069b --- /dev/null +++ b/linear_regression_voiceover.py @@ -0,0 +1,221 @@ +from manim import * +from manim_voiceover.voiceover_scene import VoiceoverScene +from manim_voiceover.services.openai import OpenAIService + +# Import the SimpleLinearRegression class from the example +import numpy as np + +class LinearRegressionWithVoiceover(VoiceoverScene): + def construct(self): + # Initialize OpenAI speech service with cloud whisper + service = OpenAIService( + voice="alloy", # Available voices: alloy, echo, fable, onyx, nova, shimmer + model="tts-1" + ) + self.set_speech_service(service) + + # Add title with voiceover introduction + with self.voiceover( + """Welcome to this demonstration of linear regression using Manim. + Linear regression is one of the most fundamental + machine learning algorithms.""" + ): + self.wait(1) + # Add title + title = Text("Linear Regression", font_size=36) + title.to_edge(UP) + self.wait_until_bookmark("title_appears") + self.play(FadeIn(title)) + + # Set up axes with voiceover + with self.voiceover( + """Let's start by setting up our coordinate system. + We'll use this to plot our data points and regression line.""" + ): + # Set up axes + axes = Axes( + x_range=(-1, 12), + y_range=(-1, 10), + x_length=10, + y_length=6, + axis_config={"include_numbers": True} + ) + axes.to_edge(DOWN) + self.wait_until_bookmark("axes_appear") + self.play(Create(axes)) + + # Add data points with voiceover + with self.voiceover( + """Now, let's generate some random data points that follow a linear pattern + with some added noise. These yellow dots represent + our training data.""" + ): + # Add data points + n_data_points = 30 + m = 0.75 # slope + y0 = 1 # intercept + + np.random.seed(42) # For reproducibility + points = [] + for _ in range(n_data_points): + x = np.random.uniform(2, 10) + y = y0 + m * x + 0.75 * np.random.normal(0, 1) + points.append(axes.c2p(x, y)) + + dots = VGroup(*[Dot(point, color=YELLOW) for point in points]) + self.wait_until_bookmark("dots_appear") + self.play(FadeIn(dots)) + + # Create line with voiceover + with self.voiceover( + """In linear regression, we want to find a line that best fits our data. + The equation of this line is y = mx + b, where m is the slope + and b is the y-intercept.""" + ): + # Create line + m_tracker = ValueTracker(m) + y0_tracker = ValueTracker(y0) + + def get_line(): + curr_m = m_tracker.get_value() + curr_y0 = y0_tracker.get_value() + + # Create a line manually + x_min, x_max = axes.x_range[0], axes.x_range[1] + line = Line( + start=axes.coords_to_point(x_min, curr_y0 + curr_m * x_min), + end=axes.coords_to_point(x_max, curr_y0 + curr_m * x_max), + color=BLUE + ) + return line + + line = get_line() + self.wait_until_bookmark("line_appear") + self.play(Create(line)) + + # Show slope with voiceover + with self.voiceover( + """Let's look at the slope parameter. The slope determines + how steep our line is. 
If we increase the slope, + the line becomes steeper.""" + ): + # Show slope + slope_label = MathTex(r"slope = ").next_to(title, DOWN) + slope_value = DecimalNumber(m) + slope_value.next_to(slope_label, RIGHT) + slope_value.add_updater(lambda d: d.set_value(m_tracker.get_value())) + + self.wait_until_bookmark("slope_appear") + self.play(Write(slope_label), Write(slope_value)) + + # Adjust slope + self.wait_until_bookmark("slope_change") + new_m = 1.5 + new_line = get_line() # Get current line + self.remove(line) # Remove old line + self.play( + m_tracker.animate.set_value(new_m), + Transform(line, new_line), # Transform to new line + run_time=2, + ) + line = new_line # Update line reference + + # Show intercept with voiceover + with self.voiceover( + """The y-intercept is where our line crosses the y-axis. + If we decrease the y-intercept, + the entire line shifts downward.""" + ): + # Show intercept + intercept_label = MathTex(r"y\text{-intercept} = ").next_to(slope_label, DOWN) + intercept_value = DecimalNumber(y0) + intercept_value.next_to(intercept_label, RIGHT) + intercept_value.add_updater(lambda d: d.set_value(y0_tracker.get_value())) + + self.wait_until_bookmark("intercept_appear") + self.play(Write(intercept_label), Write(intercept_value)) + + # Adjust intercept + self.wait_until_bookmark("intercept_change") + new_y0 = -2 + new_line = get_line() # Get current line + self.remove(line) # Remove old line + self.play( + y0_tracker.animate.set_value(new_y0), + Transform(line, new_line), # Transform to new line + run_time=2 + ) + line = new_line # Update line reference + + # Try different values with voiceover + with self.voiceover( + """In linear regression, we use an optimization algorithm to find + the values of slope and intercept that best fit our data. + Let's try a few different combinations + to see how well they fit.""" + ): + # Try different values to show the fitting process + self.wait_until_bookmark("fit1") + new_m, new_y0 = 0.5, 0 + new_line = get_line() # Get current line + self.remove(line) # Remove old line + self.play( + m_tracker.animate.set_value(new_m), + y0_tracker.animate.set_value(new_y0), + Transform(line, new_line), # Transform to new line + run_time=1.5, + ) + line = new_line # Update line reference + + self.wait_until_bookmark("fit2") + new_m, new_y0 = 0.7, 0.8 + new_line = get_line() # Get current line + self.remove(line) # Remove old line + self.play( + m_tracker.animate.set_value(new_m), + y0_tracker.animate.set_value(new_y0), + Transform(line, new_line), # Transform to new line + run_time=1.5, + ) + line = new_line # Update line reference + + self.wait_until_bookmark("fit3") + new_m, new_y0 = 0.75, 1 + new_line = get_line() # Get current line + self.remove(line) # Remove old line + self.play( + m_tracker.animate.set_value(new_m), + y0_tracker.animate.set_value(new_y0), + Transform(line, new_line), # Transform to new line + run_time=1.5, + ) + line = new_line # Update line reference + + # Add prediction point with voiceover + with self.voiceover( + """Once we have our regression line, we can use it to make predictions. 
+ For example, if x equals 8, + our model predicts that y will be approximately 7.""" + ): + # Add a prediction point + test_x = 8 + test_point = Dot(axes.c2p(test_x, y0 + m * test_x), color=RED, radius=0.1) + prediction_line = DashedLine( + axes.c2p(test_x, 0), + axes.c2p(test_x, y0 + m * test_x), + color=RED + ) + + self.wait_until_bookmark("prediction_appear") + self.play( + Create(test_point), + Create(prediction_line) + ) + + # Conclusion with voiceover + with self.voiceover( + """And that's a basic demonstration of linear regression. + This simple model forms the foundation for many more complex + machine learning algorithms.""" + ): + self.wait(2) \ No newline at end of file diff --git a/manim_cloud_whisper.py b/manim_cloud_whisper.py new file mode 100755 index 0000000..3c451a5 --- /dev/null +++ b/manim_cloud_whisper.py @@ -0,0 +1,30 @@ +#!/usr/bin/env python3 +""" +Custom entry point for running Manim with cloud-based Whisper. +""" + +import sys +import os +import subprocess + +def main(): + """Run Manim with cloud-based Whisper enabled.""" + # Set the environment variable to enable cloud-based Whisper + os.environ["MANIM_VOICEOVER_USE_CLOUD_WHISPER"] = "1" + + # Get the Manim command arguments + args = sys.argv[1:] + + # Run the Manim command + cmd = ["manim"] + args + print(f"Running: {' '.join(cmd)}") + print("Cloud-based Whisper enabled via environment variable.") + + # Execute the command + result = subprocess.run(cmd) + + # Return the exit code + return result.returncode + +if __name__ == "__main__": + sys.exit(main()) \ No newline at end of file diff --git a/manim_voiceover/__init__.py b/manim_voiceover/__init__.py index d2bd2f1..b7a63e5 100644 --- a/manim_voiceover/__init__.py +++ b/manim_voiceover/__init__.py @@ -1,6 +1,11 @@ from manim_voiceover.tracker import VoiceoverTracker from manim_voiceover.voiceover_scene import VoiceoverScene +from manim import config import pkg_resources __version__: str = pkg_resources.get_distribution(__name__).version + +# Add our custom config attribute +if not hasattr(config, 'use_cloud_whisper'): + config.use_cloud_whisper = True diff --git a/manim_voiceover/cli/__init__.py b/manim_voiceover/cli/__init__.py new file mode 100644 index 0000000..77a181f --- /dev/null +++ b/manim_voiceover/cli/__init__.py @@ -0,0 +1,3 @@ +# This file initializes the CLI module +from manim_voiceover.cli.config import add_voiceover_args +from manim_voiceover.cli.main import patched_render_command, patched_main \ No newline at end of file diff --git a/manim_voiceover/cli/config.py b/manim_voiceover/cli/config.py new file mode 100644 index 0000000..dcf56d3 --- /dev/null +++ b/manim_voiceover/cli/config.py @@ -0,0 +1,36 @@ +""" +Configuration support for manim-voiceover CLI +""" + +import os +from manim.utils.file_ops import guarantee_existence +from manim._config import config_file, library_wide_cfg_file, ManimConfig +import manim.config as manim_config + +# The Manim config system doesn't provide an easy way to extend the CLI from plugins +# So instead, we'll monkey patch the ManimConfig class to add our custom flag +original_digest_args = manim_config.ManimConfig.digest_args + +def patched_digest_args(self, args, namespace=''): + # Call original method + original_digest_args(self, args, namespace) + + # Handle our custom CLI flags + if hasattr(args, 'use_cloud_whisper'): + self.use_cloud_whisper = args.use_cloud_whisper + +# Apply the monkey patch +manim_config.ManimConfig.digest_args = patched_digest_args + +# Make sure the config object has our flag +if not 
hasattr(manim_config.config, 'use_cloud_whisper'): + manim_config.config.use_cloud_whisper = False + +def add_voiceover_args(parser): + """Add manim-voiceover specific arguments to the parser.""" + whisper_group = parser.add_argument_group("Manim Voiceover") + whisper_group.add_argument( + "--use-cloud-whisper", + action="store_true", + help="Use OpenAI's cloud Whisper API instead of local model for transcription", + ) \ No newline at end of file diff --git a/manim_voiceover/cli/main.py b/manim_voiceover/cli/main.py new file mode 100644 index 0000000..4649c08 --- /dev/null +++ b/manim_voiceover/cli/main.py @@ -0,0 +1,55 @@ +""" +CLI entrypoint for manim-voiceover +""" + +import inspect +import os +import sys +from pathlib import Path + +from manim.cli.render.commands import main_command as original_main_command +from manim.cli.render.commands import render + +from manim_voiceover.cli.config import add_voiceover_args + +# Store the original command function +original_render_command = render.command + +def patched_render_command(function): + """Patch the render command to add our custom arguments.""" + # Call the original decorated function + cmd = original_render_command(function) + + # Add our parameter to the command + from click import option + cmd = option('--use-cloud-whisper', + is_flag=True, + help='Use OpenAI cloud API for Whisper instead of local model')(cmd) + + return cmd + +def patched_main(): + """Entry point for renderer with cloud whisper support.""" + # Find the render subcommand in the argument parser + import argparse + + # Create a dummy parser just to intercept the args + dummy_parser = argparse.ArgumentParser(add_help=False) + dummy_parser.add_argument("--use-cloud-whisper", action="store_true") + + # Parse known args to get our flags + args, unknown = dummy_parser.parse_known_args() + + # Set the global config value + from manim.config import config + if hasattr(args, 'use_cloud_whisper') and args.use_cloud_whisper: + config.use_cloud_whisper = True + + # Call the original main command + return original_main_command() + +# Apply our monkey patch +render.command = patched_render_command + +# No need for this line since we're directly importing and patching in __init__.py +# main_command = patched_main \ No newline at end of file diff --git a/manim_voiceover/config.py b/manim_voiceover/config.py new file mode 100644 index 0000000..c7b2533 --- /dev/null +++ b/manim_voiceover/config.py @@ -0,0 +1,6 @@ +from manim import config +from pathlib import Path +import tempfile + +# Whether to use cloud-based Whisper API +config.use_cloud_whisper = True \ No newline at end of file diff --git a/manim_voiceover/services/base.py b/manim_voiceover/services/base.py index fc6c898..0263380 100644 --- a/manim_voiceover/services/base.py +++ b/manim_voiceover/services/base.py @@ -23,13 +23,31 @@ def timestamps_to_word_boundaries(segments): word_boundaries = [] current_text_offset = 0 - for segment in segments: - for dict_ in segment["words"]: + + # Check if we have direct word-level timestamps (from OpenAI API) + if isinstance(segments, list) and len(segments) > 0 and "words" in segments[0]: + # Process segment-level timestamps + for segment in segments: + for dict_ in segment["words"]: + word = dict_["word"] + word_boundaries.append( + { + "audio_offset": int(dict_["start"] * AUDIO_OFFSET_RESOLUTION), + "text_offset": current_text_offset, + "word_length": len(word), + "text": word, + "boundary_type": "Word", + } + ) + current_text_offset += len(word) + # Check if we have direct word-level 
timestamps in a flat structure (from OpenAI API) + elif isinstance(segments, list) and len(segments) > 0 and isinstance(segments[0], dict) and "word" in segments[0]: + # Process word-level timestamps directly + for dict_ in segments: word = dict_["word"] word_boundaries.append( { "audio_offset": int(dict_["start"] * AUDIO_OFFSET_RESOLUTION), - # "duration_milliseconds": 0, "text_offset": current_text_offset, "word_length": len(word), "text": word, @@ -37,9 +55,21 @@ def timestamps_to_word_boundaries(segments): } ) current_text_offset += len(word) - # If word is not punctuation, add a space - # if word not in [".", ",", "!", "?", ";", ":", "(", ")"]: - # current_text_offset += 1 + else: + # Original implementation for local Whisper + for segment in segments: + for dict_ in segment["words"]: + word = dict_["word"] + word_boundaries.append( + { + "audio_offset": int(dict_["start"] * AUDIO_OFFSET_RESOLUTION), + "text_offset": current_text_offset, + "word_length": len(word), + "text": word, + "boundary_type": "Word", + } + ) + current_text_offset += len(word) return word_boundaries @@ -51,21 +81,24 @@ def __init__( self, global_speed: float = 1.00, cache_dir: t.Optional[str] = None, - transcription_model: t.Optional[str] = None, + transcription_model: t.Optional[str] = "whisper-1", transcription_kwargs: dict = {}, + use_cloud_whisper: bool = True, **kwargs, ): - """ + """Initialize the speech service. + Args: - global_speed (float, optional): The speed at which to play the audio. - Defaults to 1.00. - cache_dir (str, optional): The directory to save the audio - files to. Defaults to ``voiceovers/``. - transcription_model (str, optional): The - `OpenAI Whisper model `_ - to use for transcription. Defaults to None. - transcription_kwargs (dict, optional): Keyword arguments to - pass to the transcribe() function. Defaults to {}. + global_speed (float, optional): The global speed factor for the + generated audio. Defaults to 1.00. + cache_dir (t.Optional[str], optional): The directory where the + generated audio will be cached. Defaults to None. + transcription_model (t.Optional[str], optional): The Whisper model + to use for transcription. Defaults to "whisper-1". + transcription_kwargs (dict, optional): Keyword arguments to pass + to the transcribe() function. Defaults to {}. + use_cloud_whisper (bool, optional): Whether to use OpenAI's cloud-based + Whisper API for transcription instead of the local model. Defaults to True. 
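+
+        Example (usage sketch based on the README; subclasses such as
+        ``OpenAIService`` from ``manim_voiceover.services.openai`` forward
+        these arguments)::
+
+            service = OpenAIService(
+                voice="alloy",
+                model="tts-1",
+                transcription_model="base",
+                use_cloud_whisper=True,  # transcribe via the OpenAI API
+            )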
""" self.global_speed = global_speed @@ -79,28 +112,116 @@ def __init__( self.transcription_model = None self._whisper_model = None + self.use_cloud_whisper = use_cloud_whisper self.set_transcription(model=transcription_model, kwargs=transcription_kwargs) self.additional_kwargs = kwargs def _wrap_generate_from_text(self, text: str, path: str = None, **kwargs) -> dict: - # Replace newlines with lines, reduce multiple consecutive spaces to single + # Replace newlines with spaces, reduce multiple consecutive spaces to single text = " ".join(text.split()) dict_ = self.generate_from_text(text, cache_dir=None, path=path, **kwargs) original_audio = dict_["original_audio"] # Check whether word boundaries exist and if not run stt - if "word_boundaries" not in dict_ and self._whisper_model is not None: - transcription_result = self._whisper_model.transcribe( - str(Path(self.cache_dir) / original_audio), **self.transcription_kwargs - ) - logger.info("Transcription: " + transcription_result.text) - word_boundaries = timestamps_to_word_boundaries( - transcription_result.segments_to_dicts() - ) - dict_["word_boundaries"] = word_boundaries - dict_["transcribed_text"] = transcription_result.text + if "word_boundaries" not in dict_ and (self._whisper_model is not None or self.use_cloud_whisper): + if self.use_cloud_whisper: + # Use OpenAI's cloud-based Whisper API + try: + import openai + from dotenv import find_dotenv, load_dotenv + load_dotenv(find_dotenv(usecwd=True)) + + if os.getenv("OPENAI_API_KEY") is None: + from manim_voiceover.services.openai import create_dotenv_openai + create_dotenv_openai() + + audio_file_path = str(Path(self.cache_dir) / original_audio) + with open(audio_file_path, "rb") as audio_file: + transcription_result = openai.audio.transcriptions.create( + model="whisper-1", + file=audio_file, + response_format="verbose_json", + timestamp_granularities=["word"], + **self.transcription_kwargs + ) + + # Convert the word timestamps to word boundaries directly + logger.info("Cloud Transcription: " + transcription_result.text) + logger.info(f"Word count: {len(transcription_result.words) if hasattr(transcription_result, 'words') else 0}") + + word_boundaries = [] + current_text_offset = 0 + + if hasattr(transcription_result, 'words') and transcription_result.words: + logger.info(f"Processing {len(transcription_result.words)} words") + for word_obj in transcription_result.words: + try: + word = word_obj.word.strip() # Remove any leading/trailing whitespace + start_time = word_obj.start + + # Skip words that are just punctuation or empty + if not word or word.isspace() or (len(word) == 1 and not word.isalnum()): + continue + + word_boundary = { + "audio_offset": int(start_time * AUDIO_OFFSET_RESOLUTION), + "text_offset": current_text_offset, + "word_length": len(word), + "text": word, + "boundary_type": "Word", + } + + word_boundaries.append(word_boundary) + current_text_offset += len(word) + 1 # +1 for space + + logger.info(f"Added word boundary: {word} at {start_time}s") + except Exception as e: + logger.error(f"Error processing word: {e}") + else: + logger.warning("No words found in transcription result") + + logger.info(f"Created {len(word_boundaries)} word boundaries") + dict_["word_boundaries"] = word_boundaries + dict_["transcribed_text"] = transcription_result.text + + except ImportError: + logger.error( + 'Missing packages. Run `pip install "manim-voiceover[openai]"` to use cloud-based Whisper.' 
+ ) + return dict_ + except Exception as e: + logger.error(f"Error using cloud-based Whisper: {str(e)}") + return dict_ + else: + # Use local Whisper model only if it's properly loaded + if self._whisper_model is not None and not isinstance(self._whisper_model, bool): + try: + transcription_result = self._whisper_model.transcribe( + str(Path(self.cache_dir) / original_audio), **self.transcription_kwargs + ) + + logger.info("Transcription: " + transcription_result.text) + + # For local Whisper model, use segments_to_dicts + if hasattr(transcription_result, 'segments_to_dicts'): + word_boundaries = timestamps_to_word_boundaries( + transcription_result.segments_to_dicts() + ) + dict_["word_boundaries"] = word_boundaries + dict_["transcribed_text"] = transcription_result.text + else: + logger.error("Local Whisper model returned unexpected result format.") + return dict_ + except Exception as e: + logger.error(f"Error using local Whisper model: {str(e)}") + return dict_ + else: + logger.error( + "Local Whisper model is not available. Please set use_cloud_whisper=True or install the local model with `pip install \"manim-voiceover[transcribe]\"`." + ) + return dict_ # Audio callback self.audio_callback(original_audio, dict_, **kwargs) @@ -127,7 +248,7 @@ def _wrap_generate_from_text(self, text: str, path: str = None, **kwargs) -> dic Path(self.cache_dir) / DEFAULT_VOICEOVER_CACHE_JSON_FILENAME, dict_ ) return dict_ - + def set_transcription(self, model: str = None, kwargs: dict = {}): """Set the transcription model and keyword arguments to be passed to the transcribe() function. @@ -136,27 +257,41 @@ def set_transcription(self, model: str = None, kwargs: dict = {}): model (str, optional): The Whisper model to use for transcription. Defaults to None. kwargs (dict, optional): Keyword arguments to pass to the transcribe() function. Defaults to {}. """ - if model != self.transcription_model: + self.transcription_model = model + self.transcription_kwargs = kwargs + + if model != self.transcription_model or self._whisper_model is None: if model is not None: - try: - import whisper as __tmp - import stable_whisper as whisper - except ImportError: - logger.error( - 'Missing packages. Run `pip install "manim-voiceover[transcribe]"` to be able to transcribe voiceovers.' - ) + if self.use_cloud_whisper: + # For cloud-based Whisper, we don't need to load a local model + # but we still need the OpenAI package + try: + import openai + self._whisper_model = True # Just a placeholder to indicate we can use cloud whisper + except ImportError: + logger.error( + 'Missing packages. Run `pip install "manim-voiceover[openai]"` to use cloud-based Whisper.' + ) + self._whisper_model = None + else: + # Load local Whisper model + try: + import whisper as __tmp + import stable_whisper as whisper + except ImportError: + logger.error( + 'Missing packages. Run `pip install "manim-voiceover[transcribe]"` to be able to transcribe voiceovers.' 
+ ) - prompt_ask_missing_extras( - ["whisper", "stable_whisper"], - "transcribe", - "SpeechService.set_transcription()", - ) - self._whisper_model = whisper.load_model(model) + prompt_ask_missing_extras( + ["whisper", "stable_whisper"], + "transcribe", + "SpeechService.set_transcription()", + ) + self._whisper_model = whisper.load_model(model) else: self._whisper_model = None - self.transcription_kwargs = kwargs - def get_audio_basename(self, data: dict) -> str: dumped_data = json.dumps(data) data_hash = hashlib.sha256(dumped_data.encode("utf-8")).hexdigest() @@ -184,7 +319,7 @@ def generate_from_text( raise NotImplementedError def get_cached_result(self, input_data, cache_dir): - json_path = os.path.join(cache_dir / DEFAULT_VOICEOVER_CACHE_JSON_FILENAME) + json_path = os.path.join(cache_dir, DEFAULT_VOICEOVER_CACHE_JSON_FILENAME) if os.path.exists(json_path): json_data = json.load(open(json_path, "r")) for entry in json_data: diff --git a/manim_voiceover/services/openai.py b/manim_voiceover/services/openai.py index 025fc40..16b2e12 100644 --- a/manim_voiceover/services/openai.py +++ b/manim_voiceover/services/openai.py @@ -50,6 +50,7 @@ def __init__( voice: str = "alloy", model: str = "tts-1-hd", transcription_model="base", + use_cloud_whisper: bool = True, **kwargs ): """ @@ -60,12 +61,21 @@ def __init__( model (str, optional): The TTS model to use. See the `API page `__ for all the available options. Defaults to ``"tts-1-hd"``. + transcription_model (str, optional): The Whisper model to use for transcription. + Defaults to ``"base"``. + use_cloud_whisper (bool, optional): Whether to use OpenAI's cloud-based + Whisper API for transcription instead of the local model. Defaults to True. """ prompt_ask_missing_extras("openai", "openai", "OpenAIService") self.voice = voice self.model = model - SpeechService.__init__(self, transcription_model=transcription_model, **kwargs) + SpeechService.__init__( + self, + transcription_model=transcription_model, + use_cloud_whisper=use_cloud_whisper, + **kwargs + ) def generate_from_text( self, text: str, cache_dir: str = None, path: str = None, **kwargs @@ -114,6 +124,7 @@ def generate_from_text( "input_text": text, "input_data": input_data, "original_audio": audio_path, + "final_audio": audio_path, } return json_dict diff --git a/manim_voiceover/voiceover_scene.py b/manim_voiceover/voiceover_scene.py index 8030aa6..7d23af8 100644 --- a/manim_voiceover/voiceover_scene.py +++ b/manim_voiceover/voiceover_scene.py @@ -4,6 +4,7 @@ from typing import Optional, Generator import re import typing as t +import os from manim import Scene, config from manim_voiceover.services.base import SpeechService @@ -35,6 +36,15 @@ def set_speech_service( create_subcaption (bool, optional): Whether to create subcaptions for the scene. Defaults to True. If `config.save_last_frame` is True, the argument is ignored and no subcaptions will be created. 
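+
+            Note: if the environment variable ``MANIM_VOICEOVER_USE_CLOUD_WHISPER``
+            is set to ``"1"``, ``use_cloud_whisper`` is enabled on the given
+            speech service regardless of how it was constructed (see the check
+            below).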
""" + # Check for environment variable to enable cloud-based Whisper + if os.environ.get("MANIM_VOICEOVER_USE_CLOUD_WHISPER") == "1": + speech_service.use_cloud_whisper = True + print("Cloud-based Whisper enabled via environment variable.") + + # Set use_cloud_whisper from the config if it has the attribute + elif hasattr(config, "use_cloud_whisper"): + speech_service.use_cloud_whisper = config.use_cloud_whisper + self.speech_service = speech_service self.current_tracker = None if config.save_last_frame: diff --git a/pyproject.toml b/pyproject.toml index 911f17c..ee47e59 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "manim-voiceover" -version = "0.3.7" +version = "0.5.5" description = "Manim plugin for all things voiceover" authors = ["The Manim Community Developers "] license = "MIT" @@ -66,7 +66,7 @@ elevenlabs = {version = "^0.2.27", optional = true} [tool.poetry.extras] azure = ["azure-cognitiveservices-speech"] gtts = ["gTTS"] -openai = ["openai"] +openai = ["openai", "python-dotenv"] pyttsx3 = ["pyttsx3"] # coqui = ["torch", "TTS"] coqui = [] # Removed TTS as deps for now @@ -83,6 +83,7 @@ all = [ "PyAudio", "pynput", "openai", + "python-dotenv", "deepl", "openai-whisper", "stable-ts", diff --git a/speech-to-text.md b/speech-to-text.md new file mode 100644 index 0000000..beda2e8 --- /dev/null +++ b/speech-to-text.md @@ -0,0 +1,369 @@ +Speech to text +============== + +Learn how to turn audio into text. + +Overview +-------- + +The Audio API provides two speech to text endpoints, `transcriptions` and `translations`, based on our state-of-the-art open source large-v2 [Whisper model](https://openai.com/blog/whisper/). They can be used to: + +* Transcribe audio into whatever language the audio is in. +* Translate and transcribe the audio into english. + +File uploads are currently limited to 25 MB and the following input file types are supported: `mp3`, `mp4`, `mpeg`, `mpga`, `m4a`, `wav`, and `webm`. + +Quickstart +---------- + +### Transcriptions + +The transcriptions API takes as input the audio file you want to transcribe and the desired output file format for the transcription of the audio. We currently support multiple input and output file formats. + +Transcribe audio + +```javascript +import fs from "fs"; +import OpenAI from "openai"; + +const openai = new OpenAI(); + +const transcription = await openai.audio.transcriptions.create({ + file: fs.createReadStream("/path/to/file/audio.mp3"), + model: "whisper-1", +}); + +console.log(transcription.text); +``` + +```python +from openai import OpenAI +client = OpenAI() + +audio_file= open("/path/to/file/audio.mp3", "rb") +transcription = client.audio.transcriptions.create( + model="whisper-1", + file=audio_file +) + +print(transcription.text) +``` + +```bash +curl --request POST \ + --url https://api.openai.com/v1/audio/transcriptions \ + --header "Authorization: Bearer $OPENAI_API_KEY" \ + --header 'Content-Type: multipart/form-data' \ + --form file=@/path/to/file/audio.mp3 \ + --form model=whisper-1 +``` + +By default, the response type will be json with the raw text included. + +{ "text": "Imagine the wildest idea that you've ever had, and you're curious about how it might scale to something that's a 100, a 1,000 times bigger. .... } + +The Audio API also allows you to set additional parameters in a request. 
For example, if you want to set the `response_format` as `text`, your request would look like the following: + +Additional options + +```javascript +import fs from "fs"; +import OpenAI from "openai"; + +const openai = new OpenAI(); + +const transcription = await openai.audio.transcriptions.create({ + file: fs.createReadStream("/path/to/file/speech.mp3"), + model: "whisper-1", + response_format: "text", +}); + +console.log(transcription.text); +``` + +```python +from openai import OpenAI +client = OpenAI() + +audio_file = open("/path/to/file/speech.mp3", "rb") +transcription = client.audio.transcriptions.create( + model="whisper-1", + file=audio_file, + response_format="text" +) + +print(transcription.text) +``` + +```bash +curl --request POST \ + --url https://api.openai.com/v1/audio/transcriptions \ + --header "Authorization: Bearer $OPENAI_API_KEY" \ + --header 'Content-Type: multipart/form-data' \ + --form file=@/path/to/file/speech.mp3 \ + --form model=whisper-1 \ + --form response_format=text +``` + +The [API Reference](/docs/api-reference/audio) includes the full list of available parameters. + +### Translations + +The translations API takes as input the audio file in any of the supported languages and transcribes, if necessary, the audio into English. This differs from our /Transcriptions endpoint since the output is not in the original input language and is instead translated to English text. + +Translate audio + +```javascript +import fs from "fs"; +import OpenAI from "openai"; + +const openai = new OpenAI(); + +const transcription = await openai.audio.translations.create({ + file: fs.createReadStream("/path/to/file/german.mp3"), + model: "whisper-1", +}); + +console.log(transcription.text); +``` + +```python +from openai import OpenAI +client = OpenAI() + +audio_file = open("/path/to/file/german.mp3", "rb") +transcription = client.audio.translations.create( + model="whisper-1", + file=audio_file, +) + +print(transcription.text) +``` + +```bash +curl --request POST \ + --url https://api.openai.com/v1/audio/translations \ + --header "Authorization: Bearer $OPENAI_API_KEY" \ + --header 'Content-Type: multipart/form-data' \ + --form file=@/path/to/file/german.mp3 \ + --form model=whisper-1 \ +``` + +In this case, the inputted audio was german and the outputted text looks like: + +Hello, my name is Wolfgang and I come from Germany. Where are you heading today? + +We only support translation into English at this time. + +Supported languages +------------------- + +We currently [support the following languages](https://github.com/openai/whisper#available-models-and-languages) through both the `transcriptions` and `translations` endpoint: + +Afrikaans, Arabic, Armenian, Azerbaijani, Belarusian, Bosnian, Bulgarian, Catalan, Chinese, Croatian, Czech, Danish, Dutch, English, Estonian, Finnish, French, Galician, German, Greek, Hebrew, Hindi, Hungarian, Icelandic, Indonesian, Italian, Japanese, Kannada, Kazakh, Korean, Latvian, Lithuanian, Macedonian, Malay, Marathi, Maori, Nepali, Norwegian, Persian, Polish, Portuguese, Romanian, Russian, Serbian, Slovak, Slovenian, Spanish, Swahili, Swedish, Tagalog, Tamil, Thai, Turkish, Ukrainian, Urdu, Vietnamese, and Welsh. + +While the underlying model was trained on 98 languages, we only list the languages that exceeded <50% [word error rate](https://en.wikipedia.org/wiki/Word_error_rate) (WER) which is an industry standard benchmark for speech to text model accuracy. 
The model will return results for languages not listed above but the quality will be low. + +Timestamps +---------- + +By default, the Whisper API will output a transcript of the provided audio in text. The [`timestamp_granularities[]` parameter](/docs/api-reference/audio/createTranscription#audio-createtranscription-timestamp_granularities) enables a more structured and timestamped json output format, with timestamps at the segment, word level, or both. This enables word-level precision for transcripts and video edits, which allows for the removal of specific frames tied to individual words. + +Timestamp options + +```javascript +import fs from "fs"; +import OpenAI from "openai"; + +const openai = new OpenAI(); + +const transcription = await openai.audio.transcriptions.create({ + file: fs.createReadStream("audio.mp3"), + model: "whisper-1", + response_format: "verbose_json", + timestamp_granularities: ["word"] +}); + +console.log(transcription.words); +``` + +```python +from openai import OpenAI +client = OpenAI() + +audio_file = open("/path/to/file/speech.mp3", "rb") +transcription = client.audio.transcriptions.create( + file=audio_file, + model="whisper-1", + response_format="verbose_json", + timestamp_granularities=["word"] +) + +print(transcription.words) +``` + +```bash +curl https://api.openai.com/v1/audio/transcriptions \ + -H "Authorization: Bearer $OPENAI_API_KEY" \ + -H "Content-Type: multipart/form-data" \ + -F file="@/path/to/file/audio.mp3" \ + -F "timestamp_granularities[]=word" \ + -F model="whisper-1" \ + -F response_format="verbose_json" +``` + +Longer inputs +------------- + +By default, the Whisper API only supports files that are less than 25 MB. If you have an audio file that is longer than that, you will need to break it up into chunks of 25 MB's or less or used a compressed audio format. To get the best performance, we suggest that you avoid breaking the audio up mid-sentence as this may cause some context to be lost. + +One way to handle this is to use the [PyDub open source Python package](https://github.com/jiaaro/pydub) to split the audio: + +```python +from pydub import AudioSegment + +song = AudioSegment.from_mp3("good_morning.mp3") + +# PyDub handles time in milliseconds +ten_minutes = 10 * 60 * 1000 + +first_10_minutes = song[:ten_minutes] + +first_10_minutes.export("good_morning_10.mp3", format="mp3") +``` + +_OpenAI makes no guarantees about the usability or security of 3rd party software like PyDub._ + +Prompting +--------- + +You can use a [prompt](/docs/api-reference/audio/createTranscription#audio/createTranscription-prompt) to improve the quality of the transcripts generated by the Whisper API. The model tries to match the style of the prompt, so it's more likely to use capitalization and punctuation if the prompt does too. However, the current prompting system is more limited than our other language models and provides limited control over the generated audio. + +Here are some examples of how prompting can help in different scenarios: + +1. Prompts can help correct specific words or acronyms that the model misrecognizes in the audio. For example, the following prompt improves the transcription of the words DALL·E and GPT-3, which were previously written as "GDP 3" and "DALI": "The transcript is about OpenAI which makes technology like DALL·E, GPT-3, and ChatGPT with the hope of one day building an AGI system that benefits all of humanity." +2. 
To preserve the context of a file that was split into segments, prompt the model with the transcript of the preceding segment. The model uses relevant information from the previous audio, improving transcription accuracy. The model only considers the final 224 tokens of the prompt and ignores anything earlier. For multilingual inputs, Whisper uses a custom tokenizer. For English-only inputs, it uses the standard GPT-2 tokenizer. Find both tokenizers in the open source [Whisper Python package](https://github.com/openai/whisper/blob/main/whisper/tokenizer.py#L361). +3. Sometimes the model skips punctuation in the transcript. To prevent this, use a simple prompt that includes punctuation: "Hello, welcome to my lecture." +4. The model may also leave out common filler words in the audio. If you want to keep the filler words in your transcript, use a prompt that contains them: "Umm, let me think like, hmm... Okay, here's what I'm, like, thinking." +5. Some languages can be written in different ways, such as simplified or traditional Chinese. The model might not always use the writing style that you want for your transcript by default. You can improve this by using a prompt in your preferred writing style. + +Improving reliability +--------------------- + +One of the most common challenges faced when using Whisper is the model often does not recognize uncommon words or acronyms. Here are some different techniques to improve the reliability of Whisper in these cases: + +Using the prompt parameter + +The first method involves using the optional prompt parameter to pass a dictionary of the correct spellings. + +Because it wasn't trained with instruction-following techniques, Whisper operates more like a base GPT model. Keep in mind that Whisper only considers the first 224 tokens of the prompt. + +Prompt parameter + +```javascript +import fs from "fs"; +import OpenAI from "openai"; + +const openai = new OpenAI(); + +const transcription = await openai.audio.transcriptions.create({ + file: fs.createReadStream("/path/to/file/speech.mp3"), + model: "whisper-1", + response_format: "text", + prompt:"ZyntriQix, Digique Plus, CynapseFive, VortiQore V8, EchoNix Array, OrbitalLink Seven, DigiFractal Matrix, PULSE, RAPT, B.R.I.C.K., Q.U.A.R.T.Z., F.L.I.N.T.", +}); + +console.log(transcription.text); +``` + +```python +from openai import OpenAI +client = OpenAI() + +audio_file = open("/path/to/file/speech.mp3", "rb") +transcription = client.audio.transcriptions.create( + model="whisper-1", + file=audio_file, + response_format="text", + prompt="ZyntriQix, Digique Plus, CynapseFive, VortiQore V8, EchoNix Array, OrbitalLink Seven, DigiFractal Matrix, PULSE, RAPT, B.R.I.C.K., Q.U.A.R.T.Z., F.L.I.N.T." +) + +print(transcription.text) +``` + +While it increases reliability, this technique is limited to 224 tokens, so your list of SKUs needs to be relatively small for this to be a scalable solution. + +Post-processing with GPT-4 + +The second method involves a post-processing step using GPT-4 or GPT-3.5-Turbo. + +We start by providing instructions for GPT-4 through the `system_prompt` variable. Similar to what we did with the prompt parameter earlier, we can define our company and product names. + +Post-processing + +```javascript +const systemPrompt = ` +You are a helpful assistant for the company ZyntriQix. Your task is +to correct any spelling discrepancies in the transcribed text. 
Make +sure that the names of the following products are spelled correctly: +ZyntriQix, Digique Plus, CynapseFive, VortiQore V8, EchoNix Array, +OrbitalLink Seven, DigiFractal Matrix, PULSE, RAPT, B.R.I.C.K., +Q.U.A.R.T.Z., F.L.I.N.T. Only add necessary punctuation such as +periods, commas, and capitalization, and use only the context provided. +`; + +const transcript = await transcribe(audioFile); +const completion = await openai.chat.completions.create({ + model: "gpt-4o", + temperature: temperature, + messages: [ + { + role: "system", + content: systemPrompt + }, + { + role: "user", + content: transcript + } + ], + store: true, +}); + +console.log(completion.choices[0].message.content); +``` + +```python +system_prompt = """ +You are a helpful assistant for the company ZyntriQix. Your task is to correct +any spelling discrepancies in the transcribed text. Make sure that the names of +the following products are spelled correctly: ZyntriQix, Digique Plus, +CynapseFive, VortiQore V8, EchoNix Array, OrbitalLink Seven, DigiFractal +Matrix, PULSE, RAPT, B.R.I.C.K., Q.U.A.R.T.Z., F.L.I.N.T. Only add necessary +punctuation such as periods, commas, and capitalization, and use only the +context provided. +""" + +def generate_corrected_transcript(temperature, system_prompt, audio_file): + response = client.chat.completions.create( + model="gpt-4o", + temperature=temperature, + messages=[ + { + "role": "system", + "content": system_prompt + }, + { + "role": "user", + "content": transcribe(audio_file, "") + } + ] + ) + return completion.choices[0].message.content +corrected_text = generate_corrected_transcript( + 0, system_prompt, fake_company_filepath +) +``` + +If you try this on your own audio file, you'll see that GPT-4 corrects many misspellings in the transcript. Due to its larger context window, this method might be more scalable than using Whisper's prompt parameter. It's also more reliable, as GPT-4 can be instructed and guided in ways that aren't possible with Whisper due to its lack of instruction following. + +Was this page useful? \ No newline at end of file diff --git a/standalone_openai_debug.py b/standalone_openai_debug.py new file mode 100644 index 0000000..2b0fbdf --- /dev/null +++ b/standalone_openai_debug.py @@ -0,0 +1,68 @@ +import os +import sys +import json +from pathlib import Path +from dotenv import load_dotenv + +# Load environment variables from .env file +load_dotenv() + +# Add the current directory to the path so we can import our modules +sys.path.append(os.path.dirname(os.path.abspath(__file__))) + +# Import OpenAI directly +import openai + +print("=== OpenAI API Debug ===") + +# Check if OpenAI API key is set +openai_api_key = os.getenv("OPENAI_API_KEY") +if not openai_api_key or openai_api_key == "your_openai_api_key_here": + print("\n⚠️ Warning: OPENAI_API_KEY is not set or is using the default value.") + print("Please edit the .env file with your actual OpenAI API key.") + sys.exit(1) + +# Create a temporary directory for audio files +temp_dir = Path("./temp_debug") +temp_dir.mkdir(exist_ok=True) + +# First, generate speech using OpenAI TTS +print("\nGenerating speech from text...") +text = "This is a test of the cloud-based Whisper feature." 
+ +# Generate speech using OpenAI TTS +response = openai.audio.speech.create( + model="tts-1", + voice="alloy", + input=text +) + +audio_path = temp_dir / "test_speech.mp3" +response.stream_to_file(str(audio_path)) + +print(f"Speech generated and saved to {audio_path}") + +# Now, transcribe the audio using OpenAI Whisper API +print("\nTranscribing audio with word-level timestamps...") +with open(audio_path, "rb") as audio_file: + transcription = openai.audio.transcriptions.create( + model="whisper-1", + file=audio_file, + response_format="verbose_json", + timestamp_granularities=["word"] + ) + +# Print the raw response +print("\nRaw API Response:") +print(json.dumps(transcription.model_dump(), indent=2)) + +# Check if word-level timestamps are available +print("\nChecking for word-level timestamps:") +if hasattr(transcription, "words"): + print(f"Found {len(transcription.words)} words with timestamps:") + for i, word in enumerate(transcription.words): + print(f" {i+1}. '{word.word}' from {word.start:.2f}s to {word.end:.2f}s") +else: + print("No word-level timestamps found in the response.") + +print("\nDebug completed!") \ No newline at end of file diff --git a/standalone_openai_demo.py b/standalone_openai_demo.py new file mode 100644 index 0000000..5771fca --- /dev/null +++ b/standalone_openai_demo.py @@ -0,0 +1,108 @@ +import os +import sys +from pathlib import Path +from dotenv import load_dotenv + +# Load environment variables from .env file +load_dotenv() + +# Add the current directory to the path so we can import our modules +sys.path.append(os.path.dirname(os.path.abspath(__file__))) + +# Import our modules +from manim_voiceover.services.openai import OpenAIService +from manim_voiceover.config import config +from manim_voiceover.helper import remove_bookmarks + +# Set the cloud whisper flag manually +config.use_cloud_whisper = True + +print("=== OpenAI TTS + Cloud-based Whisper Demo ===") +print(f"Cloud Whisper enabled: {config.use_cloud_whisper}") + +# Create a temporary directory for audio files +temp_dir = Path("./temp_openai_demo") +temp_dir.mkdir(exist_ok=True) + +# Create an OpenAIService with cloud whisper enabled +service = OpenAIService( + voice="alloy", # Available voices: alloy, echo, fable, onyx, nova, shimmer + model="tts-1", # tts-1 or tts-1-hd + transcription_model="base", # Model name is still required + use_cloud_whisper=True, # This enables cloud-based Whisper + cache_dir=str(temp_dir) +) + +print(f"\nOpenAIService created with use_cloud_whisper={service.use_cloud_whisper}") + +# Check if OpenAI API key is set +openai_api_key = os.getenv("OPENAI_API_KEY") +if not openai_api_key or openai_api_key == "your_openai_api_key_here": + print("\n⚠️ Warning: OPENAI_API_KEY is not set or is using the default value.") + print("Please edit the .env file with your actual OpenAI API key.") + sys.exit(1) + +# Generate speech from text with bookmarks +print("\nGenerating speech from text with bookmarks...") +text = """This demonstration uses OpenAI's text-to-speech service +with cloud-based Whisper for +word-level alignment.""" + +print("\nText to synthesize:") +print(text) + +# Generate the speech +result = service._wrap_generate_from_text(text) + +print(f"\nSpeech generated successfully!") +print(f"Audio file: {result.get('final_audio')}") +print(f"Audio path: {temp_dir / result.get('final_audio')}") +print(f"Word boundaries available: {'word_boundaries' in result}") +print(f"Word boundaries count: {len(result.get('word_boundaries', []))}") +print(f"Transcribed text: 
{result.get('transcribed_text', 'Not available')}") + +# Print the raw result for debugging +print("\nRaw result keys:", result.keys()) +for key, value in result.items(): + if key == 'word_boundaries': + print(f"Word boundaries type: {type(value)}") + print(f"Word boundaries length: {len(value)}") + if value and len(value) > 0: + print(f"First word boundary: {value[0]}") + elif key == 'input_data': + print(f"Input data: {value}") + else: + print(f"{key}: {value}") + +print(f"\nWord boundaries:") +if 'word_boundaries' in result and result['word_boundaries']: + for i, boundary in enumerate(result['word_boundaries']): + print(f" {i+1}. '{boundary['text']}' at {boundary['audio_offset']/1000:.2f} seconds") + + # Find the bookmarks + print("\nBookmarks:") + text_without_bookmarks = remove_bookmarks(text).lower() + text_with_bookmarks = text.lower() + + # Find 'cloud_point' bookmark + cloud_index = text_with_bookmarks.find("<bookmark mark='cloud_point'/>") + if cloud_index >= 0: + # Find the closest word boundary after the bookmark + cloud_word_index = len(remove_bookmarks(text[:cloud_index]).split()) + if cloud_word_index < len(result['word_boundaries']): + cloud_word = result['word_boundaries'][cloud_word_index] + print(f" - 'cloud_point' bookmark would trigger at word '{cloud_word['text']}' at time {cloud_word['audio_offset']/1000:.2f} seconds") + + # Find 'alignment_point' bookmark + alignment_index = text_with_bookmarks.find("<bookmark mark='alignment_point'/>") + if alignment_index >= 0: + # Find the closest word boundary after the bookmark + alignment_word_index = len(remove_bookmarks(text[:alignment_index]).split()) + if alignment_word_index < len(result['word_boundaries']): + alignment_word = result['word_boundaries'][alignment_word_index] + print(f" - 'alignment_point' bookmark would trigger at word '{alignment_word['text']}' at time {alignment_word['audio_offset']/1000:.2f} seconds") +else: + print(" No word boundaries found in the result.") + +print("\nDemo completed!") +print(f"You can listen to the generated audio file at: {temp_dir / result.get('final_audio')}") \ No newline at end of file diff --git a/temp_direct_test/cache.json b/temp_direct_test/cache.json new file mode 100644 index 0000000..3c85419 --- /dev/null +++ b/temp_direct_test/cache.json @@ -0,0 +1,84 @@ +[ + { + "input_text": "This is a test of the cloud-based Whisper feature.", + "input_data": { + "input_text": "This is a test of the cloud-based Whisper feature.", + "service": "openai" + }, + "original_audio": "direct_test.mp3", + "word_boundaries": [ + { + "audio_offset": 0, + "text_offset": 0, + "word_length": 4, + "text": "This", + "boundary_type": "Word" + }, + { + "audio_offset": 219, + "text_offset": 5, + "word_length": 2, + "text": "is", + "boundary_type": "Word" + }, + { + "audio_offset": 379, + "text_offset": 8, + "word_length": 1, + "text": "a", + "boundary_type": "Word" + }, + { + "audio_offset": 519, + "text_offset": 10, + "word_length": 4, + "text": "test", + "boundary_type": "Word" + }, + { + "audio_offset": 879, + "text_offset": 15, + "word_length": 2, + "text": "of", + "boundary_type": "Word" + }, + { + "audio_offset": 1159, + "text_offset": 18, + "word_length": 3, + "text": "the", + "boundary_type": "Word" + }, + { + "audio_offset": 1320, + "text_offset": 22, + "word_length": 5, + "text": "cloud", + "boundary_type": "Word" + }, + { + "audio_offset": 1600, + "text_offset": 28, + "word_length": 5, + "text": "based", + "boundary_type": "Word" + }, + { + "audio_offset": 1919, + "text_offset": 34, + "word_length": 7, + "text": "whisper", + "boundary_type": "Word" + },
+ { + "audio_offset": 2299, + "text_offset": 42, + "word_length": 7, + "text": "feature", + "boundary_type": "Word" + } + ], + "transcribed_text": "This is a test of the cloud-based whisper feature.", + "final_audio": "direct_test.mp3" + } +] \ No newline at end of file diff --git a/temp_openai_demo/cache.json b/temp_openai_demo/cache.json new file mode 100644 index 0000000..6ef0070 --- /dev/null +++ b/temp_openai_demo/cache.json @@ -0,0 +1,114 @@ +[ + { + "input_text": "This demonstration uses OpenAI's text-to-speech service with cloud-based Whisper for word-level alignment.", + "input_data": { + "input_text": "This demonstration uses OpenAI's text-to-speech service with cloud-based Whisper for word-level alignment.", + "service": "openai", + "config": { + "voice": "alloy", + "model": "tts-1", + "speed": 1.0 + } + }, + "original_audio": "this-demonstration-uses-openai-s-text-to-speech-15df98ee.mp3", + "word_boundaries": [], + "transcribed_text": "This demonstration uses OpenAI's text-to-speech service with cloud-based Whisper for word level alignment.", + "final_audio": "this-demonstration-uses-openai-s-text-to-speech-15df98ee.mp3" + }, + { + "input_text": "This demonstration uses OpenAI's text-to-speech service with cloud-based Whisper for word-level alignment.", + "input_data": { + "input_text": "This demonstration uses OpenAI's text-to-speech service with cloud-based Whisper for word-level alignment.", + "service": "openai", + "config": { + "voice": "alloy", + "model": "tts-1", + "speed": 1.0 + } + }, + "original_audio": "this-demonstration-uses-openai-s-text-to-speech-15df98ee.mp3", + "word_boundaries": [], + "transcribed_text": "This demonstration uses OpenAI's text-to-speech service with cloud-based Whisper for word level alignment.", + "final_audio": "this-demonstration-uses-openai-s-text-to-speech-15df98ee.mp3" + }, + { + "input_text": "This demonstration uses OpenAI's text-to-speech service with cloud-based Whisper for word-level alignment.", + "input_data": { + "input_text": "This demonstration uses OpenAI's text-to-speech service with cloud-based Whisper for word-level alignment.", + "service": "openai", + "config": { + "voice": "alloy", + "model": "tts-1", + "speed": 1.0 + } + }, + "original_audio": "this-demonstration-uses-openai-s-text-to-speech-15df98ee.mp3", + "word_boundaries": [], + "transcribed_text": "This demonstration uses OpenAI's text-to-speech service with cloud-based Whisper for word level alignment.", + "final_audio": "this-demonstration-uses-openai-s-text-to-speech-15df98ee.mp3" + }, + { + "input_text": "This demonstration uses OpenAI's text-to-speech service with cloud-based Whisper for word-level alignment.", + "input_data": { + "input_text": "This demonstration uses OpenAI's text-to-speech service with cloud-based Whisper for word-level alignment.", + "service": "openai", + "config": { + "voice": "alloy", + "model": "tts-1", + "speed": 1.0 + } + }, + "original_audio": "this-demonstration-uses-openai-s-text-to-speech-15df98ee.mp3", + "word_boundaries": [], + "transcribed_text": "This demonstration uses OpenAI's text-to-speech service with cloud-based Whisper for word level alignment.", + "final_audio": "this-demonstration-uses-openai-s-text-to-speech-15df98ee.mp3" + }, + { + "input_text": "This demonstration uses OpenAI's text-to-speech service with cloud-based Whisper for word-level alignment.", + "input_data": { + "input_text": "This demonstration uses OpenAI's text-to-speech service with cloud-based Whisper for word-level alignment.", + "service": 
"openai", + "config": { + "voice": "alloy", + "model": "tts-1", + "speed": 1.0 + } + }, + "original_audio": "this-demonstration-uses-openai-s-text-to-speech-15df98ee.mp3", + "word_boundaries": [], + "transcribed_text": "This demonstration uses OpenAI's text-to-speech service with cloud-based Whisper for word level alignment.", + "final_audio": "this-demonstration-uses-openai-s-text-to-speech-15df98ee.mp3" + }, + { + "input_text": "This demonstration uses OpenAI's text-to-speech service with cloud-based Whisper for word-level alignment.", + "input_data": { + "input_text": "This demonstration uses OpenAI's text-to-speech service with cloud-based Whisper for word-level alignment.", + "service": "openai", + "config": { + "voice": "alloy", + "model": "tts-1", + "speed": 1.0 + } + }, + "original_audio": "this-demonstration-uses-openai-s-text-to-speech-15df98ee.mp3", + "word_boundaries": [], + "transcribed_text": "This demonstration uses OpenAI's text-to-speech service with cloud-based Whisper for word level alignment.", + "final_audio": "this-demonstration-uses-openai-s-text-to-speech-15df98ee.mp3" + }, + { + "input_text": "This demonstration uses OpenAI's text-to-speech service with cloud-based Whisper for word-level alignment.", + "input_data": { + "input_text": "This demonstration uses OpenAI's text-to-speech service with cloud-based Whisper for word-level alignment.", + "service": "openai", + "config": { + "voice": "alloy", + "model": "tts-1", + "speed": 1.0 + } + }, + "original_audio": "this-demonstration-uses-openai-s-text-to-speech-15df98ee.mp3", + "word_boundaries": [], + "transcribed_text": "This demonstration uses OpenAI's text-to-speech service with cloud-based Whisper for word level alignment.", + "final_audio": "this-demonstration-uses-openai-s-text-to-speech-15df98ee.mp3" + } +] \ No newline at end of file diff --git a/temp_service_test/cache.json b/temp_service_test/cache.json new file mode 100644 index 0000000..01b8c02 --- /dev/null +++ b/temp_service_test/cache.json @@ -0,0 +1,190 @@ +[ + { + "input_text": "This is a direct test of the cloud-based Whisper feature.", + "input_data": { + "input_text": "This is a direct test of the cloud-based Whisper feature.", + "service": "openai", + "config": { + "voice": "alloy", + "model": "tts-1", + "speed": 1.0 + } + }, + "original_audio": "this-is-a-direct-test-of-the-cloud-based-whisper-841ab5cd.mp3", + "word_boundaries": [ + { + "audio_offset": 0, + "text_offset": 0, + "word_length": 4, + "text": "This", + "boundary_type": "Word" + }, + { + "audio_offset": 2000000, + "text_offset": 5, + "word_length": 2, + "text": "is", + "boundary_type": "Word" + }, + { + "audio_offset": 3600000, + "text_offset": 8, + "word_length": 1, + "text": "a", + "boundary_type": "Word" + }, + { + "audio_offset": 5000000, + "text_offset": 10, + "word_length": 6, + "text": "direct", + "boundary_type": "Word" + }, + { + "audio_offset": 7799999, + "text_offset": 17, + "word_length": 4, + "text": "test", + "boundary_type": "Word" + }, + { + "audio_offset": 12599999, + "text_offset": 22, + "word_length": 2, + "text": "of", + "boundary_type": "Word" + }, + { + "audio_offset": 15800000, + "text_offset": 25, + "word_length": 3, + "text": "the", + "boundary_type": "Word" + }, + { + "audio_offset": 17200000, + "text_offset": 29, + "word_length": 5, + "text": "cloud", + "boundary_type": "Word" + }, + { + "audio_offset": 19800000, + "text_offset": 35, + "word_length": 5, + "text": "based", + "boundary_type": "Word" + }, + { + "audio_offset": 22599999, + "text_offset": 41, + 
"word_length": 7, + "text": "whisper", + "boundary_type": "Word" + }, + { + "audio_offset": 26199998, + "text_offset": 49, + "word_length": 7, + "text": "feature", + "boundary_type": "Word" + } + ], + "transcribed_text": "This is a direct test of the cloud-based whisper feature.", + "final_audio": "this-is-a-direct-test-of-the-cloud-based-whisper-841ab5cd.mp3" + }, + { + "input_text": "This is a direct test of the cloud-based Whisper feature.", + "input_data": { + "input_text": "This is a direct test of the cloud-based Whisper feature.", + "service": "openai", + "config": { + "voice": "alloy", + "model": "tts-1", + "speed": 1.0 + } + }, + "original_audio": "this-is-a-direct-test-of-the-cloud-based-whisper-841ab5cd.mp3", + "word_boundaries": [ + { + "audio_offset": 0, + "text_offset": 0, + "word_length": 4, + "text": "This", + "boundary_type": "Word" + }, + { + "audio_offset": 2000000, + "text_offset": 5, + "word_length": 2, + "text": "is", + "boundary_type": "Word" + }, + { + "audio_offset": 3600000, + "text_offset": 8, + "word_length": 1, + "text": "a", + "boundary_type": "Word" + }, + { + "audio_offset": 5000000, + "text_offset": 10, + "word_length": 6, + "text": "direct", + "boundary_type": "Word" + }, + { + "audio_offset": 7799999, + "text_offset": 17, + "word_length": 4, + "text": "test", + "boundary_type": "Word" + }, + { + "audio_offset": 12599999, + "text_offset": 22, + "word_length": 2, + "text": "of", + "boundary_type": "Word" + }, + { + "audio_offset": 15800000, + "text_offset": 25, + "word_length": 3, + "text": "the", + "boundary_type": "Word" + }, + { + "audio_offset": 17200000, + "text_offset": 29, + "word_length": 5, + "text": "cloud", + "boundary_type": "Word" + }, + { + "audio_offset": 19800000, + "text_offset": 35, + "word_length": 5, + "text": "based", + "boundary_type": "Word" + }, + { + "audio_offset": 22599999, + "text_offset": 41, + "word_length": 7, + "text": "whisper", + "boundary_type": "Word" + }, + { + "audio_offset": 26199998, + "text_offset": 49, + "word_length": 7, + "text": "feature", + "boundary_type": "Word" + } + ], + "transcribed_text": "This is a direct test of the cloud-based whisper feature.", + "final_audio": "this-is-a-direct-test-of-the-cloud-based-whisper-841ab5cd.mp3" + } +] \ No newline at end of file diff --git a/temp_test/cache.json b/temp_test/cache.json new file mode 100644 index 0000000..2410ad4 --- /dev/null +++ b/temp_test/cache.json @@ -0,0 +1,13 @@ +[ + { + "input_text": "This is a test of the cloud-based Whisper feature.", + "input_data": { + "input_text": "This is a test of the cloud-based Whisper feature.", + "service": "gtts" + }, + "original_audio": "this-is-a-test-of-the-cloud-based-whisper-feature-c4598c9b.mp3", + "word_boundaries": [], + "transcribed_text": "This is a test of the cloud-based Whisper feature.", + "final_audio": "this-is-a-test-of-the-cloud-based-whisper-feature-c4598c9b.mp3" + } +] \ No newline at end of file diff --git a/test_cli.py b/test_cli.py new file mode 100644 index 0000000..ba6b680 --- /dev/null +++ b/test_cli.py @@ -0,0 +1,33 @@ +import sys +import subprocess + +print("Testing Manim CLI with --use-cloud-whisper flag") + +# Run the manim command with our custom flag +cmd = ["manim", "--help"] +print(f"Running command: {' '.join(cmd)}") +result = subprocess.run(cmd, capture_output=True, text=True) + +# Check if our flag is in the help output +if "--use-cloud-whisper" in result.stdout: + print("✅ Success: --use-cloud-whisper flag is available in the help output") +else: + print("❌ Error: 
--use-cloud-whisper flag is not available in the help output") + print("Help output:") + print(result.stdout) + +# Try running the command with our flag +cmd = ["manim", "-pql", "--use-cloud-whisper", "examples/cloud_whisper_demo.py", "CloudWhisperDemo"] +print(f"\nRunning command: {' '.join(cmd)}") +print("(This will not actually run the command, just checking if the flag is recognized)") + +# Just check if the flag is recognized, don't actually run the command +cmd = ["manim", "--use-cloud-whisper", "--help"] +result = subprocess.run(cmd, capture_output=True, text=True) + +if result.returncode == 0: + print("✅ Success: --use-cloud-whisper flag is recognized") +else: + print("❌ Error: --use-cloud-whisper flag is not recognized") + print("Error output:") + print(result.stderr) \ No newline at end of file diff --git a/test_cloud_whisper.py b/test_cloud_whisper.py new file mode 100644 index 0000000..6712e85 --- /dev/null +++ b/test_cloud_whisper.py @@ -0,0 +1,29 @@ +from manim import config +from manim_voiceover.services.base import SpeechService +from manim_voiceover.services.gtts import GTTSService + +# Test 1: Check if the use_cloud_whisper attribute exists +print("Test 1: Checking if use_cloud_whisper attribute exists in config") +if hasattr(config, 'use_cloud_whisper'): + print("✅ Success: config.use_cloud_whisper attribute exists") + print(f"Current value: {config.use_cloud_whisper}") +else: + print("❌ Error: config.use_cloud_whisper attribute does not exist") + +# Test 2: Create a SpeechService with use_cloud_whisper=True +print("\nTest 2: Creating SpeechService with use_cloud_whisper=True") +try: + service = SpeechService(use_cloud_whisper=True, transcription_model='base') + print(f"✅ Success: SpeechService created with use_cloud_whisper={service.use_cloud_whisper}") +except Exception as e: + print(f"❌ Error: Failed to create SpeechService: {str(e)}") + +# Test 3: Create a GTTSService with use_cloud_whisper=True +print("\nTest 3: Creating GTTSService with use_cloud_whisper=True") +try: + service = GTTSService(use_cloud_whisper=True, transcription_model='base') + print(f"✅ Success: GTTSService created with use_cloud_whisper={service.use_cloud_whisper}") +except Exception as e: + print(f"❌ Error: Failed to create GTTSService: {str(e)}") + +print("\nAll tests completed!") \ No newline at end of file diff --git a/test_cloud_whisper_simple.py b/test_cloud_whisper_simple.py new file mode 100644 index 0000000..5942a2f --- /dev/null +++ b/test_cloud_whisper_simple.py @@ -0,0 +1,56 @@ +import os +import sys +from pathlib import Path +from dotenv import load_dotenv + +# Load environment variables from .env file +load_dotenv() + +# Add the current directory to the path so we can import our modules +sys.path.append(os.path.dirname(os.path.abspath(__file__))) + +# Import our modules +from manim_voiceover.services.gtts import GTTSService +from manim_voiceover.config import config + +# Set the cloud whisper flag manually +config.use_cloud_whisper = True + +print("=== Testing Cloud-based Whisper Implementation ===") +print(f"Cloud Whisper enabled: {config.use_cloud_whisper}") + +# Create a temporary directory for audio files +temp_dir = Path("./temp_test") +temp_dir.mkdir(exist_ok=True) + +# Create a GTTSService with cloud whisper enabled +service = GTTSService( + transcription_model="base", # Model name is still required + use_cloud_whisper=True, # This enables cloud-based Whisper + cache_dir=str(temp_dir) +) + +print(f"\nGTTSService created with use_cloud_whisper={service.use_cloud_whisper}") + +# 
Check if OpenAI API key is set +openai_api_key = os.getenv("OPENAI_API_KEY") +if not openai_api_key or openai_api_key == "your_openai_api_key_here": + print("\n⚠️ Warning: OPENAI_API_KEY is not set or is using the default value.") + print("Please edit the .env file with your actual OpenAI API key.") + print("Skipping the actual API call test.") +else: + # Generate speech from text + print("\nGenerating speech from text...") + text = "This is a test of the cloud-based Whisper feature." + result = service._wrap_generate_from_text(text) + + print(f"\nSpeech generated successfully!") + print(f"Audio file: {result.get('final_audio')}") + print(f"Word boundaries available: {'word_boundaries' in result}") + + if 'word_boundaries' in result: + print(f"\nWord boundaries:") + for boundary in result['word_boundaries'][:5]: # Show first 5 boundaries + print(f" - {boundary['text']} at {boundary['audio_offset']}") + +print("\nTest completed!") \ No newline at end of file diff --git a/test_scene.py b/test_scene.py new file mode 100644 index 0000000..3804a9c --- /dev/null +++ b/test_scene.py @@ -0,0 +1,24 @@ +from manim import * +from manim_voiceover.voiceover_scene import VoiceoverScene +from manim_voiceover.services.gtts import GTTSService + +class TestScene(VoiceoverScene): + def construct(self): + # Print the cloud whisper setting + print(f"Cloud Whisper enabled: {config.use_cloud_whisper}") + + # Initialize speech service + service = GTTSService(transcription_model="base") + self.set_speech_service(service) + + # Create a simple circle + circle = Circle() + + # Add voiceover with a bookmark + with self.voiceover( + """This is a test of the <bookmark mark='circle_appears'/>cloud-based Whisper feature.""" + ): + self.wait_until_bookmark("circle_appears") + self.play(Create(circle)) + + self.wait(1) \ No newline at end of file diff --git a/test_speech_service.py b/test_speech_service.py new file mode 100644 index 0000000..b594a8e --- /dev/null +++ b/test_speech_service.py @@ -0,0 +1,56 @@ +import os +import sys +from pathlib import Path +from dotenv import load_dotenv + +# Load environment variables from .env file +load_dotenv() + +# Import our modules +from manim_voiceover.services.openai import OpenAIService +from manim_voiceover.config import config + +# Set the cloud whisper flag manually +config.use_cloud_whisper = True + +print("=== Testing SpeechService with Cloud Whisper ===") + +# Create a temporary directory for audio files +temp_dir = Path("./temp_service_test") +temp_dir.mkdir(exist_ok=True) + +# Create an OpenAIService with cloud whisper enabled +service = OpenAIService( + voice="alloy", + model="tts-1", + transcription_model="base", + use_cloud_whisper=True, + cache_dir=str(temp_dir) +) + +print(f"\nOpenAIService created with use_cloud_whisper={service.use_cloud_whisper}") + +# Generate speech from text +print("\nGenerating speech from text...") +text = "This is a direct test of the cloud-based Whisper feature."
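+ +# _wrap_generate_from_text() returns the dict that gets written to cache.json: +# it carries "original_audio", "word_boundaries" (entries with "text", +# "audio_offset", "text_offset", "word_length" and "boundary_type"), +# "transcribed_text", and "final_audio", which the checks below rely on.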
+ +# Call the _wrap_generate_from_text method directly +result = service._wrap_generate_from_text(text) + +print(f"\nSpeech generated successfully!") +print(f"Audio file: {result.get('final_audio')}") +print(f"Word boundaries available: {'word_boundaries' in result}") +print(f"Word boundaries count: {len(result.get('word_boundaries', []))}") +print(f"Transcribed text: {result.get('transcribed_text', 'Not available')}") + +# Print the word boundaries +if 'word_boundaries' in result and result['word_boundaries']: + print("\nWord boundaries:") + for i, boundary in enumerate(result['word_boundaries']): + # Convert from milliseconds to seconds + time_in_seconds = boundary['audio_offset'] / 1000 + print(f" {i+1}. '{boundary['text']}' at {time_in_seconds:.2f} seconds") +else: + print("\nNo word boundaries found in the result.") + +print("\nTest completed!") \ No newline at end of file