|
1 | 1 | --- |
2 | | -title: 'Use AssemblyAI with Pyannote to generate custom Speaker Labels' |
| 2 | +title: 'How to generate custom Speaker Labels with Pyannote' |
3 | 3 | --- |
4 | 4 |
|
5 | 5 |
|
6 | 6 |
|
7 | 7 |
|
8 | | -# Use AssemblyAI with Pyannote to generate custom Speaker Labels |
9 | 8 |
|
10 | 9 | In this guide, we'll show you how to generate Speaker Labels using Pyannote with an AssemblyAI transcript. This can be used to generate Speaker Labels for languages we currently do not support for speaker labelling. |
11 | 10 |
|
| 11 | +Here’s the full sample code for what you’ll build in this tutorial: |
| 12 | + |
| 13 | +```python |
| 14 | +import os |
| 15 | +import assemblyai as aai |
| 16 | +from pyannote.audio import Pipeline |
| 17 | +import torch |
| 18 | +import pandas as pd |
| 19 | +import numpy as np |
| 20 | + |
# Read both credentials from environment variables so that no secret is hard-coded
HUGGING_FACE_TOKEN = os.environ.get("HF_TOKEN")
ASSEMBLYAI_API_KEY = os.environ.get("ASSEMBLYAI_API_KEY")

# Hand the AssemblyAI key to the SDK before any transcription request is made
aai.settings.api_key = ASSEMBLYAI_API_KEY
| 27 | + |
def transcribe_audio(audio_file, language="en"):
    """
    Transcribe an audio file using AssemblyAI.

    Args:
        audio_file (str): Path to the audio file.
        language (str, optional): Language code for transcription. Defaults to "en".

    Returns:
        aai.Transcript: The completed transcription result.

    Raises:
        RuntimeError: If AssemblyAI reports that the transcription failed.
    """
    config = aai.TranscriptionConfig(speech_model='nano', language_code=language)
    transcriber = aai.Transcriber(config=config)
    transcript = transcriber.transcribe(audio_file)
    print(f"Transcript ID: {transcript.id}")

    # Fail fast: a failed job still comes back as a Transcript object, and
    # discovering that only after the (slow) diarization step wastes time and
    # produces a confusing error further down the line.
    if transcript.status == aai.TranscriptStatus.error:
        raise RuntimeError(f"Transcription failed: {transcript.error}")

    return transcript
| 44 | + |
def _dominant_speaker(diarize_df, start, end):
    """Return the speaker with the largest total overlap with [start, end], or None if no turn overlaps."""
    overlapping = diarize_df[(diarize_df['start'] <= end) & (diarize_df['end'] >= start)]
    if overlapping.empty:
        return None

    overlap = np.minimum(overlapping['end'], end) - np.maximum(overlapping['start'], start)
    # Sum per speaker: one speaker may hold several turns inside the interval,
    # and comparing individual turns would let a single longer turn by someone
    # else win even though that person spoke for less time overall.
    return overlap.groupby(overlapping['speaker']).sum().idxmax()


def get_speaker_labels(audio_file, transcript: aai.Transcript):
    """
    Perform speaker diarization on an audio file and combine results with the transcript.

    Each word in ``transcript.words`` and each sentence returned by
    ``transcript.get_sentences()`` gets a ``speaker`` attribute set to the
    diarization label that overlaps it the most in total, or "Unknown" when no
    diarization turn overlaps it.

    Args:
        audio_file (str): Path to the audio file.
        transcript (aai.Transcript): The transcription result from AssemblyAI.

    Returns:
        str: A formatted string containing the transcript with speaker labels and timestamps,
            one line per sentence.

    Raises:
        ValueError: If the diarization pipeline could not be initialized.
    """
    # Initialize the speaker diarization pipeline, using the GPU when one is available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    pipeline = Pipeline.from_pretrained(
        "pyannote/speaker-diarization",
        use_auth_token=HUGGING_FACE_TOKEN,
    )

    if pipeline is None:
        raise ValueError("Failed to initialize the pipeline. Please check your authentication token and internet connection.")
    pipeline = pipeline.to(device)

    # Apply the pipeline to the audio file
    diarization = pipeline(audio_file)

    # One row per diarization turn
    diarize_df = pd.DataFrame(
        [(speaker, turn.start, turn.end)
         for turn, _, speaker in diarization.itertracks(yield_label=True)],
        columns=['speaker', 'start', 'end'],
    )

    # Assign a speaker to every transcript word.
    # Word/sentence timestamps are divided by 1000 to compare them with the turn times.
    for word in transcript.words:
        speaker = _dominant_speaker(diarize_df, float(word.start) / 1000, float(word.end) / 1000)
        word.speaker = speaker if speaker is not None else "Unknown"

    # Assign a speaker to every sentence from its own time span and build the output
    lines = []
    for segment in transcript.get_sentences():
        segment_start = float(segment.start) / 1000
        segment_end = float(segment.end) / 1000

        speaker = _dominant_speaker(diarize_df, segment_start, segment_end)
        segment.speaker = speaker if speaker is not None else "Unknown"

        # "SPEAKER_00" reads better as "SPEAKER 00"; "Unknown" is left untouched
        speaker_label = segment.speaker.replace('SPEAKER_', 'SPEAKER ')
        lines.append(f'[{format_timestamp(segment_start)}] {speaker_label}: {segment.text}\n')

    return ''.join(lines)
| 123 | + |
def format_timestamp(seconds):
    """
    Convert seconds to a formatted timestamp string (HH:MM:SS).

    Fractional seconds are truncated. Negative values are rendered as the
    timestamp of their magnitude with a leading "-".

    Args:
        seconds (float): Time in seconds.

    Returns:
        str: Formatted timestamp string.
    """
    total = int(seconds)
    # Format the magnitude and re-attach the sign; running divmod on a negative
    # number would floor towards -infinity and yield output such as "-1:59:59".
    sign = "-" if total < 0 else ""
    hours, remainder = divmod(abs(total), 3600)
    minutes, secs = divmod(remainder, 60)
    return f"{sign}{hours:02d}:{minutes:02d}:{secs:02d}"
| 137 | + |
# Your local file path, and the language code the recording should be transcribed in
audio_file = "audio.wav"
transcript: aai.Transcript = transcribe_audio(audio_file, language="hr")

# Diarize the same file and print the speaker-labelled transcript
transcript_with_speakers = get_speaker_labels(audio_file, transcript)
print(transcript_with_speakers)
| 142 | +``` |
12 | 143 |
|
13 | 144 | ### Get Started |
14 | 145 |
|
15 | 146 | Before we begin, make sure you have an AssemblyAI account and an API key. You can [sign up](https://assemblyai.com/dashboard/signup) for a free account and get your API key from your dashboard. |
16 | 147 |
|
17 | 148 | You'll also need a HuggingFace account and API key. You can [sign up](https://huggingface.co/join) for a free account and get your API key [here](https://huggingface.co/settings/tokens). Create a **Read** type API token to ensure the necessary permissions are enabled. |
18 | 149 |
|
19 | | ---- |
| 150 | +<Info> |
20 | 151 | Browse to the [speaker-diarization](https://huggingface.co/pyannote/speaker-diarization) and [segmentation](https://huggingface.co/pyannote/segmentation) model pages and accept the **Gated Model** Terms & Conditions by entering your **Company/University**, **Website** and **Use Case** details. You need to do this on both pages to gain access to these models. |
| 152 | +</Info> |
21 | 153 |
|
22 | | ---- |
23 | 154 | ### Step-by-Step Instructions |
24 | 155 |
|
25 | 156 | Install the necessary dependencies. |
@@ -161,23 +292,22 @@ def get_speaker_labels(audio_file, transcript: aai.Transcript): |
161 | 292 | return full_transcript |
162 | 293 | ``` |
163 | 294 |
|
164 | | ---- |
165 | | -***Advanced Usage*** |
| 295 | +<Accordion title="How can I set the number of speakers?"> |
166 | 296 |
|
167 | 297 | If you know the number of speakers in advance, you can use the `num_speakers` parameter to set the number of speakers: |
168 | 298 |
|
169 | | -``` |
| 299 | +```python |
170 | 300 | # Apply the pipeline to the audio file |
171 | 301 | diarization = pipeline(audio_file, num_speakers=4) |
172 | 302 | ``` |
173 | 303 |
|
174 | 304 | You can also provide lower/upper bounds on the number of speakers using the `min_speakers` and `max_speakers` parameters: |
175 | 305 |
|
176 | | -``` |
| 306 | +```python |
177 | 307 | # Apply the pipeline to the audio file |
178 | 308 | diarization = pipeline(audio_file, min_speakers=2, max_speakers=5) |
179 | 309 | ``` |
180 | | ---- |
| 310 | +</Accordion> |
181 | 311 |
|
182 | 312 |
|
183 | 313 | Create the `format_timestamp` function, which converts timestamps into a readable format for the final speaker-labelled transcript. |
|
0 commit comments