121121 "srp" , "swe" , "swh" , "tam" , "tel" , "tgk" , "tgl" , "tha" , "tur" , "ukr" , "urd" , "uzn" , "vie" , "yor" , "yue" , "zul" ,
122122} # fmt: skip
123123
+# ElevenLabs Scribe v1 - supports 99 languages with 3-letter ISO 639-3 codes
+ELEVENLABS_SUPPORTED = {
+    "afr", "amh", "ara", "hye", "asm", "ast", "aze", "bel", "ben", "bos", "bul", "mya", "yue", "cat", "ceb", "nya",
+    "hrv", "ces", "dan", "nld", "eng", "est", "fil", "fin", "fra", "ful", "glg", "lug", "kat", "deu", "ell", "guj",
+    "hau", "heb", "hin", "hun", "isl", "ibo", "ind", "gle", "ita", "jpn", "jav", "kea", "kan", "kaz", "khm", "kor",
+    "kur", "kir", "lao", "lav", "lin", "lit", "luo", "ltz", "mkd", "msa", "mal", "mlt", "zho", "mri", "mar", "mon",
+    "nep", "nso", "nor", "oci", "ori", "pus", "fas", "pol", "por", "pan", "ron", "rus", "srp", "sna", "snd", "slk",
+    "slv", "som", "spa", "swa", "swe", "tam", "tgk", "tel", "tha", "tur", "ukr", "umb", "urd", "uzb", "vie", "cym",
+    "wol", "xho", "zul",
+}  # fmt: skip
+
 AZURE_SUPPORTED = {
     "af-ZA", "am-ET", "ar-AE", "ar-BH", "ar-DZ", "ar-EG", "ar-IL", "ar-IQ", "ar-JO", "ar-KW", "ar-LB", "ar-LY", "ar-MA",
     "ar-OM", "ar-PS", "ar-QA", "ar-SA", "ar-SY", "ar-TN", "ar-YE", "az-AZ", "bg-BG", "bn-IN", "bs-BA", "ca-ES", "cs-CZ",
@@ -260,6 +271,7 @@ class AsrModels(Enum):
     usm = "Chirp / USM (Google V2)"
     deepgram = "Deepgram"
     azure = "Azure Speech"
+    elevenlabs = "ElevenLabs Scribe v1"
     seamless_m4t_v2 = "Seamless M4T v2 (Facebook Research)"
     mms_1b_all = "Massively Multilingual Speech (MMS) (Facebook Research)"

@@ -329,6 +341,7 @@ def supports_input_prompt(self) -> bool:
     AsrModels.seamless_m4t_v2: "facebook/seamless-m4t-v2-large",
     AsrModels.mms_1b_all: "facebook/mms-1b-all",
     AsrModels.lelapa: "lelapa-vulavula",
+    AsrModels.elevenlabs: "elevenlabs-scribe-v1",
 }

 forced_asr_languages = {
@@ -354,6 +367,7 @@ def supports_input_prompt(self) -> bool:
     AsrModels.gcp_v1: GCP_V1_SUPPORTED,
     AsrModels.usm: CHIRP_SUPPORTED,
     AsrModels.deepgram: DEEPGRAM_SUPPORTED,
+    AsrModels.elevenlabs: ELEVENLABS_SUPPORTED,
     AsrModels.seamless_m4t_v2: SEAMLESS_v2_ASR_SUPPORTED,
     AsrModels.azure: AZURE_SUPPORTED,
     AsrModels.mms_1b_all: MMS_SUPPORTED,
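As a quick sanity check on the wiring above, the per-model language set can be queried directly. The snippet below is illustrative only and assumes the mapping being populated here is named asr_supported_languages (its opening line falls outside this excerpt):

supported = asr_supported_languages[AsrModels.elevenlabs]
assert supported is ELEVENLABS_SUPPORTED
assert "swa" in supported  # Swahili, per the set added above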
@@ -971,6 +985,33 @@ def get_google_auth_session(*scopes: str) -> tuple[AuthorizedSession, str]:
     return AuthorizedSession(credentials=creds), project


+def elevenlabs_asr(audio_url: str, language: str = None) -> dict:
+    """
+    Call ElevenLabs Speech-to-Text API
+    """
+    audio_r = requests.get(audio_url)
+    raise_for_status(audio_r, is_user_url=True)
+
+    # Set up the files and form data for the multipart request
+    files = {"file": audio_r.content}
+    data = {"model_id": "scribe_v1"}
+    headers = {"xi-api-key": settings.ELEVEN_LABS_API_KEY}
+
+    # Language parameter is sent in the form data
+    if language:
+        data["language_code"] = language
+
+    response = requests.post(
+        "https://api.elevenlabs.io/v1/speech-to-text",
+        files=files,
+        headers=headers,
+        data=data,
+    )
+    raise_for_status(response)
+
+    return response.json()
+
+
 def run_asr(
     audio_url: str,
     selected_model: str,
@@ -1017,6 +1058,21 @@ def run_asr(

     if selected_model == AsrModels.azure:
         return azure_asr(audio_url, language)
+    elif selected_model == AsrModels.elevenlabs:
+        result = elevenlabs_asr(audio_url, language)
+        chunks = []
+        for word_data in result.get("words", []):
+            if word_data.get("type") == "word":
+                speaker = word_data.get("speaker_id", 0)
+            else:
+                speaker = None
+            chunk = {
+                "timestamp": (word_data["start"], word_data["end"]),
+                "text": word_data["text"],
+                "speaker": speaker,
+            }
+            chunks.append(chunk)
+        data = {"text": result["text"], "chunks": chunks}
     elif selected_model == AsrModels.whisper_large_v3:
         import replicate

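To make the word-to-chunk conversion in the new elif branch concrete, here is a small self-contained sketch run on a made-up payload. The field values are invented; only the keys mirror what the branch reads from the real elevenlabs_asr() response:

# Hypothetical Scribe v1-style result (illustrative values only).
result = {
    "text": "hello world",
    "words": [
        {"type": "word", "text": "hello", "start": 0.0, "end": 0.4, "speaker_id": 0},
        {"type": "spacing", "text": " ", "start": 0.4, "end": 0.5},
        {"type": "word", "text": "world", "start": 0.5, "end": 0.9, "speaker_id": 0},
    ],
}
chunks = [
    {
        "timestamp": (w["start"], w["end"]),
        "text": w["text"],
        # Non-word entries (e.g. spacing) carry no speaker, matching the branch above.
        "speaker": w.get("speaker_id", 0) if w.get("type") == "word" else None,
    }
    for w in result.get("words", [])
]
data = {"text": result["text"], "chunks": chunks}
# data["chunks"][1]["speaker"] is None because the spacing entry is not a word;
# the two "word" entries keep speaker_id 0 and their (start, end) timestamps.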