@@ -283,6 +283,7 @@ class AsrModels(Enum):
283
283
ghana_nlp_asr_v2 = "Ghana NLP ASR v2"
284
284
lelapa = "Vulavula (Lelapa AI)"
285
285
whisper_sunbird_large_v3 = "Sunbird Ugandan Whisper v3 (Sunbird AI)"
286
+ whisper_akera_large_v3 = "Akera Whisper v3 (akera)"
286
287
whisper_swahili_medium_v3 = "Jacaranda Health Swahili Whisper v3 (Jacaranda Health)"
287
288
mbaza_ctc_large = "Mbaza Conformer LG (MbazaNLP)"
288
289
@@ -336,6 +337,7 @@ def supports_input_prompt(self) -> bool:
336
337
337
338
338
339
asr_model_ids = {
340
+ AsrModels .whisper_akera_large_v3 : "akera/whisper-large-v3-kik-full_v2" ,
339
341
AsrModels .gpt_4_o_audio : "gpt-4o-transcribe" ,
340
342
AsrModels .gpt_4_o_mini_audio : "gpt-4o-mini-transcribe" ,
341
343
AsrModels .whisper_large_v3 : "vaibhavs10/incredibly-fast-whisper:3ab86df6c8f54c11309d4d1f930ac292bad43ace52d10c80d87eb258b3c9f79c" ,
@@ -362,6 +364,7 @@ def supports_input_prompt(self) -> bool:
362
364
AsrModels .vakyansh_bhojpuri : "bho" ,
363
365
AsrModels .nemo_english : "en" ,
364
366
AsrModels .nemo_hindi : "hi" ,
367
+ AsrModels .whisper_akera_large_v3 : "kik" ,
365
368
}
366
369
367
370
asr_supported_languages = {
@@ -386,6 +389,7 @@ def supports_input_prompt(self) -> bool:
386
389
AsrModels .lelapa : LELAPA_ASR_SUPPORTED ,
387
390
AsrModels .whisper_sunbird_large_v3 : SUNBIRD_SUPPORTED_LANGUAGES ,
388
391
AsrModels .whisper_swahili_medium_v3 : {"sw" , "en" },
392
+ AsrModels .whisper_akera_large_v3 : {"kik" },
389
393
AsrModels .mbaza_ctc_large : {"sw" , "rw" , "lg" },
390
394
}
391
395
@@ -1286,13 +1290,17 @@ def run_asr(
1286
1290
)
1287
1291
# call one of the self-hosted models
1288
1292
else :
1289
- kwargs = {}
1293
+ kwargs = {"task" : "translate" if speech_translation_target else "transcribe" }
1290
1294
if "vakyansh" in selected_model .name :
1291
1295
# fixes https://github.com/huggingface/transformers/issues/15275#issuecomment-1624879632
1292
1296
kwargs ["decoder_kwargs" ] = dict (skip_special_tokens = True )
1293
1297
kwargs ["chunk_length_s" ] = 60
1294
1298
kwargs ["stride_length_s" ] = (6 , 0 )
1295
1299
kwargs ["batch_size" ] = 32
1300
+ elif "akera" in selected_model .name :
1301
+ # don't pass language or task
1302
+ kwargs .pop ("task" )
1303
+ kwargs ["max_length" ] = 448
1296
1304
elif "whisper" in selected_model .name :
1297
1305
forced_lang = forced_asr_languages .get (selected_model )
1298
1306
if forced_lang :
@@ -1308,7 +1316,6 @@ def run_asr(
1308
1316
),
1309
1317
inputs = dict (
1310
1318
audio = audio_url ,
1311
- task = "translate" if speech_translation_target else "transcribe" ,
1312
1319
return_timestamps = output_format != AsrOutputFormat .text ,
1313
1320
** kwargs ,
1314
1321
),
0 commit comments