TukaTek · devin-ai-integration · Jun 29, 2025 · Jun 29, 2025
diff --git a/backend/onyx/main.py b/backend/onyx/main.py
@@ -98,6 +98,8 @@
 from onyx.server.query_and_chat.query_backend import basic_router as query_router
 from onyx.server.settings.api import admin_router as settings_admin_router
 from onyx.server.settings.api import basic_router as settings_router
+from onyx.server.audio.api import router as audio_router
+from onyx.server.audio.api import admin_router as audio_admin_router
 from onyx.server.token_rate_limits.api import (
     router as token_rate_limit_settings_router,
 )
@@ -357,6 +359,8 @@ def get_application(lifespan_override: Lifespan | None = None) -> FastAPI:
     include_router_with_global_prefix_prepended(application, long_term_logs_router)
     include_router_with_global_prefix_prepended(application, api_key_router)
     include_router_with_global_prefix_prepended(application, standard_oauth_router)
+    include_router_with_global_prefix_prepended(application, audio_router)
+    include_router_with_global_prefix_prepended(application, audio_admin_router)
 
     if AUTH_TYPE == AuthType.DISABLED:
         # Server logs this during auth setup verification step

diff --git a/backend/onyx/server/audio/__init__.py b/backend/onyx/server/audio/__init__.py
@@ -0,0 +1 @@
+
diff --git a/backend/onyx/server/audio/api.py b/backend/onyx/server/audio/api.py
@@ -0,0 +1,253 @@
+import tempfile
+import os
+from typing import Optional
+from fastapi import APIRouter, Depends, File, UploadFile, HTTPException
+from sqlalchemy.orm import Session
+
+from onyx.auth.users import current_admin_user, current_user
+from onyx.db.engine import get_session
+from onyx.db.models import User
+from onyx.key_value_store.factory import get_kv_store
+from onyx.key_value_store.interface import KvKeyNotFoundError
+from onyx.server.audio.models import (
+    AudioConfig,
+    STTConfigForm,
+    TranscriptionResponse,
+    TestEngineRequest,
+    TestEngineResponse,
+)
+from onyx.utils.logger import setup_logger
+
+logger = setup_logger()
+
+router = APIRouter(prefix="/audio")  # endpoints for any authenticated user
+admin_router = APIRouter(prefix="/admin/audio")  # admin-only config/test endpoints
+
+AUDIO_CONFIG_KEY = "audio_config"  # key-value store key holding the serialized AudioConfig
+
+
+def get_audio_config_from_store() -> AudioConfig:
+    """Load the persisted audio configuration.
+
+    Returns the validated value stored under ``AUDIO_CONFIG_KEY``. When that
+    key is missing, returns a configuration with no STT engine selected and
+    empty credentials instead of raising.
+    """
+    try:
+        stored = get_kv_store().load(AUDIO_CONFIG_KEY)
+        return AudioConfig.model_validate(stored)
+    except KvKeyNotFoundError:
+        unconfigured = STTConfigForm(
+            engine="", openai_api_key="", deepgram_api_key="",
+            openai_api_base_url="https://api.openai.com/v1",
+            whisper_model="whisper-1",
+            azure_api_key="", azure_region="",
+        )
+        return AudioConfig(stt=unconfigured)
+
+
+def store_audio_config(config: AudioConfig) -> None:
+    """Persist ``config`` under ``AUDIO_CONFIG_KEY``, requesting encryption at rest."""
+    # The payload carries provider API keys, so it should not be stored in plaintext.
+    get_kv_store().store(AUDIO_CONFIG_KEY, config.model_dump(), encrypt=True)
+
+
+@admin_router.get("/config")
+async def get_audio_config(user: User = Depends(current_admin_user)) -> AudioConfig:
+    """Return the stored audio settings, API keys included; ``user`` only enforces admin access."""
+    return get_audio_config_from_store()
+
+
+@admin_router.post("/config")
+async def update_audio_config(
+    config: AudioConfig, user: User = Depends(current_admin_user)
+) -> None:
+    """Replace the stored audio settings with ``config``; ``user`` only enforces admin access."""
+    store_audio_config(config)
+
+
+@admin_router.post("/test")
+async def test_audio_engine(
+    request: TestEngineRequest, user: User = Depends(current_admin_user)
+) -> TestEngineResponse:
+    """Check that the stored settings are complete for ``request.engine``.
+
+    Only the presence of the required credentials is verified; no request is
+    sent to the provider, so a "valid" result does not prove the key works.
+
+    Args:
+        request: Names the engine to check ("openai", "deepgram", "azure", "web").
+        user: Unused; the dependency restricts the endpoint to admins.
+
+    Returns:
+        A ``TestEngineResponse``; ``success`` is False for missing settings,
+        an unknown engine, or a failure to load the stored configuration.
+    """
+    engine = request.engine
+    try:
+        # Loading the config is the only fallible step, so it sits inside the try.
+        stt = get_audio_config_from_store().stt
+    except Exception as e:
+        logger.error(f"Error testing audio engine {engine}: {e}")
+        return TestEngineResponse(
+            success=False, message=f"Error testing engine: {str(e)}"
+        )
+
+    if engine == "web":
+        # "web" is reported as available without consulting any stored settings.
+        return TestEngineResponse(success=True, message="Web Speech API available")
+    azure_ready = bool(stt.azure_api_key and stt.azure_region)
+    # engine -> (display name, required settings present?, message when missing)
+    checks = {
+        "openai": ("OpenAI", bool(stt.openai_api_key), "OpenAI API key is required"),
+        "deepgram": ("Deepgram", bool(stt.deepgram_api_key), "Deepgram API key is required"),
+        "azure": ("Azure", azure_ready, "Azure API key and region are required"),
+    }
+    if engine not in checks:
+        return TestEngineResponse(success=False, message=f"Unknown engine: {engine}")
+    label, present, missing_msg = checks[engine]
+    message = f"{label} configuration valid" if present else missing_msg
+    return TestEngineResponse(success=present, message=message)
+
+
+async def transcribe_with_openai(
+    audio_file_path: str, config: STTConfigForm
+) -> str:
+    """Transcribe ``audio_file_path`` with the OpenAI SDK and return the text.
+
+    The SDK call is synchronous, so it runs in a worker thread to keep the
+    event loop free. Any failure is logged and re-raised as HTTP 500.
+    """
+    import asyncio
+    try:
+        import openai
+
+        client = openai.OpenAI(api_key=config.openai_api_key, base_url=config.openai_api_base_url)
+        with open(audio_file_path, "rb") as audio_file:
+            # Blocking network call: run it off the event loop.
+            transcript = await asyncio.to_thread(
+                client.audio.transcriptions.create, model=config.whisper_model, file=audio_file
+            )
+        return transcript.text
+    except Exception as e:
+        logger.error(f"OpenAI transcription error: {e}")
+        raise HTTPException(status_code=500, detail=f"OpenAI transcription failed: {str(e)}") from e
+
+
+async def transcribe_with_deepgram(
+    audio_file_path: str, config: STTConfigForm
+) -> str:
+    """Transcribe ``audio_file_path`` with Deepgram's prerecorded API ("nova-2").
+
+    The SDK call is synchronous, so it runs in a worker thread to keep the
+    event loop free. A response without any transcript alternative, like any
+    other failure, is logged and re-raised as HTTP 500.
+    """
+    import asyncio
+    try:
+        from deepgram import DeepgramClient, PrerecordedOptions
+
+        deepgram = DeepgramClient(config.deepgram_api_key)
+        with open(audio_file_path, "rb") as audio_file:
+            payload = {"buffer": audio_file.read()}
+        options = PrerecordedOptions(model="nova-2", smart_format=True)
+
+        # Blocking network call: run it off the event loop.
+        response = await asyncio.to_thread(
+            deepgram.listen.prerecorded.v("1").transcribe_file, payload, options
+        )
+        channels = response.results.channels if response.results else None
+        if not channels or not channels[0].alternatives:
+            raise Exception("No transcription results from Deepgram")
+        return channels[0].alternatives[0].transcript
+    except Exception as e:
+        logger.error(f"Deepgram transcription error: {e}")
+        raise HTTPException(status_code=500, detail=f"Deepgram transcription failed: {str(e)}") from e
+
+
+async def transcribe_with_azure(
+    audio_file_path: str, config: STTConfigForm
+) -> str:
+    """Transcribe ``audio_file_path`` with Azure Speech (language fixed to en-US).
+
+    ``recognize_once`` blocks, so it runs in a worker thread to keep the event
+    loop free; anything but recognized speech is logged and raised as HTTP 500.
+    NOTE(review): it returns after a single utterance, so long recordings are
+    presumably cut at the first pause -- confirm if continuous mode is needed.
+    """
+    import asyncio
+    try:
+        import azure.cognitiveservices.speech as speechsdk
+
+        speech_config = speechsdk.SpeechConfig(
+            subscription=config.azure_api_key, region=config.azure_region
+        )
+        speech_config.speech_recognition_language = "en-US"
+        audio_config = speechsdk.audio.AudioConfig(filename=audio_file_path)
+        speech_recognizer = speechsdk.SpeechRecognizer(
+            speech_config=speech_config,
+            audio_config=audio_config,
+        )
+        result = await asyncio.to_thread(speech_recognizer.recognize_once)
+        if result.reason == speechsdk.ResultReason.RecognizedSpeech:
+            return result.text
+        if result.reason == speechsdk.ResultReason.NoMatch:
+            raise Exception("No speech could be recognized")
+        if result.reason == speechsdk.ResultReason.Canceled:
+            raise Exception(f"Speech recognition canceled: {result.cancellation_details.reason}")
+        raise Exception("Unknown error in speech recognition")
+    except Exception as e:
+        logger.error(f"Azure transcription error: {e}")
+        raise HTTPException(status_code=500, detail=f"Azure transcription failed: {str(e)}") from e
+
+
+@router.post("/transcribe")
+async def transcribe_audio(
+    file: UploadFile = File(...), user: User = Depends(current_user)
+) -> TranscriptionResponse:
+    """Transcribe an uploaded audio file with the configured STT engine.
+
+    Args:
+        file: Upload whose content type must start with ``audio/``.
+        user: Unused; the dependency restricts the endpoint to logged-in users.
+
+    Raises:
+        HTTPException: 400 when no engine is configured, the upload is not audio,
+            or the engine is unsupported; 500 on any transcription failure.
+    """
+    config = get_audio_config_from_store()
+    if not config.stt.engine:
+        raise HTTPException(
+            status_code=400,
+            detail="No STT engine configured. Please configure audio settings in admin panel.",
+        )
+    if not file.content_type or not file.content_type.startswith("audio/"):
+        raise HTTPException(status_code=400, detail="Invalid file type. Please upload an audio file.")
+    transcribers = {
+        "openai": transcribe_with_openai,
+        "deepgram": transcribe_with_deepgram,
+        "azure": transcribe_with_azure,
+    }
+    transcribe = transcribers.get(config.stt.engine)
+    if transcribe is None:
+        raise HTTPException(status_code=400, detail=f"Unsupported STT engine: {config.stt.engine}")
+
+    # Keep the upload's own extension: providers may infer the audio format from
+    # the file name. Anything missing or unusual falls back to ".wav".
+    ext = os.path.splitext(file.filename or "")[1]
+    suffix = ext if ext[1:].isalnum() and len(ext) <= 10 else ".wav"
+    temp_path = None
+    try:
+        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
+            temp_path = temp_file.name
+            temp_file.write(await file.read())
+        # The file is closed (and flushed) here so the provider code can reopen it.
+        return TranscriptionResponse(text=await transcribe(temp_path, config.stt))
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.error(f"Transcription error: {e}")
+        raise HTTPException(status_code=500, detail=f"Transcription failed: {str(e)}") from e
+    finally:
+        if temp_path is not None and os.path.exists(temp_path):
+            os.unlink(temp_path)
diff --git a/backend/onyx/server/audio/models.py b/backend/onyx/server/audio/models.py
@@ -0,0 +1,29 @@
+from pydantic import BaseModel
+from typing import Optional
+
+
+class STTConfigForm(BaseModel):  # speech-to-text settings; every field has a default
+    engine: str = ""  # "" = none selected; the API checks for "openai", "deepgram", "azure", "web"
+    openai_api_key: str = ""
+    openai_api_base_url: str = "https://api.openai.com/v1"  # passed as the OpenAI client's base_url
+    whisper_model: str = "whisper-1"  # model name sent with OpenAI transcription requests
+    deepgram_api_key: str = ""
+    azure_api_key: str = ""
+    azure_region: str = ""  # the engine check requires this together with azure_api_key
+
+
+class AudioConfig(BaseModel):  # top-level audio settings object stored and returned by the audio API
+    stt: STTConfigForm  # speech-to-text section (required, no default)
+
+
+class TranscriptionResponse(BaseModel):  # response body of the audio transcribe endpoint
+    text: str  # transcript returned by the configured engine
+
+
+class TestEngineRequest(BaseModel):  # request body of the admin engine-test endpoint
+    engine: str  # name of the engine to check, e.g. "openai"
+
+
+class TestEngineResponse(BaseModel):  # outcome of an engine configuration check
+    success: bool
+    message: Optional[str] = None  # human-readable explanation; defaults to None
diff --git a/deployment/docker_compose/docker-compose.dev.yml b/deployment/docker_compose/docker-compose.dev.yml
@@ -114,9 +114,9 @@ services:
       - HARD_DELETE_CHATS=${HARD_DELETE_CHATS:-}
 
       # Enables the use of bedrock models or IAM Auth
-      - AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID:-}
-      - AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY:-}
-      - AWS_REGION_NAME=${AWS_REGION_NAME:-}
+      - AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID:-}  # no dummy default: fake static keys would mask IAM auth
+      - AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY:-}
+      - AWS_REGION_NAME=${AWS_REGION_NAME:-us-east-1}
 
       # Enterprise Edition only
       - ENABLE_PAID_ENTERPRISE_EDITION_FEATURES=${ENABLE_PAID_ENTERPRISE_EDITION_FEATURES:-false}