mobiusml
diff --git a/.devcontainer/Dockerfile
Lines changed: 1 addition & 1 deletion b/.devcontainer/Dockerfile
Lines changed: 1 addition & 1 deletion
diff --git a/aana/api/api_generation.py
Lines changed: 6 additions & 6 deletions b/aana/api/api_generation.py
Lines changed: 6 additions & 6 deletions
diff --git a/aana/api/app.py
Lines changed: 2 additions & 2 deletions b/aana/api/app.py
Lines changed: 2 additions & 2 deletions
diff --git a/aana/api/request_handler.py
Lines changed: 1 addition & 1 deletion b/aana/api/request_handler.py
Lines changed: 1 addition & 1 deletion
diff --git a/aana/configs/db.py
Lines changed: 2 additions & 1 deletion b/aana/configs/db.py
Lines changed: 2 additions & 1 deletion
diff --git a/aana/configs/deployments.py
Lines changed: 5 additions & 4 deletions b/aana/configs/deployments.py
Lines changed: 5 additions & 4 deletions
diff --git a/aana/configs/settings.py
Lines changed: 3 additions & 3 deletions b/aana/configs/settings.py
Lines changed: 3 additions & 3 deletions
diff --git a/aana/deployments/hf_blip2_deployment.py
Lines changed: 2 additions & 1 deletion b/aana/deployments/hf_blip2_deployment.py
Lines changed: 2 additions & 1 deletion
diff --git a/aana/deployments/vllm_deployment.py
Lines changed: 17 additions & 6 deletions b/aana/deployments/vllm_deployment.py
Lines changed: 17 additions & 6 deletions
diff --git a/aana/deployments/whisper_deployment.py
Lines changed: 4 additions & 3 deletions b/aana/deployments/whisper_deployment.py
Lines changed: 4 additions & 3 deletions
diff --git a/aana/models/db/transcript.py
Lines changed: 1 addition & 1 deletion b/aana/models/db/transcript.py
Lines changed: 1 addition & 1 deletion
@@ -1,2 +1,2 @@
-FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04
+FROM nvidia/cuda:12.1.0-cudnn8-devel-ubuntu22.04
 RUN apt-get update && apt-get install -y libgl1 libglib2.0-0 ffmpeg
@@ -7,7 +7,7 @@
 from fastapi.responses import StreamingResponse
 from mobius_pipeline.node.socket import Socket
 from mobius_pipeline.pipeline.pipeline import Pipeline
-from pydantic import BaseModel, Field, ValidationError, create_model, parse_raw_as
+from pydantic import BaseModel, Field, ValidationError, create_model
 
 from aana.api.app import custom_exception_handler
 from aana.api.responses import AanaJSONResponse
@@ -237,9 +237,9 @@ def get_file_upload_field(
                 continue
 
             # check if pydantic model has file_upload field and it's set to True
-            file_upload_enabled = getattr(data_model.Config, "file_upload", False)
-            file_upload_description = getattr(
-                data_model.Config, "file_upload_description", ""
+            file_upload_enabled = data_model.model_config.get("file_upload", False)
+            file_upload_description = data_model.model_config.get(
+                "file_upload_description", ""
             )
 
             if file_upload_enabled and file_upload_field is None:
@@ -330,7 +330,7 @@ def create_endpoint_func(  # noqa: C901
 
         async def route_func_body(body: str, files: list[UploadFile] | None = None):  # noqa: C901
             # parse form data as a pydantic model and validate it
-            data = parse_raw_as(RequestModel, body)
+            data = RequestModel.model_validate_json(body)
 
             # if the input requires file upload, add the files to the data
             if file_upload_field and files:
@@ -341,7 +341,7 @@ async def route_func_body(body: str, files: list[UploadFile] | None = None):  #
             # data.dict() will convert all nested models to dicts
             # and we want to keep them as pydantic models
             data_dict = {}
-            for field_name in data.__fields__:
+            for field_name in data.model_fields:
                 field_value = getattr(data, field_name)
                 data_dict[field_name] = field_value
 
 
@@ -28,7 +28,7 @@ async def validation_exception_handler(request: Request, exc: ValidationError):
             error="ValidationError",
             message="Validation error",
             data=exc.errors(),
-        ).dict(),
+        ).model_dump(),
     )
 
 
@@ -77,7 +77,7 @@ def custom_exception_handler(request: Request | None, exc_raw: Exception):
         status_code=status_code,
         content=ExceptionResponseModel(
             error=error, message=message, data=data, stacktrace=stacktrace
-        ).dict(),
+        ).model_dump(),
     )
 
 
 
@@ -11,7 +11,7 @@
 # TODO: improve type annotations
 
 
-@serve.deployment(route_prefix="/", num_replicas=1, ray_actor_options={"num_cpus": 0.1})
+@serve.deployment(ray_actor_options={"num_cpus": 0.1})
 @serve.ingress(app)
 class RequestHandler:
     """This class is used to handle requests to the Aana application."""
 
@@ -1,11 +1,12 @@
 from enum import Enum
 from os import PathLike
 from pathlib import Path
-from typing import TypeAlias, TypedDict
+from typing import TypeAlias
 
 from alembic import command
 from alembic.config import Config
 from sqlalchemy import String, TypeDecorator, create_engine
+from typing_extensions import TypedDict
 
 from aana.models.pydantic.media_id import MediaId
 
 
@@ -22,12 +22,13 @@
             model="TheBloke/Llama-2-7b-Chat-AWQ",
             dtype="auto",
             quantization="awq",
-            gpu_memory_reserved=10000,
+            gpu_memory_reserved=13000,
+            enforce_eager=True,
             default_sampling_params=SamplingParams(
                 temperature=0.0, top_p=1.0, top_k=-1, max_tokens=1024
             ),
             chat_template="llama2",
-        ).dict(),
+        ).model_dump(),
     ),
     "hf_blip2_deployment_opt_2_7b": HFBlip2Deployment.options(
         num_replicas=1,
@@ -38,7 +39,7 @@
             dtype=Dtype.FLOAT16,
             batch_size=2,
             num_processing_threads=2,
-        ).dict(),
+        ).model_dump(),
     ),
     "whisper_deployment_medium": WhisperDeployment.options(
         num_replicas=1,
@@ -47,7 +48,7 @@
         user_config=WhisperConfig(
             model_size=WhisperModelSize.MEDIUM,
             compute_type=WhisperComputeType.FLOAT16,
-        ).dict(),
+        ).model_dump(),
     ),
     "stablediffusion2_deployment": StableDiffusion2Deployment.options(
         num_replicas=1,
 
@@ -1,6 +1,6 @@
 from pathlib import Path
 
-from pydantic import BaseSettings
+from pydantic_settings import BaseSettings
 
 from aana.configs.db import DBConfig
 
@@ -17,8 +17,8 @@ class Settings(BaseSettings):
     """A pydantic model for SDK settings."""
 
     tmp_data_dir: Path = Path("/tmp/aana_data")  # noqa: S108
-    image_dir = tmp_data_dir / "images"
-    video_dir = tmp_data_dir / "videos"
+    image_dir: Path = tmp_data_dir / "images"
+    video_dir: Path = tmp_data_dir / "videos"
     num_workers: int = 2
 
     db_config: DBConfig = {
 
@@ -1,10 +1,11 @@
-from typing import Any, TypedDict
+from typing import Any
 
 import torch
 import transformers
 from pydantic import BaseModel, Field
 from ray import serve
 from transformers import Blip2ForConditionalGeneration, Blip2Processor
+from typing_extensions import TypedDict
 
 from aana.deployments.base_deployment import BaseDeployment
 from aana.exceptions.general import InferenceException
 
@@ -1,20 +1,26 @@
+import contextlib
 from collections.abc import AsyncGenerator
-from typing import Any, TypedDict
+from typing import Any
 
 from pydantic import BaseModel, Field
 from ray import serve
+from typing_extensions import TypedDict
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine
-from vllm.model_executor.utils import set_random_seed
+
+with contextlib.suppress(ImportError):
+    from vllm.model_executor.utils import (
+        set_random_seed,  # Ignore if we don't have GPU and only run on CPU with test cache
+    )
 from vllm.sampling_params import SamplingParams as VLLMSamplingParams
-from vllm.utils import get_gpu_memory, random_uuid
+from vllm.utils import random_uuid
 
 from aana.deployments.base_deployment import BaseDeployment
 from aana.exceptions.general import InferenceException, PromptTooLongException
 from aana.models.pydantic.chat_message import ChatDialog, ChatMessage
 from aana.models.pydantic.sampling_params import SamplingParams
 from aana.utils.chat_template import apply_chat_template
-from aana.utils.general import merged_options
+from aana.utils.general import get_gpu_memory, merged_options
 from aana.utils.test import test_cache
 
 
@@ -28,6 +34,9 @@ class VLLMConfig(BaseModel):
         gpu_memory_reserved (float): the GPU memory reserved for the model in mb
         default_sampling_params (SamplingParams): the default sampling parameters.
         max_model_len (int): the maximum generated text length in tokens (optional, default: None)
+        chat_template (str): the name of the chat template, if not provided, the chat template from the model will be used
+                             but some models may not have a chat template (optional, default: None)
+        enforce_eager (bool): whether to enforce eager execution (optional, default: False)
     """
 
     model: str
@@ -37,6 +46,7 @@ class VLLMConfig(BaseModel):
     default_sampling_params: SamplingParams
     max_model_len: int | None = Field(default=None)
     chat_template: str | None = Field(default=None)
+    enforce_eager: bool | None = Field(default=False)
 
 
 class LLMOutput(TypedDict):
@@ -107,6 +117,7 @@ async def apply_config(self, config: dict[str, Any]):
             model=config_obj.model,
             dtype=config_obj.dtype,
             quantization=config_obj.quantization,
+            enforce_eager=config_obj.enforce_eager,
             gpu_memory_utilization=self.gpu_memory_utilization,
             max_model_len=config_obj.max_model_len,
         )
@@ -116,7 +127,7 @@ async def apply_config(self, config: dict[str, Any]):
 
         # create the engine
         self.engine = AsyncLLMEngine.from_engine_args(args)
-        self.tokenizer = self.engine.engine.tokenizer
+        self.tokenizer = self.engine.engine.tokenizer.tokenizer
         self.model_config = await self.engine.get_model_config()
 
     @test_cache
@@ -148,7 +159,7 @@ async def generate_stream(
         try:
             # convert SamplingParams to VLLMSamplingParams
             sampling_params_vllm = VLLMSamplingParams(
-                **sampling_params.dict(exclude_unset=True)
+                **sampling_params.model_dump(exclude_unset=True)
             )
             # start the request
             request_id = random_uuid()
 
@@ -1,11 +1,12 @@
 from collections.abc import AsyncGenerator
 from enum import Enum
-from typing import Any, TypedDict, cast
+from typing import Any, cast
 
 import torch
 from faster_whisper import WhisperModel
 from pydantic import BaseModel, Field
 from ray import serve
+from typing_extensions import TypedDict
 
 from aana.deployments.base_deployment import BaseDeployment
 from aana.exceptions.general import InferenceException
@@ -161,7 +162,7 @@ async def transcribe(
             params = WhisperParams()
         media_path: str = str(media.path)
         try:
-            segments, info = self.model.transcribe(media_path, **params.dict())
+            segments, info = self.model.transcribe(media_path, **params.model_dump())
         except Exception as e:
             raise InferenceException(self.model_name) from e
 
@@ -196,7 +197,7 @@ async def transcribe_stream(
             params = WhisperParams()
         media_path: str = str(media.path)
         try:
-            segments, info = self.model.transcribe(media_path, **params.dict())
+            segments, info = self.model.transcribe(media_path, **params.model_dump())
         except Exception as e:
             raise InferenceException(self.model_name) from e
 
 
@@ -69,5 +69,5 @@ def from_asr_output(
             language=info.language,
             language_confidence=info.language_confidence,
             transcript=transcription.text,
-            segments=[s.dict() for s in segments],
+            segments=[s.model_dump() for s in segments],
         )
Original file line number	Diff line number	Diff line change
`@@ -1,2 +1,2 @@`
`1`		`-FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04`
	`1`	`+FROM nvidia/cuda:12.1.0-cudnn8-devel-ubuntu22.04`
`2`	`2`	`RUN apt-get update && apt-get install -y libgl1 libglib2.0-0 ffmpeg`
Original file line number	Diff line number	Diff line change
`@@ -28,7 +28,7 @@ async def validation_exception_handler(request: Request, exc: ValidationError):`
`28`	`28`	`error="ValidationError",`
`29`	`29`	`message="Validation error",`
`30`	`30`	`data=exc.errors(),`
`31`		`- ).dict(),`
	`31`	`+ ).model_dump(),`
`32`	`32`	`)`
`33`	`33`
`34`	`34`
`@@ -77,7 +77,7 @@ def custom_exception_handler(request: Request \| None, exc_raw: Exception):`
`77`	`77`	`status_code=status_code,`
`78`	`78`	`content=ExceptionResponseModel(`
`79`	`79`	`error=error, message=message, data=data, stacktrace=stacktrace`
`80`		`- ).dict(),`
	`80`	`+ ).model_dump(),`
`81`	`81`	`)`
`82`	`82`
`83`	`83`
Original file line number	Diff line number	Diff line change
`@@ -69,5 +69,5 @@ def from_asr_output(`
`69`	`69`	`language=info.language,`
`70`	`70`	`language_confidence=info.language_confidence,`
`71`	`71`	`transcript=transcription.text,`
`72`		`- segments=[s.dict() for s in segments],`
	`72`	`+ segments=[s.model_dump() for s in segments],`
`73`	`73`	`)`