
Commit b2711dc

Merge pull request #202 from grillazz/201-ml-streaming-endpoint

201 ml streaming endpoint

2 parents 41a8688 + 2f484d6 · commit b2711dc

File tree

10 files changed: +409 −164 lines


.env

Lines changed: 2 additions & 2 deletions
```diff
@@ -2,14 +2,14 @@ PYTHONDONTWRITEBYTECODE=1
 PYTHONUNBUFFERED=1
 
 # Postgres
-POSTGRES_HOST=db
+POSTGRES_HOST=localhost
 POSTGRES_PORT=5432
 POSTGRES_DB=devdb
 POSTGRES_USER=devdb
 POSTGRES_PASSWORD=secret
 
 # Redis
-REDIS_HOST=inmemory
+REDIS_HOST=localhost
 REDIS_PORT=6379
 REDIS_DB=2
```

Makefile

Lines changed: 1 addition & 1 deletion
```diff
@@ -43,7 +43,7 @@ py-upgrade: ## Upgrade project py files with pyupgrade library for python versio
 
 .PHONY: lint
 lint: ## Lint project code.
-	poetry run ruff check --fix .
+	uv run ruff check --fix .
 
 .PHONY: slim-build
 slim-build: ## with power of docker-slim build smaller and safer images
```

README.md

Lines changed: 20 additions & 0 deletions
````diff
@@ -31,6 +31,7 @@
 <li><a href="#worker-aware-async-scheduler">Schedule jobs</a></li>
 <li><a href="#smtp-setup">Email Configuration</a></li>
 <li><a href="#uv-knowledge-and-inspirations">UV knowledge and inspirations</a></li>
+<li><a href="#large-language-model">Integration with local LLM</a></li>
 </ul>
 </li>
 <li><a href="#acknowledgments">Acknowledgments</a></li>
@@ -162,6 +163,24 @@ This service supports plaintext and HTML emails, and also allows sending templat
 It is implemented as a singleton to ensure that only one SMTP connection is maintained
 throughout the application lifecycle, optimizing resource usage.
 
+<p align="right">(<a href="#readme-top">back to top</a>)</p>
+
+### Large Language Model
+The `/v1/ml/chat/` endpoint is designed to handle chat-based interactions with the LLM model.
+It accepts a user prompt and streams responses back in real-time.
+The endpoint leverages FastAPI's asynchronous capabilities to efficiently manage multiple simultaneous requests,
+ensuring low latency and high throughput.
+
+FastAPI's async support is particularly beneficial for reducing I/O bottlenecks when connecting to the LLM model.
+By using asynchronous HTTP clients like `httpx`,
+the application can handle multiple I/O-bound tasks concurrently,
+such as sending requests to the LLM server and streaming responses back to the client.
+This approach minimizes idle time and optimizes resource utilization, making it ideal for high-performance applications.
+
+Install ollama and run the server
+```shell
+ollama run llama3.2
+```
 
 <p align="right">(<a href="#readme-top">back to top</a>)</p>
@@ -215,6 +234,7 @@ I've included a few of my favorites to kick things off!
 - **[DEC 16 2024]** bump project to Python 3.13 :fast_forward:
 - **[JAN 28 2025]** add SMTP setup :email:
 - **[MAR 8 2025]** switch from poetry to uv :fast_forward:
+- **[MAY 3 2025]** add large language model integration :robot:
 
 <p align="right">(<a href="#readme-top">back to top</a>)</p>
````

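To try the endpoint the README describes, here is a minimal client sketch (not part of this commit). It assumes the API is listening on `0.0.0.0:8080`, as in `tests/chat.py` at the end of this diff, and that ollama is already serving `llama3.2`:

```python
# Minimal sketch: call /v1/ml/chat/ and print the streamed lines.
# Assumes the API listens on 0.0.0.0:8080 (as in tests/chat.py below).
import httpx

with httpx.stream(
    "POST",
    "http://0.0.0.0:8080/v1/ml/chat/",
    data={"prompt": "Tell me a joke"},  # the route reads `prompt` from form data
    timeout=60,
) as response:
    for line in response.iter_lines():
        print(line)  # one JSON object per line: the user echo, then model chunks
```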
app/api/ml.py

Lines changed: 16 additions & 0 deletions
```diff
@@ -0,0 +1,16 @@
+from typing import Annotated
+
+from fastapi import APIRouter, Depends, Form
+from fastapi.responses import StreamingResponse
+
+from app.services.llm import get_llm_service
+from app.utils.logging import AppLogger
+
+logger = AppLogger().get_logger()
+
+router = APIRouter()
+
+
+@router.post("/chat/")
+async def chat(prompt: Annotated[str, Form()], llm_service=Depends(get_llm_service)):
+    return StreamingResponse(llm_service.stream_chat(prompt), media_type="text/plain")
```

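Once mounted in `app/main.py` (next file) with the `/v1/ml` prefix, this route is served at `/v1/ml/chat/`. Although the response is declared `media_type="text/plain"`, the body is newline-delimited JSON produced by `StreamLLMService.stream_chat` (see `app/services/llm.py` below): the user message is echoed first, followed by one object per generated chunk. Illustrative frames (values are made up, not captured from a real run):

```
{"role": "user", "content": "Tell me a joke"}
{"role": "model", "content": "Why"}
{"role": "model", "content": " did"}
```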
app/main.py

Lines changed: 3 additions & 1 deletion
```diff
@@ -7,6 +7,7 @@
 from fastapi import Depends, FastAPI
 
 from app.api.health import router as health_router
+from app.api.ml import router as ml_router
 from app.api.nonsense import router as nonsense_router
 from app.api.shakespeare import router as shakespeare_router
 from app.api.stuff import router as stuff_router
@@ -45,12 +46,13 @@ async def lifespan(_app: FastAPI):
     await _app.postgres_pool.close()
 
 
-app = FastAPI(title="Stuff And Nonsense API", version="0.17", lifespan=lifespan)
+app = FastAPI(title="Stuff And Nonsense API", version="0.18.0", lifespan=lifespan)
 
 app.include_router(stuff_router)
 app.include_router(nonsense_router)
 app.include_router(shakespeare_router)
 app.include_router(user_router)
+app.include_router(ml_router, prefix="/v1/ml", tags=["ML"])
 
 
 app.include_router(health_router, prefix="/v1/public/health", tags=["Health, Public"])
```

app/services/llm.py

Lines changed: 52 additions & 0 deletions
```diff
@@ -0,0 +1,52 @@
+from collections.abc import AsyncGenerator
+
+import httpx
+import orjson
+
+
+class StreamLLMService:
+    def __init__(self, base_url: str = "http://localhost:11434/v1"):
+        self.base_url = base_url
+        self.model = "llama3.2"
+
+    async def stream_chat(self, prompt: str) -> AsyncGenerator[bytes]:
+        """Stream chat completion responses from LLM."""
+        # Send the user a message first
+        user_msg = {
+            "role": "user",
+            "content": prompt,
+        }
+        yield orjson.dumps(user_msg) + b"\n"
+
+        # Open client as context manager and stream responses
+        async with httpx.AsyncClient(base_url=self.base_url) as client:
+            async with client.stream(
+                "POST",
+                "/chat/completions",
+                json={
+                    "model": self.model,
+                    "messages": [{"role": "user", "content": prompt}],
+                    "stream": True,
+                },
+                timeout=60.0,
+            ) as response:
+                async for line in response.aiter_lines():
+                    if line.startswith("data: ") and line != "data: [DONE]":
+                        try:
+                            json_line = line[6:]  # Remove "data: " prefix
+                            data = orjson.loads(json_line)
+                            content = (
+                                data.get("choices", [{}])[0]
+                                .get("delta", {})
+                                .get("content", "")
+                            )
+                            if content:
+                                model_msg = {"role": "model", "content": content}
+                                yield orjson.dumps(model_msg) + b"\n"
+                        except Exception:
+                            pass
+
+
+# FastAPI dependency
+def get_llm_service(base_url: str | None = None) -> StreamLLMService:
+    return StreamLLMService(base_url=base_url or "http://localhost:11434/v1")
```

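The service consumes the OpenAI-compatible SSE stream that ollama exposes under `/v1` and re-emits it as NDJSON; malformed chunks are silently dropped by the bare `except`. For orientation, the per-line transformation in isolation (a sketch; the example payload is illustrative, not captured from a real run):

```python
import orjson

# One SSE data line, shaped like ollama's OpenAI-compatible stream (illustrative):
line = 'data: {"choices":[{"delta":{"content":"Hi"}}]}'

# What stream_chat does with each such line:
data = orjson.loads(line[6:])  # strip the "data: " prefix
content = data.get("choices", [{}])[0].get("delta", {}).get("content", "")
if content:
    frame = orjson.dumps({"role": "model", "content": content}) + b"\n"
    print(frame)  # b'{"role":"model","content":"Hi"}\n'
```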
compose.yml

Lines changed: 3 additions & 0 deletions
```diff
@@ -1,6 +1,7 @@
 services:
   app:
     container_name: fsap_app
+    network_mode: host
     build: .
     env_file:
       - .env
@@ -22,6 +23,7 @@ services:
 
   db:
     container_name: fsap_db
+    network_mode: host
     build:
       context: ./db
       dockerfile: Dockerfile
@@ -46,6 +48,7 @@ services:
 
   inmemory:
     image: redis:latest
+    network_mode: host
    container_name: fsap_inmemory
     ports:
       - "6379:6379"
```

Note: with `network_mode: host` the containers share the host's network stack (published `ports:` are effectively ignored), which is why `.env` now points `POSTGRES_HOST` and `REDIS_HOST` at `localhost`.

pyproject.toml

Lines changed: 20 additions & 20 deletions
```diff
@@ -1,45 +1,45 @@
 [project]
 name = "fastapi-sqlalchemy-asyncpg"
-version = "0.1.0"
+version = "0.18.0"
 description = "A modern FastAPI application with SQLAlchemy 2.0 and AsyncPG for high-performance async database operations. Features include JWT authentication with Redis token storage, password hashing, connection pooling, data processing with Polars, Rich logging, task scheduling with APScheduler, and Shakespeare datasets integration."
 readme = "README.md"
 requires-python = ">=3.13"
 dependencies = [
-    "fastapi[all]>=0.115.11",
-    "pydantic[email]>=2.10.6",
-    "pydantic-settings>=2.8.1",
-    "sqlalchemy>=2.0.38",
-    "uvicorn[standard]>=0.34.0",
+    "fastapi[all]>=0.115.12",
+    "pydantic[email]>=2.11.4",
+    "pydantic-settings>=2.9.1",
+    "sqlalchemy>=2.0.40",
+    "uvicorn[standard]>=0.34.2",
     "asyncpg>=0.30.0",
-    "alembic>=1.15.1",
+    "alembic>=1.15.2",
     "httpx>=0.28.1",
     "pytest>=8.3.5",
-    "pytest-cov>=6.0.0",
+    "pytest-cov>=6.1.1",
     "uvloop>=0.21.0",
     "httptools>=0.6.4",
-    "rich>=13.9.4",
+    "rich>=14.0.0",
     "pyjwt>=2.10.1",
-    "redis>=5.2.1",
+    "redis>=6.0.0",
     "bcrypt>=4.3.0",
-    "polars>=1.24.0",
+    "polars>=1.29.0",
     "python-multipart>=0.0.20",
-    "fastexcel>=0.13.0",
-    "inline-snapshot>=0.17.0",
-    "dirty-equals>=0.8.0",
-    "polyfactory>=2.18.1",
-    "granian>=1.7.0",
-    "apscheduler[redis,sqlalchemy]>=4.0.0a5",
+    "fastexcel>=0.14.0",
+    "inline-snapshot>=0.23.0",
+    "dirty-equals>=0.9.0",
+    "polyfactory>=2.21.0",
+    "granian>=2.2.5",
+    "apscheduler[redis,sqlalchemy]>=4.0.0a6",
 ]
 
 [tool.uv]
 dev-dependencies = [
-    "ruff>=0.9.10",
+    "ruff>=0.11.8",
     "devtools[pygments]>=0.12.2",
     "pyupgrade>=3.19.1",
-    "ipython>=9.0.2",
+    "ipython>=9.2.0",
     "sqlacodegen>=3.0.0",
     "tryceratops>=2.4.1",
-    "locust>=2.33.0"
+    "locust>=2.36.2"
 
 ]
```

tests/chat.py

Lines changed: 32 additions & 0 deletions
```diff
@@ -0,0 +1,32 @@
+import anyio
+import httpx
+import orjson
+
+
+async def chat_with_endpoint():
+    async with httpx.AsyncClient() as client:
+        while True:
+            # Get user input
+            prompt = input("\nYou: ")
+            if prompt.lower() == "exit":
+                break
+
+            # Send request to the API
+            print("\nModel: ", end="", flush=True)
+            async with client.stream(
+                "POST",
+                "http://0.0.0.0:8080/v1/ml/chat/",
+                data={"prompt": prompt},
+                timeout=60,
+            ) as response:
+                async for chunk in response.aiter_lines():
+                    if chunk:
+                        try:
+                            data = orjson.loads(chunk)
+                            print(data["content"], end="", flush=True)
+                        except Exception as e:
+                            print(f"\nError parsing chunk: {e}")
+
+
+if __name__ == "__main__":
+    anyio.run(chat_with_endpoint)
```

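Assuming the stack is up (`ollama run llama3.2` and the API on port 8080), the script runs directly with `python tests/chat.py`; typing `exit` at the prompt ends the session. Note that it prints `data["content"]` for every frame, so the echoed user message appears once after "Model: " before the model's own chunks.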