diff --git a/.github/workflows/docker-build.yml b/.github/workflows/docker-build.yml index 3e932555..0b17684d 100644 --- a/.github/workflows/docker-build.yml +++ b/.github/workflows/docker-build.yml @@ -159,56 +159,131 @@ jobs: [graph] model = "openai_gpt4-1" enable_entity_resolution = true + EOF - # Start container in detached mode with config mounted - CONTAINER_ID=$(docker run -d -p 8000:8000 \ - -e POSTGRES_URI="postgresql://morphik:morphik@localhost:5432/morphik" \ + # Create a custom network for the containers to communicate + NETWORK_NAME="morphik-test-network" + docker network create $NETWORK_NAME + + # Start pgvector PostgreSQL container + echo "Starting pgvector PostgreSQL container..." + DB_CONTAINER_ID=$(docker run -d \ + --name pgvector-test \ + --network $NETWORK_NAME \ + -e POSTGRES_USER=morphik \ + -e POSTGRES_PASSWORD=morphik \ + -e POSTGRES_DB=morphik \ + -p 5432:5432 \ + ankane/pgvector:latest) + + # Wait for PostgreSQL to be ready + echo "Waiting for PostgreSQL to be ready..." + for i in {1..30}; do + if docker exec $DB_CONTAINER_ID pg_isready -U morphik -d morphik; then + echo "PostgreSQL is ready!" + break + fi + if [ $i -eq 30 ]; then + echo "PostgreSQL failed to start" + docker logs $DB_CONTAINER_ID + exit 1 + fi + sleep 1 + done + + # Enable pgvector extension + echo "Enabling pgvector extension..." + docker exec $DB_CONTAINER_ID psql -U morphik -d morphik -c 'CREATE EXTENSION IF NOT EXISTS vector;' + + # Start Redis container (NOTE: must be named "redis") + echo "Starting Redis container..." + REDIS_CONTAINER_ID=$(docker run -d \ + --name redis \ + --network $NETWORK_NAME \ + -p 6379:6379 \ + -v redis_data:/data \ + redis:7-alpine \ + redis-server --appendonly yes) + + # Wait for Redis to be ready + echo "Waiting for Redis to be ready..." + for i in {1..30}; do + if docker exec $REDIS_CONTAINER_ID redis-cli ping | grep -q PONG; then + echo "Redis is ready!" 
+ break + fi + if [ $i -eq 30 ]; then + echo "Redis failed to start" + docker logs $REDIS_CONTAINER_ID + exit 1 + fi + sleep 1 + done + + # Start application container + echo "Starting application container..." + CONTAINER_ID=$(docker run -d \ + --network $NETWORK_NAME \ + -p 8000:8000 \ + -e POSTGRES_URI="postgresql+asyncpg://morphik:morphik@pgvector-test:5432/morphik" \ -v "$(pwd)/morphik.toml.test:/app/morphik.toml" \ "$IMAGE_TAG") - + echo "Started container: $CONTAINER_ID" - - # Wait for server to be ready with 60 second timeout - timeout=60 + + # Wait for server to be ready + timeout=300 # long timeout required to load checkpoint shards interval=2 elapsed=0 - + echo "Waiting for server to be ready..." while [ $elapsed -lt $timeout ]; do if curl -f -s http://localhost:8000/ping > /dev/null 2>&1; then echo "✅ Server is responding to /ping endpoint" break fi - + echo "⏳ Waiting for server... (${elapsed}s/${timeout}s)" sleep $interval elapsed=$((elapsed + interval)) done - + # Check if we timed out if [ $elapsed -ge $timeout ]; then echo "❌ Server failed to respond within ${timeout} seconds" echo "Container logs:" docker logs "$CONTAINER_ID" - docker stop "$CONTAINER_ID" - docker rm "$CONTAINER_ID" + echo "Database logs:" + docker logs "$DB_CONTAINER_ID" + echo "Redis logs:" + docker logs "$REDIS_CONTAINER_ID" + docker stop "$CONTAINER_ID" "$DB_CONTAINER_ID" "$REDIS_CONTAINER_ID" || true + docker rm -f "$CONTAINER_ID" "$DB_CONTAINER_ID" "$REDIS_CONTAINER_ID" || true + docker network rm "$NETWORK_NAME" || true + docker volume rm redis_data 2>/dev/null || true exit 1 fi - + # Verify the response is actually 200 - HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" http://localhost:8000/ping) - if [ "$HTTP_CODE" = "200" ]; then - echo "✅ Health check passed - /ping returned HTTP $HTTP_CODE" - else - echo "❌ Health check failed - /ping returned HTTP $HTTP_CODE" + response_code=$(curl -s -o /dev/null -w "%{http_code}" http://localhost:8000/ping) + if [ "$response_code" 
#######################################
# Wait for the PostgreSQL server named by POSTGRES_URI, then verify that a
# login with the parsed credentials works.
#
# Globals:
#   POSTGRES_URI (read)  - connection URI, either postgresql:// or
#                          postgresql+asyncpg://. When empty or unset the
#                          function does nothing.
#   PG_USER, PG_PASS, PG_HOST, PG_PORT, PG_DB (written) - populated from the
#                          shell assignments printed by
#                          ./parse-postgres-uri.py.
# Outputs:
#   Progress messages on stdout, diagnostics on stderr.
# Returns:
#   0 when POSTGRES_URI is empty or "SELECT 1" succeeded. Exits the shell
#   with status 1 when the URI cannot be parsed, the server is not ready
#   after 30 attempts, or the test query fails.
#######################################
check_postgres() {
    # Nothing to check when no database URI is configured.
    if [ -z "${POSTGRES_URI:-}" ]; then
        return 0
    fi

    local parsed
    local max_retries=30
    local retries=0

    # The URI is split by the Python helper, which prints PG_* assignments.
    # Capture its output first: running it directly inside `eval $(...)`
    # would discard its exit status, and a rejected URI would then be probed
    # with empty connection values until the retry budget ran out.
    if ! parsed=$(./parse-postgres-uri.py "$POSTGRES_URI"); then
        echo "Error: Could not parse POSTGRES_URI" >&2
        exit 1
    fi
    # NOTE(review): eval executes whatever the helper prints, so the helper's
    # escaping is the only barrier between URI contents and the shell.
    # Quoted so that whitespace inside a value (e.g. a password) is kept.
    eval "$parsed"

    echo "Waiting for PostgreSQL..."
    until PGPASSWORD="$PG_PASS" pg_isready -h "$PG_HOST" -p "$PG_PORT" -U "$PG_USER" -d "$PG_DB"; do
        retries=$((retries + 1))
        if [ "$retries" -eq "$max_retries" ]; then
            echo "Error: PostgreSQL did not become ready in time" >&2
            exit 1
        fi
        echo "Waiting for PostgreSQL... (Attempt $retries/$max_retries)"
        sleep 2
    done
    echo "PostgreSQL is ready!"

    # Verify database connection.
    # psql's stderr is deliberately left attached so its message is visible.
    if ! PGPASSWORD="$PG_PASS" psql -h "$PG_HOST" -p "$PG_PORT" -U "$PG_USER" -d "$PG_DB" -c "SELECT 1"; then
        # The password, and the raw URI that embeds it, are kept out of the
        # log; only whether a password was supplied is reported.
        {
            echo "Error: Could not connect to PostgreSQL database"
            echo "USER: $PG_USER"
            echo "PASS: ${PG_PASS:+[redacted]}"
            echo "HOST: $PG_HOST"
            echo "PORT: $PG_PORT"
            echo "DB: $PG_DB"
        } >&2
        exit 1
    fi
    echo "PostgreSQL connection verified!"
}
#!/usr/bin/env python3
"""Print the parts of a PostgreSQL connection URI as shell assignments.

Intended use from a shell script:

    eval "$(parse-postgres-uri.py "$POSTGRES_URI")"

which defines PG_USER, PG_PASS, PG_HOST, PG_PORT and PG_DB.
"""
import re
import sys
import urllib.parse
from typing import Dict

# postgres[ql][+driver]://[user[:password]@][host][:port][/dbname][?params]
# The driver suffix may contain digits and underscores (e.g. "psycopg2"),
# and "postgres://" is accepted as an alias of "postgresql://".
_URI_PATTERN = re.compile(
    r'''
    ^
    postgres(?:ql)?(?:\+[a-z0-9_]+)?://   # scheme with optional driver
    (?:([^:/?#@]+)(?::([^/?#@]*))?@)?     # user:password@
    (?:([^:/?#]+)(?::(\d+))?)?            # host:port
    (?:/([^?#]*))?                        # /dbname
    (?:\?([^#]*))?                        # ?query
    $
    ''',
    re.VERBOSE,
)


def parse_postgres_uri(uri: str) -> Dict[str, str]:
    """Parse a PostgreSQL connection URI into its components.

    Args:
        uri: The connection URI, e.g. 'postgresql://user:pass@host:port/db'
            or 'postgresql+asyncpg://user:pass@host/db'. Surrounding
            whitespace is ignored.

    Returns:
        Dictionary with the keys USER, PASS, HOST, PORT and DB. Missing parts
        keep their defaults: empty string for USER, PASS and HOST, '5432' for
        PORT and 'postgres' for DB. USER, PASS and DB are percent-decoded.
        When the URI carries no password, a `password=` query parameter is
        used instead.

    Raises:
        SystemExit: with code 1 when the URI does not match the expected
            format; the reason is written to stderr first.
    """
    result = {
        'USER': '',
        'PASS': '',
        'HOST': '',
        'PORT': '5432',   # Default PostgreSQL port
        'DB': 'postgres'  # Default database name
    }

    try:
        match = _URI_PATTERN.match(uri.strip())
        if not match:
            raise ValueError("Invalid PostgreSQL URI format")

        user, password, host, port, dbname, query = match.groups()

        if user:
            result['USER'] = urllib.parse.unquote(user)
        if password:
            result['PASS'] = urllib.parse.unquote(password)
        if host:
            result['HOST'] = host
        if port:
            result['PORT'] = port
        if dbname:
            result['DB'] = urllib.parse.unquote(dbname)

        # Fall back to ?password=... only when the authority had no password.
        if query and not result['PASS']:
            for param in query.split('&'):
                if '=' in param:
                    key, value = param.split('=', 1)
                    if key.lower() == 'password':
                        result['PASS'] = urllib.parse.unquote(value)
                        break

    except Exception as e:
        # Report the failure on stderr and stop; stdout stays free of
        # partial results.
        print(f"Error parsing PostgreSQL URI: {e}", file=sys.stderr)
        print("POSTGRES_URI_PARSE_FAILURE=1", file=sys.stderr)
        sys.exit(1)

    return result


def _shell_escape(value: str) -> str:
    """Escape `value` for use inside a double-quoted shell string."""
    # Backslash goes first; otherwise the backslashes added for the other
    # characters would themselves be doubled.
    for ch in ('\\', '"', '`', '$'):
        value = value.replace(ch, '\\' + ch)
    return value


def main():
    """Command-line entry point: print PG_* assignments for sys.argv[1]."""
    if len(sys.argv) != 2:
        print("Usage: eval \"$(parse-postgres-uri.py 'postgresql://user:pass@host:port/db')\"", file=sys.stderr)
        sys.exit(1)

    # Print default empty values first (for compatibility with original script)
    print('PG_USER=""')
    print('PG_PASS=""')
    print('PG_HOST=""')
    print('PG_PORT=""')
    print('PG_DB=""')

    # Parse the URI and print the results
    try:
        components = parse_postgres_uri(sys.argv[1])
        for key, value in components.items():
            print(f'PG_{key}="{_shell_escape(value)}"')
    except Exception as e:
        print(f'Error parsing PostgreSQL URI: {e}', file=sys.stderr)
        print('POSTGRES_URI_PARSE_FAILURE=1')
        sys.exit(1)


if __name__ == "__main__":
    main()
0d3c9d67..5c22a713 100644 --- a/dockerfile +++ b/dockerfile @@ -101,96 +101,11 @@ ENV VIRTUAL_ENV=/app/.venv ENV PATH="/app/.venv/bin:/usr/local/bin:${PATH}" # Create default configuration -RUN echo '[api]\n\ -host = "0.0.0.0"\n\ -port = 8000\n\ -reload = false\n\ -\n\ -[auth]\n\ -jwt_algorithm = "HS256"\n\ -dev_mode = true\n\ -dev_entity_id = "dev_user"\n\ -dev_entity_type = "developer"\n\ -dev_permissions = ["read", "write", "admin"]\n\ -\n\ -[completion]\n\ -provider = "ollama"\n\ -model_name = "llama2"\n\ -base_url = "http://localhost:11434"\n\ -\n\ -[database]\n\ -provider = "postgres"\n\ -\n\ -[embedding]\n\ -provider = "ollama"\n\ -model_name = "nomic-embed-text"\n\ -dimensions = 768\n\ -similarity_metric = "cosine"\n\ -base_url = "http://localhost:11434"\n\ -\n\ -[parser]\n\ -chunk_size = 1000\n\ -chunk_overlap = 200\n\ -use_unstructured_api = false\n\ -\n\ -[reranker]\n\ -use_reranker = false\n\ -\n\ -[storage]\n\ -provider = "local"\n\ -storage_path = "/app/storage"\n\ -\n\ -[vector_store]\n\ -provider = "pgvector"\n\ -' > /app/morphik.toml.default +COPY docker/morphik.toml.default /app/morphik.toml.default # Create startup script -RUN echo '#!/bin/bash\n\ -set -e\n\ -\n\ -# Copy default config if none exists\n\ -if [ ! -f /app/morphik.toml ]; then\n\ - cp /app/morphik.toml.default /app/morphik.toml\n\ -fi\n\ -\n\ -# Function to check PostgreSQL\n\ -check_postgres() {\n\ - if [ -n "$POSTGRES_URI" ]; then\n\ - echo "Waiting for PostgreSQL..."\n\ - max_retries=30\n\ - retries=0\n\ - until PGPASSWORD=$PGPASSWORD pg_isready -h postgres -U morphik -d morphik; do\n\ - retries=$((retries + 1))\n\ - if [ $retries -eq $max_retries ]; then\n\ - echo "Error: PostgreSQL did not become ready in time"\n\ - exit 1\n\ - fi\n\ - echo "Waiting for PostgreSQL... (Attempt $retries/$max_retries)"\n\ - sleep 2\n\ - done\n\ - echo "PostgreSQL is ready!"\n\ - \n\ - # Verify database connection\n\ - if ! 
PGPASSWORD=$PGPASSWORD psql -h postgres -U morphik -d morphik -c "SELECT 1" > /dev/null 2>&1; then\n\ - echo "Error: Could not connect to PostgreSQL database"\n\ - exit 1\n\ - fi\n\ - echo "PostgreSQL connection verified!"\n\ - fi\n\ -}\n\ -\n\ -# Check PostgreSQL\n\ -check_postgres\n\ -\n\ -# Check if command arguments were passed ($# is the number of arguments)\n\ -if [ $# -gt 0 ]; then\n\ - # If arguments exist, execute them (e.g., execute "arq core.workers...")\n\ - exec "$@"\n\ -else\n\ - # Otherwise, execute the default command (uv run start_server.py)\n\ - exec uv run uvicorn core.api:app --host $HOST --port $PORT --loop asyncio --http auto --ws auto --lifespan auto\n\ -fi\n\ -' > /app/docker-entrypoint.sh && chmod +x /app/docker-entrypoint.sh +COPY docker/parse-postgres-uri.py /app/parse-postgres-uri.py +COPY docker/docker-entrypoint.sh /app/docker-entrypoint.sh # Copy application code # pyproject.toml is needed for uv to identify the project context for `uv run` diff --git a/ee/ui-component/package.json b/ee/ui-component/package.json index 7053469e..def83d82 100644 --- a/ee/ui-component/package.json +++ b/ee/ui-component/package.json @@ -1,6 +1,6 @@ { "name": "@morphik/ui", - "version": "0.2.22", + "version": "0.2.24", "private": true, "description": "Modern UI component for Morphik - A powerful document processing and querying system", "author": "Morphik Team",