diff --git a/01-tutorials/01-fundamentals/02-model-providers/03-llamacpp-model/.gitignore b/01-tutorials/01-fundamentals/02-model-providers/03-llamacpp-model/.gitignore new file mode 100644 index 00000000..a271c463 --- /dev/null +++ b/01-tutorials/01-fundamentals/02-model-providers/03-llamacpp-model/.gitignore @@ -0,0 +1,39 @@ +# Model files +models/*.gguf + +# Python +__pycache__/ +*.py[cod] +*$py.class +.Python +venv/ +env/ + +# Jupyter +.ipynb_checkpoints/ +*.ipynb_checkpoints + +# Audio files +*.wav +*.mp3 +*.m4a + +# Image files +*.png +*.jpg +*.jpeg +!docs/*.png + +# Logs +*.log +server.log + +# OS +.DS_Store +Thumbs.db + +# IDE +.vscode/ +.idea/ +*.swp +*.swo \ No newline at end of file diff --git a/01-tutorials/01-fundamentals/02-model-providers/03-llamacpp-model/README.md b/01-tutorials/01-fundamentals/02-model-providers/03-llamacpp-model/README.md new file mode 100644 index 00000000..1af91d4c --- /dev/null +++ b/01-tutorials/01-fundamentals/02-model-providers/03-llamacpp-model/README.md @@ -0,0 +1,202 @@ +# LlamaCpp Provider for Strands SDK + +This tutorial demonstrates using the LlamaCpp provider with Strands Agents. LlamaCpp enables running quantized models locally with advanced features like grammar constraints, multimodal support, and custom sampling parameters. + +## Prerequisites + +- Python 3.8+ +- llama.cpp with server support ([Installation Guide](https://github.com/ggerganov/llama.cpp)) +- 16GB RAM (minimum 8GB) +- 8GB storage for model files + +## Quick Start + +### 1. Install Dependencies + +```bash +pip install -r requirements.txt +``` + +### 2. Download Model Files + +Download the quantized Qwen2.5-Omni model for multimodal capabilities: + +```bash +# Create models directory +mkdir -p models && cd models + +# Download main model (4.68 GB) +huggingface-cli download ggml-org/Qwen2.5-Omni-7B-GGUF \ + Qwen2.5-Omni-7B-Q4_K_M.gguf --local-dir . + +# Download multimodal projector (1.55 GB) +huggingface-cli download ggml-org/Qwen2.5-Omni-7B-GGUF \ + mmproj-Qwen2.5-Omni-7B-Q8_0.gguf --local-dir . + +cd .. +``` + +Both files are required for audio and vision support. + +### 3. Start LlamaCpp Server + +```bash +llama-server -m models/Qwen2.5-Omni-7B-Q4_K_M.gguf \ + --mmproj models/mmproj-Qwen2.5-Omni-7B-Q8_0.gguf \ + --host 0.0.0.0 --port 8080 -c 8192 -ngl 50 --jinja +``` + +Key parameters: +- `-m`: Path to main model file +- `--mmproj`: Path to multimodal projector +- `-c`: Context window size (default: 8192) +- `-ngl`: Number of GPU layers (0 for CPU-only) +- `--jinja`: Enable template support for tools + +The server will log "loaded multimodal model" when ready. + +### 4. 
Run the Tutorial
+
+```bash
+jupyter notebook llamacpp.ipynb
+```
+
+## Key Features
+
+### Grammar Constraints
+
+Control output format using GBNF (GGML BNF, llama.cpp's extended Backus-Naur form) grammars:
+
+```python
+model.use_grammar_constraint('root ::= "yes" | "no"')
+```
+
+### Advanced Sampling
+
+LlamaCpp provides fine-grained control over text generation:
+
+- **Mirostat**: Dynamic perplexity control
+- **TFS**: Tail-free sampling for quality improvement
+- **Min-p**: Minimum probability threshold
+- **Custom sampler ordering**: Control the order of the sampling pipeline
+
+### Structured Output
+
+Generate validated JSON output using Pydantic models:
+
+```python
+agent.structured_output(MyModel, "Generate user data")
+```
+
+### Multimodal Capabilities
+
+- **Audio Input**: Process speech and audio files
+- **Vision Input**: Analyze images
+- **Combined Processing**: Simultaneous audio-visual understanding
+
+### Performance Optimization
+
+- Prompt caching for repeated queries
+- Slot-based session management
+- GPU acceleration with configurable layers
+
+## Tutorial Content
+
+The Jupyter notebook demonstrates:
+
+1. **Grammar Constraints**: Enforce specific output formats
+2. **Sampling Strategies**: Compare generation quality with different parameters
+3. **Structured Output**: Type-safe data generation
+4. **Tool Integration**: Function calling with LlamaCpp
+5. **Audio Processing**: Speech recognition and understanding
+6. **Image Analysis**: Visual content interpretation
+7. **Multimodal Agents**: Combined audio-visual processing
+8. **Performance Testing**: Optimization techniques and benchmarks
+
+## Additional Examples
+
+The `examples/` directory contains standalone Python scripts demonstrating specific features.
+
+## Parameter Reference
+
+### Standard Parameters
+
+- `temperature`: Controls randomness (0.0-2.0)
+- `max_tokens`: Maximum response length
+- `top_p`: Nucleus sampling threshold
+- `frequency_penalty`: Reduce repetition
+- `presence_penalty`: Encourage topic diversity
+
+### LlamaCpp-Specific Parameters
+
+- `grammar`: GBNF grammar string
+- `json_schema`: JSON schema for structured output
+- `mirostat`: Enable Mirostat sampling (0, 1, or 2)
+- `min_p`: Minimum probability cutoff
+- `repeat_penalty`: Penalize token repetition
+- `cache_prompt`: Enable prompt caching
+- `slot_id`: Session slot for multi-user support
+
+See the notebook for detailed parameter usage examples.
+
+## Hardware Requirements
+
+### Minimum Configuration
+- 8GB RAM
+- 4GB VRAM (or CPU-only mode)
+- 8GB storage
+
+### Recommended Configuration
+- 16GB RAM
+- 8GB+ VRAM
+- CUDA-capable GPU or Apple Silicon
+
+### GPU Acceleration
+- **NVIDIA**: Requires CUDA toolkit
+- **Apple Silicon**: Metal support included
+- **AMD**: ROCm support (experimental)
+- **CPU Mode**: Set `-ngl 0` when starting the server
+
+## About Quantized Models
+
+### What is Quantization?
+
+Quantization reduces model size by storing weights at lower numerical precision (e.g., 4-bit instead of 16-bit). This makes it possible to run large language models on consumer hardware with minimal quality loss.
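+
+As a rough sketch of the arithmetic (assuming ~7.6B parameters and ignoring quantization metadata and runtime overhead such as the KV cache), the savings can be estimated directly:
+
+```python
+params = 7.6e9                # Qwen2.5-Omni-7B parameter count
+fp16_gb = params * 2 / 1e9    # 16-bit weights: 2 bytes each -> ~15 GB
+q4_gb = params * 0.5 / 1e9    # 4-bit weights: 0.5 bytes each -> ~4 GB
+print(f"FP16: {fp16_gb:.1f} GB, Q4: {q4_gb:.1f} GB")
+```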
+ +### Qwen2.5-Omni-7B Model + +- **Parameters**: 7.6 billion +- **Quantization**: 4-bit (Q4_K_M format) +- **Size**: 4.68GB (vs ~15GB unquantized) +- **Context**: 8,192 tokens (expandable to 32K) +- **Languages**: 23 languages supported + +### LlamaCpp vs Ollama + +| Feature | LlamaCpp | Ollama | +|---------|----------|--------| +| **Model Format** | GGUF files | Modelfile abstraction | +| **Control** | Full parameter access | Simplified interface | +| **Features** | Grammar, multimodal, sampling | Basic generation | +| **Use Case** | Advanced applications | Quick prototyping | + +LlamaCpp provides lower-level control suitable for production applications requiring specific output formats or advanced features. + +## Troubleshooting + +### Common Issues + +1. **Server won't start**: Verify llama.cpp installation and model file paths +2. **Out of memory**: Reduce GPU layers with `-ngl` parameter +3. **No multimodal support**: Ensure both model files are downloaded +4. **Slow performance**: Enable GPU acceleration or reduce context size + +### Additional Resources + +- [LlamaCpp Documentation](https://github.com/ggerganov/llama.cpp) +- [GGUF Format Specification](https://github.com/ggerganov/ggml/blob/master/docs/gguf.md) +- [Strands SDK Documentation](https://docs.strands.dev) + +## License + +MIT \ No newline at end of file diff --git a/01-tutorials/01-fundamentals/02-model-providers/03-llamacpp-model/llamacpp.ipynb b/01-tutorials/01-fundamentals/02-model-providers/03-llamacpp-model/llamacpp.ipynb new file mode 100644 index 00000000..4ce4c030 --- /dev/null +++ b/01-tutorials/01-fundamentals/02-model-providers/03-llamacpp-model/llamacpp.ipynb @@ -0,0 +1,604 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Building a Multimodal Agent with LlamaCpp and Strands SDK\n", + "\n", + "This notebook demonstrates how to create agents using LlamaCpp with Strands SDK. You'll learn to use quantized models locally with advanced features like grammar constraints, multimodal processing, and custom tools.\n", + "\n", + "LlamaCpp supports any GGUF-format quantized model. You can easily switch between models by downloading different GGUF files and updating the model path. Popular options include Llama, Mistral, Phi, and Qwen families. Simply change the model file in your server command to use a different model:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup and Installation\n", + "\n", + "For this tutorial, we use Qwen2.5-Omni for its multimodal capabilities (audio + vision + text).\n", + "\n", + "Before running this notebook, ensure you have:\n", + "\n", + "1. **Python 3.8+** installed\n", + "2. **llama.cpp** with server support ([Installation Guide](https://github.com/ggerganov/llama.cpp))\n", + "3. **Model files** downloaded:\n", + "\n", + "```bash\n", + "# Download Qwen2.5-Omni model files\n", + "mkdir -p models && cd models\n", + "\n", + "# Main model (4.68 GB)\n", + "huggingface-cli download ggml-org/Qwen2.5-Omni-7B-GGUF \\\n", + " Qwen2.5-Omni-7B-Q4_K_M.gguf --local-dir .\n", + "\n", + "# Multimodal projector (1.55 GB) - Required for audio/vision\n", + "huggingface-cli download ggml-org/Qwen2.5-Omni-7B-GGUF \\\n", + " mmproj-Qwen2.5-Omni-7B-Q8_0.gguf --local-dir .\n", + "\n", + "cd ..\n", + "```\n", + "\n", + "4. 
**Start the server**:\n", + "\n", + "```bash\n", + "llama-server -m models/Qwen2.5-Omni-7B-Q4_K_M.gguf \\\n", + " --mmproj models/mmproj-Qwen2.5-Omni-7B-Q8_0.gguf \\\n", + " --host 0.0.0.0 --port 8080 -c 8192 -ngl 50 --jinja\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Install Python Dependencies\n", + "\n", + "Install the Strands SDK and required libraries for audio processing, image handling, and notebook widgets." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Install dependencies from requirements.txt\n", + "!pip install -q -r requirements.txt" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Import Required Libraries\n", + "\n", + "Import the Strands SDK components and utility functions we'll use throughout this tutorial. The utils folder contains helper functions for audio, image, grammar, and benchmarking operations." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import sys\n", + "import json\n", + "from datetime import datetime\n", + "from typing import List, Dict, Any\n", + "from pathlib import Path\n", + "\n", + "# Add utils to path\n", + "utils_path = os.path.join(os.getcwd(), 'utils')\n", + "if utils_path not in sys.path:\n", + " sys.path.append(utils_path)\n", + "\n", + "# Import Strands SDK\n", + "from strands import Agent, tool\n", + "from strands.models.llamacpp import LlamaCppModel\n", + "from pydantic import BaseModel, Field\n", + "\n", + "# Import utilities\n", + "from utils import (\n", + " # Audio utilities\n", + " AudioRecorder, create_audio_interface, display_audio_interface,\n", + " \n", + " # Image utilities \n", + " create_test_image, image_to_bytes, analyze_image_with_llamacpp,\n", + " \n", + " # Grammar and sampling utilities\n", + " demonstrate_grammar_constraint, test_sampling_strategy,\n", + " get_predefined_grammars, get_sampling_strategies,\n", + " \n", + " # Benchmark utilities\n", + " benchmark_performance, run_comprehensive_benchmark\n", + ")\n", + "\n", + "# IPython for multimedia display\n", + "from IPython.display import Audio, Image as IPImage, display, HTML\n", + "import ipywidgets as widgets" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Define Structured Data Models\n", + "\n", + "First, define Pydantic models that will be used for type-safe structured output generation. These models ensure the AI generates data in exactly the format you need." 
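+    ,
+    "\n",
+    "\n",
+    "Once defined, a model can be handed to `agent.structured_output()` to get back a validated instance instead of free text. A minimal sketch (assuming an agent has already been created, and using the `TaskPlan` model defined below):\n",
+    "\n",
+    "```python\n",
+    "plan = agent.structured_output(TaskPlan, \"Plan a 30-minute home workout\")\n",
+    "print(plan.title, plan.estimated_time, plan.difficulty)\n",
+    "```"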
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Define structured output models\n", + "class TaskPlan(BaseModel):\n", + " \"\"\"A structured task plan.\"\"\"\n", + " title: str = Field(description=\"Brief title of the task\")\n", + " steps: List[str] = Field(description=\"List of steps to complete\")\n", + " estimated_time: int = Field(description=\"Estimated time in minutes\")\n", + " difficulty: str = Field(description=\"Easy, Medium, or Hard\")\n", + " \n", + "class ProductReview(BaseModel):\n", + " \"\"\"A structured product review.\"\"\"\n", + " product_name: str = Field(description=\"Name of the product\")\n", + " rating: int = Field(description=\"Rating from 1 to 5\")\n", + " pros: List[str] = Field(description=\"Positive aspects\")\n", + " cons: List[str] = Field(description=\"Negative aspects\")\n", + " recommendation: bool = Field(description=\"Would you recommend it?\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Sampling Parameters\n", + "\n", + "LlamaCpp offers fine-grained control over text generation through various sampling strategies. The following examples demonstrate how different parameters affect output quality and creativity." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Test different sampling strategies\n", + "strategies = get_sampling_strategies()\n", + "prompt = \"Write a creative story opening about a mysterious door:\"\n", + "\n", + "# Test first 3 strategies\n", + "for strategy in strategies[:3]:\n", + " test_sampling_strategy(\n", + " params=strategy[\"params\"],\n", + " name=strategy[\"name\"],\n", + " prompt=prompt\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Grammar Constraints in Action\n", + "\n", + "Test predefined GBNF grammars that force specific output formats. Watch how the model's responses are constrained to match exact patterns like yes/no, multiple choice, or JSON structures." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Demonstrate various grammar constraints\n", + "grammars = get_predefined_grammars()\n", + "\n", + "# Test a few interesting examples\n", + "examples_to_test = [\"yes_no\", \"multiple_choice\", \"simple_json\", \"color_names\"]\n", + "\n", + "for grammar_name in examples_to_test:\n", + " if grammar_name in grammars:\n", + " grammar_info = grammars[grammar_name]\n", + " \n", + " demonstrate_grammar_constraint(\n", + " grammar=grammar_info[\"grammar\"],\n", + " prompt=grammar_info[\"example_prompt\"],\n", + " description=f\"{grammar_name.upper()}: {grammar_info['description']}\",\n", + " base_url=\"http://localhost:8080\",\n", + " temperature=0.1,\n", + " max_tokens=50\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Custom Grammar Examples\n", + "\n", + "Create your own GBNF grammar for specific use cases and use JSON schemas as an alternative constraint method. Both approaches guarantee structured output without post-processing." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Example: Create a custom grammar for star ratings\n", + "star_rating_grammar = '''root ::= rating \" stars\"\n", + "rating ::= \"1\" | \"2\" | \"3\" | \"4\" | \"5\"'''\n", + "\n", + "# Create model with custom grammar\n", + "model = LlamaCppModel(\n", + " base_url=\"http://localhost:8080\",\n", + " params={\"temperature\": 0.1, \"max_tokens\": 20}\n", + ")\n", + "\n", + "# Apply the grammar constraint\n", + "model.use_grammar_constraint(star_rating_grammar)\n", + "agent = Agent(model=model)\n", + "\n", + "# Test the constraint\n", + "test_prompts = [\n", + " \"How would you rate this restaurant?\",\n", + " \"What's your opinion on this movie?\",\n", + " \"Rate the customer service experience:\"\n", + "]\n", + "\n", + "for prompt in test_prompts:\n", + " response = agent(prompt)\n", + " print(f\"{prompt} -> {response}\")\n", + "\n", + "# Example: JSON Schema constraint\n", + "json_model = LlamaCppModel(\n", + " base_url=\"http://localhost:8080\",\n", + " params={\"temperature\": 0.3, \"max_tokens\": 100}\n", + ")\n", + "\n", + "# Define JSON schema for structured output\n", + "product_schema = {\n", + " \"type\": \"object\",\n", + " \"properties\": {\n", + " \"product_name\": {\"type\": \"string\"},\n", + " \"price\": {\"type\": \"number\", \"minimum\": 0},\n", + " \"category\": {\"type\": \"string\", \"enum\": [\"electronics\", \"clothing\", \"food\", \"books\"]},\n", + " \"in_stock\": {\"type\": \"boolean\"}\n", + " },\n", + " \"required\": [\"product_name\", \"price\", \"category\", \"in_stock\"]\n", + "}\n", + "\n", + "json_model.use_json_schema(product_schema)\n", + "json_agent = Agent(model=json_model)\n", + "\n", + "response = json_agent(\"Generate information for a laptop product:\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Custom Tools\n", + "\n", + "Extend your agent's capabilities by defining custom functions that the model can call. The following tools demonstrate how to add domain-specific functionality to your agent." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Define custom tools\n", + "@tool\n", + "def calculate_bmi(weight_kg: float, height_m: float) -> Dict[str, Any]:\n", + " \"\"\"\n", + " Calculate Body Mass Index (BMI).\n", + " \n", + " Args:\n", + " weight_kg: Weight in kilograms\n", + " height_m: Height in meters\n", + " \n", + " Returns:\n", + " BMI value and category\n", + " \"\"\"\n", + " bmi = weight_kg / (height_m ** 2)\n", + " \n", + " if bmi < 18.5:\n", + " category = \"Underweight\"\n", + " elif bmi < 25:\n", + " category = \"Normal weight\"\n", + " elif bmi < 30:\n", + " category = \"Overweight\"\n", + " else:\n", + " category = \"Obese\"\n", + " \n", + " return {\n", + " \"bmi\": round(bmi, 2),\n", + " \"category\": category,\n", + " \"healthy_range\": \"18.5 - 24.9\"\n", + " }\n", + "\n", + "@tool\n", + "def get_weather_description(condition: str) -> str:\n", + " \"\"\"\n", + " Get a poetic description of weather conditions.\n", + " \n", + " Args:\n", + " condition: Weather condition (sunny, rainy, cloudy, etc.)\n", + " \n", + " Returns:\n", + " Poetic weather description\n", + " \"\"\"\n", + " descriptions = {\n", + " \"sunny\": \"Golden rays dance across azure skies\",\n", + " \"rainy\": \"Silver droplets paint the world anew\",\n", + " \"cloudy\": \"Cotton castles drift through endless blue\",\n", + " \"snowy\": \"Crystal blankets hush the sleeping earth\"\n", + " }\n", + " \n", + " return descriptions.get(condition.lower(), f\"The weather shows its {condition} face\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create Agent with Tools\n", + "\n", + "Initialize an agent with access to your custom tools. The agent will automatically determine when to use these tools based on the user's query." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Create agent with tools\n", + "model = LlamaCppModel(\n", + " base_url=\"http://localhost:8080\",\n", + " params={\n", + " \"temperature\": 0.7,\n", + " \"max_tokens\": 300,\n", + " \"top_k\": 40\n", + " }\n", + ")\n", + "\n", + "agent = Agent(\n", + " model=model,\n", + " tools=[calculate_bmi, get_weather_description],\n", + " system_prompt=\"You are a helpful assistant with access to calculation and description tools.\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Test Tool Usage\n", + "\n", + "Observe how the agent intelligently calls the appropriate tools based on natural language queries. The agent handles both single and compound tool requests seamlessly." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Test tool usage\n", + "test_queries = [\n", + " \"What's the BMI for someone who is 1.75m tall and weighs 70kg?\",\n", + " \"Give me a poetic description of rainy weather\",\n", + " \"Calculate BMI for 85kg and 1.80m, then describe sunny weather\"\n", + "]\n", + "\n", + "for query in test_queries:\n", + " response = agent(query)\n", + " print(f\"Q: {query}\")\n", + " print(f\"A: {response}\\n\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Multimodal Processing\n", + "\n", + "Process audio and images alongside text using Qwen2.5-Omni's multimodal capabilities:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Create speech recognition interface\n", + "recorder = AudioRecorder(sample_rate=16000)\n", + "\n", + "# Create interface for multilingual speech recognition\n", + "interface_components = create_audio_interface(\n", + " recorder=recorder,\n", + " base_url=\"http://localhost:8080\"\n", + ")\n", + "\n", + "# Display the interface\n", + "display_audio_interface(interface_components)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + " Example: Audio message format for Qwen2.5-Omni\n", + "\n", + "```python\n", + "example_message = {\n", + " \"role\": \"user\",\n", + " \"content\": [\n", + " {\n", + " \"type\": \"audio\",\n", + " \"audio\": {\n", + " \"data\": \"base64_encoded_audio_data_here\",\n", + " \"format\": \"wav\"\n", + " }\n", + " },\n", + " {\n", + " \"type\": \"text\", \n", + " \"text\": \"Please transcribe exactly what was said. If not in English, provide: 1) Original transcription 2) Language detected 3) English translation\"\n", + " }\n", + " ]\n", + "}\n", + "```\n", + " The SDK handles this formatting automatically when using the interface above" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Create and analyze test image\n", + "test_image = create_test_image()\n", + "display(test_image)\n", + "\n", + "# Analyze image\n", + "analysis = analyze_image_with_llamacpp(\n", + " test_image,\n", + " \"Describe this image in detail. What shapes and colors do you see?\",\n", + " max_tokens=200\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Performance Optimization\n", + "\n", + "Compare three optimization strategies to understand the trade-offs between quality, speed, and resource usage. Each configuration is tailored for different production scenarios." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Test different performance optimization settings\n", + "\n", + "# Configuration 1: High Quality (slower)\n", + "high_quality_model = LlamaCppModel(\n", + " base_url=\"http://localhost:8080\",\n", + " params={\n", + " \"temperature\": 0.3,\n", + " \"top_k\": 10,\n", + " \"repeat_penalty\": 1.2,\n", + " \"max_tokens\": 100\n", + " }\n", + ")\n", + "\n", + "# Configuration 2: Balanced Performance\n", + "balanced_model = LlamaCppModel(\n", + " base_url=\"http://localhost:8080\",\n", + " params={\n", + " \"temperature\": 0.7,\n", + " \"top_k\": 40,\n", + " \"min_p\": 0.05,\n", + " \"max_tokens\": 100,\n", + " \"cache_prompt\": True # Enable prompt caching\n", + " }\n", + ")\n", + "\n", + "# Configuration 3: Speed Optimized\n", + "speed_model = LlamaCppModel(\n", + " base_url=\"http://localhost:8080\",\n", + " params={\n", + " \"temperature\": 0.8,\n", + " \"top_k\": 20,\n", + " \"max_tokens\": 100,\n", + " \"cache_prompt\": True,\n", + " \"n_probs\": 0 # Disable probability computation\n", + " }\n", + ")\n", + "\n", + "# Test each configuration\n", + "prompt = \"Explain machine learning in simple terms:\"\n", + "\n", + "agent_hq = Agent(model=high_quality_model)\n", + "response_hq = agent_hq(prompt)\n", + "\n", + "agent_balanced = Agent(model=balanced_model)\n", + "response_balanced = agent_balanced(prompt)\n", + "\n", + "agent_speed = Agent(model=speed_model)\n", + "response_speed = agent_speed(prompt)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Performance Benchmark\n", + "\n", + "Run a comprehensive benchmark to measure response times and quality across different configurations. This data helps you choose optimal settings for your specific use case." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Run comprehensive performance benchmark\n", + "benchmark_results = run_comprehensive_benchmark(base_url=\"http://localhost:8080\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Next Steps\n", + "\n", + "This notebook demonstrated the key features of LlamaCpp with Strands SDK. You can now:\n", + "\n", + "- Experiment with different GGUF models from Hugging Face\n", + "- Create custom grammars for your specific use cases\n", + "- Build production applications with local AI\n", + "- Explore multimodal capabilities with other models\n", + "\n", + "For more examples and documentation, visit the [Strands SDK Documentation](https://docs.strands.ai)." 
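+    ,
+    "\n",
+    "\n",
+    "As a sketch of switching models (the repository and filename below are illustrative; any instruction-tuned GGUF from Hugging Face works), only the download and the `-m` argument change:\n",
+    "\n",
+    "```bash\n",
+    "huggingface-cli download bartowski/Mistral-7B-Instruct-v0.3-GGUF \\\n",
+    "    Mistral-7B-Instruct-v0.3-Q4_K_M.gguf --local-dir models\n",
+    "\n",
+    "llama-server -m models/Mistral-7B-Instruct-v0.3-Q4_K_M.gguf \\\n",
+    "    --host 0.0.0.0 --port 8080 -c 8192 --jinja\n",
+    "```"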
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.11"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/01-tutorials/01-fundamentals/02-model-providers/03-llamacpp-model/requirements.txt b/01-tutorials/01-fundamentals/02-model-providers/03-llamacpp-model/requirements.txt
new file mode 100644
index 00000000..a680aeec
--- /dev/null
+++ b/01-tutorials/01-fundamentals/02-model-providers/03-llamacpp-model/requirements.txt
@@ -0,0 +1,26 @@
+# Core Strands SDK
+strands-agents>=1.4.0
+
+# Web framework
+fastapi>=0.115.0
+
+# Data validation
+pydantic>=2.0.0
+
+# Numerical operations
+numpy>=1.24.0
+
+# HTTP requests
+requests>=2.31.0
+
+# Audio recording and processing
+sounddevice>=0.4.6
+scipy>=1.10.0
+soundfile>=0.12.0
+
+# Image processing
+pillow>=10.0.0
+
+# Jupyter notebook support
+jupyter>=1.0.0
+ipywidgets>=8.0.0
\ No newline at end of file
diff --git a/01-tutorials/01-fundamentals/02-model-providers/03-llamacpp-model/utils/README.md b/01-tutorials/01-fundamentals/02-model-providers/03-llamacpp-model/utils/README.md
new file mode 100644
index 00000000..500799ed
--- /dev/null
+++ b/01-tutorials/01-fundamentals/02-model-providers/03-llamacpp-model/utils/README.md
@@ -0,0 +1,162 @@
+# LlamaCpp Tutorial Utilities
+
+Support modules for the LlamaCpp tutorial notebook.
+
+## Modules
+
+### audio_recorder.py
+Audio recording and analysis utilities.
+
+**Classes:**
+- `AudioRecorder`: Simple audio recorder for capturing microphone input
+  - Record audio with configurable duration and sample rate
+  - Play back recorded audio
+  - Convert audio to bytes for SDK integration
+
+**Functions:**
+- `create_audio_interface()`: Creates a comprehensive Jupyter widget interface
+- `display_audio_interface()`: Displays the audio interface in notebooks
+
+**Features:**
+- Progress tracking during recording and analysis
+- Separate output areas for recording status, analysis, and transcription
+- Error handling with troubleshooting guidance
+- Support for Qwen2.5-Omni multimodal analysis
+
+### image_utils.py
+Image processing and analysis utilities.
+
+**Functions:**
+- `create_test_image()`: Create simple test images with geometric shapes
+- `create_complex_test_image()`: Create complex scenes for advanced testing
+- `image_to_bytes()`: Convert PIL images to bytes for the SDK
+- `analyze_image_with_llamacpp()`: Analyze images using LlamaCpp multimodal models
+- `create_image_analysis_demo()`: Complete image analysis demonstration
+- `load_external_image()`: Load images from file paths
+- `resize_image()`: Resize images while maintaining aspect ratio
+
+**Features:**
+- Programmatic test image generation
+- Direct integration with Strands SDK
+- Error handling for analysis failures
+- Support for various image formats
+
+### grammar_utils.py
+Grammar constraints and sampling utilities.
+
+**Functions:**
+- `demonstrate_grammar_constraint()`: Test specific GBNF grammar constraints
+- `get_predefined_grammars()`: Collection of common grammar patterns
+- `test_sampling_strategy()`: Test different sampling configurations
+- `get_sampling_strategies()`: Predefined sampling strategy configurations
+- `test_structured_output()`: Generate structured output with Pydantic models
+- `run_grammar_constraints_demo()`: Comprehensive grammar demonstration
+- `run_sampling_strategies_demo()`: Comprehensive sampling demonstration
+- `create_json_grammar()`: Generate GBNF grammars from JSON schemas
+
+**Features:**
+- Pre-built grammar patterns
+- Multiple sampling strategies
+- Structured output generation
+- Response timing analysis
+
+### benchmark_utils.py
+Performance benchmarking utilities.
+
+**Functions:**
+- `benchmark_performance()`: Comprehensive performance testing
+- `analyze_benchmark_results()`: Statistical analysis of benchmark data
+- `visualize_performance()`: Text-based performance visualizations
+- `run_comprehensive_benchmark()`: Complete benchmark suite with analysis
+
+**Features:**
+- Multiple configuration testing with statistical analysis
+- Performance comparison with baseline measurements
+- Text-based charts for response time, tokens/sec, and consistency
+- Recommendations based on benchmark results
+- Error handling for failed benchmark runs
+
+## Usage Examples
+
+### Audio Recording
+```python
+from utils import AudioRecorder, create_audio_interface, display_audio_interface
+
+# Create recorder
+recorder = AudioRecorder(sample_rate=16000)
+
+# Create interface
+interface = create_audio_interface(recorder)
+display_audio_interface(interface)
+```
+
+### Image Analysis
+```python
+from utils import create_test_image, analyze_image_with_llamacpp
+
+# Create and analyze image
+image = create_test_image()
+analysis = analyze_image_with_llamacpp(image, "Describe this image")
+print(analysis)
+```
+
+### Grammar Constraints
+```python
+from utils import demonstrate_grammar_constraint, get_predefined_grammars
+
+# Get available grammars
+grammars = get_predefined_grammars()
+
+# Test yes/no constraint
+demonstrate_grammar_constraint(
+    grammars["yes_no"]["grammar"],
+    "Is Python interpreted?",
+    "Yes/No responses only"
+)
+```
+
+### Performance Benchmarking
+```python
+from utils import run_comprehensive_benchmark
+
+# Run complete benchmark suite
+results = run_comprehensive_benchmark()
+print(f"Fastest config: {results['summary']['fastest_config']}")
+```
+
+## Dependencies
+
+- `strands-agents`: Strands SDK
+- `sounddevice`, `soundfile`, `scipy`: Audio processing
+- `pillow` (imported as `PIL`): Image manipulation
+- `ipywidgets`: Notebook widgets
+- `pydantic`: Data validation
+- `numpy`: Numerical operations
+
+## Integration with Notebook
+
+The main notebook imports all utilities with:
+
+```python
+from utils import (
+    # Audio utilities
+    AudioRecorder, create_audio_interface, display_audio_interface,
+
+    # Image utilities
+    create_test_image, analyze_image_with_llamacpp,
+
+    # Grammar utilities
+    demonstrate_grammar_constraint, get_predefined_grammars,
+
+    # Benchmark utilities
+    run_comprehensive_benchmark
+)
+```
+
+This keeps the notebook clean and focused on demonstrating LlamaCpp capabilities while maintaining all functionality in reusable, well-organized modules.
+ +## Notes + +- All functions include error handling +- Modular design for easy extension +- See individual module docstrings for detailed API documentation \ No newline at end of file diff --git a/01-tutorials/01-fundamentals/02-model-providers/03-llamacpp-model/utils/__init__.py b/01-tutorials/01-fundamentals/02-model-providers/03-llamacpp-model/utils/__init__.py new file mode 100644 index 00000000..a754371f --- /dev/null +++ b/01-tutorials/01-fundamentals/02-model-providers/03-llamacpp-model/utils/__init__.py @@ -0,0 +1,75 @@ +""" +Utils package for LlamaCpp model provider demo. + +This package contains helper utilities for the LlamaCpp demo notebook, +organized into logical modules for better code organization and reusability. +""" + +from .audio_recorder import ( + AudioRecorder, + create_audio_interface, + display_audio_interface, + clear_audio_interface_cache +) +from .image_utils import ( + create_test_image, + create_complex_test_image, + image_to_base64, + image_to_bytes, + analyze_image_with_llamacpp, + create_image_analysis_demo, + load_external_image, + resize_image +) +from .grammar_utils import ( + demonstrate_grammar_constraint, + test_sampling_strategy, + get_predefined_grammars, + get_sampling_strategies, + run_grammar_constraints_demo, + run_sampling_strategies_demo, + test_structured_output, + create_json_grammar +) +from .benchmark_utils import ( + benchmark_performance, + analyze_benchmark_results, + visualize_performance, + run_comprehensive_benchmark +) + +__all__ = [ + # Audio utilities + 'AudioRecorder', + 'create_audio_interface', + 'display_audio_interface', + 'clear_audio_interface_cache', + + # Image utilities + 'create_test_image', + 'image_to_bytes', + 'analyze_image_with_llamacpp', + + # Grammar and sampling utilities + 'demonstrate_grammar_constraint', + 'test_sampling_strategy', + 'get_predefined_grammars', + 'get_sampling_strategies', + + # Benchmark utilities + 'benchmark_performance', + 'run_comprehensive_benchmark', + + # Additional utilities (not used in notebook but available) + 'create_complex_test_image', + 'image_to_base64', + 'create_image_analysis_demo', + 'load_external_image', + 'resize_image', + 'run_grammar_constraints_demo', + 'run_sampling_strategies_demo', + 'test_structured_output', + 'create_json_grammar', + 'analyze_benchmark_results', + 'visualize_performance', +] \ No newline at end of file diff --git a/01-tutorials/01-fundamentals/02-model-providers/03-llamacpp-model/utils/audio_recorder.py b/01-tutorials/01-fundamentals/02-model-providers/03-llamacpp-model/utils/audio_recorder.py new file mode 100644 index 00000000..65c6724b --- /dev/null +++ b/01-tutorials/01-fundamentals/02-model-providers/03-llamacpp-model/utils/audio_recorder.py @@ -0,0 +1,329 @@ +""" +Audio recording utilities for the LlamaCpp tutorial. + +Provides audio recording and speech transcription functionality +for multimodal AI applications. 
+""" + +import os +import base64 +import tempfile +import threading +import time +from typing import Optional + +import numpy as np +import sounddevice as sd +import soundfile as sf +import ipywidgets as widgets +from IPython.display import HTML, display + +from strands import Agent +from strands.models.llamacpp import LlamaCppModel + + +class AudioRecorder: + """Audio recorder for speech capture and processing.""" + + def __init__(self, sample_rate: int = 16000): + self.sample_rate = sample_rate + self.recording: Optional[np.ndarray] = None + self.is_recording = False + + def record(self, duration: int = 5) -> np.ndarray: + """Record audio for specified duration.""" + self.recording = sd.rec( + int(duration * self.sample_rate), + samplerate=self.sample_rate, + channels=1, + dtype='float32' + ) + sd.wait() + return self.recording + + def get_audio_bytes(self) -> bytes: + """Get audio data as bytes for SDK.""" + if self.recording is None: + raise ValueError("No recording available") + + with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp_file: + sf.write(tmp_file.name, self.recording, self.sample_rate, format='WAV') + tmp_filename = tmp_file.name + + with open(tmp_filename, 'rb') as f: + audio_bytes = f.read() + + os.unlink(tmp_filename) + return audio_bytes + + def play(self) -> None: + """Play the recorded audio.""" + if self.recording is None: + raise ValueError("No recording available") + sd.play(self.recording, self.sample_rate) + sd.wait() + + +# Global widget cache to prevent duplication +_audio_interface_cache = {} + +def create_audio_interface(recorder: AudioRecorder, base_url: str = "http://localhost:8080") -> dict: + """ + Create an audio recording interface for speech transcription. + + Args: + recorder: AudioRecorder instance + base_url: LlamaCpp server URL + + Returns: + Dictionary containing interface widgets and handlers + """ + + # Use cached widgets if they exist to prevent duplication + cache_key = id(recorder) + if cache_key in _audio_interface_cache: + cached = _audio_interface_cache[cache_key] + # Clear existing outputs + cached['widgets']['recording_output'].clear_output() + cached['widgets']['analysis_output'].clear_output() + cached['widgets']['status_label'].value = "Ready to record" + cached['widgets']['play_button'].disabled = True + cached['widgets']['analyze_button'].disabled = True + cached['widgets']['progress_bar'].value = 0 + cached['widgets']['progress_bar'].layout.visibility = 'hidden' + return cached + + # Basic controls + duration_slider = widgets.IntSlider( + value=5, + min=1, + max=15, + step=1, + description='Duration (sec):', + style={'description_width': '100px'} + ) + + record_button = widgets.Button( + description='Record', + button_style='info', + layout=widgets.Layout(width='80px') + ) + + play_button = widgets.Button( + description='Play', + button_style='success', + disabled=True, + layout=widgets.Layout(width='80px') + ) + + analyze_button = widgets.Button( + description='Transcribe', + button_style='primary', + disabled=True, + layout=widgets.Layout(width='90px') + ) + + clear_button = widgets.Button( + description='Clear', + button_style='warning', + layout=widgets.Layout(width='80px') + ) + + # Status label + status_label = widgets.Label(value="Ready to record") + + # Output areas + recording_output = widgets.Output(layout=widgets.Layout(height='50px')) + analysis_output = widgets.Output(layout=widgets.Layout(height='200px', overflow='auto')) + + # Progress bar + progress_bar = widgets.IntProgress( + value=0, + min=0, + 
max=100, + description='', + bar_style='info', + layout=widgets.Layout(width='100%', visibility='hidden') + ) + + def on_record_click(b): + """Handle record button click.""" + recording_output.clear_output(wait=True) + with recording_output: + status_label.value = f"Recording for {duration_slider.value} seconds..." + progress_bar.layout.visibility = 'visible' + progress_bar.value = 0 + + def update_progress(): + for i in range(duration_slider.value * 10): + time.sleep(0.1) + progress_bar.value = (i + 1) / (duration_slider.value * 10) * 100 + + progress_thread = threading.Thread(target=update_progress) + progress_thread.start() + + recorder.record(duration_slider.value) + + progress_thread.join() + progress_bar.layout.visibility = 'hidden' + + status_label.value = "Recording ready" + play_button.disabled = False + analyze_button.disabled = False + + def on_play_click(b): + """Handle play button click.""" + recording_output.clear_output(wait=True) + with recording_output: + status_label.value = "Playing audio..." + recorder.play() + status_label.value = "Recording ready" + + def on_analyze_click(b): + """Handle analyze button click.""" + analysis_output.clear_output(wait=True) + + status_label.value = "Transcribing audio..." + progress_bar.layout.visibility = 'visible' + progress_bar.value = 20 + + try: + # Get audio bytes + audio_bytes = recorder.get_audio_bytes() + progress_bar.value = 40 + + # Create LlamaCpp model + clean_base_url = base_url.rstrip('/').replace('/v1', '') + model = LlamaCppModel( + base_url=clean_base_url, + params={"temperature": 0.7, "max_tokens": 300} + ) + agent = Agent(model=model) + progress_bar.value = 60 + + # Create message with audio content + message_content = [ + { + "audio": { + "source": {"bytes": audio_bytes}, + "format": "wav" + } + }, + { + "text": "Please transcribe exactly what was said in this audio recording. If the speech is in a language other than English, first provide the exact transcription in the original language, then provide an English translation. Format your response as:\n1. Original transcription: [exact words spoken]\n2. Language detected: [language name]\n3. English translation: [translation if needed, or 'Already in English']" + } + ] + + progress_bar.value = 80 + response = agent(message_content) + progress_bar.value = 100 + + # Extract and display response + with analysis_output: + if hasattr(response, 'message') and 'content' in response.message: + full_response = "" + for content_block in response.message['content']: + if 'text' in content_block: + full_response += content_block['text'] + display(HTML(f'
<div>{full_response}</div>'))
+ else:
+ display(HTML(f'<div>{str(response)}</div>'))
+
+ status_label.value = "Transcription complete"
+
+ except Exception as e:
+ with analysis_output:
+ display(HTML(f'