From ace7418e67d77725228d433adcde0e097d3ecc78 Mon Sep 17 00:00:00 2001 From: Manoj B Bhamsagar Date: Sun, 19 Oct 2025 06:27:16 +0530 Subject: [PATCH 1/4] feat(confluence): make SVG processing optional to fix pycairo installation issues This change addresses installation failures on Debian/Ubuntu systems where svglib 1.6.0 introduced breaking changes that require pycairo compilation, which fails without gcc and cairo-dev system libraries. Changes: - Move svglib dependency to optional extras: pip install 'llama-index-readers-confluence[svg]' - Add graceful degradation in process_svg() when dependencies unavailable - Add FileType.SVG enum for custom parser support - Add comprehensive migration guide with 4 different approaches - Add unit tests for optional dependency behavior - Add working examples for all SVG processing options - Update README and CHANGELOG Breaking Change: SVG processing now requires explicit installation with [svg] extra. Users who need SVG support should install with: pip install 'llama-index-readers-confluence[svg]' Backward Compatibility: Maintained through graceful degradation - SVG attachments are skipped with informative warnings when dependencies are not installed. Fixes installation issues on systems without C compilers. Tested: 3 tests passed, 1 skipped (expected when svglib not installed) --- .../CHANGELOG.md | 13 + .../MIGRATION_GUIDE.md | 207 ++++++++++++++++ .../llama-index-readers-confluence/README.md | 25 +- .../examples/svg_parsing_examples.py | 228 ++++++++++++++++++ .../llama_index/readers/confluence/base.py | 51 +++- .../llama_index/readers/confluence/event.py | 1 + .../pyproject.toml | 6 +- .../requirements.txt | 3 +- .../tests/test_svg_optional.py | 128 ++++++++++ 9 files changed, 649 insertions(+), 13 deletions(-) create mode 100644 llama-index-integrations/readers/llama-index-readers-confluence/MIGRATION_GUIDE.md create mode 100644 llama-index-integrations/readers/llama-index-readers-confluence/examples/svg_parsing_examples.py create mode 100644 llama-index-integrations/readers/llama-index-readers-confluence/tests/test_svg_optional.py diff --git a/llama-index-integrations/readers/llama-index-readers-confluence/CHANGELOG.md b/llama-index-integrations/readers/llama-index-readers-confluence/CHANGELOG.md index 1bc7500fb8..76d8b2bfaf 100644 --- a/llama-index-integrations/readers/llama-index-readers-confluence/CHANGELOG.md +++ b/llama-index-integrations/readers/llama-index-readers-confluence/CHANGELOG.md @@ -1,5 +1,18 @@ # CHANGELOG +## [Unreleased] + +### Changed + +- **BREAKING**: Made SVG processing optional to avoid installation issues with pycairo dependency +- SVG support (`svglib`) moved to optional dependencies. Install with `pip install llama-index-readers-confluence[svg]` +- SVG attachments will be skipped with a warning if optional dependencies are not installed +- Pinned svglib to <1.6.0 to avoid breaking changes in newer versions + +### Fixed + +- Fixed installation failures on Debian/Ubuntu systems due to pycairo compilation issues + ## [0.1.8] - 2024-08-20 - Added observability events for ConfluenceReader diff --git a/llama-index-integrations/readers/llama-index-readers-confluence/MIGRATION_GUIDE.md b/llama-index-integrations/readers/llama-index-readers-confluence/MIGRATION_GUIDE.md new file mode 100644 index 0000000000..598f8e43c2 --- /dev/null +++ b/llama-index-integrations/readers/llama-index-readers-confluence/MIGRATION_GUIDE.md @@ -0,0 +1,207 @@ +# Migration Guide: SVG Support Changes + +## Overview + +Starting from version 0.4.5, SVG processing support has been moved to an optional dependency to address installation issues on systems where the `pycairo` package cannot be compiled (particularly Debian/Ubuntu systems without C compilers or Cairo development libraries). + +## What Changed? + +### Before (versions < 0.4.5) + +- `svglib` was a required dependency +- All users had to install `pycairo` even if they didn't need SVG support +- Installation could fail on systems without proper build tools + +### After (versions >= 0.4.5) + +- `svglib` is now an optional dependency +- SVG processing is skipped by default with a warning if optional dependencies are not installed +- Base installation works on all systems without requiring C compilers +- SVG version pinned to `<1.6.0` to avoid breaking changes + +## Migration Paths + +### Option 1: Continue Using Built-in SVG Support (Recommended if SVG is needed) + +If you need SVG processing and can install the required system dependencies: + +```bash +# Uninstall current version +pip uninstall llama-index-readers-confluence + +# Install with SVG support +pip install 'llama-index-readers-confluence[svg]' +``` + +**System Requirements for SVG Support:** + +- On Debian/Ubuntu: `sudo apt-get install gcc python3-dev libcairo2-dev` +- On macOS: `brew install cairo` +- On Windows: Install Visual C++ Build Tools + +### Option 2: Skip SVG Processing (Recommended for Docker/CI environments) + +If you don't need SVG processing or want to avoid installation issues: + +```bash +# Install without SVG support (default) +pip install llama-index-readers-confluence +``` + +SVG attachments will be skipped with a warning in the logs. All other functionality remains unchanged. + +### Option 3: Use Custom SVG Parser + +If you need SVG processing but cannot install pycairo, use a custom parser: + +```python +from llama_index.readers.confluence import ConfluenceReader +from llama_index.readers.confluence.event import FileType + + +# Simple text extraction from SVG (no OCR) +class SimpleSVGParser(BaseReader): + def load_data(self, file_path, **kwargs): + import xml.etree.ElementTree as ET + + with open(file_path, "r") as f: + root = ET.fromstring(f.read()) + + # Extract text elements from SVG + texts = [elem.text for elem in root.findall(".//text") if elem.text] + extracted_text = " ".join(texts) or "[SVG Image]" + + return [ + Document(text=extracted_text, metadata={"file_path": file_path}) + ] + + +reader = ConfluenceReader( + base_url="https://yoursite.atlassian.com/wiki", + api_token="your_token", + custom_parsers={FileType.SVG: SimpleSVGParser()}, +) +``` + +See `examples/svg_parsing_examples.py` for more custom parser examples. + +### Option 4: Filter Out SVG Attachments + +If you want to explicitly skip SVG files without warnings: + +```python +def attachment_filter( + media_type: str, file_size: int, title: str +) -> tuple[bool, str]: + if media_type == "image/svg+xml": + return False, "SVG processing disabled" + return True, "" + + +reader = ConfluenceReader( + base_url="https://yoursite.atlassian.com/wiki", + api_token="your_token", + process_attachment_callback=attachment_filter, +) +``` + +## Docker/Container Deployments + +### Before (versions < 0.4.5) + +```dockerfile +FROM python:3.11-slim + +# Required system dependencies for pycairo +RUN apt-get update && apt-get install -y \ + gcc \ + python3-dev \ + libcairo2-dev \ + && rm -rf /var/lib/apt/lists/* + +RUN pip install llama-index-readers-confluence +``` + +### After (versions >= 0.4.5) - Without SVG Support + +```dockerfile +FROM python:3.11-slim + +# No system dependencies needed! +RUN pip install llama-index-readers-confluence +``` + +### After (versions >= 0.4.5) - With SVG Support + +```dockerfile +FROM python:3.11-slim + +# Only if you need SVG support +RUN apt-get update && apt-get install -y \ + gcc \ + python3-dev \ + libcairo2-dev \ + && rm -rf /var/lib/apt/lists/* + +RUN pip install 'llama-index-readers-confluence[svg]' +``` + +## FAQ + +### Q: Will my existing code break? + +**A:** No, your existing code will continue to work. If you were using SVG processing and don't install the `[svg]` extra, SVG attachments will simply be skipped with a warning instead of failing. + +### Q: How do I know if SVG dependencies are installed? + +**A:** Check the logs. If you see warnings like "SVG processing skipped: Optional dependencies not installed", then SVG dependencies are not available. + +### Q: Can I use a different OCR engine for SVG? + +**A:** Yes! Use the custom parser approach (Option 3) and implement your own SVG-to-text conversion logic. You could use libraries like `cairosvg`, `pdf2image`, or pure XML parsing depending on your needs. + +### Q: Why was this change made? + +**A:** The `pycairo` dependency (required by `svglib`) requires C compilation and system libraries (Cairo). This caused installation failures in: + +- Docker containers based on slim images +- CI/CD pipelines without build tools +- Systems managed by users without admin rights +- Environments where SVG support isn't needed + +Making it optional allows the package to work everywhere while still supporting SVG for users who need it. + +### Q: What if I encounter other issues? + +**A:** Please file an issue on GitHub with: + +1. Your Python version +2. Your operating system +3. Whether you installed with `[svg]` extra +4. The full error message +5. Output of `pip list` showing installed packages + +## Testing Your Migration + +After migrating, test your setup: + +```python +from llama_index.readers.confluence import ConfluenceReader +import logging + +# Enable logging to see SVG warnings +logging.basicConfig(level=logging.INFO) + +reader = ConfluenceReader( + base_url="https://yoursite.atlassian.com/wiki", + api_token="your_token", +) + +# Try loading data +documents = reader.load_data(space_key="MYSPACE", include_attachments=True) + +# Check logs for any SVG-related warnings +print(f"Loaded {len(documents)} documents") +``` + +If you see "SVG processing skipped" warnings but didn't expect them, you may need to install the `[svg]` extra. diff --git a/llama-index-integrations/readers/llama-index-readers-confluence/README.md b/llama-index-integrations/readers/llama-index-readers-confluence/README.md index 119a9c6466..9bf005f061 100644 --- a/llama-index-integrations/readers/llama-index-readers-confluence/README.md +++ b/llama-index-integrations/readers/llama-index-readers-confluence/README.md @@ -51,6 +51,23 @@ include attachments, this is set to `False` by default, if set to `True` all att ConfluenceReader will extract the text from the attachments and add it to the Document object. Currently supported attachment types are: PDF, PNG, JPEG/JPG, SVG, Word and Excel. +### Optional Dependencies + +**SVG Support**: SVG processing requires additional dependencies that can cause installation issues on some systems. +To enable SVG attachment processing, install with the `svg` extra: + +```bash +pip install llama-index-readers-confluence[svg] +``` + +If SVG dependencies are not installed, SVG attachments will be skipped with a warning in the logs, but all other +functionality will work normally. This allows the package to be installed on systems where the SVG dependencies +(svglib and its transitive dependency pycairo) cannot be built. + +**Migration Note for Existing Users**: If you were previously using SVG processing and want to continue doing so, +you need to install the svg extra as shown above. Alternatively, you can provide a custom SVG parser using the +`custom_parsers` parameter (see Advanced Configuration section and `examples/svg_parsing_examples.py` for details). + ## Advanced Configuration The ConfluenceReader supports several advanced configuration options for customizing the reading behavior: @@ -98,7 +115,8 @@ confluence_parsers = { # ConfluenceFileType.CSV: CSVParser(), # ConfluenceFileType.SPREADSHEET: ExcelParser(), # ConfluenceFileType.MARKDOWN: MarkdownParser(), - # ConfluenceFileType.TEXT: TextParser() + # ConfluenceFileType.TEXT: TextParser(), + # ConfluenceFileType.SVG: CustomSVGParser(), # Custom SVG parser to avoid pycairo issues } reader = ConfluenceReader( @@ -108,6 +126,10 @@ reader = ConfluenceReader( ) ``` +For SVG parsing examples including alternatives to the built-in parser, see `examples/svg_parsing_examples.py`. + +```` + **Processing Callbacks**: - `process_attachment_callback`: A callback function to control which attachments should be processed. The function receives the media type and file size as parameters and should return a tuple of `(should_process: bool, reason: str)`. @@ -425,3 +447,4 @@ print(f"Processing completed. Total documents: {len(documents)}") ``` This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/run-llama/llama_index/). +```` diff --git a/llama-index-integrations/readers/llama-index-readers-confluence/examples/svg_parsing_examples.py b/llama-index-integrations/readers/llama-index-readers-confluence/examples/svg_parsing_examples.py new file mode 100644 index 0000000000..da76fb5736 --- /dev/null +++ b/llama-index-integrations/readers/llama-index-readers-confluence/examples/svg_parsing_examples.py @@ -0,0 +1,228 @@ +""" +Example: Using Custom SVG Parser with Confluence Reader + +This example demonstrates how to use a custom parser for SVG files if you want +to handle SVG processing differently or avoid the pycairo dependency issues. + +Option 1: Skip SVG processing entirely (default behavior without svg extra) +Option 2: Use the built-in SVG processor (requires pip install llama-index-readers-confluence[svg]) +Option 3: Provide a custom SVG parser (example below) +""" + +from typing import List, Union +import pathlib +from llama_index.core.readers.base import BaseReader +from llama_index.core.schema import Document +from llama_index.readers.confluence import ConfluenceReader +from llama_index.readers.confluence.event import FileType + + +# Example 1: Simple custom SVG parser that extracts text content from SVG +class SimpleSVGParser(BaseReader): + """ + Simple SVG parser that extracts text elements from SVG files. + This avoids the pycairo dependency by using basic XML parsing. + """ + + def load_data( + self, file_path: Union[str, pathlib.Path], **kwargs + ) -> List[Document]: + """Load and parse an SVG file to extract text content.""" + try: + import xml.etree.ElementTree as ET + except ImportError: + raise ImportError("xml.etree.ElementTree is required") + + with open(file_path, "r", encoding="utf-8") as f: + content = f.read() + + try: + # Parse SVG XML + root = ET.fromstring(content) + # Extract all text elements (handles common SVG namespace) + ns = {"svg": "http://www.w3.org/2000/svg"} + texts = [] + + # Try with namespace + for text_elem in root.findall(".//svg:text", ns): + if text_elem.text: + texts.append(text_elem.text.strip()) + + # Try without namespace if nothing found + if not texts: + for text_elem in root.findall(".//text"): + if text_elem.text: + texts.append(text_elem.text.strip()) + + extracted_text = " ".join(texts) if texts else "[SVG Image - No text content]" + + return [ + Document( + text=extracted_text, + metadata={"file_path": str(file_path), "source_type": "svg"}, + ) + ] + except Exception as e: + return [ + Document( + text=f"[Error parsing SVG: {str(e)}]", + metadata={"file_path": str(file_path), "source_type": "svg"}, + ) + ] + + +# Example 2: Custom SVG parser using cairosvg (alternative to svglib) +class CairoSVGParser(BaseReader): + """ + Alternative SVG parser using cairosvg library. + Install with: pip install cairosvg pillow pytesseract + + Note: This still requires cairo system libraries but has different + installation characteristics than svglib+pycairo. + """ + + def load_data( + self, file_path: Union[str, pathlib.Path], **kwargs + ) -> List[Document]: + """Load and parse an SVG file by converting to PNG and extracting text.""" + try: + import cairosvg + import pytesseract + from PIL import Image + from io import BytesIO + except ImportError: + raise ImportError( + "cairosvg, pillow, and pytesseract are required. " + "Install with: pip install cairosvg pillow pytesseract" + ) + + try: + # Convert SVG to PNG + png_data = cairosvg.svg2png(url=str(file_path)) + + # Extract text using OCR + image = Image.open(BytesIO(png_data)) + text = pytesseract.image_to_string(image) + + return [ + Document( + text=text or "[SVG Image - No text extracted]", + metadata={"file_path": str(file_path), "source_type": "svg"}, + ) + ] + except Exception as e: + return [ + Document( + text=f"[Error parsing SVG: {str(e)}]", + metadata={"file_path": str(file_path), "source_type": "svg"}, + ) + ] + + +# Usage examples + +def example_without_svg_support(): + """ + Example 1: Use Confluence reader without SVG support. + SVG attachments will be skipped with a warning. + """ + reader = ConfluenceReader( + base_url="https://yoursite.atlassian.com/wiki", + api_token="your_token", + ) + + # SVG attachments will be skipped automatically + documents = reader.load_data( + space_key="MYSPACE", + include_attachments=True, + ) + return documents + + +def example_with_builtin_svg_support(): + """ + Example 2: Use built-in SVG support. + Requires: pip install llama-index-readers-confluence[svg] + """ + reader = ConfluenceReader( + base_url="https://yoursite.atlassian.com/wiki", + api_token="your_token", + ) + + # Built-in SVG processing will be used if dependencies are installed + documents = reader.load_data( + space_key="MYSPACE", + include_attachments=True, + ) + return documents + + +def example_with_custom_svg_parser(): + """ + Example 3: Use custom SVG parser to avoid pycairo dependency. + This gives you full control over SVG processing. + """ + # Use the simple text extraction parser + svg_parser = SimpleSVGParser() + + reader = ConfluenceReader( + base_url="https://yoursite.atlassian.com/wiki", + api_token="your_token", + custom_parsers={ + FileType.SVG: svg_parser, + }, + ) + + documents = reader.load_data( + space_key="MYSPACE", + include_attachments=True, + ) + return documents + + +def example_skip_svg_via_callback(): + """ + Example 4: Skip SVG attachments using a callback. + This is useful if you want to explicitly skip SVG files. + """ + def attachment_filter(media_type: str, file_size: int, title: str) -> tuple[bool, str]: + # Skip SVG files + if media_type == "image/svg+xml": + return False, "SVG files are not supported in this configuration" + return True, "" + + reader = ConfluenceReader( + base_url="https://yoursite.atlassian.com/wiki", + api_token="your_token", + process_attachment_callback=attachment_filter, + ) + + documents = reader.load_data( + space_key="MYSPACE", + include_attachments=True, + ) + return documents + + +if __name__ == "__main__": + print("SVG Processing Examples for Confluence Reader") + print("=" * 50) + print("\nOption 1: Without SVG support (default)") + print(" - No additional dependencies required") + print(" - SVG attachments are skipped with warnings") + print(" - Best for systems where pycairo cannot be installed") + + print("\nOption 2: With built-in SVG support") + print(" - Requires: pip install llama-index-readers-confluence[svg]") + print(" - Full OCR-based text extraction from SVG") + print(" - May have installation challenges on some systems") + + print("\nOption 3: With custom SVG parser") + print(" - No pycairo dependency") + print(" - Simple text element extraction") + print(" - Easy to customize for your needs") + + print("\nOption 4: Skip SVG via callback") + print(" - Explicitly filter out SVG files") + print(" - Clean logs without warnings") + print(" - Useful when SVG content is not needed") diff --git a/llama-index-integrations/readers/llama-index-readers-confluence/llama_index/readers/confluence/base.py b/llama-index-integrations/readers/llama-index-readers-confluence/llama_index/readers/confluence/base.py index 74c7c64983..d3268b426f 100644 --- a/llama-index-integrations/readers/llama-index-readers-confluence/llama_index/readers/confluence/base.py +++ b/llama-index-integrations/readers/llama-index-readers-confluence/llama_index/readers/confluence/base.py @@ -1159,15 +1159,23 @@ def process_csv(self, link): return text def process_svg(self, link): + """ + Process SVG attachments by converting them to images and extracting text. + + Note: This method requires optional SVG dependencies. Install them with: + pip install llama-index-readers-confluence[svg] + + If SVG dependencies are not available, a warning will be logged and an empty string returned. + + Alternatively, you can provide a custom SVG parser via the custom_parsers parameter. + """ try: import pytesseract # type: ignore from PIL import Image # type: ignore - from reportlab.graphics import renderPM # type: ignore - from svglib.svglib import svg2rlg # type: ignore except ImportError: raise ImportError( - "`pytesseract`, `Pillow`, or `svglib` package not found, please run" - " `pip install pytesseract Pillow svglib`" + "`pytesseract` or `Pillow` package not found, please run" + " `pip install pytesseract Pillow`" ) response = self.confluence.request(path=link, absolute=True) @@ -1180,14 +1188,37 @@ def process_svg(self, link): ): return text - drawing = svg2rlg(BytesIO(response.content)) + # Check for custom parser first + if FileType.SVG in self.custom_parsers and self.custom_parser_manager: + return self.custom_parser_manager.process_with_custom_parser( + FileType.SVG, response.content, "svg" + ) - img_data = BytesIO() - renderPM.drawToFile(drawing, img_data, fmt="PNG") - img_data.seek(0) - image = Image.open(img_data) + # Check for optional SVG parsing dependencies + try: + from reportlab.graphics import renderPM # type: ignore + from svglib.svglib import svg2rlg # type: ignore + except ImportError: + self.logger.warning( + "SVG processing skipped: Optional dependencies not installed. " + "To enable SVG processing, install with: " + "pip install 'llama-index-readers-confluence[svg]' " + "or provide a custom SVG parser via the custom_parsers parameter." + ) + return "" - return pytesseract.image_to_string(image) + try: + drawing = svg2rlg(BytesIO(response.content)) + + img_data = BytesIO() + renderPM.drawToFile(drawing, img_data, fmt="PNG") + img_data.seek(0) + image = Image.open(img_data) + + return pytesseract.image_to_string(image) + except Exception as e: + self.logger.error(f"Error processing SVG file at {link}: {e}") + return "" if __name__ == "__main__": diff --git a/llama-index-integrations/readers/llama-index-readers-confluence/llama_index/readers/confluence/event.py b/llama-index-integrations/readers/llama-index-readers-confluence/llama_index/readers/confluence/event.py index 725bcd6b90..b76a52cbce 100644 --- a/llama-index-integrations/readers/llama-index-readers-confluence/llama_index/readers/confluence/event.py +++ b/llama-index-integrations/readers/llama-index-readers-confluence/llama_index/readers/confluence/event.py @@ -13,6 +13,7 @@ class FileType(Enum): SPREADSHEET = "spreadsheet" PRESENTATION = "presentation" PDF = "pdf" + SVG = "svg" UNKNOWN = "unknown" diff --git a/llama-index-integrations/readers/llama-index-readers-confluence/pyproject.toml b/llama-index-integrations/readers/llama-index-readers-confluence/pyproject.toml index c30e9cc9ac..4d38cc57c0 100644 --- a/llama-index-integrations/readers/llama-index-readers-confluence/pyproject.toml +++ b/llama-index-integrations/readers/llama-index-readers-confluence/pyproject.toml @@ -41,11 +41,15 @@ dependencies = [ "pillow>=10.2.0,<11", "docx2txt>=0.8,<0.9", "xlrd>=2.0.1,<3", - "svglib>=1.5.1,<2", "retrying>=1.3.4,<2", "llama-index-core>=0.13.0,<0.15", ] +[project.optional-dependencies] +svg = [ + "svglib>=1.5.1,<1.6.0", +] + [tool.codespell] check-filenames = true check-hidden = true diff --git a/llama-index-integrations/readers/llama-index-readers-confluence/requirements.txt b/llama-index-integrations/readers/llama-index-readers-confluence/requirements.txt index 93177d7ee5..12d43ef7e2 100644 --- a/llama-index-integrations/readers/llama-index-readers-confluence/requirements.txt +++ b/llama-index-integrations/readers/llama-index-readers-confluence/requirements.txt @@ -5,5 +5,6 @@ pdf2image Pillow docx2txt xlrd -svglib retrying +# Optional: SVG support (may require system dependencies for pycairo) +# svglib>=1.5.1,<1.6.0 diff --git a/llama-index-integrations/readers/llama-index-readers-confluence/tests/test_svg_optional.py b/llama-index-integrations/readers/llama-index-readers-confluence/tests/test_svg_optional.py new file mode 100644 index 0000000000..000a086ba4 --- /dev/null +++ b/llama-index-integrations/readers/llama-index-readers-confluence/tests/test_svg_optional.py @@ -0,0 +1,128 @@ +"""Test SVG processing with optional dependencies.""" + +from unittest.mock import MagicMock, patch +import pytest + + +class TestSVGOptionalDependencies: + """Test that SVG processing is optional and gracefully handles missing dependencies.""" + + @patch("atlassian.Confluence") + def test_svg_processing_without_svglib(self, mock_confluence_class): + """Test that SVG processing returns empty string when svglib is not installed.""" + from llama_index.readers.confluence import ConfluenceReader + + # Mock the confluence client instance + mock_confluence_instance = MagicMock() + mock_confluence_class.return_value = mock_confluence_instance + + # Create reader + reader = ConfluenceReader( + base_url="https://test.atlassian.com/wiki", api_token="test_token" + ) + + # Mock the response + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.content = b"test" + mock_confluence_instance.request.return_value = mock_response + + # Hide svglib and reportlab imports to simulate missing dependencies + with patch.dict( + "sys.modules", + { + "svglib": None, + "svglib.svglib": None, + "reportlab": None, + "reportlab.graphics": None, + }, + ): + # Should return empty string and log warning instead of raising error + result = reader.process_svg("test_link") + assert result == "" + + @patch("atlassian.Confluence") + def test_svg_processing_with_svglib_available(self, mock_confluence_class): + """Test that SVG processing works when svglib is available.""" + # Skip this test if svglib is not actually installed + try: + import svglib # noqa: F401 + from reportlab.graphics import renderPM # noqa: F401 + except ImportError: + pytest.skip("SVG dependencies not installed") + + from llama_index.readers.confluence import ConfluenceReader + + # Mock the confluence client instance + mock_confluence_instance = MagicMock() + mock_confluence_class.return_value = mock_confluence_instance + + reader = ConfluenceReader( + base_url="https://test.atlassian.com/wiki", api_token="test_token" + ) + + # Create a minimal valid SVG + svg_content = b""" + + + Test +""" + + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.content = svg_content + mock_confluence_instance.request.return_value = mock_response + + # Should process without error (actual text extraction may vary) + result = reader.process_svg("test_link") + # Result should be a string (may be empty if tesseract can't extract text) + assert isinstance(result, str) + + @patch("atlassian.Confluence") + def test_svg_processing_with_empty_response(self, mock_confluence_class): + """Test that SVG processing handles empty responses gracefully.""" + from llama_index.readers.confluence import ConfluenceReader + + # Mock the confluence client instance + mock_confluence_instance = MagicMock() + mock_confluence_class.return_value = mock_confluence_instance + + reader = ConfluenceReader( + base_url="https://test.atlassian.com/wiki", api_token="test_token" + ) + + # Test with empty content + mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.content = b"" + mock_confluence_instance.request.return_value = mock_response + + result = reader.process_svg("test_link") + assert result == "" + + # Test with None content + mock_response.content = None + result = reader.process_svg("test_link") + assert result == "" + + # Test with non-200 status + mock_response.status_code = 404 + mock_response.content = b"test" + result = reader.process_svg("test_link") + assert result == "" + + @patch("atlassian.Confluence") + def test_reader_initialization_without_svglib(self, mock_confluence_class): + """Test that ConfluenceReader can be initialized without svglib installed.""" + from llama_index.readers.confluence import ConfluenceReader + + # Mock the confluence client instance + mock_confluence_instance = MagicMock() + mock_confluence_class.return_value = mock_confluence_instance + + # Should not raise an error during initialization + reader = ConfluenceReader( + base_url="https://test.atlassian.com/wiki", api_token="test_token" + ) + assert reader is not None + assert reader.base_url == "https://test.atlassian.com/wiki" From 2e2051cdd6c728f1d405d07347f52f1745a5b236 Mon Sep 17 00:00:00 2001 From: Manoj B Bhamsagar Date: Sun, 26 Oct 2025 17:17:41 +0530 Subject: [PATCH 2/4] Revert "feat(confluence): make SVG processing optional to fix pycairo installation issues" This reverts commit ace7418e67d77725228d433adcde0e097d3ecc78. --- .../CHANGELOG.md | 13 - .../MIGRATION_GUIDE.md | 207 ---------------- .../llama-index-readers-confluence/README.md | 25 +- .../examples/svg_parsing_examples.py | 228 ------------------ .../llama_index/readers/confluence/base.py | 51 +--- .../llama_index/readers/confluence/event.py | 1 - .../pyproject.toml | 6 +- .../requirements.txt | 3 +- .../tests/test_svg_optional.py | 128 ---------- 9 files changed, 13 insertions(+), 649 deletions(-) delete mode 100644 llama-index-integrations/readers/llama-index-readers-confluence/MIGRATION_GUIDE.md delete mode 100644 llama-index-integrations/readers/llama-index-readers-confluence/examples/svg_parsing_examples.py delete mode 100644 llama-index-integrations/readers/llama-index-readers-confluence/tests/test_svg_optional.py diff --git a/llama-index-integrations/readers/llama-index-readers-confluence/CHANGELOG.md b/llama-index-integrations/readers/llama-index-readers-confluence/CHANGELOG.md index 76d8b2bfaf..1bc7500fb8 100644 --- a/llama-index-integrations/readers/llama-index-readers-confluence/CHANGELOG.md +++ b/llama-index-integrations/readers/llama-index-readers-confluence/CHANGELOG.md @@ -1,18 +1,5 @@ # CHANGELOG -## [Unreleased] - -### Changed - -- **BREAKING**: Made SVG processing optional to avoid installation issues with pycairo dependency -- SVG support (`svglib`) moved to optional dependencies. Install with `pip install llama-index-readers-confluence[svg]` -- SVG attachments will be skipped with a warning if optional dependencies are not installed -- Pinned svglib to <1.6.0 to avoid breaking changes in newer versions - -### Fixed - -- Fixed installation failures on Debian/Ubuntu systems due to pycairo compilation issues - ## [0.1.8] - 2024-08-20 - Added observability events for ConfluenceReader diff --git a/llama-index-integrations/readers/llama-index-readers-confluence/MIGRATION_GUIDE.md b/llama-index-integrations/readers/llama-index-readers-confluence/MIGRATION_GUIDE.md deleted file mode 100644 index 598f8e43c2..0000000000 --- a/llama-index-integrations/readers/llama-index-readers-confluence/MIGRATION_GUIDE.md +++ /dev/null @@ -1,207 +0,0 @@ -# Migration Guide: SVG Support Changes - -## Overview - -Starting from version 0.4.5, SVG processing support has been moved to an optional dependency to address installation issues on systems where the `pycairo` package cannot be compiled (particularly Debian/Ubuntu systems without C compilers or Cairo development libraries). - -## What Changed? - -### Before (versions < 0.4.5) - -- `svglib` was a required dependency -- All users had to install `pycairo` even if they didn't need SVG support -- Installation could fail on systems without proper build tools - -### After (versions >= 0.4.5) - -- `svglib` is now an optional dependency -- SVG processing is skipped by default with a warning if optional dependencies are not installed -- Base installation works on all systems without requiring C compilers -- SVG version pinned to `<1.6.0` to avoid breaking changes - -## Migration Paths - -### Option 1: Continue Using Built-in SVG Support (Recommended if SVG is needed) - -If you need SVG processing and can install the required system dependencies: - -```bash -# Uninstall current version -pip uninstall llama-index-readers-confluence - -# Install with SVG support -pip install 'llama-index-readers-confluence[svg]' -``` - -**System Requirements for SVG Support:** - -- On Debian/Ubuntu: `sudo apt-get install gcc python3-dev libcairo2-dev` -- On macOS: `brew install cairo` -- On Windows: Install Visual C++ Build Tools - -### Option 2: Skip SVG Processing (Recommended for Docker/CI environments) - -If you don't need SVG processing or want to avoid installation issues: - -```bash -# Install without SVG support (default) -pip install llama-index-readers-confluence -``` - -SVG attachments will be skipped with a warning in the logs. All other functionality remains unchanged. - -### Option 3: Use Custom SVG Parser - -If you need SVG processing but cannot install pycairo, use a custom parser: - -```python -from llama_index.readers.confluence import ConfluenceReader -from llama_index.readers.confluence.event import FileType - - -# Simple text extraction from SVG (no OCR) -class SimpleSVGParser(BaseReader): - def load_data(self, file_path, **kwargs): - import xml.etree.ElementTree as ET - - with open(file_path, "r") as f: - root = ET.fromstring(f.read()) - - # Extract text elements from SVG - texts = [elem.text for elem in root.findall(".//text") if elem.text] - extracted_text = " ".join(texts) or "[SVG Image]" - - return [ - Document(text=extracted_text, metadata={"file_path": file_path}) - ] - - -reader = ConfluenceReader( - base_url="https://yoursite.atlassian.com/wiki", - api_token="your_token", - custom_parsers={FileType.SVG: SimpleSVGParser()}, -) -``` - -See `examples/svg_parsing_examples.py` for more custom parser examples. - -### Option 4: Filter Out SVG Attachments - -If you want to explicitly skip SVG files without warnings: - -```python -def attachment_filter( - media_type: str, file_size: int, title: str -) -> tuple[bool, str]: - if media_type == "image/svg+xml": - return False, "SVG processing disabled" - return True, "" - - -reader = ConfluenceReader( - base_url="https://yoursite.atlassian.com/wiki", - api_token="your_token", - process_attachment_callback=attachment_filter, -) -``` - -## Docker/Container Deployments - -### Before (versions < 0.4.5) - -```dockerfile -FROM python:3.11-slim - -# Required system dependencies for pycairo -RUN apt-get update && apt-get install -y \ - gcc \ - python3-dev \ - libcairo2-dev \ - && rm -rf /var/lib/apt/lists/* - -RUN pip install llama-index-readers-confluence -``` - -### After (versions >= 0.4.5) - Without SVG Support - -```dockerfile -FROM python:3.11-slim - -# No system dependencies needed! -RUN pip install llama-index-readers-confluence -``` - -### After (versions >= 0.4.5) - With SVG Support - -```dockerfile -FROM python:3.11-slim - -# Only if you need SVG support -RUN apt-get update && apt-get install -y \ - gcc \ - python3-dev \ - libcairo2-dev \ - && rm -rf /var/lib/apt/lists/* - -RUN pip install 'llama-index-readers-confluence[svg]' -``` - -## FAQ - -### Q: Will my existing code break? - -**A:** No, your existing code will continue to work. If you were using SVG processing and don't install the `[svg]` extra, SVG attachments will simply be skipped with a warning instead of failing. - -### Q: How do I know if SVG dependencies are installed? - -**A:** Check the logs. If you see warnings like "SVG processing skipped: Optional dependencies not installed", then SVG dependencies are not available. - -### Q: Can I use a different OCR engine for SVG? - -**A:** Yes! Use the custom parser approach (Option 3) and implement your own SVG-to-text conversion logic. You could use libraries like `cairosvg`, `pdf2image`, or pure XML parsing depending on your needs. - -### Q: Why was this change made? - -**A:** The `pycairo` dependency (required by `svglib`) requires C compilation and system libraries (Cairo). This caused installation failures in: - -- Docker containers based on slim images -- CI/CD pipelines without build tools -- Systems managed by users without admin rights -- Environments where SVG support isn't needed - -Making it optional allows the package to work everywhere while still supporting SVG for users who need it. - -### Q: What if I encounter other issues? - -**A:** Please file an issue on GitHub with: - -1. Your Python version -2. Your operating system -3. Whether you installed with `[svg]` extra -4. The full error message -5. Output of `pip list` showing installed packages - -## Testing Your Migration - -After migrating, test your setup: - -```python -from llama_index.readers.confluence import ConfluenceReader -import logging - -# Enable logging to see SVG warnings -logging.basicConfig(level=logging.INFO) - -reader = ConfluenceReader( - base_url="https://yoursite.atlassian.com/wiki", - api_token="your_token", -) - -# Try loading data -documents = reader.load_data(space_key="MYSPACE", include_attachments=True) - -# Check logs for any SVG-related warnings -print(f"Loaded {len(documents)} documents") -``` - -If you see "SVG processing skipped" warnings but didn't expect them, you may need to install the `[svg]` extra. diff --git a/llama-index-integrations/readers/llama-index-readers-confluence/README.md b/llama-index-integrations/readers/llama-index-readers-confluence/README.md index 9bf005f061..119a9c6466 100644 --- a/llama-index-integrations/readers/llama-index-readers-confluence/README.md +++ b/llama-index-integrations/readers/llama-index-readers-confluence/README.md @@ -51,23 +51,6 @@ include attachments, this is set to `False` by default, if set to `True` all att ConfluenceReader will extract the text from the attachments and add it to the Document object. Currently supported attachment types are: PDF, PNG, JPEG/JPG, SVG, Word and Excel. -### Optional Dependencies - -**SVG Support**: SVG processing requires additional dependencies that can cause installation issues on some systems. -To enable SVG attachment processing, install with the `svg` extra: - -```bash -pip install llama-index-readers-confluence[svg] -``` - -If SVG dependencies are not installed, SVG attachments will be skipped with a warning in the logs, but all other -functionality will work normally. This allows the package to be installed on systems where the SVG dependencies -(svglib and its transitive dependency pycairo) cannot be built. - -**Migration Note for Existing Users**: If you were previously using SVG processing and want to continue doing so, -you need to install the svg extra as shown above. Alternatively, you can provide a custom SVG parser using the -`custom_parsers` parameter (see Advanced Configuration section and `examples/svg_parsing_examples.py` for details). - ## Advanced Configuration The ConfluenceReader supports several advanced configuration options for customizing the reading behavior: @@ -115,8 +98,7 @@ confluence_parsers = { # ConfluenceFileType.CSV: CSVParser(), # ConfluenceFileType.SPREADSHEET: ExcelParser(), # ConfluenceFileType.MARKDOWN: MarkdownParser(), - # ConfluenceFileType.TEXT: TextParser(), - # ConfluenceFileType.SVG: CustomSVGParser(), # Custom SVG parser to avoid pycairo issues + # ConfluenceFileType.TEXT: TextParser() } reader = ConfluenceReader( @@ -126,10 +108,6 @@ reader = ConfluenceReader( ) ``` -For SVG parsing examples including alternatives to the built-in parser, see `examples/svg_parsing_examples.py`. - -```` - **Processing Callbacks**: - `process_attachment_callback`: A callback function to control which attachments should be processed. The function receives the media type and file size as parameters and should return a tuple of `(should_process: bool, reason: str)`. @@ -447,4 +425,3 @@ print(f"Processing completed. Total documents: {len(documents)}") ``` This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/run-llama/llama_index/). -```` diff --git a/llama-index-integrations/readers/llama-index-readers-confluence/examples/svg_parsing_examples.py b/llama-index-integrations/readers/llama-index-readers-confluence/examples/svg_parsing_examples.py deleted file mode 100644 index da76fb5736..0000000000 --- a/llama-index-integrations/readers/llama-index-readers-confluence/examples/svg_parsing_examples.py +++ /dev/null @@ -1,228 +0,0 @@ -""" -Example: Using Custom SVG Parser with Confluence Reader - -This example demonstrates how to use a custom parser for SVG files if you want -to handle SVG processing differently or avoid the pycairo dependency issues. - -Option 1: Skip SVG processing entirely (default behavior without svg extra) -Option 2: Use the built-in SVG processor (requires pip install llama-index-readers-confluence[svg]) -Option 3: Provide a custom SVG parser (example below) -""" - -from typing import List, Union -import pathlib -from llama_index.core.readers.base import BaseReader -from llama_index.core.schema import Document -from llama_index.readers.confluence import ConfluenceReader -from llama_index.readers.confluence.event import FileType - - -# Example 1: Simple custom SVG parser that extracts text content from SVG -class SimpleSVGParser(BaseReader): - """ - Simple SVG parser that extracts text elements from SVG files. - This avoids the pycairo dependency by using basic XML parsing. - """ - - def load_data( - self, file_path: Union[str, pathlib.Path], **kwargs - ) -> List[Document]: - """Load and parse an SVG file to extract text content.""" - try: - import xml.etree.ElementTree as ET - except ImportError: - raise ImportError("xml.etree.ElementTree is required") - - with open(file_path, "r", encoding="utf-8") as f: - content = f.read() - - try: - # Parse SVG XML - root = ET.fromstring(content) - # Extract all text elements (handles common SVG namespace) - ns = {"svg": "http://www.w3.org/2000/svg"} - texts = [] - - # Try with namespace - for text_elem in root.findall(".//svg:text", ns): - if text_elem.text: - texts.append(text_elem.text.strip()) - - # Try without namespace if nothing found - if not texts: - for text_elem in root.findall(".//text"): - if text_elem.text: - texts.append(text_elem.text.strip()) - - extracted_text = " ".join(texts) if texts else "[SVG Image - No text content]" - - return [ - Document( - text=extracted_text, - metadata={"file_path": str(file_path), "source_type": "svg"}, - ) - ] - except Exception as e: - return [ - Document( - text=f"[Error parsing SVG: {str(e)}]", - metadata={"file_path": str(file_path), "source_type": "svg"}, - ) - ] - - -# Example 2: Custom SVG parser using cairosvg (alternative to svglib) -class CairoSVGParser(BaseReader): - """ - Alternative SVG parser using cairosvg library. - Install with: pip install cairosvg pillow pytesseract - - Note: This still requires cairo system libraries but has different - installation characteristics than svglib+pycairo. - """ - - def load_data( - self, file_path: Union[str, pathlib.Path], **kwargs - ) -> List[Document]: - """Load and parse an SVG file by converting to PNG and extracting text.""" - try: - import cairosvg - import pytesseract - from PIL import Image - from io import BytesIO - except ImportError: - raise ImportError( - "cairosvg, pillow, and pytesseract are required. " - "Install with: pip install cairosvg pillow pytesseract" - ) - - try: - # Convert SVG to PNG - png_data = cairosvg.svg2png(url=str(file_path)) - - # Extract text using OCR - image = Image.open(BytesIO(png_data)) - text = pytesseract.image_to_string(image) - - return [ - Document( - text=text or "[SVG Image - No text extracted]", - metadata={"file_path": str(file_path), "source_type": "svg"}, - ) - ] - except Exception as e: - return [ - Document( - text=f"[Error parsing SVG: {str(e)}]", - metadata={"file_path": str(file_path), "source_type": "svg"}, - ) - ] - - -# Usage examples - -def example_without_svg_support(): - """ - Example 1: Use Confluence reader without SVG support. - SVG attachments will be skipped with a warning. - """ - reader = ConfluenceReader( - base_url="https://yoursite.atlassian.com/wiki", - api_token="your_token", - ) - - # SVG attachments will be skipped automatically - documents = reader.load_data( - space_key="MYSPACE", - include_attachments=True, - ) - return documents - - -def example_with_builtin_svg_support(): - """ - Example 2: Use built-in SVG support. - Requires: pip install llama-index-readers-confluence[svg] - """ - reader = ConfluenceReader( - base_url="https://yoursite.atlassian.com/wiki", - api_token="your_token", - ) - - # Built-in SVG processing will be used if dependencies are installed - documents = reader.load_data( - space_key="MYSPACE", - include_attachments=True, - ) - return documents - - -def example_with_custom_svg_parser(): - """ - Example 3: Use custom SVG parser to avoid pycairo dependency. - This gives you full control over SVG processing. - """ - # Use the simple text extraction parser - svg_parser = SimpleSVGParser() - - reader = ConfluenceReader( - base_url="https://yoursite.atlassian.com/wiki", - api_token="your_token", - custom_parsers={ - FileType.SVG: svg_parser, - }, - ) - - documents = reader.load_data( - space_key="MYSPACE", - include_attachments=True, - ) - return documents - - -def example_skip_svg_via_callback(): - """ - Example 4: Skip SVG attachments using a callback. - This is useful if you want to explicitly skip SVG files. - """ - def attachment_filter(media_type: str, file_size: int, title: str) -> tuple[bool, str]: - # Skip SVG files - if media_type == "image/svg+xml": - return False, "SVG files are not supported in this configuration" - return True, "" - - reader = ConfluenceReader( - base_url="https://yoursite.atlassian.com/wiki", - api_token="your_token", - process_attachment_callback=attachment_filter, - ) - - documents = reader.load_data( - space_key="MYSPACE", - include_attachments=True, - ) - return documents - - -if __name__ == "__main__": - print("SVG Processing Examples for Confluence Reader") - print("=" * 50) - print("\nOption 1: Without SVG support (default)") - print(" - No additional dependencies required") - print(" - SVG attachments are skipped with warnings") - print(" - Best for systems where pycairo cannot be installed") - - print("\nOption 2: With built-in SVG support") - print(" - Requires: pip install llama-index-readers-confluence[svg]") - print(" - Full OCR-based text extraction from SVG") - print(" - May have installation challenges on some systems") - - print("\nOption 3: With custom SVG parser") - print(" - No pycairo dependency") - print(" - Simple text element extraction") - print(" - Easy to customize for your needs") - - print("\nOption 4: Skip SVG via callback") - print(" - Explicitly filter out SVG files") - print(" - Clean logs without warnings") - print(" - Useful when SVG content is not needed") diff --git a/llama-index-integrations/readers/llama-index-readers-confluence/llama_index/readers/confluence/base.py b/llama-index-integrations/readers/llama-index-readers-confluence/llama_index/readers/confluence/base.py index d3268b426f..74c7c64983 100644 --- a/llama-index-integrations/readers/llama-index-readers-confluence/llama_index/readers/confluence/base.py +++ b/llama-index-integrations/readers/llama-index-readers-confluence/llama_index/readers/confluence/base.py @@ -1159,23 +1159,15 @@ def process_csv(self, link): return text def process_svg(self, link): - """ - Process SVG attachments by converting them to images and extracting text. - - Note: This method requires optional SVG dependencies. Install them with: - pip install llama-index-readers-confluence[svg] - - If SVG dependencies are not available, a warning will be logged and an empty string returned. - - Alternatively, you can provide a custom SVG parser via the custom_parsers parameter. - """ try: import pytesseract # type: ignore from PIL import Image # type: ignore + from reportlab.graphics import renderPM # type: ignore + from svglib.svglib import svg2rlg # type: ignore except ImportError: raise ImportError( - "`pytesseract` or `Pillow` package not found, please run" - " `pip install pytesseract Pillow`" + "`pytesseract`, `Pillow`, or `svglib` package not found, please run" + " `pip install pytesseract Pillow svglib`" ) response = self.confluence.request(path=link, absolute=True) @@ -1188,37 +1180,14 @@ def process_svg(self, link): ): return text - # Check for custom parser first - if FileType.SVG in self.custom_parsers and self.custom_parser_manager: - return self.custom_parser_manager.process_with_custom_parser( - FileType.SVG, response.content, "svg" - ) + drawing = svg2rlg(BytesIO(response.content)) - # Check for optional SVG parsing dependencies - try: - from reportlab.graphics import renderPM # type: ignore - from svglib.svglib import svg2rlg # type: ignore - except ImportError: - self.logger.warning( - "SVG processing skipped: Optional dependencies not installed. " - "To enable SVG processing, install with: " - "pip install 'llama-index-readers-confluence[svg]' " - "or provide a custom SVG parser via the custom_parsers parameter." - ) - return "" + img_data = BytesIO() + renderPM.drawToFile(drawing, img_data, fmt="PNG") + img_data.seek(0) + image = Image.open(img_data) - try: - drawing = svg2rlg(BytesIO(response.content)) - - img_data = BytesIO() - renderPM.drawToFile(drawing, img_data, fmt="PNG") - img_data.seek(0) - image = Image.open(img_data) - - return pytesseract.image_to_string(image) - except Exception as e: - self.logger.error(f"Error processing SVG file at {link}: {e}") - return "" + return pytesseract.image_to_string(image) if __name__ == "__main__": diff --git a/llama-index-integrations/readers/llama-index-readers-confluence/llama_index/readers/confluence/event.py b/llama-index-integrations/readers/llama-index-readers-confluence/llama_index/readers/confluence/event.py index b76a52cbce..725bcd6b90 100644 --- a/llama-index-integrations/readers/llama-index-readers-confluence/llama_index/readers/confluence/event.py +++ b/llama-index-integrations/readers/llama-index-readers-confluence/llama_index/readers/confluence/event.py @@ -13,7 +13,6 @@ class FileType(Enum): SPREADSHEET = "spreadsheet" PRESENTATION = "presentation" PDF = "pdf" - SVG = "svg" UNKNOWN = "unknown" diff --git a/llama-index-integrations/readers/llama-index-readers-confluence/pyproject.toml b/llama-index-integrations/readers/llama-index-readers-confluence/pyproject.toml index 4d38cc57c0..c30e9cc9ac 100644 --- a/llama-index-integrations/readers/llama-index-readers-confluence/pyproject.toml +++ b/llama-index-integrations/readers/llama-index-readers-confluence/pyproject.toml @@ -41,15 +41,11 @@ dependencies = [ "pillow>=10.2.0,<11", "docx2txt>=0.8,<0.9", "xlrd>=2.0.1,<3", + "svglib>=1.5.1,<2", "retrying>=1.3.4,<2", "llama-index-core>=0.13.0,<0.15", ] -[project.optional-dependencies] -svg = [ - "svglib>=1.5.1,<1.6.0", -] - [tool.codespell] check-filenames = true check-hidden = true diff --git a/llama-index-integrations/readers/llama-index-readers-confluence/requirements.txt b/llama-index-integrations/readers/llama-index-readers-confluence/requirements.txt index 12d43ef7e2..93177d7ee5 100644 --- a/llama-index-integrations/readers/llama-index-readers-confluence/requirements.txt +++ b/llama-index-integrations/readers/llama-index-readers-confluence/requirements.txt @@ -5,6 +5,5 @@ pdf2image Pillow docx2txt xlrd +svglib retrying -# Optional: SVG support (may require system dependencies for pycairo) -# svglib>=1.5.1,<1.6.0 diff --git a/llama-index-integrations/readers/llama-index-readers-confluence/tests/test_svg_optional.py b/llama-index-integrations/readers/llama-index-readers-confluence/tests/test_svg_optional.py deleted file mode 100644 index 000a086ba4..0000000000 --- a/llama-index-integrations/readers/llama-index-readers-confluence/tests/test_svg_optional.py +++ /dev/null @@ -1,128 +0,0 @@ -"""Test SVG processing with optional dependencies.""" - -from unittest.mock import MagicMock, patch -import pytest - - -class TestSVGOptionalDependencies: - """Test that SVG processing is optional and gracefully handles missing dependencies.""" - - @patch("atlassian.Confluence") - def test_svg_processing_without_svglib(self, mock_confluence_class): - """Test that SVG processing returns empty string when svglib is not installed.""" - from llama_index.readers.confluence import ConfluenceReader - - # Mock the confluence client instance - mock_confluence_instance = MagicMock() - mock_confluence_class.return_value = mock_confluence_instance - - # Create reader - reader = ConfluenceReader( - base_url="https://test.atlassian.com/wiki", api_token="test_token" - ) - - # Mock the response - mock_response = MagicMock() - mock_response.status_code = 200 - mock_response.content = b"test" - mock_confluence_instance.request.return_value = mock_response - - # Hide svglib and reportlab imports to simulate missing dependencies - with patch.dict( - "sys.modules", - { - "svglib": None, - "svglib.svglib": None, - "reportlab": None, - "reportlab.graphics": None, - }, - ): - # Should return empty string and log warning instead of raising error - result = reader.process_svg("test_link") - assert result == "" - - @patch("atlassian.Confluence") - def test_svg_processing_with_svglib_available(self, mock_confluence_class): - """Test that SVG processing works when svglib is available.""" - # Skip this test if svglib is not actually installed - try: - import svglib # noqa: F401 - from reportlab.graphics import renderPM # noqa: F401 - except ImportError: - pytest.skip("SVG dependencies not installed") - - from llama_index.readers.confluence import ConfluenceReader - - # Mock the confluence client instance - mock_confluence_instance = MagicMock() - mock_confluence_class.return_value = mock_confluence_instance - - reader = ConfluenceReader( - base_url="https://test.atlassian.com/wiki", api_token="test_token" - ) - - # Create a minimal valid SVG - svg_content = b""" - - - Test -""" - - mock_response = MagicMock() - mock_response.status_code = 200 - mock_response.content = svg_content - mock_confluence_instance.request.return_value = mock_response - - # Should process without error (actual text extraction may vary) - result = reader.process_svg("test_link") - # Result should be a string (may be empty if tesseract can't extract text) - assert isinstance(result, str) - - @patch("atlassian.Confluence") - def test_svg_processing_with_empty_response(self, mock_confluence_class): - """Test that SVG processing handles empty responses gracefully.""" - from llama_index.readers.confluence import ConfluenceReader - - # Mock the confluence client instance - mock_confluence_instance = MagicMock() - mock_confluence_class.return_value = mock_confluence_instance - - reader = ConfluenceReader( - base_url="https://test.atlassian.com/wiki", api_token="test_token" - ) - - # Test with empty content - mock_response = MagicMock() - mock_response.status_code = 200 - mock_response.content = b"" - mock_confluence_instance.request.return_value = mock_response - - result = reader.process_svg("test_link") - assert result == "" - - # Test with None content - mock_response.content = None - result = reader.process_svg("test_link") - assert result == "" - - # Test with non-200 status - mock_response.status_code = 404 - mock_response.content = b"test" - result = reader.process_svg("test_link") - assert result == "" - - @patch("atlassian.Confluence") - def test_reader_initialization_without_svglib(self, mock_confluence_class): - """Test that ConfluenceReader can be initialized without svglib installed.""" - from llama_index.readers.confluence import ConfluenceReader - - # Mock the confluence client instance - mock_confluence_instance = MagicMock() - mock_confluence_class.return_value = mock_confluence_instance - - # Should not raise an error during initialization - reader = ConfluenceReader( - base_url="https://test.atlassian.com/wiki", api_token="test_token" - ) - assert reader is not None - assert reader.base_url == "https://test.atlassian.com/wiki" From 9a735b909d5b3f62d2cf402b516d8e7b79c97376 Mon Sep 17 00:00:00 2001 From: Manoj B Bhamsagar Date: Sun, 26 Oct 2025 17:23:11 +0530 Subject: [PATCH 3/4] fix: pin svglib to >=1.5,<1.6 --- .../readers/llama-index-readers-confluence/pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama-index-integrations/readers/llama-index-readers-confluence/pyproject.toml b/llama-index-integrations/readers/llama-index-readers-confluence/pyproject.toml index c30e9cc9ac..c284ae4b7d 100644 --- a/llama-index-integrations/readers/llama-index-readers-confluence/pyproject.toml +++ b/llama-index-integrations/readers/llama-index-readers-confluence/pyproject.toml @@ -41,7 +41,7 @@ dependencies = [ "pillow>=10.2.0,<11", "docx2txt>=0.8,<0.9", "xlrd>=2.0.1,<3", - "svglib>=1.5.1,<2", + "svglib>=1.5,<1.6", "retrying>=1.3.4,<2", "llama-index-core>=0.13.0,<0.15", ] From 12e25d512626cc63d21a97b2d515a0422d07d0a7 Mon Sep 17 00:00:00 2001 From: Manoj B Bhamsagar Date: Mon, 27 Oct 2025 09:28:32 +0530 Subject: [PATCH 4/4] bump: version to 0.5.0 for confluence reader --- .../readers/llama-index-readers-confluence/pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama-index-integrations/readers/llama-index-readers-confluence/pyproject.toml b/llama-index-integrations/readers/llama-index-readers-confluence/pyproject.toml index c284ae4b7d..f6890da622 100644 --- a/llama-index-integrations/readers/llama-index-readers-confluence/pyproject.toml +++ b/llama-index-integrations/readers/llama-index-readers-confluence/pyproject.toml @@ -26,7 +26,7 @@ dev = [ [project] name = "llama-index-readers-confluence" -version = "0.4.4" +version = "0.5.0" description = "llama-index readers confluence integration" authors = [{name = "Your Name", email = "you@example.com"}] requires-python = ">=3.9,<4.0"