|
| 1 | +""" |
| 2 | +S3 key sanitization utilities for ensuring AWS S3 compatibility. |
| 3 | +
|
| 4 | +This module provides utilities for sanitizing file names to be compatible with |
| 5 | +AWS S3 object key naming guidelines while ensuring uniqueness when significant |
| 6 | +sanitization occurs. |
| 7 | +
|
| 8 | +Reference: https://docs.aws.amazon.com/AmazonS3/latest/userguide/object-keys.html |
| 9 | +""" |
| 10 | + |
| 11 | +import hashlib |
| 12 | +import re |
| 13 | +import urllib.parse |
| 14 | +from re import Match |
| 15 | + |
# Constants for S3 key generation
HASH_LENGTH = 64  # SHA256 hex digest length
HASH_SEPARATOR_LENGTH = 1  # Length of the underscore separator placed before the hash
# Total space consumed by the "_<sha256 hex>" suffix appended to long key names
HASH_WITH_SEPARATOR_LENGTH = HASH_LENGTH + HASH_SEPARATOR_LENGTH
| 20 | + |
| 21 | + |
def _encode_special_char(match: Match[str]) -> str:
    """Percent-encode the single matched character (no characters are safe)."""
    character = match.group()
    return urllib.parse.quote(character, safe="")
| 25 | + |
| 26 | + |
def _append_uniqueness_hash(sanitized: str, original_name: str) -> str:
    """Append a short SHA-256 hash of *original_name* to *sanitized*.

    Keeps a short (<= 10 chars) file extension, if present, at the end so the
    result still looks like "name_<hash8>.ext".
    """
    name_hash = hashlib.sha256(original_name.encode("utf-8")).hexdigest()[:8]
    # Preserve the extension only when it is short enough to plausibly be one.
    if "." in sanitized and len(sanitized.split(".")[-1]) <= 10:
        stem, extension = sanitized.rsplit(".", 1)
        return f"{stem}_{name_hash}.{extension}"
    return f"{sanitized}_{name_hash}"


def sanitize_s3_key_name(file_name: str) -> str:
    """
    Sanitize file name to be S3-compatible according to AWS guidelines.

    This method:
    1. Replaces characters AWS recommends avoiding with underscores
    2. URL-encodes characters that might require special handling
    3. URL-encodes non-ASCII characters
    4. Strips leading/trailing periods (relative-path / download issues)
    5. Adds a short hash of the original name when significant sanitization
       occurred, to keep heavily rewritten names unique

    Args:
        file_name: The original file name to sanitize

    Returns:
        A sanitized file name that is S3-compatible

    Reference: https://docs.aws.amazon.com/AmazonS3/latest/userguide/object-keys.html
    """
    if not file_name:
        return "unnamed_file"

    # Compiled once per call; each pattern is used both to rewrite the name
    # and (below) to count occurrences in the original name.
    # Characters AWS recommends avoiding completely -> replaced by "_".
    avoid_re = re.compile(r'[\\{}^%`\[\]"<>#|~]')
    # Characters that are allowed but might require special handling -> URL-encoded.
    special_re = re.compile(r"[&$@=;:+,?\s]")

    sanitized = avoid_re.sub("_", file_name)
    sanitized = special_re.sub(
        lambda m: urllib.parse.quote(m.group(0), safe=""), sanitized
    )

    # Non-ASCII (Unicode) characters are URL-encoded while preserving the
    # characters S3 treats as safe.
    needs_unicode_encoding = not sanitized.isascii()
    if needs_unicode_encoding:
        sanitized = urllib.parse.quote(
            sanitized,
            safe="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_.()!*",
        )

    # Leading periods look like relative paths; trailing periods can break
    # downloads on some clients.
    sanitized = sanitized.strip(".")

    # If sanitization removed everything, fall back to a default name.
    if not sanitized:
        sanitized = "sanitized_file"

    # Heuristics for "significant" sanitization: many avoided characters,
    # Unicode encoding, large expansion from encoding, or many special chars.
    significant_changes = (
        len(avoid_re.findall(file_name)) > 3
        or needs_unicode_encoding
        or len(sanitized) > len(file_name) * 2
        or len(special_re.findall(file_name)) > 5
    )

    if significant_changes:
        # Hash the ORIGINAL name so the suffix is stable for a given input.
        sanitized = _append_uniqueness_hash(sanitized, file_name)

    return sanitized
| 115 | + |
| 116 | + |
def generate_s3_key(
    file_name: str, prefix: str, tenant_id: str, max_key_length: int = 1024
) -> str:
    """
    Generate a complete S3 key ("{prefix}/{tenant_id}/{file_name}") for a file.

    The file name is sanitized first. If the resulting key would exceed
    ``max_key_length``, the file-name component is replaced by a (possibly
    truncated) readable prefix plus the full SHA-256 hex digest of the
    ORIGINAL file name, so equal inputs always map to the same key.

    Args:
        file_name: The original file name
        prefix: S3 key prefix (e.g., 'onyx-files')
        tenant_id: Tenant identifier
        max_key_length: Maximum allowed S3 key length (default: 1024, the AWS limit)

    Returns:
        A complete S3 key that fits within the length limit
    """
    # Strip slashes so joining the parts with "/" cannot double up slashes.
    prefix_clean = prefix.strip("/")
    tenant_clean = tenant_id.strip("/")

    # Sanitize the file name first
    sanitized_file_name = sanitize_s3_key_name(file_name)

    # Budget left for the file-name component of "{prefix}/{tenant}/{name}".
    prefix_and_tenant_parts = [prefix_clean, tenant_clean]
    prefix_and_tenant = "/".join(prefix_and_tenant_parts) + "/"
    max_file_name_length = max_key_length - len(prefix_and_tenant)

    # <= (not <): a name that fills the budget exactly still yields a key of
    # precisely max_key_length, which S3 allows.
    if len(sanitized_file_name) <= max_file_name_length:
        return "/".join(prefix_and_tenant_parts + [sanitized_file_name])

    # For very long file names, use a hash-based approach to ensure uniqueness.
    # Hash the ORIGINAL file name so the key is stable across calls.
    file_hash = hashlib.sha256(file_name.encode("utf-8")).hexdigest()

    # Reserve space for "_" + 64-char hash; the remainder keeps readability.
    readable_part_max_length = max(0, max_file_name_length - HASH_WITH_SEPARATOR_LENGTH)

    if readable_part_max_length > 0:
        readable_part = sanitized_file_name[:readable_part_max_length]
        truncated_name = f"{readable_part}_{file_hash}"
    else:
        # NOTE(review): if max_file_name_length < HASH_LENGTH the bare hash
        # still exceeds the budget; unreachable with the default 1024 limit
        # and realistic prefixes — confirm before allowing tiny limits.
        truncated_name = file_hash

    return "/".join(prefix_and_tenant_parts + [truncated_name])