From 72e952018c9ef0192ebcb2625ea22c62cd281d82 Mon Sep 17 00:00:00 2001 From: Weves Date: Mon, 23 Jun 2025 16:36:48 -0700 Subject: [PATCH 1/4] Handle very long file names --- backend/onyx/file_store/file_store.py | 28 ++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/backend/onyx/file_store/file_store.py b/backend/onyx/file_store/file_store.py index 9226ac9a760..5aa97fad878 100644 --- a/backend/onyx/file_store/file_store.py +++ b/backend/onyx/file_store/file_store.py @@ -1,3 +1,4 @@ +import hashlib import tempfile import uuid from abc import ABC @@ -206,7 +207,32 @@ def _get_bucket_name(self) -> str: def _get_s3_key(self, file_name: str) -> str: """Generate S3 key from file name with tenant ID prefix""" tenant_id = get_current_tenant_id() - return f"{self._s3_prefix}/{tenant_id}/{file_name}" + + # Strip slashes from prefix and tenant_id to avoid double slashes + prefix_clean = self._s3_prefix.strip("/") + tenant_clean = tenant_id.strip("/") + + # Handle long file names that could exceed S3's 1024 character key limit + # S3 key format: {prefix}/{tenant_id}/{file_name} + # We need to account for the prefix, tenant_id, and forward slashes + prefix_and_tenant_parts = [prefix_clean, tenant_clean] + prefix_and_tenant = "/".join(prefix_and_tenant_parts) + "/" + max_file_name_length = 1024 - len(prefix_and_tenant) + + if len(file_name) < max_file_name_length: + return "/".join(prefix_and_tenant_parts + [file_name]) + + # For long file names, use a hash-based approach to ensure uniqueness + file_hash = hashlib.sha256(file_name.encode("utf-8")).hexdigest() + # Use first part of original name + hash to maintain some readability + readable_part = file_name[ + : max(0, max_file_name_length - 65) + ] # 64 chars for hash + 1 for separator + if readable_part: + truncated_name = f"{readable_part}_{file_hash}" + else: + truncated_name = file_hash + return "/".join(prefix_and_tenant_parts + [truncated_name]) def initialize(self) -> None: """Initialize the S3 file store by ensuring the bucket exists""" From 0ce1d9a0baee83fb1fd6a7eafdea2ab9c957b63a Mon Sep 17 00:00:00 2001 From: Weves Date: Mon, 23 Jun 2025 16:57:50 -0700 Subject: [PATCH 2/4] Add logging --- backend/onyx/file_store/file_store.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/backend/onyx/file_store/file_store.py b/backend/onyx/file_store/file_store.py index 5aa97fad878..5c99341241c 100644 --- a/backend/onyx/file_store/file_store.py +++ b/backend/onyx/file_store/file_store.py @@ -222,6 +222,8 @@ def _get_s3_key(self, file_name: str) -> str: if len(file_name) < max_file_name_length: return "/".join(prefix_and_tenant_parts + [file_name]) + logger.info(f"File name is too long: {file_name}. Truncating.") + # For long file names, use a hash-based approach to ensure uniqueness file_hash = hashlib.sha256(file_name.encode("utf-8")).hexdigest() # Use first part of original name + hash to maintain some readability From 886f2a16e136c8fd61b31e365817a1402fad6530 Mon Sep 17 00:00:00 2001 From: Weves Date: Mon, 23 Jun 2025 18:09:41 -0700 Subject: [PATCH 3/4] Enhancements --- backend/onyx/file_store/file_store.py | 41 ++---- backend/onyx/file_store/s3_key_utils.py | 159 ++++++++++++++++++++++++ 2 files changed, 172 insertions(+), 28 deletions(-) create mode 100644 backend/onyx/file_store/s3_key_utils.py diff --git a/backend/onyx/file_store/file_store.py b/backend/onyx/file_store/file_store.py index 5c99341241c..537c734bb16 100644 --- a/backend/onyx/file_store/file_store.py +++ b/backend/onyx/file_store/file_store.py @@ -1,4 +1,3 @@ -import hashlib import tempfile import uuid from abc import ABC @@ -28,6 +27,7 @@ from onyx.db.file_record import get_filerecord_by_file_id_optional from onyx.db.file_record import upsert_filerecord from onyx.db.models import FileRecord as FileStoreModel +from onyx.file_store.s3_key_utils import generate_s3_key from onyx.utils.file import FileWithMimeType from onyx.utils.logger import setup_logger from shared_configs.contextvars import get_current_tenant_id @@ -208,33 +208,18 @@ def _get_s3_key(self, file_name: str) -> str: """Generate S3 key from file name with tenant ID prefix""" tenant_id = get_current_tenant_id() - # Strip slashes from prefix and tenant_id to avoid double slashes - prefix_clean = self._s3_prefix.strip("/") - tenant_clean = tenant_id.strip("/") - - # Handle long file names that could exceed S3's 1024 character key limit - # S3 key format: {prefix}/{tenant_id}/{file_name} - # We need to account for the prefix, tenant_id, and forward slashes - prefix_and_tenant_parts = [prefix_clean, tenant_clean] - prefix_and_tenant = "/".join(prefix_and_tenant_parts) + "/" - max_file_name_length = 1024 - len(prefix_and_tenant) - - if len(file_name) < max_file_name_length: - return "/".join(prefix_and_tenant_parts + [file_name]) - - logger.info(f"File name is too long: {file_name}. Truncating.") - - # For long file names, use a hash-based approach to ensure uniqueness - file_hash = hashlib.sha256(file_name.encode("utf-8")).hexdigest() - # Use first part of original name + hash to maintain some readability - readable_part = file_name[ - : max(0, max_file_name_length - 65) - ] # 64 chars for hash + 1 for separator - if readable_part: - truncated_name = f"{readable_part}_{file_hash}" - else: - truncated_name = file_hash - return "/".join(prefix_and_tenant_parts + [truncated_name]) + s3_key = generate_s3_key( + file_name=file_name, + prefix=self._s3_prefix, + tenant_id=tenant_id, + max_key_length=1024, + ) + + # Log if truncation occurred (when the key is exactly at the limit) + if len(s3_key) == 1024: + logger.info(f"File name was too long and was truncated: {file_name}") + + return s3_key def initialize(self) -> None: """Initialize the S3 file store by ensuring the bucket exists""" diff --git a/backend/onyx/file_store/s3_key_utils.py b/backend/onyx/file_store/s3_key_utils.py new file mode 100644 index 00000000000..b38c0385a4b --- /dev/null +++ b/backend/onyx/file_store/s3_key_utils.py @@ -0,0 +1,159 @@ +""" +S3 key sanitization utilities for ensuring AWS S3 compatibility. + +This module provides utilities for sanitizing file names to be compatible with +AWS S3 object key naming guidelines while ensuring uniqueness when significant +sanitization occurs. + +Reference: https://docs.aws.amazon.com/AmazonS3/latest/userguide/object-keys.html +""" + +import hashlib +import re +import urllib.parse +from re import Match + + +def _encode_special_char(match: Match[str]) -> str: + """Helper function to URL encode special characters.""" + return urllib.parse.quote(match.group(0), safe="") + + +def sanitize_s3_key_name(file_name: str) -> str: + """ + Sanitize file name to be S3-compatible according to AWS guidelines. + + This method: + 1. Replaces problematic characters with safe alternatives + 2. URL-encodes characters that might require special handling + 3. Ensures the result is safe for S3 object keys + 4. Adds uniqueness when significant sanitization occurs + + Args: + file_name: The original file name to sanitize + + Returns: + A sanitized file name that is S3-compatible + + Reference: https://docs.aws.amazon.com/AmazonS3/latest/userguide/object-keys.html + """ + if not file_name: + return "unnamed_file" + + original_name = file_name + + # Characters to avoid completely (replace with underscore) + # These are characters that AWS recommends avoiding + avoid_chars = r'[\\{}^%`\[\]"<>#|~]' + + # Replace avoided characters with underscore + sanitized = re.sub(avoid_chars, "_", file_name) + + # Characters that might require special handling but are allowed + # We'll URL encode these to be safe + special_chars = r"[&$@=;:+,?\s]" + + sanitized = re.sub(special_chars, _encode_special_char, sanitized) + + # Handle non-ASCII characters by URL encoding them + # This ensures Unicode characters are properly handled + needs_unicode_encoding = False + try: + # Try to encode as ASCII to check if it contains non-ASCII chars + sanitized.encode("ascii") + except UnicodeEncodeError: + needs_unicode_encoding = True + # Contains non-ASCII characters, URL encode the entire string + # but preserve safe ASCII characters + sanitized = urllib.parse.quote( + sanitized, + safe="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_.()!*", + ) + + # Ensure we don't have consecutive periods at the start (relative path issue) + sanitized = re.sub(r"^\.+", "", sanitized) + + # Remove any trailing periods to avoid download issues + sanitized = sanitized.rstrip(".") + + # If sanitization resulted in empty string, use a default + if not sanitized: + sanitized = "sanitized_file" + + # Check if significant sanitization occurred and add uniqueness if needed + significant_changes = ( + # Check if we replaced many characters + len(re.findall(avoid_chars, original_name)) > 3 + or + # Check if we had to URL encode Unicode characters + needs_unicode_encoding + or + # Check if the sanitized name is very different in length (expansion due to encoding) + len(sanitized) > len(original_name) * 2 + or + # Check if the original had many special characters + len(re.findall(special_chars, original_name)) > 5 + ) + + if significant_changes: + # Add a short hash to ensure uniqueness while keeping some readability + name_hash = hashlib.sha256(original_name.encode("utf-8")).hexdigest()[:8] + + # Try to preserve file extension if it exists and is reasonable + if "." in sanitized and len(sanitized.split(".")[-1]) <= 10: + name_parts = sanitized.rsplit(".", 1) + sanitized = f"{name_parts[0]}_{name_hash}.{name_parts[1]}" + else: + sanitized = f"{sanitized}_{name_hash}" + + return sanitized + + +def generate_s3_key( + file_name: str, prefix: str, tenant_id: str, max_key_length: int = 1024 +) -> str: + """ + Generate a complete S3 key from file name with prefix and tenant ID. + + Args: + file_name: The original file name + prefix: S3 key prefix (e.g., 'onyx-files') + tenant_id: Tenant identifier + max_key_length: Maximum allowed S3 key length (default: 1024) + + Returns: + A complete S3 key that fits within the length limit + """ + # Strip slashes from prefix and tenant_id to avoid double slashes + prefix_clean = prefix.strip("/") + tenant_clean = tenant_id.strip("/") + + # Sanitize the file name first + sanitized_file_name = sanitize_s3_key_name(file_name) + + # Handle long file names that could exceed S3's key limit + # S3 key format: {prefix}/{tenant_id}/{file_name} + prefix_and_tenant_parts = [prefix_clean, tenant_clean] + prefix_and_tenant = "/".join(prefix_and_tenant_parts) + "/" + max_file_name_length = max_key_length - len(prefix_and_tenant) + + if len(sanitized_file_name) < max_file_name_length: + return "/".join(prefix_and_tenant_parts + [sanitized_file_name]) + + # For very long file names, use hash-based approach to ensure uniqueness + # Use the original file name for the hash to maintain consistency + file_hash = hashlib.sha256(file_name.encode("utf-8")).hexdigest() + + # Calculate how much space we have for the readable part + # Reserve 65 characters for hash (64 chars + 1 underscore) + readable_part_max_length = max(0, max_file_name_length - 65) + + if readable_part_max_length > 0: + # Use first part of sanitized name + hash to maintain some readability + readable_part = sanitized_file_name[:readable_part_max_length] + truncated_name = f"{readable_part}_{file_hash}" + else: + # If no space for readable part, just use hash + truncated_name = file_hash + + return "/".join(prefix_and_tenant_parts + [truncated_name]) From 1456738083441777c7e38da85f5aff3acb51f184 Mon Sep 17 00:00:00 2001 From: Weves Date: Mon, 23 Jun 2025 18:16:11 -0700 Subject: [PATCH 4/4] EL comments --- backend/onyx/file_store/s3_key_utils.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/backend/onyx/file_store/s3_key_utils.py b/backend/onyx/file_store/s3_key_utils.py index b38c0385a4b..3f84ce59da1 100644 --- a/backend/onyx/file_store/s3_key_utils.py +++ b/backend/onyx/file_store/s3_key_utils.py @@ -13,6 +13,11 @@ import urllib.parse from re import Match +# Constants for S3 key generation +HASH_LENGTH = 64 # SHA256 hex digest length +HASH_SEPARATOR_LENGTH = 1 # Length of underscore separator +HASH_WITH_SEPARATOR_LENGTH = HASH_LENGTH + HASH_SEPARATOR_LENGTH + def _encode_special_char(match: Match[str]) -> str: """Helper function to URL encode special characters.""" @@ -145,8 +150,8 @@ def generate_s3_key( file_hash = hashlib.sha256(file_name.encode("utf-8")).hexdigest() # Calculate how much space we have for the readable part - # Reserve 65 characters for hash (64 chars + 1 underscore) - readable_part_max_length = max(0, max_file_name_length - 65) + # Reserve space for hash (64 chars) + underscore separator (1 char) + readable_part_max_length = max(0, max_file_name_length - HASH_WITH_SEPARATOR_LENGTH) if readable_part_max_length > 0: # Use first part of sanitized name + hash to maintain some readability