From 72e952018c9ef0192ebcb2625ea22c62cd281d82 Mon Sep 17 00:00:00 2001
From: Weves <chrisweaver101@gmail.com>
Date: Mon, 23 Jun 2025 16:36:48 -0700
Subject: [PATCH 1/4] Handle very long file names

---
 backend/onyx/file_store/file_store.py | 28 ++++++++++++++++++++++++++-
 1 file changed, 27 insertions(+), 1 deletion(-)

diff --git a/backend/onyx/file_store/file_store.py b/backend/onyx/file_store/file_store.py
index 9226ac9a760..5aa97fad878 100644
--- a/backend/onyx/file_store/file_store.py
+++ b/backend/onyx/file_store/file_store.py
@@ -1,3 +1,4 @@
+import hashlib
 import tempfile
 import uuid
 from abc import ABC
@@ -206,7 +207,32 @@ def _get_bucket_name(self) -> str:
     def _get_s3_key(self, file_name: str) -> str:
         """Generate S3 key from file name with tenant ID prefix"""
         tenant_id = get_current_tenant_id()
-        return f"{self._s3_prefix}/{tenant_id}/{file_name}"
+
+        # Strip slashes from prefix and tenant_id to avoid double slashes
+        prefix_clean = self._s3_prefix.strip("/")
+        tenant_clean = tenant_id.strip("/")
+
+        # Handle long file names that could exceed S3's 1024 character key limit
+        # S3 key format: {prefix}/{tenant_id}/{file_name}
+        # We need to account for the prefix, tenant_id, and forward slashes
+        prefix_and_tenant_parts = [prefix_clean, tenant_clean]
+        prefix_and_tenant = "/".join(prefix_and_tenant_parts) + "/"
+        max_file_name_length = 1024 - len(prefix_and_tenant)
+
+        if len(file_name) < max_file_name_length:
+            return "/".join(prefix_and_tenant_parts + [file_name])
+
+        # For long file names, use a hash-based approach to ensure uniqueness
+        file_hash = hashlib.sha256(file_name.encode("utf-8")).hexdigest()
+        # Use first part of original name + hash to maintain some readability
+        readable_part = file_name[
+            : max(0, max_file_name_length - 65)
+        ]  # 64 chars for hash + 1 for separator
+        if readable_part:
+            truncated_name = f"{readable_part}_{file_hash}"
+        else:
+            truncated_name = file_hash
+        return "/".join(prefix_and_tenant_parts + [truncated_name])
 
     def initialize(self) -> None:
         """Initialize the S3 file store by ensuring the bucket exists"""

From 0ce1d9a0baee83fb1fd6a7eafdea2ab9c957b63a Mon Sep 17 00:00:00 2001
From: Weves <chrisweaver101@gmail.com>
Date: Mon, 23 Jun 2025 16:57:50 -0700
Subject: [PATCH 2/4] Add logging

---
 backend/onyx/file_store/file_store.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/backend/onyx/file_store/file_store.py b/backend/onyx/file_store/file_store.py
index 5aa97fad878..5c99341241c 100644
--- a/backend/onyx/file_store/file_store.py
+++ b/backend/onyx/file_store/file_store.py
@@ -222,6 +222,8 @@ def _get_s3_key(self, file_name: str) -> str:
         if len(file_name) < max_file_name_length:
             return "/".join(prefix_and_tenant_parts + [file_name])
 
+        logger.info(f"File name is too long: {file_name}. Truncating.")
+
         # For long file names, use a hash-based approach to ensure uniqueness
         file_hash = hashlib.sha256(file_name.encode("utf-8")).hexdigest()
         # Use first part of original name + hash to maintain some readability

From 886f2a16e136c8fd61b31e365817a1402fad6530 Mon Sep 17 00:00:00 2001
From: Weves <chrisweaver101@gmail.com>
Date: Mon, 23 Jun 2025 18:09:41 -0700
Subject: [PATCH 3/4] Enhancements

---
 backend/onyx/file_store/file_store.py   |  41 ++----
 backend/onyx/file_store/s3_key_utils.py | 159 ++++++++++++++++++++++++
 2 files changed, 172 insertions(+), 28 deletions(-)
 create mode 100644 backend/onyx/file_store/s3_key_utils.py

diff --git a/backend/onyx/file_store/file_store.py b/backend/onyx/file_store/file_store.py
index 5c99341241c..537c734bb16 100644
--- a/backend/onyx/file_store/file_store.py
+++ b/backend/onyx/file_store/file_store.py
@@ -1,4 +1,3 @@
-import hashlib
 import tempfile
 import uuid
 from abc import ABC
@@ -28,6 +27,7 @@
 from onyx.db.file_record import get_filerecord_by_file_id_optional
 from onyx.db.file_record import upsert_filerecord
 from onyx.db.models import FileRecord as FileStoreModel
+from onyx.file_store.s3_key_utils import generate_s3_key
 from onyx.utils.file import FileWithMimeType
 from onyx.utils.logger import setup_logger
 from shared_configs.contextvars import get_current_tenant_id
@@ -208,33 +208,18 @@ def _get_s3_key(self, file_name: str) -> str:
         """Generate S3 key from file name with tenant ID prefix"""
         tenant_id = get_current_tenant_id()
 
-        # Strip slashes from prefix and tenant_id to avoid double slashes
-        prefix_clean = self._s3_prefix.strip("/")
-        tenant_clean = tenant_id.strip("/")
-
-        # Handle long file names that could exceed S3's 1024 character key limit
-        # S3 key format: {prefix}/{tenant_id}/{file_name}
-        # We need to account for the prefix, tenant_id, and forward slashes
-        prefix_and_tenant_parts = [prefix_clean, tenant_clean]
-        prefix_and_tenant = "/".join(prefix_and_tenant_parts) + "/"
-        max_file_name_length = 1024 - len(prefix_and_tenant)
-
-        if len(file_name) < max_file_name_length:
-            return "/".join(prefix_and_tenant_parts + [file_name])
-
-        logger.info(f"File name is too long: {file_name}. Truncating.")
-
-        # For long file names, use a hash-based approach to ensure uniqueness
-        file_hash = hashlib.sha256(file_name.encode("utf-8")).hexdigest()
-        # Use first part of original name + hash to maintain some readability
-        readable_part = file_name[
-            : max(0, max_file_name_length - 65)
-        ]  # 64 chars for hash + 1 for separator
-        if readable_part:
-            truncated_name = f"{readable_part}_{file_hash}"
-        else:
-            truncated_name = file_hash
-        return "/".join(prefix_and_tenant_parts + [truncated_name])
+        s3_key = generate_s3_key(
+            file_name=file_name,
+            prefix=self._s3_prefix,
+            tenant_id=tenant_id,
+            max_key_length=1024,
+        )
+
+        # Log if truncation occurred (when the key is exactly at the limit)
+        if len(s3_key) == 1024:
+            logger.info(f"File name was too long and was truncated: {file_name}")
+
+        return s3_key
 
     def initialize(self) -> None:
         """Initialize the S3 file store by ensuring the bucket exists"""
diff --git a/backend/onyx/file_store/s3_key_utils.py b/backend/onyx/file_store/s3_key_utils.py
new file mode 100644
index 00000000000..b38c0385a4b
--- /dev/null
+++ b/backend/onyx/file_store/s3_key_utils.py
@@ -0,0 +1,159 @@
+"""
+S3 key sanitization utilities for ensuring AWS S3 compatibility.
+
+This module provides utilities for sanitizing file names to be compatible with
+AWS S3 object key naming guidelines while ensuring uniqueness when significant
+sanitization occurs.
+
+Reference: https://docs.aws.amazon.com/AmazonS3/latest/userguide/object-keys.html
+"""
+
+import hashlib
+import re
+import urllib.parse
+from re import Match
+
+
+def _encode_special_char(match: Match[str]) -> str:
+    """Helper function to URL encode special characters."""
+    return urllib.parse.quote(match.group(0), safe="")
+
+
+def sanitize_s3_key_name(file_name: str) -> str:
+    """
+    Sanitize file name to be S3-compatible according to AWS guidelines.
+
+    This method:
+    1. Replaces problematic characters with safe alternatives
+    2. URL-encodes characters that might require special handling
+    3. Ensures the result is safe for S3 object keys
+    4. Adds uniqueness when significant sanitization occurs
+
+    Args:
+        file_name: The original file name to sanitize
+
+    Returns:
+        A sanitized file name that is S3-compatible
+
+    Reference: https://docs.aws.amazon.com/AmazonS3/latest/userguide/object-keys.html
+    """
+    if not file_name:
+        return "unnamed_file"
+
+    original_name = file_name
+
+    # Characters to avoid completely (replace with underscore)
+    # These are characters that AWS recommends avoiding
+    avoid_chars = r'[\\{}^%`\[\]"<>#|~]'
+
+    # Replace avoided characters with underscore
+    sanitized = re.sub(avoid_chars, "_", file_name)
+
+    # Characters that might require special handling but are allowed
+    # We'll URL encode these to be safe
+    special_chars = r"[&$@=;:+,?\s]"
+
+    sanitized = re.sub(special_chars, _encode_special_char, sanitized)
+
+    # Handle non-ASCII characters by URL encoding them
+    # This ensures Unicode characters are properly handled
+    needs_unicode_encoding = False
+    try:
+        # Try to encode as ASCII to check if it contains non-ASCII chars
+        sanitized.encode("ascii")
+    except UnicodeEncodeError:
+        needs_unicode_encoding = True
+        # Contains non-ASCII characters, URL encode the entire string
+        # but preserve safe ASCII characters
+        sanitized = urllib.parse.quote(
+            sanitized,
+            safe="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_.()!*",
+        )
+
+    # Ensure we don't have consecutive periods at the start (relative path issue)
+    sanitized = re.sub(r"^\.+", "", sanitized)
+
+    # Remove any trailing periods to avoid download issues
+    sanitized = sanitized.rstrip(".")
+
+    # If sanitization resulted in empty string, use a default
+    if not sanitized:
+        sanitized = "sanitized_file"
+
+    # Check if significant sanitization occurred and add uniqueness if needed
+    significant_changes = (
+        # Check if we replaced many characters
+        len(re.findall(avoid_chars, original_name)) > 3
+        or
+        # Check if we had to URL encode Unicode characters
+        needs_unicode_encoding
+        or
+        # Check if the sanitized name is very different in length (expansion due to encoding)
+        len(sanitized) > len(original_name) * 2
+        or
+        # Check if the original had many special characters
+        len(re.findall(special_chars, original_name)) > 5
+    )
+
+    if significant_changes:
+        # Add a short hash to ensure uniqueness while keeping some readability
+        name_hash = hashlib.sha256(original_name.encode("utf-8")).hexdigest()[:8]
+
+        # Try to preserve file extension if it exists and is reasonable
+        if "." in sanitized and len(sanitized.split(".")[-1]) <= 10:
+            name_parts = sanitized.rsplit(".", 1)
+            sanitized = f"{name_parts[0]}_{name_hash}.{name_parts[1]}"
+        else:
+            sanitized = f"{sanitized}_{name_hash}"
+
+    return sanitized
+
+
+def generate_s3_key(
+    file_name: str, prefix: str, tenant_id: str, max_key_length: int = 1024
+) -> str:
+    """
+    Generate a complete S3 key from file name with prefix and tenant ID.
+
+    Args:
+        file_name: The original file name
+        prefix: S3 key prefix (e.g., 'onyx-files')
+        tenant_id: Tenant identifier
+        max_key_length: Maximum allowed S3 key length (default: 1024)
+
+    Returns:
+        A complete S3 key that fits within the length limit
+    """
+    # Strip slashes from prefix and tenant_id to avoid double slashes
+    prefix_clean = prefix.strip("/")
+    tenant_clean = tenant_id.strip("/")
+
+    # Sanitize the file name first
+    sanitized_file_name = sanitize_s3_key_name(file_name)
+
+    # Handle long file names that could exceed S3's key limit
+    # S3 key format: {prefix}/{tenant_id}/{file_name}
+    prefix_and_tenant_parts = [prefix_clean, tenant_clean]
+    prefix_and_tenant = "/".join(prefix_and_tenant_parts) + "/"
+    max_file_name_length = max_key_length - len(prefix_and_tenant)
+
+    if len(sanitized_file_name) < max_file_name_length:
+        return "/".join(prefix_and_tenant_parts + [sanitized_file_name])
+
+    # For very long file names, use hash-based approach to ensure uniqueness
+    # Use the original file name for the hash to maintain consistency
+    file_hash = hashlib.sha256(file_name.encode("utf-8")).hexdigest()
+
+    # Calculate how much space we have for the readable part
+    # Reserve 65 characters for hash (64 chars + 1 underscore)
+    readable_part_max_length = max(0, max_file_name_length - 65)
+
+    if readable_part_max_length > 0:
+        # Use first part of sanitized name + hash to maintain some readability
+        readable_part = sanitized_file_name[:readable_part_max_length]
+        truncated_name = f"{readable_part}_{file_hash}"
+    else:
+        # If no space for readable part, just use hash
+        truncated_name = file_hash
+
+    return "/".join(prefix_and_tenant_parts + [truncated_name])

From 1456738083441777c7e38da85f5aff3acb51f184 Mon Sep 17 00:00:00 2001
From: Weves <chrisweaver101@gmail.com>
Date: Mon, 23 Jun 2025 18:16:11 -0700
Subject: [PATCH 4/4] EL comments

---
 backend/onyx/file_store/s3_key_utils.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/backend/onyx/file_store/s3_key_utils.py b/backend/onyx/file_store/s3_key_utils.py
index b38c0385a4b..3f84ce59da1 100644
--- a/backend/onyx/file_store/s3_key_utils.py
+++ b/backend/onyx/file_store/s3_key_utils.py
@@ -13,6 +13,11 @@
 import urllib.parse
 from re import Match
 
+# Constants for S3 key generation
+HASH_LENGTH = 64  # SHA256 hex digest length
+HASH_SEPARATOR_LENGTH = 1  # Length of underscore separator
+HASH_WITH_SEPARATOR_LENGTH = HASH_LENGTH + HASH_SEPARATOR_LENGTH
+
 
 def _encode_special_char(match: Match[str]) -> str:
     """Helper function to URL encode special characters."""
@@ -145,8 +150,8 @@ def generate_s3_key(
     file_hash = hashlib.sha256(file_name.encode("utf-8")).hexdigest()
 
     # Calculate how much space we have for the readable part
-    # Reserve 65 characters for hash (64 chars + 1 underscore)
-    readable_part_max_length = max(0, max_file_name_length - 65)
+    # Reserve space for hash (64 chars) + underscore separator (1 char)
+    readable_part_max_length = max(0, max_file_name_length - HASH_WITH_SEPARATOR_LENGTH)
 
     if readable_part_max_length > 0:
         # Use first part of sanitized name + hash to maintain some readability