diff --git a/backend/alembic/versions/12635f6655b7_drive_canonical_ids.py b/backend/alembic/versions/12635f6655b7_drive_canonical_ids.py
new file mode 100644
index 00000000000..1d360275107
--- /dev/null
+++ b/backend/alembic/versions/12635f6655b7_drive_canonical_ids.py
@@ -0,0 +1,555 @@
+"""drive-canonical-ids
+
+Revision ID: 12635f6655b7
+Revises: 58c50ef19f08
+Create Date: 2025-06-20 14:44:54.241159
+
+"""
+
+from alembic import op
+import sqlalchemy as sa
+from urllib.parse import urlparse, urlunparse
+
+from onyx.document_index.factory import get_default_document_index
+from onyx.db.search_settings import SearchSettings
+from onyx.document_index.vespa.shared_utils.utils import get_vespa_http_client
+from onyx.document_index.vespa.shared_utils.utils import (
+    replace_invalid_doc_id_characters,
+)
+from onyx.document_index.vespa_constants import SEARCH_ENDPOINT, DOCUMENT_ID_ENDPOINT
+from onyx.utils.logger import setup_logger
+
+logger = setup_logger()
+
+# revision identifiers, used by Alembic.
+revision = "12635f6655b7"
+down_revision = "58c50ef19f08"
+branch_labels = None
+depends_on = None
+
+
+def active_search_settings() -> tuple[SearchSettings, SearchSettings | None]:
+    result = op.get_bind().execute(
+        sa.text(
+            """
+            SELECT * FROM search_settings WHERE status = 'PRESENT' ORDER BY id DESC LIMIT 1
+            """
+        )
+    )
+    search_settings_fetch = result.fetchall()
+    search_settings = (
+        SearchSettings(**search_settings_fetch[0]._asdict())
+        if search_settings_fetch
+        else None
+    )
+
+    result2 = op.get_bind().execute(
+        sa.text(
+            """
+            SELECT * FROM search_settings WHERE status = 'FUTURE' ORDER BY id DESC LIMIT 1
+            """
+        )
+    )
+    search_settings_future_fetch = result2.fetchall()
+    search_settings_future = (
+        SearchSettings(**search_settings_future_fetch[0]._asdict())
+        if search_settings_future_fetch
+        else None
+    )
+
+    if not isinstance(search_settings, SearchSettings):
+        raise RuntimeError(
+            "current search settings is of type " + str(type(search_settings))
+        )
+    if (
+        not isinstance(search_settings_future, SearchSettings)
+        and search_settings_future is not None
+    ):
+        raise RuntimeError(
+            "future search settings is of type " + str(type(search_settings_future))
+        )
+
+    return search_settings, search_settings_future
+
+
+def normalize_google_drive_url(url: str) -> str:
+    """Remove query parameters from Google Drive URLs to create canonical document IDs.
+
+    NOTE: copied from drive doc_conversion.py
+    """
+    parsed_url = urlparse(url)
+    parsed_url = parsed_url._replace(query="")
+    spl_path = parsed_url.path.split("/")
+    if spl_path and (spl_path[-1] in ["edit", "view", "preview"]):
+        spl_path.pop()
+    parsed_url = parsed_url._replace(path="/".join(spl_path))
+    # Reconstruct the URL without query parameters or a trailing action segment
+    return urlunparse(parsed_url)
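
Reviewer note: for reference, this is the shape of the transformation the helper performs. The file IDs below are made up; only the URL shape matters.

```python
# Illustrative sanity checks for normalize_google_drive_url (IDs are fake).
assert normalize_google_drive_url(
    "https://drive.google.com/file/d/abc123/view?usp=drivesdk"
) == "https://drive.google.com/file/d/abc123"
assert normalize_google_drive_url(
    "https://docs.google.com/document/d/abc123/edit?usp=drivesdk"
) == "https://docs.google.com/document/d/abc123"
# Already-canonical IDs pass through unchanged.
assert normalize_google_drive_url(
    "https://drive.google.com/file/d/abc123"
) == "https://drive.google.com/file/d/abc123"
```
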
+
+
+def get_google_drive_documents_from_database() -> list[dict]:
+    """Get all Google Drive documents from the database."""
+    bind = op.get_bind()
+    result = bind.execute(
+        sa.text(
+            """
+            SELECT d.id, cc.id as cc_pair_id
+            FROM document d
+            JOIN document_by_connector_credential_pair dcc ON d.id = dcc.id
+            JOIN connector_credential_pair cc ON dcc.connector_id = cc.connector_id
+                AND dcc.credential_id = cc.credential_id
+            JOIN connector c ON cc.connector_id = c.id
+            WHERE c.source = 'GOOGLE_DRIVE'
+            """
+        )
+    )
+
+    documents = []
+    for row in result:
+        documents.append({"document_id": row.id, "cc_pair_id": row.cc_pair_id})
+
+    return documents
+
+
+def update_document_id_in_database(old_doc_id: str, new_doc_id: str) -> None:
+    """Update document IDs in all relevant database tables using a copy-and-swap approach."""
+    bind = op.get_bind()
+
+    logger.info(f"Updating database tables for document {old_doc_id} -> {new_doc_id}")
+
+    # Check if the new document ID already exists
+    result = bind.execute(
+        sa.text("SELECT COUNT(*) FROM document WHERE id = :new_id"),
+        {"new_id": new_doc_id},
+    )
+    row = result.fetchone()
+    if row and row[0] > 0:
+        raise RuntimeError(
+            f"Document with ID {new_doc_id} already exists, cannot create duplicate"
+        )
+
+    # Step 1: Create a new document row with the new ID (copy all fields from old row)
+    # Use a conservative approach to handle columns that might not exist in all installations
+    try:
+        bind.execute(
+            sa.text(
+                """
+                INSERT INTO document (id, from_ingestion_api, boost, hidden, semantic_id,
+                                      link, doc_updated_at, primary_owners, secondary_owners,
+                                      external_user_emails, external_user_group_ids, is_public,
+                                      chunk_count, last_modified, last_synced, kg_stage, kg_processing_time)
+                SELECT :new_id, from_ingestion_api, boost, hidden, semantic_id,
+                       link, doc_updated_at, primary_owners, secondary_owners,
+                       external_user_emails, external_user_group_ids, is_public,
+                       chunk_count, last_modified, last_synced, kg_stage, kg_processing_time
+                FROM document
+                WHERE id = :old_id
+                """
+            ),
+            {"new_id": new_doc_id, "old_id": old_doc_id},
+        )
+    except Exception as e:
+        # If the full INSERT fails, try a more basic version with only core columns
+        logger.warning(f"Full INSERT failed, trying basic version: {e}")
+        bind.execute(
+            sa.text(
+                """
+                INSERT INTO document (id, from_ingestion_api, boost, hidden, semantic_id,
+                                      link, doc_updated_at, primary_owners, secondary_owners)
+                SELECT :new_id, from_ingestion_api, boost, hidden, semantic_id,
+                       link, doc_updated_at, primary_owners, secondary_owners
+                FROM document
+                WHERE id = :old_id
+                """
+            ),
+            {"new_id": new_doc_id, "old_id": old_doc_id},
+        )
+
+    # Step 2: Update all foreign key references to point to the new ID
+
+    # Update document_by_connector_credential_pair table
+    bind.execute(
+        sa.text(
+            "UPDATE document_by_connector_credential_pair SET id = :new_id WHERE id = :old_id"
+        ),
+        {"new_id": new_doc_id, "old_id": old_doc_id},
+    )
+
+    # Update search_doc table (stores search results for chat replay)
+    bind.execute(
+        sa.text(
+            "UPDATE search_doc SET document_id = :new_id WHERE document_id = :old_id"
+        ),
+        {"new_id": new_doc_id, "old_id": old_doc_id},
+    )
+
+    # Update document_retrieval_feedback table (user feedback on documents)
+    bind.execute(
+        sa.text(
+            "UPDATE document_retrieval_feedback SET document_id = :new_id WHERE document_id = :old_id"
+        ),
+        {"new_id": new_doc_id, "old_id": old_doc_id},
+    )
+
+    # Update document__tag table (document-tag relationships)
+    bind.execute(
+        sa.text(
+            "UPDATE document__tag SET document_id = :new_id WHERE document_id = :old_id"
+        ),
+        {"new_id": new_doc_id, "old_id": old_doc_id},
+    )
+
+    # Update user_file table (user uploaded files linked to documents)
+    bind.execute(
+        sa.text(
+            "UPDATE user_file SET document_id = :new_id WHERE document_id = :old_id"
+        ),
+        {"new_id": new_doc_id, "old_id": old_doc_id},
+    )
+
+    # Update KG and chunk_stats tables (these may not exist in all installations)
+    try:
+        # Update kg_entity table
+        bind.execute(
+            sa.text(
+                "UPDATE kg_entity SET document_id = :new_id WHERE document_id = :old_id"
+            ),
+            {"new_id": new_doc_id, "old_id": old_doc_id},
+        )
+
+        # Update kg_entity_extraction_staging table
+        bind.execute(
+            sa.text(
+                "UPDATE kg_entity_extraction_staging SET document_id = :new_id WHERE document_id = :old_id"
+            ),
+            {"new_id": new_doc_id, "old_id": old_doc_id},
+        )
+
+        # Update kg_relationship table
+        bind.execute(
+            sa.text(
+                "UPDATE kg_relationship SET source_document = :new_id WHERE source_document = :old_id"
+            ),
+            {"new_id": new_doc_id, "old_id": old_doc_id},
+        )
+
+        # Update kg_relationship_extraction_staging table
+        bind.execute(
+            sa.text(
+                "UPDATE kg_relationship_extraction_staging SET source_document = :new_id WHERE source_document = :old_id"
+            ),
+            {"new_id": new_doc_id, "old_id": old_doc_id},
+        )
+
+        # Update chunk_stats table
+        bind.execute(
+            sa.text(
+                "UPDATE chunk_stats SET document_id = :new_id WHERE document_id = :old_id"
+            ),
+            {"new_id": new_doc_id, "old_id": old_doc_id},
+        )
+
+        # Update the chunk_stats ID field, which embeds the document_id
+        bind.execute(
+            sa.text(
+                """
+                UPDATE chunk_stats
+                SET id = REPLACE(id, :old_id, :new_id)
+                WHERE id LIKE :old_id_pattern
+                """
+            ),
+            {
+                "new_id": new_doc_id,
+                "old_id": old_doc_id,
+                "old_id_pattern": f"{old_doc_id}__%",
+            },
+        )
+
+    except Exception as e:
+        logger.warning(f"Some KG/chunk tables may not exist or failed to update: {e}")
+
+    # Step 3: Delete the old document row (this should now be safe since all FKs point to the new row)
+    bind.execute(
+        sa.text("DELETE FROM document WHERE id = :old_id"), {"old_id": old_doc_id}
+    )
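
Reviewer note: the copy-and-swap above exists because a plain `UPDATE document SET id = ...` is rejected when referencing foreign keys lack `ON UPDATE CASCADE`. A minimal sketch of the same pattern in isolation, with made-up table and column names (`parent`, `child`, and the DSN are illustrative, not part of this codebase):

```python
# Minimal copy-and-swap sketch: insert the row under the new key, repoint
# every referencing row, then delete the original. Names are illustrative.
from sqlalchemy import create_engine, text

engine = create_engine("postgresql://user:pass@localhost/db")  # placeholder DSN
with engine.begin() as conn:  # single transaction, so a failure rolls back cleanly
    conn.execute(
        text(
            "INSERT INTO parent (id, payload) "
            "SELECT :new, payload FROM parent WHERE id = :old"
        ),
        {"new": "new-id", "old": "old-id"},
    )
    conn.execute(
        text("UPDATE child SET parent_id = :new WHERE parent_id = :old"),
        {"new": "new-id", "old": "old-id"},
    )
    conn.execute(text("DELETE FROM parent WHERE id = :old"), {"old": "old-id"})
```
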
+
+
+def delete_document_chunks_from_vespa(index_name: str, doc_id: str) -> None:
+    """Delete all chunks for a document from Vespa using pagination."""
+    offset = 0
+    limit = 400  # Vespa's maximum hits per query
+    total_deleted = 0
+
+    with get_vespa_http_client() as http_client:
+        while True:
+            # Use pagination to handle the 400 hit limit
+            yql = f'select documentid, document_id, chunk_id from sources {index_name} where document_id contains "{doc_id}"'
+
+            params = {
+                "yql": yql,
+                "hits": str(limit),
+                "offset": str(offset),
+                "timeout": "30s",
+                "format": "json",
+            }
+
+            response = http_client.get(SEARCH_ENDPOINT, params=params, timeout=None)
+            response.raise_for_status()
+
+            search_result = response.json()
+            hits = search_result.get("root", {}).get("children", [])
+
+            if not hits:
+                break  # No more chunks to process
+
+            # Delete each chunk in this batch
+            failed_in_batch = 0
+            for hit in hits:
+                vespa_doc_id = hit.get("id")  # This is the internal Vespa document ID
+                if not vespa_doc_id:
+                    print(f"No Vespa document ID found for chunk {hit}")
+                    failed_in_batch += 1
+                    continue
+                vespa_doc_id = vespa_doc_id.split("::")[-1]  # get the UUID from the end
+
+                # Delete the chunk using the internal Vespa document ID
+                delete_url = f"{DOCUMENT_ID_ENDPOINT.format(index_name=index_name)}/{vespa_doc_id}"
+
+                try:
+                    resp = http_client.delete(delete_url)
+                    resp.raise_for_status()
+                    total_deleted += 1
+                except Exception as e:
+                    print(f"Failed to delete chunk {vespa_doc_id}: {e}")
+                    # Continue trying to delete other chunks even if one fails
+                    failed_in_batch += 1
+                    continue
+
+            # Deleted chunks drop out of the next result set, so only advance the
+            # offset past chunks that failed to delete; advancing by the full
+            # batch size would skip chunks for documents with more than 400 chunks.
+            offset += failed_in_batch
+
+            # If we got fewer hits than the limit, we're done
+            if len(hits) < limit:
+                break
+
+
+def update_document_id_in_vespa(
+    index_name: str, old_doc_id: str, new_doc_id: str
+) -> None:
+    """Update a document's ID in Vespa by updating the document_id field."""
+    # Clean the new document ID for storage in Vespa (this handles invalid characters)
+    clean_new_doc_id = replace_invalid_doc_id_characters(new_doc_id)
+
+    offset = 0
+    limit = 400  # Vespa's maximum hits per query
+    total_updated = 0
+
+    with get_vespa_http_client() as http_client:
+        while True:
+            # Use pagination to handle the 400 hit limit
+            yql = f'select documentid, document_id, chunk_id from sources {index_name} where document_id contains "{old_doc_id}"'
+
+            params = {
+                "yql": yql,
+                "hits": str(limit),
+                "offset": str(offset),
+                "timeout": "30s",
+                "format": "json",
+            }
+
+            response = http_client.get(SEARCH_ENDPOINT, params=params, timeout=None)
+            response.raise_for_status()
+
+            search_result = response.json()
+            hits = search_result.get("root", {}).get("children", [])
+
+            if not hits:
+                break  # No more chunks to process
+
+            # Update each chunk in this batch
+            skipped_in_batch = 0
+            for hit in hits:
+                vespa_doc_id = hit.get("id")  # This is the internal Vespa document ID
+                if not vespa_doc_id:
+                    print(f"No Vespa document ID found for chunk {hit}")
+                    skipped_in_batch += 1
+                    continue
+                vespa_doc_id = vespa_doc_id.split("::")[-1]  # get the UUID from the end
+
+                vespa_url = f"{DOCUMENT_ID_ENDPOINT.format(index_name=index_name)}/{vespa_doc_id}"
+                update_request = {
+                    "fields": {"document_id": {"assign": clean_new_doc_id}}
+                }
+
+                try:
+                    resp = http_client.put(vespa_url, json=update_request)
+                    resp.raise_for_status()
+                    total_updated += 1
+                except Exception as e:
+                    print(f"Failed to update chunk {vespa_doc_id}: {e}")
+                    raise
+
+            # Updated chunks no longer match the query for the old ID, so only
+            # advance the offset past chunks we could not identify; advancing by
+            # the full batch size would skip chunks for large documents.
+            offset += skipped_in_batch
+
+            # If we got fewer hits than the limit, we're done
+            if len(hits) < limit:
+                break
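
Reviewer note: the offset handling above advances only past chunks that were not removed from the matching set, because paginating while deleting shifts the survivors into the already-visited window. A toy model of the pitfall (plain Python lists standing in for the Vespa query and DELETE calls):

```python
# Self-contained illustration: advancing the offset by the batch size while
# deleting silently skips the items that shift into the deleted window.
items = list(range(10))  # pretend each int is a chunk matching the query
limit = 4

def fetch(offset: int) -> list[int]:
    return items[offset : offset + limit]  # stand-in for the paginated query

offset = 0
while True:
    batch = fetch(offset)
    if not batch:
        break
    for item in batch:
        items.remove(item)  # stand-in for the HTTP DELETE
    offset += limit  # the buggy step
    if len(batch) < limit:
        break

print(items)  # [4, 5, 6, 7] -- left behind without any error
```
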
+
+
+def delete_document_from_db(current_doc_id: str, index_name: str) -> None:
+    # Delete all foreign key references first, then delete the document
+    try:
+        bind = op.get_bind()
+
+        # Delete from document_by_connector_credential_pair
+        bind.execute(
+            sa.text(
+                "DELETE FROM document_by_connector_credential_pair WHERE id = :doc_id"
+            ),
+            {"doc_id": current_doc_id},
+        )
+
+        # Delete from other tables that reference this document
+        bind.execute(
+            sa.text("DELETE FROM search_doc WHERE document_id = :doc_id"),
+            {"doc_id": current_doc_id},
+        )
+
+        bind.execute(
+            sa.text(
+                "DELETE FROM document_retrieval_feedback WHERE document_id = :doc_id"
+            ),
+            {"doc_id": current_doc_id},
+        )
+
+        bind.execute(
+            sa.text("DELETE FROM document__tag WHERE document_id = :doc_id"),
+            {"doc_id": current_doc_id},
+        )
+
+        bind.execute(
+            sa.text("DELETE FROM user_file WHERE document_id = :doc_id"),
+            {"doc_id": current_doc_id},
+        )
+
+        # Delete from KG tables if they exist
+        try:
+            bind.execute(
+                sa.text("DELETE FROM kg_entity WHERE document_id = :doc_id"),
+                {"doc_id": current_doc_id},
+            )
+
+            bind.execute(
+                sa.text(
+                    "DELETE FROM kg_entity_extraction_staging WHERE document_id = :doc_id"
+                ),
+                {"doc_id": current_doc_id},
+            )
+
+            bind.execute(
+                sa.text("DELETE FROM kg_relationship WHERE source_document = :doc_id"),
+                {"doc_id": current_doc_id},
+            )
+
+            bind.execute(
+                sa.text(
+                    "DELETE FROM kg_relationship_extraction_staging WHERE source_document = :doc_id"
+                ),
+                {"doc_id": current_doc_id},
+            )
+
+            bind.execute(
+                sa.text("DELETE FROM chunk_stats WHERE document_id = :doc_id"),
+                {"doc_id": current_doc_id},
+            )
+
+            bind.execute(
+                sa.text("DELETE FROM chunk_stats WHERE id LIKE :doc_id_pattern"),
+                {"doc_id_pattern": f"{current_doc_id}__%"},
+            )
+
+        except Exception as e:
+            logger.warning(
+                f"Some KG/chunk tables may not exist or failed to delete from: {e}"
+            )
+
+        # Finally delete the document itself
+        bind.execute(
+            sa.text("DELETE FROM document WHERE id = :doc_id"),
+            {"doc_id": current_doc_id},
+        )
+
+        # Delete chunks from Vespa
+        delete_document_chunks_from_vespa(index_name, current_doc_id)
+
+    except Exception as e:
+        print(f"Failed to delete duplicate document {current_doc_id}: {e}")
+        # Continue with other documents instead of failing the entire migration
+
+
+def upgrade() -> None:
+    current_search_settings, future_search_settings = active_search_settings()
+    document_index = get_default_document_index(
+        current_search_settings,
+        future_search_settings,
+    )
+
+    # Get the index name
+    if hasattr(document_index, "index_name"):
+        index_name = document_index.index_name
+    else:
+        # Default index name if we can't get it from the document_index
+        index_name = "danswer_index"
+
+    # Get all Google Drive documents from the database (faster and more
+    # reliable than enumerating them from the document index)
+    gdrive_documents = get_google_drive_documents_from_database()
+
+    if not gdrive_documents:
+        return
+
+    # Track normalized document IDs to detect duplicates
+    all_normalized_doc_ids = set()
+    updated_count = 0
+
+    for doc_info in gdrive_documents:
+        current_doc_id = doc_info["document_id"]
+        normalized_doc_id = normalize_google_drive_url(current_doc_id)
+
+        # Check for duplicates
+        if normalized_doc_id in all_normalized_doc_ids:
+            delete_document_from_db(current_doc_id, index_name)
+            continue
+
+        all_normalized_doc_ids.add(normalized_doc_id)
+
+        # If the document ID is already canonical (nothing to strip), skip it
+        if current_doc_id == normalized_doc_id:
+            continue
+
+        try:
+            # Update both the database and Vespa, in that order:
+            # database first to ensure consistency
+            update_document_id_in_database(current_doc_id, normalized_doc_id)
+
+            # For Vespa, we can now use the original document IDs since we're
+            # using contains matching
+            update_document_id_in_vespa(index_name, current_doc_id, normalized_doc_id)
+            updated_count += 1
+        except Exception as e:
+            print(f"Failed to update document {current_doc_id}: {e}")
+            from httpx import HTTPStatusError
+
+            if isinstance(e, HTTPStatusError):
+                print(f"HTTPStatusError: {e}")
+                print(f"Response: {e.response.text}")
+                print(f"Status: {e.response.status_code}")
+                print(f"Headers: {e.response.headers}")
+                print(f"Request: {e.request.url}")
+                print(f"Request headers: {e.request.headers}")
+            # Note: rollback is complex with the copy-and-swap approach since the
+            # old document row is already deleted. In case of failure, manual
+            # intervention may be required.
+            # Continue with other documents instead of failing the entire migration
+            continue
+
+    logger.info(f"Migration complete. Updated {updated_count} Google Drive documents")
+
+
+def downgrade() -> None:
+    # This is a one-way migration, so there is no downgrade.
+    # It wouldn't make sense to store the extra query parameters
+    # and duplicate documents just to allow a reversal.
+    pass
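
Reviewer note: since this migration is destructive and has no downgrade, it can help to preview its impact before running it. A hedged dry-run sketch using the `normalize_google_drive_url` helper defined above (the `preview` function and its call shape are illustrative, not part of the migration):

```python
# Dry-run sketch: count how many Drive document IDs would be rewritten and
# how many canonical IDs would collide, without touching any data.
from collections import Counter

def preview(doc_ids: list[str]) -> None:
    normalized = [normalize_google_drive_url(d) for d in doc_ids]
    changed = sum(1 for old, new in zip(doc_ids, normalized) if old != new)
    dupes = [k for k, v in Counter(normalized).items() if v > 1]
    print(f"{changed} ids would be rewritten, {len(dupes)} canonical ids collide")
```
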
diff --git a/backend/alembic/versions/27c6ecc08586_permission_framework.py b/backend/alembic/versions/27c6ecc08586_permission_framework.py
index e3a4e9471c3..d6d3811234c 100644
--- a/backend/alembic/versions/27c6ecc08586_permission_framework.py
+++ b/backend/alembic/versions/27c6ecc08586_permission_framework.py
@@ -144,27 +144,34 @@ def upgrade() -> None:
 
 def downgrade() -> None:
     op.execute("TRUNCATE TABLE index_attempt")
-    op.add_column(
-        "index_attempt",
-        sa.Column("input_type", sa.VARCHAR(), autoincrement=False, nullable=False),
-    )
-    op.add_column(
-        "index_attempt",
-        sa.Column("source", sa.VARCHAR(), autoincrement=False, nullable=False),
-    )
-    op.add_column(
-        "index_attempt",
-        sa.Column(
-            "connector_specific_config",
-            postgresql.JSONB(astext_type=sa.Text()),
-            autoincrement=False,
-            nullable=False,
-        ),
-    )
-
-    # Check if the constraint exists before dropping
     conn = op.get_bind()
     inspector = sa.inspect(conn)
+    existing_columns = {col["name"] for col in inspector.get_columns("index_attempt")}
+
+    if "input_type" not in existing_columns:
+        op.add_column(
+            "index_attempt",
+            sa.Column("input_type", sa.VARCHAR(), autoincrement=False, nullable=False),
+        )
+
+    if "source" not in existing_columns:
+        op.add_column(
+            "index_attempt",
+            sa.Column("source", sa.VARCHAR(), autoincrement=False, nullable=False),
+        )
+
+    if "connector_specific_config" not in existing_columns:
+        op.add_column(
+            "index_attempt",
+            sa.Column(
+                "connector_specific_config",
+                postgresql.JSONB(astext_type=sa.Text()),
+                autoincrement=False,
+                nullable=False,
+            ),
+        )
+
+    # Check if the constraint exists before dropping
     constraints = inspector.get_foreign_keys("index_attempt")
     if any(
@@ -183,8 +190,12 @@ def downgrade() -> None:
         "fk_index_attempt_connector_id", "index_attempt", type_="foreignkey"
     )
 
-    op.drop_column("index_attempt", "credential_id")
-    op.drop_column("index_attempt", "connector_id")
-    op.drop_table("connector_credential_pair")
-    op.drop_table("credential")
-    op.drop_table("connector")
+    if "credential_id" in existing_columns:
+        op.drop_column("index_attempt", "credential_id")
+
+    if "connector_id" in existing_columns:
+        op.drop_column("index_attempt", "connector_id")
+
+    op.execute("DROP TABLE IF EXISTS connector_credential_pair CASCADE")
+    op.execute("DROP TABLE IF EXISTS credential CASCADE")
+    op.execute("DROP TABLE IF EXISTS connector CASCADE")
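
Reviewer note: the change above makes the downgrade re-runnable by consulting the live schema before issuing DDL. The same guard pattern, extracted into a small sketch (the helper name is illustrative):

```python
# Guard pattern sketch: inspect the live schema so a half-applied downgrade
# can be safely re-run. Table and column names are whatever the caller passes.
import sqlalchemy as sa
from alembic import op

def add_column_if_missing(table: str, column: sa.Column) -> None:
    inspector = sa.inspect(op.get_bind())
    existing = {col["name"] for col in inspector.get_columns(table)}
    if column.name not in existing:
        op.add_column(table, column)
```
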
diff --git a/backend/alembic/versions/495cb26ce93e_create_knowlege_graph_tables.py b/backend/alembic/versions/495cb26ce93e_create_knowlege_graph_tables.py
index ee5ab695879..65cf759d6f3 100644
--- a/backend/alembic/versions/495cb26ce93e_create_knowlege_graph_tables.py
+++ b/backend/alembic/versions/495cb26ce93e_create_knowlege_graph_tables.py
@@ -80,6 +80,7 @@ def upgrade() -> None:
         )
     )
 
+    op.execute("DROP TABLE IF EXISTS kg_config CASCADE")
     op.create_table(
         "kg_config",
         sa.Column("id", sa.Integer(), primary_key=True, nullable=False, index=True),
@@ -123,6 +124,7 @@ def upgrade() -> None:
         ],
     )
 
+    op.execute("DROP TABLE IF EXISTS kg_entity_type CASCADE")
     op.create_table(
         "kg_entity_type",
         sa.Column("id_name", sa.String(), primary_key=True, nullable=False, index=True),
@@ -156,6 +158,7 @@ def upgrade() -> None:
         ),
     )
 
+    op.execute("DROP TABLE IF EXISTS kg_relationship_type CASCADE")
     # Create KGRelationshipType table
     op.create_table(
         "kg_relationship_type",
@@ -194,6 +197,7 @@ def upgrade() -> None:
         ),
     )
 
+    op.execute("DROP TABLE IF EXISTS kg_relationship_type_extraction_staging CASCADE")
     # Create KGRelationshipTypeExtractionStaging table
     op.create_table(
         "kg_relationship_type_extraction_staging",
@@ -227,6 +231,8 @@ def upgrade() -> None:
         ),
     )
 
+    op.execute("DROP TABLE IF EXISTS kg_entity CASCADE")
+
     # Create KGEntity table
     op.create_table(
         "kg_entity",
@@ -281,6 +287,7 @@ def upgrade() -> None:
         "ix_entity_name_search", "kg_entity", ["name", "entity_type_id_name"]
     )
 
+    op.execute("DROP TABLE IF EXISTS kg_entity_extraction_staging CASCADE")
     # Create KGEntityExtractionStaging table
     op.create_table(
         "kg_entity_extraction_staging",
@@ -330,6 +337,7 @@ def upgrade() -> None:
         ["name", "entity_type_id_name"],
     )
 
+    op.execute("DROP TABLE IF EXISTS kg_relationship CASCADE")
     # Create KGRelationship table
     op.create_table(
         "kg_relationship",
@@ -371,6 +379,7 @@ def upgrade() -> None:
         "ix_kg_relationship_nodes", "kg_relationship", ["source_node", "target_node"]
     )
 
+    op.execute("DROP TABLE IF EXISTS kg_relationship_extraction_staging CASCADE")
     # Create KGRelationshipExtractionStaging table
     op.create_table(
         "kg_relationship_extraction_staging",
@@ -414,6 +423,7 @@ def upgrade() -> None:
         ["source_node", "target_node"],
     )
 
+    op.execute("DROP TABLE IF EXISTS kg_term CASCADE")
    # Create KGTerm table
     op.create_table(
         "kg_term",
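
Reviewer note: each `DROP TABLE IF EXISTS ... CASCADE` inserted above makes the following `create_table` idempotent if an earlier run of this migration failed partway through. A sketch of the pattern (the helper is illustrative; `CASCADE` here is PostgreSQL syntax):

```python
# Recreate pattern sketch: dropping with CASCADE first makes a partially
# applied migration re-runnable, at the cost of discarding any rows a prior
# partial run may have created in that table.
from alembic import op
import sqlalchemy as sa

def recreate_table(name: str, *columns: sa.Column) -> None:
    op.execute(f"DROP TABLE IF EXISTS {name} CASCADE")  # PostgreSQL-specific
    op.create_table(name, *columns)
```
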
diff --git a/backend/alembic/versions/c9e2cd766c29_add_s3_file_store_table.py b/backend/alembic/versions/c9e2cd766c29_add_s3_file_store_table.py
index 2c207c50ffb..b588d9f19f6 100644
--- a/backend/alembic/versions/c9e2cd766c29_add_s3_file_store_table.py
+++ b/backend/alembic/versions/c9e2cd766c29_add_s3_file_store_table.py
@@ -236,13 +236,14 @@ def _migrate_files_to_external_storage() -> None:
         print("No files found in PostgreSQL storage to migrate.")
         return
 
+    # NOTE: might need to move this above the early return for a brand-new
+    # (e.g. multi-tenant) deployment with zero files. VERY extreme edge case.
+    external_store.initialize()
     print(f"Found {total_files} files to migrate from PostgreSQL to external storage.")
 
     _set_tenant_contextvar(session)
 
     migrated_count = 0
-    external_store.initialize()
-
     for i, file_id in enumerate(files_to_migrate, 1):
         print(f"Migrating file {i}/{total_files}: {file_id}")
diff --git a/backend/alembic/versions/df0c7ad8a076_added_deletion_attempt_table.py b/backend/alembic/versions/df0c7ad8a076_added_deletion_attempt_table.py
index f862b415652..7ba81a490d2 100644
--- a/backend/alembic/versions/df0c7ad8a076_added_deletion_attempt_table.py
+++ b/backend/alembic/versions/df0c7ad8a076_added_deletion_attempt_table.py
@@ -18,11 +18,13 @@
 
 
 def upgrade() -> None:
+    op.execute("DROP TABLE IF EXISTS document CASCADE")
     op.create_table(
         "document",
         sa.Column("id", sa.String(), nullable=False),
         sa.PrimaryKeyConstraint("id"),
     )
+    op.execute("DROP TABLE IF EXISTS chunk CASCADE")
     op.create_table(
         "chunk",
         sa.Column("id", sa.String(), nullable=False),
@@ -43,6 +45,7 @@ def upgrade() -> None:
         ),
         sa.PrimaryKeyConstraint("id", "document_store_type"),
     )
+    op.execute("DROP TABLE IF EXISTS deletion_attempt CASCADE")
     op.create_table(
         "deletion_attempt",
         sa.Column("id", sa.Integer(), nullable=False),
@@ -84,6 +87,7 @@ def upgrade() -> None:
         ),
         sa.PrimaryKeyConstraint("id"),
     )
+    op.execute("DROP TABLE IF EXISTS document_by_connector_credential_pair CASCADE")
     op.create_table(
         "document_by_connector_credential_pair",
         sa.Column("id", sa.String(), nullable=False),
@@ -106,7 +110,10 @@ def upgrade() -> None:
 
 
 def downgrade() -> None:
+    # Drop the referencing tables first
     op.drop_table("document_by_connector_credential_pair")
     op.drop_table("deletion_attempt")
     op.drop_table("chunk")
-    op.drop_table("document")
+
+    # Alembic op.drop_table() has no "cascade" flag, so issue raw SQL
+    op.execute("DROP TABLE IF EXISTS document CASCADE")
diff --git a/backend/onyx/connectors/google_drive/doc_conversion.py b/backend/onyx/connectors/google_drive/doc_conversion.py
index dba488be33b..9242d7facb1 100644
--- a/backend/onyx/connectors/google_drive/doc_conversion.py
+++ b/backend/onyx/connectors/google_drive/doc_conversion.py
@@ -3,6 +3,8 @@
 from datetime import datetime
 from typing import Any
 from typing import cast
+from urllib.parse import urlparse
+from urllib.parse import urlunparse
 
 from googleapiclient.errors import HttpError  # type: ignore
 from googleapiclient.http import MediaIoBaseDownload  # type: ignore
@@ -77,7 +79,15 @@ class PermissionSyncContext(BaseModel):
 
 
 def onyx_document_id_from_drive_file(file: GoogleDriveFileType) -> str:
-    return file[WEB_VIEW_LINK_KEY]
+    link = file[WEB_VIEW_LINK_KEY]
+    parsed_url = urlparse(link)
+    parsed_url = parsed_url._replace(query="")  # remove query parameters
+    spl_path = parsed_url.path.split("/")
+    if spl_path and (spl_path[-1] in ["edit", "view", "preview"]):
+        spl_path.pop()
+    parsed_url = parsed_url._replace(path="/".join(spl_path))
+    # Reconstruct the canonical URL without query parameters
+    return urlunparse(parsed_url)
 
 
 def is_gdrive_image_mime_type(mime_type: str) -> bool:
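
Reviewer note: with this change the connector derives document IDs the same way the migration normalizes existing ones, so new indexing runs land on the canonical IDs. An illustrative check, assuming `WEB_VIEW_LINK_KEY == "webViewLink"` (the Drive API field name; the file ID is made up):

```python
# Illustrative only; assumes WEB_VIEW_LINK_KEY == "webViewLink".
file = {"webViewLink": "https://drive.google.com/file/d/abc123/view?usp=drivesdk"}
assert onyx_document_id_from_drive_file(file) == (
    "https://drive.google.com/file/d/abc123"
)
```
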
diff --git a/backend/requirements/default.txt b/backend/requirements/default.txt
index 35b4d4b3883..da96a6e2bad 100644
--- a/backend/requirements/default.txt
+++ b/backend/requirements/default.txt
@@ -9,7 +9,7 @@ celery==5.5.1
 chardet==5.2.0
 chonkie==1.0.10
 dask==2023.8.1
-ddtrace==2.6.5
+ddtrace==3.10.0
 discord.py==2.4.0
 distributed==2023.8.1
 fastapi==0.115.12
diff --git a/backend/tests/daily/connectors/google_drive/drive_id_mapping.json b/backend/tests/daily/connectors/google_drive/drive_id_mapping.json
index b433d873ca9..b399faa02d9 100644
--- a/backend/tests/daily/connectors/google_drive/drive_id_mapping.json
+++ b/backend/tests/daily/connectors/google_drive/drive_id_mapping.json
@@ -1,68 +1,68 @@
 {
-  "12": "https://drive.google.com/file/d/1u7nynrG4WuFZeuZs8yyhqJF_lbo-op-m/view?usp=drivesdk",
-  "10": "https://drive.google.com/file/d/1LFcVuXuXIdNJ7hkL0C40eYn_cQtryUVQ/view?usp=drivesdk",
-  "13": "https://drive.google.com/file/d/1muQMyYAJe0_F-HiDFIfFMt-4qsgMlREM/view?usp=drivesdk",
-  "11": "https://drive.google.com/file/d/1oHNtlsdJJtk7dE10NgH83Kn5_f2L-Su1/view?usp=drivesdk",
-  "14": "https://drive.google.com/file/d/1sAw-DrsqpnqLF5A8P59BZwIpt9-LrlaL/view?usp=drivesdk",
-  "18": "https://drive.google.com/file/d/1qqKH3esasdqV6ryEhdoSQezDPlKj11At/view?usp=drivesdk",
-  "17": "https://drive.google.com/file/d/1z08VsrCUTozpc5Quzb7mEDUwNkXU3foT/view?usp=drivesdk",
-  "15": "https://drive.google.com/file/d/1QQ6ZGyYP49IJNeGKNmqZISyVLzTOtK4v/view?usp=drivesdk",
-  "19": "https://drive.google.com/file/d/172as_pb7E15bXUd63mIIBRotk_tT7h56/view?usp=drivesdk",
-  "16": "https://drive.google.com/file/d/1552S6HEjJ81q8JXr46BtixQiVq9xlW_I/view?usp=drivesdk",
-  "5": "https://drive.google.com/file/d/1sv9epxLcNlgM6C-oPDeD_heFw7AIZMgp/view?usp=drivesdk",
-  "7": "https://drive.google.com/file/d/1S_S0LpQW90EUPPPjJX4jfu5p9gOQjiQF/view?usp=drivesdk",
-  "9": "https://drive.google.com/file/d/1wH2dBrWzmiGJ88ySHWu6srb7Jsj7qYbA/view?usp=drivesdk",
-  "8": "https://drive.google.com/file/d/14URUm6RKSZziH1lUtT6gs-xnCTWkXpSn/view?usp=drivesdk",
-  "6": "https://drive.google.com/file/d/1LBKBuTMRSss-kVw8ut3rMk51wSbTM95j/view?usp=drivesdk",
-  "3": "https://drive.google.com/file/d/1nNazkPrkuRXHFOl8gdA68pU2g8cy-h6n/view?usp=drivesdk",
-  "2": "https://drive.google.com/file/d/1miG_QpqXe2QIMApcrlNzaB6fsXW5WMFX/view?usp=drivesdk",
-  "4": "https://drive.google.com/file/d/1o-i8can6ciL1XXzy2pVUPHZEXEjBJi6C/view?usp=drivesdk",
-  "0": "https://drive.google.com/file/d/1d3Y59Sns8I0FIW9CtOAjVVLE2MEe_3nP/view?usp=drivesdk",
-  "1": "https://drive.google.com/file/d/1ipSqxJajs_NkfSKFxgltIMNc0ffdt-NX/view?usp=drivesdk",
-  "68": "https://drive.google.com/file/d/1rCBZsbhQ-ULWGztiKB0JYhFth9EChiSZ/view?usp=drivesdk",
-  "66": "https://drive.google.com/file/d/1WVAlbWcu9-Braa0aG6w3cShrY5dbIYcY/view?usp=drivesdk",
-  "67": "https://drive.google.com/file/d/1p44poOCdNLnVYMxTL9b3h-BXsOQ2RDgM/view?usp=drivesdk",
-  "69": "https://drive.google.com/file/d/1HFYsaqC14aE-EaobQdwkw0FOlAYMYqkV/view?usp=drivesdk",
-  "65": "https://drive.google.com/file/d/1RyE07CpTIDYMO3b-atwjWH6ZHFDjyoCl/view?usp=drivesdk",
-  "32": "https://drive.google.com/file/d/17egJ5W-0bvS2akLBqvxylTIViN0d9nG7/view?usp=drivesdk",
-  "28": "https://drive.google.com/file/d/1HNqSM2XGqgHnyNYT5wp8hyski18HMcfO/view?usp=drivesdk",
-  "37": "https://drive.google.com/file/d/16Tdu3gveWkFL0VBUzYSzKxFO4ffv-8h7/view?usp=drivesdk",
-  "30": "https://drive.google.com/file/d/1uj69jGyYnNOXXqKmLNIp-4KKrVC1qaPy/view?usp=drivesdk",
-  "25": "https://drive.google.com/file/d/1bw6NFlR4ZxOV6reQK1Oqeq_UaYFVpNV6/view?usp=drivesdk",
-  "33": "https://drive.google.com/file/d/1FkmXBkt__lOFXg_uhxLI0QIuxWbIGySL/view?usp=drivesdk",
-  "20": "https://drive.google.com/file/d/1r77uBVOHkuiDQFa9iz9FU8QbfjImOAjF/view?usp=drivesdk",
-  "24": "https://drive.google.com/file/d/1kwLrdhTgCdjNrOcSwRI14K3gXnS48xne/view?usp=drivesdk",
-  "39": "https://drive.google.com/file/d/1V3av9F47t44Nf3jcO12U6OIsjsX-B7L1/view?usp=drivesdk",
-  "29": "https://drive.google.com/file/d/172dCAUNaaoZX0RHqEi7Ev12eV930LtTa/view?usp=drivesdk",
"https://drive.google.com/file/d/17zzfgMSWBVebWGnpSHKd6g1LFN4vn-YP/view?usp=drivesdk", - "38": "https://drive.google.com/file/d/1xOQvIBlBJ2swTGp78WkCZJUQ-d1F8pVu/view?usp=drivesdk", - "23": "https://drive.google.com/file/d/1X89y_CoTWWjh3BWq0ZgeGydCvg3gMZeJ/view?usp=drivesdk", - "34": "https://drive.google.com/file/d/1VNDhcbA_-Ckjp084hKyl9bwP4E3l9K_2/view?usp=drivesdk", - "47": "https://drive.google.com/file/d/1O8E7haA8WcJIma0iKcvebd4_dlC5Zr7S/view?usp=drivesdk", - "52": "https://drive.google.com/file/d/1o-ateliXHj4TyugOxb9zYYXwrkhFl4FX/view?usp=drivesdk", - "27": "https://drive.google.com/file/d/1aZ1CwNVWJt_OtIBVO-9zv1UUqXTDlM1F/view?usp=drivesdk", - "26": "https://drive.google.com/file/d/1qegrc27hYeECs0KexnEuuG0WQm-8Y9oZ/view?usp=drivesdk", - "59": "https://drive.google.com/file/d/1L9oWKHMTjQreGW_k8rNy7kBQ7c0FuXFm/view?usp=drivesdk", - "35": "https://drive.google.com/file/d/1NewjF092B9KKDBs-dpnZ9dzVl2GAs2LW/view?usp=drivesdk", - "49": "https://drive.google.com/file/d/1TsUrBlr2nxJtH122nKQ_GzdMc0DFFERB/view?usp=drivesdk", - "41": "https://drive.google.com/file/d/1gc2Vo3HZF-Bm_WhZ0zyFedWNfVL2BEol/view?usp=drivesdk", - "22": "https://drive.google.com/file/d/1iPfQeganYriuqHO2e5npUPeuX5VIbhG3/view?usp=drivesdk", - "36": "https://drive.google.com/file/d/1KyNoHRTfGMNR15dCRpcVW74l2z-wVm0V/view?usp=drivesdk", - "44": "https://drive.google.com/file/d/1PDuxwmrD20s54FHQIhXn3ucdFmXSX5kS/view?usp=drivesdk", - "21": "https://drive.google.com/file/d/1ZwO5cCfBJgGpZTIpoi8p2js8zuHT_qxe/view?usp=drivesdk", - "53": "https://drive.google.com/file/d/140NZAuAOoiqrNVqWmF4TPNv6njd_guwE/view?usp=drivesdk", - "50": "https://drive.google.com/file/d/1MBmy7nQi7pMwwIPZHJjB_iuQeO07QWsN/view?usp=drivesdk", - "54": "https://drive.google.com/file/d/1TtIJ-ULYWyv0yUvUVdfTPuBNlBt_j1Yd/view?usp=drivesdk", - "57": "https://drive.google.com/file/d/19V5d3NcR029AhGiRibk2nlTmFNCVGBgO/view?usp=drivesdk", - "43": "https://drive.google.com/file/d/1kLChcxIWZS_kHLEHThLcm7ekcgwYP0jF/view?usp=drivesdk", - "42": "https://drive.google.com/file/d/1HKW3C1B5vFYUuXmFieMKYAfq4CwtnEZ_/view?usp=drivesdk", - "48": "https://drive.google.com/file/d/1EJGd47XpWZDXJKWU0CGp84Hm7K47GNVt/view?usp=drivesdk", - "40": "https://drive.google.com/file/d/1Fr4dVKdOvth_O-Td8PTwgNGzZz8ridAl/view?usp=drivesdk", - "58": "https://drive.google.com/file/d/1lUFpiwE7ISzLbowHvCtEUj4sfG4w0Gst/view?usp=drivesdk", - "51": "https://drive.google.com/file/d/1V6fOoKgA8QSTJYWPP5GVHz8WFAQIRLNB/view?usp=drivesdk", - "45": "https://drive.google.com/file/d/1hSrPOwyxFEth4GWWN1e4BjBftmnKa8px/view?usp=drivesdk", - "46": "https://drive.google.com/file/d/1jCynzDt1r0EISpwcrFuk3RlKWHM9u7Mj/view?usp=drivesdk", - "55": "https://drive.google.com/file/d/1Db01f4I_Xn8Bs9piQgZU59ZWAeC2MaQm/view?usp=drivesdk", - "56": "https://drive.google.com/file/d/1NxVfwIxm6FVVR1XnxQNMWWbQEVX66cQm/view?usp=drivesdk", - "61": "https://docs.google.com/document/d/1eAaZJAqjXMZ2VvG_r04EGtn6EGcYycofdNUkDHEA8vY/edit?usp=drivesdk" + "12": "https://drive.google.com/file/d/1u7nynrG4WuFZeuZs8yyhqJF_lbo-op-m", + "10": "https://drive.google.com/file/d/1LFcVuXuXIdNJ7hkL0C40eYn_cQtryUVQ", + "13": "https://drive.google.com/file/d/1muQMyYAJe0_F-HiDFIfFMt-4qsgMlREM", + "11": "https://drive.google.com/file/d/1oHNtlsdJJtk7dE10NgH83Kn5_f2L-Su1", + "14": "https://drive.google.com/file/d/1sAw-DrsqpnqLF5A8P59BZwIpt9-LrlaL", + "18": "https://drive.google.com/file/d/1qqKH3esasdqV6ryEhdoSQezDPlKj11At", + "17": "https://drive.google.com/file/d/1z08VsrCUTozpc5Quzb7mEDUwNkXU3foT", + "15": 
"https://drive.google.com/file/d/1QQ6ZGyYP49IJNeGKNmqZISyVLzTOtK4v", + "19": "https://drive.google.com/file/d/172as_pb7E15bXUd63mIIBRotk_tT7h56", + "16": "https://drive.google.com/file/d/1552S6HEjJ81q8JXr46BtixQiVq9xlW_I", + "5": "https://drive.google.com/file/d/1sv9epxLcNlgM6C-oPDeD_heFw7AIZMgp", + "7": "https://drive.google.com/file/d/1S_S0LpQW90EUPPPjJX4jfu5p9gOQjiQF", + "9": "https://drive.google.com/file/d/1wH2dBrWzmiGJ88ySHWu6srb7Jsj7qYbA", + "8": "https://drive.google.com/file/d/14URUm6RKSZziH1lUtT6gs-xnCTWkXpSn", + "6": "https://drive.google.com/file/d/1LBKBuTMRSss-kVw8ut3rMk51wSbTM95j", + "3": "https://drive.google.com/file/d/1nNazkPrkuRXHFOl8gdA68pU2g8cy-h6n", + "2": "https://drive.google.com/file/d/1miG_QpqXe2QIMApcrlNzaB6fsXW5WMFX", + "4": "https://drive.google.com/file/d/1o-i8can6ciL1XXzy2pVUPHZEXEjBJi6C", + "0": "https://drive.google.com/file/d/1d3Y59Sns8I0FIW9CtOAjVVLE2MEe_3nP", + "1": "https://drive.google.com/file/d/1ipSqxJajs_NkfSKFxgltIMNc0ffdt-NX", + "68": "https://drive.google.com/file/d/1rCBZsbhQ-ULWGztiKB0JYhFth9EChiSZ", + "66": "https://drive.google.com/file/d/1WVAlbWcu9-Braa0aG6w3cShrY5dbIYcY", + "67": "https://drive.google.com/file/d/1p44poOCdNLnVYMxTL9b3h-BXsOQ2RDgM", + "69": "https://drive.google.com/file/d/1HFYsaqC14aE-EaobQdwkw0FOlAYMYqkV", + "65": "https://drive.google.com/file/d/1RyE07CpTIDYMO3b-atwjWH6ZHFDjyoCl", + "32": "https://drive.google.com/file/d/17egJ5W-0bvS2akLBqvxylTIViN0d9nG7", + "28": "https://drive.google.com/file/d/1HNqSM2XGqgHnyNYT5wp8hyski18HMcfO", + "37": "https://drive.google.com/file/d/16Tdu3gveWkFL0VBUzYSzKxFO4ffv-8h7", + "30": "https://drive.google.com/file/d/1uj69jGyYnNOXXqKmLNIp-4KKrVC1qaPy", + "25": "https://drive.google.com/file/d/1bw6NFlR4ZxOV6reQK1Oqeq_UaYFVpNV6", + "33": "https://drive.google.com/file/d/1FkmXBkt__lOFXg_uhxLI0QIuxWbIGySL", + "20": "https://drive.google.com/file/d/1r77uBVOHkuiDQFa9iz9FU8QbfjImOAjF", + "24": "https://drive.google.com/file/d/1kwLrdhTgCdjNrOcSwRI14K3gXnS48xne", + "39": "https://drive.google.com/file/d/1V3av9F47t44Nf3jcO12U6OIsjsX-B7L1", + "29": "https://drive.google.com/file/d/172dCAUNaaoZX0RHqEi7Ev12eV930LtTa", + "31": "https://drive.google.com/file/d/17zzfgMSWBVebWGnpSHKd6g1LFN4vn-YP", + "38": "https://drive.google.com/file/d/1xOQvIBlBJ2swTGp78WkCZJUQ-d1F8pVu", + "23": "https://drive.google.com/file/d/1X89y_CoTWWjh3BWq0ZgeGydCvg3gMZeJ", + "34": "https://drive.google.com/file/d/1VNDhcbA_-Ckjp084hKyl9bwP4E3l9K_2", + "47": "https://drive.google.com/file/d/1O8E7haA8WcJIma0iKcvebd4_dlC5Zr7S", + "52": "https://drive.google.com/file/d/1o-ateliXHj4TyugOxb9zYYXwrkhFl4FX", + "27": "https://drive.google.com/file/d/1aZ1CwNVWJt_OtIBVO-9zv1UUqXTDlM1F", + "26": "https://drive.google.com/file/d/1qegrc27hYeECs0KexnEuuG0WQm-8Y9oZ", + "59": "https://drive.google.com/file/d/1L9oWKHMTjQreGW_k8rNy7kBQ7c0FuXFm", + "35": "https://drive.google.com/file/d/1NewjF092B9KKDBs-dpnZ9dzVl2GAs2LW", + "49": "https://drive.google.com/file/d/1TsUrBlr2nxJtH122nKQ_GzdMc0DFFERB", + "41": "https://drive.google.com/file/d/1gc2Vo3HZF-Bm_WhZ0zyFedWNfVL2BEol", + "22": "https://drive.google.com/file/d/1iPfQeganYriuqHO2e5npUPeuX5VIbhG3", + "36": "https://drive.google.com/file/d/1KyNoHRTfGMNR15dCRpcVW74l2z-wVm0V", + "44": "https://drive.google.com/file/d/1PDuxwmrD20s54FHQIhXn3ucdFmXSX5kS", + "21": "https://drive.google.com/file/d/1ZwO5cCfBJgGpZTIpoi8p2js8zuHT_qxe", + "53": "https://drive.google.com/file/d/140NZAuAOoiqrNVqWmF4TPNv6njd_guwE", + "50": "https://drive.google.com/file/d/1MBmy7nQi7pMwwIPZHJjB_iuQeO07QWsN", + "54": 
"https://drive.google.com/file/d/1TtIJ-ULYWyv0yUvUVdfTPuBNlBt_j1Yd", + "57": "https://drive.google.com/file/d/19V5d3NcR029AhGiRibk2nlTmFNCVGBgO", + "43": "https://drive.google.com/file/d/1kLChcxIWZS_kHLEHThLcm7ekcgwYP0jF", + "42": "https://drive.google.com/file/d/1HKW3C1B5vFYUuXmFieMKYAfq4CwtnEZ_", + "48": "https://drive.google.com/file/d/1EJGd47XpWZDXJKWU0CGp84Hm7K47GNVt", + "40": "https://drive.google.com/file/d/1Fr4dVKdOvth_O-Td8PTwgNGzZz8ridAl", + "58": "https://drive.google.com/file/d/1lUFpiwE7ISzLbowHvCtEUj4sfG4w0Gst", + "51": "https://drive.google.com/file/d/1V6fOoKgA8QSTJYWPP5GVHz8WFAQIRLNB", + "45": "https://drive.google.com/file/d/1hSrPOwyxFEth4GWWN1e4BjBftmnKa8px", + "46": "https://drive.google.com/file/d/1jCynzDt1r0EISpwcrFuk3RlKWHM9u7Mj", + "55": "https://drive.google.com/file/d/1Db01f4I_Xn8Bs9piQgZU59ZWAeC2MaQm", + "56": "https://drive.google.com/file/d/1NxVfwIxm6FVVR1XnxQNMWWbQEVX66cQm", + "61": "https://docs.google.com/document/d/1eAaZJAqjXMZ2VvG_r04EGtn6EGcYycofdNUkDHEA8vY" } \ No newline at end of file diff --git a/backend/tests/daily/connectors/google_drive/test_service_acct.py b/backend/tests/daily/connectors/google_drive/test_service_acct.py index f2e6961361d..22c3ba3050f 100644 --- a/backend/tests/daily/connectors/google_drive/test_service_acct.py +++ b/backend/tests/daily/connectors/google_drive/test_service_acct.py @@ -394,7 +394,7 @@ def test_shared_with_me( expected_file_ids=expected_file_ids, ) - retrieved_ids = {urlparse(doc.id).path.split("/")[-2] for doc in retrieved_docs} + retrieved_ids = {urlparse(doc.id).path.split("/")[-1] for doc in retrieved_docs} for id in retrieved_ids: print(id) diff --git a/web/src/app/admin/connector/[ccPairId]/IndexAttemptErrorsModal.tsx b/web/src/app/admin/connector/[ccPairId]/IndexAttemptErrorsModal.tsx index c80d3d05341..ed79a6bcb48 100644 --- a/web/src/app/admin/connector/[ccPairId]/IndexAttemptErrorsModal.tsx +++ b/web/src/app/admin/connector/[ccPairId]/IndexAttemptErrorsModal.tsx @@ -165,11 +165,11 @@ export default function IndexAttemptErrorsModal({ {paginationData.currentPageItems.length > 0 ? ( paginationData.currentPageItems.map((error) => ( - - + + {localizeAndPrettify(error.time_created)} - + {error.document_link ? ( - - {error.failure_message} + +
               {paginationData.currentPageItems.length > 0 ? (
                 paginationData.currentPageItems.map((error) => (
[The JSX body of this hunk was mangled during extraction and cannot be
reconstructed verbatim. The surviving tokens indicate a presentation-only
change: the row and cell markup around
{localizeAndPrettify(error.time_created)} and {error.document_link ? ...}
receives updated styling, and {error.failure_message} is wrapped in a new
<div> inside its table cell. No logic appears to change.]