Skip to content

Commit 4400a94

Browse files
author
Richard Kuo (Danswer)
committed
optimize another index attempt check
1 parent cab7e60 commit 4400a94

File tree

4 files changed

+50
-12
lines changed

4 files changed

+50
-12
lines changed
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
"""add indexes to index_attempt.status and index_attempt.time_created
2+
3+
Revision ID: 0f7ff6d75b57
4+
Revises: 369644546676
5+
Create Date: 2025-01-10 14:01:14.067144
6+
7+
"""
8+
from alembic import op
9+
10+
# revision identifiers, used by Alembic.
11+
revision = "0f7ff6d75b57"
12+
down_revision = "369644546676"
13+
branch_labels: None = None
14+
depends_on: None = None
15+
16+
17+
def upgrade() -> None:
    """Create non-unique indexes on index_attempt.status and index_attempt.time_created."""
    # (index name, indexed column) pairs, created in this order.
    indexed_columns = (
        ("ix_index_attempt_status", "status"),
        ("ix_index_attempt_time_created", "time_created"),
    )
    for index_name, column in indexed_columns:
        op.create_index(
            op.f(index_name),
            "index_attempt",
            [column],
            unique=False,
        )
31+
32+
33+
def downgrade() -> None:
    """Drop the time_created and status indexes from index_attempt, in that order."""
    for index_name in (
        "ix_index_attempt_time_created",
        "ix_index_attempt_status",
    ):
        op.drop_index(op.f(index_name), table_name="index_attempt")

backend/onyx/background/celery/tasks/indexing/tasks.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -417,6 +417,15 @@ def check_for_indexing(self: Task, *, tenant_id: str | None) -> int | None:
417417
unfenced_attempt_ids = get_unfenced_index_attempt_ids(
418418
db_session, redis_client
419419
)
420+
421+
if tenant_id in debug_tenants:
422+
ttl = redis_client.ttl(OnyxRedisLocks.CHECK_INDEXING_BEAT_LOCK)
423+
task_logger.info(
424+
f"check_for_indexing after get unfenced lock: "
425+
f"tenant={tenant_id} "
426+
f"ttl={ttl}"
427+
)
428+
420429
for attempt_id in unfenced_attempt_ids:
421430
# debugging logic - remove after we're done
422431
if tenant_id in debug_tenants:

backend/onyx/db/index_attempt.py

Lines changed: 3 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@
99
from sqlalchemy import func
1010
from sqlalchemy import select
1111
from sqlalchemy import update
12-
from sqlalchemy.orm import joinedload
1312
from sqlalchemy.orm import Session
1413

1514
from onyx.connectors.models import Document
@@ -118,21 +117,14 @@ def get_in_progress_index_attempts(
118117
def get_all_index_attempts_by_status(
    status: IndexingStatus, db_session: Session
) -> list[IndexAttempt]:
    """Return every index attempt whose status equals ``status``.

    Prefer calling this with non-terminal statuses only: the set of attempts
    in a terminal status may be very large.

    The returned list is ordered by time_created, oldest first."""
    query = (
        select(IndexAttempt)
        .where(IndexAttempt.status == status)
        .order_by(IndexAttempt.time_created)
    )
    return list(db_session.scalars(query).all())
138130

backend/onyx/db/models.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -763,7 +763,7 @@ class IndexAttempt(Base):
763763
# the run once API
764764
from_beginning: Mapped[bool] = mapped_column(Boolean)
765765
# NOTE: ``index`` is a column-level option, so it is passed to mapped_column()
# rather than to the Enum type constructor, where it would not declare an
# index on this column.
status: Mapped[IndexingStatus] = mapped_column(
    Enum(IndexingStatus, native_enum=False),
    index=True,
)
768768
# The two below may be slightly out of sync if user switches Embedding Model
769769
new_docs_indexed: Mapped[int | None] = mapped_column(Integer, default=0)
@@ -782,6 +782,7 @@ class IndexAttempt(Base):
782782
time_created: Mapped[datetime.datetime] = mapped_column(
783783
DateTime(timezone=True),
784784
server_default=func.now(),
785+
index=True,
785786
)
786787
# when the actual indexing run began
787788
# NOTE: will use the api_server clock rather than DB server clock

0 commit comments

Comments
 (0)