Skip to content

Commit 3fdd233

Browse files
authored
delete directly via selection instead of making multiple calls to get chunk ids and delete each one (#2666)
1 parent 0c54d9d commit 3fdd233

File tree

3 files changed

+72
-1
lines changed

3 files changed

+72
-1
lines changed

backend/danswer/background/connector_deletion.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -148,7 +148,7 @@ def document_by_cc_pair_cleanup_task(
148148
if count == 1:
149149
# count == 1 means this is the only remaining cc_pair reference to the doc
150150
# delete it from vespa and the db
151-
document_index.delete(doc_ids=[document_id])
151+
document_index.delete_single(doc_id=document_id)
152152
delete_documents_complete__no_commit(
153153
db_session=db_session,
154154
document_ids=[document_id],

backend/danswer/document_index/interfaces.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,16 @@ class Deletable(abc.ABC):
156156
Class must implement the ability to delete document by their unique document ids.
157157
"""
158158

159+
@abc.abstractmethod
def delete_single(self, doc_id: str) -> None:
    """
    Given a single document id, hard delete it from the document index

    Parameters:
    - doc_id: document id as specified by the connector

    Raises:
    - NotImplementedError: always, in this abstract declaration; concrete
      document index classes are expected to override it
    """
    raise NotImplementedError
168+
159169
@abc.abstractmethod
160170
def delete(self, doc_ids: list[str]) -> None:
161171
"""

backend/danswer/document_index/vespa/index.py

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
import httpx
1414
import requests
1515

16+
from danswer.configs.app_configs import DOCUMENT_INDEX_NAME
1617
from danswer.configs.chat_configs import DOC_TIME_DECAY
1718
from danswer.configs.chat_configs import NUM_RETURNED_HITS
1819
from danswer.configs.chat_configs import TITLE_CONTENT_RATIO
@@ -479,6 +480,66 @@ def delete(self, doc_ids: list[str]) -> None:
479480
document_ids=doc_ids, index_name=index_name, http_client=http_client
480481
)
481482

483+
def delete_single(self, doc_id: str) -> None:
    """Hard delete one document from Vespa with a selection-based delete.

    Possibly faster overall than the delete method due to using a single
    delete call with a selection query. The delete is issued against
    `self.index_name` and, when `self.secondary_index_name` is set, against
    that index as well.

    Parameters:
    - doc_id: document id to delete; it is passed through
      `replace_invalid_doc_id_characters` before being embedded in the
      selection expression

    Raises:
    - httpx.HTTPStatusError: if a delete request returns an error status;
      the response body is logged before the exception is re-raised
    """

    # Vespa deletion is poorly documented ... luckily we found this
    # https://docs.vespa.ai/en/operations/batch-delete.html#example

    doc_id = replace_invalid_doc_id_characters(doc_id)

    index_names = [self.index_name]
    if self.secondary_index_name:
        index_names.append(self.secondary_index_name)

    # NOTE: using `httpx` here since `requests` doesn't support HTTP2. This is beneficial for
    # indexing / updates / deletes since we have to make a large volume of requests.
    with httpx.Client(http2=True) as http_client:
        for index_name in index_names:
            # The endpoint does not change between pages, so build it once per index.
            endpoint = DOCUMENT_ID_ENDPOINT.format(index_name=index_name)
            params = httpx.QueryParams(
                {
                    "selection": f"{index_name}.document_id=='{doc_id}'",
                    "cluster": DOCUMENT_INDEX_NAME,
                }
            )

            total_chunks_deleted = 0
            while True:
                try:
                    resp = http_client.delete(endpoint, params=params)
                    resp.raise_for_status()
                except httpx.HTTPStatusError as e:
                    logger.error(
                        f"Failed to delete chunk, details: {e.response.text}"
                    )
                    raise

                resp_data = resp.json()

                # A response without "documentCount" contributes nothing to the tally.
                total_chunks_deleted += resp_data.get("documentCount", 0)

                # A missing or empty continuation token means there is nothing
                # further to request for this index.
                continuation = resp_data.get("continuation")
                if not continuation:
                    break

                # QueryParams.set returns a new object, so rebind for the next request.
                params = params.set("continuation", continuation)

            logger.debug(
                f"VespaIndex.delete_single: "
                f"index={index_name} "
                f"doc={doc_id} "
                f"chunks_deleted={total_chunks_deleted}"
            )
542+
482543
def id_based_retrieval(
483544
self,
484545
chunk_requests: list[VespaChunkRequest],

0 commit comments

Comments
 (0)