|
13 | 13 | import httpx
|
14 | 14 | import requests
|
15 | 15 |
|
| 16 | +from danswer.configs.app_configs import DOCUMENT_INDEX_NAME |
16 | 17 | from danswer.configs.chat_configs import DOC_TIME_DECAY
|
17 | 18 | from danswer.configs.chat_configs import NUM_RETURNED_HITS
|
18 | 19 | from danswer.configs.chat_configs import TITLE_CONTENT_RATIO
|
@@ -479,6 +480,66 @@ def delete(self, doc_ids: list[str]) -> None:
|
479 | 480 | document_ids=doc_ids, index_name=index_name, http_client=http_client
|
480 | 481 | )
|
481 | 482 |
|
def delete_single(self, doc_id: str) -> None:
    """Delete every chunk of one document using selection-based deletes.

    One DELETE request carrying a ``selection`` expression is issued per
    index (the primary index, plus the secondary index when one is
    configured) instead of one request per chunk. While the response
    carries a non-empty ``continuation`` token, the request is repeated
    with that token so the whole selection gets processed.

    Args:
        doc_id: ID of the document to remove. It is passed through
            ``replace_invalid_doc_id_characters`` before being embedded
            in the selection expression.

    Raises:
        httpx.HTTPStatusError: If any delete request returns an error
            status; the response body is logged before re-raising.
    """
    # The request shape follows the Vespa batch-delete example:
    # https://docs.vespa.ai/en/operations/batch-delete.html#example
    sanitized_id = replace_invalid_doc_id_characters(doc_id)

    target_indices = [self.index_name]
    if self.secondary_index_name:
        target_indices.append(self.secondary_index_name)

    # NOTE: `httpx` is used because `requests` doesn't support HTTP2, which
    # helps given the large volume of indexing / update / delete requests.
    with httpx.Client(http2=True) as client:
        for target_index in target_indices:
            endpoint = DOCUMENT_ID_ENDPOINT.format(index_name=target_index)
            query = httpx.QueryParams(
                {
                    "selection": f"{target_index}.document_id=='{sanitized_id}'",
                    "cluster": DOCUMENT_INDEX_NAME,
                }
            )

            deleted_so_far = 0
            while True:
                try:
                    response = client.delete(endpoint, params=query)
                    response.raise_for_status()
                except httpx.HTTPStatusError as e:
                    logger.error(
                        f"Failed to delete chunk, details: {e.response.text}"
                    )
                    raise

                payload = response.json()

                # The count is only added when the response reports one.
                if "documentCount" in payload:
                    deleted_so_far += payload["documentCount"]

                # A missing or empty continuation token ends the pagination.
                if "continuation" not in payload or not payload["continuation"]:
                    break

                query = query.set("continuation", payload["continuation"])

            logger.debug(
                f"VespaIndex.delete_single: "
                f"index={target_index} "
                f"doc={sanitized_id} "
                f"chunks_deleted={deleted_so_far}"
            )
| 542 | + |
482 | 543 | def id_based_retrieval(
|
483 | 544 | self,
|
484 | 545 | chunk_requests: list[VespaChunkRequest],
|
|
0 commit comments