Skip to content

Commit c6434db

Browse files
authored
Add delete all for tenants in Vespa (#3970)
1 parent 667b9e0 commit c6434db

File tree

1 file changed

+170
-8
lines changed

1 file changed

+170
-8
lines changed

backend/scripts/debugging/onyx_vespa.py

Lines changed: 170 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -256,16 +256,28 @@ def get_documents_for_tenant_connector(
256256

257257

258258
def search_for_document(
259-
index_name: str, document_id: str, max_hits: int | None = 10
259+
index_name: str,
260+
document_id: str | None = None,
261+
tenant_id: str | None = None,
262+
max_hits: int | None = 10,
260263
) -> List[Dict[str, Any]]:
261-
yql_query = (
262-
f'select * from sources {index_name} where document_id contains "{document_id}"'
263-
)
264+
yql_query = f"select * from sources {index_name}"
265+
266+
conditions = []
267+
if document_id is not None:
268+
conditions.append(f'document_id contains "{document_id}"')
269+
270+
if tenant_id is not None:
271+
conditions.append(f'tenant_id contains "{tenant_id}"')
272+
273+
if conditions:
274+
yql_query += " where " + " and ".join(conditions)
275+
264276
params: dict[str, Any] = {"yql": yql_query}
265277
if max_hits is not None:
266278
params["hits"] = max_hits
267279
with get_vespa_http_client() as client:
268-
response = client.get(f"{SEARCH_ENDPOINT}/search/", params=params)
280+
response = client.get(f"{SEARCH_ENDPOINT}search/", params=params)
269281
response.raise_for_status()
270282
result = response.json()
271283
documents = result.get("root", {}).get("children", [])
@@ -582,8 +594,15 @@ def update_document(
582594
) -> None:
583595
update_document(self.tenant_id, connector_id, doc_id, fields)
584596

585-
def search_for_document(self, document_id: str) -> List[Dict[str, Any]]:
586-
return search_for_document(self.index_name, document_id)
597+
def delete_documents_for_tenant(self, count: int | None = None) -> None:
    """
    Delete this instance's tenant documents from this instance's index.

    Delegates to the module-level ``delete_documents_for_tenant`` using
    ``self.index_name`` and ``self.tenant_id``.

    :param count: Optional cap forwarded as ``count``; ``None`` means no cap
    :raises Exception: If ``self.tenant_id`` is empty or unset
    """
    tenant = self.tenant_id
    if tenant:
        # Name resolves to the module-level function, not this method.
        delete_documents_for_tenant(self.index_name, tenant, count=count)
        return
    raise Exception("Tenant ID is not set")
601+
602+
def search_for_document(
    self, document_id: str | None = None, tenant_id: str | None = None
) -> List[Dict[str, Any]]:
    """
    Search this instance's index, optionally filtering by document and tenant.

    Delegates to the module-level ``search_for_document`` with
    ``self.index_name``; the hit limit is left at that function's default.

    :param document_id: Optional document id to filter on
    :param tenant_id: Optional tenant id to filter on
    :return: Whatever the module-level ``search_for_document`` returns
    """
    hits = search_for_document(
        index_name=self.index_name,
        document_id=document_id,
        tenant_id=tenant_id,
    )
    return hits
587606

588607
def delete_document(self, connector_id: int, doc_id: str) -> None:
589608
# Delete a document.
@@ -600,6 +619,147 @@ def acls(self, cc_pair_id: int, n: int | None = 10) -> None:
600619
get_document_acls(self.tenant_id, cc_pair_id, n)
601620

602621

622+
def delete_where(
    index_name: str,
    selection: str,
    cluster: str = "default",
    bucket_space: str | None = None,
    continuation: str | None = None,
    time_chunk: str | None = None,
    timeout: str | None = None,
    tracelevel: int | None = None,
) -> None:
    """
    Removes visited documents in `cluster` where the given selection
    is true, using Vespa's 'delete where' endpoint.

    The request is repeated for as long as the JSON response carries a
    `continuation` token, so that a selection too large for a single
    request is still deleted completely.

    :param index_name: Path segment placed after /document/v1/, i.e.
        <namespace>/<document-type> from your schema
    :param selection: The selection string, e.g., "true" or "foo contains 'bar'"
    :param cluster: The name of the cluster where documents reside
    :param bucket_space: e.g. 'global' or 'default'
    :param continuation: Token to resume a previously unfinished visit from
    :param time_chunk: If you want to chunk the visit by time
    :param timeout: e.g. '10s'
    :param tracelevel: Increase for verbose logs
    :raises Exception: The HTTP client's status error (via raise_for_status)
        if any request returns a non-2xx response
    """
    # This route ends with "/docid/" since the actual ID is not specified;
    # the documents to remove are chosen by "selection" instead.
    url = f"{VESPA_APPLICATION_ENDPOINT}/document/v1/{index_name}/docid/"

    params: dict[str, str] = {
        "cluster": cluster,
        "selection": selection,
    }

    # Optional parameters are only sent when explicitly provided.
    optional_params = {
        "bucketSpace": bucket_space,
        "timeChunk": time_chunk,
        "timeout": timeout,
        "tracelevel": tracelevel,
    }
    params.update(
        {key: str(value) for key, value in optional_params.items() if value is not None}
    )

    with get_vespa_http_client() as client:
        logger.info(f"Performing 'delete where' on {url} with selection={selection}...")

        next_token = continuation
        while True:
            if next_token is not None:
                params["continuation"] = next_token

            response = client.delete(url, params=params)
            response.raise_for_status()  # will raise HTTPError if not 2xx

            # A non-JSON or non-object body is treated as "nothing left to do".
            try:
                payload = response.json()
            except ValueError:
                payload = None
            next_token = (
                payload.get("continuation") if isinstance(payload, dict) else None
            )
            if not next_token:
                break

        logger.info(f"Delete where completed with status: {response.status_code}")
        print(f"Delete where completed with status: {response.status_code}")
675+
676+
677+
def delete_documents_for_tenant(
    index_name: str,
    tenant_id: str,
    route: str | None = None,
    condition: str | None = None,
    timeout: str | None = None,
    tracelevel: int | None = None,
    count: int | None = None,
) -> None:
    """
    For the given tenant_id and index_name, find documents via
    search_for_document, then delete them one at a time through the URL built
    from DOCUMENT_ID_ENDPOINT and the document's id.

    Documents are fetched in batches of 100 and the search is repeated until it
    returns nothing, the `count` limit is reached, or a whole batch yields no
    deletable document.

    :param index_name: Index to search in and delete from
    :param tenant_id: The tenant to match in your Vespa search
    :param route: Optional route parameter for delete
    :param condition: Optional conditional remove
    :param timeout: e.g. '10s'
    :param tracelevel: Increase for verbose logs
    :param count: Optional maximum number of documents to delete; None means
        no limit
    :raises Exception: If a returned document belongs to a different tenant, or
        if a delete request does not return HTTP 200
    """
    # The optional delete parameters are the same for every request, so build
    # them once instead of once per document.
    params: dict[str, str] = {}
    if condition:
        params["condition"] = condition
    if route:
        params["route"] = route
    if timeout:
        params["timeout"] = timeout
    if tracelevel is not None:
        params["tracelevel"] = str(tracelevel)

    deleted_count = 0
    while True:
        # Search for documents with the given tenant_id
        docs = search_for_document(
            index_name=index_name,
            document_id=None,
            tenant_id=tenant_id,
            max_hits=100,  # Fetch in batches of 100
        )

        if not docs:
            logger.info("No more documents found to delete.")
            break

        deleted_in_batch = 0
        with get_vespa_http_client() as client:
            for doc in docs:
                if count is not None and deleted_count >= count:
                    logger.info(f"Reached maximum delete limit of {count} documents.")
                    return

                fields = doc.get("fields", {})
                doc_id_value = fields.get("document_id") or fields.get("documentid")

                # Keep the hit's tenant in its own variable: rebinding
                # `tenant_id` would make this check a no-op and change the
                # filter used by the next search.
                doc_tenant_id = fields.get("tenant_id")
                if doc_tenant_id != tenant_id:
                    raise Exception("Tenant ID mismatch")

                if not doc_id_value:
                    logger.warning(
                        "Skipping a document that has no document_id in 'fields'."
                    )
                    continue

                url = f"{DOCUMENT_ID_ENDPOINT.format(index_name=index_name)}/{doc_id_value}"

                response = client.delete(url, params=params)
                if response.status_code == 200:
                    logger.info(f"Successfully deleted doc_id={doc_id_value}")
                    deleted_count += 1
                    deleted_in_batch += 1
                else:
                    logger.error(
                        f"Failed to delete doc_id={doc_id_value}, "
                        f"status={response.status_code}, response={response.text}"
                    )
                    print(
                        f"Could not delete doc_id={doc_id_value}. "
                        f"Status={response.status_code}, response={response.text}"
                    )
                    raise Exception(
                        f"Could not delete doc_id={doc_id_value}. "
                        f"Status={response.status_code}, response={response.text}"
                    )

        if deleted_in_batch == 0:
            # Every hit in this batch was skipped, so the same search would
            # return the same hits again; stop instead of looping forever.
            logger.warning(
                "No deletable documents in the last batch; stopping to avoid "
                "an endless loop."
            )
            break

    logger.info(f"Deleted {deleted_count} documents in total.")
761+
762+
603763
def main() -> None:
604764
parser = argparse.ArgumentParser(description="Vespa debugging tool")
605765
parser.add_argument(
@@ -630,7 +790,9 @@ def main() -> None:
630790
args = parser.parse_args()
631791
vespa_debug = VespaDebugging(args.tenant_id)
632792

633-
if args.action == "config":
793+
if args.action == "delete-all-documents":
794+
vespa_debug.delete_documents_for_tenant(args.count)
795+
elif args.action == "config":
634796
vespa_debug.print_config()
635797
elif args.action == "connect":
636798
vespa_debug.check_connectivity()

0 commit comments

Comments
 (0)