@@ -256,16 +256,28 @@ def get_documents_for_tenant_connector(
256256
257257
258258def search_for_document (
259- index_name : str , document_id : str , max_hits : int | None = 10
259+ index_name : str ,
260+ document_id : str | None = None ,
261+ tenant_id : str | None = None ,
262+ max_hits : int | None = 10 ,
260263) -> List [Dict [str , Any ]]:
261- yql_query = (
262- f'select * from sources { index_name } where document_id contains "{ document_id } "'
263- )
264+ yql_query = f"select * from sources { index_name } "
265+
266+ conditions = []
267+ if document_id is not None :
268+ conditions .append (f'document_id contains "{ document_id } "' )
269+
270+ if tenant_id is not None :
271+ conditions .append (f'tenant_id contains "{ tenant_id } "' )
272+
273+ if conditions :
274+ yql_query += " where " + " and " .join (conditions )
275+
264276 params : dict [str , Any ] = {"yql" : yql_query }
265277 if max_hits is not None :
266278 params ["hits" ] = max_hits
267279 with get_vespa_http_client () as client :
268- response = client .get (f"{ SEARCH_ENDPOINT } / search/" , params = params )
280+ response = client .get (f"{ SEARCH_ENDPOINT } search/" , params = params )
269281 response .raise_for_status ()
270282 result = response .json ()
271283 documents = result .get ("root" , {}).get ("children" , [])
@@ -582,8 +594,15 @@ def update_document(
582594 ) -> None :
583595 update_document (self .tenant_id , connector_id , doc_id , fields )
584596
585- def search_for_document (self , document_id : str ) -> List [Dict [str , Any ]]:
586- return search_for_document (self .index_name , document_id )
def delete_documents_for_tenant(self, count: int | None = None) -> None:
    """Delete documents belonging to this instance's tenant.

    Forwards ``self.index_name``, ``self.tenant_id`` and ``count`` to the
    module-level ``delete_documents_for_tenant`` helper.

    :param count: Optional limit forwarded unchanged to the helper
    :raises Exception: if ``self.tenant_id`` is falsy
    """
    if self.tenant_id:
        delete_documents_for_tenant(self.index_name, self.tenant_id, count=count)
        return

    raise Exception("Tenant ID is not set")
601+
def search_for_document(
    self, document_id: str | None = None, tenant_id: str | None = None
) -> List[Dict[str, Any]]:
    """Search this instance's index, optionally filtered by document and/or tenant.

    Thin wrapper around the module-level ``search_for_document``; both filters
    are passed through unchanged, and ``None`` means "do not filter on this".

    :param document_id: Optional document id filter
    :param tenant_id: Optional tenant id filter
    :return: Whatever the module-level helper returns for this index
    """
    matches = search_for_document(self.index_name, document_id, tenant_id)
    return matches
587606
588607 def delete_document (self , connector_id : int , doc_id : str ) -> None :
589608 # Delete a document.
@@ -600,6 +619,147 @@ def acls(self, cc_pair_id: int, n: int | None = 10) -> None:
600619 get_document_acls (self .tenant_id , cc_pair_id , n )
601620
602621
def delete_where(
    index_name: str,
    selection: str,
    cluster: str = "default",
    bucket_space: str | None = None,
    continuation: str | None = None,
    time_chunk: str | None = None,
    timeout: str | None = None,
    tracelevel: int | None = None,
) -> None:
    """
    Removes visited documents in `cluster` where the given selection
    is true, using Vespa's 'delete where' endpoint.

    The request is repeated for as long as the response body carries a
    `continuation` token, so the whole matching set is removed rather than
    only the first chunk the server managed to visit.

    :param index_name: Typically <namespace>/<document-type> from your schema
    :param selection: The selection string, e.g., "true" or "foo contains 'bar'"
    :param cluster: The name of the cluster where documents reside
    :param bucket_space: e.g. 'global' or 'default'
    :param continuation: Token to resume a previous, partially completed visit
    :param time_chunk: If you want to chunk the visit by time
    :param timeout: e.g. '10s'
    :param tracelevel: Increase for verbose logs
    :raises: whatever `response.raise_for_status()` raises on a non-2xx reply
    """
    # This route ends with "/docid/" since the actual ID is not specified — we rely on "selection".
    path = f"/document/v1/{index_name}/docid/"

    params: dict[str, str] = {
        "cluster": cluster,
        "selection": selection,
    }

    # Optional parameters: only sent when the caller supplied them.
    if bucket_space is not None:
        params["bucketSpace"] = bucket_space
    if continuation is not None:
        params["continuation"] = continuation
    if time_chunk is not None:
        params["timeChunk"] = time_chunk
    if timeout is not None:
        params["timeout"] = timeout
    if tracelevel is not None:
        # Stringified so the dict stays str -> str (same as the per-document delete).
        params["tracelevel"] = str(tracelevel)

    with get_vespa_http_client() as client:
        url = f"{VESPA_APPLICATION_ENDPOINT}{path}"
        logger.info(f"Performing 'delete where' on {url} with selection={selection}...")

        while True:
            response = client.delete(url, params=params)
            response.raise_for_status()  # will raise HTTPError if not 2xx

            # A continuation token in the body means the visit stopped before
            # covering everything; resend the request with it to keep going.
            try:
                next_token = response.json().get("continuation")
            except (ValueError, AttributeError):
                # Body was not a JSON object: nothing to continue from.
                next_token = None

            if not next_token:
                break
            params["continuation"] = next_token

        logger.info(f"Delete where completed with status: {response.status_code}")
        print(f"Delete where completed with status: {response.status_code}")
675+
676+
def delete_documents_for_tenant(
    index_name: str,
    tenant_id: str,
    route: str | None = None,
    condition: str | None = None,
    timeout: str | None = None,
    tracelevel: int | None = None,
    count: int | None = None,
) -> None:
    """
    For the given tenant_id and index_name (often in the form <namespace>/<document-type>),
    find documents via search_for_document, then delete them one at a time using Vespa's
    /document/v1/<namespace>/<document-type>/docid/<document-id> endpoint.

    :param index_name: Typically <namespace>/<document-type> from your schema
    :param tenant_id: The tenant to match in your Vespa search
    :param route: Optional route parameter for delete
    :param condition: Optional conditional remove
    :param timeout: e.g. '10s'
    :param tracelevel: Increase for verbose logs
    :param count: Optional upper bound on the number of documents to delete;
        None means keep going until the search returns nothing
    :raises Exception: if a returned document belongs to a different tenant,
        or if a delete request does not return HTTP 200
    """
    # The query parameters are identical for every delete, so build them once.
    params: dict[str, str] = {}
    if condition:
        params["condition"] = condition
    if route:
        params["route"] = route
    if timeout:
        params["timeout"] = timeout
    if tracelevel is not None:
        params["tracelevel"] = str(tracelevel)

    base_url = DOCUMENT_ID_ENDPOINT.format(index_name=index_name)

    deleted_count = 0
    while True:
        # Search for documents with the given tenant_id
        docs = search_for_document(
            index_name=index_name,
            document_id=None,
            tenant_id=tenant_id,
            max_hits=100,  # Fetch in batches of 100
        )

        if not docs:
            logger.info("No more documents found to delete.")
            break

        deleted_in_batch = 0
        with get_vespa_http_client() as client:
            for doc in docs:
                if count is not None and deleted_count >= count:
                    logger.info(f"Reached maximum delete limit of {count} documents.")
                    return

                fields = doc.get("fields", {})
                doc_id_value = fields.get("document_id") or fields.get("documentid")

                # Keep the hit's tenant in its own variable: overwriting the
                # `tenant_id` parameter would make this check a no-op and would
                # change the filter used by the next search.
                doc_tenant_id = fields.get("tenant_id")
                if doc_tenant_id != tenant_id:
                    raise Exception("Tenant ID mismatch")

                if not doc_id_value:
                    logger.warning(
                        "Skipping a document that has no document_id in 'fields'."
                    )
                    continue

                url = f"{base_url}/{doc_id_value}"

                response = client.delete(url, params=params)
                if response.status_code == 200:
                    logger.info(f"Successfully deleted doc_id={doc_id_value}")
                    deleted_count += 1
                    deleted_in_batch += 1
                else:
                    logger.error(
                        f"Failed to delete doc_id={doc_id_value}, "
                        f"status={response.status_code}, response={response.text}"
                    )
                    print(
                        f"Could not delete doc_id={doc_id_value}. "
                        f"Status={response.status_code}, response={response.text}"
                    )
                    raise Exception(
                        f"Could not delete doc_id={doc_id_value}. "
                        f"Status={response.status_code}, response={response.text}"
                    )

        if deleted_in_batch == 0:
            # Every hit in this batch was skipped, so the same batch would be
            # returned again; stop instead of looping forever.
            logger.warning(
                "No documents could be deleted from the last batch; stopping."
            )
            break

    logger.info(f"Deleted {deleted_count} documents in total.")
761+
762+
603763def main () -> None :
604764 parser = argparse .ArgumentParser (description = "Vespa debugging tool" )
605765 parser .add_argument (
@@ -630,7 +790,9 @@ def main() -> None:
630790 args = parser .parse_args ()
631791 vespa_debug = VespaDebugging (args .tenant_id )
632792
633- if args .action == "config" :
793+ if args .action == "delete-all-documents" :
794+ vespa_debug .delete_documents_for_tenant (args .count )
795+ elif args .action == "config" :
634796 vespa_debug .print_config ()
635797 elif args .action == "connect" :
636798 vespa_debug .check_connectivity ()
0 commit comments