Commit f233e5f

Fix asynchronous deletion of oversized corpus.
This approach should take about 150 MB of RAM and reach about 500 execs/sec. It also increases the number of files we can delete. The previous approach was broken because the batch API was being invoked incorrectly.
1 parent ffdf67e commit f233e5f
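
The memory bound described above comes from capping the number of in-flight delete tasks instead of creating one task per blob and gathering them all at the end. A minimal sketch of that throttling pattern, assuming a hypothetical async delete_one(name) helper (not a ClusterFuzz API):

import asyncio

MAX_IN_FLIGHT = 1000  # Same arbitrary cap the new code uses.


async def delete_all(blob_names, delete_one):
  """Schedules deletions while keeping at most MAX_IN_FLIGHT tasks alive."""
  tasks = []
  for name in blob_names:
    tasks.append(asyncio.create_task(delete_one(name)))
    if len(tasks) >= MAX_IN_FLIGHT:
      # As soon as *any* task finishes, there is room to schedule more.
      _, pending = await asyncio.wait(
          tasks, return_when=asyncio.FIRST_COMPLETED)
      tasks = list(pending)
  await asyncio.gather(*tasks)  # Drain whatever is still running.


async def _noop_delete(name):
  await asyncio.sleep(0)  # Stub deleter for the example driver below.


asyncio.run(delete_all((f'blob-{i}' for i in range(5000)), _noop_delete))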

3 files changed: +91 −81 lines changed

src/clusterfuzz/_internal/bot/tasks/utasks/corpus_pruning_task.py

+38 −45

@@ -19,6 +19,7 @@
 import os
 import random
 import shutil
+import time
 from typing import Dict
 from typing import List
 import zipfile
@@ -76,7 +77,7 @@
 
 # Corpus files limit for cases when corpus pruning task failed in the last
 # execution.
-CORPUS_FILES_LIMIT_FOR_FAILURES = 50000
+CORPUS_FILES_LIMIT_FOR_FAILURES = 50_000
 
 # Corpus total size limit for cases when corpus pruning task failed in the last
 # execution.
@@ -129,36 +130,36 @@ def _get_corpus_file_paths(corpus_path):
 async def _limit_corpus_sizes(corpus_urls):
   try:
     await asyncio.gather(
-        *[_limit_corpus_size(corpus_url) for corpus_url in corpus_urls])
+        *[_limit_corpus_size_async(corpus_url) for corpus_url in corpus_urls])
   except Exception as e:
     # Catch any unexpected exceptions
     logs.error(f"Error in _limit_corpus_size: {e}")
 
 
-async def _limit_corpus_size(corpus_url):
-  """Limits corpus size asynchronously."""
-  logs.info('Limiting corpus size')
+async def _limit_corpus_size_async(corpus_url):
+  """Limits corpus size using async listing and deleting blobs one by one."""
   creds, _ = credentials.get_default()
   creds.refresh(google_auth_requests.Request())
-  bucket = storage.get_bucket_name_and_path(corpus_url)[0]
+  bucket, path = storage.get_bucket_name_and_path(corpus_url)
+  logs.info(f'Limiting corpus size {corpus_url}')
 
-  semaphore = asyncio.Semaphore(20)
+  deleting = False
+  corpus_size = 0
+  num_deleted = 0
+  delete_tasks = []
 
-  async def delete_gcs_blobs_batch(session, bucket, blobs_to_delete, token):
-    async with semaphore:
-      return await delete_gcs_blobs_batch(session, bucket, blobs_to_delete,
-                                          token)
+  async def _delete_blob(name, session):
+    await fast_http.delete_blob_async(bucket, name, session, creds.token)
 
+  # Create the aiohttp session once to reuse it for all requests
   async with aiohttp.ClientSession() as session:
+    start_time = time.time()
     idx = 0
-    deleting = False
-    corpus_size = 0
-    num_deleted = 0
-    blobs_to_delete = []
-    delete_tasks = []
-    num_batches = 0
-    for blob in storage.get_blobs_no_retry(corpus_url, recursive=True):
+    num_deleted = 1
+    async for blob in fast_http.list_blobs_async(bucket, path, creds.token):
       idx += 1
+      if idx >= 5_000_000:
+        break
      if not deleting:
        corpus_size += blob['size']
        if (idx >= CORPUS_FILES_LIMIT_FOR_FAILURES or
@@ -167,33 +168,25 @@ async def delete_gcs_blobs_batch(session, bucket, blobs_to_delete, token):
        continue
 
      assert deleting
-      blobs_to_delete.append(blob)
-      if len(blobs_to_delete) == GOOGLE_CLOUD_MAX_BATCH_SIZE:
-        task = asyncio.create_task(
-            fast_http.delete_gcs_blobs_batch(session, bucket,
-                                             blobs_to_delete.copy(),
-                                             creds.token))
-        delete_tasks.append(task)
-        blobs_to_delete = []
-        num_batches += 1
-        if num_batches == 3_000_000 / GOOGLE_CLOUD_MAX_BATCH_SIZE:
-          break
-
-    if blobs_to_delete:
-      task = asyncio.create_task(
-          delete_gcs_blobs_batch(session, bucket, blobs_to_delete.copy(),
-                                 creds.token))
-      delete_tasks.append(task)
-
-    results = await asyncio.gather(*delete_tasks)
-    for task_success in results:
-      if task_success:
-        num_deleted += GOOGLE_CLOUD_MAX_BATCH_SIZE
-
-  if num_deleted:
-    logs.info(f'Deleted over {num_deleted} corpus files.')
-  else:
-    logs.info('No need to limit corpus.')
+      if idx % 20_000 == 0:  # Arbitrary limit.
+        logs.info(f'Deleting url {blob["name"]}')
+
+      if idx % 100_000 == 0:
+        creds.refresh(google_auth_requests.Request())
+
+      delete_tasks.append(
+          asyncio.create_task(_delete_blob(blob['name'], session)))
+      num_deleted += 1
+      if len(delete_tasks
+            ) >= 1000:  # Arbitrary limit so we don't use too much RAM.
+        # If *any* tasks complete, we can schedule more.
+        _, pending = await asyncio.wait(
+            delete_tasks, return_when=asyncio.FIRST_COMPLETED)
+        delete_tasks = list(pending)
+
+    await asyncio.gather(*delete_tasks)
+    logs.info(f'Deleted {num_deleted} blobs.')
+    logs.info(f'Total time to delete blobs: {time.time() - start_time}')
 
 
 def _get_time_remaining(start_time):
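
One detail in the new loop worth noting: OAuth access tokens are short-lived, so the deletion pass refreshes credentials every 100,000 blobs rather than only once up front. A hedged standalone sketch of that refresh pattern using google-auth (assumes Application Default Credentials are configured; the constant and loop here are illustrative, not ClusterFuzz code):

import google.auth
from google.auth.transport import requests as google_auth_requests

REFRESH_EVERY = 100_000  # Mirrors the interval used in the loop above.

creds, _ = google.auth.default()
creds.refresh(google_auth_requests.Request())  # Obtain an initial token.

for idx in range(1, 250_001):  # Stand-in for iterating over listed blobs.
  if idx % REFRESH_EVERY == 0:
    # A pass over millions of blobs can outlive a single access token
    # (roughly an hour), so refresh it periodically mid-loop.
    creds.refresh(google_auth_requests.Request())
  token = creds.token  # Bearer token to attach to the next delete request.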

src/clusterfuzz/_internal/fuzzing/corpus_manager.py

+4 −3

@@ -438,10 +438,11 @@ def rsync_from_disk(self,
     # Assert that we aren't making the very bad mistake of deleting the entire
     # corpus because we messed up our determination of which files were deleted
     # by libFuzzer during merge/pruning. We have to do this hacky <500 check
-    # because we have many different kinds of corpuses (e.g. quarantine, regression)
-    # but this check is for the main corpus.
+    # because we have many different kinds of corpuses
+    # (e.g. quarantine, regression) but this check is for the main corpus.
     assert ((len(filenames_to_delete) != len(
-        self._filenames_to_delete_urls_mapping)) or len(filenames_to_delete) < 500)
+        self._filenames_to_delete_urls_mapping)) or
+            len(filenames_to_delete) < 500)
 
     logs.info('Deleting files.')
     storage.delete_signed_urls(filenames_to_delete)
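
For intuition, a small self-contained illustration of what the rewrapped assertion guards (values are made up, and the local dict stands in for self._filenames_to_delete_urls_mapping): deleting every tracked file is only tolerated when the corpus is small.

# Illustrative values only: a small corpus where deleting everything is allowed.
filenames_to_delete = [f'seed-{i}' for i in range(100)]
filenames_to_delete_urls_mapping = {
    name: f'gs://example-bucket/{name}' for name in filenames_to_delete
}

# Passes: all tracked files are being deleted, but there are fewer than 500.
# With, say, 1000 files and a full match, this would raise and abort the sync.
assert (len(filenames_to_delete) != len(filenames_to_delete_urls_mapping) or
        len(filenames_to_delete) < 500)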

src/clusterfuzz/_internal/system/fast_http.py

+49 −33

@@ -19,16 +19,12 @@
 import urllib.parse
 
 import aiohttp
+import google.api_core.exceptions
 
 from clusterfuzz._internal.base import concurrency
-from clusterfuzz._internal.base import retry
 from clusterfuzz._internal.base import utils
 from clusterfuzz._internal.metrics import logs
 
-BATCH_DELETE_URL = 'https://storage.googleapis.com/batch/storage/v1'
-
-MULTIPART_BOUNDARY = 'multi-part-boundary'
-
 
 def download_urls(urls_and_filepaths: List[Tuple[str, str]]) -> List[bool]:
   """Downloads multiple urls to filepaths in parallel and asynchronously.
@@ -91,37 +87,57 @@ async def _async_download_file(session: aiohttp.ClientSession, url: str,
       fp.write(chunk)
 
 
-@retry.wrap(
-    retries=2,
-    delay=1,
-    function='system.fast_http.delete_gcs_blobs_batch',
-    exception_types=[asyncio.TimeoutError],
-    retry_on_false=True)
-async def delete_gcs_blobs_batch(session, bucket, blobs, auth_token):
-  """Batch deletes |blobs| asynchronously."""
+async def delete_blob_async(bucket_name, blob_name, session, auth_token):
+  """Asynchronously deletes a GCS blob."""
+  blob_name = urllib.parse.quote(blob_name, safe='')
+  url = (
+      f'https://storage.googleapis.com/storage/v1/b/{bucket_name}/o/{blob_name}'
+  )
   headers = {
       'Authorization': f'Bearer {auth_token}',
-      'Content-Type': f'multipart/mixed; boundary={MULTIPART_BOUNDARY}'
   }
-  # Build multipart body
-  body = []
-  bucket = urllib.parse.quote(bucket, safe='')
-  for idx, blob in enumerate(blobs):
-    path = urllib.parse.quote(blob['name'], safe='')
-    body.append(f'--{MULTIPART_BOUNDARY}\r\n'
-                'Content-Type: application/http\r\n'
-                f'Content-ID: <item{idx+1}>\r\n\r\n'
-                f'DELETE /storage/v1/b/{bucket}/o/{path} HTTP/1.1\r\n'
-                'Content-Length: 0\r\n\r\n'
-                'Host: storage.googleapis.com\r\n')
-  body.append(f'--{MULTIPART_BOUNDARY}--\r\n')
-  body = '\r\n'.join(body)
 
   try:
-    async with session.post(
-        BATCH_DELETE_URL, headers=headers, data=body, timeout=25) as response:
-      response.raise_for_status()
-      return True
+    async with session.delete(url, headers=headers) as response:
+      if response.status != 204:
+        response_text = await response.text()
+        logs.error(f'Failed to delete blob {blob_name}. Status code: '
+                   f'{response.status} {response_text}')
+  except google.api_core.exceptions.NotFound:
+    logs.info(f'Not found: {blob_name} {response_text}')
   except Exception as e:
-    logs.info(f'Failed to batch delete {e}')
-    return False
+    logs.error(f'Error deleting {blob_name}: {e}')
+
+
+async def list_blobs_async(bucket_name, path, auth_token):
+  """Asynchronously lists blobs, yielding dicts containing their size, updated
+  time and name."""
+  async with aiohttp.ClientSession() as session:
+    url = f'https://storage.googleapis.com/storage/v1/b/{bucket_name}/o'
+    params = {
+        'prefix': path,
+        'delimiter': '/',
+        # Need token and save space in response.
+        'fields': 'items(name,size,updated),nextPageToken'
+    }
+    while True:
+      async with session.get(
+          url, headers={'Authorization': f'Bearer {auth_token}'},
+          params=params) as response:
+        if response.status == 200:
+          data = await response.json()
+          items = data.get('items', [])
+          for blob in items:
+            yield {
+                'size': int(blob['size']),
+                'updated': blob['updated'],
+                'name': blob['name'],
+            }
+
+          next_page_token = data.get('nextPageToken')
+          if not next_page_token:
+            break
+          params['pageToken'] = next_page_token
+        else:
+          logs.error(f'No blobs, status code: {response.status}')
+          break
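
A hedged usage sketch tying the two new helpers together (bucket name and prefix are illustrative; assumes Application Default Credentials are available via google-auth). The pruning task above wraps each deletion in a task for concurrency; this sequential version just shows the call shape:

import asyncio

import aiohttp
import google.auth
from google.auth.transport import requests as google_auth_requests

from clusterfuzz._internal.system import fast_http


async def _delete_prefix(bucket_name, prefix):
  """Deletes every blob under |prefix|, one at a time."""
  creds, _ = google.auth.default()
  creds.refresh(google_auth_requests.Request())
  async with aiohttp.ClientSession() as session:
    async for blob in fast_http.list_blobs_async(bucket_name, prefix,
                                                 creds.token):
      await fast_http.delete_blob_async(bucket_name, blob['name'], session,
                                        creds.token)


if __name__ == '__main__':
  # Illustrative values.
  asyncio.run(_delete_prefix('example-corpus-bucket', 'libFuzzer/target/'))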
