Skip to content

Commit dc4face

Browse files
Merge pull request #574 from geoadmin/fix-PB-1575-delete-external
PB-1575: make deletion batch size configurable.
2 parents c721eaf + fbf64e7 commit dc4face

File tree

3 files changed

+63
-14
lines changed

3 files changed

+63
-14
lines changed

app/config/settings_prod.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -209,6 +209,7 @@
209209
DELETE_EXPIRED_ITEMS_OLDER_THAN_HOURS = 24
210210
DELETE_EXPIRED_ITEMS_MAX = 110 * 1000
211211
DELETE_EXPIRED_ITEMS_MAX_PERCENTAGE = 50
212+
DELETE_EXPIRED_ITEMS_BATCH_SIZE = 10 * 1000
212213

213214
# Media files (i.e. uploaded content=assets in this project)
214215
UPLOAD_FILE_CHUNK_SIZE = 1024 * 1024 # Size in Bytes

app/stac_api/management/commands/remove_expired_items.py

Lines changed: 56 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -22,11 +22,49 @@ def __str__(self):
2222

2323
class Handler(CommandHandler):
2424

25-
def delete(self, instance, object_type):
26-
if self.options['dry_run']:
27-
self.print_success(f'skipping deletion of {object_type} {instance}')
28-
else:
29-
instance.delete()
25+
def delete_by_batch(self, queryset, object_type, batch_size):
26+
# When many rows are involved, looping over each one is very slow.
27+
# Running a single delete() against all of them consumes a lot of memory
28+
# and does not delete anything if it fails mid-way. Hence, we batch.
29+
#
30+
# Django's delete() method already batches deletions in groups of 100
31+
# rows. These batches are wrapped within transactions. It does not seem
32+
# to be designed to allow disabling the transaction or tweaking the
33+
# batch size.
34+
# https://github.com/django/django/blob/main/django/db/models/sql/subqueries.py#L26
35+
# https://github.com/django/django/blob/main/django/db/models/deletion.py#L454
36+
# Also, it does not seem to do anything to reduce memory consumption.
37+
#
38+
# In our case, we don't need the deletions to be transactional. If we
39+
# die in the middle, it's fine if some rows are deleted and some are
40+
# not. We can remove the remaining rows next time we run. That's better
41+
# than waiting forever, to fail and to have to start from scratch next
42+
# time.
43+
type_name = f'stac_api.{object_type.__name__}'
44+
total = queryset.count()
45+
deleted_count = 0
46+
while deleted_count < total:
47+
# We cannot just call queryset[:batch_size].delete() because DELETE
48+
# does not support LIMIT/OFFSET. So instead we extract the ids
49+
# then we'll build a new QuerySet to DELETE them.
50+
ids = queryset.values('id')[:batch_size]
51+
expected_deletions = len(ids)
52+
if expected_deletions == 0:
53+
break
54+
dry_run_prefix = ''
55+
if self.options['dry_run']:
56+
dry_run_prefix = '[dry run]: '
57+
deleted_objs = {}
58+
actual_deletions = expected_deletions
59+
else:
60+
(_, deleted_objs) = object_type.objects.filter(id__in=ids).delete()
61+
actual_deletions = deleted_objs.get(type_name, 0)
62+
deleted_count += actual_deletions
63+
self.print_success(
64+
f'{dry_run_prefix}Deleted {deleted_count}/{total} {type_name}.'
65+
f' In this batch: {actual_deletions}/{expected_deletions}.'
66+
f' All objects in this batch: {deleted_objs}.'
67+
)
3068

3169
def _raise_if_too_many_deletions(self, max_deletions, max_deletions_pct, items_count):
3270
if items_count > max_deletions:
@@ -42,18 +80,20 @@ def _raise_if_too_many_deletions(self, max_deletions, max_deletions_pct, items_c
4280

4381
def run(self):
4482
self.print_success('running command to remove expired items')
83+
batch_size = self.options['batch_size']
4584
min_age_hours = self.options['min_age_hours']
4685
max_deletions = self.options['max_deletions']
4786
max_deletions_pct = self.options['max_deletions_percentage']
4887
self.print_warning(
4988
f"deleting no more than {max_deletions} or "
5089
f"{max_deletions_pct}%% items expired for longer"
51-
f" than {min_age_hours} hours"
90+
f" than {min_age_hours} hours, {batch_size} at a time"
5291
)
5392

5493
expiration = timezone.now() - timedelta(hours=min_age_hours)
5594

56-
items = Item.objects.filter(properties_expires__lte=expiration)
95+
items = Item.objects.filter(properties_expires__lte=expiration
96+
).order_by('properties_expires')
5797
items_count = items.count()
5898

5999
self._raise_if_too_many_deletions(max_deletions, max_deletions_pct, items_count)
@@ -69,8 +109,8 @@ def run(self):
69109
"WARNING: There were still pending asset uploads for expired items. "
70110
"These were likely stale, so we aborted them"
71111
)
72-
self.delete(assets, 'assets')
73-
self.delete(items, 'items')
112+
self.delete_by_batch(assets, Asset, batch_size)
113+
self.delete_by_batch(items, Item, batch_size)
74114

75115
if self.options['dry_run']:
76116
self.print_success(f'[dry run] would have removed {items_count} expired items')
@@ -105,6 +145,13 @@ def add_arguments(self, parser: CommandParser) -> None:
105145
action='store_true',
106146
help='Simulate deleting items, without actually deleting them'
107147
)
148+
default_batch_size = settings.DELETE_EXPIRED_ITEMS_BATCH_SIZE
149+
parser.add_argument(
150+
'--batch-size',
151+
type='positive_int',
152+
default=default_batch_size,
153+
help=f"How many rows to delete at a time ({default_batch_size})"
154+
)
108155
default_min_age = settings.DELETE_EXPIRED_ITEMS_OLDER_THAN_HOURS
109156
parser.add_argument(
110157
'--min-age-hours',

app/tests/tests_10/test_remove_expired_items.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -230,11 +230,6 @@ def test_remove_item_dry_run(self):
230230
expected_stdout_patterns=[
231231
"running command to remove expired items",
232232
"deleting no more than 110000 or 50% items expired for longer than 24 hours",
233-
(
234-
"skipping deletion of assets <QuerySet"
235-
" [<Asset: asset-0.tiff>, <Asset: asset-1.tiff>]>"
236-
),
237-
"skipping deletion of items <ItemQuerySet [<Item: collection-1/item-0>]>",
238233
"[dry run] would have removed 1 expired items",
239234
]
240235
)
@@ -331,6 +326,12 @@ class RemoveExpiredItemsManyWithProfiling(RemoveExpiredItemsBase):
331326
def test_remove_item(self):
332327
self.run_test(command_args=["--max-deletions-percentage=99"], expected_stdout_patterns=[])
333328

329+
def test_multiple_batches(self):
330+
self.run_test(
331+
command_args=["--max-deletions-percentage=99", "--batch-size=7"],
332+
expected_stdout_patterns=[]
333+
)
334+
334335
@staticmethod
335336
def _diff_memory(before, after):
336337
diff = after.compare_to(before, 'filename')

0 commit comments

Comments
 (0)