Skip to content

Commit 9e52438

Browse files
kostasrim
authored and romange committed
fix: preemption in atomic section of heartbeat (#4720)
The bug is that expiring keys during heartbeat should not preempt while writing to the journal, and we assert this with a FiberAtomicGuard. However, this atomicity guarantee is violated because the journal callback acquires a lock on a mutex that is already locked by OnJournalEntry(). The fix is to release the lock when OnJournalEntry() preempts. Signed-off-by: kostas <kostas@dragonflydb.io>
1 parent e6fabfe commit 9e52438

File tree

2 files changed

+35
-3
lines changed

2 files changed

+35
-3
lines changed

src/server/snapshot.cc

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -410,9 +410,12 @@ void SliceSnapshot::OnJournalEntry(const journal::JournalItem& item, bool await)
410410
// To enable journal flushing to sync after non auto journal command is executed we call
411411
// TriggerJournalWriteToSink. This call uses the NOOP opcode with await=true. Since there is no
412412
// additional journal change to serialize, it simply invokes PushSerialized.
413-
std::lock_guard guard(big_value_mu_);
414-
if (item.opcode != journal::Op::NOOP) {
415-
serializer_->WriteJournalEntry(item.data);
413+
{
414+
// We should release the lock after we preempt
415+
std::lock_guard guard(big_value_mu_);
416+
if (item.opcode != journal::Op::NOOP) {
417+
serializer_->WriteJournalEntry(item.data);
418+
}
416419
}
417420

418421
if (await) {

tests/dragonfly/replication_test.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2770,3 +2770,32 @@ async def test_stream_approximate_trimming(df_factory):
27702770
master_data = await StaticSeeder.capture(c_master)
27712771
replica_data = await StaticSeeder.capture(c_replica)
27722772
assert master_data == replica_data
2773+
2774+
2775+
async def test_preempt_in_atomic_section_of_heartbeat(df_factory: DflyInstanceFactory):
2776+
master = df_factory.create(proactor_threads=1, serialization_max_chunk_size=100000000000)
2777+
replicas = [df_factory.create(proactor_threads=1) for i in range(2)]
2778+
2779+
# Start instances and connect clients
2780+
df_factory.start_all([master] + replicas)
2781+
c_master = master.client()
2782+
c_replicas = [replica.client() for replica in replicas]
2783+
2784+
total = 100000
2785+
await c_master.execute_command(f"DEBUG POPULATE {total} tmp 100 TYPE SET ELEMENTS 100")
2786+
2787+
thresehold = 50000
2788+
for i in range(thresehold):
2789+
rand = random.randint(1, 10)
2790+
await c_master.execute_command(f"EXPIRE tmp:{i} {rand} NX")
2791+
2792+
seeder = StaticSeeder(key_target=10000)
2793+
fill_task = asyncio.create_task(seeder.run(master.client()))
2794+
2795+
for replica in c_replicas:
2796+
await replica.execute_command(f"REPLICAOF LOCALHOST {master.port}")
2797+
2798+
async with async_timeout.timeout(240):
2799+
await wait_for_replicas_state(*c_replicas)
2800+
2801+
await fill_task

0 commit comments

Comments
 (0)