Skip to content

Commit e2d65a0

Browse files
authored
chore: reenable evictions upon insertion to avoid OOM rejections (#3387)
* chore: reenable evictions upon insertion to avoid OOM rejections Before: when running dragonfly with --cache_mode we could get OOM rejections even though the eviction policy allowed to evict items to free memory. Ideally, dragonfly in cache mode should not respond with the OOM error. This PR reuses the same Eviction step we have in the Heartbeat and conditionally applies it during the insertion. In my test the OOM errors went from 500K to 0 and the server still respected the memory limit. Also, remove the old heuristic that has never been used. Test: ./dfly_bench --key_prefix=bar: -d 1024 --ratio=1:0 --qps=200 -n 3000 ./dragonfly --dbfilename= --proactor_threads=2 --maxmemory=600M --cache_mode --------- Signed-off-by: Roman Gershman <roman@dragonflydb.io>
1 parent fb4222d commit e2d65a0

File tree

7 files changed

+67
-141
lines changed

7 files changed

+67
-141
lines changed

src/core/compact_object.h

+2-2
Original file line numberDiff line numberDiff line change
@@ -403,6 +403,8 @@ class CompactObj {
403403
// Precondition: the object is a non-inline string.
404404
StringOrView GetRawString() const;
405405

406+
bool HasAllocated() const;
407+
406408
private:
407409
void EncodeString(std::string_view str);
408410
size_t DecodedLen(size_t sz) const;
@@ -412,8 +414,6 @@ class CompactObj {
412414
// Requires: HasAllocated() - true.
413415
void Free();
414416

415-
bool HasAllocated() const;
416-
417417
bool CmpEncoded(std::string_view sv) const;
418418

419419
void SetMeta(uint8_t taglen, uint8_t mask = 0) {

src/server/acl/validator.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212

1313
namespace dfly::acl {
1414

15-
class AclKeys;
15+
struct AclKeys;
1616

1717
std::pair<bool, AclLog::Reason> IsUserAllowedToInvokeCommandGeneric(
1818
const std::vector<uint64_t>& acl_commands, const AclKeys& keys, facade::CmdArgList tail_args,

src/server/db_slice.cc

+38-131
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,6 @@
2020
#include "util/fibers/fibers.h"
2121
#include "util/fibers/stacktrace.h"
2222

23-
ABSL_FLAG(bool, enable_heartbeat_eviction, true,
24-
"Enable eviction during heartbeat when memory is under pressure.");
25-
2623
ABSL_FLAG(uint32_t, max_eviction_per_heartbeat, 100,
2724
"The maximum number of key-value pairs that will be deleted in each eviction "
2825
"when heartbeat based eviction is triggered under memory pressure.");
@@ -484,8 +481,7 @@ OpResult<DbSlice::PrimeItAndExp> DbSlice::FindInternal(const Context& cntx, std:
484481
if (caching_mode_ && IsValid(res.it)) {
485482
if (!change_cb_.empty()) {
486483
auto bump_cb = [&](PrimeTable::bucket_iterator bit) {
487-
DVLOG(2) << "Running callbacks for key " << key << " in dbid " << cntx.db_index;
488-
CallChangeCallbacks(cntx.db_index, bit);
484+
CallChangeCallbacks(cntx.db_index, key, bit);
489485
};
490486
db.prime.CVCUponBump(change_cb_.back().first, res.it, bump_cb);
491487
}
@@ -565,8 +561,7 @@ OpResult<DbSlice::AddOrFindResult> DbSlice::AddOrFindInternal(const Context& cnt
565561
CHECK(status == OpStatus::KEY_NOTFOUND || status == OpStatus::OUT_OF_MEMORY) << status;
566562

567563
// It's a new entry.
568-
DVLOG(2) << "Running callbacks for key " << key << " in dbid " << cntx.db_index;
569-
CallChangeCallbacks(cntx.db_index, key);
564+
CallChangeCallbacks(cntx.db_index, key, {key});
570565

571566
// In case we are loading from rdb file or replicating we want to disable conservative memory
572567
// checks (inside PrimeEvictionPolicy::CanGrow) and reject insertions only after we pass max
@@ -598,8 +593,8 @@ OpResult<DbSlice::AddOrFindResult> DbSlice::AddOrFindInternal(const Context& cnt
598593
CompactObj co_key{key};
599594
PrimeIterator it;
600595

601-
// I try/catch just for sake of having a convenient place to set a breakpoint.
602-
size_t table_before = db.prime.mem_usage();
596+
size_t table_before = db.table_memory();
597+
603598
try {
604599
it = db.prime.InsertNew(std::move(co_key), PrimeValue{}, evp);
605600
} catch (bad_alloc& e) {
@@ -608,19 +603,20 @@ OpResult<DbSlice::AddOrFindResult> DbSlice::AddOrFindInternal(const Context& cnt
608603
return OpStatus::OUT_OF_MEMORY;
609604
}
610605

611-
table_memory_ += (db.prime.mem_usage() - table_before);
612606
size_t evicted_obj_bytes = 0;
613-
614-
// We may still reach the state when our memory usage is above the limit even if we
615-
// do not add new segments. For example, we have half full segments
616-
// and we add new objects or update the existing ones and our memory usage grows.
617-
if (evp.mem_budget() < 0) {
618-
// TODO(roman): EvictObjects is too aggressive and it's messing with cache hit-rate.
619-
// The regular eviction policy does a decent job though it may cross the passed limit
620-
// a little bit. I do not consider it as a serious bug.
621-
// evicted_obj_bytes = EvictObjects(-evp.mem_budget(), it, &db);
607+
if (evp.mem_budget() < 0 && apply_memory_limit) {
608+
// We may reach the state when our memory usage is below the limit even if we
609+
// do not add new segments. For example, we have half full segments
610+
// and we add new objects or update the existing ones and our memory usage grows.
611+
// We do not require for a single operation to unload the whole negative debt.
612+
// Instead, we create a positive, converging force that should help with freeing enough memory.
613+
// Free at least 256 bytes or 3% of the total debt.
614+
size_t evict_goal = std::max<size_t>(256, (-evp.mem_budget()) / 32);
615+
evicted_obj_bytes = FreeMemWithEvictionStep(cntx.db_index, it.segment_id(), evict_goal);
622616
}
623617

618+
table_memory_ += (db.table_memory() - table_before);
619+
624620
db.stats.inline_keys += it->first.IsInline();
625621
AccountObjectMemory(key, it->first.ObjType(), it->first.MallocUsed(), &db); // Account for key
626622

@@ -709,7 +705,7 @@ void DbSlice::FlushSlotsFb(const cluster::SlotSet& slot_ids) {
709705
string_view key = get<string_view>(req.change);
710706
table->CVCUponInsert(
711707
next_version, key,
712-
[this, db_index, next_version, iterate_bucket](PrimeTable::bucket_iterator it) {
708+
[db_index, next_version, iterate_bucket](PrimeTable::bucket_iterator it) {
713709
DCHECK_LT(it.GetVersion(), next_version);
714710
iterate_bucket(db_index, it);
715711
});
@@ -762,7 +758,7 @@ void DbSlice::FlushDbIndexes(const std::vector<DbIndex>& indexes) {
762758
}
763759

764760
CHECK(fetched_items_.empty());
765-
auto cb = [this, indexes, flush_db_arr = std::move(flush_db_arr)]() mutable {
761+
auto cb = [indexes, flush_db_arr = std::move(flush_db_arr)]() mutable {
766762
flush_db_arr.clear();
767763
ServerState::tlocal()->DecommitMemory(ServerState::kDataHeap | ServerState::kBackingHeap |
768764
ServerState::kGlibcmalloc);
@@ -1023,9 +1019,7 @@ bool DbSlice::CheckLock(IntentLock::Mode mode, DbIndex dbid, uint64_t fp) const
10231019
}
10241020

10251021
void DbSlice::PreUpdate(DbIndex db_ind, Iterator it, std::string_view key) {
1026-
DVLOG(2) << "Running callbacks in dbid " << db_ind;
1027-
CallChangeCallbacks(db_ind, ChangeReq{it.GetInnerIt()});
1028-
1022+
CallChangeCallbacks(db_ind, key, ChangeReq{it.GetInnerIt()});
10291023
it.GetInnerIt().SetVersion(NextVersion());
10301024
}
10311025

@@ -1217,25 +1211,22 @@ int32_t DbSlice::GetNextSegmentForEviction(int32_t segment_id, DbIndex db_ind) c
12171211
db_arr_[db_ind]->prime.GetSegmentCount();
12181212
}
12191213

1220-
void DbSlice::FreeMemWithEvictionStep(DbIndex db_ind, size_t increase_goal_bytes) {
1214+
size_t DbSlice::FreeMemWithEvictionStep(DbIndex db_ind, size_t starting_segment_id,
1215+
size_t increase_goal_bytes) {
12211216
DCHECK(!owner_->IsReplica());
1222-
if ((!caching_mode_) || !expire_allowed_ || !GetFlag(FLAGS_enable_heartbeat_eviction))
1223-
return;
1217+
if ((!caching_mode_) || !expire_allowed_)
1218+
return 0;
12241219

12251220
auto max_eviction_per_hb = GetFlag(FLAGS_max_eviction_per_heartbeat);
12261221
auto max_segment_to_consider = GetFlag(FLAGS_max_segment_to_consider);
12271222

12281223
auto time_start = absl::GetCurrentTimeNanos();
12291224
auto& db_table = db_arr_[db_ind];
1230-
int32_t num_segments = db_table->prime.GetSegmentCount();
1231-
int32_t num_buckets = PrimeTable::Segment_t::kTotalBuckets;
1232-
int32_t num_slots = PrimeTable::Segment_t::kSlotNum;
1225+
constexpr int32_t num_buckets = PrimeTable::Segment_t::kTotalBuckets;
1226+
constexpr int32_t num_slots = PrimeTable::Segment_t::kSlotNum;
12331227

1234-
size_t used_memory_after;
1235-
size_t evicted = 0;
1228+
size_t evicted_items = 0, evicted_bytes = 0;
12361229
string tmp;
1237-
int32_t starting_segment_id = rand() % num_segments;
1238-
size_t used_memory_before = owner_->UsedMemory();
12391230

12401231
bool record_keys = owner_->journal() != nullptr || expired_keys_events_recording_;
12411232
vector<string> keys_to_journal;
@@ -1257,7 +1248,7 @@ void DbSlice::FreeMemWithEvictionStep(DbIndex db_ind, size_t increase_goal_bytes
12571248
continue;
12581249

12591250
auto evict_it = db_table->prime.GetIterator(segment_id, bucket_id, slot_id);
1260-
if (evict_it->first.IsSticky())
1251+
if (evict_it->first.IsSticky() || !evict_it->second.HasAllocated())
12611252
continue;
12621253

12631254
// check if the key is locked by looking up transaction table.
@@ -1269,13 +1260,12 @@ void DbSlice::FreeMemWithEvictionStep(DbIndex db_ind, size_t increase_goal_bytes
12691260
if (record_keys)
12701261
keys_to_journal.emplace_back(key);
12711262

1263+
evicted_bytes += evict_it->second.MallocUsed();
1264+
++evicted_items;
12721265
PerformDeletion(Iterator(evict_it, StringOrView::FromView(key)), db_table.get());
1273-
++evicted;
12741266

1275-
used_memory_after = owner_->UsedMemory();
12761267
// returns when whichever condition is met first
1277-
if ((evicted == max_eviction_per_hb) ||
1278-
(used_memory_before - used_memory_after >= increase_goal_bytes))
1268+
if ((evicted_items == max_eviction_per_hb) || (evicted_bytes >= increase_goal_bytes))
12791269
goto finish;
12801270
}
12811271
}
@@ -1294,12 +1284,12 @@ void DbSlice::FreeMemWithEvictionStep(DbIndex db_ind, size_t increase_goal_bytes
12941284
}
12951285

12961286
auto time_finish = absl::GetCurrentTimeNanos();
1297-
events_.evicted_keys += evicted;
1298-
DVLOG(2) << "Memory usage before eviction: " << used_memory_before;
1299-
DVLOG(2) << "Memory usage after eviction: " << used_memory_after;
1300-
DVLOG(2) << "Number of keys evicted / max eviction per hb: " << evicted << "/"
1287+
events_.evicted_keys += evicted_items;
1288+
DVLOG(2) << "Evicted: " << evicted_bytes;
1289+
DVLOG(2) << "Number of keys evicted / max eviction per hb: " << evicted_items << "/"
13011290
<< max_eviction_per_hb;
13021291
DVLOG(2) << "Eviction time (us): " << (time_finish - time_start) / 1000;
1292+
return evicted_bytes;
13031293
}
13041294

13051295
void DbSlice::CreateDb(DbIndex db_ind) {
@@ -1310,93 +1300,6 @@ void DbSlice::CreateDb(DbIndex db_ind) {
13101300
}
13111301
}
13121302

1313-
// "it" is the iterator that we just added/updated and it should not be deleted.
1314-
// "table" is the instance where we should delete the objects from.
1315-
size_t DbSlice::EvictObjects(size_t memory_to_free, Iterator it, DbTable* table) {
1316-
if (owner_->IsReplica()) {
1317-
return 0;
1318-
}
1319-
PrimeTable::Segment_t* segment = table->prime.GetSegment(it.GetInnerIt().segment_id());
1320-
DCHECK(segment);
1321-
1322-
constexpr unsigned kNumStashBuckets = PrimeTable::Segment_t::kStashBucketNum;
1323-
constexpr unsigned kNumRegularBuckets = PrimeTable::Segment_t::kBucketNum;
1324-
1325-
PrimeTable::bucket_iterator it2(it.GetInnerIt());
1326-
unsigned evicted = 0;
1327-
bool evict_succeeded = false;
1328-
1329-
EngineShard* shard = owner_;
1330-
size_t used_memory_start = shard->UsedMemory();
1331-
1332-
auto freed_memory_fun = [&] {
1333-
size_t current = shard->UsedMemory();
1334-
return current < used_memory_start ? used_memory_start - current : 0;
1335-
};
1336-
1337-
for (unsigned i = 0; !evict_succeeded && i < kNumStashBuckets; ++i) {
1338-
unsigned stash_bid = i + kNumRegularBuckets;
1339-
const auto& bucket = segment->GetBucket(stash_bid);
1340-
if (bucket.IsEmpty())
1341-
continue;
1342-
1343-
for (int slot_id = PrimeTable::Segment_t::kSlotNum - 1; slot_id >= 0; --slot_id) {
1344-
if (!bucket.IsBusy(slot_id))
1345-
continue;
1346-
1347-
auto evict_it = table->prime.GetIterator(it.GetInnerIt().segment_id(), stash_bid, slot_id);
1348-
// skip the iterator that we must keep or the sticky items.
1349-
if (evict_it == it.GetInnerIt() || evict_it->first.IsSticky())
1350-
continue;
1351-
1352-
PerformDeletion(evict_it, table);
1353-
++evicted;
1354-
1355-
if (freed_memory_fun() > memory_to_free) {
1356-
evict_succeeded = true;
1357-
break;
1358-
}
1359-
}
1360-
}
1361-
1362-
if (evicted) {
1363-
DVLOG(1) << "Evicted " << evicted << " stashed items, freed " << freed_memory_fun() << " bytes";
1364-
}
1365-
1366-
// Try normal buckets now. We iterate from largest slot to smallest across the whole segment.
1367-
for (int slot_id = PrimeTable::Segment_t::kSlotNum - 1; !evict_succeeded && slot_id >= 0;
1368-
--slot_id) {
1369-
for (unsigned i = 0; i < kNumRegularBuckets; ++i) {
1370-
unsigned bid = (it.GetInnerIt().bucket_id() + i) % kNumRegularBuckets;
1371-
const auto& bucket = segment->GetBucket(bid);
1372-
if (!bucket.IsBusy(slot_id))
1373-
continue;
1374-
1375-
auto evict_it = table->prime.GetIterator(it.GetInnerIt().segment_id(), bid, slot_id);
1376-
if (evict_it == it.GetInnerIt() || evict_it->first.IsSticky())
1377-
continue;
1378-
1379-
PerformDeletion(evict_it, table);
1380-
++evicted;
1381-
1382-
if (freed_memory_fun() > memory_to_free) {
1383-
evict_succeeded = true;
1384-
break;
1385-
}
1386-
}
1387-
}
1388-
1389-
if (evicted) {
1390-
DVLOG(1) << "Evicted total: " << evicted << " items, freed " << freed_memory_fun() << " bytes "
1391-
<< "success: " << evict_succeeded;
1392-
1393-
events_.evicted_keys += evicted;
1394-
events_.hard_evictions += evicted;
1395-
}
1396-
1397-
return freed_memory_fun();
1398-
};
1399-
14001303
void DbSlice::RegisterWatchedKey(DbIndex db_indx, std::string_view key,
14011304
ConnectionState::ExecInfo* exec_info) {
14021305
db_arr_[db_indx]->watched_keys[key].push_back(exec_info);
@@ -1566,7 +1469,11 @@ void DbSlice::OnCbFinish() {
15661469
fetched_items_.clear();
15671470
}
15681471

1569-
void DbSlice::CallChangeCallbacks(DbIndex id, const ChangeReq& cr) const {
1472+
void DbSlice::CallChangeCallbacks(DbIndex id, std::string_view key, const ChangeReq& cr) const {
1473+
if (change_cb_.empty())
1474+
return;
1475+
1476+
DVLOG(2) << "Running callbacks for key " << key << " in dbid " << id;
15701477
FetchedItemsRestorer fetched_restorer(&fetched_items_);
15711478
std::unique_lock<LocalBlockingCounter> lk(block_counter_);
15721479

src/server/db_slice.h

+9-3
Original file line numberDiff line numberDiff line change
@@ -437,7 +437,12 @@ class DbSlice {
437437

438438
// Deletes some amount of possible expired items.
439439
DeleteExpiredStats DeleteExpiredStep(const Context& cntx, unsigned count);
440-
void FreeMemWithEvictionStep(DbIndex db_indx, size_t increase_goal_bytes);
440+
441+
// Evicts items with dynamically allocated data from the primary table.
442+
// Does not shrink tables.
443+
// Returns the number of bytes freed due to evictions.
444+
size_t FreeMemWithEvictionStep(DbIndex db_indx, size_t starting_segment_id,
445+
size_t increase_goal_bytes);
441446
void ScheduleForOffloadStep(DbIndex db_indx, size_t increase_goal_bytes);
442447

443448
int32_t GetNextSegmentForEviction(int32_t segment_id, DbIndex db_ind) const;
@@ -469,6 +474,8 @@ class DbSlice {
469474
// Resets events_ member. Used by CONFIG RESETSTAT
470475
void ResetEvents();
471476

477+
// Controls the expiry/eviction state. The server may enter states where
// both evictions and expiries will be stopped for a short period of time.
// Both evictions and expiries will be stopped for a short period of time.
472479
void SetExpireAllowed(bool is_allowed) {
473480
expire_allowed_ = is_allowed;
474481
}
@@ -508,7 +515,6 @@ class DbSlice {
508515
void SendInvalidationTrackingMessage(std::string_view key);
509516

510517
void CreateDb(DbIndex index);
511-
size_t EvictObjects(size_t memory_to_free, Iterator it, DbTable* table);
512518

513519
enum class UpdateStatsMode {
514520
kReadStats,
@@ -534,7 +540,7 @@ class DbSlice {
534540
return version_++;
535541
}
536542

537-
void CallChangeCallbacks(DbIndex id, const ChangeReq& cr) const;
543+
void CallChangeCallbacks(DbIndex id, std::string_view key, const ChangeReq& cr) const;
538544

539545
class LocalBlockingCounter {
540546
public:

src/server/engine_shard_set.cc

+15-2
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,9 @@ ABSL_FLAG(string, shard_round_robin_prefix, "",
7474
ABSL_FLAG(uint32_t, mem_defrag_check_sec_interval, 10,
7575
"Number of seconds between every defragmentation necessity check");
7676

77+
ABSL_FLAG(bool, enable_heartbeat_eviction, true,
78+
"Enable eviction during heartbeat when memory is under pressure.");
79+
7780
namespace dfly {
7881

7982
using namespace tiering::literals;
@@ -641,8 +644,10 @@ void EngineShard::Heartbeat() {
641644
}
642645

643646
// if our budget is below the limit
644-
if (db_slice.memory_budget() < eviction_redline) {
645-
db_slice.FreeMemWithEvictionStep(i, eviction_redline - db_slice.memory_budget());
647+
if (db_slice.memory_budget() < eviction_redline && GetFlag(FLAGS_enable_heartbeat_eviction)) {
648+
uint32_t starting_segment_id = rand() % pt->GetSegmentCount();
649+
db_slice.FreeMemWithEvictionStep(i, starting_segment_id,
650+
eviction_redline - db_slice.memory_budget());
646651
}
647652

648653
if (UsedMemory() > tiering_offload_threshold) {
@@ -658,17 +663,25 @@ void EngineShard::Heartbeat() {
658663
}
659664

660665
void EngineShard::RunPeriodic(std::chrono::milliseconds period_ms) {
666+
VLOG(1) << "RunPeriodic with period " << period_ms.count() << "ms";
667+
661668
bool runs_global_periodic = (shard_id() == 0); // Only shard 0 runs global periodic.
662669
unsigned global_count = 0;
663670
int64_t last_stats_time = time(nullptr);
671+
int64_t last_heartbeat_ms = INT64_MAX;
664672

665673
while (true) {
666674
if (fiber_periodic_done_.WaitFor(period_ms)) {
667675
VLOG(2) << "finished running engine shard periodic task";
668676
return;
669677
}
670678

679+
int64_t now_ms = fb2::ProactorBase::GetMonotonicTimeNs() / 1000000;
680+
if (now_ms - 5 * period_ms.count() > last_heartbeat_ms) {
681+
VLOG(1) << "This heartbeat took " << now_ms - last_heartbeat_ms << "ms";
682+
}
671683
Heartbeat();
684+
last_heartbeat_ms = fb2::ProactorBase::GetMonotonicTimeNs() / 1000000;
672685

673686
if (runs_global_periodic) {
674687
++global_count;

0 commit comments

Comments
 (0)