Skip to content

Commit 0b4a673

Browse files
committed
chore(tiering): Faster smallbins serialization #2
Signed-off-by: Vladislav Oleshko <vlad@dragonflydb.io>
1 parent e2d65a0 commit 0b4a673

13 files changed

+254
-27
lines changed

src/core/compact_object.cc

+1-1
Original file line numberDiff line numberDiff line change
@@ -985,7 +985,7 @@ std::pair<size_t, size_t> CompactObj::GetExternalSlice() const {
985985
}
986986

987987
void CompactObj::Materialize(std::string_view blob, bool is_raw) {
988-
CHECK(IsExternal()) << int(taglen_);
988+
// CHECK(IsExternal()) << int(taglen_);
989989

990990
DCHECK_GT(blob.size(), kInlineLen);
991991

src/core/compact_object.h

+9
Original file line numberDiff line numberDiff line change
@@ -405,6 +405,15 @@ class CompactObj {
405405

406406
bool HasAllocated() const;
407407

408+
uint8_t GetEncodingMask() const {
409+
return mask_ & kEncMask;
410+
}
411+
412+
// Overwrites the kEncMask-selected bits of mask_ with the matching bits of
// `mask`. Bits of mask_ outside kEncMask are preserved, and bits of `mask`
// outside kEncMask are ignored.
void SetEncodingMask(uint8_t mask) {
  mask_ = (mask_ & ~kEncMask) | (mask & kEncMask);
}
416+
408417
private:
409418
void EncodeString(std::string_view str);
410419
size_t DecodedLen(size_t sz) const;

src/server/rdb_extensions.h

+4-1
Original file line numberDiff line numberDiff line change
@@ -14,11 +14,12 @@ constexpr uint8_t RDB_TYPE_JSON = 30;
1414
constexpr uint8_t RDB_TYPE_HASH_WITH_EXPIRY = 31;
1515
constexpr uint8_t RDB_TYPE_SET_WITH_EXPIRY = 32;
1616
constexpr uint8_t RDB_TYPE_SBF = 33;
17+
constexpr uint8_t RDB_TYPE_TIERED_SEGMENT = 34;
1718

1819
constexpr bool rdbIsObjectTypeDF(uint8_t type) {
1920
return __rdbIsObjectType(type) || (type == RDB_TYPE_JSON) ||
2021
(type == RDB_TYPE_HASH_WITH_EXPIRY) || (type == RDB_TYPE_SET_WITH_EXPIRY) ||
21-
(type == RDB_TYPE_SBF);
22+
(type == RDB_TYPE_SBF) || (type == RDB_TYPE_TIERED_SEGMENT);
2223
}
2324

2425
// Opcodes: Range 200-240 is used by DF extensions.
@@ -40,6 +41,8 @@ constexpr uint8_t RDB_OPCODE_JOURNAL_BLOB = 210;
4041
// so it is always sent at the end of the RDB stream.
4142
constexpr uint8_t RDB_OPCODE_JOURNAL_OFFSET = 211;
4243

44+
constexpr uint8_t RDB_OPCODE_TIERED_PAGE = 212;
45+
4346
constexpr uint8_t RDB_OPCODE_DF_MASK = 220; /* Mask for key properties */
4447

4548
// RDB_OPCODE_DF_MASK defines a 4-byte field with the following flags

src/server/rdb_load.cc

+130-5
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ extern "C" {
3232
#include "base/logging.h"
3333
#include "core/bloom.h"
3434
#include "core/json/json_object.h"
35+
#include "core/overloaded.h"
3536
#include "core/sorted_map.h"
3637
#include "core/string_map.h"
3738
#include "core/string_set.h"
@@ -48,6 +49,7 @@ extern "C" {
4849
#include "server/server_state.h"
4950
#include "server/set_family.h"
5051
#include "server/tiering/common.h" // for _KB literal
52+
#include "server/tiering/disk_storage.h"
5153
#include "server/transaction.h"
5254
#include "strings/human_readable.h"
5355

@@ -387,6 +389,7 @@ class RdbLoaderBase::OpaqueObjLoader {
387389
void operator()(const LzfString& lzfstr);
388390
void operator()(const unique_ptr<LoadTrace>& ptr);
389391
void operator()(const RdbSBF& src);
392+
void operator()(const RdbTieredSegment& segmnet);
390393

391394
std::error_code ec() const {
392395
return ec_;
@@ -481,6 +484,10 @@ void RdbLoaderBase::OpaqueObjLoader::operator()(const RdbSBF& src) {
481484
pv_->SetSBF(sbf);
482485
}
483486

487+
// Visitor overload for tiered segments. It only exists so that std::visit over
// RdbVariant is exhaustive: the loader converts tiered segments into plain
// strings (see RdbLoader::HandleSmallItems) before items are queued, so this
// overload is not expected to run and aborts if it ever does.
void RdbLoaderBase::OpaqueObjLoader::operator()(const RdbTieredSegment& /*segment*/) {
  CHECK(false) << "unreachable";
}
490+
484491
void RdbLoaderBase::OpaqueObjLoader::CreateSet(const LoadTrace* ltrace) {
485492
size_t len = ltrace->blob_count();
486493

@@ -1385,6 +1392,9 @@ error_code RdbLoaderBase::ReadObj(int rdbtype, OpaqueObj* dest) {
13851392
case RDB_TYPE_SBF:
13861393
iores = ReadSBF();
13871394
break;
1395+
case RDB_TYPE_TIERED_SEGMENT:
1396+
iores = ReadTieredSegment();
1397+
break;
13881398
default:
13891399
LOG(ERROR) << "Unsupported rdb type " << rdbtype;
13901400

@@ -1878,6 +1888,14 @@ auto RdbLoaderBase::ReadSBF() -> io::Result<OpaqueObj> {
18781888
return OpaqueObj{std::move(res), RDB_TYPE_SBF};
18791889
}
18801890

1891+
// Reads an RDB_TYPE_TIERED_SEGMENT record: three length-encoded integers, in
// order offset, length and encoding mask. Returns them wrapped in an OpaqueObj
// tagged RDB_TYPE_TIERED_SEGMENT, or the read error of the first field that
// failed to load.
auto RdbLoaderBase::ReadTieredSegment() -> io::Result<OpaqueObj> {
  // Value-initialize so that no field is ever observed with an indeterminate value.
  RdbTieredSegment segment{};
  SET_OR_UNEXPECT(LoadLen(nullptr), segment.offset);
  SET_OR_UNEXPECT(LoadLen(nullptr), segment.length);
  // NOTE(review): enc_mask is a uint8_t, so a loaded value above 255 is silently
  // truncated here. Consider rejecting such records as corrupted.
  SET_OR_UNEXPECT(LoadLen(nullptr), segment.enc_mask);
  return OpaqueObj{segment, RDB_TYPE_TIERED_SEGMENT};
}
1898+
18811899
template <typename T> io::Result<T> RdbLoaderBase::FetchInt() {
18821900
auto ec = EnsureRead(sizeof(T));
18831901
if (ec)
@@ -1924,6 +1942,18 @@ RdbLoader::RdbLoader(Service* service)
19241942
}
19251943

19261944
RdbLoader::~RdbLoader() {
1945+
for (auto& [_, page] : small_items_pages_) {
1946+
if (!holds_alternative<tiering::DiskSegment>(page))
1947+
continue;
1948+
auto segment = get<tiering::DiskSegment>(page);
1949+
EngineShard::tlocal()->tiered_storage()->BorrowStorage().MarkAsFree(segment);
1950+
}
1951+
1952+
for (auto& [_, items] : small_items_) {
1953+
for (Item* item : items)
1954+
delete item;
1955+
}
1956+
19271957
while (true) {
19281958
Item* item = item_queue_.Pop();
19291959
if (item == nullptr)
@@ -2117,6 +2147,11 @@ error_code RdbLoader::Load(io::Source* src) {
21172147
continue;
21182148
}
21192149

2150+
if (type == RDB_OPCODE_TIERED_PAGE) {
2151+
RETURN_ON_ERR(LoadTieredPage());
2152+
continue;
2153+
}
2154+
21202155
if (!rdbIsObjectTypeDF(type)) {
21212156
return RdbError(errc::invalid_rdb_type);
21222157
}
@@ -2126,6 +2161,11 @@ error_code RdbLoader::Load(io::Source* src) {
21262161
settings.Reset();
21272162
} // main load loop
21282163

2164+
// Flush all small items
2165+
HandleSmallItems(true);
2166+
2167+
FlushAllShards();
2168+
21292169
DVLOG(1) << "RdbLoad loop finished";
21302170

21312171
if (stop_early_) {
@@ -2348,6 +2388,38 @@ error_code RdbLoaderBase::HandleJournalBlob(Service* service) {
23482388
return std::error_code{};
23492389
}
23502390

2391+
// Handles an RDB_OPCODE_TIERED_PAGE record: reads the page offset and the page
// blob, then remembers the page in small_items_pages_ keyed by that offset.
//
// When the current thread has an engine shard with tiered storage, the page is
// stashed on disk and only the resulting DiskSegment is kept. If the stash
// cannot be submitted or the write fails, the page is kept in memory instead.
//
// Returns an error only if reading the record from the stream fails.
error_code RdbLoader::LoadTieredPage() {
  size_t offset = 0;
  SET_OR_RETURN(LoadLen(nullptr), offset);

  std::string page;
  SET_OR_RETURN(FetchGenericString(), page);

  // If tiering is enabled, try saving the received page on disk.
  // Fall back to memory in case of errors.
  if (EngineShard::tlocal() && EngineShard::tlocal()->tiered_storage()) {
    auto& storage = EngineShard::tlocal()->tiered_storage()->BorrowStorage();

    util::fb2::Done done;
    // Kept separate from the submission result below so that an error reported
    // through the callback can not be overwritten by Stash's return value.
    std::error_code write_ec;
    auto cb = [this, offset, &write_ec, &done](io::Result<tiering::DiskSegment> res) {
      if (res.has_value())
        small_items_pages_[offset] = res.value();
      else
        write_ec = res.error();
      done.Notify();
    };

    // NOTE(review): assumes Stash does not invoke the callback when it returns
    // an error, in which case waiting on `done` would block forever - confirm
    // against DiskStorage::Stash.
    const std::error_code submit_ec = storage.Stash(io::Buffer(page), {}, cb);
    if (!submit_ec) {
      // `page`, `write_ec` and `done` are referenced by the pending write, so
      // wait for it to complete before leaving this scope.
      done.Wait();
      if (!write_ec)
        return {};
    }
  }

  // In-memory fallback. The blob is not needed locally anymore, so move it.
  small_items_pages_[offset] = std::move(page);
  return {};
}
2422+
23512423
error_code RdbLoader::HandleAux() {
23522424
/* AUX: generic string-string fields. Use to add state to RDB
23532425
* which is backward compatible. Implementations of RDB loading
@@ -2531,20 +2603,37 @@ error_code RdbLoader::LoadKeyValPair(int type, ObjSettings* settings) {
25312603

25322604
item->is_sticky = settings->is_sticky;
25332605

2534-
ShardId sid = Shard(item->key, shard_set->size());
25352606
item->expire_ms = settings->expiretime;
25362607

2537-
auto& out_buf = shard_buf_[sid];
2608+
std::move(cleanup).Cancel();
2609+
2610+
if (item->val.rdb_type == RDB_TYPE_TIERED_SEGMENT) {
2611+
auto segment = get<RdbTieredSegment>(item->val.obj);
2612+
{
2613+
size_t offset = segment.offset / tiering::kPageSize * tiering::kPageSize;
2614+
auto& items = small_items_[offset];
2615+
small_items_sizes_.erase({items.size(), offset});
2616+
items.push_back(item);
2617+
small_items_sizes_.insert({items.size(), offset});
2618+
}
2619+
HandleSmallItems(false); // don't force flush
2620+
return kOk;
2621+
}
2622+
2623+
Add(item);
2624+
return kOk;
2625+
}
2626+
2627+
void RdbLoader::Add(Item* item) {
2628+
ShardId sid = Shard(item->key, shard_set->size());
25382629

2630+
auto& out_buf = shard_buf_[sid];
25392631
out_buf.emplace_back(item);
2540-
std::move(cleanup).Cancel();
25412632

25422633
constexpr size_t kBufSize = 128;
25432634
if (out_buf.size() >= kBufSize) {
25442635
FlushShardAsync(sid);
25452636
}
2546-
2547-
return kOk;
25482637
}
25492638

25502639
void RdbLoader::LoadScriptFromAux(string&& body) {
@@ -2559,6 +2648,42 @@ void RdbLoader::LoadScriptFromAux(string&& body) {
25592648
}
25602649
}
25612650

2651+
void RdbLoader::HandleSmallItems(bool flush) {
2652+
while (!small_items_.empty() && (flush || small_items_.size() > 1000)) {
2653+
auto [_, offset] = small_items_sizes_.extract(small_items_sizes_.begin()).value();
2654+
auto node = small_items_.extract(offset);
2655+
2656+
auto page_reader = [](tiering::DiskSegment segment) {
2657+
auto& store = EngineShard::tlocal()->tiered_storage()->BorrowStorage();
2658+
util::fb2::Future<std::string> f;
2659+
store.Read(segment, [f](io::Result<std::string_view> result) mutable {
2660+
CHECK(result.has_value()); // TODO
2661+
f.Resolve(string{result.value()});
2662+
});
2663+
return f.Get();
2664+
};
2665+
string page = visit(Overloaded{[](const string& s) { return s; }, page_reader},
2666+
small_items_pages_[offset]);
2667+
2668+
for (Item* item : node.mapped()) {
2669+
RdbTieredSegment segment = get<RdbTieredSegment>(item->val.obj);
2670+
2671+
CompactObj co;
2672+
co.SetEncodingMask(segment.enc_mask);
2673+
co.Materialize({page.data() + (segment.offset - offset), segment.length}, true);
2674+
2675+
VLOG(0) << "Loaded " << co.ToString();
2676+
2677+
base::PODArray<char> arr(co.Size(), nullptr);
2678+
co.GetString(arr.data());
2679+
2680+
item->val.rdb_type = RDB_TYPE_STRING;
2681+
item->val.obj = std::move(arr);
2682+
Add(item);
2683+
}
2684+
}
2685+
}
2686+
25622687
void RdbLoader::LoadSearchIndexDefFromAux(string&& def) {
25632688
facade::CapturingReplyBuilder crb{};
25642689
ConnectionContext cntx{nullptr, nullptr, &crb};

src/server/rdb_load.h

+30-2
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,9 @@
33
//
44
#pragma once
55

6+
#include <absl/container/btree_set.h>
7+
#include <absl/container/flat_hash_map.h>
8+
69
#include <system_error>
710

811
extern "C" {
@@ -15,6 +18,7 @@ extern "C" {
1518
#include "io/io_buf.h"
1619
#include "server/common.h"
1720
#include "server/journal/serializer.h"
21+
#include "server/tiering/common.h"
1822

1923
namespace dfly {
2024

@@ -54,8 +58,18 @@ class RdbLoaderBase {
5458
std::vector<Filter> filters;
5559
};
5660

57-
using RdbVariant =
58-
std::variant<long long, base::PODArray<char>, LzfString, std::unique_ptr<LoadTrace>, RdbSBF>;
61+
// Reference to a value stored inside a tiered page, as read from an
// RDB_TYPE_TIERED_SEGMENT record. All fields default to zero so that a
// default-constructed segment never carries indeterminate values.
struct RdbTieredSegment {
  size_t offset = 0;     // offset of the value; the loader derives the page base from it
  size_t length = 0;     // number of bytes occupied by the value
  uint8_t enc_mask = 0;  // encoding mask applied to the CompactObj before materializing
};
65+
66+
// A tiered page paired with its offset.
// NOTE(review): no use of this type is visible in this change - confirm it is needed.
struct RdbTieredPage {
  size_t offset = 0;  // zero by default so the field is never indeterminate
  std::string blob;   // raw page contents
};
70+
71+
using RdbVariant = std::variant<long long, base::PODArray<char>, LzfString,
72+
std::unique_ptr<LoadTrace>, RdbSBF, RdbTieredSegment>;
5973

6074
struct OpaqueObj {
6175
RdbVariant obj;
@@ -148,6 +162,7 @@ class RdbLoaderBase {
148162
::io::Result<OpaqueObj> ReadRedisJson();
149163
::io::Result<OpaqueObj> ReadJson();
150164
::io::Result<OpaqueObj> ReadSBF();
165+
::io::Result<OpaqueObj> ReadTieredSegment();
151166

152167
std::error_code SkipModuleData();
153168
std::error_code HandleCompressedBlob(int op_type);
@@ -168,10 +183,13 @@ class RdbLoaderBase {
168183

169184
size_t bytes_read_ = 0;
170185
size_t source_limit_ = SIZE_MAX;
186+
171187
base::PODArray<uint8_t> compr_buf_;
172188
std::unique_ptr<DecompressImpl> decompress_impl_;
189+
173190
JournalReader journal_reader_{nullptr, 0};
174191
std::optional<uint64_t> journal_offset_ = std::nullopt;
192+
175193
RdbVersion rdb_version_ = RDB_VERSION;
176194
};
177195

@@ -259,10 +277,14 @@ class RdbLoader : protected RdbLoaderBase {
259277
void FlushShardAsync(ShardId sid);
260278
void FlushAllShards();
261279

280+
void Add(Item* item);
262281
void LoadItemsBuffer(DbIndex db_ind, const ItemsBuf& ib);
263282

264283
void LoadScriptFromAux(std::string&& value);
265284

285+
void HandleSmallItems(bool flush);
286+
std::error_code LoadTieredPage();
287+
266288
// Load index definition from RESP string describing it in FT.CREATE format,
267289
// issues an FT.CREATE call, but does not start indexing
268290
void LoadSearchIndexDefFromAux(std::string&& value);
@@ -285,6 +307,12 @@ class RdbLoader : protected RdbLoaderBase {
285307
std::function<void()> full_sync_cut_cb;
286308

287309
base::MPSCIntrusiveQueue<Item> item_queue_;
310+
311+
absl::flat_hash_map<size_t /* offset */, std::vector<Item*>> small_items_;
312+
absl::btree_set<std::pair<size_t /* num entries*/, size_t /* offset */>, std::greater<>>
313+
small_items_sizes_;
314+
absl::flat_hash_map<size_t /* offset */, std::variant<std::string, tiering::DiskSegment>>
315+
small_items_pages_;
288316
};
289317

290318
} // namespace dfly

0 commit comments

Comments
 (0)