Commit f7e897a

WIP: DecayRange
Implementation of a range that gradually releases memory back to the OS. Allocation pulls memory quickly, but dealloc_range caches the memory locally and uses PAL timers to release it back to the next-level range once sufficient time has passed. TODO: codify that the parent range needs to be concurrency-safe.
1 parent 6ad7f65 commit f7e897a
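
The epoch-based decay described above can be illustrated with a minimal standalone sketch, shown below. This is not snmalloc code: DecayCacheSketch, dealloc and tick are hypothetical names, and the real implementation (decayrange.h below) keeps one concurrent stack per size class and epoch. Deallocations land in the current epoch's cache; each timer tick flushes the slot that is about to become current again, so an untouched chunk is handed back to the parent range after NUM_EPOCHS - 1 to NUM_EPOCHS periods (1500-2000ms with a 500ms period and 4 epochs).

// Minimal standalone sketch of the epoch-based decay idea; illustrative
// only, not part of this commit.
#include <array>
#include <cstddef>
#include <iostream>
#include <vector>

constexpr std::size_t NUM_EPOCHS = 4; // matches the constant in decayrange.h

struct DecayCacheSketch
{
  // One cache per epoch; the real code keeps one stack per (sizeclass, epoch).
  std::array<std::vector<int>, NUM_EPOCHS> cache{};
  std::size_t epoch = 0;

  // Deallocations are cached in the current epoch.
  void dealloc(int chunk)
  {
    cache[epoch].push_back(chunk);
  }

  // Called by the periodic timer (PERIOD = 500ms in the patch): flush the
  // slot that is about to become current, i.e. anything untouched for the
  // last NUM_EPOCHS - 1 ticks, then advance the epoch.
  void tick(std::vector<int>& returned_to_parent)
  {
    std::size_t new_epoch = (epoch + 1) % NUM_EPOCHS;
    for (int chunk : cache[new_epoch])
      returned_to_parent.push_back(chunk);
    cache[new_epoch].clear();
    epoch = new_epoch;
  }
};

int main()
{
  DecayCacheSketch d;
  std::vector<int> released;
  d.dealloc(42); // cached in epoch 0
  for (std::size_t t = 0; t < NUM_EPOCHS; t++)
    d.tick(released); // chunk 42 comes back on the fourth tick (1.5-2s later)
  std::cout << "released " << released.size() << " chunk(s)\n";
}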

File tree

2 files changed: +298 -4 lines


src/backend/backend.h (+8 -4)
@@ -5,6 +5,7 @@
 #include "chunkallocator.h"
 #include "commitrange.h"
 #include "commonconfig.h"
+#include "decayrange.h"
 #include "empty_range.h"
 #include "globalrange.h"
 #include "largebuddyrange.h"
@@ -144,9 +145,10 @@ namespace snmalloc
     using GlobalR = GlobalRange<StatsR>;
 
 # ifdef SNMALLOC_META_PROTECTED
+    using CommittedRange =
+      DecayRange<CommitRange<GlobalR, DefaultPal>, DefaultPal, Pagemap>;
     // Source for object allocations
-    using ObjectRange =
-      LargeBuddyRange<CommitRange<GlobalR, DefaultPal>, 21, 21, Pagemap>;
+    using ObjectRange = LargeBuddyRange<CommittedRange, 21, 21, Pagemap>;
     // Set up protected range for metadata
     using SubR = CommitRange<SubRange<GlobalR, DefaultPal, 6>, DefaultPal>;
     using MetaRange =
@@ -155,8 +157,10 @@ namespace snmalloc
 # else
     // Source for object allocations and metadata
     // No separation between the two
-    using ObjectRange = SmallBuddyRange<
-      LargeBuddyRange<CommitRange<GlobalR, DefaultPal>, 21, 21, Pagemap>>;
+    using CommittedRange =
+      DecayRange<CommitRange<GlobalR, DefaultPal>, DefaultPal, Pagemap>;
+    using ObjectRange =
+      SmallBuddyRange<LargeBuddyRange<CommittedRange, 21, 21, Pagemap>>;
     using GlobalMetaRange = GlobalRange<ObjectRange>;
 # endif
 #endif

src/backend/decayrange.h (+290)
@@ -0,0 +1,290 @@
#pragma once

#include "../ds/ptrwrap.h"
#include "../pal/pal_ds.h"
#include "largebuddyrange.h"

namespace snmalloc
{
  template<typename Rep>
  class PagemapList
  {
    uintptr_t head = 0;

    PagemapList(uintptr_t head) : head(head) {}

  public:
    constexpr PagemapList() = default;

    bool is_empty() const
    {
      return head == 0;
    }

    PagemapList get_next()
    {
      SNMALLOC_ASSERT(!is_empty());
      auto next_field = &(Rep::ref(false, head));
      auto next = Rep::get(next_field);
      return {next};
    }

    capptr::Chunk<void> get_capability()
    {
      return capptr::Chunk<void>(reinterpret_cast<void*>(head));
    }

    PagemapList cons(capptr::Chunk<void> new_head_cap)
    {
      auto new_head = new_head_cap.unsafe_uintptr();
      auto field = &(Rep::ref(false, new_head));
      Rep::set(field, head);
      return {new_head};
    }
  };

  /**
   * Concurrent Stack
   *
   * This stack supports the following clients
   *   (push|pop)* || pop_all* || ... || pop_all*
   *
   * That is, a single thread that can push and pop, and other threads
   * that can call pop_all.  pop_all, if it returns a value, returns the
   * whole stack; however, it may return nullptr if it races with either
   * a push or a pop.
   *
   * The primary use case is single-threaded access, where other threads
   * can attempt to steal all the values.
   */
  template<typename Rep>
  class PagemapStack
  {
    static constexpr auto empty = PagemapList<Rep>{};

  private:
    alignas(CACHELINE_SIZE) std::atomic<PagemapList<Rep>> stack{};

    PagemapList<Rep> take()
    {
      if (stack.load(std::memory_order_relaxed).is_empty())
        return empty;
      return stack.exchange(empty, std::memory_order_acquire);
    }

    void replace(PagemapList<Rep> new_head)
    {
      SNMALLOC_ASSERT(stack.load().is_empty());
      stack.store(new_head, std::memory_order_release);
    }

  public:
    constexpr PagemapStack() = default;

    void push(capptr::Chunk<void> new_head_cap)
    {
      auto old_head = take();
      auto new_head = old_head.cons(new_head_cap);
      replace(new_head);
    }

    capptr::Chunk<void> pop()
    {
      auto old_head = take();
      if (old_head.is_empty())
        return nullptr;

      auto next = old_head.get_next();
      auto result = old_head.get_capability();

      replace(next);
      return result;
    }

    PagemapList<Rep> pop_all()
    {
      return take();
    }
  };

  template<typename ParentRange, typename PAL, typename Pagemap>
  class DecayRange
  {
    typename ParentRange::State parent{};

    /**
     * How many slab sizes can be provided.
     */
    static constexpr size_t NUM_SLAB_SIZES =
      Pal::address_bits - MIN_CHUNK_BITS;

    /**
     * Number of free stacks per chunk size that each allocator will use.
     * For performance, ideally a power of 2.  We will return to the
     * central pool anything that has not been used in the last
     * NUM_EPOCHS - 1 epochs, where each epoch is separated by
     * DecayMemoryTimerObject::PERIOD.
     * I.e. if the period is 500ms and the number of epochs is 4, then we
     * will return to the central pool anything not used for the last
     * 1500-2000ms.
     */
    static constexpr size_t NUM_EPOCHS = 4;
    static_assert(bits::is_pow2(NUM_EPOCHS), "Code assumes power of two.");

    /**
     * Stacks of ranges that have been returned for reuse, indexed by
     * slab sizeclass and epoch.
     */
    ModArray<
      NUM_SLAB_SIZES,
      ModArray<NUM_EPOCHS, PagemapStack<BuddyChunkRep<Pagemap>>>>
      chunk_stack;

    /**
     * The current epoch in which to place dealloced chunks, and the
     * first place we look when allocating chunks.
     */
    static inline // alignas(CACHELINE_SIZE)
      std::atomic<size_t> epoch{0};

    /**
     * Flag to ensure one-shot registration of the timer with the PAL.
     */
    static inline std::atomic_bool registered_timer{false};

    /**
     * Flag to ensure this range is only added to all_local once.
     */
    std::atomic_bool registered_local{false};

    /**
     * All activated DecayRanges.
     */
    static inline std::atomic<DecayRange*> all_local{nullptr};

    DecayRange* next{nullptr};

    static void handle_decay_tick()
    {
      auto new_epoch = (epoch + 1) % NUM_EPOCHS;
      // Flush the old index for all threads.
      auto curr = all_local.load(std::memory_order_acquire);
      while (curr != nullptr)
      {
        for (size_t sc = 0; sc < NUM_SLAB_SIZES; sc++)
        {
          // Don't use ChunkRecord; store in the pagemap.
          auto old_stack = curr->chunk_stack[sc][new_epoch].pop_all();
          while (!old_stack.is_empty())
          {
            auto next = old_stack.get_next();

            curr->parent->dealloc_range(
              old_stack.get_capability(), MIN_CHUNK_SIZE << sc);

            old_stack = next;
          }
        }
        curr = curr->next;
      }

      // Advance the current index.
      epoch = new_epoch;
    }

    class DecayMemoryTimerObject : public PalTimerObject
    {
      /**
       * Method for the callback object to perform lazy decommit.
       */
      static void process(PalTimerObject*)
      {
        handle_decay_tick();
      }

      // Process a decay tick every 500ms.
      static constexpr size_t PERIOD = 500;

    public:
      constexpr DecayMemoryTimerObject() : PalTimerObject(&process, PERIOD) {}
    };

    static inline DecayMemoryTimerObject timer_object;

  public:
    class State
    {
      DecayRange commit_range{};

    public:
      constexpr State() = default;

      DecayRange* operator->()
      {
        return &commit_range;
      }
    };

    static constexpr bool Aligned = ParentRange::Aligned;

    constexpr DecayRange() = default;

    capptr::Chunk<void> alloc_range(size_t size)
    {
      // Check the local cache first.
      if constexpr (pal_supports<Time, PAL>)
      {
        auto slab_sizeclass = bits::next_pow2_bits(size) - MIN_CHUNK_BITS;
        // Try the local cache of chunks, newest epoch first.
        for (size_t e = 0; e < NUM_EPOCHS; e++)
        {
          auto p = chunk_stack[slab_sizeclass][(epoch - e) % NUM_EPOCHS].pop();

          if (p != nullptr)
            return p;
        }
      }

      capptr::Chunk<void> result;
      for (auto i = NUM_EPOCHS + 2; i > 0; i--)
      {
        // Nothing in the local cache, so allocate from the parent.
        result = parent->alloc_range(size);
        if (result != nullptr)
        {
          return result;
        }

        // We have run out of memory.
        handle_decay_tick(); // Try to free some memory.
      }

      return result;
    }

    void dealloc_range(capptr::Chunk<void> base, size_t size)
    {
      if constexpr (!pal_supports<Time, PAL>)
      {
        // Without a timer we cannot decay, so return directly to the parent.
        parent->dealloc_range(base, size);
        return;
      }

      if (!registered_timer.exchange(true))
      {
        // Register the timer with the PAL.
        PAL::register_timer(&timer_object);
      }

      // Check that this range is registered in the global list.
      if (!registered_local.exchange(true))
      {
        // Add to the list of local states.
        auto* head = all_local.load();
        do
        {
          next = head;
        } while (!all_local.compare_exchange_strong(head, this));
      }

      auto slab_sizeclass = bits::next_pow2_bits(size) - MIN_CHUNK_BITS;
      // Add to the local cache.
      chunk_stack[slab_sizeclass][epoch].push(base);
    }
  };
} // namespace snmalloc
