Commit f7e897a

WIP: DecayRange
Implementation of a range that gradually releases memory back to the OS. Allocation pulls memory quickly, but dealloc_range caches the memory locally and uses PAL timers to release it back to the next-level range once sufficient time has passed. TODO: codify that the parent range needs to be concurrency-safe.
1 parent 6ad7f65 commit f7e897a
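
The epoch-based decay described above can be illustrated with a minimal standalone sketch, shown below. This is not snmalloc code: DecayCacheSketch, dealloc and tick are hypothetical names, and the real implementation (decayrange.h below) keeps one concurrent stack per size class and epoch. Deallocations land in the current epoch's cache; each timer tick flushes the slot that is about to become current again, so an untouched chunk is handed back to the parent range after NUM_EPOCHS - 1 to NUM_EPOCHS periods (1500-2000ms with a 500ms period and 4 epochs).

// Minimal standalone sketch of the epoch-based decay idea; illustrative
// only, not part of this commit.
#include <array>
#include <cstddef>
#include <iostream>
#include <vector>

constexpr std::size_t NUM_EPOCHS = 4; // matches the constant in decayrange.h

struct DecayCacheSketch
{
  // One cache per epoch; the real code keeps one stack per (sizeclass, epoch).
  std::array<std::vector<int>, NUM_EPOCHS> cache{};
  std::size_t epoch = 0;

  // Deallocations are cached in the current epoch.
  void dealloc(int chunk)
  {
    cache[epoch].push_back(chunk);
  }

  // Called by the periodic timer (PERIOD = 500ms in the patch): flush the
  // slot that is about to become current, i.e. anything untouched for the
  // last NUM_EPOCHS - 1 ticks, then advance the epoch.
  void tick(std::vector<int>& returned_to_parent)
  {
    std::size_t new_epoch = (epoch + 1) % NUM_EPOCHS;
    for (int chunk : cache[new_epoch])
      returned_to_parent.push_back(chunk);
    cache[new_epoch].clear();
    epoch = new_epoch;
  }
};

int main()
{
  DecayCacheSketch d;
  std::vector<int> released;
  d.dealloc(42); // cached in epoch 0
  for (std::size_t t = 0; t < NUM_EPOCHS; t++)
    d.tick(released); // chunk 42 comes back on the fourth tick (1.5-2s later)
  std::cout << "released " << released.size() << " chunk(s)\n";
}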

File tree

2 files changed: +298 -4 lines


src/backend/backend.h (+8 -4)
@@ -5,6 +5,7 @@
 #include "chunkallocator.h"
 #include "commitrange.h"
 #include "commonconfig.h"
+#include "decayrange.h"
 #include "empty_range.h"
 #include "globalrange.h"
 #include "largebuddyrange.h"
@@ -144,9 +145,10 @@ namespace snmalloc
     using GlobalR = GlobalRange<StatsR>;
 
 # ifdef SNMALLOC_META_PROTECTED
+    using CommittedRange =
+      DecayRange<CommitRange<GlobalR, DefaultPal>, DefaultPal, Pagemap>;
     // Source for object allocations
-    using ObjectRange =
-      LargeBuddyRange<CommitRange<GlobalR, DefaultPal>, 21, 21, Pagemap>;
+    using ObjectRange = LargeBuddyRange<CommittedRange, 21, 21, Pagemap>;
     // Set up protected range for metadata
     using SubR = CommitRange<SubRange<GlobalR, DefaultPal, 6>, DefaultPal>;
     using MetaRange =
@@ -155,8 +157,10 @@ namespace snmalloc
 # else
     // Source for object allocations and metadata
     // No separation between the two
-    using ObjectRange = SmallBuddyRange<
-      LargeBuddyRange<CommitRange<GlobalR, DefaultPal>, 21, 21, Pagemap>>;
+    using CommittedRange =
+      DecayRange<CommitRange<GlobalR, DefaultPal>, DefaultPal, Pagemap>;
+    using ObjectRange =
+      SmallBuddyRange<LargeBuddyRange<CommittedRange, 21, 21, Pagemap>>;
     using GlobalMetaRange = GlobalRange<ObjectRange>;
 # endif
 #endif

src/backend/decayrange.h (+290)
@@ -0,0 +1,290 @@
#pragma once

#include "../ds/ptrwrap.h"
#include "../pal/pal_ds.h"
#include "largebuddyrange.h"

namespace snmalloc
{
  template<typename Rep>
  class PagemapList
  {
    uintptr_t head = 0;

    PagemapList(uintptr_t head) : head(head) {}

  public:
    constexpr PagemapList() = default;

    bool is_empty() const
    {
      return head == 0;
    }

    PagemapList get_next()
    {
      SNMALLOC_ASSERT(!is_empty());
      auto next_field = &(Rep::ref(false, head));
      auto next = Rep::get(next_field);
      return {next};
    }

    capptr::Chunk<void> get_capability()
    {
      return capptr::Chunk<void>(reinterpret_cast<void*>(head));
    }

    PagemapList cons(capptr::Chunk<void> new_head_cap)
    {
      auto new_head = new_head_cap.unsafe_uintptr();
      auto field = &(Rep::ref(false, new_head));
      Rep::set(field, head);
      return {new_head};
    }
  };

  /**
   * Concurrent Stack
   *
   * This stack supports the following clients
   *   (push|pop)* || pop_all* || ... || pop_all*
   *
   * That is, a single thread that can push and pop, and other threads
   * that can call pop_all.  pop_all, if it returns a value, returns the
   * whole stack; however, it may return nullptr if it races with either
   * a push or a pop.
   *
   * The primary use case is single-threaded access, where other threads
   * can attempt to steal all the values.
   */
  template<typename Rep>
  class PagemapStack
  {
    static constexpr auto empty = PagemapList<Rep>{};

  private:
    alignas(CACHELINE_SIZE) std::atomic<PagemapList<Rep>> stack{};

    PagemapList<Rep> take()
    {
      if (stack.load(std::memory_order_relaxed).is_empty())
        return empty;
      return stack.exchange(empty, std::memory_order_acquire);
    }

    void replace(PagemapList<Rep> new_head)
    {
      SNMALLOC_ASSERT(stack.load().is_empty());
      stack.store(new_head, std::memory_order_release);
    }

  public:
    constexpr PagemapStack() = default;

    void push(capptr::Chunk<void> new_head_cap)
    {
      auto old_head = take();
      auto new_head = old_head.cons(new_head_cap);
      replace(new_head);
    }

    capptr::Chunk<void> pop()
    {
      auto old_head = take();
      if (old_head.is_empty())
        return nullptr;

      auto next = old_head.get_next();
      auto result = old_head.get_capability();

      replace(next);
      return result;
    }

    PagemapList<Rep> pop_all()
    {
      return take();
    }
  };

  template<typename ParentRange, typename PAL, typename Pagemap>
  class DecayRange
  {
    typename ParentRange::State parent{};

    /**
     * How many slab sizes can be provided.
     */
    static constexpr size_t NUM_SLAB_SIZES =
      Pal::address_bits - MIN_CHUNK_BITS;

    /**
     * Number of free stacks per chunk size that each allocator will use.
     * For performance, ideally a power of 2.  We will return to the
     * central pool anything that has not been used in the last
     * NUM_EPOCHS - 1 epochs, where each epoch is separated by
     * DecayMemoryTimerObject::PERIOD.
     * I.e. if the period is 500ms and the number of epochs is 4, then we
     * will return to the central pool anything not used for the last
     * 1500-2000ms.
     */
    static constexpr size_t NUM_EPOCHS = 4;
    static_assert(bits::is_pow2(NUM_EPOCHS), "Code assumes power of two.");

    /**
     * Stacks of ranges that have been returned for reuse, indexed by
     * slab sizeclass and epoch.
     */
    ModArray<
      NUM_SLAB_SIZES,
      ModArray<NUM_EPOCHS, PagemapStack<BuddyChunkRep<Pagemap>>>>
      chunk_stack;

    /**
     * The current epoch in which to place dealloced chunks, and the
     * first place we look when allocating chunks.
     */
    static inline // alignas(CACHELINE_SIZE)
      std::atomic<size_t> epoch{0};

    /**
     * Flag to ensure one-shot registration of the timer with the PAL.
     */
    static inline std::atomic_bool registered_timer{false};

    /**
     * Flag to ensure this range is only added to all_local once.
     */
    std::atomic_bool registered_local{false};

    /**
     * All activated DecayRanges.
     */
    static inline std::atomic<DecayRange*> all_local{nullptr};

    DecayRange* next{nullptr};

    static void handle_decay_tick()
    {
      auto new_epoch = (epoch + 1) % NUM_EPOCHS;
      // Flush the old index for all threads.
      auto curr = all_local.load(std::memory_order_acquire);
      while (curr != nullptr)
      {
        for (size_t sc = 0; sc < NUM_SLAB_SIZES; sc++)
        {
          // Don't use ChunkRecord; store in the pagemap.
          auto old_stack = curr->chunk_stack[sc][new_epoch].pop_all();
          while (!old_stack.is_empty())
          {
            auto next = old_stack.get_next();

            curr->parent->dealloc_range(
              old_stack.get_capability(), MIN_CHUNK_SIZE << sc);

            old_stack = next;
          }
        }
        curr = curr->next;
      }

      // Advance the current index.
      epoch = new_epoch;
    }

    class DecayMemoryTimerObject : public PalTimerObject
    {
      /**
       * Method for the callback object to perform lazy decommit.
       */
      static void process(PalTimerObject*)
      {
        handle_decay_tick();
      }

      // Process a decay tick every 500ms.
      static constexpr size_t PERIOD = 500;

    public:
      constexpr DecayMemoryTimerObject() : PalTimerObject(&process, PERIOD) {}
    };

    static inline DecayMemoryTimerObject timer_object;

  public:
    class State
    {
      DecayRange commit_range{};

    public:
      constexpr State() = default;

      DecayRange* operator->()
      {
        return &commit_range;
      }
    };

    static constexpr bool Aligned = ParentRange::Aligned;

    constexpr DecayRange() = default;

    capptr::Chunk<void> alloc_range(size_t size)
    {
      // Check the local cache first.
      if constexpr (pal_supports<Time, PAL>)
      {
        auto slab_sizeclass = bits::next_pow2_bits(size) - MIN_CHUNK_BITS;
        // Try the local cache of chunks, newest epoch first.
        for (size_t e = 0; e < NUM_EPOCHS; e++)
        {
          auto p = chunk_stack[slab_sizeclass][(epoch - e) % NUM_EPOCHS].pop();

          if (p != nullptr)
            return p;
        }
      }

      capptr::Chunk<void> result;
      for (auto i = NUM_EPOCHS + 2; i > 0; i--)
      {
        // Nothing in the local cache, so allocate from the parent.
        result = parent->alloc_range(size);
        if (result != nullptr)
        {
          return result;
        }

        // We have run out of memory.
        handle_decay_tick(); // Try to free some memory.
      }

      return result;
    }

    void dealloc_range(capptr::Chunk<void> base, size_t size)
    {
      if constexpr (!pal_supports<Time, PAL>)
      {
        // Without a timer we cannot decay, so return directly to the parent.
        parent->dealloc_range(base, size);
        return;
      }

      if (!registered_timer.exchange(true))
      {
        // Register the timer with the PAL.
        PAL::register_timer(&timer_object);
      }

      // Check that this range is registered in the global list.
      if (!registered_local.exchange(true))
      {
        // Add to the list of local states.
        auto* head = all_local.load();
        do
        {
          next = head;
        } while (!all_local.compare_exchange_strong(head, this));
      }

      auto slab_sizeclass = bits::next_pow2_bits(size) - MIN_CHUNK_BITS;
      // Add to the local cache.
      chunk_stack[slab_sizeclass][epoch].push(base);
    }
  };
} // namespace snmalloc
