Commit df5f437

Implement parallel sweeping of stack pools (#55643)
Also use a round robin so that only one thread at a time returns stacks to the OS, to avoid contention on munmap syscalls. Using https://github.com/gbaraldi/cilkbench_julia/blob/main/cilk5julia/nqueens.jl as a benchmark, wall time is about 12% faster. This benchmark has other odd behaviours, especially when single threaded: it calls `wait` thousands of times per second, and when single threaded every one of those calls does a `jl_process_events` call, which is a syscall plus a preemption point, so it looks like a hang. With threads the issue isn't there. The idea behind the round robin is twofold: first, we are simply freeing too much, and after talking with vtjnash we may want less aggressive behaviour; second, munmap takes a lock in most OSs, so doing it in parallel has severe negative scaling.
1 parent b7b79eb commit df5f437
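To make the commit message concrete, here is a minimal standalone sketch (not the Julia runtime code; the names `sweep_idx`, `stack_free_idx`, `freed_counts`, and the pool/worker counts are invented for the example). It shows the two mechanisms combined here: worker threads claim per-thread pools by atomically decrementing a shared index, and a round-robin counter picks the single pool that is allowed to return its cached stacks to the OS in a given cycle, so the munmap traffic is not issued from many threads in parallel.

```c
/* Standalone sketch of the work-sharing + round-robin idea (illustrative only).
 * Compile: cc -std=c11 -pthread sketch.c */
#include <stdatomic.h>
#include <stdio.h>
#include <pthread.h>

#define N_POOLS 8

static atomic_int sweep_idx;       /* next pool index to claim (counts down) */
static atomic_int stack_free_idx;  /* which pool may release memory this cycle */
static int freed_counts[N_POOLS];  /* which pools released stacks this cycle */

static void *sweep_loop(void *arg)
{
    (void)arg;
    while (1) {
        int i = atomic_fetch_add_explicit(&sweep_idx, -1, memory_order_relaxed);
        if (i < 0)
            break;                 /* no pools left to sweep */
        /* every claimed pool gets swept (bookkeeping not shown)... */
        if (i == atomic_load_explicit(&stack_free_idx, memory_order_relaxed)) {
            /* ...but only the round-robin winner unmaps stacks this cycle */
            freed_counts[i] = 1;   /* stands in for munmap of half the pool */
        }
    }
    return NULL;
}

int main(void)
{
    for (int cycle = 0; cycle < 3; cycle++) {
        /* advance the round robin and reset the shared work index */
        atomic_store(&stack_free_idx,
                     (atomic_load(&stack_free_idx) + 1) % N_POOLS);
        atomic_store(&sweep_idx, N_POOLS - 1);

        pthread_t workers[4];
        for (int t = 0; t < 4; t++)
            pthread_create(&workers[t], NULL, sweep_loop, NULL);
        for (int t = 0; t < 4; t++)
            pthread_join(workers[t], NULL);

        for (int p = 0; p < N_POOLS; p++) {
            if (freed_counts[p])
                printf("cycle %d: pool %d returned stacks to the OS\n", cycle, p);
            freed_counts[p] = 0;
        }
    }
    return 0;
}
```

The actual implementation is in `sweep_stack_pool_loop` (src/gc-stacks.c) and `sweep_stack_pools` (src/gc-stock.c) in the diff below.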

7 files changed (+109, -36 lines)


base/timing.jl

Lines changed: 2 additions & 0 deletions
@@ -22,8 +22,10 @@ struct GC_Num
     total_time_to_safepoint ::Int64
     sweep_time ::Int64
     mark_time ::Int64
+    stack_pool_sweep_time ::Int64
     total_sweep_time ::Int64
     total_mark_time ::Int64
+    total_stack_pool_sweep_time::Int64
     last_full_sweep ::Int64
     last_incremental_sweep ::Int64
 end

src/Makefile

Lines changed: 1 addition & 0 deletions
@@ -318,6 +318,7 @@ $(BUILDDIR)/debuginfo.o $(BUILDDIR)/debuginfo.dbg.obj: $(addprefix $(SRCDIR)/,de
 $(BUILDDIR)/disasm.o $(BUILDDIR)/disasm.dbg.obj: $(SRCDIR)/debuginfo.h $(SRCDIR)/processor.h
 $(BUILDDIR)/gc-debug.o $(BUILDDIR)/gc-debug.dbg.obj: $(SRCDIR)/gc-common.h $(SRCDIR)/gc-stock.h
 $(BUILDDIR)/gc-pages.o $(BUILDDIR)/gc-pages.dbg.obj: $(SRCDIR)/gc-common.h $(SRCDIR)/gc-stock.h
+$(BUILDDIR)/gc-stacks.o $(BUILDDIR)/gc-stacks.dbg.obj: $(SRCDIR)/gc-common.h $(SRCDIR)/gc-stock.h
 $(BUILDDIR)/gc-stock.o $(BUILDDIR)/gc.dbg.obj: $(SRCDIR)/gc-common.h $(SRCDIR)/gc-stock.h $(SRCDIR)/gc-heap-snapshot.h $(SRCDIR)/gc-alloc-profiler.h $(SRCDIR)/gc-page-profiler.h
 $(BUILDDIR)/gc-heap-snapshot.o $(BUILDDIR)/gc-heap-snapshot.dbg.obj: $(SRCDIR)/gc-heap-snapshot.h
 $(BUILDDIR)/gc-alloc-profiler.o $(BUILDDIR)/gc-alloc-profiler.dbg.obj: $(SRCDIR)/gc-alloc-profiler.h

src/gc-interface.h

Lines changed: 2 additions & 0 deletions
@@ -44,8 +44,10 @@ typedef struct {
     uint64_t total_time_to_safepoint;
     uint64_t sweep_time;
     uint64_t mark_time;
+    uint64_t stack_pool_sweep_time;
     uint64_t total_sweep_time;
     uint64_t total_mark_time;
+    uint64_t total_stack_pool_sweep_time;
     uint64_t last_full_sweep;
     uint64_t last_incremental_sweep;
 } jl_gc_num_t;

src/gc-stacks.c

Lines changed: 31 additions & 24 deletions
@@ -1,6 +1,7 @@
 // This file is a part of Julia. License is MIT: https://julialang.org/license
 
 #include "gc-common.h"
+#include "gc-stock.h"
 #include "threading.h"
 #ifndef _OS_WINDOWS_
 # include <sys/resource.h>
@@ -202,7 +203,7 @@ JL_DLLEXPORT void *jl_malloc_stack(size_t *bufsz, jl_task_t *owner) JL_NOTSAFEPO
     return stk;
 }
 
-void sweep_stack_pools(void) JL_NOTSAFEPOINT
+void sweep_stack_pool_loop(void) JL_NOTSAFEPOINT
 {
     // Stack sweeping algorithm:
     //    // deallocate stacks if we have too many sitting around unused
@@ -215,33 +216,38 @@ void sweep_stack_pools(void) JL_NOTSAFEPOINT
     //        bufsz = t->bufsz
     //        if (stkbuf)
     //            push(free_stacks[sz], stkbuf)
-    assert(gc_n_threads);
-    for (int i = 0; i < gc_n_threads; i++) {
+    jl_atomic_fetch_add(&gc_n_threads_sweeping_stacks, 1);
+    while (1) {
+        int i = jl_atomic_fetch_add_relaxed(&gc_ptls_sweep_idx, -1);
+        if (i < 0)
+            break;
         jl_ptls_t ptls2 = gc_all_tls_states[i];
         if (ptls2 == NULL)
             continue;
-
+        assert(gc_n_threads);
         // free half of stacks that remain unused since last sweep
-        for (int p = 0; p < JL_N_STACK_POOLS; p++) {
-            small_arraylist_t *al = &ptls2->gc_tls.heap.free_stacks[p];
-            size_t n_to_free;
-            if (jl_atomic_load_relaxed(&ptls2->current_task) == NULL) {
-                n_to_free = al->len; // not alive yet or dead, so it does not need these anymore
-            }
-            else if (al->len > MIN_STACK_MAPPINGS_PER_POOL) {
-                n_to_free = al->len / 2;
-                if (n_to_free > (al->len - MIN_STACK_MAPPINGS_PER_POOL))
-                    n_to_free = al->len - MIN_STACK_MAPPINGS_PER_POOL;
-            }
-            else {
-                n_to_free = 0;
-            }
-            for (int n = 0; n < n_to_free; n++) {
-                void *stk = small_arraylist_pop(al);
-                free_stack(stk, pool_sizes[p]);
-            }
-            if (jl_atomic_load_relaxed(&ptls2->current_task) == NULL) {
-                small_arraylist_free(al);
+        if (i == jl_atomic_load_relaxed(&gc_stack_free_idx)) {
+            for (int p = 0; p < JL_N_STACK_POOLS; p++) {
+                small_arraylist_t *al = &ptls2->gc_tls.heap.free_stacks[p];
+                size_t n_to_free;
+                if (jl_atomic_load_relaxed(&ptls2->current_task) == NULL) {
+                    n_to_free = al->len; // not alive yet or dead, so it does not need these anymore
+                }
+                else if (al->len > MIN_STACK_MAPPINGS_PER_POOL) {
+                    n_to_free = al->len / 2;
+                    if (n_to_free > (al->len - MIN_STACK_MAPPINGS_PER_POOL))
+                        n_to_free = al->len - MIN_STACK_MAPPINGS_PER_POOL;
+                }
+                else {
+                    n_to_free = 0;
+                }
+                for (int n = 0; n < n_to_free; n++) {
+                    void *stk = small_arraylist_pop(al);
+                    free_stack(stk, pool_sizes[p]);
+                }
+                if (jl_atomic_load_relaxed(&ptls2->current_task) == NULL) {
+                    small_arraylist_free(al);
+                }
             }
         }
         if (jl_atomic_load_relaxed(&ptls2->current_task) == NULL) {
@@ -287,6 +293,7 @@ void sweep_stack_pools(void) JL_NOTSAFEPOINT
         }
         live_tasks->len -= ndel;
     }
+    jl_atomic_fetch_add(&gc_n_threads_sweeping_stacks, -1);
 }
 
 JL_DLLEXPORT jl_array_t *jl_live_tasks(void)
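A note on the `n_to_free` computation kept inside the new `if (i == gc_stack_free_idx)` branch: the selected pool frees half of its cached stacks but never drops below `MIN_STACK_MAPPINGS_PER_POOL`, and a thread that has no `current_task` (not alive yet, or dead) gives everything back. A small sketch of just that arithmetic, with an assumed floor of 5 purely for the example (the real constant is defined elsewhere in the GC sources):

```c
/* Sketch of the "free half, keep a floor" heuristic; names and the floor
 * value are assumptions for illustration. */
#include <stdio.h>
#include <stddef.h>

#define MIN_STACK_MAPPINGS_PER_POOL 5  /* assumed value for this example */

static size_t n_stacks_to_free(size_t len, int thread_is_dead)
{
    if (thread_is_dead)
        return len;                    /* owner gone: release everything */
    if (len <= MIN_STACK_MAPPINGS_PER_POOL)
        return 0;                      /* at or below the floor: keep them all */
    size_t n = len / 2;                /* otherwise free half... */
    if (n > len - MIN_STACK_MAPPINGS_PER_POOL)
        n = len - MIN_STACK_MAPPINGS_PER_POOL;  /* ...never dipping below the floor */
    return n;
}

int main(void)
{
    size_t examples[] = {0, 4, 5, 6, 9, 20};
    for (size_t i = 0; i < sizeof examples / sizeof examples[0]; i++)
        printf("len=%2zu -> free %zu\n", examples[i],
               n_stacks_to_free(examples[i], 0));
    return 0;
}
```

For example, a pool of 6 frees 1 stack (leaving the floor of 5), while a pool of 20 frees 10.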

src/gc-stock.c

Lines changed: 67 additions & 10 deletions
@@ -24,11 +24,17 @@ int jl_n_sweepthreads;
 // Number of threads currently running the GC mark-loop
 _Atomic(int) gc_n_threads_marking;
 // Number of threads sweeping
-_Atomic(int) gc_n_threads_sweeping;
+_Atomic(int) gc_n_threads_sweeping_pools;
+// Number of threads sweeping stacks
+_Atomic(int) gc_n_threads_sweeping_stacks;
 // Temporary for the `ptls->gc_tls.page_metadata_allocd` used during parallel sweeping (padded to avoid false sharing)
 _Atomic(jl_gc_padded_page_stack_t *) gc_allocd_scratch;
 // `tid` of mutator thread that triggered GC
 _Atomic(int) gc_master_tid;
+// counter for sharing work when sweeping stacks
+_Atomic(int) gc_ptls_sweep_idx;
+// counter for round robin of giving back stack pages to the OS
+_Atomic(int) gc_stack_free_idx = 0;
 // `tid` of first GC thread
 int gc_first_tid;
 // Mutex/cond used to synchronize wakeup of GC threads on parallel marking
@@ -996,13 +1002,50 @@ STATIC_INLINE void gc_sweep_pool_page(gc_page_profiler_serializer_t *s, jl_gc_pa
 // sweep over all memory that is being used and not in a pool
 static void gc_sweep_other(jl_ptls_t ptls, int sweep_full) JL_NOTSAFEPOINT
 {
-    sweep_stack_pools();
     gc_sweep_foreign_objs();
     sweep_malloced_memory();
     sweep_big(ptls);
     jl_engine_sweep(gc_all_tls_states);
 }
 
+// wake up all threads to sweep the stacks
+void gc_sweep_wake_all_stacks(jl_ptls_t ptls) JL_NOTSAFEPOINT
+{
+    uv_mutex_lock(&gc_threads_lock);
+    int first = gc_first_parallel_collector_thread_id();
+    int last = gc_last_parallel_collector_thread_id();
+    for (int i = first; i <= last; i++) {
+        jl_ptls_t ptls2 = gc_all_tls_states[i];
+        gc_check_ptls_of_parallel_collector_thread(ptls2);
+        jl_atomic_fetch_add(&ptls2->gc_tls.gc_stack_sweep_requested, 1);
+    }
+    uv_cond_broadcast(&gc_threads_cond);
+    uv_mutex_unlock(&gc_threads_lock);
+    return;
+}
+
+void gc_sweep_wait_for_all_stacks(void) JL_NOTSAFEPOINT
+{
+    while ((jl_atomic_load_acquire(&gc_ptls_sweep_idx) >= 0 ) || jl_atomic_load_acquire(&gc_n_threads_sweeping_stacks) != 0) {
+        jl_cpu_pause();
+    }
+}
+
+void sweep_stack_pools(jl_ptls_t ptls) JL_NOTSAFEPOINT
+{
+    // initialize ptls index for parallel sweeping of stack pools
+    assert(gc_n_threads);
+    int stack_free_idx = jl_atomic_load_relaxed(&gc_stack_free_idx);
+    if (stack_free_idx + 1 == gc_n_threads)
+        jl_atomic_store_relaxed(&gc_stack_free_idx, 0);
+    else
+        jl_atomic_store_relaxed(&gc_stack_free_idx, stack_free_idx + 1);
+    jl_atomic_store_release(&gc_ptls_sweep_idx, gc_n_threads - 1); // idx == gc_n_threads = release stacks to the OS so it's serial
+    gc_sweep_wake_all_stacks(ptls);
+    sweep_stack_pool_loop();
+    gc_sweep_wait_for_all_stacks();
+}
+
 static void gc_pool_sync_nfree(jl_gc_pagemeta_t *pg, jl_taggedvalue_t *last) JL_NOTSAFEPOINT
 {
     assert(pg->fl_begin_offset != UINT16_MAX);
@@ -1078,7 +1121,7 @@ int gc_sweep_prescan(jl_ptls_t ptls, jl_gc_padded_page_stack_t *new_gc_allocd_sc
 }
 
 // wake up all threads to sweep the pages
-void gc_sweep_wake_all(jl_ptls_t ptls, jl_gc_padded_page_stack_t *new_gc_allocd_scratch)
+void gc_sweep_wake_all_pages(jl_ptls_t ptls, jl_gc_padded_page_stack_t *new_gc_allocd_scratch)
 {
     int parallel_sweep_worthwhile = gc_sweep_prescan(ptls, new_gc_allocd_scratch);
     if (parallel_sweep_worthwhile && !page_profile_enabled) {
@@ -1114,18 +1157,18 @@ void gc_sweep_wake_all(jl_ptls_t ptls, jl_gc_padded_page_stack_t *new_gc_allocd_
 }
 
 // wait for all threads to finish sweeping
-void gc_sweep_wait_for_all(void)
+void gc_sweep_wait_for_all_pages(void)
 {
     jl_atomic_store(&gc_allocd_scratch, NULL);
-    while (jl_atomic_load_acquire(&gc_n_threads_sweeping) != 0) {
+    while (jl_atomic_load_acquire(&gc_n_threads_sweeping_pools) != 0) {
         jl_cpu_pause();
     }
 }
 
 // sweep all pools
 void gc_sweep_pool_parallel(jl_ptls_t ptls)
 {
-    jl_atomic_fetch_add(&gc_n_threads_sweeping, 1);
+    jl_atomic_fetch_add(&gc_n_threads_sweeping_pools, 1);
     jl_gc_padded_page_stack_t *allocd_scratch = jl_atomic_load(&gc_allocd_scratch);
     if (allocd_scratch != NULL) {
         gc_page_profiler_serializer_t serializer = gc_page_serializer_create();
@@ -1170,7 +1213,7 @@ void gc_sweep_pool_parallel(jl_ptls_t ptls)
         }
         gc_page_serializer_destroy(&serializer);
     }
-    jl_atomic_fetch_add(&gc_n_threads_sweeping, -1);
+    jl_atomic_fetch_add(&gc_n_threads_sweeping_pools, -1);
 }
 
 // free all pages (i.e. through `madvise` on Linux) that were lazily freed
@@ -1260,9 +1303,9 @@ static void gc_sweep_pool(void)
     // the actual sweeping
     jl_gc_padded_page_stack_t *new_gc_allocd_scratch = (jl_gc_padded_page_stack_t *) calloc_s(n_threads * sizeof(jl_gc_padded_page_stack_t));
     jl_ptls_t ptls = jl_current_task->ptls;
-    gc_sweep_wake_all(ptls, new_gc_allocd_scratch);
+    gc_sweep_wake_all_pages(ptls, new_gc_allocd_scratch);
     gc_sweep_pool_parallel(ptls);
-    gc_sweep_wait_for_all();
+    gc_sweep_wait_for_all_pages();
 
     // reset half-pages pointers
     for (int t_i = 0; t_i < n_threads; t_i++) {
@@ -3073,6 +3116,11 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection)
 #endif
     current_sweep_full = sweep_full;
     sweep_weak_refs();
+    uint64_t stack_pool_time = jl_hrtime();
+    sweep_stack_pools(ptls);
+    stack_pool_time = jl_hrtime() - stack_pool_time;
+    gc_num.total_stack_pool_sweep_time += stack_pool_time;
+    gc_num.stack_pool_sweep_time = stack_pool_time;
     gc_sweep_other(ptls, sweep_full);
     gc_scrub();
     gc_verify_tags();
@@ -3491,6 +3539,10 @@ STATIC_INLINE int may_sweep(jl_ptls_t ptls) JL_NOTSAFEPOINT
     return (jl_atomic_load(&ptls->gc_tls.gc_sweeps_requested) > 0);
 }
 
+STATIC_INLINE int may_sweep_stack(jl_ptls_t ptls) JL_NOTSAFEPOINT
+{
+    return (jl_atomic_load(&ptls->gc_tls.gc_stack_sweep_requested) > 0);
+}
 // parallel gc thread function
 void jl_parallel_gc_threadfun(void *arg)
 {
@@ -3513,12 +3565,17 @@ void jl_parallel_gc_threadfun(void *arg)
 
     while (1) {
         uv_mutex_lock(&gc_threads_lock);
-        while (!may_mark() && !may_sweep(ptls)) {
+        while (!may_mark() && !may_sweep(ptls) && !may_sweep_stack(ptls)) {
            uv_cond_wait(&gc_threads_cond, &gc_threads_lock);
         }
         uv_mutex_unlock(&gc_threads_lock);
         assert(jl_atomic_load_relaxed(&ptls->gc_state) == JL_GC_PARALLEL_COLLECTOR_THREAD);
         gc_mark_loop_parallel(ptls, 0);
+        if (may_sweep_stack(ptls)) {
+            assert(jl_atomic_load_relaxed(&ptls->gc_state) == JL_GC_PARALLEL_COLLECTOR_THREAD);
+            sweep_stack_pool_loop();
+            jl_atomic_fetch_add(&ptls->gc_tls.gc_stack_sweep_requested, -1);
+        }
         if (may_sweep(ptls)) {
            assert(jl_atomic_load_relaxed(&ptls->gc_state) == JL_GC_PARALLEL_COLLECTOR_THREAD);
            gc_sweep_pool_parallel(ptls);
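The coordination pattern added above (`gc_sweep_wake_all_stacks` / `sweep_stack_pool_loop` / `gc_sweep_wait_for_all_stacks`) is: publish the work index, raise a per-thread request counter under `gc_threads_lock`, broadcast the condition variable, participate in the work on the coordinating thread, then spin until the work index is exhausted and no thread is still inside the loop. A generic, self-contained sketch of that pattern using plain pthreads and C11 atomics; all names and counts are illustrative, and `sched_yield()` stands in for `jl_cpu_pause()`:

```c
/* Sketch of the wake/participate/wait coordination pattern (illustrative only).
 * Compile: cc -std=c11 -pthread wakewait.c */
#include <pthread.h>
#include <sched.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define N_WORKERS 3
#define N_ITEMS   16

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  cond = PTHREAD_COND_INITIALIZER;
static atomic_int  work_idx = -1;         /* next item to claim, counts down */
static atomic_int  n_sweeping;            /* workers currently inside the loop */
static atomic_int  requested[N_WORKERS];  /* per-thread "sweep requested" counters */
static atomic_bool quit;
static atomic_int  done_count;

static void sweep_loop(void)              /* plays the role of sweep_stack_pool_loop */
{
    atomic_fetch_add(&n_sweeping, 1);
    while (1) {
        int i = atomic_fetch_add_explicit(&work_idx, -1, memory_order_relaxed);
        if (i < 0)
            break;
        atomic_fetch_add(&done_count, 1); /* stands in for sweeping item i */
    }
    atomic_fetch_add(&n_sweeping, -1);
}

static void *workerfun(void *arg)         /* plays the role of the GC worker thread */
{
    int id = (int)(size_t)arg;
    while (1) {
        pthread_mutex_lock(&lock);
        while (!atomic_load(&requested[id]) && !atomic_load(&quit))
            pthread_cond_wait(&cond, &lock);
        pthread_mutex_unlock(&lock);
        if (atomic_load(&quit))
            return NULL;
        sweep_loop();
        atomic_fetch_add(&requested[id], -1);
    }
}

int main(void)
{
    pthread_t tids[N_WORKERS];
    for (int i = 0; i < N_WORKERS; i++)
        pthread_create(&tids[i], NULL, workerfun, (void *)(size_t)i);

    /* coordinator: publish work, wake everyone, help out, then wait */
    atomic_store(&work_idx, N_ITEMS - 1);
    pthread_mutex_lock(&lock);
    for (int i = 0; i < N_WORKERS; i++)
        atomic_fetch_add(&requested[i], 1);
    pthread_cond_broadcast(&cond);
    pthread_mutex_unlock(&lock);
    sweep_loop();
    while (atomic_load(&work_idx) >= 0 || atomic_load(&n_sweeping) != 0)
        sched_yield();

    printf("swept %d items\n", atomic_load(&done_count));

    atomic_store(&quit, true);            /* shut the workers down */
    pthread_mutex_lock(&lock);
    pthread_cond_broadcast(&cond);
    pthread_mutex_unlock(&lock);
    for (int i = 0; i < N_WORKERS; i++)
        pthread_join(tids[i], NULL);
    return 0;
}
```

Waiting on both conditions matters: a worker may have claimed the last index but still be sweeping it, so the coordinator checks the active-worker count as well as the work index before declaring the sweep finished.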

src/gc-stock.h

Lines changed: 5 additions & 2 deletions
@@ -524,7 +524,10 @@ extern uv_mutex_t gc_threads_lock;
 extern uv_cond_t gc_threads_cond;
 extern uv_sem_t gc_sweep_assists_needed;
 extern _Atomic(int) gc_n_threads_marking;
-extern _Atomic(int) gc_n_threads_sweeping;
+extern _Atomic(int) gc_n_threads_sweeping_pools;
+extern _Atomic(int) gc_n_threads_sweeping_stacks;
+extern _Atomic(int) gc_ptls_sweep_idx;
+extern _Atomic(int) gc_stack_free_idx;
 extern _Atomic(int) n_threads_running;
 extern uv_barrier_t thread_init_done;
 void gc_mark_queue_all_roots(jl_ptls_t ptls, jl_gc_markqueue_t *mq);
@@ -535,7 +538,7 @@ void gc_mark_loop_serial(jl_ptls_t ptls);
 void gc_mark_loop_parallel(jl_ptls_t ptls, int master);
 void gc_sweep_pool_parallel(jl_ptls_t ptls);
 void gc_free_pages(void);
-void sweep_stack_pools(void) JL_NOTSAFEPOINT;
+void sweep_stack_pool_loop(void) JL_NOTSAFEPOINT;
 void jl_gc_debug_init(void);
 
 // GC pages

src/gc-tls.h

Lines changed: 1 addition & 0 deletions
@@ -82,6 +82,7 @@ typedef struct {
     jl_gc_markqueue_t mark_queue;
     jl_gc_mark_cache_t gc_cache;
     _Atomic(size_t) gc_sweeps_requested;
+    _Atomic(size_t) gc_stack_sweep_requested;
     arraylist_t sweep_objs;
 } jl_gc_tls_states_t;
 