@@ -24,11 +24,17 @@ int jl_n_sweepthreads;
 // Number of threads currently running the GC mark-loop
 _Atomic(int) gc_n_threads_marking;
 // Number of threads sweeping
-_Atomic(int) gc_n_threads_sweeping;
+_Atomic(int) gc_n_threads_sweeping_pools;
+// Number of threads sweeping stacks
+_Atomic(int) gc_n_threads_sweeping_stacks;
 // Temporary for the `ptls->gc_tls.page_metadata_allocd` used during parallel sweeping (padded to avoid false sharing)
 _Atomic(jl_gc_padded_page_stack_t *) gc_allocd_scratch;
 // `tid` of mutator thread that triggered GC
 _Atomic(int) gc_master_tid;
+// counter for sharing work when sweeping stacks
+_Atomic(int) gc_ptls_sweep_idx;
+// counter for round robin of giving back stack pages to the OS
+_Atomic(int) gc_stack_free_idx = 0;
 // `tid` of first GC thread
 int gc_first_tid;
 // Mutex/cond used to synchronize wakeup of GC threads on parallel marking
@@ -996,13 +1002,50 @@ STATIC_INLINE void gc_sweep_pool_page(gc_page_profiler_serializer_t *s, jl_gc_pa
 // sweep over all memory that is being used and not in a pool
 static void gc_sweep_other(jl_ptls_t ptls, int sweep_full) JL_NOTSAFEPOINT
 {
-    sweep_stack_pools();
     gc_sweep_foreign_objs();
     sweep_malloced_memory();
     sweep_big(ptls);
     jl_engine_sweep(gc_all_tls_states);
 }

+// wake up all threads to sweep the stacks
+void gc_sweep_wake_all_stacks(jl_ptls_t ptls) JL_NOTSAFEPOINT
+{
+    uv_mutex_lock(&gc_threads_lock);
+    int first = gc_first_parallel_collector_thread_id();
+    int last = gc_last_parallel_collector_thread_id();
+    for (int i = first; i <= last; i++) {
+        jl_ptls_t ptls2 = gc_all_tls_states[i];
+        gc_check_ptls_of_parallel_collector_thread(ptls2);
+        jl_atomic_fetch_add(&ptls2->gc_tls.gc_stack_sweep_requested, 1);
+    }
+    uv_cond_broadcast(&gc_threads_cond);
+    uv_mutex_unlock(&gc_threads_lock);
+    return;
+}
+
+void gc_sweep_wait_for_all_stacks(void) JL_NOTSAFEPOINT
+{
+    while ((jl_atomic_load_acquire(&gc_ptls_sweep_idx) >= 0) || jl_atomic_load_acquire(&gc_n_threads_sweeping_stacks) != 0) {
+        jl_cpu_pause();
+    }
+}
+
+void sweep_stack_pools(jl_ptls_t ptls) JL_NOTSAFEPOINT
+{
+    // initialize ptls index for parallel sweeping of stack pools
+    assert(gc_n_threads);
+    int stack_free_idx = jl_atomic_load_relaxed(&gc_stack_free_idx);
+    if (stack_free_idx + 1 == gc_n_threads)
+        jl_atomic_store_relaxed(&gc_stack_free_idx, 0);
+    else
+        jl_atomic_store_relaxed(&gc_stack_free_idx, stack_free_idx + 1);
+    jl_atomic_store_release(&gc_ptls_sweep_idx, gc_n_threads - 1); // idx == gc_n_threads = release stacks to the OS so it's serial
+    gc_sweep_wake_all_stacks(ptls);
+    sweep_stack_pool_loop();
+    gc_sweep_wait_for_all_stacks();
+}
+
 static void gc_pool_sync_nfree(jl_gc_pagemeta_t *pg, jl_taggedvalue_t *last) JL_NOTSAFEPOINT
 {
     assert(pg->fl_begin_offset != UINT16_MAX);
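The body of `sweep_stack_pool_loop` is outside this excerpt. A minimal sketch of the worker side implied by the counters above — hypothetical code, assuming a per-thread helper `sweep_stack_pool(ptls2, free_stacks)` that does the actual freeing — could look like:

// Hypothetical sketch of sweep_stack_pool_loop (the real body is not in this
// diff): workers claim one ptls index at a time and sweep its stack pool.
void sweep_stack_pool_loop(void) JL_NOTSAFEPOINT
{
    jl_atomic_fetch_add(&gc_n_threads_sweeping_stacks, 1);
    while (1) {
        // claim the next index; once it drops below zero, all pools are taken
        int i = jl_atomic_fetch_add(&gc_ptls_sweep_idx, -1);
        if (i < 0)
            break;
        jl_ptls_t ptls2 = gc_all_tls_states[i];
        if (ptls2 == NULL)
            continue;
        // assumed per-thread helper; only the round-robin winner returns its
        // free stack pages to the OS this collection, so that work stays serial
        int free_stacks = (i == jl_atomic_load_relaxed(&gc_stack_free_idx));
        sweep_stack_pool(ptls2, free_stacks);
    }
    jl_atomic_fetch_add(&gc_n_threads_sweeping_stacks, -1);
}

Each fetch-add on `gc_ptls_sweep_idx` hands out exactly one thread's stack pool, so workers share the work without locking, and `gc_n_threads_sweeping_stacks` is what `gc_sweep_wait_for_all_stacks()` polls to detect completion.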
@@ -1078,7 +1121,7 @@ int gc_sweep_prescan(jl_ptls_t ptls, jl_gc_padded_page_stack_t *new_gc_allocd_sc
 }

 // wake up all threads to sweep the pages
-void gc_sweep_wake_all(jl_ptls_t ptls, jl_gc_padded_page_stack_t *new_gc_allocd_scratch)
+void gc_sweep_wake_all_pages(jl_ptls_t ptls, jl_gc_padded_page_stack_t *new_gc_allocd_scratch)
 {
     int parallel_sweep_worthwhile = gc_sweep_prescan(ptls, new_gc_allocd_scratch);
     if (parallel_sweep_worthwhile && !page_profile_enabled) {
@@ -1114,18 +1157,18 @@ void gc_sweep_wake_all(jl_ptls_t ptls, jl_gc_padded_page_stack_t *new_gc_allocd_
 }

 // wait for all threads to finish sweeping
-void gc_sweep_wait_for_all(void)
+void gc_sweep_wait_for_all_pages(void)
 {
     jl_atomic_store(&gc_allocd_scratch, NULL);
-    while (jl_atomic_load_acquire(&gc_n_threads_sweeping) != 0) {
+    while (jl_atomic_load_acquire(&gc_n_threads_sweeping_pools) != 0) {
         jl_cpu_pause();
     }
 }

 // sweep all pools
 void gc_sweep_pool_parallel(jl_ptls_t ptls)
 {
-    jl_atomic_fetch_add(&gc_n_threads_sweeping, 1);
+    jl_atomic_fetch_add(&gc_n_threads_sweeping_pools, 1);
     jl_gc_padded_page_stack_t *allocd_scratch = jl_atomic_load(&gc_allocd_scratch);
     if (allocd_scratch != NULL) {
         gc_page_profiler_serializer_t serializer = gc_page_serializer_create();
@@ -1170,7 +1213,7 @@ void gc_sweep_pool_parallel(jl_ptls_t ptls)
         }
         gc_page_serializer_destroy(&serializer);
     }
-    jl_atomic_fetch_add(&gc_n_threads_sweeping, -1);
+    jl_atomic_fetch_add(&gc_n_threads_sweeping_pools, -1);
 }

 // free all pages (i.e. through `madvise` on Linux) that were lazily freed
@@ -1260,9 +1303,9 @@ static void gc_sweep_pool(void)
     // the actual sweeping
     jl_gc_padded_page_stack_t *new_gc_allocd_scratch = (jl_gc_padded_page_stack_t *)calloc_s(n_threads * sizeof(jl_gc_padded_page_stack_t));
     jl_ptls_t ptls = jl_current_task->ptls;
-    gc_sweep_wake_all(ptls, new_gc_allocd_scratch);
+    gc_sweep_wake_all_pages(ptls, new_gc_allocd_scratch);
     gc_sweep_pool_parallel(ptls);
-    gc_sweep_wait_for_all();
+    gc_sweep_wait_for_all_pages();

     // reset half-pages pointers
     for (int t_i = 0; t_i < n_threads; t_i++) {
@@ -3073,6 +3116,11 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection)
 #endif
     current_sweep_full = sweep_full;
     sweep_weak_refs();
+    uint64_t stack_pool_time = jl_hrtime();
+    sweep_stack_pools(ptls);
+    stack_pool_time = jl_hrtime() - stack_pool_time;
+    gc_num.total_stack_pool_sweep_time += stack_pool_time;
+    gc_num.stack_pool_sweep_time = stack_pool_time;
     gc_sweep_other(ptls, sweep_full);
     gc_scrub();
     gc_verify_tags();
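The two timing fields written above are not declared in this file; presumably the commit also extends `jl_gc_num_t` in the GC header (not part of this excerpt) along these lines:

// Hypothetical additions to jl_gc_num_t (field names taken from the hunk
// above; the actual header change is not shown in this excerpt):
uint64_t stack_pool_sweep_time;       // duration of the most recent stack sweep
uint64_t total_stack_pool_sweep_time; // accumulated across all collections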
@@ -3491,6 +3539,10 @@ STATIC_INLINE int may_sweep(jl_ptls_t ptls) JL_NOTSAFEPOINT
     return (jl_atomic_load(&ptls->gc_tls.gc_sweeps_requested) > 0);
 }

+STATIC_INLINE int may_sweep_stack(jl_ptls_t ptls) JL_NOTSAFEPOINT
+{
+    return (jl_atomic_load(&ptls->gc_tls.gc_stack_sweep_requested) > 0);
+}
 // parallel gc thread function
 void jl_parallel_gc_threadfun(void *arg)
 {
@@ -3513,12 +3565,17 @@ void jl_parallel_gc_threadfun(void *arg)

     while (1) {
         uv_mutex_lock(&gc_threads_lock);
-        while (!may_mark() && !may_sweep(ptls)) {
+        while (!may_mark() && !may_sweep(ptls) && !may_sweep_stack(ptls)) {
             uv_cond_wait(&gc_threads_cond, &gc_threads_lock);
         }
         uv_mutex_unlock(&gc_threads_lock);
         assert(jl_atomic_load_relaxed(&ptls->gc_state) == JL_GC_PARALLEL_COLLECTOR_THREAD);
         gc_mark_loop_parallel(ptls, 0);
+        if (may_sweep_stack(ptls)) {
+            assert(jl_atomic_load_relaxed(&ptls->gc_state) == JL_GC_PARALLEL_COLLECTOR_THREAD);
+            sweep_stack_pool_loop();
+            jl_atomic_fetch_add(&ptls->gc_tls.gc_stack_sweep_requested, -1);
+        }
         if (may_sweep(ptls)) {
             assert(jl_atomic_load_relaxed(&ptls->gc_state) == JL_GC_PARALLEL_COLLECTOR_THREAD);
             gc_sweep_pool_parallel(ptls);
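Putting the hunks together, one stack-sweep cycle reads as follows — a condensed restatement of the code above, not new API:

// On the coordinating thread, during _jl_gc_collect:
sweep_stack_pools(ptls);
//   advances gc_stack_free_idx (round robin of who releases pages to the OS),
//   publishes gc_ptls_sweep_idx = gc_n_threads - 1,
//   gc_sweep_wake_all_stacks(ptls);  // bump gc_stack_sweep_requested, broadcast
//   sweep_stack_pool_loop();         // the coordinator helps with the sweep
//   gc_sweep_wait_for_all_stacks();  // spin until the index is exhausted
//                                    // and no worker is still sweeping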