@@ -99,9 +99,9 @@ mutable struct Worker
99
99
add_msgs:: Array{Any,1}
100
100
gcflag:: Bool
101
101
state:: WorkerState
102
- c_state:: Condition # wait for state changes
103
- ct_time:: Float64 # creation time
104
- conn_func:: Any # used to setup connections lazily
102
+ c_state:: Threads. Condition # wait for state changes, lock for state
103
+ ct_time:: Float64 # creation time
104
+ conn_func:: Any # used to setup connections lazily
105
105
106
106
r_stream:: IO
107
107
w_stream:: IO
@@ -133,7 +133,7 @@ mutable struct Worker
133
133
if haskey (map_pid_wrkr, id)
134
134
return map_pid_wrkr[id]
135
135
end
136
- w= new (id, [], [], false , W_CREATED, Condition (), time (), conn_func)
136
+ w= new (id, [], [], false , W_CREATED, Threads . Condition (), time (), conn_func)
137
137
w. initialized = Event ()
138
138
register_worker (w)
139
139
w
@@ -143,12 +143,16 @@ mutable struct Worker
143
143
end
144
144
145
145
# Record a new state on worker `w` and wake every task currently waiting on
# `w.c_state`. The field write and the notification both happen while the
# condition's lock is held; the value returned is whatever `notify` returns.
function set_worker_state(w, state)
    cond = w.c_state
    lock(cond)
    try
        w.state = state
        return notify(cond; all=true)
    finally
        # Always release the lock, even if the assignment or notify throws.
        unlock(cond)
    end
end
149
151
150
152
function check_worker_state (w:: Worker )
153
+ lock (w. c_state)
151
154
if w. state === W_CREATED
155
+ unlock (w. c_state)
152
156
if ! isclusterlazy ()
153
157
if PGRP. topology === :all_to_all
154
158
# Since higher pids connect with lower pids, the remote worker
@@ -168,6 +172,8 @@ function check_worker_state(w::Worker)
168
172
errormonitor (t)
169
173
wait_for_conn (w)
170
174
end
175
+ else
176
+ unlock (w. c_state)
171
177
end
172
178
end
173
179
@@ -186,13 +192,25 @@ function exec_conn_func(w::Worker)
186
192
end
187
193
188
194
# Block until worker `w` is no longer in the `W_CREATED` state, or raise an
# error once the remaining connection timeout has elapsed.
#
# The remaining timeout is `worker_timeout()` minus the time elapsed since
# `w.ct_time`. If it is already exhausted an error is thrown immediately.
# Otherwise a background task is spawned that sleeps for the remaining time and
# then notifies `w.c_state`, so the waiter below is guaranteed to wake up.
#
# Returns `nothing`. Throws (via `error`) if the peer has not connected in time.
function wait_for_conn(w)
    # Read the state while holding the condition's lock; the do-block form
    # releases the lock even if the body throws.
    still_created = lock(w.c_state) do
        w.state === W_CREATED
    end
    still_created || return nothing

    timeout = worker_timeout() - (time() - w.ct_time)
    timeout <= 0 && error("peer $(w.id) has not connected to $(myid())")

    # Timer task: wakes all waiters on `w.c_state` after `timeout` seconds.
    T = Threads.@spawn begin
        sleep($timeout)
        lock(w.c_state) do
            notify(w.c_state; all=true)
        end
    end
    errormonitor(T)

    lock(w.c_state) do
        # The lock was released after the first check, so the state may have
        # changed (and its notification already been delivered) in between.
        # Waiting unconditionally here would miss that wakeup and stall until
        # the timer fires, so only wait if the worker is still W_CREATED.
        if w.state === W_CREATED
            wait(w.c_state)
        end
        w.state === W_CREATED && error("peer $(w.id) didn't connect to $(myid()) within $timeout seconds")
    end
    return nothing
end
@@ -483,7 +501,10 @@ function addprocs_locked(manager::ClusterManager; kwargs...)
483
501
while true
484
502
if isempty (launched)
485
503
istaskdone (t_launch) && break
486
- @async (sleep (1 ); notify (launch_ntfy))
504
+ @async begin
505
+ sleep (1 )
506
+ notify (launch_ntfy)
507
+ end
487
508
wait (launch_ntfy)
488
509
end
489
510
@@ -636,7 +657,12 @@ function create_worker(manager, wconfig)
636
657
# require the value of config.connect_at which is set only upon connection completion
637
658
for jw in PGRP. workers
638
659
if (jw. id != 1 ) && (jw. id < w. id)
639
- (jw. state === W_CREATED) && wait (jw. c_state)
660
+ # wait for jw to join
661
+ lock (jw. c_state) do
662
+ if jw. state === W_CREATED
663
+ wait (jw. c_state)
664
+ end
665
+ end
640
666
push! (join_list, jw)
641
667
end
642
668
end
@@ -659,7 +685,12 @@ function create_worker(manager, wconfig)
659
685
end
660
686
661
687
for wl in wlist
662
- (wl. state === W_CREATED) && wait (wl. c_state)
688
+ lock (wl. c_state) do
689
+ if wl. state === W_CREATED
690
+ # wait for wl to join
691
+ wait (wl. c_state)
692
+ end
693
+ end
663
694
push! (join_list, wl)
664
695
end
665
696
end
@@ -676,7 +707,11 @@ function create_worker(manager, wconfig)
676
707
@async manage (w. manager, w. id, w. config, :register )
677
708
# wait for rr_ntfy_join with timeout
678
709
timedout = false
679
- @async (sleep ($ timeout); timedout = true ; put! (rr_ntfy_join, 1 ))
710
+ @async begin
711
+ sleep ($ timeout)
712
+ timedout = true
713
+ put! (rr_ntfy_join, 1 )
714
+ end
680
715
wait (rr_ntfy_join)
681
716
if timedout
682
717
error (" worker did not connect within $timeout seconds" )
0 commit comments