
Commit 1188d66

Make sure to first run cleanup, then kill the cleaner process
While investigating streaming RPC worker timeouts, we noticed that in our cleanup logic there is a tiny chance of leaking workers if the coordinator process is killed right while it is running cleanup: after it stops the cleaner process and before it sends the `kill_all` messages. To fix it, first run cleanup, then kill the cleaner process.
1 parent 0e674f6 commit 1188d66
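
To make the race easier to see, here is a minimal, self-contained Erlang sketch of the coordinator/cleaner arrangement described above. It is not the actual fabric_streams implementation: the module name stream_cleanup_sketch and the functions run/1, spawn_cleaner/1, and kill_workers/1 are invented for illustration, and plain exit signals stand in for the real kill messages that fabric_util:cleanup/1 sends to worker nodes. Only the cleanup/1 shape and the ?WORKER_CLEANER process-dictionary key mirror the diff below.

%% A minimal sketch of the coordinator/cleaner arrangement, for illustration
%% only. Names and message shapes are simplified stand-ins.
-module(stream_cleanup_sketch).
-export([run/1]).

-define(WORKER_CLEANER, worker_cleaner).

run(Workers) ->
    %% The cleaner monitors the coordinator (the calling process) and reaps
    %% the workers if the coordinator dies while the stream is in flight.
    put(?WORKER_CLEANER, spawn_cleaner(Workers)),
    %% ... streaming work would happen here ...
    cleanup(Workers).

%% Regular-path cleanup. The ordering is the point of the fix: kill the
%% workers first, then stop the cleaner. If the coordinator itself is killed
%% anywhere inside this function, the workers are still reaped, either by the
%% kill_workers/1 call below or by the cleaner, which stays alive until the
%% very end.
cleanup(Workers) ->
    Res = kill_workers(Workers),
    case get(?WORKER_CLEANER) of
        CleanerPid when is_pid(CleanerPid) ->
            erase(?WORKER_CLEANER),
            exit(CleanerPid, kill);
        _ ->
            ok
    end,
    Res.

spawn_cleaner(Workers) ->
    Coordinator = self(),
    spawn(fun() ->
        MRef = erlang:monitor(process, Coordinator),
        receive
            {'DOWN', MRef, process, Coordinator, _Reason} ->
                kill_workers(Workers)
        end
    end).

kill_workers(Workers) ->
    [exit(W, kill) || W <- Workers],
    ok.

With the previous ordering (stop the cleaner first, then send the kill messages), a kill signal landing between those two steps would leave the workers running with nobody left to reap them; running cleanup first closes that window.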

1 file changed
src/fabric/src/fabric_streams.erl

Lines changed: 10 additions & 3 deletions
@@ -95,16 +95,23 @@ start(Workers0, Keypos, StartFun, Replacements, RingOpts) ->
 
 cleanup(Workers) ->
     % Stop the auxiliary cleaner process as we got to the point where cleanup
-    % happesn in the regular fashion so we don't want to send 2x the number kill
-    % messages
+    % happens in the regular fashion and we don't want to send 2x the number
+    % of kill messages.
+    %
+    % First, we run the cleanup/1 function, then we stop the cleaner;
+    % otherwise there is a tiny risk we get killed after we stop the process
+    % and before we finish calling cleanup/1. This early, forced process kill
+    % may happen when running the recovery logic in the ddoc cache.
+    %
+    Res = fabric_util:cleanup(Workers),
     case get(?WORKER_CLEANER) of
         CleanerPid when is_pid(CleanerPid) ->
             erase(?WORKER_CLEANER),
             exit(CleanerPid, kill);
         _ ->
             ok
     end,
-    fabric_util:cleanup(Workers).
+    Res.
 
 handle_stream_start({rexi_DOWN, _, {_, NodeRef}, _}, _, St) ->
     #stream_acc{workers = Workers, ready = Ready, ring_opts = RingOpts} = St,
