Skip to content

Commit 2d4f19c

Browse files
committed
Skip peer discovery cleanup when backend returns error
Previously if the peer discovery backend returned an error from failing to discover nodes, the `service_discovery_nodes/0` helper returned an empty list. During cleanup this would mean that any nodes unreachable during a partition would have destructive action taken against them: `rabbit_db_cluster:forget_member/2` and `rabbit_quorum_queue:shrink_all/1`. The `list_nodes/0` callback can fail transiently, though, and a failure shouldn't mean that the cluster is empty. It's safer to avoid cleaning up any nodes when the peer discovery backend fails to return the intended set of nodes.
1 parent f11198a commit 2d4f19c

File tree

2 files changed

+64
-37
lines changed

2 files changed

+64
-37
lines changed

deps/rabbitmq_peer_discovery_common/src/rabbit_peer_discovery_cleanup.erl

Lines changed: 23 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -240,19 +240,29 @@ maybe_cleanup(State, UnreachableNodes) ->
240240
?LOG_DEBUG(
241241
"Peer discovery: cleanup discovered unreachable nodes: ~tp",
242242
[UnreachableNodes]),
243-
case lists:subtract(as_list(UnreachableNodes), as_list(service_discovery_nodes())) of
244-
[] ->
245-
?LOG_DEBUG(
246-
"Peer discovery: all unreachable nodes are still "
247-
"registered with the discovery backend ~tp",
248-
[rabbit_peer_discovery:backend()],
249-
#{domain => ?RMQLOG_DOMAIN_PEER_DISC}),
250-
ok;
251-
Nodes ->
252-
?LOG_DEBUG(
253-
"Peer discovery: unreachable nodes are not registered "
254-
"with the discovery backend ~tp", [Nodes]),
255-
maybe_remove_nodes(Nodes, State#state.warn_only)
243+
Module = rabbit_peer_discovery:backend(),
244+
case rabbit_peer_discovery:normalize(Module:list_nodes()) of
245+
{ok, {OneOrMultipleNodes, _Type}} ->
246+
DiscoveredNodes = as_list(OneOrMultipleNodes),
247+
case lists:subtract(UnreachableNodes, DiscoveredNodes) of
248+
[] ->
249+
?LOG_DEBUG(
250+
"Peer discovery: all unreachable nodes are still "
251+
"registered with the discovery backend ~tp",
252+
[rabbit_peer_discovery:backend()],
253+
#{domain => ?RMQLOG_DOMAIN_PEER_DISC}),
254+
ok;
255+
Nodes ->
256+
?LOG_DEBUG(
257+
"Peer discovery: unreachable nodes are not registered "
258+
"with the discovery backend ~tp", [Nodes]),
259+
maybe_remove_nodes(Nodes, State#state.warn_only)
260+
end;
261+
{error, Reason} ->
262+
?LOG_INFO(
263+
"Peer discovery cleanup: ~tp returned error ~tp",
264+
[Module, Reason]),
265+
ok
256266
end.
257267

258268
%%--------------------------------------------------------------------
@@ -288,26 +298,3 @@ maybe_remove_nodes([Node | Nodes], false) ->
288298
-spec unreachable_nodes() -> [node()].
289299
unreachable_nodes() ->
290300
rabbit_nodes:list_unreachable().
291-
292-
%%--------------------------------------------------------------------
293-
%% @private
294-
%% @doc Return the nodes that the service discovery backend knows about
295-
%% @spec service_discovery_nodes() -> [node()]
296-
%% @end
297-
%%--------------------------------------------------------------------
298-
-spec service_discovery_nodes() -> [node()].
299-
service_discovery_nodes() ->
300-
Module = rabbit_peer_discovery:backend(),
301-
case rabbit_peer_discovery:normalize(Module:list_nodes()) of
302-
{ok, {OneOrMultipleNodes, _Type}} ->
303-
Nodes = as_list(OneOrMultipleNodes),
304-
?LOG_DEBUG(
305-
"Peer discovery cleanup: ~tp returned ~tp",
306-
[Module, Nodes]),
307-
Nodes;
308-
{error, Reason} ->
309-
?LOG_DEBUG(
310-
"Peer discovery cleanup: ~tp returned error ~tp",
311-
[Module, Reason]),
312-
[]
313-
end.

deps/rabbitmq_peer_discovery_common/test/peer_discovery_cleanup_SUITE.erl

Lines changed: 41 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,8 @@ groups() ->
2626

2727
all_tests() ->
2828
[
29-
cleanup_queues
29+
cleanup_queues,
30+
backend_errors_do_not_trigger_cleanup
3031
].
3132

3233
%% -------------------------------------------------------------------
@@ -126,6 +127,45 @@ cleanup_queues(Config) ->
126127
%% Cleanup.
127128
ok = rabbit_ct_client_helpers:close_connection_and_channel(Conn, Ch).
128129

130+
backend_errors_do_not_trigger_cleanup(Config) ->
131+
%% The backend could have some transient failures. While the backend is
132+
%% not giving reliable peer information, skip cleanup.
133+
[A, B, C] = rabbit_ct_broker_helpers:get_node_configs(Config, nodename),
134+
{Conn, Ch} = rabbit_ct_client_helpers:open_connection_and_channel(Config),
135+
136+
QQ = <<"quorum-queue">>,
137+
declare_queue(Ch, QQ, [{<<"x-queue-type">>, longstr, <<"quorum">>}]),
138+
139+
%% Have the backend return an error.
140+
mock_list_nodes(Config, {error, "...internal server error..."}),
141+
%% Make node C unreachable.
142+
rabbit_ct_broker_helpers:block_traffic_between(A, C),
143+
rabbit_ct_broker_helpers:block_traffic_between(B, C),
144+
Ts1 = erlang:system_time(millisecond),
145+
?awaitMatch([C],
146+
rabbit_ct_broker_helpers:rpc(Config, A,
147+
rabbit_nodes,
148+
list_unreachable, []),
149+
30_000, 1_000),
150+
ct:log(?LOW_IMPORTANCE, "Node C became unreachable in ~bms",
151+
[erlang:system_time(millisecond) - Ts1]),
152+
153+
ok = rabbit_ct_broker_helpers:rpc(Config, A,
154+
rabbit_peer_discovery_cleanup,
155+
check_cluster, []),
156+
157+
%% Node C should remain in the quorum queue members.
158+
?assertEqual(
159+
lists:sort([A, B, C]),
160+
begin
161+
Info = rpc:call(A, rabbit_quorum_queue, infos,
162+
[rabbit_misc:r(<<"/">>, queue, QQ)]),
163+
lists:sort(proplists:get_value(members, Info))
164+
end),
165+
166+
%% Cleanup.
167+
ok = rabbit_ct_client_helpers:close_connection_and_channel(Conn, Ch).
168+
129169
%%%
130170
%%% Implementation
131171
%%%

0 commit comments

Comments
 (0)