Skip to content

Commit efc8540

Browse files
Eric des CourtisEric des Courtis
authored and committed
Added readiness check to avoid errors during topology changes
In some cases, erpc and nodes show up faster than the applications can start, which leads to requests being routed to nodes that are still booting. A workaround is possible in the RPC call, but it leads to a proliferation of error-handling code, so it is better to wait for the nodes to be ready.
1 parent f1eaaad commit efc8540

File tree

8 files changed

+369
-51
lines changed

8 files changed

+369
-51
lines changed

README.md

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,10 @@ the ring to change.
102102

103103
The whitelist and blacklist only have an effect when `monitor_nodes: true`.
104104

105+
It is possible to have the ring wait until the required applications have started on a node before that node is included in the ring.
106+
This can be accomplished by setting `wait_for_readiness: true` and listing the app dependencies in
107+
`readiness_deps: [:app1, :app2]`.
108+
105109
## Configuration
106110

107111
Below is an example configuration:
@@ -113,7 +117,9 @@ config :libring,
113117
# but does not allow nodes named "a" or "remsh*" to be added to the ring
114118
ring_a: [monitor_nodes: true,
115119
node_type: :visible,
116-
node_blacklist: ["a", ~r/^remsh.*$/]],
120+
node_blacklist: ["a", ~r/^remsh.*$/],
121+
wait_for_readiness: true,
122+
readiness_deps: [:myapp]],
117123
# A ring which is composed of three nodes, of which "c" has a non-default weight of 200
118124
# The default weight is 128
119125
ring_b: [nodes: ["a", "b", {"c", 200}]]

lib/managed_ring.ex

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,14 +22,26 @@ defmodule HashRing.Managed do
2222
@type weight :: pos_integer
2323
@type node_list :: [term() | {term(), weight}]
2424
@type pattern_list :: [String.t() | Regex.t()]
25+
@type app_list :: [atom()]
2526
@type ring_options :: [
2627
nodes: node_list,
2728
monitor_nodes: boolean,
2829
node_blacklist: pattern_list,
29-
node_whitelist: pattern_list
30+
node_whitelist: pattern_list,
31+
wait_for_readiness: boolean,
32+
readiness_deps: app_list
3033
]
3134

32-
@valid_ring_opts [:name, :nodes, :monitor_nodes, :node_blacklist, :node_whitelist, :node_type]
35+
@valid_ring_opts [
36+
:name,
37+
:nodes,
38+
:monitor_nodes,
39+
:node_blacklist,
40+
:node_whitelist,
41+
:node_type,
42+
:wait_for_readiness,
43+
:readiness_deps
44+
]
3345

3446
@doc """
3547
Creates a new stateful hash ring with the given name.
@@ -48,6 +60,8 @@ defmodule HashRing.Managed do
4860
is provided, the blacklist has no effect.
4961
* `node_whitelist: [String.t | Regex.t]` - The same as `node_blacklist`, except the opposite; only nodes
5062
which match a pattern in the whitelist will result in the ring being updated.
63+
* `wait_for_readiness: boolean` - Wait for the apps listed in `readiness_deps` to start on a node before adding that node to the ring. Defaults to `false`.
64+
* `readiness_deps: [atom]` - List of dependency apps that need to start before the node is considered ready.
5165
- `node_type: :all | :hidden | :visible`: refers to what kind of nodes will be monitored
5266
when `monitor_nodes` is `true`. For more information, see `:net_kernel.monitor_nodes/2`.
5367
@@ -83,6 +97,8 @@ defmodule HashRing.Managed do
8397
:node_blacklist when is_list(value) -> false
8498
:node_whitelist when is_list(value) -> false
8599
:node_type when value in [:all, :hidden, :visible] -> false
100+
:wait_for_readiness when is_boolean(value) -> false
101+
:readiness_deps when is_list(value) -> false
86102
_ -> true
87103
end
88104
end)

lib/worker.ex

Lines changed: 184 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,19 @@ defmodule HashRing.Worker do
22
@moduledoc false
33
use GenServer
44

5+
@erpc_timeout 500
6+
@node_readiness_check_interval :timer.seconds(1)
7+
8+
defstruct [
9+
:table,
10+
:node_blacklist,
11+
:node_whitelist,
12+
:wait_for_readiness,
13+
:readiness_deps_set
14+
]
15+
16+
alias __MODULE__, as: State
17+
518
def nodes(pid_or_name)
619

720
def nodes(pid) when is_pid(pid) do
@@ -14,7 +27,7 @@ defmodule HashRing.Worker do
1427
|> get_ring()
1528
|> HashRing.nodes()
1629
rescue
17-
ArgumentError ->
30+
ArgumentError ->
1831
{:error, :no_such_ring}
1932
end
2033

@@ -69,55 +82,135 @@ defmodule HashRing.Worker do
6982
nodes = [Node.self() | Node.list(:connected)]
7083
node_blacklist = Keyword.get(options, :node_blacklist, [~r/^remsh.*$/, ~r/^rem-.*$/])
7184
node_whitelist = Keyword.get(options, :node_whitelist, [])
85+
wait_for_readiness = Keyword.get(options, :wait_for_readiness, false)
86+
readiness_deps_set = Keyword.get(options, :readiness_deps, []) |> MapSet.new()
7287

7388
ring =
7489
Enum.reduce(nodes, ring, fn node, acc ->
75-
cond do
76-
HashRing.Utils.ignore_node?(node, node_blacklist, node_whitelist) ->
77-
acc
78-
79-
:else ->
90+
if HashRing.Utils.ignore_node?(node, node_blacklist, node_whitelist) do
91+
acc
92+
else
93+
if wait_for_readiness do
94+
if node_ready?(node, readiness_deps_set) do
95+
HashRing.add_node(acc, node)
96+
else
97+
schedule_check_for_node_readiness(node)
98+
acc
99+
end
100+
else
80101
HashRing.add_node(acc, node)
102+
end
81103
end
82104
end)
83105

84106
node_type = Keyword.get(options, :node_type, :all)
85107
:ok = :net_kernel.monitor_nodes(true, node_type: node_type)
86108
true = :ets.insert_new(table, {:ring, ring})
87-
{:ok, {table, node_blacklist, node_whitelist}}
109+
110+
{:ok,
111+
%State{
112+
table: table,
113+
node_blacklist: node_blacklist,
114+
node_whitelist: node_whitelist,
115+
wait_for_readiness: wait_for_readiness,
116+
readiness_deps_set: readiness_deps_set
117+
}}
88118

89119
:else ->
90120
nodes = Keyword.get(options, :nodes, [])
91121
ring = HashRing.add_nodes(ring, nodes)
92122
true = :ets.insert_new(table, {:ring, ring})
93-
{:ok, {table, [], []}}
123+
124+
{:ok,
125+
%State{
126+
table: table,
127+
node_blacklist: [],
128+
node_whitelist: [],
129+
wait_for_readiness: false,
130+
readiness_deps_set: MapSet.new()
131+
}}
94132
end
95133
end
96134

97-
def handle_call(:list_nodes, _from, {table, _b, _w} = state) do
135+
def handle_call(:list_nodes, _from, %State{table: table} = state) do
98136
{:reply, HashRing.nodes(get_ring(table)), state}
99137
end
100138

101-
def handle_call({:key_to_node, key}, _from, {table, _b, _w} = state) do
139+
def handle_call({:key_to_node, key}, _from, %State{table: table} = state) do
102140
{:reply, HashRing.key_to_node(get_ring(table), key), state}
103141
end
104142

105-
def handle_call({:add_node, node}, _from, {table, _b, _w} = state) do
106-
get_ring(table) |> HashRing.add_node(node) |> update_ring(table)
143+
def handle_call(
144+
{:add_node, node},
145+
_from,
146+
%State{
147+
table: table,
148+
wait_for_readiness: wait_for_readiness,
149+
readiness_deps_set: readiness_deps_set
150+
} = state
151+
) do
152+
if wait_for_readiness and not node_ready?(node, readiness_deps_set) do
153+
schedule_check_for_node_readiness(node)
154+
else
155+
get_ring(table) |> HashRing.add_node(node) |> update_ring(table)
156+
end
157+
107158
{:reply, :ok, state}
108159
end
109160

110-
def handle_call({:add_node, node, weight}, _from, {table, _b, _w} = state) do
111-
get_ring(table) |> HashRing.add_node(node, weight) |> update_ring(table)
161+
def handle_call(
162+
{:add_node, node, weight},
163+
_from,
164+
%State{
165+
table: table,
166+
wait_for_readiness: wait_for_readiness,
167+
readiness_deps_set: readiness_deps_set
168+
} = state
169+
) do
170+
if wait_for_readiness and not node_ready?(node, readiness_deps_set) do
171+
schedule_check_for_node_readiness({node, weight})
172+
else
173+
get_ring(table) |> HashRing.add_node(node, weight) |> update_ring(table)
174+
end
175+
112176
{:reply, :ok, state}
113177
end
114178

115-
def handle_call({:add_nodes, nodes}, _from, {table, _b, _w} = state) do
116-
get_ring(table) |> HashRing.add_nodes(nodes) |> update_ring(table)
179+
def handle_call(
180+
{:add_nodes, nodes},
181+
_from,
182+
%State{
183+
table: table,
184+
wait_for_readiness: wait_for_readiness,
185+
readiness_deps_set: readiness_deps_set
186+
} = state
187+
) do
188+
if wait_for_readiness do
189+
%{true: ready_nodes, false: starting_nodes} =
190+
Enum.group_by(
191+
nodes,
192+
fn
193+
{node, _weight} ->
194+
node_ready?(node, readiness_deps_set)
195+
196+
node ->
197+
node_ready?(node, readiness_deps_set)
198+
end
199+
)
200+
201+
get_ring(table) |> HashRing.add_nodes(ready_nodes) |> update_ring(table)
202+
203+
for starting_node <- starting_nodes do
204+
schedule_check_for_node_readiness(starting_node)
205+
end
206+
else
207+
get_ring(table) |> HashRing.add_nodes(nodes) |> update_ring(table)
208+
end
209+
117210
{:reply, :ok, state}
118211
end
119212

120-
def handle_call({:remove_node, node}, _from, {table, _b, _w} = state) do
213+
def handle_call({:remove_node, node}, _from, %State{table: table} = state) do
121214
get_ring(table) |> HashRing.remove_node(node) |> update_ring(table)
122215
{:reply, :ok, state}
123216
end
@@ -127,19 +220,62 @@ defmodule HashRing.Worker do
127220
{:stop, :shutdown, state}
128221
end
129222

130-
def handle_info({:nodeup, node, _info}, {table, b, w} = state) do
223+
def handle_info(
224+
{:nodeup, node, _info},
225+
%State{
226+
table: table,
227+
node_blacklist: b,
228+
node_whitelist: w,
229+
wait_for_readiness: wait_for_readiness,
230+
readiness_deps_set: readiness_deps_set
231+
} = state
232+
) do
131233
unless HashRing.Utils.ignore_node?(node, b, w) do
132-
get_ring(table) |> HashRing.add_node(node) |> update_ring(table)
234+
if wait_for_readiness and not node_ready?(node, readiness_deps_set) do
235+
schedule_check_for_node_readiness(node)
236+
else
237+
get_ring(table) |> HashRing.add_node(node) |> update_ring(table)
238+
end
133239
end
134240

135241
{:noreply, state}
136242
end
137243

138-
def handle_info({:nodedown, node, _info}, state = {table, _b, _w}) do
244+
def handle_info({:nodedown, node, _info}, %State{table: table} = state) do
139245
get_ring(table) |> HashRing.remove_node(node) |> update_ring(table)
140246
{:noreply, state}
141247
end
142248

249+
def handle_info(
250+
{:check_node_readiness, node, weight},
251+
%State{table: table, readiness_deps_set: readiness_deps_set} = state
252+
) do
253+
if node_ready?(node, readiness_deps_set) do
254+
get_ring(table) |> HashRing.add_node(node, weight) |> update_ring(table)
255+
else
256+
schedule_check_for_node_readiness({node, weight})
257+
end
258+
259+
{:noreply, state}
260+
end
261+
262+
def handle_info(
263+
{:check_node_readiness, node},
264+
%State{table: table, readiness_deps_set: readiness_deps_set} = state
265+
) do
266+
if node_ready?(node, readiness_deps_set) do
267+
get_ring(table) |> HashRing.add_node(node) |> update_ring(table)
268+
else
269+
schedule_check_for_node_readiness(node)
270+
end
271+
272+
{:noreply, state}
273+
end
274+
275+
def handle_info(_msg, state) do
276+
{:noreply, state}
277+
end
278+
143279
defp get_ets_name(name), do: :"libring_#{name}"
144280

145281
defp do_call(pid_or_name, msg)
@@ -160,6 +296,33 @@ defmodule HashRing.Worker do
160296

161297
defp get_ring(table), do: :ets.lookup_element(table, :ring, 2)
162298

163-
defp update_ring(ring, table),
299+
defp update_ring(ring, table),
164300
do: :ets.update_element(table, :ring, {2, ring})
301+
302+
defp get_started_apps_set(node) do
303+
try do
304+
:erpc.call(node, Application, :started_applications, [], @erpc_timeout)
305+
|> Enum.map(&elem(&1, 0))
306+
|> MapSet.new()
307+
rescue
308+
_e -> MapSet.new()
309+
end
310+
end
311+
312+
defp node_ready?(node, readiness_deps_set) do
313+
MapSet.difference(readiness_deps_set, get_started_apps_set(node))
314+
|> MapSet.equal?(MapSet.new())
315+
end
316+
317+
defp schedule_check_for_node_readiness({node, weight}) do
318+
if node in Node.list() do
319+
:timer.send_after(@node_readiness_check_interval, {:check_node_readiness, node, weight})
320+
end
321+
end
322+
323+
defp schedule_check_for_node_readiness(node) do
324+
if node in Node.list() do
325+
:timer.send_after(@node_readiness_check_interval, {:check_node_readiness, node})
326+
end
327+
end
165328
end

mix.exs

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -41,8 +41,9 @@ defmodule HashRing.Mixfile do
4141

4242
{:ex_doc, ">= 0.0.0", only: [:docs]},
4343
{:benchee, "~> 1.0", only: [:dev]},
44-
{:dialyxir, "~> 1.0", only: [:test], runtime: false},
45-
{:stream_data, "~> 0.5", only: [:test]}
44+
{:dialyxir, "~> 1.0", only: [:dev], runtime: false},
45+
{:stream_data, "~> 0.5", only: [:test]},
46+
{:local_cluster, "~> 1.2", only: [:test]}
4647
]
4748
end
4849

mix.lock

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
"earmark_parser": {:hex, :earmark_parser, "1.4.18", "e1b2be73eb08a49fb032a0208bf647380682374a725dfb5b9e510def8397f6f2", [:mix], [], "hexpm", "114a0e85ec3cf9e04b811009e73c206394ffecfcc313e0b346de0d557774ee97"},
66
"erlex": {:hex, :erlex, "0.2.6", "c7987d15e899c7a2f34f5420d2a2ea0d659682c06ac607572df55a43753aa12e", [:mix], [], "hexpm", "2ed2e25711feb44d52b17d2780eabf998452f6efda104877a3881c2f8c0c0c75"},
77
"ex_doc": {:hex, :ex_doc, "0.26.0", "1922164bac0b18b02f84d6f69cab1b93bc3e870e2ad18d5dacb50a9e06b542a3", [:mix], [{:earmark_parser, "~> 1.4.0", [hex: :earmark_parser, repo: "hexpm", optional: false]}, {:makeup_elixir, "~> 0.14", [hex: :makeup_elixir, repo: "hexpm", optional: false]}, {:makeup_erlang, "~> 0.1", [hex: :makeup_erlang, repo: "hexpm", optional: false]}], "hexpm", "2775d66e494a9a48355db7867478ffd997864c61c65a47d31c4949459281c78d"},
8+
"global_flags": {:hex, :global_flags, "1.0.0", "ee6b864979a1fb38d1fbc67838565644baf632212bce864adca21042df036433", [:rebar3], [], "hexpm", "85d944cecd0f8f96b20ce70b5b16ebccedfcd25e744376b131e89ce61ba93176"},
9+
"local_cluster": {:hex, :local_cluster, "1.2.1", "8eab3b8a387680f0872eacfb1a8bd5a91cb1d4d61256eec6a655b07ac7030c73", [:mix], [{:global_flags, "~> 1.0", [hex: :global_flags, repo: "hexpm", optional: false]}], "hexpm", "aae80c9bc92c911cb0be085fdeea2a9f5b88f81b6bec2ff1fec244bb0acc232c"},
810
"makeup": {:hex, :makeup, "1.0.5", "d5a830bc42c9800ce07dd97fa94669dfb93d3bf5fcf6ea7a0c67b2e0e4a7f26c", [:mix], [{:nimble_parsec, "~> 0.5 or ~> 1.0", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "cfa158c02d3f5c0c665d0af11512fed3fba0144cf1aadee0f2ce17747fba2ca9"},
911
"makeup_elixir": {:hex, :makeup_elixir, "0.15.2", "dc72dfe17eb240552857465cc00cce390960d9a0c055c4ccd38b70629227e97c", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}, {:nimble_parsec, "~> 1.1", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "fd23ae48d09b32eff49d4ced2b43c9f086d402ee4fd4fcb2d7fad97fa8823e75"},
1012
"makeup_erlang": {:hex, :makeup_erlang, "0.1.1", "3fcb7f09eb9d98dc4d208f49cc955a34218fc41ff6b84df7c75b3e6e533cc65f", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}], "hexpm", "174d0809e98a4ef0b3309256cbf97101c6ec01c4ab0b23e926a9e17df2077cbb"},

0 commit comments

Comments
 (0)