This repository was archived by the owner on Jan 9, 2024. It is now read-only.

Commit 55d8de2

use pipeline to retry failed MOVED commands
increase the redirect retry limit and make sure to try slow mode on the commands that still fail after redirects
1 parent 85f2196 commit 55d8de2
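
For context, a minimal usage sketch of the code path this commit changes (the startup node address and keys below are illustrative, not taken from this repository): callers queue commands on a cluster pipeline, and MOVED redirects hit during execute() are now regrouped per target node and retried as a pipeline, falling back to the one-command-at-a-time slow mode only after the redirect limit is exhausted.

    from rediscluster import RedisCluster

    # assumed local test cluster; adjust host/port to your own setup
    rc = RedisCluster(startup_nodes=[{"host": "127.0.0.1", "port": "7000"}],
                      decode_responses=True)

    pipe = rc.pipeline()
    for i in range(1000):
        pipe.set("key:{}".format(i), i)

    # MOVED redirects (e.g. during a slot migration) are handled inside execute():
    # affected commands are regrouped by their new owner and re-sent as a pipeline.
    results = pipe.execute()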

File tree

5 files changed: +87 -51 lines changed

rediscluster/client.py

Lines changed: 11 additions & 11 deletions
@@ -973,17 +973,17 @@ def scan_iter(self, match=None, count=None, _type=None):
             for r in resp:
                 yield r
 
-    # def mget(self, keys, *args):
-    #     """
-    #     Returns a list of values ordered identically to ``keys``
-    #
-    #     Cluster impl:
-    #         Itterate all keys and send GET for each key.
-    #         This will go alot slower than a normal mget call in Redis.
-    #
-    #         Operation is no longer atomic.
-    #     """
-    #     return [self.get(arg) for arg in list_or_args(keys, args)]
+    def mget(self, keys, *args):
+        """
+        Returns a list of values ordered identically to ``keys``
+
+        Cluster impl:
+            Itterate all keys and send GET for each key.
+            This will go alot slower than a normal mget call in Redis.
+
+            Operation is no longer atomic.
+        """
+        return [self.get(arg) for arg in list_or_args(keys, args)]
 
     def mset(self, *args, **kwargs):
         """

rediscluster/nodemanager.py

Lines changed: 3 additions & 0 deletions
@@ -420,6 +420,9 @@ def get_node(self, host, port, server_type=None):
         return self.nodes[node_name]
 
     def move_slot_to_node(self, slot, node):
+        """
+        When moved response received, we should move all replicas with the master to the new slot.
+        """
         node_name = node['name']
         self.slots[slot] = [node]
         slave_nodes = self.slave_nodes_by_master.get(node_name)
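
To make the replica handling concrete, here is a standalone analogue of what the documented method does (an illustrative sketch, not the library's code): the client keeps a local table mapping each hash slot to the nodes serving it, and a MOVED reply repoints the slot at the new master together with that master's replicas.

    # illustrative local slot cache: slot -> [master, replica, ...]
    slots = {}
    # illustrative replica index: master node name -> list of replica node dicts
    slave_nodes_by_master = {"10.0.0.2:7001": [{"name": "10.0.0.3:7101"}]}

    def move_slot_to_node(slot, node):
        """Repoint `slot` at `node` and carry the master's replicas along with it."""
        slots[slot] = [node]
        for replica in slave_nodes_by_master.get(node["name"], []):
            slots[slot].append(replica)

    # a "MOVED 1234 10.0.0.2:7001" reply would translate to:
    move_slot_to_node(1234, {"name": "10.0.0.2:7001"})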

rediscluster/pipeline.py

Lines changed: 67 additions & 33 deletions
@@ -196,9 +196,12 @@ def _get_commands_by_node(self, cmds):
 
             master_node = self.connection_pool.get_node_by_slot(slot)
 
+            # for the same master_node, it should always get the same proxy_node to group
+            # as many commands as possible per node
             if master_node['name'] in proxy_node_by_master:
                 node = proxy_node_by_master[master_node['name']]
             else:
+                # TODO: should determine if using replicas by if command is read only
                 node = self.connection_pool.get_node_by_slot(slot, self.read_from_replicas)
                 proxy_node_by_master[master_node['name']] = node
 
@@ -216,7 +219,7 @@ def _get_commands_by_node(self, cmds):
 
             nodes[node_name].append(c)
 
-        return nodes
+        return nodes, connection_by_node
 
     def _execute_single_command(self, cmd):
         try:
@@ -234,43 +237,69 @@ def _send_cluster_commands(self, stack, raise_on_error=True, allow_redirections=
         """
         # the first time sending the commands we send all of the commands that were queued up.
         # if we have to run through it again, we only retry the commands that failed.
-        attempt = sorted(stack, key=lambda x: x.position)
-
-        # build a list of node objects based on node names we need to
-        nodes = self._get_commands_by_node(attempt)
-
-        # send the commands in sequence.
-        # we write to all the open sockets for each node first, before reading anything
-        # this allows us to flush all the requests out across the network essentially in parallel
-        # so that we can read them all in parallel as they come back.
-        # we dont' multiplex on the sockets as they come available, but that shouldn't make too much difference.
-        node_commands = nodes.values()
-        events = []
-        for n in node_commands:
-            events.append(gevent.spawn(self._execute_node_commands, n))
-
-        gevent.joinall(events)
-
-        # release all of the redis connections we allocated earlier back into the connection pool.
-        # we used to do this step as part of a try/finally block, but it is really dangerous to
-        # release connections back into the pool if for some reason the socket has data still left in it
-        # from a previous operation. The write and read operations already have try/catch around them for
-        # all known types of errors including connection and socket level errors.
-        # So if we hit an exception, something really bad happened and putting any of
-        # these connections back into the pool is a very bad idea.
-        # the socket might have unread buffer still sitting in it, and then the
-        # next time we read from it we pass the buffered result back from a previous
-        # command and every single request after to that connection will always get
-        # a mismatched result. (not just theoretical, I saw this happen on production x.x).
-        for conn in connection_by_node.values():
-            self.connection_pool.release(conn)
+        cmds = sorted(stack, key=lambda x: x.position)
+
+        max_redirects = 5
+        cur_attempt = 0
+
+        while cur_attempt < max_redirects:
+
+            # build a list of node objects based on node names we need to
+            nodes, connection_by_node = self._get_commands_by_node(cmds)
+
+            # send the commands in sequence.
+            # we write to all the open sockets for each node first, before reading anything
+            # this allows us to flush all the requests out across the network essentially in parallel
+            # so that we can read them all in parallel as they come back.
+            # we dont' multiplex on the sockets as they come available, but that shouldn't make too much difference.
+
+            # duke-cliff: I think it would still be faster if we use gevent to make the command in parallel
+            # the io is non-blocking, but serialization/deserialization will still be blocking previously
+            node_commands = nodes.values()
+            events = []
+            for n in node_commands:
+                events.append(gevent.spawn(self._execute_node_commands, n))
+
+            gevent.joinall(events)
+
+            # release all of the redis connections we allocated earlier back into the connection pool.
+            # we used to do this step as part of a try/finally block, but it is really dangerous to
+            # release connections back into the pool if for some reason the socket has data still left in it
+            # from a previous operation. The write and read operations already have try/catch around them for
+            # all known types of errors including connection and socket level errors.
+            # So if we hit an exception, something really bad happened and putting any of
+            # these connections back into the pool is a very bad idea.
+            # the socket might have unread buffer still sitting in it, and then the
+            # next time we read from it we pass the buffered result back from a previous
+            # command and every single request after to that connection will always get
+            # a mismatched result. (not just theoretical, I saw this happen on production x.x).
+            for conn in connection_by_node.values():
+                self.connection_pool.release(conn)
+
+            # will regroup moved commands and retry using pipeline(stacked commands)
+            # this would increase the pipeline performance a lot
+            moved_cmds = []
+            for c in cmds:
+                if isinstance(c.result, MovedError):
+                    e = c.result
+                    node = self.connection_pool.nodes.get_node(e.host, e.port, server_type='master')
+                    self.connection_pool.nodes.move_slot_to_node(e.slot_id, node)
+
+                    moved_cmds.append(c)
+
+            if moved_cmds:
+                cur_attempt += 1
+                cmds = sorted(moved_cmds, key=lambda x: x.position)
+                continue
+
+            break
 
         # if the response isn't an exception it is a valid response from the node
         # we're all done with that command, YAY!
         # if we have more commands to attempt, we've run into problems.
         # collect all the commands we are allowed to retry.
         # (MOVED, ASK, or connection errors or timeout errors)
-        attempt = sorted([c for c in attempt if isinstance(c.result, ERRORS_ALLOW_RETRY)], key=lambda x: x.position)
+        attempt = sorted([c for c in stack if isinstance(c.result, ERRORS_ALLOW_RETRY)], key=lambda x: x.position)
         if attempt and allow_redirections:
             # RETRY MAGIC HAPPENS HERE!
             # send these remaing comamnds one at a time using `execute_command`
@@ -288,14 +317,19 @@ def _send_cluster_commands(self, stack, raise_on_error=True, allow_redirections=
         # flag to rebuild the slots table from scratch. So MOVED errors should
         # correct themselves fairly quickly.
 
-        log.debug("pipeline has failed commands: {}".format([c.result for c in attempt]))
+        # with the previous redirect retries, I could barely see the slow mode happening again
+        log.info("pipeline in slow mode to execute failed commands: {}".format([c.result for c in attempt]))
 
         self.connection_pool.nodes.increment_reinitialize_counter(len(attempt))
+
+        # even in the slow mode, we could use gevent to make things faster
         events = []
         for c in attempt:
            events.append(gevent.spawn(self._execute_single_command, c))
+
        gevent.joinall(events)
 
+
        # turn the response back into a simple flat array that corresponds
        # to the sequence of commands issued in the stack in pipeline.execute()
        response = [c.result for c in sorted(stack, key=lambda x: x.position)]
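
Both the regrouped fast path and the slow-mode fallback above lean on the same gevent fan-out idiom: spawn one greenlet per unit of work, then join them all before inspecting results. A self-contained sketch of that pattern (the work function and inputs are made up for illustration):

    import gevent

    def execute(cmd):
        # stands in for _execute_node_commands / _execute_single_command
        return "result of {}".format(cmd)

    commands = ["GET a", "GET b", "GET c"]

    # spawn all greenlets first so the I/O overlaps, then wait for every one
    events = [gevent.spawn(execute, c) for c in commands]
    gevent.joinall(events)

    results = [e.value for e in events]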

requirements.txt

Lines changed: 2 additions & 2 deletions
@@ -1,3 +1,3 @@
 redis>=3.0.0,<4.0.0
-gevent==1.5.0
-greenlet==0.4.16
+gevent
+greenlet

setup.py

Lines changed: 4 additions & 5 deletions
@@ -19,8 +19,8 @@
     history = f.read()
 
 setup(
-    name="redis-py-cluster-patched",
-    version="2.1.0.999.10",
+    name="redis-py-cluster",
+    version="2.1.1",
     description="Library for communicating with Redis Clusters. Built on top of redis-py lib",
     long_description=readme + '\n\n' + history,
     long_description_content_type="text/markdown",
@@ -30,12 +30,11 @@
     maintainer_email='Grokzen@gmail.com',
     packages=["rediscluster"],
     url='http://github.com/grokzen/redis-py-cluster',
-    download_url="https://github.com/duke-cliff/redis-py-cluster/archive/2.1.0.999.10.tar.gz",
     license='MIT',
     install_requires=[
         'redis>=3.0.0,<4.0.0',
-        'gevent==1.5.0',
-        'greenlet==0.4.16',
+        'gevent',
+        'greenlet',
     ],
     python_requires=">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4",
     extras_require={
