Skip to content

Commit

Permalink
refine retry logic to exponential backoff (#2729)
Browse files: browse the repository at this point in the history
Committed-by: siyuanzhang.zsy from Dev container
  • Loading branch information
siyuan0322 authored May 23, 2023
1 parent 06f8769 commit 1d3cd4a
Showing 1 changed file with 7 additions and 5 deletions.
12 changes: 7 additions & 5 deletions coordinator/gscoordinator/op_executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -337,8 +337,8 @@ def _create_analytical_grpc_stub(self):
("grpc.max_metadata_size", GS_GRPC_MAX_MESSAGE_LENGTH),
]
# Check connectivity, otherwise the stub is useless
retry = 0
while retry < 20:
delay = 2
for retry in range(8): # approximated 255s
try:
channel = grpc.insecure_channel(
self._launcher.analytical_engine_endpoint, options=options
Expand All @@ -348,11 +348,13 @@ def _create_analytical_grpc_stub(self):
return stub
except grpc.RpcError as e:
logger.warning(
"Connecting to analytical engine... retrying %d time", retry
"Connecting to analytical engine... tried %d time, will retry in %d seconds",
retry + 1,
delay,
)
logger.warning("Error code: %s, details %s", e.code(), e.details())
retry += 1
time.sleep(3)
time.sleep(delay)
delay *= 2 # back off
raise RuntimeError(
"Failed to connect to engine in 60s, deployment may failed. Please check coordinator log for details"
)
Expand Down

0 comments on commit 1d3cd4a

Please sign in to comment.