mirror of
https://github.com/vale981/ray
synced 2025-03-06 02:21:39 -05:00
[autoscaler] RecoverUnhealthyWorker mitigation (#3699)
Increases the number of retries for the RecoverUnhealthyWorkers test. Closes #3435.
This commit is contained in:
parent
1480f309c3
commit
bdeeacc70f
2 changed files with 9 additions and 7 deletions
|
@ -25,7 +25,7 @@ if [[ "$PYTHON" == "2.7" ]] && [[ "$platform" == "linux" ]]; then
|
|||
bash miniconda.sh -b -p $HOME/miniconda
|
||||
export PATH="$HOME/miniconda/bin:$PATH"
|
||||
pip install -q cython==0.29.0 cmake tensorflow gym opencv-python pyyaml pandas==0.23.4 requests \
|
||||
feather-format lxml openpyxl xlrd py-spy setproctitle faulthandler pytest-timeout mock
|
||||
feather-format lxml openpyxl xlrd py-spy setproctitle faulthandler pytest-timeout mock flaky
|
||||
elif [[ "$PYTHON" == "3.5" ]] && [[ "$platform" == "linux" ]]; then
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y cmake pkg-config python-dev python-numpy build-essential autoconf curl libtool unzip
|
||||
|
@ -34,7 +34,7 @@ elif [[ "$PYTHON" == "3.5" ]] && [[ "$platform" == "linux" ]]; then
|
|||
bash miniconda.sh -b -p $HOME/miniconda
|
||||
export PATH="$HOME/miniconda/bin:$PATH"
|
||||
pip install -q cython==0.29.0 cmake tensorflow gym opencv-python pyyaml pandas==0.23.4 requests \
|
||||
feather-format lxml openpyxl xlrd py-spy setproctitle pytest-timeout
|
||||
feather-format lxml openpyxl xlrd py-spy setproctitle pytest-timeout flaky
|
||||
elif [[ "$PYTHON" == "2.7" ]] && [[ "$platform" == "macosx" ]]; then
|
||||
# check that brew is installed
|
||||
which -s brew
|
||||
|
@ -51,7 +51,7 @@ elif [[ "$PYTHON" == "2.7" ]] && [[ "$platform" == "macosx" ]]; then
|
|||
bash miniconda.sh -b -p $HOME/miniconda
|
||||
export PATH="$HOME/miniconda/bin:$PATH"
|
||||
pip install -q cython==0.29.0 cmake tensorflow gym opencv-python pyyaml pandas==0.23.4 requests \
|
||||
feather-format lxml openpyxl xlrd py-spy setproctitle faulthandler pytest-timeout mock
|
||||
feather-format lxml openpyxl xlrd py-spy setproctitle faulthandler pytest-timeout mock flaky
|
||||
elif [[ "$PYTHON" == "3.5" ]] && [[ "$platform" == "macosx" ]]; then
|
||||
# check that brew is installed
|
||||
which -s brew
|
||||
|
@ -68,7 +68,7 @@ elif [[ "$PYTHON" == "3.5" ]] && [[ "$platform" == "macosx" ]]; then
|
|||
bash miniconda.sh -b -p $HOME/miniconda
|
||||
export PATH="$HOME/miniconda/bin:$PATH"
|
||||
pip install -q cython==0.29.0 cmake tensorflow gym opencv-python pyyaml pandas==0.23.4 requests \
|
||||
feather-format lxml openpyxl xlrd py-spy setproctitle pytest-timeout
|
||||
feather-format lxml openpyxl xlrd py-spy setproctitle pytest-timeout flaky
|
||||
elif [[ "$LINT" == "1" ]]; then
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y cmake build-essential autoconf curl libtool unzip
|
||||
|
|
|
@ -2,6 +2,7 @@ from __future__ import absolute_import
|
|||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
from flaky import flaky
|
||||
import shutil
|
||||
import tempfile
|
||||
import threading
|
||||
|
@ -182,8 +183,8 @@ class AutoscalingTest(unittest.TestCase):
|
|||
shutil.rmtree(self.tmpdir)
|
||||
ray.shutdown()
|
||||
|
||||
def waitFor(self, condition):
|
||||
for _ in range(50):
|
||||
def waitFor(self, condition, num_retries=50):
|
||||
for _ in range(num_retries):
|
||||
if condition():
|
||||
return
|
||||
time.sleep(.1)
|
||||
|
@ -674,6 +675,7 @@ class AutoscalingTest(unittest.TestCase):
|
|||
autoscaler.update()
|
||||
assert len(self.provider.nodes({})) == 0
|
||||
|
||||
@flaky(max_runs=4)
|
||||
def testRecoverUnhealthyWorkers(self):
|
||||
config_path = self.write_config(SMALL_CLUSTER)
|
||||
self.provider = MockProvider()
|
||||
|
@ -698,7 +700,7 @@ class AutoscalingTest(unittest.TestCase):
|
|||
lm.last_heartbeat_time_by_ip["172.0.0.0"] = 0
|
||||
num_calls = len(runner.calls)
|
||||
autoscaler.update()
|
||||
self.waitFor(lambda: len(runner.calls) > num_calls)
|
||||
self.waitFor(lambda: len(runner.calls) > num_calls, num_retries=150)
|
||||
|
||||
def testExternalNodeScaler(self):
|
||||
config = SMALL_CLUSTER.copy()
|
||||
|
|
Loading…
Add table
Reference in a new issue