[autoscaler] RecoverUnhealthyWorker mitigation (#3699)

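Increases the number of retries for the testRecoverUnhealthyWorkers autoscaler test (waitFor now accepts a num_retries argument, raised from 50 to 150 for this test) and marks the test as flaky with up to 4 runs; the flaky package is added to the CI dependencies.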

Closes #3435.
This commit is contained in:
Richard Liaw 2019-01-12 14:06:53 -08:00 committed by GitHub
parent 1480f309c3
commit bdeeacc70f
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 9 additions and 7 deletions

View file

@ -25,7 +25,7 @@ if [[ "$PYTHON" == "2.7" ]] && [[ "$platform" == "linux" ]]; then
bash miniconda.sh -b -p $HOME/miniconda
export PATH="$HOME/miniconda/bin:$PATH"
pip install -q cython==0.29.0 cmake tensorflow gym opencv-python pyyaml pandas==0.23.4 requests \
feather-format lxml openpyxl xlrd py-spy setproctitle faulthandler pytest-timeout mock
feather-format lxml openpyxl xlrd py-spy setproctitle faulthandler pytest-timeout mock flaky
elif [[ "$PYTHON" == "3.5" ]] && [[ "$platform" == "linux" ]]; then
sudo apt-get update
sudo apt-get install -y cmake pkg-config python-dev python-numpy build-essential autoconf curl libtool unzip
@ -34,7 +34,7 @@ elif [[ "$PYTHON" == "3.5" ]] && [[ "$platform" == "linux" ]]; then
bash miniconda.sh -b -p $HOME/miniconda
export PATH="$HOME/miniconda/bin:$PATH"
pip install -q cython==0.29.0 cmake tensorflow gym opencv-python pyyaml pandas==0.23.4 requests \
feather-format lxml openpyxl xlrd py-spy setproctitle pytest-timeout
feather-format lxml openpyxl xlrd py-spy setproctitle pytest-timeout flaky
elif [[ "$PYTHON" == "2.7" ]] && [[ "$platform" == "macosx" ]]; then
# check that brew is installed
which -s brew
@ -51,7 +51,7 @@ elif [[ "$PYTHON" == "2.7" ]] && [[ "$platform" == "macosx" ]]; then
bash miniconda.sh -b -p $HOME/miniconda
export PATH="$HOME/miniconda/bin:$PATH"
pip install -q cython==0.29.0 cmake tensorflow gym opencv-python pyyaml pandas==0.23.4 requests \
feather-format lxml openpyxl xlrd py-spy setproctitle faulthandler pytest-timeout mock
feather-format lxml openpyxl xlrd py-spy setproctitle faulthandler pytest-timeout mock flaky
elif [[ "$PYTHON" == "3.5" ]] && [[ "$platform" == "macosx" ]]; then
# check that brew is installed
which -s brew
@ -68,7 +68,7 @@ elif [[ "$PYTHON" == "3.5" ]] && [[ "$platform" == "macosx" ]]; then
bash miniconda.sh -b -p $HOME/miniconda
export PATH="$HOME/miniconda/bin:$PATH"
pip install -q cython==0.29.0 cmake tensorflow gym opencv-python pyyaml pandas==0.23.4 requests \
feather-format lxml openpyxl xlrd py-spy setproctitle pytest-timeout
feather-format lxml openpyxl xlrd py-spy setproctitle pytest-timeout flaky
elif [[ "$LINT" == "1" ]]; then
sudo apt-get update
sudo apt-get install -y cmake build-essential autoconf curl libtool unzip

View file

@ -2,6 +2,7 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from flaky import flaky
import shutil
import tempfile
import threading
@ -182,8 +183,8 @@ class AutoscalingTest(unittest.TestCase):
shutil.rmtree(self.tmpdir)
ray.shutdown()
def waitFor(self, condition):
for _ in range(50):
def waitFor(self, condition, num_retries=50):
for _ in range(num_retries):
if condition():
return
time.sleep(.1)
@ -674,6 +675,7 @@ class AutoscalingTest(unittest.TestCase):
autoscaler.update()
assert len(self.provider.nodes({})) == 0
@flaky(max_runs=4)
def testRecoverUnhealthyWorkers(self):
config_path = self.write_config(SMALL_CLUSTER)
self.provider = MockProvider()
@ -698,7 +700,7 @@ class AutoscalingTest(unittest.TestCase):
lm.last_heartbeat_time_by_ip["172.0.0.0"] = 0
num_calls = len(runner.calls)
autoscaler.update()
self.waitFor(lambda: len(runner.calls) > num_calls)
self.waitFor(lambda: len(runner.calls) > num_calls, num_retries=150)
def testExternalNodeScaler(self):
config = SMALL_CLUSTER.copy()