[kuberay] Update KubeRay operator commit, turn autoscaler RPC drain back on (#27077)

This PR:

- Updates the KubeRay operator commit used in the Ray CI autoscaling test
- Uses the RayCluster autoscaling sample config from the KubeRay repo in place of a config from the Ray repo
- Turns the autoscaler RPC worker drain back on, since dead-node messages were observed from the GCS and the RPC drain is meant to prevent them (see the config sketch below).
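
For reference, after this change the KubeRay section of the generated provider config is expected to carry the following flags (a minimal sketch based on the values asserted in the updated test expectations further down in this diff, not the literal generated output):

    provider_config = {
        "disable_launch_config_check": True,
        "foreground_node_launch": True,
        "worker_liveness_check": False,  # the autoscaler does not perform worker liveness checks
        "worker_rpc_drain": True,        # turned back on by this PR
        "namespace": "default",
        "type": "kuberay",
    }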

Signed-off-by: Dmitri Gekhtman <dmitri.m.gekhtman@gmail.com>
Dmitri Gekhtman 2022-07-27 00:00:51 -07:00 committed by GitHub
parent 34d0233aba
commit c4a259828b
7 changed files with 49 additions and 31 deletions

View file

@@ -16,6 +16,9 @@ chmod +x kubectl
mv ./kubectl /usr/bin/kubectl
kubectl version --client
curl -s "https://raw.githubusercontent.com/kubernetes-sigs/kustomize/master/hack/install_kustomize.sh" | bash
mv ./kustomize /usr/bin/kustomize
# Delete dangling clusters
kind delete clusters --all

View file

@@ -157,7 +157,15 @@ def _generate_provider_config(ray_cluster_namespace: str) -> Dict[str, Any]:
DISABLE_LAUNCH_CONFIG_CHECK_KEY: True,
FOREGROUND_NODE_LAUNCH_KEY: True,
WORKER_LIVENESS_CHECK_KEY: False,
WORKER_RPC_DRAIN_KEY: False,
# For the time being, we are letting the autoscaler drain nodes, so this
# key is set to True (the default value). We observed that with the flag
# set to False, the GCS may not be properly notified of node downscaling.
# TODO: Solve this issue and flip the key back to False -- otherwise we may
# have a race condition in which the autoscaler kills the Ray container,
# Kubernetes recreates it, and then KubeRay deletes the pod, killing the
# container again.
WORKER_RPC_DRAIN_KEY: True,
}
@@ -349,12 +357,10 @@ def _get_custom_resources(
) -> Dict[str, int]:
"""Format custom resources based on the `resources` Ray start param.
For the current prototype, the value of the `resources` field must
Currently, the value of the `resources` field must
be formatted as follows:
'"{\"Custom1\": 1, \"Custom2\": 5}"'.
We intend to provide a better interface soon.
This method first converts the input to a correctly formatted
json string and then loads that json string to a dict.
"""

View file

@@ -178,8 +178,8 @@ class KuberayNodeProvider(NodeProvider): # type: ignore
provider_config.get(WORKER_LIVENESS_CHECK_KEY, True) is False
), f"To use KuberayNodeProvider, must set `{WORKER_LIVENESS_CHECK_KEY}:False`."
assert (
provider_config.get(WORKER_RPC_DRAIN_KEY, True) is False
), f"To use KuberayNodeProvider, must set `{WORKER_RPC_DRAIN_KEY}:False`."
provider_config.get(WORKER_RPC_DRAIN_KEY, False) is True
), f"To use KuberayNodeProvider, must set `{WORKER_RPC_DRAIN_KEY}:True`."
provider_exists = True
super().__init__(provider_config, cluster_name)

View file

@@ -3,6 +3,16 @@
# Clone pinned Kuberay commit to temporary directory, copy the CRD definitions
# into the autoscaler folder.
KUBERAY_SHA="ce84f0441c991eb4b0f52ee2cd85c0a5ac048d11"
OPERATOR_TAG=${KUBERAY_SHA:0:7}
# Requires kustomize (this dependency will be removed after the KubeRay 0.3.0 release is cut)
if ! command -v kustomize &> /dev/null
then
    echo "Please install kustomize. Then re-run this script."
    exit 1
fi
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
DIR=$(mktemp -d -t "kuberay-XXXXXX")
@@ -10,22 +20,10 @@ DIR=$(mktemp -d -t "kuberay-XXXXXX")
pushd "$DIR" || exit
git clone https://github.com/ray-project/kuberay/
pushd "kuberay" || exit
# If you change the KubeRay CRD, you need to update this commit to point
# to the new CRD. The following always need to be compatible: the CRDs in
# use, the Docker image of the KubeRay operator, and the KuberayNodeProvider.
# This is normally not a problem, since the KuberayNodeProvider uses a
# stable part of the CRD definition, and the KubeRay operator and the
# CRDs live in the same repo (https://github.com/ray-project/kuberay/),
# so they get updated together. It is important to keep this in mind when
# making changes. The CRD is designed to be stable, so one operator can
# run many different versions of Ray.
git checkout 69ecfceef5c966193ab87f22a9f49250b17e35fb
# Here is where we specify the Docker image that is used for the operator.
# If you want to use your own version of KubeRay, you should change the
# contents of kuberay-autoscaler.patch to point to your operator.
# This would normally be better done with kustomize, but we don't want to
# make kustomize a dependency for running this script.
git apply "$SCRIPT_DIR/kuberay-autoscaler.patch"
git checkout "$KUBERAY_SHA$"
pushd ray-operator/config/default || exit
kustomize edit set image kuberay/operator=kuberay/operator:"$OPERATOR_TAG"
popd || exit
cp -r ray-operator/config "$SCRIPT_DIR/"
popd || exit
popd || exit

View file

@@ -5,12 +5,12 @@ from ray.autoscaler._private.kuberay.autoscaling_config import _generate_provider_config
@ray.remote
def count_non_terminated_nodes() -> int:
"""Get the count of non terminated nodes for the Ray cluster raycluster-complete
"""Get the count of non terminated nodes for the Ray cluster raycluster-autoscaler
in namespace default.
"""
provider_config = _generate_provider_config(ray_cluster_namespace="default")
kuberay_node_provider = _get_node_provider(
provider_config=provider_config, cluster_name="raycluster-complete"
provider_config=provider_config, cluster_name="raycluster-autoscaler"
)
nodes = kuberay_node_provider.non_terminated_nodes({})
return len(nodes)
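
For context, this remote task would typically be invoked from a driver connected to the cluster, along these lines (a usage sketch, not part of this diff):

    import ray

    ray.init(address="auto")  # connect to the existing Ray cluster
    num_nodes = ray.get(count_non_terminated_nodes.remote())
    print(f"Non-terminated nodes: {num_nodes}")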

View file

@@ -46,7 +46,7 @@ def _get_basic_autoscaling_config() -> dict:
"disable_launch_config_check": True,
"foreground_node_launch": True,
"worker_liveness_check": False,
"worker_rpc_drain": False,
"worker_rpc_drain": True,
"namespace": "default",
"type": "kuberay",
},

View file

@@ -46,12 +46,14 @@ logger.info(f"Using image `{AUTOSCALER_IMAGE}` for Autoscaler containers.")
logger.info(f"Using pull policy `{PULL_POLICY}` for all images.")
# Path to the example config, relative to RAY_PARENT
EXAMPLE_CLUSTER_PATH = "ray/python/ray/autoscaler/kuberay/ray-cluster.complete.yaml"
EXAMPLE_CLUSTER_PATH = (
"ray/python/ray/autoscaler/kuberay/config/samples/ray-cluster.autoscaler.yaml"
)
HEAD_SERVICE = "raycluster-complete-head-svc"
HEAD_POD_PREFIX = "raycluster-complete-head"
CPU_WORKER_PREFIX = "raycluster-complete-worker-small-group"
RAY_CLUSTER_NAME = "raycluster-complete"
HEAD_SERVICE = "raycluster-autoscaler-head-svc"
HEAD_POD_PREFIX = "raycluster-autoscaler-head"
CPU_WORKER_PREFIX = "raycluster-autoscaler-worker-small-group"
RAY_CLUSTER_NAME = "raycluster-autoscaler"
RAY_CLUSTER_NAMESPACE = "default"
@@ -75,11 +77,19 @@ class KubeRayAutoscalingTest(unittest.TestCase):
with open(EXAMPLE_CLUSTER_PATH) as ray_cr_config_file:
ray_cr_config_str = ray_cr_config_file.read()
config = yaml.safe_load(ray_cr_config_str)
head_group = config["spec"]["headGroupSpec"]
head_group["rayStartParams"][
"resources"
] = '"{\\"Custom1\\": 1, \\"Custom2\\": 5}"'
cpu_group = config["spec"]["workerGroupSpecs"][0]
cpu_group["replicas"] = cpu_replicas
cpu_group["minReplicas"] = min_replicas
# Keep maxReplicas big throughout the test.
cpu_group["maxReplicas"] = 300
cpu_group["rayStartParams"][
"resources"
] = '"{\\"Custom1\\": 1, \\"Custom2\\": 5}"'
# Add a GPU-annotated group.
# (We're not using real GPUs, just adding a GPU annotation for the autoscaler
@@ -100,7 +110,8 @@
ray_container = containers[0]
# Confirm the first container in the example config is the Ray container.
assert ray_container["name"] in ["ray-head", "ray-worker"]
assert ray_container["name"] in ["ray-head", "machine-learning"]
# ("machine-learning" is the name of the worker Ray container)
ray_container["image"] = RAY_IMAGE