[kuberay] Update KubeRay operator commit, turn autoscaler RPC drain back on (#27077)
This PR:
- Updates the KubeRay operator commit used in the Ray CI autoscaling test.
- Uses the RayCluster autoscaling sample config from the KubeRay repo in place of a config from the Ray repo.
- Turns the autoscaler RPC worker drain back on, as I saw some dead node messages from the GCS, and the RPC drain is supposed to avoid those.

Signed-off-by: Dmitri Gekhtman <dmitri.m.gekhtman@gmail.com>
Parent: 34d0233aba
Commit: c4a259828b
7 changed files with 49 additions and 31 deletions
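To make the drain rationale concrete, here is a minimal, hypothetical sketch (not Ray's actual code; `FakeGcs`, `scale_down`, and the log strings are invented for illustration) of the behavior the drain flag controls: when the RPC drain is on, the GCS is told that a worker is being removed intentionally before its pod is deleted, so it does not later report the node as unexpectedly dead.

```python
class FakeGcs:
    """Toy stand-in for the GCS's view of cluster nodes."""

    def __init__(self) -> None:
        self.draining: set[str] = set()

    def drain_node(self, node_id: str) -> None:
        # The autoscaler announces an intentional removal.
        self.draining.add(node_id)

    def on_node_disconnected(self, node_id: str) -> None:
        if node_id in self.draining:
            print(f"GCS: node {node_id} drained cleanly")
        else:
            # The "dead node" messages the commit message refers to.
            print(f"GCS: node {node_id} died unexpectedly")


def scale_down(gcs: FakeGcs, node_id: str, worker_rpc_drain: bool) -> None:
    if worker_rpc_drain:  # corresponds to WORKER_RPC_DRAIN_KEY: True
        gcs.drain_node(node_id)
    gcs.on_node_disconnected(node_id)  # pod deletion disconnects the node


gcs = FakeGcs()
scale_down(gcs, "worker-1", worker_rpc_drain=True)   # drained cleanly
scale_down(gcs, "worker-2", worker_rpc_drain=False)  # died unexpectedly
```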
@@ -16,6 +16,9 @@ chmod +x kubectl
 mv ./kubectl /usr/bin/kubectl
 kubectl version --client
 
+curl -s "https://raw.githubusercontent.com/kubernetes-sigs/kustomize/master/hack/install_kustomize.sh" | bash
+mv ./kustomize /usr/bin/kustomize
+
 # Delete dangling clusters
 kind delete clusters --all
 
@@ -157,7 +157,15 @@ def _generate_provider_config(ray_cluster_namespace: str) -> Dict[str, Any]:
         DISABLE_LAUNCH_CONFIG_CHECK_KEY: True,
         FOREGROUND_NODE_LAUNCH_KEY: True,
         WORKER_LIVENESS_CHECK_KEY: False,
-        WORKER_RPC_DRAIN_KEY: False,
+        # For the time being we are letting the autoscaler drain nodes,
+        # hence the following setting is set to True (the default value).
+        # This is because we are observing that with the flag set to False,
+        # the GCS may not be properly notified of node downscaling.
+        # TODO: Solve this issue and flip the key back to False -- else we may
+        # have a race condition in which the autoscaler kills the Ray container,
+        # Kubernetes recreates it,
+        # and then KubeRay deletes the pod, killing the container again.
+        WORKER_RPC_DRAIN_KEY: True,
     }
 
 
@@ -349,12 +357,10 @@ def _get_custom_resources(
 ) -> Dict[str, int]:
     """Format custom resources based on the `resources` Ray start param.
 
-    For the current prototype, the value of the `resources` field must
+    Currently, the value of the `resources` field must
     be formatted as follows:
     '"{\"Custom1\": 1, \"Custom2\": 5}"'.
 
-    We intend to provide a better interface soon.
-
     This method first converts the input to a correctly formatted
     json string and then loads that json string to a dict.
     """
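The doubly-escaped format in that docstring is easiest to see in a small sketch. `parse_custom_resources` below is a hypothetical helper (not the Ray function) that follows the docstring's two steps: recover a well-formed JSON string, then load it into a dict.

```python
import json


def parse_custom_resources(resources_param: str) -> dict:
    # resources_param arrives as: "{\"Custom1\": 1, \"Custom2\": 5}"
    # i.e. wrapped in literal double quotes, with the inner quotes escaped.
    json_str = resources_param.strip('"').replace('\\"', '"')
    return json.loads(json_str)


# The escaped form from the docstring parses to a plain dict:
print(parse_custom_resources('"{\\"Custom1\\": 1, \\"Custom2\\": 5}"'))
# -> {'Custom1': 1, 'Custom2': 5}
```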
@@ -178,8 +178,8 @@ class KuberayNodeProvider(NodeProvider):  # type: ignore
             provider_config.get(WORKER_LIVENESS_CHECK_KEY, True) is False
         ), f"To use KuberayNodeProvider, must set `{WORKER_LIVENESS_CHECK_KEY}:False`."
         assert (
-            provider_config.get(WORKER_RPC_DRAIN_KEY, True) is False
-        ), f"To use KuberayNodeProvider, must set `{WORKER_RPC_DRAIN_KEY}:False`."
+            provider_config.get(WORKER_RPC_DRAIN_KEY, False) is True
+        ), f"To use KuberayNodeProvider, must set `{WORKER_RPC_DRAIN_KEY}:True`."
         provider_exists = True
 
         super().__init__(provider_config, cluster_name)
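The flipped assertion pairs with the config change above: the provider now requires the drain to be on and the liveness check off. As a standalone illustration of the guard these assertions implement (a hypothetical helper, not the actual KuberayNodeProvider code; the lowercase keys mirror the provider-config fields in the test hunk further below):

```python
REQUIRED_PROVIDER_FLAGS = {
    "worker_liveness_check": False,  # KubeRay, not the autoscaler, restarts pods
    "worker_rpc_drain": True,        # drain via GCS RPC before pod deletion
}


def check_provider_config(provider_config: dict) -> None:
    # Hypothetical stand-in for the assertions in KuberayNodeProvider.__init__.
    for key, expected in REQUIRED_PROVIDER_FLAGS.items():
        assert provider_config.get(key) is expected, (
            f"To use KuberayNodeProvider, must set `{key}:{expected}`."
        )


check_provider_config({"worker_liveness_check": False, "worker_rpc_drain": True})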
@@ -3,6 +3,16 @@
 # Clone pinned Kuberay commit to temporary directory, copy the CRD definitions
 # into the autoscaler folder.
 
+KUBERAY_SHA="ce84f0441c991eb4b0f52ee2cd85c0a5ac048d11"
+OPERATOR_TAG=${KUBERAY_SHA:0:7}
+
+# Requires Kustomize (dependency to be removed after KubeRay 0.3.0 cut)
+if ! command -v kustomize &> /dev/null
+then
+    echo "Please install kustomize. Then re-run this script."
+    exit
+fi
+
 SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
 
 DIR=$(mktemp -d -t "kuberay-XXXXXX")
@@ -10,22 +20,10 @@ DIR=$(mktemp -d -t "kuberay-XXXXXX")
 pushd "$DIR" || exit
     git clone https://github.com/ray-project/kuberay/
     pushd "kuberay" || exit
-        # If you changed the Kuberay CRD, you need to update this commit to point
-        # to the new CRD. The following always need to be compatible: the used CRDs,
-        # the docker image of the Kuberay operator and the KuberayNodeProvider.
-        # This is normally not a problem since the KuberayNodeProvider uses a
-        # stable part of the CRD definition and the Kuberay operator and the
-        # CRDs are in the https://github.com/ray-project/kuberay/ repo, so they
-        # get updated together. It is important to keep this in mind when making
-        # changes. The CRD is designed to be stable so one operator can run many
-        # different versions of Ray.
-        git checkout 69ecfceef5c966193ab87f22a9f49250b17e35fb
-        # Here is where we specify the docker image that is used for the operator.
-        # If you want to use your own version of Kuberay, you should change the content
-        # of kuberay-autoscaler.patch to point to your operator.
-        # This would normally better be done with kustomization, but we don't want to make
-        # kustomization a dependency for running this.
-        git apply "$SCRIPT_DIR/kuberay-autoscaler.patch"
+        git checkout "$KUBERAY_SHA"
+        pushd ray-operator/config/default || exit
+            kustomize edit set image kuberay/operator=kuberay/operator:"$OPERATOR_TAG"
+        popd || exit
         cp -r ray-operator/config "$SCRIPT_DIR/"
     popd || exit
 popd || exit
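One detail worth noting: the operator image tag is derived from the pinned SHA with the Bash substring expansion `${KUBERAY_SHA:0:7}`. The equivalent in Python, for clarity:

```python
KUBERAY_SHA = "ce84f0441c991eb4b0f52ee2cd85c0a5ac048d11"
OPERATOR_TAG = KUBERAY_SHA[:7]  # first 7 characters, like Bash ${KUBERAY_SHA:0:7}
print(f"kuberay/operator:{OPERATOR_TAG}")  # -> kuberay/operator:ce84f04
```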
@@ -5,12 +5,12 @@ from ray.autoscaler._private.kuberay.autoscaling_config import _generate_provider_config
 
 @ray.remote
 def count_non_terminated_nodes() -> int:
-    """Get the count of non terminated nodes for the Ray cluster raycluster-complete
+    """Get the count of non terminated nodes for the Ray cluster raycluster-autoscaler
     in namespace default.
     """
     provider_config = _generate_provider_config(ray_cluster_namespace="default")
     kuberay_node_provider = _get_node_provider(
-        provider_config=provider_config, cluster_name="raycluster-complete"
+        provider_config=provider_config, cluster_name="raycluster-autoscaler"
     )
     nodes = kuberay_node_provider.non_terminated_nodes({})
     return len(nodes)
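For context, `count_non_terminated_nodes` is a Ray task, so a driver invokes it with `.remote()` and fetches the result with `ray.get()`. A minimal driver snippet (assuming the task above is defined or imported in the same module):

```python
import ray

ray.init()
# Run the task defined above and fetch its integer result.
num_nodes = ray.get(count_non_terminated_nodes.remote())
print(f"Non-terminated nodes: {num_nodes}")
```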
@@ -46,7 +46,7 @@ def _get_basic_autoscaling_config() -> dict:
         "disable_launch_config_check": True,
         "foreground_node_launch": True,
         "worker_liveness_check": False,
-        "worker_rpc_drain": False,
+        "worker_rpc_drain": True,
         "namespace": "default",
         "type": "kuberay",
     },
@@ -46,12 +46,14 @@ logger.info(f"Using image `{AUTOSCALER_IMAGE}` for Autoscaler containers.")
 logger.info(f"Using pull policy `{PULL_POLICY}` for all images.")
 
 # Path to example config rel RAY_PARENT
-EXAMPLE_CLUSTER_PATH = "ray/python/ray/autoscaler/kuberay/ray-cluster.complete.yaml"
+EXAMPLE_CLUSTER_PATH = (
+    "ray/python/ray/autoscaler/kuberay/config/samples/ray-cluster.autoscaler.yaml"
+)
 
-HEAD_SERVICE = "raycluster-complete-head-svc"
-HEAD_POD_PREFIX = "raycluster-complete-head"
-CPU_WORKER_PREFIX = "raycluster-complete-worker-small-group"
-RAY_CLUSTER_NAME = "raycluster-complete"
+HEAD_SERVICE = "raycluster-autoscaler-head-svc"
+HEAD_POD_PREFIX = "raycluster-autoscaler-head"
+CPU_WORKER_PREFIX = "raycluster-autoscaler-worker-small-group"
+RAY_CLUSTER_NAME = "raycluster-autoscaler"
 RAY_CLUSTER_NAMESPACE = "default"
 
 
@@ -75,11 +77,19 @@ class KubeRayAutoscalingTest(unittest.TestCase):
         with open(EXAMPLE_CLUSTER_PATH) as ray_cr_config_file:
             ray_cr_config_str = ray_cr_config_file.read()
         config = yaml.safe_load(ray_cr_config_str)
+        head_group = config["spec"]["headGroupSpec"]
+        head_group["rayStartParams"][
+            "resources"
+        ] = '"{\\"Custom1\\": 1, \\"Custom2\\": 5}"'
+
         cpu_group = config["spec"]["workerGroupSpecs"][0]
         cpu_group["replicas"] = cpu_replicas
         cpu_group["minReplicas"] = min_replicas
         # Keep maxReplicas big throughout the test.
         cpu_group["maxReplicas"] = 300
+        cpu_group["rayStartParams"][
+            "resources"
+        ] = '"{\\"Custom1\\": 1, \\"Custom2\\": 5}"'
 
         # Add a GPU-annotated group.
         # (We're not using real GPUs, just adding a GPU annotation for the autoscaler
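The doubly-escaped `resources` value the test assigns can be generated rather than hand-written: JSON-encoding twice produces exactly that string. A small sanity check (illustrative, not part of the test):

```python
import json

resources = {"Custom1": 1, "Custom2": 5}
# Encoding twice yields the quoted-and-escaped form used in rayStartParams.
value = json.dumps(json.dumps(resources))
assert value == '"{\\"Custom1\\": 1, \\"Custom2\\": 5}"'
```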
@@ -100,7 +110,8 @@ class KubeRayAutoscalingTest(unittest.TestCase):
 
         ray_container = containers[0]
         # Confirm the first container in the example config is the Ray container.
-        assert ray_container["name"] in ["ray-head", "ray-worker"]
+        assert ray_container["name"] in ["ray-head", "machine-learning"]
+        # ("machine-learning" is the name of the worker Ray container)
 
         ray_container["image"] = RAY_IMAGE
 