diff --git a/ci/k8s/prep-k8s-environment.sh b/ci/k8s/prep-k8s-environment.sh
index 2f62d9d43..125ee4eaa 100755
--- a/ci/k8s/prep-k8s-environment.sh
+++ b/ci/k8s/prep-k8s-environment.sh
@@ -16,6 +16,9 @@ chmod +x kubectl
 mv ./kubectl /usr/bin/kubectl
 kubectl version --client
 
+curl -s "https://raw.githubusercontent.com/kubernetes-sigs/kustomize/master/hack/install_kustomize.sh" | bash
+mv ./kustomize /usr/bin/kustomize
+
 # Delete dangling clusters
 kind delete clusters --all
diff --git a/python/ray/autoscaler/_private/kuberay/autoscaling_config.py b/python/ray/autoscaler/_private/kuberay/autoscaling_config.py
index 29da2eb4e..29c0f0141 100644
--- a/python/ray/autoscaler/_private/kuberay/autoscaling_config.py
+++ b/python/ray/autoscaler/_private/kuberay/autoscaling_config.py
@@ -157,7 +157,15 @@ def _generate_provider_config(ray_cluster_namespace: str) -> Dict[str, Any]:
         DISABLE_LAUNCH_CONFIG_CHECK_KEY: True,
         FOREGROUND_NODE_LAUNCH_KEY: True,
         WORKER_LIVENESS_CHECK_KEY: False,
-        WORKER_RPC_DRAIN_KEY: False,
+        # For the time being, we are letting the autoscaler drain nodes,
+        # so the following setting is set to True (the default value).
+        # We are observing that with the flag set to False, the GCS may
+        # not be properly notified of node downscaling.
+        # TODO: Solve this issue and flip the key back to False. Otherwise we
+        # may have a race condition in which the autoscaler kills the Ray
+        # container, Kubernetes recreates it, and then KubeRay deletes the
+        # pod, killing the container again.
+        WORKER_RPC_DRAIN_KEY: True,
     }
@@ -349,12 +357,10 @@ def _get_custom_resources(
 ) -> Dict[str, int]:
     """Format custom resources based on the `resources` Ray start param.
 
-    For the current prototype, the value of the `resources` field must
+    Currently, the value of the `resources` field must
     be formatted as follows: '"{\"Custom1\": 1, \"Custom2\": 5}"'.
 
-    We intend to provide a better interface soon.
-
     This method first converts the input to a correctly formatted json
     string and then loads that json string to a dict.
     """
diff --git a/python/ray/autoscaler/_private/kuberay/node_provider.py b/python/ray/autoscaler/_private/kuberay/node_provider.py
index 35bc9d9e9..eec2c68f4 100644
--- a/python/ray/autoscaler/_private/kuberay/node_provider.py
+++ b/python/ray/autoscaler/_private/kuberay/node_provider.py
@@ -178,8 +178,8 @@ class KuberayNodeProvider(NodeProvider):  # type: ignore
             provider_config.get(WORKER_LIVENESS_CHECK_KEY, True) is False
         ), f"To use KuberayNodeProvider, must set `{WORKER_LIVENESS_CHECK_KEY}:False`."
         assert (
-            provider_config.get(WORKER_RPC_DRAIN_KEY, True) is False
-        ), f"To use KuberayNodeProvider, must set `{WORKER_RPC_DRAIN_KEY}:False`."
+            provider_config.get(WORKER_RPC_DRAIN_KEY, False) is True
+        ), f"To use KuberayNodeProvider, must set `{WORKER_RPC_DRAIN_KEY}:True`."
         provider_exists = True
 
         super().__init__(provider_config, cluster_name)
diff --git a/python/ray/autoscaler/kuberay/init-config.sh b/python/ray/autoscaler/kuberay/init-config.sh
index 6e67387d0..76ee0ce79 100755
--- a/python/ray/autoscaler/kuberay/init-config.sh
+++ b/python/ray/autoscaler/kuberay/init-config.sh
@@ -3,6 +3,16 @@
 # Clone pinned Kuberay commit to temporary directory, copy the CRD definitions
 # into the autoscaler folder.
 
+KUBERAY_SHA="ce84f0441c991eb4b0f52ee2cd85c0a5ac048d11"
+OPERATOR_TAG=${KUBERAY_SHA:0:7}
+
+# Requires kustomize (dependency to be removed after the KubeRay 0.3.0 cut).
+if ! command -v kustomize &> /dev/null
+then
+    echo "Please install kustomize. Then re-run this script."
+    exit 1
+fi
+
 SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
 DIR=$(mktemp -d -t "kuberay-XXXXXX")
 
@@ -10,22 +20,10 @@ DIR=$(mktemp -d -t "kuberay-XXXXXX")
 pushd "$DIR" || exit
     git clone https://github.com/ray-project/kuberay/
     pushd "kuberay" || exit
-        # If you changed the Kuberay CRD, you need to update this commit to point
-        # to the new CRD. The following always need to be compatible: The used CRDs,
-        # the docker image of the Kuberay operator and the KuberayNodeProvider.
-        # This is normally not a problem since the KuberayNodeProvider uses a
-        # stable part of the CRD definition and the Kuberay operator and the
-        # CRDs are in the https://github.com/ray-project/kuberay/ so they
-        # get updated together. It is important to keep this in mind when making
-        # changes. The CRD is designed to be stable so one operator can run many
-        # different versions of Ray.
-        git checkout 69ecfceef5c966193ab87f22a9f49250b17e35fb
-        # Here is where we specify the docker image that is used for the operator.
-        # If you want to use your own version of Kuberay, you should change the content
-        # of kuberay-autoscaler.patch to point to your operator.
-        # This would normally better be done with kustomization, but we don't want to make
-        # kustomization a dependency for running this.
-        git apply "$SCRIPT_DIR/kuberay-autoscaler.patch"
+        git checkout "$KUBERAY_SHA"
+        pushd ray-operator/config/default || exit
+            kustomize edit set image kuberay/operator=kuberay/operator:"$OPERATOR_TAG"
+        popd || exit
         cp -r ray-operator/config "$SCRIPT_DIR/"
     popd || exit
 popd || exit
diff --git a/python/ray/tests/kuberay/scripts/non_terminated_nodes_count.py b/python/ray/tests/kuberay/scripts/non_terminated_nodes_count.py
index e48682d4a..86f55f67a 100644
--- a/python/ray/tests/kuberay/scripts/non_terminated_nodes_count.py
+++ b/python/ray/tests/kuberay/scripts/non_terminated_nodes_count.py
@@ -5,12 +5,12 @@ from ray.autoscaler._private.kuberay.autoscaling_config import _generate_provide
 
 @ray.remote
 def count_non_terminated_nodes() -> int:
-    """Get the count of non terminated nodes for the Ray cluster raycluster-complete
+    """Get the count of non-terminated nodes for the Ray cluster raycluster-autoscaler
     in namespace default.
""" provider_config = _generate_provider_config(ray_cluster_namespace="default") kuberay_node_provider = _get_node_provider( - provider_config=provider_config, cluster_name="raycluster-complete" + provider_config=provider_config, cluster_name="raycluster-autoscaler" ) nodes = kuberay_node_provider.non_terminated_nodes({}) return len(nodes) diff --git a/python/ray/tests/kuberay/test_autoscaling_config.py b/python/ray/tests/kuberay/test_autoscaling_config.py index 7bbdd7377..f6d4b8121 100644 --- a/python/ray/tests/kuberay/test_autoscaling_config.py +++ b/python/ray/tests/kuberay/test_autoscaling_config.py @@ -46,7 +46,7 @@ def _get_basic_autoscaling_config() -> dict: "disable_launch_config_check": True, "foreground_node_launch": True, "worker_liveness_check": False, - "worker_rpc_drain": False, + "worker_rpc_drain": True, "namespace": "default", "type": "kuberay", }, diff --git a/python/ray/tests/kuberay/test_autoscaling_e2e.py b/python/ray/tests/kuberay/test_autoscaling_e2e.py index 5dee695c1..ed2a2e64b 100644 --- a/python/ray/tests/kuberay/test_autoscaling_e2e.py +++ b/python/ray/tests/kuberay/test_autoscaling_e2e.py @@ -46,12 +46,14 @@ logger.info(f"Using image `{AUTOSCALER_IMAGE}` for Autoscaler containers.") logger.info(f"Using pull policy `{PULL_POLICY}` for all images.") # Path to example config rel RAY_PARENT -EXAMPLE_CLUSTER_PATH = "ray/python/ray/autoscaler/kuberay/ray-cluster.complete.yaml" +EXAMPLE_CLUSTER_PATH = ( + "ray/python/ray/autoscaler/kuberay/config/samples/ray-cluster.autoscaler.yaml" +) -HEAD_SERVICE = "raycluster-complete-head-svc" -HEAD_POD_PREFIX = "raycluster-complete-head" -CPU_WORKER_PREFIX = "raycluster-complete-worker-small-group" -RAY_CLUSTER_NAME = "raycluster-complete" +HEAD_SERVICE = "raycluster-autoscaler-head-svc" +HEAD_POD_PREFIX = "raycluster-autoscaler-head" +CPU_WORKER_PREFIX = "raycluster-autoscaler-worker-small-group" +RAY_CLUSTER_NAME = "raycluster-autoscaler" RAY_CLUSTER_NAMESPACE = "default" @@ -75,11 +77,19 @@ class KubeRayAutoscalingTest(unittest.TestCase): with open(EXAMPLE_CLUSTER_PATH) as ray_cr_config_file: ray_cr_config_str = ray_cr_config_file.read() config = yaml.safe_load(ray_cr_config_str) + head_group = config["spec"]["headGroupSpec"] + head_group["rayStartParams"][ + "resources" + ] = '"{\\"Custom1\\": 1, \\"Custom2\\": 5}"' + cpu_group = config["spec"]["workerGroupSpecs"][0] cpu_group["replicas"] = cpu_replicas cpu_group["minReplicas"] = min_replicas # Keep maxReplicas big throughout the test. cpu_group["maxReplicas"] = 300 + cpu_group["rayStartParams"][ + "resources" + ] = '"{\\"Custom1\\": 1, \\"Custom2\\": 5}"' # Add a GPU-annotated group. # (We're not using real GPUs, just adding a GPU annotation for the autoscaler @@ -100,7 +110,8 @@ class KubeRayAutoscalingTest(unittest.TestCase): ray_container = containers[0] # Confirm the first container in the example config is the Ray container. - assert ray_container["name"] in ["ray-head", "ray-worker"] + assert ray_container["name"] in ["ray-head", "machine-learning"] + # ("machine-learning" is the name of the worker Ray container) ray_container["image"] = RAY_IMAGE