mirror of
https://github.com/vale981/ray
synced 2025-03-09 04:46:38 -04:00
[kubernetes][test] Operator test tweaks. (#15074)
This commit is contained in:
parent
8de66fce3d
commit
6f81ec1998
1 changed file with 27 additions and 5 deletions
|
@ -14,9 +14,13 @@ import yaml
|
||||||
|
|
||||||
IMAGE_ENV = "KUBERNETES_OPERATOR_TEST_IMAGE"
|
IMAGE_ENV = "KUBERNETES_OPERATOR_TEST_IMAGE"
|
||||||
IMAGE = os.getenv(IMAGE_ENV, "rayproject/ray:nightly")
|
IMAGE = os.getenv(IMAGE_ENV, "rayproject/ray:nightly")
|
||||||
|
|
||||||
NAMESPACE_ENV = "KUBERNETES_OPERATOR_TEST_NAMESPACE"
|
NAMESPACE_ENV = "KUBERNETES_OPERATOR_TEST_NAMESPACE"
|
||||||
NAMESPACE = os.getenv(NAMESPACE_ENV, "test-k8s-operator")
|
NAMESPACE = os.getenv(NAMESPACE_ENV, "test-k8s-operator")
|
||||||
|
|
||||||
|
PULL_POLICY_ENV = "KUBERNETES_OPERATOR_TEST_PULL_POLICY"
|
||||||
|
PULL_POLICY = os.getenv(PULL_POLICY_ENV, "Always")
|
||||||
|
|
||||||
RAY_PATH = os.path.abspath(
|
RAY_PATH = os.path.abspath(
|
||||||
os.path.dirname(
|
os.path.dirname(
|
||||||
os.path.dirname(os.path.dirname(os.path.dirname(__file__)))))
|
os.path.dirname(os.path.dirname(os.path.dirname(__file__)))))
|
||||||
|
@ -57,13 +61,22 @@ def wait_for_logs():
|
||||||
|
|
||||||
@retry_until_true
|
@retry_until_true
|
||||||
def wait_for_job(job_pod):
|
def wait_for_job(job_pod):
|
||||||
|
print(">>>Checking job logs.")
|
||||||
cmd = f"kubectl -n {NAMESPACE} logs {job_pod}"
|
cmd = f"kubectl -n {NAMESPACE} logs {job_pod}"
|
||||||
try:
|
try:
|
||||||
out = subprocess.check_output(cmd, shell=True).decode()
|
out = subprocess.check_output(
|
||||||
|
cmd, shell=True, stderr=subprocess.STDOUT).decode()
|
||||||
except subprocess.CalledProcessError as e:
|
except subprocess.CalledProcessError as e:
|
||||||
|
print(">>>Failed to check job logs.")
|
||||||
print(e.output.decode())
|
print(e.output.decode())
|
||||||
raise (e)
|
raise e
|
||||||
return ("success" in out.lower())
|
success = "success" in out.lower()
|
||||||
|
if success:
|
||||||
|
print(">>>Job submission succeeded.")
|
||||||
|
else:
|
||||||
|
print(">>>Job logs do not indicate job success:")
|
||||||
|
print(out)
|
||||||
|
return success
|
||||||
|
|
||||||
|
|
||||||
def kubernetes_configs_directory():
|
def kubernetes_configs_directory():
|
||||||
|
@ -116,7 +129,7 @@ class KubernetesOperatorTest(unittest.TestCase):
|
||||||
] + [podType["podConfig"]["spec"] for podType in podTypes2])
|
] + [podType["podConfig"]["spec"] for podType in podTypes2])
|
||||||
for pod_spec in pod_specs:
|
for pod_spec in pod_specs:
|
||||||
pod_spec["containers"][0]["image"] = IMAGE
|
pod_spec["containers"][0]["image"] = IMAGE
|
||||||
pod_spec["containers"][0]["imagePullPolicy"] = "IfNotPresent"
|
pod_spec["containers"][0]["imagePullPolicy"] = PULL_POLICY
|
||||||
|
|
||||||
# Dump to temporary files
|
# Dump to temporary files
|
||||||
yaml.dump(example_cluster_config, example_cluster_file)
|
yaml.dump(example_cluster_config, example_cluster_file)
|
||||||
|
@ -130,27 +143,33 @@ class KubernetesOperatorTest(unittest.TestCase):
|
||||||
file.flush()
|
file.flush()
|
||||||
|
|
||||||
# Start operator and two clusters
|
# Start operator and two clusters
|
||||||
|
print(">>>Starting operator and two clusters.")
|
||||||
for file in files:
|
for file in files:
|
||||||
cmd = f"kubectl -n {NAMESPACE} apply -f {file.name}"
|
cmd = f"kubectl -n {NAMESPACE} apply -f {file.name}"
|
||||||
subprocess.check_call(cmd, shell=True)
|
subprocess.check_call(cmd, shell=True)
|
||||||
|
|
||||||
# Check that autoscaling respects minWorkers by waiting for
|
# Check that autoscaling respects minWorkers by waiting for
|
||||||
# six pods in the namespace.
|
# six pods in the namespace.
|
||||||
|
print(">>>Waiting for pods to join clusters.")
|
||||||
wait_for_pods(6)
|
wait_for_pods(6)
|
||||||
|
|
||||||
# Check that logging output looks normal (two workers connected to
|
# Check that logging output looks normal (two workers connected to
|
||||||
# ray cluster example-cluster.)
|
# ray cluster example-cluster.)
|
||||||
|
print(">>>Checking monitor logs for head and workers.")
|
||||||
wait_for_logs()
|
wait_for_logs()
|
||||||
|
|
||||||
# Delete the second cluster
|
# Delete the second cluster
|
||||||
|
print(">>>Deleting example-cluster2.")
|
||||||
cmd = f"kubectl -n {NAMESPACE} delete -f"\
|
cmd = f"kubectl -n {NAMESPACE} delete -f"\
|
||||||
f"{example_cluster2_file.name}"
|
f"{example_cluster2_file.name}"
|
||||||
subprocess.check_call(cmd, shell=True)
|
subprocess.check_call(cmd, shell=True)
|
||||||
|
|
||||||
# Four pods remain
|
# Four pods remain
|
||||||
|
print(">>>Checking that example-cluster2 pods are gone.")
|
||||||
wait_for_pods(4)
|
wait_for_pods(4)
|
||||||
|
|
||||||
# Check job submission
|
# Check job submission
|
||||||
|
print(">>>Submitting a job to test Ray client connection.")
|
||||||
cmd = f"kubectl -n {NAMESPACE} create -f {job_file.name}"
|
cmd = f"kubectl -n {NAMESPACE} create -f {job_file.name}"
|
||||||
subprocess.check_call(cmd, shell=True)
|
subprocess.check_call(cmd, shell=True)
|
||||||
|
|
||||||
|
@ -165,21 +184,24 @@ class KubernetesOperatorTest(unittest.TestCase):
|
||||||
|
|
||||||
# Check that cluster updates work: increase minWorkers to 3
|
# Check that cluster updates work: increase minWorkers to 3
|
||||||
# and check that one worker is created.
|
# and check that one worker is created.
|
||||||
|
print(">>>Updating cluster size.")
|
||||||
example_cluster_edit = copy.deepcopy(example_cluster_config)
|
example_cluster_edit = copy.deepcopy(example_cluster_config)
|
||||||
example_cluster_edit["spec"]["podTypes"][1]["minWorkers"] = 3
|
example_cluster_edit["spec"]["podTypes"][1]["minWorkers"] = 3
|
||||||
yaml.dump(example_cluster_edit, example_cluster_file)
|
yaml.dump(example_cluster_edit, example_cluster_file)
|
||||||
example_cluster_file.flush()
|
example_cluster_file.flush()
|
||||||
cm = f"kubectl -n {NAMESPACE} apply -f {example_cluster_file.name}"
|
cm = f"kubectl -n {NAMESPACE} apply -f {example_cluster_file.name}"
|
||||||
subprocess.check_call(cm, shell=True)
|
subprocess.check_call(cm, shell=True)
|
||||||
|
print(">>>Checking that new cluster size is respected.")
|
||||||
wait_for_pods(5)
|
wait_for_pods(5)
|
||||||
|
|
||||||
# Delete the first cluster
|
# Delete the first cluster
|
||||||
|
print(">>>Deleting first cluster.")
|
||||||
cmd = f"kubectl -n {NAMESPACE} delete -f"\
|
cmd = f"kubectl -n {NAMESPACE} delete -f"\
|
||||||
f"{example_cluster_file.name}"
|
f"{example_cluster_file.name}"
|
||||||
subprocess.check_call(cmd, shell=True)
|
subprocess.check_call(cmd, shell=True)
|
||||||
|
|
||||||
# Only operator pod remains.
|
# Only operator pod remains.
|
||||||
|
print(">>>Checking that all Ray cluster pods are gone.")
|
||||||
wait_for_pods(1)
|
wait_for_pods(1)
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Add table
Reference in a new issue