mirror of
https://github.com/vale981/ray
synced 2025-03-06 02:21:39 -05:00
[ci/release] Start cluster before connecting via anyscale connect (#18878)
This commit is contained in:
parent
d52203ee03
commit
e08d4253cf
2 changed files with 69 additions and 66 deletions
|
@ -802,9 +802,7 @@ def run_job(cluster_name: str, compute_tpl_name: str, cluster_env_name: str,
|
|||
script_args: List[str], env_vars: Dict[str, str],
|
||||
autosuspend: int) -> Tuple[int, str]:
|
||||
# Start cluster and job
|
||||
address = f"anyscale://{cluster_name}?cluster_compute={compute_tpl_name}" \
|
||||
f"&cluster_env={cluster_env_name}&autosuspend={autosuspend}" \
|
||||
"&&update=True"
|
||||
address = f"anyscale://{cluster_name}?autosuspend={autosuspend}"
|
||||
logger.info(f"Starting job {job_name} with Ray address: {address}")
|
||||
env = copy.deepcopy(os.environ)
|
||||
env.update(GLOBAL_CONFIG)
|
||||
|
@ -1304,6 +1302,7 @@ def run_test_config(
|
|||
session_url = None
|
||||
runtime = None
|
||||
anyscale.conf.CLI_TOKEN = GLOBAL_CONFIG["ANYSCALE_CLI_TOKEN"]
|
||||
test_uses_ray_connect = test_config["run"].get("use_connect")
|
||||
|
||||
session_id = None
|
||||
scd_id = None
|
||||
|
@ -1360,7 +1359,7 @@ def run_test_config(
|
|||
session_options["build_id"] = build_id
|
||||
session_options["uses_app_config"] = True
|
||||
|
||||
if not test_config["run"].get("use_connect"):
|
||||
# Start session
|
||||
session_id = create_and_wait_for_session(
|
||||
sdk=sdk,
|
||||
stop_event=stop_event,
|
||||
|
@ -1368,29 +1367,7 @@ def run_test_config(
|
|||
session_options=session_options,
|
||||
)
|
||||
|
||||
if test_config["run"].get("use_connect"):
|
||||
assert compute_tpl_name, "Compute template must exist."
|
||||
assert app_config_name, "Cluster environment must exist."
|
||||
script_args = test_config["run"].get("args", [])
|
||||
if smoke_test:
|
||||
script_args += ["--smoke-test"]
|
||||
min_workers = 0
|
||||
for node_type in compute_tpl["worker_node_types"]:
|
||||
min_workers += node_type["min_workers"]
|
||||
# Build completed, use job timeout
|
||||
result_queue.put(State("CMD_RUN", time.time(), None))
|
||||
returncode, logs = run_job(
|
||||
cluster_name=test_name,
|
||||
compute_tpl_name=compute_tpl_name,
|
||||
cluster_env_name=app_config_name,
|
||||
job_name=session_name,
|
||||
min_workers=min_workers,
|
||||
script=test_config["run"]["script"],
|
||||
script_args=script_args,
|
||||
env_vars=env_vars,
|
||||
autosuspend=autosuspend_mins)
|
||||
_process_finished_client_command(returncode, logs)
|
||||
return
|
||||
prepare_command = test_config["run"].get("prepare")
|
||||
|
||||
# Write test state json
|
||||
test_state_file = os.path.join(local_dir, "test_state.json")
|
||||
|
@ -1400,6 +1377,10 @@ def run_test_config(
|
|||
"test_name": test_name
|
||||
}, f)
|
||||
|
||||
if prepare_command or not test_uses_ray_connect:
|
||||
if test_uses_ray_connect:
|
||||
logger.info("Found a prepare command, so pushing it "
|
||||
"to the session.")
|
||||
# Rsync up
|
||||
logger.info("Syncing files to session...")
|
||||
session_controller.push(
|
||||
|
@ -1425,9 +1406,9 @@ def run_test_config(
|
|||
_check_stop(stop_event, "file_sync")
|
||||
|
||||
# Optionally run preparation command
|
||||
prepare_command = test_config["run"].get("prepare")
|
||||
if prepare_command:
|
||||
logger.info(f"Running preparation command: {prepare_command}")
|
||||
logger.info(
|
||||
f"Running preparation command: {prepare_command}")
|
||||
scd_id, result = run_session_command(
|
||||
sdk=sdk,
|
||||
session_id=session_id,
|
||||
|
@ -1442,6 +1423,28 @@ def run_test_config(
|
|||
stop_event=stop_event,
|
||||
state_str="CMD_PREPARE")
|
||||
|
||||
if test_uses_ray_connect:
|
||||
script_args = test_config["run"].get("args", [])
|
||||
if smoke_test:
|
||||
script_args += ["--smoke-test"]
|
||||
min_workers = 0
|
||||
for node_type in compute_tpl["worker_node_types"]:
|
||||
min_workers += node_type["min_workers"]
|
||||
# Build completed, use job timeout
|
||||
result_queue.put(State("CMD_RUN", time.time(), None))
|
||||
returncode, logs = run_job(
|
||||
cluster_name=session_name,
|
||||
compute_tpl_name=compute_tpl_name,
|
||||
cluster_env_name=app_config_name,
|
||||
job_name=session_name,
|
||||
min_workers=min_workers,
|
||||
script=test_config["run"]["script"],
|
||||
script_args=script_args,
|
||||
env_vars=env_vars,
|
||||
autosuspend=autosuspend_mins)
|
||||
_process_finished_client_command(returncode, logs)
|
||||
return
|
||||
|
||||
# Run release test command
|
||||
cmd_to_run = test_config["run"]["script"] + " "
|
||||
|
||||
|
|
|
@ -4,8 +4,8 @@
|
|||
compute_template: tpl_cpu_small.yaml
|
||||
|
||||
run:
|
||||
# use_connect: True
|
||||
# autosuspend_mins: 10
|
||||
use_connect: True
|
||||
autosuspend_mins: 10
|
||||
timeout: 600
|
||||
prepare: python wait_cluster.py 4 600
|
||||
script: python workloads/train_small.py
|
||||
|
|
Loading…
Add table
Reference in a new issue