mirror of
https://github.com/vale981/ray
synced 2025-03-04 17:41:43 -05:00
[autoscaler] [azure] Fix Azure Autoscaling Failures (#16640)
Co-authored-by: Scott Graham <scgraham@microsoft.com>
This commit is contained in:
parent
33e319e9d7
commit
3334357c58
14 changed files with 76 additions and 39 deletions
|
@ -1888,7 +1888,7 @@ filegroup(
|
|||
"python/ray/*.py",
|
||||
"python/ray/autoscaler/*.py",
|
||||
"python/ray/autoscaler/_private/*.py",
|
||||
"python/ray/autoscaler/_private/azure/*.json",
|
||||
"python/ray/autoscaler/_private/_azure/*.json",
|
||||
"python/ray/autoscaler/aws/defaults.yaml",
|
||||
"python/ray/autoscaler/azure/defaults.yaml",
|
||||
"python/ray/autoscaler/gcp/defaults.yaml",
|
||||
|
|
|
@ -17,7 +17,8 @@ RUN $HOME/anaconda3/bin/pip --no-cache-dir install $(basename $WHEEL_PATH)[all]
|
|||
"azure-cli-core==2.22.0" \
|
||||
"azure-mgmt-compute==14.0.0" \
|
||||
"azure-mgmt-msi==1.0.0" \
|
||||
"azure-mgmt-network==10.2.0"; fi) \
|
||||
"azure-mgmt-network==10.2.0" \
|
||||
"azure-mgmt-resource==13.0.0"; fi) \
|
||||
$(if [ $($HOME/anaconda3/bin/python -c "import sys; print(sys.version_info.minor)") != 6 ] \
|
||||
&& [ "$AUTOSCALER" = "autoscaler" ]; then echo "kopf"; fi) \
|
||||
&& $HOME/anaconda3/bin/pip uninstall ray -y && sudo rm $(basename $WHEEL_PATH)
|
||||
|
|
|
@ -4,6 +4,7 @@ from pathlib import Path
|
|||
import random
|
||||
|
||||
from azure.common.client_factory import get_client_from_cli_profile
|
||||
from azure.common.credentials import get_azure_cli_credentials
|
||||
from azure.mgmt.resource import ResourceManagementClient
|
||||
from azure.mgmt.resource.resources.models import DeploymentMode
|
||||
|
||||
|
@ -35,7 +36,10 @@ def _configure_resource_group(config):
|
|||
# https://docs.microsoft.com/en-us/azure/virtual-machines/windows/tutorial-availability-sets
|
||||
resource_client = _get_client(ResourceManagementClient, config)
|
||||
|
||||
subscription_id = resource_client.config.subscription_id
|
||||
_, cli_subscription_id = get_azure_cli_credentials(
|
||||
resource=ResourceManagementClient)
|
||||
subscription_id = config["provider"].get("subscription_id",
|
||||
cli_subscription_id)
|
||||
logger.info("Using subscription id: %s", subscription_id)
|
||||
config["provider"]["subscription_id"] = subscription_id
|
||||
|
||||
|
@ -76,7 +80,11 @@ def _configure_resource_group(config):
|
|||
}
|
||||
}
|
||||
|
||||
resource_client.deployments.create_or_update(
|
||||
if hasattr(resource_client.deployments, "create_or_update"):
|
||||
create_or_update = resource_client.deployments.create_or_update
|
||||
else:
|
||||
create_or_update = resource_client.deployments.begin_create_or_update
|
||||
create_or_update(
|
||||
resource_group_name=resource_group,
|
||||
deployment_name="ray-config",
|
||||
parameters=parameters).wait()
|
|
@ -14,7 +14,7 @@ from knack.util import CLIError
|
|||
|
||||
from ray.autoscaler.node_provider import NodeProvider
|
||||
from ray.autoscaler.tags import TAG_RAY_CLUSTER_NAME, TAG_RAY_NODE_NAME
|
||||
from ray.autoscaler._private.azure.config import bootstrap_azure
|
||||
from ray.autoscaler._private._azure.config import bootstrap_azure
|
||||
|
||||
VM_NAME_MAX_LEN = 64
|
||||
VM_NAME_UUID_LEN = 8
|
||||
|
@ -213,7 +213,11 @@ class AzureNodeProvider(NodeProvider):
|
|||
}
|
||||
|
||||
# TODO: we could get the private/public ips back directly
|
||||
self.resource_client.deployments.create_or_update(
|
||||
if hasattr(self.resource_client.deployments, "create_or_update"):
|
||||
create = self.resource_client.deployments.create_or_update
|
||||
else:
|
||||
create = self.resource_client.deployments.begin_create_or_update
|
||||
create(
|
||||
resource_group_name=resource_group,
|
||||
deployment_name="ray-vm-{}".format(name_tag),
|
||||
parameters=parameters).wait()
|
|
@ -35,7 +35,7 @@ def _import_gcp(provider_config):
|
|||
|
||||
|
||||
def _import_azure(provider_config):
|
||||
from ray.autoscaler._private.azure.node_provider import AzureNodeProvider
|
||||
from ray.autoscaler._private._azure.node_provider import AzureNodeProvider
|
||||
return AzureNodeProvider
|
||||
|
||||
|
||||
|
|
|
@ -63,7 +63,7 @@ available_node_types:
|
|||
imagePublisher: microsoft-dsvm
|
||||
imageOffer: ubuntu-1804
|
||||
imageSku: 1804-gen2
|
||||
imageVersion: 20.07.06
|
||||
imageVersion: 21.01.21
|
||||
|
||||
ray.worker.default:
|
||||
# The minimum number of nodes of this type to launch.
|
||||
|
@ -79,7 +79,7 @@ available_node_types:
|
|||
imagePublisher: microsoft-dsvm
|
||||
imageOffer: ubuntu-1804
|
||||
imageSku: 1804-gen2
|
||||
imageVersion: 20.07.06
|
||||
imageVersion: 21.01.21
|
||||
# optionally set priority to use Spot instances
|
||||
priority: Spot
|
||||
# set a maximum price for spot instances if desired
|
||||
|
@ -120,7 +120,6 @@ rsync_filter: []
|
|||
# is set up.
|
||||
initialization_commands:
|
||||
# get rid of annoying Ubuntu message
|
||||
- sudo usermod -aG docker $USER || true
|
||||
- touch ~/.sudo_as_admin_successful
|
||||
|
||||
# List of shell commands to run to set up nodes.
|
||||
|
@ -129,7 +128,7 @@ setup_commands:
|
|||
# has your Ray repo pre-cloned. Then, you can replace the pip installs
|
||||
# below with a git checkout <your_sha> (and possibly a recompile).
|
||||
- (which conda && echo 'eval "$(conda shell.bash hook)"' >> ~/.bashrc) || true
|
||||
# - echo 'conda activate py37_pytorch' >> ~/.bashrc
|
||||
# - (conda activate py37_pytorch &> /dev/null && echo 'conda activate py37_pytorch' >> ~/.bashrc) || true
|
||||
- (conda activate py37_tensorflow &> /dev/null && echo 'conda activate py37_tensorflow' >> ~/.bashrc) || true
|
||||
- which ray || pip install -U "ray[default] @ https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl"
|
||||
# Consider uncommenting these if you also want to run apt-get commands during setup
|
||||
|
@ -139,7 +138,7 @@ setup_commands:
|
|||
|
||||
# Custom commands that will be run on the head node after common setup.
|
||||
head_setup_commands:
|
||||
- pip install azure-cli-core==2.22.0 azure-mgmt-compute==14.0.0 azure-mgmt-msi==1.0.0 azure-mgmt-network==10.2.0
|
||||
- pip install -U azure-cli-core==2.22.0 azure-mgmt-compute==14.0.0 azure-mgmt-msi==1.0.0 azure-mgmt-network==10.2.0 azure-mgmt-resource==13.0.0
|
||||
|
||||
# Custom commands that will be run on worker nodes after common setup.
|
||||
worker_setup_commands: []
|
||||
|
|
|
@ -68,7 +68,7 @@ head_node:
|
|||
imagePublisher: microsoft-dsvm
|
||||
imageOffer: ubuntu-1804
|
||||
imageSku: 1804-gen2
|
||||
imageVersion: 20.07.06
|
||||
imageVersion: 21.01.21
|
||||
|
||||
# Provider-specific config for worker nodes, e.g. instance type.
|
||||
worker_nodes:
|
||||
|
@ -78,7 +78,7 @@ worker_nodes:
|
|||
imagePublisher: microsoft-dsvm
|
||||
imageOffer: ubuntu-1804
|
||||
imageSku: 1804-gen2
|
||||
imageVersion: 20.07.06
|
||||
imageVersion: 21.01.21
|
||||
# optionally set priority to use Spot instances
|
||||
priority: Spot
|
||||
# set a maximum price for spot instances if desired
|
||||
|
@ -118,24 +118,26 @@ rsync_filter:
|
|||
# enabled, these commands will run outside the container and before docker
|
||||
# is set up.
|
||||
initialization_commands:
|
||||
# get rid of annoying Ubuntu message
|
||||
# enable docker setup
|
||||
- sudo usermod -aG docker $USER || true
|
||||
- sleep 10 # delay to avoid docker permission denied errors
|
||||
# get rid of annoying Ubuntu message
|
||||
- touch ~/.sudo_as_admin_successful
|
||||
|
||||
# List of shell commands to run to set up nodes.
|
||||
setup_commands:
|
||||
# NOTE: rayproject/ray-ml:latest has ray latest bundled
|
||||
setup_commands: []
|
||||
# Note: if you're developing Ray, you probably want to create a Docker image that
|
||||
# has your Ray repo pre-cloned. Then, you can replace the pip installs
|
||||
# below with a git checkout <your_sha> (and possibly a recompile).
|
||||
# To run the nightly version of ray (as opposed to the latest), either use a rayproject docker image
|
||||
# that has the "nightly" tag (e.g. "rayproject/ray-ml:nightly-gpu") or uncomment the following line:
|
||||
- echo 'eval "$(conda shell.bash hook)"' >> ~/.bashrc
|
||||
- echo 'conda activate py37_tensorflow' >> ~/.bashrc
|
||||
- pip install -U "ray[default] @ https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl"
|
||||
# - pip install -U "ray[default] @ https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl"
|
||||
|
||||
# Custom commands that will be run on the head node after common setup.
|
||||
head_setup_commands:
|
||||
- pip install azure-cli-core==2.22.0 azure-mgmt-compute==14.0.0 azure-mgmt-msi==1.0.0 azure-mgmt-network==10.2.0
|
||||
# NOTE: rayproject/ray-ml:latest has azure packages bundled
|
||||
head_setup_commands: []
|
||||
# - pip install -U azure-cli-core==2.22.0 azure-mgmt-compute==14.0.0 azure-mgmt-msi==1.0.0 azure-mgmt-network==10.2.0 azure-mgmt-resource==13.0.0
|
||||
|
||||
# Custom commands that will be run on worker nodes after common setup.
|
||||
worker_setup_commands: []
|
||||
|
|
|
@ -77,7 +77,7 @@ available_node_types:
|
|||
imagePublisher: microsoft-dsvm
|
||||
imageOffer: ubuntu-1804
|
||||
imageSku: 1804-gen2
|
||||
imageVersion: 20.07.06
|
||||
imageVersion: 21.01.21
|
||||
|
||||
ray.worker.default:
|
||||
# The minimum number of worker nodes of this type to launch.
|
||||
|
@ -96,7 +96,7 @@ available_node_types:
|
|||
imagePublisher: microsoft-dsvm
|
||||
imageOffer: ubuntu-1804
|
||||
imageSku: 1804-gen2
|
||||
imageVersion: 20.07.06
|
||||
imageVersion: 21.01.21
|
||||
# optionally set priority to use Spot instances
|
||||
priority: Spot
|
||||
# set a maximum price for spot instances if desired
|
||||
|
@ -139,24 +139,26 @@ rsync_filter:
|
|||
# enabled, these commands will run outside the container and before docker
|
||||
# is set up.
|
||||
initialization_commands:
|
||||
# get rid of annoying Ubuntu message
|
||||
# enable docker setup
|
||||
- sudo usermod -aG docker $USER || true
|
||||
- sleep 10 # delay to avoid docker permission denied errors
|
||||
# get rid of annoying Ubuntu message
|
||||
- touch ~/.sudo_as_admin_successful
|
||||
|
||||
# List of shell commands to run to set up nodes.
|
||||
setup_commands:
|
||||
# NOTE: rayproject/ray-ml:latest has ray latest bundled
|
||||
setup_commands: []
|
||||
# Note: if you're developing Ray, you probably want to create a Docker image that
|
||||
# has your Ray repo pre-cloned. Then, you can replace the pip installs
|
||||
# below with a git checkout <your_sha> (and possibly a recompile).
|
||||
# To run the nightly version of ray (as opposed to the latest), either use a rayproject docker image
|
||||
# that has the "nightly" tag (e.g. "rayproject/ray-ml:nightly-gpu") or uncomment the following line:
|
||||
- echo 'eval "$(conda shell.bash hook)"' >> ~/.bashrc
|
||||
- echo 'conda activate py37_tensorflow' >> ~/.bashrc
|
||||
- pip install -U "ray[default] @ https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl"
|
||||
# - pip install -U "ray[default] @ https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl"
|
||||
|
||||
# Custom commands that will be run on the head node after common setup.
|
||||
head_setup_commands:
|
||||
- pip install azure-cli-core==2.22.0 azure-mgmt-compute==14.0.0 azure-mgmt-msi==1.0.0 azure-mgmt-network==10.2.0
|
||||
# NOTE: rayproject/ray-ml:latest has azure packages bundled
|
||||
head_setup_commands: []
|
||||
# - pip install -U azure-cli-core==2.22.0 azure-mgmt-compute==14.0.0 azure-mgmt-msi==1.0.0 azure-mgmt-network==10.2.0 azure-mgmt-resource==13.0.0
|
||||
|
||||
# Custom commands that will be run on worker nodes after common setup.
|
||||
worker_setup_commands: []
|
||||
|
|
|
@ -31,6 +31,9 @@ idle_timeout_minutes: 5
|
|||
provider:
|
||||
type: azure
|
||||
location: westus2
|
||||
resource_group: ray-cluster
|
||||
# set subscription_id explicitly; otherwise the default from the az CLI will be used
|
||||
# subscription_id: 00000000-0000-0000-0000-000000000000
|
||||
|
||||
# How Ray will authenticate with newly launched nodes.
|
||||
auth:
|
||||
|
@ -57,7 +60,12 @@ available_node_types:
|
|||
# Provider-specific config, e.g. instance type.
|
||||
node_config:
|
||||
azure_arm_parameters:
|
||||
vmSize: Standard_NC6_v3
|
||||
vmSize: Standard_NC6s_v3
|
||||
# List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
|
||||
imagePublisher: microsoft-dsvm
|
||||
imageOffer: ubuntu-1804
|
||||
imageSku: 1804-gen2
|
||||
imageVersion: 21.01.21
|
||||
|
||||
ray.worker.gpu:
|
||||
# The minimum number of nodes of this type to launch.
|
||||
|
@ -71,7 +79,17 @@ available_node_types:
|
|||
# Provider-specific config, e.g. instance type.
|
||||
node_config:
|
||||
azure_arm_parameters:
|
||||
vmSize: Standard_NC6_v3
|
||||
vmSize: Standard_NC6s_v3
|
||||
# List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
|
||||
imagePublisher: microsoft-dsvm
|
||||
imageOffer: ubuntu-1804
|
||||
imageSku: 1804-gen2
|
||||
imageVersion: 21.01.21
|
||||
# optionally set priority to use Spot instances
|
||||
priority: Spot
|
||||
# set a maximum price for spot instances if desired
|
||||
# billingProfile:
|
||||
# maxPrice: -1
|
||||
|
||||
# Specify the node type of the head node (as configured above).
|
||||
head_node_type: ray.head.gpu
|
||||
|
@ -88,8 +106,10 @@ file_mounts: {
|
|||
# enabled, these commands will run outside the container and before docker
|
||||
# is set up.
|
||||
initialization_commands:
|
||||
# get rid of annoying Ubuntu message
|
||||
# enable docker setup
|
||||
- sudo usermod -aG docker $USER || true
|
||||
- sleep 10 # delay to avoid docker permission denied errors
|
||||
# get rid of annoying Ubuntu message
|
||||
- touch ~/.sudo_as_admin_successful
|
||||
|
||||
# List of shell commands to run to set up nodes.
|
||||
|
@ -98,8 +118,9 @@ setup_commands: []
|
|||
# - pip install -U "ray[default] @ https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl"
|
||||
|
||||
# Custom commands that will be run on the head node after common setup.
|
||||
head_setup_commands:
|
||||
- pip install azure-cli-core==2.22.0 azure-mgmt-compute==14.0.0 azure-mgmt-msi==1.0.0 azure-mgmt-network==10.2.0
|
||||
# NOTE: rayproject/ray-ml:latest has azure packages bundled
|
||||
head_setup_commands: []
|
||||
# - pip install -U azure-cli-core==2.22.0 azure-mgmt-compute==14.0.0 azure-mgmt-msi==1.0.0 azure-mgmt-network==10.2.0 azure-mgmt-resource==13.0.0
|
||||
|
||||
# Custom commands that will be run on worker nodes after common setup.
|
||||
worker_setup_commands: []
|
||||
|
|
|
@ -11,8 +11,8 @@ from unittest.mock import MagicMock, Mock, patch
|
|||
import pytest
|
||||
from click.exceptions import ClickException
|
||||
|
||||
from ray.autoscaler._private.azure.config import (_configure_key_pair as
|
||||
_azure_configure_key_pair)
|
||||
from ray.autoscaler._private._azure.config import (_configure_key_pair as
|
||||
_azure_configure_key_pair)
|
||||
from ray.autoscaler._private.gcp import config as gcp_config
|
||||
from ray.autoscaler._private.util import prepare_config, validate_config,\
|
||||
_get_default_config, merge_setup_commands
|
||||
|
|
|
@ -84,8 +84,8 @@ ray_files.append("ray/nightly-wheels.yaml")
|
|||
ray_files += [
|
||||
"ray/autoscaler/aws/defaults.yaml",
|
||||
"ray/autoscaler/azure/defaults.yaml",
|
||||
"ray/autoscaler/_private/azure/azure-vm-template.json",
|
||||
"ray/autoscaler/_private/azure/azure-config-template.json",
|
||||
"ray/autoscaler/_private/_azure/azure-vm-template.json",
|
||||
"ray/autoscaler/_private/_azure/azure-config-template.json",
|
||||
"ray/autoscaler/gcp/defaults.yaml",
|
||||
"ray/autoscaler/local/defaults.yaml",
|
||||
"ray/autoscaler/kubernetes/defaults.yaml",
|
||||
|
|
Loading…
Add table
Reference in a new issue