[autoscaler] [azure] Fix Azure Autoscaling Failures (#16640)

Co-authored-by: Scott Graham <scgraham@microsoft.com>
This commit is contained in:
Scott Graham 2021-07-10 14:55:00 -04:00 committed by GitHub
parent 33e319e9d7
commit 3334357c58
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
14 changed files with 76 additions and 39 deletions

View file

@@ -1888,7 +1888,7 @@ filegroup(
"python/ray/*.py",
"python/ray/autoscaler/*.py",
"python/ray/autoscaler/_private/*.py",
"python/ray/autoscaler/_private/azure/*.json",
"python/ray/autoscaler/_private/_azure/*.json",
"python/ray/autoscaler/aws/defaults.yaml",
"python/ray/autoscaler/azure/defaults.yaml",
"python/ray/autoscaler/gcp/defaults.yaml",

View file

@@ -17,7 +17,8 @@ RUN $HOME/anaconda3/bin/pip --no-cache-dir install $(basename $WHEEL_PATH)[all]
"azure-cli-core==2.22.0" \
"azure-mgmt-compute==14.0.0" \
"azure-mgmt-msi==1.0.0" \
"azure-mgmt-network==10.2.0"; fi) \
"azure-mgmt-network==10.2.0" \
"azure-mgmt-resource==13.0.0"; fi) \
$(if [ $($HOME/anaconda3/bin/python -c "import sys; print(sys.version_info.minor)") != 6 ] \
&& [ "$AUTOSCALER" = "autoscaler" ]; then echo "kopf"; fi) \
&& $HOME/anaconda3/bin/pip uninstall ray -y && sudo rm $(basename $WHEEL_PATH)

View file

@@ -4,6 +4,7 @@ from pathlib import Path
import random
from azure.common.client_factory import get_client_from_cli_profile
from azure.common.credentials import get_azure_cli_credentials
from azure.mgmt.resource import ResourceManagementClient
from azure.mgmt.resource.resources.models import DeploymentMode
@@ -35,7 +36,10 @@ def _configure_resource_group(config):
# https://docs.microsoft.com/en-us/azure/virtual-machines/windows/tutorial-availability-sets
resource_client = _get_client(ResourceManagementClient, config)
subscription_id = resource_client.config.subscription_id
_, cli_subscription_id = get_azure_cli_credentials(
resource=ResourceManagementClient)
subscription_id = config["provider"].get("subscription_id",
cli_subscription_id)
logger.info("Using subscription id: %s", subscription_id)
config["provider"]["subscription_id"] = subscription_id
@@ -76,7 +80,11 @@ def _configure_resource_group(config):
}
}
resource_client.deployments.create_or_update(
if hasattr(resource_client.deployments, "create_or_update"):
create_or_update = resource_client.deployments.create_or_update
else:
create_or_update = resource_client.deployments.begin_create_or_update
create_or_update(
resource_group_name=resource_group,
deployment_name="ray-config",
parameters=parameters).wait()

View file

@@ -14,7 +14,7 @@ from knack.util import CLIError
from ray.autoscaler.node_provider import NodeProvider
from ray.autoscaler.tags import TAG_RAY_CLUSTER_NAME, TAG_RAY_NODE_NAME
from ray.autoscaler._private.azure.config import bootstrap_azure
from ray.autoscaler._private._azure.config import bootstrap_azure
VM_NAME_MAX_LEN = 64
VM_NAME_UUID_LEN = 8
@@ -213,7 +213,11 @@ class AzureNodeProvider(NodeProvider):
}
# TODO: we could get the private/public ips back directly
self.resource_client.deployments.create_or_update(
if hasattr(self.resource_client.deployments, "create_or_update"):
create = self.resource_client.deployments.create_or_update
else:
create = self.resource_client.deployments.begin_create_or_update
create(
resource_group_name=resource_group,
deployment_name="ray-vm-{}".format(name_tag),
parameters=parameters).wait()

View file

@@ -35,7 +35,7 @@ def _import_gcp(provider_config):
def _import_azure(provider_config):
from ray.autoscaler._private.azure.node_provider import AzureNodeProvider
from ray.autoscaler._private._azure.node_provider import AzureNodeProvider
return AzureNodeProvider

View file

@@ -63,7 +63,7 @@ available_node_types:
imagePublisher: microsoft-dsvm
imageOffer: ubuntu-1804
imageSku: 1804-gen2
imageVersion: 20.07.06
imageVersion: 21.01.21
ray.worker.default:
# The minimum number of nodes of this type to launch.
@@ -79,7 +79,7 @@ available_node_types:
imagePublisher: microsoft-dsvm
imageOffer: ubuntu-1804
imageSku: 1804-gen2
imageVersion: 20.07.06
imageVersion: 21.01.21
# optionally set priority to use Spot instances
priority: Spot
# set a maximum price for spot instances if desired
@@ -120,7 +120,6 @@ rsync_filter: []
# is setup.
initialization_commands:
# get rid of annoying Ubuntu message
- sudo usermod -aG docker $USER || true
- touch ~/.sudo_as_admin_successful
# List of shell commands to run to set up nodes.
@@ -129,7 +128,7 @@ setup_commands:
# has your Ray repo pre-cloned. Then, you can replace the pip installs
# below with a git checkout <your_sha> (and possibly a recompile).
- (which conda && echo 'eval "$(conda shell.bash hook)"' >> ~/.bashrc) || true
# - echo 'conda activate py37_pytorch' >> ~/.bashrc
# - (conda activate py37_pytorch &> /dev/null && echo 'conda activate py37_pytorch' >> ~/.bashrc) || true
- (conda activate py37_tensorflow &> /dev/null && echo 'conda activate py37_tensorflow' >> ~/.bashrc) || true
- which ray || pip install -U "ray[default] @ https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl"
# Consider uncommenting these if you also want to run apt-get commands during setup
@@ -139,7 +138,7 @@ setup_commands:
# Custom commands that will be run on the head node after common setup.
head_setup_commands:
- pip install azure-cli-core==2.22.0 azure-mgmt-compute==14.0.0 azure-mgmt-msi==1.0.0 azure-mgmt-network==10.2.0
- pip install -U azure-cli-core==2.22.0 azure-mgmt-compute==14.0.0 azure-mgmt-msi==1.0.0 azure-mgmt-network==10.2.0 azure-mgmt-resource==13.0.0
# Custom commands that will be run on worker nodes after common setup.
worker_setup_commands: []

View file

@@ -68,7 +68,7 @@ head_node:
imagePublisher: microsoft-dsvm
imageOffer: ubuntu-1804
imageSku: 1804-gen2
imageVersion: 20.07.06
imageVersion: 21.01.21
# Provider-specific config for worker nodes, e.g. instance type.
worker_nodes:
@@ -78,7 +78,7 @@ worker_nodes:
imagePublisher: microsoft-dsvm
imageOffer: ubuntu-1804
imageSku: 1804-gen2
imageVersion: 20.07.06
imageVersion: 21.01.21
# optionally set priority to use Spot instances
priority: Spot
# set a maximum price for spot instances if desired
@@ -118,24 +118,26 @@ rsync_filter:
# enabled, these commands will run outside the container and before docker
# is setup.
initialization_commands:
# get rid of annoying Ubuntu message
# enable docker setup
- sudo usermod -aG docker $USER || true
- sleep 10 # delay to avoid docker permission denied errors
# get rid of annoying Ubuntu message
- touch ~/.sudo_as_admin_successful
# List of shell commands to run to set up nodes.
setup_commands:
# NOTE: rayproject/ray-ml:latest has ray latest bundled
setup_commands: []
# Note: if you're developing Ray, you probably want to create a Docker image that
# has your Ray repo pre-cloned. Then, you can replace the pip installs
# below with a git checkout <your_sha> (and possibly a recompile).
# To run the nightly version of ray (as opposed to the latest), either use a rayproject docker image
# that has the "nightly" (e.g. "rayproject/ray-ml:nightly-gpu") or uncomment the following line:
- echo 'eval "$(conda shell.bash hook)"' >> ~/.bashrc
- echo 'conda activate py37_tensorflow' >> ~/.bashrc
- pip install -U "ray[default] @ https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl"
# - pip install -U "ray[default] @ https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl"
# Custom commands that will be run on the head node after common setup.
head_setup_commands:
- pip install azure-cli-core==2.22.0 azure-mgmt-compute==14.0.0 azure-mgmt-msi==1.0.0 azure-mgmt-network==10.2.0
# NOTE: rayproject/ray-ml:latest has azure packages bundled
head_setup_commands: []
# - pip install -U azure-cli-core==2.22.0 azure-mgmt-compute==14.0.0 azure-mgmt-msi==1.0.0 azure-mgmt-network==10.2.0 azure-mgmt-resource==13.0.0
# Custom commands that will be run on worker nodes after common setup.
worker_setup_commands: []

View file

@@ -77,7 +77,7 @@ available_node_types:
imagePublisher: microsoft-dsvm
imageOffer: ubuntu-1804
imageSku: 1804-gen2
imageVersion: 20.07.06
imageVersion: 21.01.21
ray.worker.default:
# The minimum number of worker nodes of this type to launch.
@@ -96,7 +96,7 @@ available_node_types:
imagePublisher: microsoft-dsvm
imageOffer: ubuntu-1804
imageSku: 1804-gen2
imageVersion: 20.07.06
imageVersion: 21.01.21
# optionally set priority to use Spot instances
priority: Spot
# set a maximum price for spot instances if desired
@@ -139,24 +139,26 @@ rsync_filter:
# enabled, these commands will run outside the container and before docker
# is setup.
initialization_commands:
# get rid of annoying Ubuntu message
# enable docker setup
- sudo usermod -aG docker $USER || true
- sleep 10 # delay to avoid docker permission denied errors
# get rid of annoying Ubuntu message
- touch ~/.sudo_as_admin_successful
# List of shell commands to run to set up nodes.
setup_commands:
# NOTE: rayproject/ray-ml:latest has ray latest bundled
setup_commands: []
# Note: if you're developing Ray, you probably want to create a Docker image that
# has your Ray repo pre-cloned. Then, you can replace the pip installs
# below with a git checkout <your_sha> (and possibly a recompile).
# To run the nightly version of ray (as opposed to the latest), either use a rayproject docker image
# that has the "nightly" (e.g. "rayproject/ray-ml:nightly-gpu") or uncomment the following line:
- echo 'eval "$(conda shell.bash hook)"' >> ~/.bashrc
- echo 'conda activate py37_tensorflow' >> ~/.bashrc
- pip install -U "ray[default] @ https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl"
# - pip install -U "ray[default] @ https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl"
# Custom commands that will be run on the head node after common setup.
head_setup_commands:
- pip install azure-cli-core==2.22.0 azure-mgmt-compute==14.0.0 azure-mgmt-msi==1.0.0 azure-mgmt-network==10.2.0
# NOTE: rayproject/ray-ml:latest has azure packages bundled
head_setup_commands: []
# - pip install -U azure-cli-core==2.22.0 azure-mgmt-compute==14.0.0 azure-mgmt-msi==1.0.0 azure-mgmt-network==10.2.0 azure-mgmt-resource==13.0.0
# Custom commands that will be run on worker nodes after common setup.
worker_setup_commands: []

View file

@@ -31,6 +31,9 @@ idle_timeout_minutes: 5
provider:
type: azure
location: westus2
resource_group: ray-cluster
# set subscription id otherwise the default from az cli will be used
# subscription_id: 00000000-0000-0000-0000-000000000000
# How Ray will authenticate with newly launched nodes.
auth:
@@ -57,7 +60,12 @@ available_node_types:
# Provider-specific config, e.g. instance type.
node_config:
azure_arm_parameters:
vmSize: Standard_NC6_v3
vmSize: Standard_NC6s_v3
# List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
imagePublisher: microsoft-dsvm
imageOffer: ubuntu-1804
imageSku: 1804-gen2
imageVersion: 21.01.21
ray.worker.gpu:
# The minimum number of nodes of this type to launch.
@@ -71,7 +79,17 @@ available_node_types:
# Provider-specific config, e.g. instance type.
node_config:
azure_arm_parameters:
vmSize: Standard_NC6_v3
vmSize: Standard_NC6s_v3
# List images https://docs.microsoft.com/en-us/azure/virtual-machines/linux/cli-ps-findimage
imagePublisher: microsoft-dsvm
imageOffer: ubuntu-1804
imageSku: 1804-gen2
imageVersion: 21.01.21
# optionally set priority to use Spot instances
priority: Spot
# set a maximum price for spot instances if desired
# billingProfile:
# maxPrice: -1
# Specify the node type of the head node (as configured above).
head_node_type: ray.head.gpu
@@ -88,8 +106,10 @@ file_mounts: {
# enabled, these commands will run outside the container and before docker
# is setup.
initialization_commands:
# get rid of annoying Ubuntu message
# enable docker setup
- sudo usermod -aG docker $USER || true
- sleep 10 # delay to avoid docker permission denied errors
# get rid of annoying Ubuntu message
- touch ~/.sudo_as_admin_successful
# List of shell commands to run to set up nodes.
@@ -98,8 +118,9 @@ setup_commands: []
# - pip install -U "ray[default] @ https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-2.0.0.dev0-cp37-cp37m-manylinux2014_x86_64.whl"
# Custom commands that will be run on the head node after common setup.
head_setup_commands:
- pip install azure-cli-core==2.22.0 azure-mgmt-compute==14.0.0 azure-mgmt-msi==1.0.0 azure-mgmt-network==10.2.0
# NOTE: rayproject/ray-ml:latest has azure packages bundled
head_setup_commands: []
# - pip install -U azure-cli-core==2.22.0 azure-mgmt-compute==14.0.0 azure-mgmt-msi==1.0.0 azure-mgmt-network==10.2.0 azure-mgmt-resource==13.0.0
# Custom commands that will be run on worker nodes after common setup.
worker_setup_commands: []

View file

@@ -11,8 +11,8 @@ from unittest.mock import MagicMock, Mock, patch
import pytest
from click.exceptions import ClickException
from ray.autoscaler._private.azure.config import (_configure_key_pair as
_azure_configure_key_pair)
from ray.autoscaler._private._azure.config import (_configure_key_pair as
_azure_configure_key_pair)
from ray.autoscaler._private.gcp import config as gcp_config
from ray.autoscaler._private.util import prepare_config, validate_config,\
_get_default_config, merge_setup_commands

View file

@@ -84,8 +84,8 @@ ray_files.append("ray/nightly-wheels.yaml")
ray_files += [
"ray/autoscaler/aws/defaults.yaml",
"ray/autoscaler/azure/defaults.yaml",
"ray/autoscaler/_private/azure/azure-vm-template.json",
"ray/autoscaler/_private/azure/azure-config-template.json",
"ray/autoscaler/_private/_azure/azure-vm-template.json",
"ray/autoscaler/_private/_azure/azure-config-template.json",
"ray/autoscaler/gcp/defaults.yaml",
"ray/autoscaler/local/defaults.yaml",
"ray/autoscaler/kubernetes/defaults.yaml",