[Serve] Fix Serve Release Tests (#12777)

Simon Mo 2020-12-11 11:53:47 -08:00 committed by GitHub
parent 4ad4463be6
commit 3d8c1cbae6
8 changed files with 80 additions and 99 deletions

View file

@@ -1,7 +1,7 @@
cluster_name: default
min_workers: 22
max_workers: 22
initial_workers: 22
min_workers: 5
max_workers: 5
initial_workers: 5
autoscaling_mode: default
docker:
image: 'anyscale/ray-ml:latest'
@@ -28,6 +28,7 @@ initialization_commands: []
setup_commands:
- apt-get install build-essential libssl-dev git -y
- 'rm -r wrk || true && git clone https://github.com/wg/wrk.git wrk && cd wrk && make -j && cp wrk /usr/local/bin'
- ray install-nightly
head_setup_commands: []
worker_setup_commands: []
head_start_ray_commands:

View file

@@ -86,13 +86,14 @@ async def main():
client.create_backend("backend", backend)
client.create_endpoint("endpoint", backend="backend", route="/api")
for intermediate_handles in [False, True]:
if (intermediate_handles):
if intermediate_handles:
client.create_endpoint(
"backend", backend="backend", route="/backend")
class forwardActor:
def __init__(self):
client = serve.connect()
self.handle = client.get_handle("backend")
def __call__(self, _):

View file

@@ -36,73 +36,76 @@ from ray import serve
from ray.serve import BackendConfig
from ray.serve.utils import logger
from ray.util.placement_group import (placement_group, remove_placement_group)
from ray.util.placement_group import placement_group, remove_placement_group
ray.shutdown()
ray.init(address="auto")
client = serve.start()
# These numbers need to correspond with the autoscaler config file.
# The number of remote nodes in the autoscaler should upper bound
# these because sometimes nodes fail to update.
num_workers = 20
expected_num_nodes = num_workers + 1
cpus_per_node = 4
num_remote_cpus = expected_num_nodes * cpus_per_node
# We ask for more workers but only need to run on a smaller subset.
# This should account for worker nodes that failed to launch.
expected_num_nodes = 6
num_replicas = 11
# wrk HTTP load testing config
num_connections = 20
num_threads = 2
time_to_run = "20s"
# Wait until the expected number of nodes have joined the cluster.
while True:
num_nodes = len(ray.nodes())
num_nodes = len(list(filter(lambda node: node["Alive"], ray.nodes())))
logger.info("Waiting for nodes {}/{}".format(num_nodes,
expected_num_nodes))
if num_nodes >= expected_num_nodes:
break
time.sleep(5)
logger.info("Nodes have all joined. There are %s resources.",
ray.cluster_resources())
client = serve.start()
def hey(_):
time.sleep(0.01) # Sleep for 10ms
return b"hey"
num_connections = int(num_remote_cpus * 0.75)
num_threads = 2
time_to_run = "10s"
pg = placement_group(
[{
"CPU": 1
} for _ in range(expected_num_nodes)], strategy="STRICT_SPREAD")
ray.get(pg.ready())
# The number of replicas is the number of cores remaining after accounting
# for the one HTTP proxy actor on each node, the "hey" requester task on each
# node, and the serve controller.
# num_replicas = expected_num_nodes * (cpus_per_node - 2) - 1
num_replicas = ray.available_resources()["CPU"]
logger.info("Starting %i replicas", num_replicas)
client.create_backend(
"hey", hey, config=BackendConfig(num_replicas=num_replicas))
client.create_endpoint("hey", backend="hey", route="/hey")
@ray.remote
@ray.remote(num_cpus=0)
def run_wrk():
logger.info("Warming up for ~3 seconds")
for _ in range(5):
resp = requests.get("http://127.0.0.1:8000/hey").text
logger.info("Received response \'" + resp + "\'")
time.sleep(0.5)
logger.info("Warming up")
for _ in range(10):
try:
resp = requests.get("http://127.0.0.1:8000/hey").text
logger.info("Received response '" + resp + "'")
time.sleep(0.5)
except Exception as e:
logger.info(f"Got exception {e}")
result = subprocess.run(
[
"wrk", "-c",
str(num_connections), "-t",
str(num_threads), "-d", time_to_run, "http://127.0.0.1:8000/hey"
"wrk",
"-c",
str(num_connections),
"-t",
str(num_threads),
"-d",
time_to_run,
"http://127.0.0.1:8000/hey",
],
stdout=subprocess.PIPE)
stdout=subprocess.PIPE,
)
return result.stdout.decode()
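A note on the load-generation pieces above: wrk's -c, -t, and -d flags set the number of open connections, the number of worker threads, and the test duration respectively (-s, by contrast, expects a Lua script), and decorating run_wrk with num_cpus=0 presumably keeps the load generator from occupying a CPU slot that the replica count is derived from. A minimal standalone sketch of the same invocation, with hypothetical values and assuming the wrk binary is on PATH:

import subprocess

num_connections = 20  # -c: concurrent HTTP connections to keep open
num_threads = 2       # -t: threads generating the load
duration = "20s"      # -d: how long to run the benchmark

result = subprocess.run(
    [
        "wrk",
        "-c", str(num_connections),
        "-t", str(num_threads),
        "-d", duration,
        "http://127.0.0.1:8000/hey",
    ],
    stdout=subprocess.PIPE,
)
print(result.stdout.decode())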

View file

@@ -23,6 +23,7 @@ initialization_commands: []
setup_commands:
- apt-get install build-essential libssl-dev git -y
- 'rm -r wrk || true && git clone https://github.com/wg/wrk.git wrk && cd wrk && make -j && cp wrk /usr/local/bin'
- ray install-nightly
head_setup_commands: []
worker_setup_commands: []
head_start_ray_commands:

View file

@@ -1,48 +1,17 @@
cluster_name: default
min_workers: 0
max_workers: 0
target_utilization_fraction: 0.8
idle_timeout_minutes: 5
cluster_name: ray-release-long-running-tests
docker:
image: anyscale/ray:latest
container_name: ray_container
pull_before_run: False
# Cloud-provider specific configuration.
provider:
type: aws
region: us-west-2
availability_zone: us-west-2a
availability_zone: us-west-2a, us-west-2b, us-west-2c
auth:
ssh_user: ubuntu
head_node:
InstanceType: m5.2xlarge
ImageId: ami-0888a3b5189309429 # DLAMI 7/1/19
BlockDeviceMappings:
- DeviceName: /dev/sda1
Ebs:
VolumeSize: 150
worker_nodes:
InstanceType: m5.large
ImageId: ami-0888a3b5189309429 # DLAMI 7/1/19
BlockDeviceMappings:
- DeviceName: /dev/sda1
Ebs:
VolumeSize: 150
# Run workers on spot by default. Comment this out to use on-demand.
InstanceMarketOptions:
MarketType: spot
# List of shell commands to run to set up nodes.
setup_commands: []
# Custom commands that will be run on the head node after common setup.
head_setup_commands: []
# Custom commands that will be run on worker nodes after common setup.
worker_setup_commands: []
# Command to start ray on the head node. You don't need to change this.
head_start_ray_commands: []
# Command to start ray on worker nodes. You don't need to change this.
worker_start_ray_commands: []
InstanceType: m5.xlarge

View file

@@ -1,6 +1,6 @@
#!/usr/bin/env bash
ray_version=""
ray_version=""
commit=""
ray_branch=""
workload=""
@@ -48,20 +48,20 @@ echo "commit: $commit"
echo "branch: $ray_branch"
echo "workload: $workload"
wheel="https://s3-us-west-2.amazonaws.com/ray-wheels/$ray_branch/$commit/ray-$ray_version-cp36-cp36m-manylinux2014_x86_64.whl"
wheel="https://s3-us-west-2.amazonaws.com/ray-wheels/$ray_branch/$commit/ray-$ray_version-cp37-cp37m-manylinux2014_x86_64.whl"
echo set-window-option -g mouse on > ~/.tmux.conf
echo 'termcapinfo xterm* ti@:te@' > ~/.screenrc
# Serve load testing tool
rm -r wrk || true && git clone https://github.com/wg/wrk.git wrk && cd wrk && make -j && sudo cp wrk /usr/local/bin
pip install -U pip
unset RAY_ADDRESS
source activate tensorflow_p36
conda remove -y --force wrapt || true
cur_dir=$(pwd)
cd /tmp && rm -rf wrk && git clone https://github.com/wg/wrk.git wrk && cd wrk && make -j && cp wrk /usr/local/bin
cd "$cur_dir" || exit
pip install --upgrade pip
pip install -U tensorflow==1.14
pip install -q -U "$wheel" Click
pip install -q -U "$wheel"
pip install -q "ray[all]" "gym[atari]"
cd ..
ray stop && sleep 2
unset RAY_ADDRESS
python "./workloads/$workload.py"

View file

@@ -11,7 +11,7 @@ from ray.cluster_utils import Cluster
num_redis_shards = 1
redis_max_memory = 10**8
object_store_memory = 10**8
num_nodes = 5
num_nodes = 1
cluster = Cluster()
for i in range(num_nodes):
cluster.add_node(
@@ -22,21 +22,20 @@ for i in range(num_nodes):
resources={str(i): 2},
object_store_memory=object_store_memory,
redis_max_memory=redis_max_memory,
dashboard_host="0.0.0.0")
dashboard_host="0.0.0.0",
)
ray.init(address=cluster.address, dashboard_host="0.0.0.0")
client = serve.start()
@serve.accept_batch
def echo(_):
def echo(requests_batch):
time.sleep(0.01) # Sleep for 10ms
ray.show_in_dashboard(
str(serve.context.batch_size), key="Current batch size")
return ["hi {}".format(i) for i in range(serve.context.batch_size)]
return ["hi" for _ in range(len(requests_batch))]
config = {"num_replicas": 30, "max_batch_size": 16}
config = {"num_replicas": 7, "max_batch_size": 16}
client.create_backend("echo:v1", echo, config=config)
client.create_endpoint("echo", backend="echo:v1", route="/echo")
@@ -53,12 +52,18 @@ time_to_run = "60m"
while True:
proc = subprocess.Popen(
[
"wrk", "-c",
str(connections), "-t",
str(num_threads), "-s", time_to_run, "http://127.0.0.1:8000/echo"
"wrk",
"-c",
str(connections),
"-t",
str(num_threads),
"-d",
time_to_run,
"http://127.0.0.1:8000/echo",
],
stdout=PIPE,
stderr=PIPE)
stderr=PIPE,
)
print("started load testing")
proc.wait()
out, err = proc.communicate()

View file

@@ -11,19 +11,20 @@ from ray.cluster_utils import Cluster
num_redis_shards = 1
redis_max_memory = 10**8
object_store_memory = 10**8
num_nodes = 5
cpus_per_node = 2
num_nodes = 1
cpus_per_node = 10
cluster = Cluster()
for i in range(num_nodes):
cluster.add_node(
redis_port=6379 if i == 0 else None,
num_redis_shards=num_redis_shards if i == 0 else None,
num_cpus=2,
num_cpus=16,
num_gpus=0,
resources={str(i): 2},
object_store_memory=object_store_memory,
redis_max_memory=redis_max_memory,
dashboard_host="0.0.0.0")
dashboard_host="0.0.0.0",
)
ray.init(
address=cluster.address, dashboard_host="0.0.0.0", log_to_driver=False)