---
# Example RayCluster custom resource: one head pod type plus one worker pod
# type, together with the commands used to start Ray on each kind of node.
apiVersion: cluster.ray.io/v1
kind: RayCluster
metadata:
  name: example-cluster
spec:
  # The maximum number of worker nodes to launch in addition to the head node.
  maxWorkers: 3
  # The autoscaler will scale up the cluster faster with higher upscaling speed.
  # E.g., if the task requires adding more nodes, then the autoscaler will
  # gradually scale up the cluster in chunks of
  # upscaling_speed*currently_running_nodes.
  # This number should be > 0.
  upscalingSpeed: 1.0
  # If a node is idle for this many minutes, it will be removed.
  idleTimeoutMinutes: 5
  # Specify the pod type for the ray head node (as configured below).
  # Must match the `name` of one of the entries under `podTypes`.
  headPodType: head-node
  # Optionally, configure ports for the Ray head service.
  # The ports specified below are the defaults.
  headServicePorts:
    - name: client
      port: 10001
      targetPort: 10001
    - name: dashboard
      port: 8265
      targetPort: 8265
    - name: ray-serve
      port: 8000
      targetPort: 8000
  # Specify the allowed pod types for this ray cluster and the resources they
  # provide.
  podTypes:
    - name: head-node
      podConfig:
        apiVersion: v1
        kind: Pod
        metadata:
          # The operator automatically prepends the cluster name to this field.
          generateName: ray-head-
        spec:
          restartPolicy: Never

          # This volume allocates shared memory for Ray to use for its plasma
          # object store. If you do not provide this, Ray will fall back to
          # /tmp, which causes slowdowns if it is not a shared memory volume.
          volumes:
            - name: dshm
              emptyDir:
                medium: Memory
          containers:
            - name: ray-node
              imagePullPolicy: Always
              image: rayproject/ray:latest
              # Do not change this command - it keeps the pod alive until it is
              # explicitly killed.
              command: ["/bin/bash", "-c", "--"]
              args: ["trap : TERM INT; touch /tmp/raylogs; tail -f /tmp/raylogs; sleep infinity & wait;"]
              ports:
                - containerPort: 6379  # Redis port for Ray <= 1.10.0. GCS server port for Ray >= 1.11.0.
                - containerPort: 10001  # Used by Ray Client
                - containerPort: 8265  # Used by Ray Dashboard
                - containerPort: 8000  # Used by Ray Serve

              # Mount the shared-memory volume (dshm) at /dev/shm so that Ray
              # can use it for its plasma object store. If you do not provide
              # this, Ray will fall back to /tmp, which causes slowdowns if it
              # is not a shared memory volume.
              volumeMounts:
                - mountPath: /dev/shm
                  name: dshm
              resources:
                requests:
                  cpu: 1000m
                  memory: 1Gi
                  ephemeral-storage: 1Gi
                limits:
                  cpu: 1000m
                  # The maximum memory that this pod is allowed to use. The
                  # limit will be detected by ray and split to use 10% for
                  # redis, 30% for the shared memory object store, and the
                  # rest for application memory. If this limit is not set and
                  # the object store size is not set manually, ray will
                  # allocate a very large object store in each pod that may
                  # cause problems for other pods.
                  memory: 1Gi
    - name: worker-node
      # Minimum number of Ray workers of this Pod type.
      minWorkers: 2
      # Maximum number of Ray workers of this Pod type. Takes precedence over
      # minWorkers.
      maxWorkers: 3
      # User-specified custom resources for use by Ray.
      # (Ray detects CPU and GPU from pod spec resource requests and limits,
      # so no need to fill those here.)
      rayResources: {"example-resource-a": 1, "example-resource-b": 1}
      podConfig:
        apiVersion: v1
        kind: Pod
        metadata:
          # The operator automatically prepends the cluster name to this field.
          generateName: ray-worker-
        spec:
          restartPolicy: Never
          # Memory-backed volume, mounted at /dev/shm by the container below.
          volumes:
            - name: dshm
              emptyDir:
                medium: Memory
          containers:
            - name: ray-node
              imagePullPolicy: Always
              image: rayproject/ray:latest
              # Same keep-alive command as on the head node.
              command: ["/bin/bash", "-c", "--"]
              args: ["trap : TERM INT; touch /tmp/raylogs; tail -f /tmp/raylogs; sleep infinity & wait;"]
              # Mount the shared-memory volume (dshm) at /dev/shm so that Ray
              # can use it for its plasma object store. If you do not provide
              # this, Ray will fall back to /tmp, which causes slowdowns if it
              # is not a shared memory volume.
              volumeMounts:
                - mountPath: /dev/shm
                  name: dshm
              resources:
                requests:
                  cpu: 1000m
                  memory: 1Gi
                  ephemeral-storage: 1Gi
                limits:
                  cpu: 1000m
                  # The maximum memory that this pod is allowed to use. The
                  # limit will be detected by ray and split to use 10% for
                  # redis, 30% for the shared memory object store, and the
                  # rest for application memory. If this limit is not set and
                  # the object store size is not set manually, ray will
                  # allocate a very large object store in each pod that may
                  # cause problems for other pods.
                  memory: 1Gi
  # Commands to start Ray on the head node. You don't need to change this.
  # Note dashboard-host is set to 0.0.0.0 so that Kubernetes can port forward.
  headStartRayCommands:
    - ray stop
    - ulimit -n 65536; ray start --head --port=6379 --no-monitor --dashboard-host 0.0.0.0 &> /tmp/raylogs
  # Commands to start Ray on worker nodes. You don't need to change this.
  workerStartRayCommands:
    - ray stop
    - ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 &> /tmp/raylogs