apiVersion: cluster.ray.io/v1
kind: RayCluster
metadata:
  name: example-cluster
spec:
  # The maximum number of worker nodes to launch in addition to the head node.
  maxWorkers: 3
  # The autoscaler will scale up the cluster faster with higher upscaling speed.
  # E.g., if a task requires adding more nodes, the autoscaler will gradually
  # scale up the cluster in chunks of upscaling_speed*currently_running_nodes.
  # This number should be > 0.
  upscalingSpeed: 1.0
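  # Illustrative arithmetic for the formula above (a sketch, not an exhaustive
  # description of the autoscaler): with upscalingSpeed 1.0 and 2 nodes
  # currently running, up to 1.0 * 2 = 2 nodes may be added in one round;
  # with 4 nodes running, up to 4 nodes per round, and so on.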
  # If a node is idle for this many minutes, it will be removed.
  idleTimeoutMinutes: 5
  # Specify the pod type for the ray head node (as configured below).
  headPodType: head-node
  # Optionally, configure ports for the Ray head service.
  # The ports specified below are the defaults.
  headServicePorts:
  - name: client
    port: 10001
    targetPort: 10001
  - name: dashboard
    port: 8265
    targetPort: 8265
  - name: ray-serve
    port: 8000
    targetPort: 8000
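  # For example, with the default dashboard port you can reach the Ray dashboard
  # from your machine via kubectl port-forwarding. The service name below is a
  # guess based on the cluster name; check `kubectl get services` for the head
  # service the operator actually created:
  #   kubectl port-forward service/example-cluster-ray-head 8265:8265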
  # Specify the allowed pod types for this ray cluster and the resources they provide.
  podTypes:
  - name: head-node
    podConfig:
      apiVersion: v1
      kind: Pod
      metadata:
        # The operator automatically prepends the cluster name to this field.
        generateName: ray-head-
      spec:
        restartPolicy: Never

        # This volume allocates shared memory for Ray to use for its plasma
        # object store. If you do not provide this, Ray will fall back to
        # /tmp, which causes slowdowns if it is not a shared memory volume.
        volumes:
        - name: dshm
          emptyDir:
            medium: Memory
        containers:
        - name: ray-node
          imagePullPolicy: Always
          image: rayproject/ray:latest
          # Do not change this command - it keeps the pod alive until it is
          # explicitly killed.
          command: ["/bin/bash", "-c", "--"]
          args: ["trap : TERM INT; touch /tmp/raylogs; tail -f /tmp/raylogs; sleep infinity & wait;"]
          ports:
          - containerPort: 6379   # Redis port
          - containerPort: 10001  # Used by Ray Client
          - containerPort: 8265   # Used by Ray Dashboard
          - containerPort: 8000   # Used by Ray Serve

          # This mount exposes the shared memory volume declared above at
          # /dev/shm for the plasma object store. If you do not provide this,
          # Ray will fall back to /tmp, which causes slowdowns if it is not a
          # shared memory volume.
          volumeMounts:
          - mountPath: /dev/shm
            name: dshm
          resources:
            requests:
              cpu: 1000m
              memory: 512Mi
              ephemeral-storage: 1Gi
            limits:
              # The maximum memory that this pod is allowed to use. The
              # limit will be detected by ray and split to use 10% for
              # redis, 30% for the shared memory object store, and the
              # rest for application memory. If this limit is not set and
              # the object store size is not set manually, ray will
              # allocate a very large object store in each pod that may
              # cause problems for other pods.
              memory: 512Mi
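              # Illustrative arithmetic for the split described above, assuming
              # the 512Mi limit: roughly 51Mi for redis, 154Mi for the object
              # store, and about 307Mi left for application memory.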
  - name: worker-node
    # Minimum number of Ray workers of this Pod type.
    minWorkers: 2
    # Maximum number of Ray workers of this Pod type. Takes precedence over minWorkers.
    maxWorkers: 3
    # User-specified custom resources for use by Ray.
    # (Ray detects CPU and GPU from pod spec resource requests and limits, so no need to fill those here.)
    rayResources: {"example-resource-a": 1, "example-resource-b": 1}
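    # Tasks and actors can then target these custom resources. An illustrative
    # Python sketch (not part of this config; the resources argument to
    # ray.remote is standard Ray API):
    #   @ray.remote(resources={"example-resource-a": 1})
    #   def task_on_a():
    #       ...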
    podConfig:
      apiVersion: v1
      kind: Pod
      metadata:
        # The operator automatically prepends the cluster name to this field.
        generateName: ray-worker-
      spec:
        restartPolicy: Never
        volumes:
        - name: dshm
          emptyDir:
            medium: Memory
        containers:
        - name: ray-node
          imagePullPolicy: Always
          image: rayproject/ray:latest
          command: ["/bin/bash", "-c", "--"]
          args: ["trap : TERM INT; touch /tmp/raylogs; tail -f /tmp/raylogs; sleep infinity & wait;"]
          # This mount exposes the shared memory volume declared above at
          # /dev/shm for the plasma object store. If you do not provide this,
          # Ray will fall back to /tmp, which causes slowdowns if it is not a
          # shared memory volume.
          volumeMounts:
          - mountPath: /dev/shm
            name: dshm
          resources:
            requests:
              cpu: 1000m
              memory: 512Mi
              ephemeral-storage: 1Gi
            limits:
              # The maximum memory that this pod is allowed to use. The
              # limit will be detected by ray and split to use 10% for
              # redis, 30% for the shared memory object store, and the
              # rest for application memory. If this limit is not set and
              # the object store size is not set manually, ray will
              # allocate a very large object store in each pod that may
              # cause problems for other pods.
              memory: 512Mi
  # Commands to start Ray on the head node. You don't need to change this.
  # Note dashboard-host is set to 0.0.0.0 so that Kubernetes can port forward.
  headStartRayCommands:
  - ray stop
  - ulimit -n 65536; ray start --head --no-monitor --dashboard-host 0.0.0.0 &> /tmp/raylogs
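  # Note: --no-monitor disables the autoscaling monitor on the head node itself;
  # with this operator the autoscaler is expected to run in the operator pod
  # instead.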
  # Commands to start Ray on worker nodes. You don't need to change this.
  workerStartRayCommands:
  - ray stop
  - ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 &> /tmp/raylogs
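  # $RAY_HEAD_IP is an environment variable made available to worker pods
  # (presumably set by the operator) that resolves to the head node's address,
  # so workers can join the cluster without hard-coding an IP.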