apiVersion: cluster.ray.io/v1
kind: RayCluster
metadata:
  name: example-cluster
spec:
  # The maximum number of worker nodes to launch in addition to the head node.
  maxWorkers: 3
  # The autoscaler will scale up the cluster faster with a higher upscaling speed.
  # E.g., if the task requires adding more nodes, the autoscaler will gradually
  # scale up the cluster in chunks of upscaling_speed * currently_running_nodes.
  # This number should be > 0.
  upscalingSpeed: 1.0
  # If a node is idle for this many minutes, it will be removed.
  idleTimeoutMinutes: 5
  # Specify the pod type for the Ray head node (as configured below).
  headPodType: head-node
  # Optionally, configure ports for the Ray head service.
  # The ports specified below are the defaults.
  headServicePorts:
    - name: client
      port: 10001
      targetPort: 10001
    - name: dashboard
      port: 8265
      targetPort: 8265
    - name: ray-serve
      port: 8000
      targetPort: 8000
  # Specify the allowed pod types for this Ray cluster and the resources they provide.
  podTypes:
    - name: head-node
      podConfig:
        apiVersion: v1
        kind: Pod
        metadata:
          # The operator automatically prepends the cluster name to this field.
          generateName: ray-head-
        spec:
          restartPolicy: Never
          # This volume allocates shared memory for Ray to use for its plasma
          # object store. If you do not provide this, Ray will fall back to
          # /tmp, which causes slowdowns if it is not a shared-memory volume.
          volumes:
            - name: dshm
              emptyDir:
                medium: Memory
          containers:
            - name: ray-node
              imagePullPolicy: Always
              image: rayproject/ray:latest
              # Do not change this command - it keeps the pod alive until it is
              # explicitly killed.
              command: ["/bin/bash", "-c", "--"]
              args: ["trap : TERM INT; touch /tmp/raylogs; tail -f /tmp/raylogs; sleep infinity & wait;"]
              ports:
                - containerPort: 6379  # Redis port for Ray <= 1.10.0; GCS server port for Ray >= 1.11.0.
                - containerPort: 10001  # Used by Ray Client
                - containerPort: 8265  # Used by Ray Dashboard
                - containerPort: 8000  # Used by Ray Serve
              # This mount gives Ray access to the shared-memory volume defined
              # above for its plasma object store.
              volumeMounts:
                - mountPath: /dev/shm
                  name: dshm
              resources:
                requests:
                  cpu: 1000m
                  memory: 1Gi
                  ephemeral-storage: 1Gi
                limits:
                  cpu: 1000m
                  # The maximum memory that this pod is allowed to use. The
                  # limit will be detected by Ray and split to use 10% for
                  # Redis, 30% for the shared-memory object store, and the
                  # rest for application memory. If this limit is not set and
                  # the object store size is not set manually, Ray will
                  # allocate a very large object store in each pod, which may
                  # cause problems for other pods.
                  memory: 1Gi
    - name: worker-node
      # Minimum number of Ray workers of this pod type.
      minWorkers: 2
      # Maximum number of Ray workers of this pod type. Takes precedence over minWorkers.
      maxWorkers: 3
      # User-specified custom resources for use by Ray.
      # (Ray detects CPU and GPU from pod spec resource requests and limits,
      # so there is no need to fill those here.)
      rayResources: {"example-resource-a": 1, "example-resource-b": 1}
      podConfig:
        apiVersion: v1
        kind: Pod
        metadata:
          # The operator automatically prepends the cluster name to this field.
          generateName: ray-worker-
        spec:
          restartPolicy: Never
          volumes:
            - name: dshm
              emptyDir:
                medium: Memory
          containers:
            - name: ray-node
              imagePullPolicy: Always
              image: rayproject/ray:latest
              command: ["/bin/bash", "-c", "--"]
              args: ["trap : TERM INT; touch /tmp/raylogs; tail -f /tmp/raylogs; sleep infinity & wait;"]
              # This mount gives Ray access to the shared-memory volume defined
              # above for its plasma object store. If you do not provide it,
              # Ray will fall back to /tmp, which causes slowdowns if it is not
              # a shared-memory volume.
              volumeMounts:
                - mountPath: /dev/shm
                  name: dshm
              resources:
                requests:
                  cpu: 1000m
                  memory: 1Gi
                  ephemeral-storage: 1Gi
                limits:
                  cpu: 1000m
                  # The maximum memory that this pod is allowed to use. The
                  # limit will be detected by Ray and split to use 10% for
                  # Redis, 30% for the shared-memory object store, and the
                  # rest for application memory. If this limit is not set and
                  # the object store size is not set manually, Ray will
                  # allocate a very large object store in each pod, which may
                  # cause problems for other pods.
                  memory: 1Gi
  # Commands to start Ray on the head node. You don't need to change this.
  # Note that dashboard-host is set to 0.0.0.0 so that Kubernetes can
  # port-forward the dashboard.
  headStartRayCommands:
    - ray stop
    - ulimit -n 65536; ray start --head --port=6379 --no-monitor --dashboard-host 0.0.0.0 &> /tmp/raylogs
  # Commands to start Ray on worker nodes. You don't need to change this.
  workerStartRayCommands:
    - ray stop
    - ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 &> /tmp/raylogs
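
# ---------------------------------------------------------------------------
# Deployment sketch (shown as comments so this file remains valid YAML).
# Assumptions: the Ray Kubernetes operator is already running in the target
# namespace, this file is saved as example-cluster.yaml, and the head service
# follows the operator's "<cluster-name>-ray-head" naming convention (the
# actual service name may differ in your setup; check with `kubectl get svc`).
#
#   kubectl apply -f example-cluster.yaml
#   kubectl port-forward service/example-cluster-ray-head 10001:10001
#
# With the client port forwarded, connect from a local Python session via
# Ray Client:
#
#   import ray
#   ray.init("ray://127.0.0.1:10001")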