# Ray head node service, allowing worker pods to discover the head node.
---
apiVersion: v1
kind: Service
metadata:
  namespace: ray
  name: ray-head
spec:
  ports:
    - name: client
      protocol: TCP
      port: 10001
      targetPort: 10001
    - name: dashboard
      protocol: TCP
      port: 8265
      targetPort: 8265
    - name: redis
      protocol: TCP
      port: 6379
      targetPort: 6379
  selector:
    component: ray-head
---
apiVersion: apps/v1
kind: Deployment
metadata:
  namespace: ray
  name: ray-head
spec:
  # Do not change this - Ray currently only supports one head node per cluster.
  replicas: 1
  selector:
    matchLabels:
      component: ray-head
      type: ray
  template:
    metadata:
      labels:
        component: ray-head
        type: ray
    spec:
      # If the head node goes down, the entire cluster (including all worker
      # nodes) will go down as well. If you want Kubernetes to bring up a new
      # head node in this case, set this to "Always," else set it to "Never."
      restartPolicy: Always
      # This volume allocates shared memory for Ray to use for its plasma
      # object store. If you do not provide this, Ray will fall back to
      # /tmp which causes slowdowns if it is not a shared memory volume.
      volumes:
        - name: dshm
          emptyDir:
            medium: Memory
      containers:
        - name: ray-head
          image: rayproject/ray:nightly
          imagePullPolicy: IfNotPresent
          command: ["/bin/bash", "-c", "--"]
          args:
            - "ray start --head --port=6379 --redis-shard-ports=6380,6381 --num-cpus=$MY_CPU_REQUEST --object-manager-port=12345 --node-manager-port=12346 --block"
          ports:
            - containerPort: 6379  # Redis port
            - containerPort: 10001  # Used by Ray Client
            - containerPort: 8265  # Used by Ray Dashboard
          # This volume allocates shared memory for Ray to use for its plasma
          # object store. If you do not provide this, Ray will fall back to
          # /tmp which causes slowdowns if it is not a shared memory volume.
          volumeMounts:
            - mountPath: /dev/shm
              name: dshm
          env:
            # This is used in the ray start command so that Ray can spawn the
            # correct number of processes. Omitting this may lead to degraded
            # performance.
            - name: MY_CPU_REQUEST
              valueFrom:
                resourceFieldRef:
                  resource: requests.cpu
          resources:
            requests:
              cpu: 100m
              memory: 512Mi
---
apiVersion: apps/v1
kind: Deployment
metadata:
  namespace: ray
  name: ray-worker
spec:
  # Change this to scale the number of worker nodes started in the Ray cluster.
  replicas: 3
  selector:
    matchLabels:
      component: ray-worker
      type: ray
  template:
    metadata:
      labels:
        component: ray-worker
        type: ray
    spec:
      restartPolicy: Always
      volumes:
        - name: dshm
          emptyDir:
            medium: Memory
      containers:
        - name: ray-worker
          image: rayproject/ray:nightly
          imagePullPolicy: IfNotPresent
          command: ["/bin/bash", "-c", "--"]
          args:
            # RAY_HEAD_SERVICE_HOST / RAY_HEAD_SERVICE_PORT_REDIS are injected
            # by Kubernetes from the "ray-head" Service defined above.
            - "ray start --num-cpus=$MY_CPU_REQUEST --address=$RAY_HEAD_SERVICE_HOST:$RAY_HEAD_SERVICE_PORT_REDIS --object-manager-port=12345 --node-manager-port=12346 --block"
          # This volume allocates shared memory for Ray to use for its plasma
          # object store. If you do not provide this, Ray will fall back to
          # /tmp which causes slowdowns if it is not a shared memory volume.
          volumeMounts:
            - mountPath: /dev/shm
              name: dshm
          env:
            # This is used in the ray start command so that Ray can spawn the
            # correct number of processes. Omitting this may lead to degraded
            # performance.
            - name: MY_CPU_REQUEST
              valueFrom:
                resourceFieldRef:
                  resource: requests.cpu
          resources:
            requests:
              cpu: 100m
              memory: 512Mi