# ray/doc/kubernetes/ray-cluster.yaml
# Ray head node service, allowing worker pods to discover the head node.
apiVersion: v1
kind: Service
metadata:
  namespace: ray
  name: example-cluster-ray-head
spec:
  ports:
    - name: client
      protocol: TCP
      port: 10001
      targetPort: 10001
    - name: dashboard
      protocol: TCP
      port: 8265
      targetPort: 8265
    - name: redis
      protocol: TCP
      port: 6379
      targetPort: 6379
  selector:
    component: ray-head
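# Within the cluster, this Service is also reachable by DNS as
# example-cluster-ray-head.ray.svc.cluster.local (standard Kubernetes Service
# DNS, assuming the default "cluster.local" cluster domain).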
---
apiVersion: apps/v1
kind: Deployment
metadata:
  namespace: ray
  name: ray-head
spec:
  # Do not change this - Ray currently only supports one head node per cluster.
  replicas: 1
  selector:
    matchLabels:
      component: ray-head
      type: ray
  template:
    metadata:
      labels:
        component: ray-head
        type: ray
    spec:
      # If the head node goes down, the entire cluster (including all worker
      # nodes) will go down as well. If you want Kubernetes to bring up a new
      # head node in this case, set this to "Always"; otherwise, set it to
      # "Never".
      restartPolicy: Always
      # This volume allocates shared memory for Ray to use for its plasma
      # object store. If you do not provide this, Ray will fall back to
      # /tmp, which will cause slowdowns if it is not a shared memory volume.
      volumes:
        - name: dshm
          emptyDir:
            medium: Memory
      containers:
        - name: ray-head
          image: rayproject/ray:latest
          imagePullPolicy: IfNotPresent
          command: ["/bin/bash", "-c", "--"]
          args:
            - "ray start --head --port=6379 --redis-shard-ports=6380,6381 --num-cpus=$MY_CPU_REQUEST --object-manager-port=22345 --node-manager-port=22346 --dashboard-host=0.0.0.0 --block"
          ports:
            - containerPort: 6379  # Redis port
            - containerPort: 10001  # Used by Ray Client
            - containerPort: 8265  # Used by Ray Dashboard
          # This volume allocates shared memory for Ray to use for its plasma
          # object store. If you do not provide this, Ray will fall back to
          # /tmp, which will cause slowdowns if it is not a shared memory
          # volume.
          volumeMounts:
            - mountPath: /dev/shm
              name: dshm
          env:
            # This is used in the ray start command so that Ray can spawn the
            # correct number of processes. Omitting this may lead to degraded
            # performance.
            - name: MY_CPU_REQUEST
              valueFrom:
                resourceFieldRef:
                  resource: requests.cpu
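                  # Note: with the default divisor of 1, Kubernetes rounds
                  # resourceFieldRef CPU values up to a whole core, so the
                  # 100m request below is exposed to the container as "1".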
          resources:
            requests:
              cpu: 100m
              memory: 512Mi
---
apiVersion: apps/v1
kind: Deployment
metadata:
  namespace: ray
  name: ray-worker
spec:
  # Change this to scale the number of worker nodes started in the Ray cluster.
  replicas: 3
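  # The replica count can also be changed at runtime without editing this
  # file, e.g.: kubectl -n ray scale deployment/ray-worker --replicas=5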
  selector:
    matchLabels:
      component: ray-worker
      type: ray
  template:
    metadata:
      labels:
        component: ray-worker
        type: ray
    spec:
      restartPolicy: Always
      volumes:
        - name: dshm
          emptyDir:
            medium: Memory
      containers:
        - name: ray-worker
          image: rayproject/ray:latest
          imagePullPolicy: IfNotPresent
          command: ["/bin/bash", "-c", "--"]
          args:
            - "ray start --num-cpus=$MY_CPU_REQUEST --address=$EXAMPLE_CLUSTER_RAY_HEAD_SERVICE_HOST:$EXAMPLE_CLUSTER_RAY_HEAD_SERVICE_PORT_REDIS --object-manager-port=22345 --node-manager-port=22346 --block"
          # This volume allocates shared memory for Ray to use for its plasma
          # object store. If you do not provide this, Ray will fall back to
          # /tmp, which will cause slowdowns if it is not a shared memory
          # volume.
          volumeMounts:
            - mountPath: /dev/shm
              name: dshm
          env:
            # This is used in the ray start command so that Ray can spawn the
            # correct number of processes. Omitting this may lead to degraded
            # performance.
            - name: MY_CPU_REQUEST
              valueFrom:
                resourceFieldRef:
                  resource: requests.cpu
          resources:
            requests:
              cpu: 100m
              memory: 512Mi
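# ---------------------------------------------------------------------------
# Example usage (a minimal sketch; assumes kubectl access and that the "ray"
# namespace already exists, since it is referenced but not created above):
#
#   kubectl apply -f ray-cluster.yaml
#   kubectl -n ray get pods        # wait for head and workers to be Running
#
# To reach the cluster from outside, port-forward the head Service and
# connect with Ray Client (the client's Ray version should match the
# rayproject/ray image):
#
#   kubectl -n ray port-forward service/example-cluster-ray-head 10001:10001
#
#   # then, in Python:
#   import ray
#   ray.init("ray://127.0.0.1:10001")
#
# The dashboard listens on port 8265 and can be forwarded the same way.
# ---------------------------------------------------------------------------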