ray/doc/kubernetes/ray-cluster.yaml

# Ray head node service, allowing worker pods to discover the head node.
apiVersion: v1
kind: Service
metadata:
  namespace: ray
  name: ray-head
spec:
  ports:
    # Redis ports.
    - name: redis-primary
      port: 6379
      targetPort: 6379
    - name: redis-shard-0
      port: 6380
      targetPort: 6380
    - name: redis-shard-1
      port: 6381
      targetPort: 6381

    # Ray internal communication ports.
    - name: object-manager
      port: 12345
      targetPort: 12345
    - name: node-manager
      port: 12346
      targetPort: 12346
  selector:
    component: ray-head
---
apiVersion: apps/v1
kind: Deployment
metadata:
  namespace: ray
  name: ray-head
spec:
  # Do not change this - Ray currently only supports one head node per cluster.
  replicas: 1
  selector:
    matchLabels:
      component: ray-head
      type: ray
  template:
    metadata:
      labels:
        component: ray-head
        type: ray
    spec:
      # If the head node goes down, the entire cluster (including all worker
      # nodes) will go down as well. If you want Kubernetes to bring up a new
      # head node in this case, set this to "Always," else set it to "Never."
      restartPolicy: Always

      # This volume allocates shared memory for Ray to use for its plasma
      # object store. If you do not provide this, Ray will fall back to
      # /tmp which cause slowdowns if is not a shared memory volume.
      volumes:
      - name: dshm
        emptyDir:
          medium: Memory
      containers:
        - name: ray-head
          image: rayproject/autoscaler
          imagePullPolicy: Always
          command: [ "/bin/bash", "-c", "--" ]
          args:
            - "ray start --head --node-ip-address=$MY_POD_IP --redis-port=6379 --redis-shard-ports=6380,6381 --num-cpus=$MY_CPU_REQUEST --object-manager-port=12345 --node-manager-port=12346 --block"
          ports:
            - containerPort: 6379 # Redis port.
            - containerPort: 6380 # Redis port.
            - containerPort: 6381 # Redis port.
            - containerPort: 12345 # Ray internal communication.
            - containerPort: 12346 # Ray internal communication.

          # This volume allocates shared memory for Ray to use for its plasma
          # object store. If you do not provide this, Ray will fall back to
          # /tmp which cause slowdowns if is not a shared memory volume.
          volumeMounts:
            - mountPath: /dev/shm
              name: dshm
          env:
            - name: MY_POD_IP
              valueFrom:
                fieldRef:
                  fieldPath: status.podIP

            # This is used in the ray start command so that Ray can spawn the
            # correct number of processes. Omitting this may lead to degraded
            # performance.
            - name: MY_CPU_REQUEST
              valueFrom:
                resourceFieldRef:
                  resource: requests.cpu
          resources:
            requests:
              cpu: 100m
              memory: 512Mi
---
apiVersion: apps/v1
kind: Deployment
metadata:
  namespace: ray
  name: ray-worker
spec:
  # Change this to scale the number of worker nodes started in the Ray cluster.
  replicas: 3
  selector:
    matchLabels:
      component: ray-worker
      type: ray
  template:
    metadata:
      labels:
        component: ray-worker
        type: ray
    spec:
      restartPolicy: Always
      volumes:
      - name: dshm
        emptyDir:
          medium: Memory
      containers:
      - name: ray-worker
        image: rayproject/autoscaler
        imagePullPolicy: Always
        command: ["/bin/bash", "-c", "--"]
        args:
          - "ray start --node-ip-address=$MY_POD_IP --num-cpus=$MY_CPU_REQUEST --address=$RAY_HEAD_SERVICE_HOST:$RAY_HEAD_SERVICE_PORT_REDIS_PRIMARY --object-manager-port=12345 --node-manager-port=12346 --block"
        ports:
          - containerPort: 12345 # Ray internal communication.
          - containerPort: 12346 # Ray internal communication.
        volumeMounts:
          - mountPath: /dev/shm
            name: dshm
        env:
          - name: MY_POD_IP
            valueFrom:
              fieldRef:
                fieldPath: status.podIP

          # This is used in the ray start command so that Ray can spawn the
          # correct number of processes. Omitting this may lead to degraded
          # performance.
          - name: MY_CPU_REQUEST
            valueFrom:
              resourceFieldRef:
                resource: requests.cpu
        resources:
          requests:
            cpu: 100m
            memory: 512Mi