ray/doc/kubernetes/ray-cluster.yaml
Edward Oakes 8ca7fab581
Improve manual Kubernetes deployment documentation (#5582)
* Add ray-cluster, modify submit

* Add comments

* Job submission working

* Write docs

* Add link to autoscaling

* Fix wget link in job

* Use namespace file

* match tense

* fix tab

* Improve job documentation

* comments

* Fix link

* Fix links

* comments

* add overview paragraph

* Update imagePullPolicy

* Warning if no cluster running

* better check
2019-10-03 15:47:49 -07:00

150 lines
4.5 KiB
YAML

# Ray head node service, allowing worker pods to discover the head node.
# Kubernetes exposes this service to pods in the namespace through the
# RAY_HEAD_SERVICE_HOST and RAY_HEAD_SERVICE_PORT_<PORT_NAME> environment
# variables, which the ray-worker start command reads. Those variables are
# only injected into pods created after the service exists, so create this
# service before the worker pods start.
apiVersion: v1
kind: Service
metadata:
  namespace: ray
  name: ray-head
spec:
  ports:
    # Redis ports. The name "redis-primary" determines the
    # RAY_HEAD_SERVICE_PORT_REDIS_PRIMARY variable used by the workers;
    # do not rename it without updating the worker command.
    - name: redis-primary
      port: 6379
      targetPort: 6379
    - name: redis-shard-0
      port: 6380
      targetPort: 6380
    - name: redis-shard-1
      port: 6381
      targetPort: 6381
    # Ray internal communication ports.
    - name: object-manager
      port: 12345
      targetPort: 12345
    - name: node-manager
      port: 12346
      targetPort: 12346
  # Routes traffic to the pod(s) created by the ray-head Deployment.
  selector:
    component: ray-head
---
# Ray head node: runs "ray start --head" with Redis on 6379 (shards on
# 6380/6381) and the object/node managers on 12345/12346.
apiVersion: apps/v1
kind: Deployment
metadata:
  namespace: ray
  name: ray-head
spec:
  # Do not change this - Ray currently only supports one head node per cluster.
  replicas: 1
  selector:
    matchLabels:
      component: ray-head
      type: ray
  template:
    metadata:
      labels:
        component: ray-head
        type: ray
    spec:
      # If the head node goes down, the entire cluster (including all worker
      # nodes) will go down as well. A Deployment's pod template only accepts
      # restartPolicy "Always" (any other value, e.g. "Never", is rejected),
      # so Kubernetes will always bring the head node back up.
      restartPolicy: Always
      # This volume allocates shared memory for Ray to use for its plasma
      # object store. If you do not provide this, Ray will fall back to
      # /tmp, which causes slowdowns if it is not a shared memory volume.
      volumes:
        - name: dshm
          emptyDir:
            medium: Memory
      containers:
        - name: ray-head
          image: rayproject/autoscaler
          imagePullPolicy: Always
          command: ["/bin/bash", "-c", "--"]
          args:
            # Folded scalar: the lines below are joined with single spaces
            # into one command string. $MY_POD_IP and $MY_CPU_REQUEST are
            # expanded by bash from the env entries defined further down.
            - >-
              ray start --head
              --node-ip-address=$MY_POD_IP
              --redis-port=6379
              --redis-shard-ports=6380,6381
              --num-cpus=$MY_CPU_REQUEST
              --object-manager-port=12345
              --node-manager-port=12346
              --block
          ports:
            - containerPort: 6379  # Redis port.
            - containerPort: 6380  # Redis port.
            - containerPort: 6381  # Redis port.
            - containerPort: 12345  # Ray internal communication.
            - containerPort: 12346  # Ray internal communication.
          # Mount the memory-backed "dshm" volume at /dev/shm so that Ray's
          # plasma object store uses shared memory instead of /tmp.
          volumeMounts:
            - mountPath: /dev/shm
              name: dshm
          env:
            - name: MY_POD_IP
              valueFrom:
                fieldRef:
                  fieldPath: status.podIP
            # This is used in the ray start command so that Ray can spawn the
            # correct number of processes. Omitting this may lead to degraded
            # performance.
            # NOTE(review): with the default divisor of 1, a fractional
            # request such as 100m is presumably exposed as 1 - confirm.
            - name: MY_CPU_REQUEST
              valueFrom:
                resourceFieldRef:
                  resource: requests.cpu
          resources:
            requests:
              cpu: 100m
              memory: 512Mi
---
# Ray worker nodes: each pod runs "ray start" and joins the cluster through
# the address of the ray-head Service.
apiVersion: apps/v1
kind: Deployment
metadata:
  namespace: ray
  name: ray-worker
spec:
  # Change this to scale the number of worker nodes started in the Ray cluster.
  replicas: 3
  selector:
    matchLabels:
      component: ray-worker
      type: ray
  template:
    metadata:
      labels:
        component: ray-worker
        type: ray
    spec:
      # "Always" is the only restartPolicy accepted in a Deployment's pod
      # template.
      restartPolicy: Always
      # This volume allocates shared memory for Ray to use for its plasma
      # object store. If you do not provide this, Ray will fall back to
      # /tmp, which causes slowdowns if it is not a shared memory volume.
      volumes:
        - name: dshm
          emptyDir:
            medium: Memory
      containers:
        - name: ray-worker
          image: rayproject/autoscaler
          imagePullPolicy: Always
          command: ["/bin/bash", "-c", "--"]
          args:
            # Folded scalar: the lines below are joined with single spaces
            # into one command string. RAY_HEAD_SERVICE_HOST and
            # RAY_HEAD_SERVICE_PORT_REDIS_PRIMARY are injected by Kubernetes
            # for the ray-head Service (port "redis-primary"); they are only
            # set in pods created after that Service exists.
            - >-
              ray start
              --node-ip-address=$MY_POD_IP
              --num-cpus=$MY_CPU_REQUEST
              --address=$RAY_HEAD_SERVICE_HOST:$RAY_HEAD_SERVICE_PORT_REDIS_PRIMARY
              --object-manager-port=12345
              --node-manager-port=12346
              --block
          ports:
            - containerPort: 12345  # Ray internal communication.
            - containerPort: 12346  # Ray internal communication.
          # Mount the memory-backed "dshm" volume at /dev/shm so that Ray's
          # plasma object store uses shared memory instead of /tmp.
          volumeMounts:
            - mountPath: /dev/shm
              name: dshm
          env:
            - name: MY_POD_IP
              valueFrom:
                fieldRef:
                  fieldPath: status.podIP
            # This is used in the ray start command so that Ray can spawn the
            # correct number of processes. Omitting this may lead to degraded
            # performance.
            - name: MY_CPU_REQUEST
              valueFrom:
                resourceFieldRef:
                  resource: requests.cpu
          resources:
            requests:
              cpu: 100m
              memory: 512Mi