mirror of
https://github.com/vale981/ray
synced 2025-03-06 10:31:39 -05:00

* Add ray-cluster, modify submit * Add comments * Job submission working * Write docs * Add link to autoscaling * Fix wget link in job * Use namespace file * match tense * fix tab * Improve job documentation * comments * Fix link * Fix links * comments * add overview paragraph * Update imagePullPolicy * Warning if no cluster running * better check
150 lines
4.5 KiB
YAML
150 lines
4.5 KiB
YAML
---
# Service in front of the Ray head pod. It gives worker pods a stable
# address through which they can discover and reach the head node.
apiVersion: v1
kind: Service
metadata:
  name: ray-head
  namespace: ray
spec:
  # Traffic is routed to pods carrying this label (the head Deployment's pod).
  selector:
    component: ray-head
  ports:
    # Redis ports: the primary instance followed by its two shards.
    # The worker start command reads the primary port through
    # $RAY_HEAD_SERVICE_PORT_REDIS_PRIMARY, which is derived from this
    # Service's name and the port name below - rename them together.
    - name: redis-primary
      port: 6379
      targetPort: 6379
    - name: redis-shard-0
      port: 6380
      targetPort: 6380
    - name: redis-shard-1
      port: 6381
      targetPort: 6381

    # Ray internal communication ports.
    - name: object-manager
      port: 12345
      targetPort: 12345
    - name: node-manager
      port: 12346
      targetPort: 12346
---
# Deployment that runs the single Ray head pod. The pod is selected by the
# `ray-head` Service through its `component: ray-head` label.
apiVersion: apps/v1
kind: Deployment
metadata:
  namespace: ray
  name: ray-head
spec:
  # Do not change this - Ray currently only supports one head node per cluster.
  replicas: 1
  selector:
    matchLabels:
      component: ray-head
      type: ray
  template:
    metadata:
      labels:
        component: ray-head
        type: ray
    spec:
      # If the head node goes down, the entire cluster (including all worker
      # nodes) will go down as well. "Always" makes Kubernetes bring up a new
      # head node in that case.
      # NOTE: a Deployment's pod template only accepts "Always"; the API
      # rejects "Never" here. To opt out of restarts, the head would have to
      # run as a bare Pod or a Job instead of a Deployment.
      restartPolicy: Always

      # This volume allocates shared memory for Ray to use for its plasma
      # object store. If you do not provide this, Ray will fall back to
      # /tmp, which causes slowdowns if it is not a shared memory volume.
      volumes:
        - name: dshm
          emptyDir:
            medium: Memory
      containers:
        - name: ray-head
          image: rayproject/autoscaler
          imagePullPolicy: Always
          # The single args entry is handed to `bash -c` as the script to
          # run, so $MY_POD_IP and $MY_CPU_REQUEST are expanded by the shell
          # from the `env` entries below.
          command: ["/bin/bash", "-c", "--"]
          args:
            # Folded scalar: the lines below are joined with single spaces
            # into one command line.
            - >-
              ray start --head
              --node-ip-address=$MY_POD_IP
              --redis-port=6379
              --redis-shard-ports=6380,6381
              --num-cpus=$MY_CPU_REQUEST
              --object-manager-port=12345
              --node-manager-port=12346
              --block
          ports:
            - containerPort: 6379  # Redis port.
            - containerPort: 6380  # Redis port.
            - containerPort: 6381  # Redis port.
            - containerPort: 12345  # Ray internal communication.
            - containerPort: 12346  # Ray internal communication.

          # Mount the shared-memory volume declared under `volumes` at
          # /dev/shm so that Ray's plasma object store uses it instead of
          # falling back to /tmp.
          volumeMounts:
            - mountPath: /dev/shm
              name: dshm
          env:
            # The pod's own IP, passed to `ray start --node-ip-address`.
            - name: MY_POD_IP
              valueFrom:
                fieldRef:
                  fieldPath: status.podIP

            # This is used in the ray start command so that Ray can spawn the
            # correct number of processes. Omitting this may lead to degraded
            # performance.
            # NOTE: no divisor is set, so the default of 1 applies and the
            # CPU request is exposed rounded up to whole cores (100m -> 1).
            - name: MY_CPU_REQUEST
              valueFrom:
                resourceFieldRef:
                  resource: requests.cpu
          resources:
            requests:
              cpu: 100m
              memory: 512Mi
---
# Deployment that runs the Ray worker pods. Each worker joins the cluster by
# connecting to the head node through the `ray-head` Service.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: ray-worker
  namespace: ray
spec:
  # Change this to scale the number of worker nodes started in the Ray cluster.
  replicas: 3
  selector:
    matchLabels:
      type: ray
      component: ray-worker
  template:
    metadata:
      labels:
        type: ray
        component: ray-worker
    spec:
      restartPolicy: Always
      # Memory-backed volume that is mounted at /dev/shm in the container.
      volumes:
        - name: dshm
          emptyDir:
            medium: Memory
      containers:
        - name: ray-worker
          image: rayproject/autoscaler
          imagePullPolicy: Always
          # The single args entry is handed to `bash -c` as the script to
          # run, so the $VARIABLES in it are expanded by the shell.
          command:
            - "/bin/bash"
            - "-c"
            - "--"
          args:
            # Folded scalar: the lines below are joined with single spaces
            # into one command line.
            # RAY_HEAD_SERVICE_HOST and RAY_HEAD_SERVICE_PORT_REDIS_PRIMARY
            # are injected by Kubernetes for the `ray-head` Service and its
            # `redis-primary` port; they are only present in pods created
            # after that Service exists.
            - >-
              ray start
              --node-ip-address=$MY_POD_IP
              --num-cpus=$MY_CPU_REQUEST
              --address=$RAY_HEAD_SERVICE_HOST:$RAY_HEAD_SERVICE_PORT_REDIS_PRIMARY
              --object-manager-port=12345
              --node-manager-port=12346
              --block
          ports:
            - containerPort: 12345  # Ray internal communication.
            - containerPort: 12346  # Ray internal communication.
          volumeMounts:
            - name: dshm
              mountPath: /dev/shm
          env:
            # The pod's own IP, passed to `ray start --node-ip-address`.
            - name: MY_POD_IP
              valueFrom:
                fieldRef:
                  fieldPath: status.podIP

            # This is used in the ray start command so that Ray can spawn the
            # correct number of processes. Omitting this may lead to degraded
            # performance.
            - name: MY_CPU_REQUEST
              valueFrom:
                resourceFieldRef:
                  resource: requests.cpu
          resources:
            requests:
              memory: 512Mi
              cpu: 100m