
Commit 286e1ee

Adds Ray Workflow: Multiple Run Support, Distributed Hyperparameter Tuning, and Consistent Setup Across Local/Cloud (#1301)
This PR adds Ray support, which leverages the existing Hydra support to enable, among other things:

- Several training runs at once, in parallel or consecutively, with minimal interaction
- Using the same training setup everywhere (cloud and local) with minimal overhead
- Tuning hyperparameters
- Tuning hyperparameters in parallel on multiple GPUs and/or multiple GPU nodes
- Simultaneously tuning model hyperparameters for different environments/agents
- Resource isolation
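As a hedged illustration of the tuning workflow this enables (a minimal sketch, not the PR's actual scripts; the trainable, search space, and resource request below are assumptions), a distributed run with the Ray Tune and Optuna packages installed by this PR's Dockerfile could look roughly like this:

    # Minimal Ray Tune sketch. Assumes a running Ray cluster; the trainable,
    # search space, and metric below are illustrative, not from this PR.
    import ray
    from ray import tune
    from ray.tune.search.optuna import OptunaSearch

    def trainable(config):
        # Stand-in for an Isaac Lab training run; returns a final metric.
        return {"score": -(config["lr"] - 1e-3) ** 2}

    ray.init(address="auto")  # connect to an existing Ray cluster
    tuner = tune.Tuner(
        tune.with_resources(trainable, {"gpu": 1}),  # one GPU per trial
        param_space={"lr": tune.loguniform(1e-5, 1e-2)},
        tune_config=tune.TuneConfig(
            search_alg=OptunaSearch(metric="score", mode="max"),
            num_samples=8,  # trials are scheduled in parallel across workers
        ),
    )
    results = tuner.fit()
    print(results.get_best_result(metric="score", mode="max").config)

Each of the num_samples trials is placed on whichever worker has a free GPU, which is what makes multi-GPU and multi-node tuning run in parallel.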
1 parent d8bc725 commit 286e1ee

File tree

13 files changed: +2457 additions, -0 deletions

docs/index.rst

Lines changed: 1 addition & 0 deletions
@@ -100,6 +100,7 @@ Table of Contents
    source/features/hydra
    source/features/multi_gpu
    Tiled Rendering</source/overview/sensors/camera>
+   source/features/ray
    source/features/reproducibility

 .. toctree::

docs/source/features/ray.rst

Lines changed: 379 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 19 additions & 0 deletions
FROM isaac-lab-base:latest

# Set NVIDIA paths
ENV PATH="/usr/local/nvidia/bin:$PATH"
ENV LD_LIBRARY_PATH="/usr/local/nvidia/lib64"

# Link NVIDIA binaries
RUN ln -sf /usr/local/nvidia/bin/nvidia* /usr/bin

# Install Ray; prepend an Isaac Sim Python shebang to the ray CLI so it runs
# under Isaac Sim's bundled Python, and symlink it onto the default PATH
RUN /workspace/isaaclab/_isaac_sim/python.sh -m pip install "ray[default, tune]"==2.31.0 && \
    sed -i "1i $(echo "#!/workspace/isaaclab/_isaac_sim/python.sh")" \
    /isaac-sim/kit/python/bin/ray && ln -s /isaac-sim/kit/python/bin/ray /usr/local/bin/ray

# Install tuning dependencies
RUN /workspace/isaaclab/_isaac_sim/python.sh -m pip install optuna bayesian-optimization

# Install MLflow for logging
RUN /workspace/isaaclab/_isaac_sim/python.sh -m pip install mlflow
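Once an image built from this Dockerfile is serving as a cluster head, work can be submitted over the dashboard port it exposes. A minimal sketch using Ray's job submission API; the head address and entrypoint are placeholders, not values from this PR:

    # Submit a job to a Ray head node; 8265 is the Ray dashboard port.
    from ray.job_submission import JobSubmissionClient

    client = JobSubmissionClient("http://<head-node-ip>:8265")  # example address
    job_id = client.submit_job(
        entrypoint="python -c 'import ray; ray.init(); print(ray.cluster_resources())'"
    )
    print(client.get_job_status(job_id))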
Lines changed: 204 additions & 0 deletions
# Jinja is used for templating here, as a full Helm setup is excessive for this application
apiVersion: ray.io/v1alpha1
kind: RayCluster
metadata:
  name: {{ name }}
  namespace: {{ namespace }}
spec:
  rayVersion: "2.8.0"
  enableInTreeAutoscaling: true
  autoscalerOptions:
    upscalingMode: Default
    idleTimeoutSeconds: 120
    imagePullPolicy: Always
    securityContext: {}
    envFrom: []

  headGroupSpec:
    rayStartParams:
      block: "true"
      dashboard-host: 0.0.0.0
      dashboard-port: "8265"
      node-ip-address: "0.0.0.0"
      port: "6379"
      include-dashboard: "true"
      ray-debugger-external: "true"
      object-manager-port: "8076"
      num-gpus: "0"
      num-cpus: "0" # prevent scheduling jobs to the head node - workers only
    headService:
      apiVersion: v1
      kind: Service
      metadata:
        name: head
      spec:
        type: LoadBalancer
    template:
      metadata:
        labels:
          app.kubernetes.io/instance: tuner
          app.kubernetes.io/name: kuberay
          cloud.google.com/gke-ray-node-type: head
      spec:
        serviceAccountName: {{ service_account_name }}
        affinity: {}
        securityContext:
          fsGroup: 100
        containers:
          - env: []
            image: {{ image }}
            imagePullPolicy: Always
            name: head
            resources:
              limits:
                cpu: "{{ num_head_cpu }}"
                memory: {{ head_ram_gb }}G
                nvidia.com/gpu: "0"
              requests:
                cpu: "{{ num_head_cpu }}"
                memory: {{ head_ram_gb }}G
                nvidia.com/gpu: "0"
            securityContext: {}
            volumeMounts:
              - mountPath: /tmp/ray
                name: ray-logs
            command: ["/bin/bash", "-c", "ray start --head --port=6379 --object-manager-port=8076 --dashboard-host=0.0.0.0 --dashboard-port=8265 --include-dashboard=true && tail -f /dev/null"]
          - image: fluent/fluent-bit:1.9.6
            name: fluentbit
            resources:
              limits:
                cpu: 100m
                memory: 128Mi
              requests:
                cpu: 100m
                memory: 128Mi
            volumeMounts:
              - mountPath: /tmp/ray
                name: ray-logs
        imagePullSecrets: []
        nodeSelector:
          iam.gke.io/gke-metadata-server-enabled: "true"
        volumes:
          - configMap:
              name: fluentbit-config
            name: fluentbit-config
          - name: ray-logs
            emptyDir: {}

  workerGroupSpecs:
  {% for it in range(gpu_per_worker|length) %}
    - groupName: "{{ worker_accelerator[it] }}x{{ gpu_per_worker[it] }}-cpu-{{ cpu_per_worker[it] }}-ram-gb-{{ ram_gb_per_worker[it] }}"
      replicas: {{ num_workers[it] }}
      maxReplicas: {{ num_workers[it] }}
      minReplicas: {{ num_workers[it] }}
      rayStartParams:
        block: "true"
        ray-debugger-external: "true"
      template:
        metadata:
          annotations: {}
          labels:
            app.kubernetes.io/instance: tuner
            app.kubernetes.io/name: kuberay
            cloud.google.com/gke-ray-node-type: worker
        spec:
          serviceAccountName: {{ service_account_name }}
          affinity: {}
          securityContext:
            fsGroup: 100
          containers:
            - env:
                - name: NVIDIA_VISIBLE_DEVICES
                  value: "all"
                - name: NVIDIA_DRIVER_CAPABILITIES
                  value: "compute,utility"
              image: {{ image }}
              imagePullPolicy: Always
              name: ray-worker
              resources:
                limits:
                  cpu: "{{ cpu_per_worker[it] }}"
                  memory: {{ ram_gb_per_worker[it] }}G
                  nvidia.com/gpu: "{{ gpu_per_worker[it] }}"
                requests:
                  cpu: "{{ cpu_per_worker[it] }}"
                  memory: {{ ram_gb_per_worker[it] }}G
                  nvidia.com/gpu: "{{ gpu_per_worker[it] }}"
              securityContext: {}
              volumeMounts:
                - mountPath: /tmp/ray
                  name: ray-logs
              command: ["/bin/bash", "-c", "ray start --address=head.{{ namespace }}.svc.cluster.local:6379 && tail -f /dev/null"]
            - image: fluent/fluent-bit:1.9.6
              name: fluentbit
              resources:
                limits:
                  cpu: 100m
                  memory: 128Mi
                requests:
                  cpu: 100m
                  memory: 128Mi
              volumeMounts:
                - mountPath: /tmp/ray
                  name: ray-logs
          imagePullSecrets: []
          nodeSelector:
            cloud.google.com/gke-accelerator: {{ worker_accelerator[it] }}
            iam.gke.io/gke-metadata-server-enabled: "true"
          tolerations:
            - key: "nvidia.com/gpu"
              operator: "Exists"
              effect: "NoSchedule"
          volumes:
            - configMap:
                name: fluentbit-config
              name: fluentbit-config
            - name: ray-logs
              emptyDir: {}
  {% endfor %}

---
# MLflow server - for fetching logs
apiVersion: apps/v1
kind: Deployment
metadata:
  name: {{ name }}-mlflow
  namespace: {{ namespace }}
spec:
  replicas: 1
  selector:
    matchLabels:
      app: mlflow
  template:
    metadata:
      labels:
        app: mlflow
    spec:
      containers:
        - name: mlflow
          image: ghcr.io/mlflow/mlflow:v2.9.2
          ports:
            - containerPort: 5000
          command: ["mlflow"]
          args:
            - server
            - --host=0.0.0.0
            - --port=5000
            - --backend-store-uri=sqlite:///mlflow.db
---
# MLflow service (for port forwarding: kubectl port-forward service/{name}-mlflow 5000:5000)
apiVersion: v1
kind: Service
metadata:
  name: {{ name }}-mlflow
  namespace: {{ namespace }}
spec:
  selector:
    app: mlflow
  ports:
    - port: 5000
      targetPort: 5000
  type: ClusterIP
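For reference, a minimal sketch of rendering a template like this and applying it; the file name, the parameter values, and the use of plain jinja2 plus kubectl are assumptions (the PR may wrap this step in its own launcher script):

    # Render the KubeRay Jinja template with illustrative values, then apply it
    # with kubectl. The file name and all values here are examples, not the PR's.
    import subprocess
    import jinja2

    with open("kuberay.yaml.jinja") as f:  # hypothetical template path
        template = jinja2.Template(f.read())

    manifest = template.render(
        name="tuner",
        namespace="default",
        image="isaac-lab-ray:latest",
        service_account_name="default",
        num_head_cpu=8,
        head_ram_gb=16,
        worker_accelerator=["nvidia-l4"],
        gpu_per_worker=[1],
        cpu_per_worker=[8],
        ram_gb_per_worker=[16],
        num_workers=[2],
    )
    subprocess.run(["kubectl", "apply", "-f", "-"], input=manifest.encode(), check=True)

Because num_workers, gpu_per_worker, and the other per-worker parameters are parallel lists, each index yields one worker group in the template's loop.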

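Since the MLflow service above is ClusterIP-only, results are typically read through the port-forward mentioned in its comment. A hedged sketch of querying it; the tracking URI assumes that port-forward is active:

    # After: kubectl port-forward service/<name>-mlflow 5000:5000
    # Query the MLflow tracking server for logged runs.
    import mlflow

    mlflow.set_tracking_uri("http://localhost:5000")
    runs = mlflow.search_runs(search_all_experiments=True)
    print(runs[["run_id", "status"]].head())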