# Jinja is used for templating here, as a full Helm setup would be excessive for this application
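# A minimal render sketch (assuming Python with the jinja2 package; the file
# name and all variable values below are hypothetical examples):
#   from jinja2 import Template
#   with open("ray_cluster.yaml.j2") as f:
#       manifest = Template(f.read()).render(
#           name="tuner", namespace="default", image="my-registry/ray:latest",
#           service_account_name="ray-sa", num_head_cpu=4, head_ram_gb=8,
#           worker_accelerator=["nvidia-tesla-t4"], gpu_per_worker=[1],
#           cpu_per_worker=[8], ram_gb_per_worker=[32], num_workers=[2])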
apiVersion: ray.io/v1alpha1
kind: RayCluster
metadata:
  name: {{ name }}
  namespace: {{ namespace }}
spec:
  rayVersion: "2.8.0"
  enableInTreeAutoscaling: true
  autoscalerOptions:
    upscalingMode: Default
    idleTimeoutSeconds: 120
    imagePullPolicy: Always
    securityContext: {}
    envFrom: []

  headGroupSpec:
    rayStartParams:
      block: "true"
      dashboard-host: "0.0.0.0"
      dashboard-port: "8265"
      node-ip-address: "0.0.0.0"
      port: "6379"
      include-dashboard: "true"
      ray-debugger-external: "true"
      object-manager-port: "8076"
      num-gpus: "0"
      num-cpus: "0" # prevent scheduling jobs on the head node - workers only
    headService:
      apiVersion: v1
      kind: Service
      metadata:
        name: head
      spec:
        type: LoadBalancer
    template:
      metadata:
        labels:
          app.kubernetes.io/instance: tuner
          app.kubernetes.io/name: kuberay
          cloud.google.com/gke-ray-node-type: head
      spec:
        serviceAccountName: {{ service_account_name }}
        affinity: {}
        securityContext:
          fsGroup: 100
        containers:
          - image: {{ image }}
            imagePullPolicy: Always
            name: head
            resources:
              limits:
                cpu: "{{ num_head_cpu }}"
                memory: {{ head_ram_gb }}G
                nvidia.com/gpu: "0"
              requests:
                cpu: "{{ num_head_cpu }}"
                memory: {{ head_ram_gb }}G
                nvidia.com/gpu: "0"
            securityContext: {}
            volumeMounts:
              - mountPath: /tmp/ray
                name: ray-logs
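            # `ray start --head` returns once the daemons are up, so
            # `tail -f /dev/null` keeps the container (and the pod) alive.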
            command: ["/bin/bash", "-c", "ray start --head --port=6379 --object-manager-port=8076 --dashboard-host=0.0.0.0 --dashboard-port=8265 --include-dashboard=true && tail -f /dev/null"]
          - image: fluent/fluent-bit:1.9.6
            name: fluentbit
            resources:
              limits:
                cpu: 100m
                memory: 128Mi
              requests:
                cpu: 100m
                memory: 128Mi
            volumeMounts:
              - mountPath: /tmp/ray
                name: ray-logs
              # Mount the fluent-bit config so the ConfigMap volume below is
              # actually consumed (assumes the fluentbit-config ConfigMap
              # carries a fluent-bit.conf key).
              - mountPath: /fluent-bit/etc/fluent-bit.conf
                subPath: fluent-bit.conf
                name: fluentbit-config
        imagePullSecrets: []
        nodeSelector:
          iam.gke.io/gke-metadata-server-enabled: "true"
        volumes:
          - configMap:
              name: fluentbit-config
            name: fluentbit-config
          - name: ray-logs
            emptyDir: {}

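  # One worker group is emitted per entry in the parallel per-worker lists
  # (worker_accelerator, gpu_per_worker, cpu_per_worker, ram_gb_per_worker,
  # num_workers). replicas == minReplicas == maxReplicas pins each group to a
  # fixed size, so the in-tree autoscaler cannot actually resize it.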
  workerGroupSpecs:
    {% for it in range(gpu_per_worker|length) %}
    - groupName: "{{ worker_accelerator[it] }}x{{ gpu_per_worker[it] }}-cpu-{{ cpu_per_worker[it] }}-ram-gb-{{ ram_gb_per_worker[it] }}"
      replicas: {{ num_workers[it] }}
      maxReplicas: {{ num_workers[it] }}
      minReplicas: {{ num_workers[it] }}
      rayStartParams:
        block: "true"
        ray-debugger-external: "true"
      template:
        metadata:
          annotations: {}
          labels:
            app.kubernetes.io/instance: tuner
            app.kubernetes.io/name: kuberay
            cloud.google.com/gke-ray-node-type: worker
        spec:
          serviceAccountName: {{ service_account_name }}
          affinity: {}
          securityContext:
            fsGroup: 100
          containers:
            - env:
                - name: NVIDIA_VISIBLE_DEVICES
                  value: "all"
                - name: NVIDIA_DRIVER_CAPABILITIES
                  value: "compute,utility"
              image: {{ image }}
              imagePullPolicy: Always
              name: ray-worker
              resources:
                limits:
                  cpu: "{{ cpu_per_worker[it] }}"
                  memory: {{ ram_gb_per_worker[it] }}G
                  nvidia.com/gpu: "{{ gpu_per_worker[it] }}"
                requests:
                  cpu: "{{ cpu_per_worker[it] }}"
                  memory: {{ ram_gb_per_worker[it] }}G
                  nvidia.com/gpu: "{{ gpu_per_worker[it] }}"
              securityContext: {}
              volumeMounts:
                - mountPath: /tmp/ray
                  name: ray-logs
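              # Workers join through the head Service's in-cluster DNS name;
              # as on the head, `tail -f /dev/null` keeps the container alive
              # after `ray start` returns.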
              command: ["/bin/bash", "-c", "ray start --address=head.{{ namespace }}.svc.cluster.local:6379 && tail -f /dev/null"]
            - image: fluent/fluent-bit:1.9.6
              name: fluentbit
              resources:
                limits:
                  cpu: 100m
                  memory: 128Mi
                requests:
                  cpu: 100m
                  memory: 128Mi
              volumeMounts:
                - mountPath: /tmp/ray
                  name: ray-logs
                # As on the head node: mount the fluent-bit config so the
                # ConfigMap volume below is consumed (assumes a fluent-bit.conf key).
                - mountPath: /fluent-bit/etc/fluent-bit.conf
                  subPath: fluent-bit.conf
                  name: fluentbit-config

          imagePullSecrets: []
          nodeSelector:
            cloud.google.com/gke-accelerator: {{ worker_accelerator[it] }}
            iam.gke.io/gke-metadata-server-enabled: "true"
          tolerations:
            - key: "nvidia.com/gpu"
              operator: "Exists"
              effect: "NoSchedule"
          volumes:
            - configMap:
                name: fluentbit-config
              name: fluentbit-config
            - name: ray-logs
              emptyDir: {}
    {% endfor %}
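# Once the cluster is up, the Ray dashboard / job API listens on port 8265 on
# the head node. A minimal access sketch (assuming the head Service keeps the
# `head` name set above - KubeRay may derive its own service name on some
# versions - and `train.py` is a placeholder script):
#   kubectl port-forward -n <namespace> service/head 8265:8265
#   ray job submit --address http://localhost:8265 -- python train.py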

---
# MLflow server - for fetching experiment logs
apiVersion: apps/v1
kind: Deployment
metadata:
  name: {{ name }}-mlflow
  namespace: {{ namespace }}
spec:
  replicas: 1
  selector:
    matchLabels:
      app: mlflow
  template:
    metadata:
      labels:
        app: mlflow
    spec:
      containers:
        - name: mlflow
          image: ghcr.io/mlflow/mlflow:v2.9.2
          ports:
            - containerPort: 5000
          command: ["mlflow"]
          args:
            - server
            - --host=0.0.0.0
            - --port=5000
            - --backend-store-uri=sqlite:///mlflow.db
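# Note: the SQLite store lives on the container's own filesystem (no volume is
# mounted), so tracking history is lost whenever the pod restarts - acceptable
# for transient tuning runs, not for durable experiment tracking.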
---
# MLflow Service (for port forwarding: kubectl port-forward service/{{ name }}-mlflow 5000:5000)
apiVersion: v1
kind: Service
metadata:
  name: {{ name }}-mlflow
  namespace: {{ namespace }}
spec:
  selector:
    app: mlflow
  ports:
    - port: 5000
      targetPort: 5000
  type: ClusterIP