Skip to content

Commit 2b5774c

Browse files
SalikovAlex, SeungjinYang, and cblmemo
authored
[Nebius] Add Docker support for Nebius cloud (#5334)
* Add Docker support for Nebius cloud This commit introduces Docker compatibility for Nebius, enabling users to specify Docker images and runtime options. It includes updates to templates, resource management, and Docker daemon initialization to ensure smooth GPU integration and proper runtime setup. Relevant tests have been adjusted to reflect the new functionality. * Enable Nebius support for H100 GPUs and CUDA >= 12.0 Updated templates and tests to add Nebius compatibility for H100 GPUs and CUDA versions 12.0 and above. Adjusted smoke test parameters and logic to incorporate these changes while maintaining backward compatibility. * fix typo * fix typo * Update formatting and clarify Docker installation logic Adjusted spacing in templates for consistency. Added inline documentation to explain the Docker installation process for specific OS images, optimizing deployment speed by selectively installing Docker only when required. * Fix GPU runtime option and refine Nebius template logic Corrected the comment in `nebius.py` regarding GPU runtime options and adjusted YAML logic to handle docker installation conditionally. Improved formatting and removed redundant lines to enhance clarity and maintainability. * Apply suggestions from code review Co-authored-by: Tian Xia <cblmemo@gmail.com> --------- Co-authored-by: Seung Jin <seungjin219@gmail.com> Co-authored-by: Tian Xia <cblmemo@gmail.com>
1 parent af6a728 commit 2b5774c

File tree

5 files changed

+107
-35
lines changed

5 files changed

+107
-35
lines changed

sky/clouds/nebius.py

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -59,10 +59,6 @@ class Nebius(clouds.Cloud):
5959
('Spot is not supported, as Nebius API does not implement spot.'),
6060
clouds.CloudImplementationFeatures.CLONE_DISK_FROM_CLUSTER:
6161
(f'Migrating disk is currently not supported on {_REPR}.'),
62-
clouds.CloudImplementationFeatures.DOCKER_IMAGE:
63-
(f'Docker image is currently not supported on {_REPR}. '
64-
'You can try running docker command inside the '
65-
'`run` section in task.yaml.'),
6662
clouds.CloudImplementationFeatures.CUSTOM_DISK_TIER:
6763
(f'Custom disk tier is currently not supported on {_REPR}.'),
6864
clouds.CloudImplementationFeatures.HIGH_AVAILABILITY_CONTROLLERS:
@@ -213,7 +209,8 @@ def make_deploy_resources_variables(
213209
else:
214210
raise RuntimeError('Unsupported instance type for Nebius cloud:'
215211
f' {resources.instance_type}')
216-
return {
212+
213+
resources_vars = {
217214
'instance_type': resources.instance_type,
218215
'custom_resources': custom_resources,
219216
'region': region.name,
@@ -222,6 +219,14 @@ def make_deploy_resources_variables(
222219
'zones': None,
223220
}
224221

222+
if acc_dict is not None:
223+
# Nebius cloud's docker runtime information does not contain
224+
# 'nvidia-container-runtime', causing no GPU option to be added to
225+
# the docker run command. We patch this by adding it here.
226+
resources_vars['docker_run_options'] = ['--gpus all']
227+
228+
return resources_vars
229+
225230
def _get_feasible_launchable_resources(
226231
self, resources: 'resources_lib.Resources'
227232
) -> 'resources_utils.FeasibleResources':

sky/provision/docker_utils.py

Lines changed: 22 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,9 @@
2828
# the command.
2929
DOCKER_PERMISSION_DENIED_STR = ('permission denied while trying to connect to '
3030
'the Docker daemon socket')
31+
32+
DOCKER_SOCKET_NOT_READY_STR = ('Is the docker daemon running?')
33+
3134
_DOCKER_SOCKET_WAIT_TIMEOUT_SECONDS = 30
3235

3336

@@ -173,22 +176,25 @@ def _run(self,
173176
stream_logs=False,
174177
separate_stderr=separate_stderr,
175178
log_path=self.log_path)
176-
if (DOCKER_PERMISSION_DENIED_STR in stdout + stderr and
177-
wait_for_docker_daemon):
178-
if time.time() - start > _DOCKER_SOCKET_WAIT_TIMEOUT_SECONDS:
179-
if rc == 0:
180-
# Set returncode to 1 if failed to connect to docker
181-
# daemon after timeout.
182-
rc = 1
183-
break
184-
# Close the cached connection to make the permission update of
185-
# ssh user take effect, e.g. usermod -aG docker $USER, called
186-
# by cloud-init of Azure.
187-
self.runner.close_cached_connection()
188-
logger.info('Failed to connect to docker daemon. It might be '
189-
'initializing, retrying in 5 seconds...')
190-
time.sleep(5)
191-
continue
179+
if (DOCKER_PERMISSION_DENIED_STR in stdout + stderr or
180+
DOCKER_SOCKET_NOT_READY_STR in stdout + stderr):
181+
if wait_for_docker_daemon:
182+
if time.time(
183+
) - start > _DOCKER_SOCKET_WAIT_TIMEOUT_SECONDS:
184+
if rc == 0:
185+
# Set returncode to 1 if failed to connect to docker
186+
# daemon after timeout.
187+
rc = 1
188+
break
189+
# Close the cached connection to make the permission update
190+
# of ssh user take effect, e.g. usermod -aG docker $USER,
191+
# called by cloud-init of Azure.
192+
self.runner.close_cached_connection()
193+
logger.info(
194+
'Failed to connect to docker daemon. It might be '
195+
'initializing, retrying in 5 seconds...')
196+
time.sleep(5)
197+
continue
192198
break
193199
subprocess_utils.handle_returncode(
194200
rc,

sky/templates/nebius-ray.yml.j2

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,27 @@ provider:
1010
module: sky.provision.nebius
1111
region: "{{region}}"
1212

13+
{%- if docker_image is not none %}
14+
docker:
15+
image: {{docker_image}}
16+
container_name: {{docker_container_name}}
17+
run_options:
18+
- --ulimit nofile=1048576:1048576
19+
{%- for run_option in docker_run_options %}
20+
- {{run_option}}
21+
{%- endfor %}
22+
{%- if docker_login_config is not none %}
23+
docker_login_config:
24+
username: |-
25+
{{docker_login_config.username}}
26+
password: |-
27+
{{docker_login_config.password}}
28+
server: |-
29+
{{docker_login_config.server}}
30+
{%- endif %}
31+
{%- endif %}
32+
33+
1334
auth:
1435
ssh_user: ubuntu
1536
ssh_private_key: {{ssh_private_key}}
@@ -22,6 +43,48 @@ available_node_types:
2243
ImageId: {{image_id}}
2344
DiskSize: {{disk_size}}
2445
UserData: |
46+
{%- if docker_image is not none %}
47+
runcmd:
48+
- sudo sed -i 's/^#\?AllowTcpForwarding.*/AllowTcpForwarding yes/' /etc/ssh/sshd_config
49+
- systemctl restart sshd
50+
{%- endif %}
51+
52+
{# Two available OS images:
53+
1. ubuntu22.04-driverless - requires Docker installation
54+
2. ubuntu22.04-cuda12 - comes with Docker pre-installed
55+
To optimize deployment speed, Docker is only installed when using ubuntu22.04-driverless #}
56+
{%- if docker_image is not none and image_id == 'ubuntu22.04-driverless' %}
57+
apt:
58+
sources:
59+
docker.list:
60+
source: deb [arch=amd64] https://download.docker.com/linux/ubuntu $RELEASE stable
61+
keyid: 9DC858229FC7DD38854AE2D88D81803C0EBFCD88
62+
63+
packages:
64+
- apt-transport-https
65+
- ca-certificates
66+
- curl
67+
- gnupg-agent
68+
- software-properties-common
69+
- docker-ce
70+
- docker-ce-cli
71+
- containerd.io
72+
73+
# Enable ipv4 forwarding, required on CIS hardened machines
74+
write_files:
75+
- path: /etc/sysctl.d/enabled_ipv4_forwarding.conf
76+
content: |
77+
net.ipv4.conf.all.forwarding=1
78+
79+
# create the docker group
80+
groups:
81+
- docker
82+
83+
# Add default auto created user to docker group
84+
system_info:
85+
default_user:
86+
groups: [docker]
87+
{%- endif %}
2588
users:
2689
- name: skypilot:ssh_user
2790
shell: /bin/bash

tests/smoke_tests/test_cluster_job.py

Lines changed: 11 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -46,9 +46,8 @@
4646
@pytest.mark.no_scp # SCP does not have T4 gpus. Run test_scp_job_queue instead
4747
@pytest.mark.no_paperspace # Paperspace does not have T4 gpus.
4848
@pytest.mark.no_oci # OCI does not have T4 gpus
49-
@pytest.mark.no_nebius # Nebius does not support T4 GPUs
5049
@pytest.mark.resource_heavy
51-
@pytest.mark.parametrize('accelerator', [{'do': 'H100'}])
50+
@pytest.mark.parametrize('accelerator', [{'do': 'H100', 'nebius': 'H100'}])
5251
def test_job_queue(generic_cloud: str, accelerator: Dict[str, str]):
5352
accelerator = accelerator.get(generic_cloud, 'T4')
5453
name = smoke_tests_utils.get_cluster_name()
@@ -85,8 +84,7 @@ def test_job_queue(generic_cloud: str, accelerator: Dict[str, str]):
8584
@pytest.mark.no_scp # Doesn't support SCP for now
8685
@pytest.mark.no_oci # Doesn't support OCI for now
8786
@pytest.mark.no_kubernetes # Doesn't support Kubernetes for now
88-
@pytest.mark.no_nebius # Nebius does not support Docker
89-
@pytest.mark.parametrize('accelerator', [{'do': 'H100'}])
87+
@pytest.mark.parametrize('accelerator', [{'do': 'H100', 'nebius': 'H100'}])
9088
@pytest.mark.parametrize(
9189
'image_id',
9290
[
@@ -109,6 +107,11 @@ def test_job_queue_with_docker(generic_cloud: str, image_id: str,
109107
name = smoke_tests_utils.get_cluster_name() + image_id[len('docker:'):][:4]
110108
total_timeout_minutes = 40 if generic_cloud == 'azure' else 15
111109
time_to_sleep = 300 if generic_cloud == 'azure' else 200
110+
# Nebius supports CUDA >= 12.0
111+
if (image_id == 'docker:nvidia/cuda:11.8.0-devel-ubuntu18.04' and
112+
generic_cloud == 'nebius'):
113+
image_id = 'docker:nvidia/cuda:12.1.0-devel-ubuntu18.04'
114+
112115
test = smoke_tests_utils.Test(
113116
'job_queue_with_docker',
114117
[
@@ -139,7 +142,7 @@ def test_job_queue_with_docker(generic_cloud: str, image_id: str,
139142
f'sky logs {name} 5 --status',
140143
f'sky logs {name} 6 --status',
141144
# Make sure it is still visible after a stop & start cycle.
142-
f'sky exec {name} --image-id {image_id} nvidia-smi | grep "Tesla T4"',
145+
f'sky exec {name} --image-id {image_id} nvidia-smi | grep -i "{accelerator}"',
143146
f'sky logs {name} 7 --status'
144147
],
145148
f'sky down -y {name}',
@@ -228,8 +231,7 @@ def test_scp_job_queue():
228231
@pytest.mark.no_oci # OCI Cloud does not have T4 gpus.
229232
@pytest.mark.no_vast # Vast does not support num_nodes > 1 yet
230233
@pytest.mark.no_kubernetes # Kubernetes not support num_nodes > 1 yet
231-
@pytest.mark.no_nebius # Nebius does not have T4 gpus.
232-
@pytest.mark.parametrize('accelerator', [{'do': 'H100'}])
234+
@pytest.mark.parametrize('accelerator', [{'do': 'H100', 'nebius': 'H100'}])
233235
def test_job_queue_multinode(generic_cloud: str, accelerator: Dict[str, str]):
234236
accelerator = accelerator.get(generic_cloud, 'T4')
235237
name = smoke_tests_utils.get_cluster_name()
@@ -383,7 +385,6 @@ def test_ibm_job_queue_multinode():
383385
@pytest.mark.no_scp # Doesn't support SCP for now
384386
@pytest.mark.no_oci # Doesn't support OCI for now
385387
@pytest.mark.no_kubernetes # Doesn't support Kubernetes for now
386-
@pytest.mark.no_nebius # Nebius does not support Docker
387388
# TODO(zhwu): we should fix this for kubernetes
388389
def test_docker_preinstalled_package(generic_cloud: str):
389390
name = smoke_tests_utils.get_cluster_name()
@@ -462,9 +463,8 @@ def test_multi_echo(generic_cloud: str):
462463
@pytest.mark.no_lambda_cloud # Lambda Cloud does not have V100 gpus
463464
@pytest.mark.no_ibm # IBM cloud currently doesn't provide public image with CUDA
464465
@pytest.mark.no_scp # SCP does not have V100 (16GB) GPUs. Run test_scp_huggingface instead.
465-
@pytest.mark.no_nebius # Nebius does not have T4 gpus for now
466466
@pytest.mark.resource_heavy
467-
@pytest.mark.parametrize('accelerator', [{'do': 'H100'}])
467+
@pytest.mark.parametrize('accelerator', [{'do': 'H100', 'nebius': 'H100'}])
468468
def test_huggingface(generic_cloud: str, accelerator: Dict[str, str]):
469469
accelerator = accelerator.get(generic_cloud, 'T4')
470470
name = smoke_tests_utils.get_cluster_name()
@@ -1399,9 +1399,8 @@ def test_cancel_azure():
13991399
@pytest.mark.no_paperspace # Paperspace has `gnome-shell` on nvidia-smi
14001400
@pytest.mark.no_scp # SCP does not support num_nodes > 1 yet
14011401
@pytest.mark.no_vast # Vast does not support num_nodes > 1 yet
1402-
@pytest.mark.no_nebius # Nebius Cloud does not work with Cuda 11
14031402
@pytest.mark.resource_heavy
1404-
@pytest.mark.parametrize('accelerator', [{'do': 'H100'}])
1403+
@pytest.mark.parametrize('accelerator', [{'do': 'H100', 'nebius': 'H100'}])
14051404
def test_cancel_pytorch(generic_cloud: str, accelerator: Dict[str, str]):
14061405
accelerator = accelerator.get(generic_cloud, 'T4')
14071406
name = smoke_tests_utils.get_cluster_name()

tests/smoke_tests/test_mount_and_storage.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -387,7 +387,6 @@ def test_kubernetes_context_switch():
387387
# credentials for getting azure storage commands, even though the API server
388388
# is running remotely. We should fix this.
389389
@pytest.mark.no_vast # Requires AWS
390-
@pytest.mark.no_nebius # Docker image is currently not supported on Nebius.
391390
@pytest.mark.resource_heavy
392391
@pytest.mark.parametrize(
393392
'image_id',
@@ -557,7 +556,7 @@ def test_ibm_storage_mounts():
557556
[constants.SKY_IGNORE_FILE, constants.GIT_IGNORE_FILE])
558557
def test_ignore_exclusions(generic_cloud: str, ignore_file: str):
559558
"""Tests that .skyignore patterns correctly exclude files when using sky launch and sky jobs launch.
560-
559+
561560
Creates a temporary directory with various files and folders, adds a .skyignore file
562561
that excludes specific files and folders, then verifies the exclusions work properly
563562
when using sky launch and sky jobs launch commands.

0 commit comments

Comments
 (0)