Skip to content

Commit 6e98b19

Browse files
kiukchungmeta-codesync[bot]
authored andcommitted
clean pod labels for any invalid characters (#1157)
Summary: Pull Request resolved: #1157 Reported on `torchx.__version__` since torchx follows PEP-440 standards for the version string, it could be that certain `torchx` builds (specifically those that are built from source) could have a "local label". For example: `0.8.0dev0+fb` would be a Meta "local build". Added logic to clean the pod label values to meet k8s specs specified in https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/ Specifically: ``` Valid label value: * must be 63 characters or less (can be empty), unless empty * must begin and end with an alphanumeric character ([a-z0-9A-Z]) * could contain dashes (-), underscores (_), dots (.), and alphanumerics between. ``` We first replace any invalid chars with a valid one (`"."`) and finally truncate each pod label value to a MAX length of 63. Reviewed By: ndacosta, AbishekS Differential Revision: D85796595
1 parent 7abb9a2 commit 6e98b19

File tree

2 files changed

+75
-6
lines changed

2 files changed

+75
-6
lines changed

torchx/schedulers/kubernetes_scheduler.py

Lines changed: 27 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,7 @@
101101

102102
import json
103103
import logging
104+
import re
104105
import warnings
105106
from dataclasses import dataclass
106107
from datetime import datetime
@@ -984,13 +985,34 @@ def create_scheduler(
984985
def pod_labels(
985986
app: AppDef, role_idx: int, role: Role, replica_id: int, app_id: str
986987
) -> Dict[str, str]:
988+
989+
def clean(label_value: str) -> str:
990+
# cleans the provided `label_value` to make it compliant
991+
# to pod label specs as described in
992+
# https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/
993+
#
994+
# Valid label value:
995+
# must be 63 characters or less (can be empty),
996+
# unless empty, must begin and end with an alphanumeric character ([a-z0-9A-Z]),
997+
# could contain dashes (-), underscores (_), dots (.), and alphanumerics between.
998+
999+
# Replace invalid characters (allow: alphanum, -, _, .) with "."
1000+
label_value = re.sub(r"[^A-Za-z0-9\-_.]", ".", label_value)
1001+
# Replace leading non-alphanumeric with "."
1002+
label_value = re.sub(r"^[^A-Za-z0-9]+", ".", label_value)
1003+
# Replace trailing non-alphanumeric with "."
1004+
label_value = re.sub(r"[^A-Za-z0-9]+$", ".", label_value)
1005+
1006+
# Trim to 63 characters
1007+
return label_value[:63]
1008+
9871009
return {
988-
LABEL_VERSION: torchx.__version__,
989-
LABEL_APP_NAME: app.name,
1010+
LABEL_VERSION: clean(torchx.__version__),
1011+
LABEL_APP_NAME: clean(app.name),
9901012
LABEL_ROLE_INDEX: str(role_idx),
991-
LABEL_ROLE_NAME: role.name,
1013+
LABEL_ROLE_NAME: clean(role.name),
9921014
LABEL_REPLICA_ID: str(replica_id),
993-
LABEL_KUBE_APP_NAME: app.name,
1015+
LABEL_KUBE_APP_NAME: clean(app.name),
9941016
LABEL_ORGANIZATION: "torchx.pytorch.org",
995-
LABEL_UNIQUE_NAME: app_id,
1017+
LABEL_UNIQUE_NAME: clean(app_id),
9961018
}

torchx/schedulers/test/kubernetes_scheduler_test.py

Lines changed: 48 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,17 +21,27 @@
2121
from torchx.schedulers import kubernetes_scheduler
2222
from torchx.schedulers.api import DescribeAppResponse, ListAppResponse
2323
from torchx.schedulers.docker_scheduler import has_docker
24+
from torchx.schedulers.ids import make_unique
2425
from torchx.schedulers.kubernetes_scheduler import (
2526
app_to_resource,
2627
create_scheduler,
2728
KubernetesJob,
2829
KubernetesOpts,
2930
KubernetesScheduler,
31+
LABEL_APP_NAME,
3032
LABEL_INSTANCE_TYPE,
33+
LABEL_KUBE_APP_NAME,
34+
LABEL_ORGANIZATION,
35+
LABEL_REPLICA_ID,
36+
LABEL_ROLE_INDEX,
37+
LABEL_ROLE_NAME,
38+
LABEL_UNIQUE_NAME,
39+
LABEL_VERSION,
3140
PLACEHOLDER_FIELD_PATH,
3241
role_to_pod,
3342
)
3443
from torchx.specs import AppDryRunInfo, AppState
44+
from torchx.util.strings import normalize_str
3545

3646
SKIP_DOCKER: bool = not has_docker()
3747

@@ -311,7 +321,7 @@ def test_submit_dryrun(self) -> None:
311321
torchx.pytorch.org/replica-id: '0'
312322
torchx.pytorch.org/role-index: '0'
313323
torchx.pytorch.org/role-name: trainer_foo
314-
torchx.pytorch.org/version: {torchx.__version__}
324+
torchx.pytorch.org/version: {torchx.__version__.replace("+", ".")}
315325
spec:
316326
containers:
317327
- command:
@@ -1309,6 +1319,43 @@ def test_validate_spec_long_pod_name(self) -> None:
13091319
self.assertIn("Pod name", str(ctx.exception))
13101320
self.assertIn("exceeds 63 character limit", str(ctx.exception))
13111321

1322+
def test_pod_label(self) -> None:
1323+
_UNUSED = "__UNUSED__"
1324+
1325+
app = specs.AppDef(
1326+
name="foo+bar",
1327+
roles=[specs.Role(name="a/b", image=_UNUSED)],
1328+
)
1329+
app_id = normalize_str(make_unique(app.name))
1330+
labels = kubernetes_scheduler.pod_labels(
1331+
app=app,
1332+
role_idx=0,
1333+
role=app.roles[0],
1334+
replica_id=1,
1335+
app_id=app_id,
1336+
)
1337+
1338+
self.assertDictEqual(
1339+
labels,
1340+
{
1341+
# torchx version complies with PEP-440
1342+
# while typically it is 0.x.x or 0.x.xdev0
1343+
# there could be org specific builds that are of the form
1344+
# 0.x.xdev0+org_name (e.g. 0.8.0dev0+fb)
1345+
# "+" is not a valid pod label char
1346+
# we expect that the version str would've been "cleaned"
1347+
# to replace invalid chars with "." (a valid char)
1348+
LABEL_VERSION: torchx.__version__.replace("+", "."),
1349+
LABEL_APP_NAME: "foo.bar",
1350+
LABEL_ROLE_INDEX: "0",
1351+
LABEL_ROLE_NAME: "a.b",
1352+
LABEL_REPLICA_ID: "1",
1353+
LABEL_KUBE_APP_NAME: "foo.bar",
1354+
LABEL_ORGANIZATION: "torchx.pytorch.org",
1355+
LABEL_UNIQUE_NAME: app_id,
1356+
},
1357+
)
1358+
13121359

13131360
class KubernetesSchedulerNoImportTest(unittest.TestCase):
13141361
"""

0 commit comments

Comments
 (0)