From 65d1e8592239fbb231dedaf35c2eba7555cb50f9 Mon Sep 17 00:00:00 2001 From: Flavia Beo Date: Wed, 2 Jul 2025 15:53:21 -0300 Subject: [PATCH 1/3] Adds sample yaml file for pod creation Signed-off-by: Flavia Beo --- pod.yaml | 109 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 109 insertions(+) create mode 100644 pod.yaml diff --git a/pod.yaml b/pod.yaml new file mode 100644 index 00000000..aadfb14a --- /dev/null +++ b/pod.yaml @@ -0,0 +1,109 @@ +kind: Pod +apiVersion: v1 +metadata: + name: -dev-e2e-1x-aiu +spec: + restartPolicy: Always + serviceAccountName: default + imagePullSecrets: + - name: -secret + priority: 0 + schedulerName: aiu-scheduler + enableServiceLinks: true + containers: + - resources: + limits: + ibm.com/aiu_pf: '1' + requests: + ibm.com/aiu_pf: '1' + terminationMessagePath: /dev/termination-log + name: -dev-e2e-1x-aiu + command: + - bash + - '-c' + env: + - name: FLEX_COMPUTE + value: SENTIENT + - name: FLEX_DEVICE + value: PF + - name: FLEX_OVERWRITE_NMB_FRAME + value: '1' + - name: FLEX_UNLINK_DEVMEM + value: 'false' + - name: PYTHONUNBUFFERED + value: '1' + - name: HOME + value: /home/senuser + - name: HF_HUB_OFFLINE + value: '1' + - name: HF_HOME + value: /home/senuser/models/huggingface_cache + - name: HF_HUB_CACHE + value: /home/senuser/models/huggingface_cache/hub + - name: DTLOG_LEVEL + value: error + - name: TORCH_SENDNN_LOG + value: CRITICAL + - name: DT_DEEPRT_VERBOSE + value: '-1' + - name: POD_IMAGE + value: *pod_image + - name: FMS_CHECKOUT + value: v1.1.0 + securityContext: + capabilities: + drop: + - ALL + runAsUser: 1000810000 + runAsNonRoot: true + allowPrivilegeEscalation: false + imagePullPolicy: IfNotPresent + volumeMounts: + - name: dev-shm + mountPath: /dev/shm + terminationMessagePolicy: File + image: &pod_image icr.io/ibmaiu_internal/x86_64/dd2/e2e_stable:latest + workingDir: /home/senuser + args: + - | + source ~/.bashrc + unset HF_HOME + cd $HOME + pip3 install -q -U transformers + git clone 
https://github.com/foundation-model-stack/foundation-model-stack.git + cd foundation-model-stack + git checkout $FMS_CHECKOUT + cp ${AIU_AUTOGEN_SENLIB_CONFIG_FILE} /tmp/etc/aiu/senlib_config.json + FILE=/tmp/etc/aiu/senlib_config.json + cat $FILE | jq '. += {"RISCV": {"DOOM": { "enable" : false}}, "SNT_MCI" : { "DCR": {"MCI_CTRL": {"ENABLE_RISCV": "0x0"} } }}' > $FILE.jq + mv $FILE.jq $FILE + cp /tmp/etc/aiu/senlib_config.json $HOME/.senlib.json + echo "POD_IMAGE:" $POD_IMAGE >> /tmp/aiu-query-devices.txt + echo " " >> /tmp/aiu-query-devices.txt + /opt/sentient/bin/aiu-query-devices >> /tmp/aiu-query-devices.txt + echo " " >> ~/.bashrc + echo "cat /tmp/aiu-query-devices.txt" >> ~/.bashrc + echo 'FLEX_COMPUTE = ' $FLEX_COMPUTE + echo 'FLEX_DEVICE = ' $FLEX_DEVICE + echo 'DTLOG_LEVEL = ' $DTLOG_LEVEL + echo 'TORCH_SENDNN_LOG = ' $TORCH_SENDNN_LOG + echo 'DT_DEEPRT_VERBOSE = ' $DT_DEEPRT_VERBOSE + echo 'INFER_SCRIPT = ' $INFER_SCRIPT + echo 'MODEL = ' $MODEL + tail -f /dev/null + serviceAccount: default + volumes: + - name: dev-shm + emptyDir: + medium: Memory + sizeLimit: 64Gi + dnsPolicy: ClusterFirst + tolerations: + - key: node.kubernetes.io/not-ready + operator: Exists + effect: NoExecute + tolerationSeconds: 300 + - key: node.kubernetes.io/unreachable + operator: Exists + effect: NoExecute + tolerationSeconds: 300 From ac6864e6564e8b62940f297971026de0b086bb99 Mon Sep 17 00:00:00 2001 From: Flavia Beo Date: Wed, 2 Jul 2025 15:59:54 -0300 Subject: [PATCH 2/3] Removes tolerations Signed-off-by: Flavia Beo --- pod.yaml | 9 --------- 1 file changed, 9 deletions(-) diff --git a/pod.yaml b/pod.yaml index aadfb14a..7a25879b 100644 --- a/pod.yaml +++ b/pod.yaml @@ -98,12 +98,3 @@ spec: medium: Memory sizeLimit: 64Gi dnsPolicy: ClusterFirst - tolerations: - - key: node.kubernetes.io/not-ready - operator: Exists - effect: NoExecute - tolerationSeconds: 300 - - key: node.kubernetes.io/unreachable - operator: Exists - effect: NoExecute - tolerationSeconds: 300 From 
551aa366786821cf7aec0fcd8af46762593037be Mon Sep 17 00:00:00 2001 From: Flavia Beo Date: Thu, 3 Jul 2025 09:13:32 -0300 Subject: [PATCH 3/3] Add comments Signed-off-by: Flavia Beo --- pod.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pod.yaml b/pod.yaml index 7a25879b..b10eabfe 100644 --- a/pod.yaml +++ b/pod.yaml @@ -1,6 +1,7 @@ kind: Pod apiVersion: v1 metadata: + # Sample pod name. Substitute with your own name. name: -dev-e2e-1x-aiu spec: restartPolicy: Always @@ -17,6 +18,7 @@ spec: requests: ibm.com/aiu_pf: '1' terminationMessagePath: /dev/termination-log + # Sample container name. Substitute with your own name. name: -dev-e2e-1x-aiu command: - bash @@ -36,6 +38,7 @@ spec: value: /home/senuser - name: HF_HUB_OFFLINE value: '1' + # This can be changed to match your local home path environment. - name: HF_HOME value: /home/senuser/models/huggingface_cache - name: HF_HUB_CACHE @@ -62,6 +65,7 @@ spec: - name: dev-shm mountPath: /dev/shm terminationMessagePolicy: File + # AIU software image image: &pod_image icr.io/ibmaiu_internal/x86_64/dd2/e2e_stable:latest workingDir: /home/senuser args: