
Commit 142520a

fix container env + Neuron related changes

1 parent 56c15d8

5 files changed: +78 −29

Dockerfile-neuron

Lines changed: 3 additions & 4 deletions

```diff
@@ -90,10 +90,9 @@ ARG NEURONX_COLLECTIVES_LIB_VERSION=2.28.27.0-bc30ece58
 ARG NEURONX_RUNTIME_LIB_VERSION=2.28.23.0-dd5879008
 ARG NEURONX_TOOLS_VERSION=2.26.14.0
 
-ARG NEURONX_CC_VERSION=2.21.18209.0+043b1bf7
-ARG NEURONX_FRAMEWORK_VERSION=2.8.0.2.10.13553+1e4dd6ca
+ARG NEURONX_CC_VERSION=2.21.33363.0+82129205
+ARG NEURONX_FRAMEWORK_VERSION=2.8.0.2.10.16998+e9bf8a50
 ARG NEURONX_DISTRIBUTED_VERSION=0.15.22404+1f27bddf
-ARG NEURONX_DISTRIBUTED_INFERENCE_VERSION=0.6.10598+a59fdc00
 
 RUN apt-get update \
     && apt-get upgrade -y \
@@ -137,13 +136,13 @@ RUN apt-get update \
     && apt-get clean
 
 ENV PATH="/opt/aws/neuron/bin:${PATH}"
-ENV NEURON_RT_VISIBLE_CORES=ALL
 
 RUN pip install --index-url https://pip.repos.neuron.amazonaws.com \
     --extra-index-url https://pypi.org/simple \
     --trusted-host pip.repos.neuron.amazonaws.com \
     neuronx-cc==$NEURONX_CC_VERSION \
     torch-neuronx==$NEURONX_FRAMEWORK_VERSION \
+    torchvision \
     neuronx_distributed==$NEURONX_DISTRIBUTED_VERSION \
    && rm -rf ~/.cache/pip/*
```
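After rebuilding the image, it is worth confirming that the pinned packages actually landed in the container. A minimal sanity-check sketch in Python, run inside the built container; the package list simply mirrors the `pip install` above and is not part of the commit:

```python
# Hedged sketch: print installed versions of the Neuron packages pinned in
# Dockerfile-neuron, so they can be compared against the ARG values above.
from importlib.metadata import version, PackageNotFoundError

for pkg in ("neuronx-cc", "torch-neuronx", "torchvision", "neuronx_distributed"):
    try:
        print(f"{pkg}=={version(pkg)}")
    except PackageNotFoundError:
        print(f"{pkg}: not installed")
```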

backends/python/server/text_embeddings_server/models/__init__.py

Lines changed: 2 additions & 2 deletions

```diff
@@ -14,7 +14,7 @@
 from text_embeddings_server.models.jinaBert_model import FlashJinaBert
 from text_embeddings_server.models.flash_mistral import FlashMistral
 from text_embeddings_server.models.flash_qwen3 import FlashQwen3
-from text_embeddings_server.models.neuron_models import NeuronSentenceTransformers
+from text_embeddings_server.models.neuron_models import NeuronSentenceTransformersModel
 
 from text_embeddings_server.utils.device import get_device, use_ipex, is_neuron
 
@@ -80,7 +80,7 @@ def get_model(model_path: Path, dtype: Optional[str], pool: str):
     # Neuron cases
     if is_neuron():
         if config.model_type == "bert":
-            return create_model(NeuronSentenceTransformers, model_path)
+            return create_model(NeuronSentenceTransformersModel, model_path, device, datatype)
 
     if (
         hasattr(config, "auto_map")
```

backends/python/server/text_embeddings_server/models/neuron_models.py

Lines changed: 17 additions & 13 deletions

```diff
@@ -3,7 +3,7 @@
 
 from pathlib import Path
 from typing import Type, List
-from optimum.neuron import NeuronModelForSentenceTransformers
+from optimum.neuron import NeuronSentenceTransformers
 from opentelemetry import trace
 
 from text_embeddings_server.models import Model
@@ -12,14 +12,14 @@
 tracer = trace.get_tracer(__name__)
 
 
-class NeuronSentenceTransformers(Model):
+class NeuronSentenceTransformersModel(Model):
     def __init__(
         self,
         model_path: Path,
         device: torch.device,
         dtype: torch.dtype,
     ):
-        model = NeuronModelForSentenceTransformers.from_pretrained(model_path)
+        model = NeuronSentenceTransformers.from_pretrained(model_path)
 
         self.hidden_size = model.config.hidden_size
         position_offset = 0
@@ -42,7 +42,7 @@ def __init__(
             is not None
         )
 
-        super(NeuronSentenceTransformers, self).__init__(
+        super(NeuronSentenceTransformersModel, self).__init__(
             model=model, dtype=dtype, device=device
         )
 
@@ -52,16 +52,20 @@ def batch_type(self) -> Type[PaddedBatch]:
 
     @tracer.start_as_current_span("embed")
     def embed(self, batch: PaddedBatch) -> List[Embedding]:
-        pass
-
-    @tracer.start_as_current_span("predict")
-    def predict(self, batch: PaddedBatch) -> List[Score]:
         kwargs = {"input_ids": batch.input_ids, "attention_mask": batch.attention_mask}
         if self.has_token_type_ids:
             kwargs["token_type_ids"] = batch.token_type_ids
-        if self.has_position_ids:
-            kwargs["position_ids"] = batch.position_ids
+        output = self.model(**kwargs)
+
+        sentence_embedding = output["sentence_embedding"]
 
-        output = self.model(**kwargs, return_dict=True)
-        all_scores = output.logits.tolist()
-        return [Score(values=scores) for scores in all_scores]
+        return [
+            Embedding(
+                values=sentence_embedding[i * self.hidden_size : (i + 1) * self.hidden_size]
+            )
+            for i in range(len(batch))
+        ]
+
+    @tracer.start_as_current_span("predict")
+    def predict(self, batch: PaddedBatch) -> List[Score]:
+        pass
```
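For reference, `optimum.neuron.NeuronSentenceTransformers` (the class the wrapper now imports) is loaded with `from_pretrained` and returns a dict-style output whose `sentence_embedding` entry holds the pooled vectors, which is exactly what the rewritten `embed()` consumes. A standalone sketch of that flow outside the TEI server; the tokenizer usage and the printed shape are assumptions, not part of this commit:

```python
# Hedged sketch: call a pre-compiled Neuron sentence-transformers model
# directly and read its sentence embedding, mirroring what embed() does.
from transformers import AutoTokenizer
from optimum.neuron import NeuronSentenceTransformers

model_id = "optimum/bge-base-en-v1.5-neuronx"  # repo from the docs below; assumed to ship model.neuron
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = NeuronSentenceTransformers.from_pretrained(model_id)

inputs = tokenizer(["What is Deep Learning?"], padding=True, return_tensors="pt")
output = model(**inputs)
print(output["sentence_embedding"].shape)  # expected: [batch, hidden_size]
```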

backends/src/lib.rs

Lines changed: 53 additions & 7 deletions

```diff
@@ -67,6 +67,15 @@ fn is_hpu() -> bool {
     }
 }
 
+fn is_neuron() -> bool {
+    match Command::new("neuron-ls")
+        .output()
+    {
+        Ok(output) => output.status.success(),
+        Err(_) => false,
+    }
+}
+
 #[derive(Debug, Clone)]
 pub struct Backend {
     /// Channel to communicate with the background thread
@@ -409,16 +418,39 @@ async fn init_backend(
     if let Some(api_repo) = api_repo.as_ref() {
         if cfg!(feature = "python") || cfg!(feature = "candle") {
             let start = std::time::Instant::now();
-            if download_safetensors(api_repo).await.is_err() {
-                tracing::warn!("safetensors weights not found. Using `pytorch_model.bin` instead. Model loading will be significantly slower.");
-                tracing::info!("Downloading `pytorch_model.bin`");
-                api_repo
-                    .get("pytorch_model.bin")
+            if is_neuron() {
+                tracing::info!("Downloading `model.neuron`");
+                let model_files = download_neuron(api_repo)
                     .await
                     .map_err(|err| BackendError::WeightsNotFound(err.to_string()))?;
-            }
 
-            tracing::info!("Model weights downloaded in {:?}", start.elapsed());
+                if model_files.is_empty() {
+                    tracing::error!(
+                        "Neuron model files not found in the repository. \
+                        You can easily compile your model to neuron format following the guide: \
+                        https://huggingface.co/docs/optimum-neuron/en/model_doc/sentence_transformers/overview"
+                    );
+                    return Err(BackendError::WeightsNotFound(
+                        "No Neuron model files found".into(),
+                    ));
+                }
+
+                tracing::info!("Neuron model downloaded in {:?}", start.elapsed());
+            } else {
+                if download_safetensors(api_repo).await.is_err() {
+                    tracing::warn!(
+                        "safetensors weights not found. Using `pytorch_model.bin` instead. \
+                        Model loading will be significantly slower."
+                    );
+                    tracing::info!("Downloading `pytorch_model.bin`");
+                    api_repo
+                        .get("pytorch_model.bin")
+                        .await
+                        .map_err(|err| BackendError::WeightsNotFound(err.to_string()))?;
+                }
+
+                tracing::info!("Model weights downloaded in {:?}", start.elapsed());
+            }
         }
     }
 
@@ -655,6 +687,20 @@ async fn download_onnx(api: &ApiRepo) -> Result<Vec<PathBuf>, ApiError> {
     Ok(model_files)
 }
 
+async fn download_neuron(api: &ApiRepo) -> Result<Vec<PathBuf>, ApiError> {
+    let mut model_files: Vec<PathBuf> = Vec::new();
+
+    tracing::info!("Downloading `model.neuron`");
+    match api.get("model.neuron").await {
+        Ok(p) => model_files.push(p),
+        Err(err) => {
+            tracing::warn!("Could not download `model.neuron`: {err}");
+        }
+    };
+
+    Ok(model_files)
+}
+
 #[cfg(feature = "candle")]
 #[derive(Debug, Clone, Deserialize, PartialEq)]
 enum ModuleType {
```
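The new `is_neuron()` helper simply probes for the `neuron-ls` CLI and treats a zero exit status as "Neuron hardware present". When debugging a container, the same probe can be reproduced in a few lines of Python (a sketch of the detection logic, not code from this commit):

```python
# Hedged sketch: replicate the backend's Neuron detection by running
# `neuron-ls` and checking its exit status, as is_neuron() does in lib.rs.
import subprocess

def is_neuron() -> bool:
    try:
        return subprocess.run(["neuron-ls"], capture_output=True).returncode == 0
    except FileNotFoundError:
        return False  # binary missing entirely, so no Neuron runtime

print("Neuron detected:", is_neuron())
```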

docs/source/en/aws_neuron.md

Lines changed: 3 additions & 3 deletions

````diff
@@ -22,16 +22,16 @@ To build a container optimized for AWS Neuron devices, run the following command
 ```shell
 platform="neuron"
 
-docker build . -f Dockerfile-neuron -t tei_neuron
+docker build . -f Dockerfile-neuron -t tei-neuron:main
 ```
 
 ### Deploy Docker Container
 
 To deploy your model on an AWS Trainium or Inferentia instance, use the following command:
 
 ```shell
-model='Qwen/Qwen3-Embedding-0.6B'
+model='optimum/bge-base-en-v1.5-neuronx'
 volume=$PWD/data
 
-docker run -p 8080:80 -v $volume:/data tei_neuron --model-id $model
+docker run -p 8080:80 -v $volume:/data tei-neuron:main --model-id $model --dtype float32
 ```
````
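Once the container is up, the endpoint can be exercised over TEI's standard HTTP API. A minimal sketch using Python's `requests`, assuming the default `/embed` route and the port mapping from the command above:

```python
# Hedged sketch: request an embedding from the container started above.
import requests

resp = requests.post(
    "http://127.0.0.1:8080/embed",
    json={"inputs": "What is Deep Learning?"},
)
resp.raise_for_status()
vectors = resp.json()  # one list of floats per input string
print(len(vectors[0]))
```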
