
Commit 1978c85

feat(mlx): add gemma3 example (second-state#189)
1 parent 29adf39 commit 1978c85

File tree

8 files changed: +257 -4 lines
File renamed without changes.

wasmedge-mlx/README.md renamed to wasmedge-mlx/llama/README.md

Lines changed: 4 additions & 4 deletions
````diff
@@ -41,10 +41,10 @@ wget https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v1.0/resolve/main/toke
 
 ## Build wasm
 
-Run the following command to build wasm, the output WASM file will be at `target/wasm32-wasi/release/`
+Run the following command to build wasm, the output WASM file will be at `target/wasm32-wasip1/release/`
 
 ```bash
-cargo build --target wasm32-wasi --release
+cargo build --target wasm32-wasip1 --release
 ```
 ## Execute
 
@@ -53,7 +53,7 @@ Execute the WASM with the `wasmedge` using nn-preload to load model.
 ``` bash
 wasmedge --dir .:. \
 --nn-preload default:mlx:AUTO:model.safetensors \
-./target/wasm32-wasi/release/wasmedge-mlx.wasm default
+./target/wasm32-wasip1/release/wasmedge-mlx.wasm default
 
 ```
 
@@ -63,7 +63,7 @@ For example:
 ``` bash
 wasmedge --dir .:. \
 --nn-preload default:mlx:AUTO:llama2-7b/model-00001-of-00002.safetensors:llama2-7b/model-00002-of-00002.safetensors \
-./target/wasm32-wasi/release/wasmedge-mlx.wasm default
+./target/wasm32-wasip1/release/wasmedge-mlx.wasm default
 ```
 
 ## Other
````
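Since the build target changed from `wasm32-wasi` to `wasm32-wasip1`, the renamed target must be present in your Rust toolchain before building. A minimal sketch, assuming a `rustup`-managed toolchain:

```bash
# Install the wasm32-wasip1 target (assumes rustup manages your toolchain).
rustup target add wasm32-wasip1
cargo build --target wasm32-wasip1 --release
```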
File renamed without changes.

wasmedge-mlx/vlm/Cargo.toml

Lines changed: 8 additions & 0 deletions
```toml
[package]
name = "wasmedge-vlm"
version = "0.1.0"
edition = "2024"

[dependencies]
serde_json = "1.0"
wasmedge-wasi-nn = "0.8.0"
```

wasmedge-mlx/vlm/README.md

Lines changed: 90 additions & 0 deletions
# VLM example with WasmEdge WASI-NN MLX plugin

This example demonstrates how to use the WasmEdge WASI-NN MLX plugin to perform an inference task with a vision-language model (VLM).

## Supported Models

| Family | Models |
|--------|--------|
| Gemma 3 | gemma-3-4b-pt-bf16 |

## Install WasmEdge with WASI-NN MLX plugin

The MLX backend relies on [MLX](https://github.com/ml-explore/mlx), which is downloaded automatically when you build WasmEdge, so you do not need to install it yourself. If you want to use a custom MLX build, install it yourself and set the `CMAKE_PREFIX_PATH` variable when configuring CMake.

Build and install WasmEdge from source:

``` bash
cd <path/to/your/wasmedge/source/folder>

cmake -GNinja -Bbuild -DCMAKE_BUILD_TYPE=Release -DWASMEDGE_PLUGIN_WASI_NN_BACKEND="mlx"
cmake --build build

# For the WASI-NN plugin, you should install this project.
cmake --install build
```
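If you point the build at your own MLX installation instead of the auto-downloaded one, the configure step might look like the following minimal sketch (the `/opt/mlx` install prefix is a hypothetical example):

```bash
# Assumes MLX was installed with `cmake --install` under /opt/mlx (hypothetical path).
cmake -GNinja -Bbuild -DCMAKE_BUILD_TYPE=Release \
  -DWASMEDGE_PLUGIN_WASI_NN_BACKEND="mlx" \
  -DCMAKE_PREFIX_PATH=/opt/mlx
cmake --build build
```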
After installation, you will have the `wasmedge` runtime executable under `/usr/local/bin` and the WASI-NN plug-in with the MLX backend at `/usr/local/lib/wasmedge/libwasmedgePluginWasiNN.so`.

## Install dependencies

Currently, we use the Python `transformers` library to encode the prompt and image into input tokens. You can use any other library for this step.

``` bash
sudo apt install python3 python3-pip
pip install transformers pillow mlx
```

## Download the model and tokenizer

In this example, we will use `gemma-3-4b-pt-bf16`.

``` bash
git clone https://huggingface.co/mlx-community/gemma-3-4b-pt-bf16
```

## Build wasm

Run the following command to build the wasm; the output WASM file will be at `target/wasm32-wasip1/release/`:

```bash
cargo build --target wasm32-wasip1 --release
```

## Execute

Execute the WASM with `wasmedge`, using `--nn-preload` to load the model.

``` bash
# Download sample image
wget https://github.com/WasmEdge/WasmEdge/raw/master/docs/wasmedge-runtime-logo.png

# python encode.py <model_path> <image_path> <prompt>
python encode.py gemma-3-4b-pt-bf16 wasmedge-runtime-logo.png "What is this icon?"

wasmedge --dir .:. \
  --nn-preload default:mlx:AUTO:model.safetensors \
  ./target/wasm32-wasip1/release/wasmedge-vlm.wasm default

# python decode.py <model_path> <output mlx array path>
python decode.py gemma-3-4b-pt-bf16 Answer.npy
```

If your model has multiple weight files, you need to provide all of them in the `--nn-preload` option.

For example:

``` bash
wasmedge --dir .:. \
  --nn-preload default:mlx:AUTO:gemma-3-4b-pt-bf16/model-00001-of-00002.safetensors:gemma-3-4b-pt-bf16/model-00002-of-00002.safetensors \
  ./target/wasm32-wasip1/release/wasmedge-vlm.wasm default
```

## Other

There are some metadata options for the MLX plugin that you can set; a config example follows this list.

### Basic setting

- `model_type` (required): the model type.
- `max_token` (optional): the maximum number of generated tokens; default is 1024.
- `enable_debug_log` (optional): whether to print the debug log; default is false.
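With the Rust `wasmedge-wasi-nn` crate used in this example, these options are passed as a JSON string to the graph builder. A minimal sketch mirroring `src/main.rs`; the option values and the `"default"` cache name are illustrative:

```rust
use serde_json::json;
use wasmedge_wasi_nn::{ExecutionTarget, GraphBuilder, GraphEncoding};

fn main() {
    // Illustrative values: `model_type` is required, the others are optional.
    let config = serde_json::to_string(&json!({
        "model_type": "gemma3",
        "max_token": 1024,
        "enable_debug_log": false
    }))
    .expect("Failed to serialize options");

    let _graph = GraphBuilder::new(GraphEncoding::Mlx, ExecutionTarget::AUTO)
        .config(config)
        .build_from_cache("default")
        .expect("Failed to build graph");
}
```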

wasmedge-mlx/vlm/decode.py

Lines changed: 51 additions & 0 deletions
```python
from transformers import AutoProcessor
import mlx.core as mx
import sys


def _remove_space(x):
    if x and x[0] == " ":
        return x[1:]
    return x


class Detokenizer:
    """Maps token ids back to text, handling SentencePiece-style word markers."""

    def __init__(self, tokenizer, trim_space=True):
        self.trim_space = trim_space
        # Build an id -> token string lookup table from the vocabulary.
        self.tokenmap = [None] * len(tokenizer.vocab)
        for value, tokenid in tokenizer.vocab.items():
            self.tokenmap[tokenid] = value
        # Byte tokens such as "<0x0A>" decode to the corresponding character.
        for i in range(len(self.tokenmap)):
            if self.tokenmap[i].startswith("<0x"):
                self.tokenmap[i] = chr(int(self.tokenmap[i][3:5], 16))

        self._unflushed = ""
        self.text = ""

    def _flush(self):
        text = self._unflushed.replace("\u2581", " ")
        if self.text or not self.trim_space:
            self.text += text
        else:
            self.text = _remove_space(text)

    def add_token(self, token):
        v = self.tokenmap[token]
        if v[0] == "\u2581":
            # "\u2581" marks the start of a new word: flush the previous one.
            self._flush()
            self._unflushed = v
        else:
            self._unflushed += v

    def finalize(self):
        # Emit whatever is still buffered after the last token.
        self._flush()
        self._unflushed = ""


def decode(tokens: list, model_path: str, **kwargs):
    processor = AutoProcessor.from_pretrained(model_path, **kwargs)
    detokenizer = Detokenizer(processor.tokenizer)
    for token in tokens:
        detokenizer.add_token(token)
    detokenizer.finalize()
    return detokenizer.text


if __name__ == "__main__":
    model_path, output = sys.argv[1:]
    token_list = mx.load(output)
    print(decode(token_list.tolist(), model_path))
```

wasmedge-mlx/vlm/encode.py

Lines changed: 48 additions & 0 deletions
```python
from transformers import AutoProcessor
import mlx.core as mx
from PIL import Image, ImageOps
import sys


def encode(processor, image, prompts):
    model_inputs = {}
    processor.tokenizer.pad_token = processor.tokenizer.eos_token

    # Load the image, honor any EXIF rotation, and normalize to RGB.
    image = Image.open(image)
    image = ImageOps.exif_transpose(image)
    image = image.convert("RGB")
    images = [image]
    inputs = processor(
        text=prompts, images=images, padding=True, return_tensors="mlx"
    )
    if "images" in inputs:
        inputs["pixel_values"] = inputs["images"]
        inputs.pop("images")

    if isinstance(inputs["pixel_values"], list):
        pixel_values = inputs["pixel_values"]
    else:
        pixel_values = mx.array(inputs["pixel_values"])

    model_inputs["pixel_values"] = pixel_values
    model_inputs["attention_mask"] = (
        mx.array(inputs["attention_mask"]) if "attention_mask" in inputs else None
    )
    # Convert the remaining inputs to model_inputs with mx.array if present.
    for key, value in inputs.items():
        if key not in model_inputs and not isinstance(value, (str, list)):
            model_inputs[key] = mx.array(value)
    # Persist the tensors so the WASM module can load them by file name.
    mx.save("input_ids.npy", model_inputs["input_ids"])
    mx.save("pixel_values.npy", model_inputs["pixel_values"])
    mx.save("mask.npy", model_inputs["attention_mask"])


if __name__ == "__main__":
    model_path, image, prompts = sys.argv[1:]
    processor = AutoProcessor.from_pretrained(model_path)
    formatted_prompt = (
        f"<bos><start_of_turn>user\n"
        f"{prompts}<start_of_image><end_of_turn>\n"
        f"<start_of_turn>model"
    )
    encode(processor, image, formatted_prompt)
```
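Note the handshake between the two halves of the pipeline: `encode.py` writes `input_ids.npy`, `pixel_values.npy`, and `mask.npy`, and the Rust program below passes exactly those file names to the MLX plugin as its three inputs.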

wasmedge-mlx/vlm/src/main.rs

Lines changed: 56 additions & 0 deletions
```rust
use serde_json::json;
use std::env;
use wasmedge_wasi_nn::{
    self, ExecutionTarget, GraphBuilder, GraphEncoding, GraphExecutionContext, TensorType,
};

fn get_data_from_context(context: &GraphExecutionContext, index: usize) -> String {
    // Reserve room for 4096 tokens with an average token length of 6 bytes.
    const MAX_OUTPUT_BUFFER_SIZE: usize = 4096 * 6;
    let mut output_buffer = vec![0u8; MAX_OUTPUT_BUFFER_SIZE];
    let mut output_size = context
        .get_output(index, &mut output_buffer)
        .expect("Failed to get output");
    output_size = std::cmp::min(MAX_OUTPUT_BUFFER_SIZE, output_size);

    String::from_utf8_lossy(&output_buffer[..output_size]).to_string()
}

fn get_output_from_context(context: &GraphExecutionContext) -> String {
    get_data_from_context(context, 0)
}

fn main() {
    // prompt: "What is this icon?";
    // image: "wasmedge-runtime-logo.png";
    let args: Vec<String> = env::args().collect();
    let model_name: &str = &args[1];
    let graph = GraphBuilder::new(GraphEncoding::Mlx, ExecutionTarget::AUTO)
        .config(
            serde_json::to_string(&json!({"model_type": "gemma3", "max_token": 250}))
                .expect("Failed to serialize options"),
        )
        .build_from_cache(model_name)
        .expect("Failed to build graph");

    let mut context = graph
        .init_execution_context()
        .expect("Failed to init context");

    // Each input tensor carries the *file name* of an .npy file written by
    // encode.py; the MLX plugin loads the actual array from that file.
    let tensor_data = "input_ids.npy".as_bytes().to_vec();
    context
        .set_input(0, TensorType::U8, &[1], &tensor_data)
        .expect("Failed to set input");
    let tensor_data = "pixel_values.npy".as_bytes().to_vec();
    context
        .set_input(1, TensorType::U8, &[1], &tensor_data)
        .expect("Failed to set input");
    let tensor_data = "mask.npy".as_bytes().to_vec();
    context
        .set_input(2, TensorType::U8, &[1], &tensor_data)
        .expect("Failed to set input");

    context.compute().expect("Failed to compute");
    let output = get_output_from_context(&context);
    println!("{}", output.trim());
}
```
