use serde_json::Value;
use std::collections::HashMap;
use std::env;
use std::io;
use wasmedge_wasi_nn::{
    self, BackendError, Error, ExecutionTarget, GraphBuilder, GraphEncoding, GraphExecutionContext,
    TensorType,
};

// Read from stdin until a non-blank line is entered; return it trimmed.
fn read_input() -> String {
    loop {
        let mut answer = String::new();
        io::stdin()
            .read_line(&mut answer)
            .expect("Failed to read line");
        if !answer.is_empty() && answer != "\n" && answer != "\r\n" {
            return answer.trim().to_string();
        }
    }
}

fn get_options_from_env() -> HashMap<&'static str, Value> {
    let mut options = HashMap::new();

    // Required parameters for llava
    if let Ok(val) = env::var("mmproj") {
        options.insert("mmproj", Value::from(val.as_str()));
    } else {
        eprintln!("Failed to get mmproj model.");
        std::process::exit(1);
    }
    if let Ok(val) = env::var("image") {
        options.insert("image", Value::from(val.as_str()));
    } else {
        eprintln!("Failed to get the target image.");
        std::process::exit(1);
    }

    // Optional parameters
    if let Ok(val) = env::var("enable_log") {
        options.insert("enable-log", serde_json::from_str(val.as_str()).unwrap());
    } else {
        options.insert("enable-log", Value::from(false));
    }
    if let Ok(val) = env::var("enable_debug_log") {
        options.insert(
            "enable-debug-log",
            serde_json::from_str(val.as_str()).unwrap(),
        );
    } else {
        options.insert("enable-debug-log", Value::from(false));
    }
    if let Ok(val) = env::var("ctx_size") {
        options.insert("ctx-size", serde_json::from_str(val.as_str()).unwrap());
    } else {
        options.insert("ctx-size", Value::from(4096));
    }
    if let Ok(val) = env::var("n_gpu_layers") {
        options.insert("n-gpu-layers", serde_json::from_str(val.as_str()).unwrap());
    } else {
        options.insert("n-gpu-layers", Value::from(0));
    }
    options
}

// Set the prompt as input tensor 0: the ggml backend consumes the raw UTF-8
// bytes passed as TensorType::U8 data.
fn set_data_to_context(context: &mut GraphExecutionContext, data: Vec<u8>) -> Result<(), Error> {
    context.set_input(0, TensorType::U8, &[1], &data)
}

fn get_data_from_context(context: &GraphExecutionContext, index: usize) -> String {
    // Reserve room for 4096 tokens with an average token length of 6 bytes.
    const MAX_OUTPUT_BUFFER_SIZE: usize = 4096 * 6;
    let mut output_buffer = vec![0u8; MAX_OUTPUT_BUFFER_SIZE];
    let mut output_size = context
        .get_output(index, &mut output_buffer)
        .expect("Failed to get output");
    output_size = std::cmp::min(MAX_OUTPUT_BUFFER_SIZE, output_size);

    String::from_utf8_lossy(&output_buffer[..output_size]).to_string()
}

fn get_output_from_context(context: &GraphExecutionContext) -> String {
    get_data_from_context(context, 0)
}

fn get_metadata_from_context(context: &GraphExecutionContext) -> Value {
    serde_json::from_str(&get_data_from_context(context, 1)).expect("Failed to get metadata")
}

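// Note: the invocation below is an illustrative sketch, not part of the
// original example. It assumes the model is preloaded under a graph name via
// the WasmEdge WASI-NN plugin and that the multimodal settings come from
// environment variables, roughly:
//
//   wasmedge --dir .:. \
//     --env mmproj=<path-to-mmproj.gguf> --env image=<path-to-image> \
//     --nn-preload default:GGML:AUTO:<path-to-model.gguf> \
//     <this-example.wasm> default ["optional one-shot prompt"]
//
// The first CLI argument is the preloaded graph name; an optional second
// argument runs a single non-interactive turn (used by the CI workflow).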
fn main() {
    let args: Vec<String> = env::args().collect();
    let model_name: &str = &args[1];

    // Set options for the graph. Check our README for more details:
    // https://github.com/second-state/WasmEdge-WASINN-examples/tree/master/wasmedge-ggml#parameters
    let options = get_options_from_env();
    // You could also set the options manually like this:
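    // (Illustrative sketch, not part of the original example: the same keys
    // used in get_options_from_env can be filled in directly; the file names
    // below are placeholders.)
    // let mut options = HashMap::new();
    // options.insert("mmproj", Value::from("mmproj-model-f16.gguf"));
    // options.insert("image", Value::from("monalisa.jpg"));
    // options.insert("ctx-size", Value::from(4096));
    // options.insert("n-gpu-layers", Value::from(0));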

    // Create graph and initialize context.
    let graph = GraphBuilder::new(GraphEncoding::Ggml, ExecutionTarget::AUTO)
        .config(serde_json::to_string(&options).expect("Failed to serialize options"))
        .build_from_cache(model_name)
        .expect("Failed to build graph");
    let mut context = graph
        .init_execution_context()
        .expect("Failed to init context");

    // If there is a third argument, use it as the prompt and enter non-interactive mode.
    // This is mainly for the CI workflow.
    if args.len() >= 3 {
        let prompt = &args[2];
        // Set the prompt.
        println!("Prompt:\n{}", prompt);
        let tensor_data = prompt.as_bytes().to_vec();
        context
            .set_input(0, TensorType::U8, &[1], &tensor_data)
            .expect("Failed to set input");
        println!("Response:");

        // Get the number of input tokens and the llama.cpp version info.
        let input_metadata = get_metadata_from_context(&context);
        println!("[INFO] llama_commit: {}", input_metadata["llama_commit"]);
        println!(
            "[INFO] llama_build_number: {}",
            input_metadata["llama_build_number"]
        );
        println!(
            "[INFO] Number of input tokens: {}",
            input_metadata["input_tokens"]
        );

        // Get the output.
        context.compute().expect("Failed to compute");
        let output = get_output_from_context(&context);
        println!("{}", output.trim());

        // Retrieve the output metadata.
        let metadata = get_metadata_from_context(&context);
        println!(
            "[INFO] Number of input tokens: {}",
            metadata["input_tokens"]
        );
        println!(
            "[INFO] Number of output tokens: {}",
            metadata["output_tokens"]
        );
        return;
    }

    let mut saved_prompt = String::new();
    let image_placeholder = "<image>";

    loop {
        println!("USER:");
        let input = read_input();

        // Gemma-3 prompt format: '<start_of_turn>user\n<start_of_image><image><end_of_image>Describe this image<end_of_turn>\n<start_of_turn>model\n'
        if saved_prompt.is_empty() {
            saved_prompt = format!(
                "<start_of_turn>user\n<start_of_image>{}<end_of_image>{}<end_of_turn>\n<start_of_turn>model\n",
                image_placeholder, input
            );
        } else {
            saved_prompt = format!(
                "{}<start_of_turn>user\n{}<end_of_turn>\n<start_of_turn>model\n",
                saved_prompt, input
            );
        }

        // Set prompt to the input tensor.
        set_data_to_context(&mut context, saved_prompt.as_bytes().to_vec())
            .expect("Failed to set input");

        // Execute the inference.
        let mut reset_prompt = false;
        match context.compute() {
            Ok(_) => (),
            Err(Error::BackendError(BackendError::ContextFull)) => {
                println!("\n[INFO] Context full, we'll reset the context and continue.");
                reset_prompt = true;
            }
            Err(Error::BackendError(BackendError::PromptTooLong)) => {
                println!("\n[INFO] Prompt too long, we'll reset the context and continue.");
                reset_prompt = true;
            }
            Err(err) => {
                println!("\n[ERROR] {}", err);
                std::process::exit(1);
            }
        }

        // Retrieve the output.
        let mut output = get_output_from_context(&context);
        println!("ASSISTANT:\n{}", output.trim());

        // Update the saved prompt.
        if reset_prompt {
            saved_prompt.clear();
        } else {
            output = output.trim().to_string();
            saved_prompt = format!("{}{}<end_of_turn>\n", saved_prompt, output);
        }
    }
}