
import argparse
import time
-from typing import List, Tuple

import numpy as np
import torch
from PIL import Image
from transformers import AutoModel, AutoProcessor

from vllm import LLM
from vllm.config import PoolerConfig
+from vllm.inputs.data import TextPrompt

# Vision token IDs
VISION_START_TOKEN_ID = 151652
VISION_END_TOKEN_ID = 151653
-from vllm.inputs.data import TextPrompt


-def create_test_cases() -> List[Tuple[str, str, any]]:
+def create_test_cases() -> list[tuple[str, str, any]]:
    """Create comprehensive test cases for validation."""
    test_cases = []
-
+
    # Text-only test cases
-    test_cases.extend([
-        ("text", "Query: What is artificial intelligence?", None),
-        ("text", "Passage: AI is a field of computer science focusing on creating intelligent machines.", None),
-        ("text", "Query: 你好世界", None),  # Chinese text
-        ("text", "Passage: " + " ".join(["word"] * 100), None),  # Long text
-    ])
-
+    test_cases.extend(
+        [
+            ("text", "Query: What is artificial intelligence?", None),
+            (
+                "text",
+                "Passage: AI is a field of computer science focusing on "
+                "creating intelligent machines.",
+                None,
+            ),
+            ("text", "Query: 你好世界", None),  # Chinese text
+            ("text", "Passage: " + " ".join(["word"] * 100), None),  # Long text
+        ]
+    )
+
    # Image test cases
    for color in ["red", "green", "blue"]:
-        img = Image.new('RGB', (224, 224), color=color)
+        img = Image.new("RGB", (224, 224), color=color)
        test_cases.append(("image", f"{color} image", img))
-
+
    # Complex image
-    complex_img = Image.new('RGB', (224, 224))
+    complex_img = Image.new("RGB", (224, 224))
    pixels = complex_img.load()
    for i in range(224):
        for j in range(224):
-            pixels[i, j] = (i % 256, j % 256, (i+j) % 256)
+            pixels[i, j] = (i % 256, j % 256, (i + j) % 256)
    test_cases.append(("image", "complex pattern", complex_img))
-
+
    return test_cases


def compute_hf_embeddings(
-    model_name: str,
-    test_cases: List[Tuple[str, str, any]]
-) -> List[torch.Tensor]:
+    model_name: str, test_cases: list[tuple[str, str, any]]
+) -> list[torch.Tensor]:
    """Compute embeddings using HuggingFace implementation."""
    print("Loading HuggingFace model...")
-    model = AutoModel.from_pretrained(
-        model_name,
-        trust_remote_code=True,
-        torch_dtype=torch.float16
-    ).cuda().eval()
-
-    processor = AutoProcessor.from_pretrained(
-        model_name,
-        trust_remote_code=True
+    model = (
+        AutoModel.from_pretrained(
+            model_name, trust_remote_code=True, torch_dtype=torch.float16
+        )
+        .cuda()
+        .eval()
    )
-
+
+    processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
+
    embeddings = []
-
+
    print("Computing HuggingFace embeddings...")
    start_time = time.time()
-
+
    for case_type, text, image in test_cases:
        if case_type == "text":
            inputs = processor(text=text, return_tensors="pt").to("cuda")
        else:  # image
            inputs = processor(
-                text="<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>Describe the image.<|im_end|>\n",
+                text="<|im_start|>user\n<|vision_start|><|image_pad|>"
+                "<|vision_end|>Describe the image.<|im_end|>\n",
                images=image,
-                return_tensors="pt"
+                return_tensors="pt",
            ).to("cuda")
-
+
        with torch.no_grad():
            outputs = model(**inputs)
            # Extract embeddings based on model output structure
-            if hasattr(outputs, 'embeddings'):
+            if hasattr(outputs, "embeddings"):
                embedding = outputs.embeddings[0]
            else:
                # Fallback to last hidden state with custom pooling
                hidden_states = outputs.last_hidden_state[0]
-
+
                # Apply token-type-aware pooling
-                input_ids = inputs['input_ids'][0]
-                vision_mask = (
-                    (input_ids >= VISION_START_TOKEN_ID) &
-                    (input_ids <= VISION_END_TOKEN_ID)
+                input_ids = inputs["input_ids"][0]
+                vision_mask = (input_ids >= VISION_START_TOKEN_ID) & (
+                    input_ids <= VISION_END_TOKEN_ID
                )
-
+
                if vision_mask.any():
                    embedding = hidden_states[vision_mask].mean(dim=0)
                else:
                    embedding = hidden_states.mean(dim=0)
-
+
                embedding = torch.nn.functional.normalize(embedding, p=2, dim=-1)
-
+
            embeddings.append(embedding.cpu())
-
+
    hf_time = time.time() - start_time
    print(f"HuggingFace processing time: {hf_time:.2f}s")
-
+
    return embeddings


def compute_vllm_embeddings(
-    model_name: str,
-    test_cases: List[Tuple[str, str, any]]
-) -> List[torch.Tensor]:
+    model_name: str, test_cases: list[tuple[str, str, any]]
+) -> list[torch.Tensor]:
    """Compute embeddings using vLLM implementation."""
    print("\nLoading vLLM model...")
    model = LLM(
@@ -128,93 +131,93 @@ def compute_vllm_embeddings(
        override_pooler_config=PoolerConfig(pooling_type="ALL", normalize=False),
        dtype="float16",
    )
-
+
    embeddings = []
    prompts = []
-
+
    # Prepare prompts
    for case_type, text, image in test_cases:
        if case_type == "text":
            prompt = TextPrompt(prompt=text)
        else:  # image
            prompt = TextPrompt(
-                prompt="<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>Describe the image.<|im_end|>\n",
+                prompt="<|im_start|>user\n<|vision_start|><|image_pad|>"
+                "<|vision_end|>Describe the image.<|im_end|>\n",
                multi_modal_data={"image": image},
            )
        prompts.append(prompt)
-
+
    print("Computing vLLM embeddings...")
    start_time = time.time()
-
+
    # Process all at once for better performance
    outputs = model.encode(prompts)
-
+
    for output in outputs:
        # Extract based on token type
        if 151652 in output.prompt_token_ids:  # VISION_START_TOKEN_ID
            img_start = output.prompt_token_ids.index(151652)
            img_end = output.prompt_token_ids.index(151653)
-            embedding_data = output.outputs.data[img_start:img_end + 1]
+            embedding_data = output.outputs.data[img_start : img_end + 1]
        else:
            embedding_data = output.outputs.data
-
+
        # Pool and normalize
        pooled = embedding_data.mean(dim=0, dtype=torch.float32)
        normalized = torch.nn.functional.normalize(pooled, p=2, dim=-1)
        embeddings.append(normalized.cpu())
-
+
    vllm_time = time.time() - start_time
    print(f"vLLM processing time: {vllm_time:.2f}s")
-
+
    return embeddings


def compare_embeddings(
-    hf_embeddings: List[torch.Tensor],
-    vllm_embeddings: List[torch.Tensor],
-    test_cases: List[Tuple[str, str, any]]
+    hf_embeddings: list[torch.Tensor],
+    vllm_embeddings: list[torch.Tensor],
+    test_cases: list[tuple[str, str, any]],
) -> None:
    """Compare embeddings and report differences."""
-    print("\n" + "="*60)
+    print("\n" + "=" * 60)
    print("EMBEDDING COMPARISON RESULTS")
-    print("="*60)
-
+    print("=" * 60)
+
    similarities = []
    max_diffs = []
-
+
    for i, (case_type, desc, _) in enumerate(test_cases):
        hf_emb = hf_embeddings[i]
        vllm_emb = vllm_embeddings[i]
-
+
        # Compute cosine similarity
        similarity = torch.nn.functional.cosine_similarity(
-            hf_emb.unsqueeze(0),
-            vllm_emb.unsqueeze(0)
+            hf_emb.unsqueeze(0), vllm_emb.unsqueeze(0)
        ).item()
-
+
        # Compute max absolute difference
        max_diff = torch.max(torch.abs(hf_emb - vllm_emb)).item()
-
+
        similarities.append(similarity)
        max_diffs.append(max_diff)
-
-        print(f"\nTest case {i+1}: {case_type} - {desc[:50]}...")
+
+        print(f"\nTest case {i + 1}: {case_type} - {desc[:50]}...")
        print(f"  Cosine similarity: {similarity:.6f}")
        print(f"  Max absolute diff: {max_diff:.6f}")
        print(f"  HF norm: {hf_emb.norm():.6f}, vLLM norm: {vllm_emb.norm():.6f}")
-
+
        # Flag significant differences
        if similarity < 0.99:
-            print(f"  ⚠️ WARNING: Low similarity detected!")
-
+            print("  ⚠️ WARNING: Low similarity detected!")
+
    # Summary statistics
-    print("\n" + "-"*60)
+    print("\n" + "-" * 60)
    print("SUMMARY STATISTICS")
-    print("-"*60)
+    print("-" * 60)
    print(f"Average cosine similarity: {np.mean(similarities):.6f}")
    print(f"Min cosine similarity: {np.min(similarities):.6f}")
    print(f"Max absolute difference: {np.max(max_diffs):.6f}")
-
+
    # Overall assessment
    if np.min(similarities) > 0.99:
        print("\n✅ VALIDATION PASSED: vLLM implementation matches HuggingFace")
@@ -230,27 +233,27 @@ def main():
        "--model",
        type=str,
        default="jinaai/jina-embeddings-v4-vllm-retrieval",
-        help="Model name to test"
+        help="Model name to test",
    )
    parser.add_argument(
        "--skip-hf",
        action="store_true",
-        help="Skip HuggingFace comparison (for performance testing only)"
+        help="Skip HuggingFace comparison (for performance testing only)",
    )
-
+
    args = parser.parse_args()
-
+
    # Create test cases
    test_cases = create_test_cases()
    print(f"Created {len(test_cases)} test cases")
-
+
    # Compute vLLM embeddings
    vllm_embeddings = compute_vllm_embeddings(args.model, test_cases)
-
+
    if not args.skip_hf:
        # Compute HuggingFace embeddings
        hf_embeddings = compute_hf_embeddings(args.model, test_cases)
-
+
        # Compare results
        compare_embeddings(hf_embeddings, vllm_embeddings, test_cases)
    else:
@@ -259,4 +262,4 @@ def main():


if __name__ == "__main__":
-    main()
+    main()
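
For reference, the vLLM half of the validation script condenses to the standalone sketch below. It is a reader's aid reconstructed from the diff, not part of the commit: the checkpoint name, vision token IDs, prompt template, PoolerConfig, and the encode/pooling calls are taken directly from the code above, while the single red test image and the printed output are illustrative assumptions.

# Minimal sketch of the script's vLLM path (reconstruction, see note above).
import torch
from PIL import Image

from vllm import LLM
from vllm.config import PoolerConfig
from vllm.inputs.data import TextPrompt

VISION_START_TOKEN_ID = 151652  # per the script's "Vision token IDs"
VISION_END_TOKEN_ID = 151653

# pooling_type="ALL" returns one vector per prompt token rather than a
# single pooled vector, so pooling is done manually below. Constructor
# arguments elided between hunks in the diff (e.g. a task/runner setting)
# may also be required depending on the vLLM version.
model = LLM(
    model="jinaai/jina-embeddings-v4-vllm-retrieval",
    override_pooler_config=PoolerConfig(pooling_type="ALL", normalize=False),
    dtype="float16",
)

prompts = [
    TextPrompt(prompt="Query: What is artificial intelligence?"),
    TextPrompt(
        prompt="<|im_start|>user\n<|vision_start|><|image_pad|>"
        "<|vision_end|>Describe the image.<|im_end|>\n",
        multi_modal_data={"image": Image.new("RGB", (224, 224), color="red")},
    ),
]

for output in model.encode(prompts):
    token_ids = output.prompt_token_ids
    data = output.outputs.data  # per-token embeddings
    if VISION_START_TOKEN_ID in token_ids:
        # For image prompts, pool only the vision token span.
        start = token_ids.index(VISION_START_TOKEN_ID)
        end = token_ids.index(VISION_END_TOKEN_ID)
        data = data[start : end + 1]
    embedding = torch.nn.functional.normalize(
        data.mean(dim=0, dtype=torch.float32), p=2, dim=-1
    )
    print(embedding.shape, embedding.norm().item())

These are the vectors that compare_embeddings() checks against the HuggingFace outputs, with cosine similarity above 0.99 as the pass threshold.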