4 changes: 3 additions & 1 deletion .github/workflows/ci.yaml
@@ -28,6 +28,8 @@ jobs:
PYTHONPATH: "."
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
TOGETHER_API_KEY: ${{ secrets.TOGETHER_API_KEY }}
GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
JUDGMENT_DEV: true

steps:
@@ -49,7 +51,7 @@ jobs:
cd src
export JUDGMENT_API_KEY="$JUDGEVAL_GH_JUDGMENT_API_KEY"
export JUDGMENT_ORG_ID="$JUDGEVAL_GH_JUDGMENT_ORG_ID"
uv run pytest tests
uv run pytest tests -n auto

run-e2e-tests:
needs: [validate-branch]
4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
@@ -1,11 +1,11 @@
repos:
- repo: https://github.com/astral-sh/uv-pre-commit
rev: 0.8.23
rev: 0.9.2
hooks:
- id: uv-lock

- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.13.3
rev: v0.14.0
hooks:
- id: ruff
name: ruff (linter)
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -19,7 +19,7 @@ license-files = ["LICENSE.md"]
dependencies = [
"dotenv",
"httpx>=0.28.1",
"litellm>=1.75.0",
"litellm>=1.75.0",
"opentelemetry-exporter-otlp>=1.36.0",
"opentelemetry-sdk>=1.36.0",
"orjson>=3.9.0",
72 changes: 0 additions & 72 deletions src/e2etests/test_tracer.py
@@ -4,7 +4,6 @@
import time
from openai import OpenAI, AsyncOpenAI
from anthropic import Anthropic, AsyncAnthropic
from groq import Groq, AsyncGroq
from together import Together, AsyncTogether
from google import genai
from e2etests.utils import (
@@ -43,14 +42,12 @@ def teardown_module(module):
# Wrap clients
openai_client = wrap(OpenAI())
anthropic_client = wrap(Anthropic())
groq_client = wrap(Groq(api_key=os.getenv("GROQ_API_KEY")))
together_client = wrap(Together(api_key=os.getenv("TOGETHER_API_KEY")))
google_client = wrap(genai.Client(api_key=os.getenv("GOOGLE_API_KEY")))

# Async clients
openai_client_async = wrap(AsyncOpenAI())
anthropic_client_async = wrap(AsyncAnthropic())
groq_client_async = wrap(AsyncGroq(api_key=os.getenv("GROQ_API_KEY")))
together_client_async = wrap(AsyncTogether(api_key=os.getenv("TOGETHER_API_KEY")))

QUERY_RETRY = 15
@@ -107,18 +104,6 @@ def anthropic_llm_call():
return format(judgment.get_current_span().get_span_context().trace_id, "032x")


@judgment.observe()
def groq_llm_call():
groq_client.chat.completions.create(
model="llama-3.1-8b-instant",
messages=[
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": PROMPT},
],
)
return format(judgment.get_current_span().get_span_context().trace_id, "032x")


@judgment.observe()
def together_llm_call():
together_client.chat.completions.create(
@@ -174,25 +159,6 @@ def anthropic_streaming_llm_call():
return format(judgment.get_current_span().get_span_context().trace_id, "032x")


@judgment.observe()
def groq_streaming_llm_call():
stream = groq_client.chat.completions.create(
model="llama-3.1-8b-instant",
messages=[
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": PROMPT},
],
stream=True,
)

accumulated_content = ""
for chunk in stream:
if chunk.choices and chunk.choices[0].delta.content:
accumulated_content += chunk.choices[0].delta.content

return format(judgment.get_current_span().get_span_context().trace_id, "032x")


@judgment.observe()
def together_streaming_llm_call():
stream = together_client.chat.completions.create(
@@ -231,14 +197,6 @@ async def anthropic_async_llm_call():
return format(judgment.get_current_span().get_span_context().trace_id, "032x")


@judgment.observe()
async def groq_async_llm_call():
await groq_client_async.chat.completions.create(
model="llama-3.1-8b-instant", messages=[{"role": "user", "content": PROMPT}]
)
return format(judgment.get_current_span().get_span_context().trace_id, "032x")


@judgment.observe()
async def together_async_llm_call():
await together_client_async.chat.completions.create(
@@ -285,25 +243,6 @@ async def anthropic_async_streaming_llm_call():
return format(judgment.get_current_span().get_span_context().trace_id, "032x")


@judgment.observe()
async def groq_async_streaming_llm_call():
stream = await groq_client_async.chat.completions.create(
model="llama-3.1-8b-instant",
messages=[
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": PROMPT},
],
stream=True,
)

accumulated_content = ""
async for chunk in stream:
if chunk.choices and chunk.choices[0].delta.content:
accumulated_content += chunk.choices[0].delta.content

return format(judgment.get_current_span().get_span_context().trace_id, "032x")


@judgment.observe()
async def together_async_streaming_llm_call():
stream = await together_client_async.chat.completions.create(
@@ -411,11 +350,6 @@ def test_anthropic_llm_cost():
retrieve_llm_cost_helper(trace_id)


def test_groq_llm_cost():
trace_id = groq_llm_call()
retrieve_llm_cost_helper(trace_id)


def test_together_llm_cost():
trace_id = together_llm_call()
retrieve_llm_cost_helper(trace_id)
@@ -438,12 +372,6 @@ async def test_anthropic_async_llm_cost():
retrieve_llm_cost_helper(trace_id)


@pytest.mark.asyncio
async def test_groq_async_llm_cost():
trace_id = await groq_async_llm_call()
retrieve_llm_cost_helper(trace_id)


@pytest.mark.asyncio
async def test_together_async_llm_cost():
trace_id = await together_async_llm_call()
3 changes: 1 addition & 2 deletions src/judgeval/api/api_types.py
@@ -1,6 +1,6 @@
# generated by datamodel-codegen:
# filename: .openapi.json
# timestamp: 2025-10-09T00:16:42+00:00
# timestamp: 2025-10-15T19:25:00+00:00

from __future__ import annotations
from typing import Any, Dict, List, Literal, Optional, TypedDict, Union
@@ -94,7 +94,6 @@ class ResolveProjectNameRequest(TypedDict):

class ResolveProjectNameResponse(TypedDict):
project_id: str
project_created: bool


class TraceIdRequest(TypedDict):
3 changes: 1 addition & 2 deletions src/judgeval/data/judgment_types.py
@@ -1,6 +1,6 @@
# generated by datamodel-codegen:
# filename: .openapi.json
# timestamp: 2025-10-09T00:16:41+00:00
# timestamp: 2025-10-15T19:24:59+00:00

from __future__ import annotations
from typing import Annotated, Any, Dict, List, Optional, Union
@@ -101,7 +101,6 @@ class ResolveProjectNameRequest(BaseModel):

class ResolveProjectNameResponse(BaseModel):
project_id: Annotated[str, Field(title="Project Id")]
project_created: Annotated[bool, Field(title="Project Created")]


class TraceIdRequest(BaseModel):
59 changes: 7 additions & 52 deletions src/judgeval/tracer/__init__.py
@@ -66,7 +66,6 @@
from judgeval.api import JudgmentSyncClient
from judgeval.tracer.llm import wrap_provider
from judgeval.utils.url import url_for
from judgeval.tracer.local_eval_queue import LocalEvaluationQueue
from judgeval.tracer.processors import (
JudgmentSpanProcessor,
NoOpJudgmentSpanProcessor,
@@ -99,7 +98,6 @@ class Tracer(metaclass=SingletonMeta):
"enable_evaluation",
"resource_attributes",
"api_client",
"local_eval_queue",
"judgment_processor",
"tracer",
"agent_context",
@@ -113,7 +111,6 @@
enable_evaluation: bool
resource_attributes: Optional[Dict[str, Any]]
api_client: JudgmentSyncClient
local_eval_queue: LocalEvaluationQueue
judgment_processor: JudgmentSpanProcessor
tracer: ABCTracer
agent_context: ContextVar[Optional[AgentContext]]
@@ -148,7 +145,6 @@ def __init__(
api_key=self.api_key,
organization_id=self.organization_id,
)
self.local_eval_queue = LocalEvaluationQueue()

if initialize:
self.initialize()
@@ -159,14 +155,10 @@ def initialize(self) -> Tracer:

self.judgment_processor = NoOpJudgmentSpanProcessor()
if self.enable_monitoring:
project_id, project_created = Tracer._resolve_project_id(
project_id = Tracer._resolve_project_id(
self.project_name, self.api_key, self.organization_id
) or (None, False)
)
if project_id:
if project_created:
judgeval_logger.info(
f"Project {self.project_name} was autocreated successfully."
)
self.judgment_processor = self.get_processor(
tracer=self,
project_name=self.project_name,
@@ -190,9 +182,6 @@
get_version(),
)

if self.enable_evaluation and self.enable_monitoring:
self.local_eval_queue.start_workers()

self._initialized = True
atexit.register(self._atexit_flush)
return self
@@ -240,14 +229,14 @@ def get_processor(
@staticmethod
def _resolve_project_id(
project_name: str, api_key: str, organization_id: str
) -> Tuple[str, bool]:
) -> str:
Reviewer comment (Contributor) on src/judgeval/tracer/__init__.py, line 232:

**style:** Return type annotation should be `Optional[str]` since the `@dont_throw` decorator can cause this to return None on exceptions. (A hedged sketch of the suggested fix appears after this hunk.)
"""Resolve project_id from project_name using the API."""
client = JudgmentSyncClient(
api_key=api_key,
organization_id=organization_id,
)
response = client.projects_resolve({"project_name": project_name})
return response["project_id"], response["project_created"]
return response["project_id"]

def get_current_span(self):
return get_current_span()
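As a follow-up to the reviewer comment above, here is a minimal sketch of the suggested annotation change. The method body is copied from this hunk inside the `Tracer` class; the `@dont_throw` decoration is assumed from the reviewer's note (it is not visible in this hunk), and only the `typing` import and the `Optional[str]` return type are new.

```python
from typing import Optional


@staticmethod
@dont_throw  # assumed from the reviewer's note: swallows exceptions and yields None
def _resolve_project_id(
    project_name: str, api_key: str, organization_id: str
) -> Optional[str]:
    """Resolve project_id from project_name using the API."""
    client = JudgmentSyncClient(
        api_key=api_key,
        organization_id=organization_id,
    )
    response = client.projects_resolve({"project_name": project_name})
    # May effectively return None when @dont_throw intercepts an exception,
    # hence the Optional[str] annotation.
    return response["project_id"]
```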
@@ -299,6 +288,7 @@ def add_agent_attributes_to_span(self, span):
)
current_agent_context["is_agent_entry_point"] = False

@dont_throw
def record_instance_state(self, record_point: Literal["before", "after"], span):
current_agent_context = self.agent_context.get()

@@ -955,45 +945,10 @@ def async_evaluate(
eval_run.model_dump(warnings=False) # type: ignore
)
else:
# Enqueue the evaluation run to the local evaluation queue
self.local_eval_queue.enqueue(eval_run)

def wait_for_completion(self, timeout: Optional[float] = 30.0) -> bool:
"""Wait for all evaluations and span processing to complete.

This method blocks until all queued evaluations are processed and
all pending spans are flushed to the server.

Args:
timeout: Maximum time to wait in seconds. Defaults to 30 seconds.
None means wait indefinitely.

Returns:
True if all processing completed within the timeout, False otherwise.

"""
try:
judgeval_logger.debug(
"Waiting for all evaluations and spans to complete..."
judgeval_logger.warning(
"The scorer provided is not hosted, skipping evaluation."
)
Reviewer comment (Contributor, medium) on lines +948 to 950:

This warning is helpful, but it could be more actionable for the user. Consider suggesting which types of scorers are supported for async_evaluate (e.g., ExampleAPIScorerConfig or server-hosted scorers) so the user knows how to fix their code. (A hedged sketch of a more actionable warning appears at the end of this diff.)
# Wait for all queued evaluation work to complete
eval_completed = self.local_eval_queue.wait_for_completion()
if not eval_completed:
judgeval_logger.warning(
f"Local evaluation queue did not complete within {timeout} seconds"
)
return False

self.force_flush()

judgeval_logger.debug("All evaluations and spans completed successfully")
return True

except Exception as e:
judgeval_logger.warning(f"Error while waiting for completion: {e}")
return False


def wrap(client: ApiClient) -> ApiClient:
try:
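Picking up the reviewer's suggestion on the new warning in async_evaluate, this is a minimal sketch of what a more actionable message could look like. The function name, logger setup, and exact wording are illustrative and not part of this PR; only the `ExampleAPIScorerConfig` / server-hosted framing comes from the review comment itself.

```python
import logging

# Stand-in for the package's own judgeval_logger used in the diff above.
logger = logging.getLogger("judgeval")


def warn_unsupported_scorer(scorer_name: str) -> None:
    """Illustrative, more actionable variant of the warning added in this PR."""
    logger.warning(
        "Scorer '%s' is not hosted, skipping evaluation. "
        "async_evaluate only runs server-hosted scorers (for example, "
        "ExampleAPIScorerConfig or other hosted scorer configs); switch to one "
        "of those if you need this evaluation recorded.",
        scorer_name,
    )
```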