4 changes: 3 additions & 1 deletion .github/workflows/ci.yaml
@@ -28,6 +28,8 @@ jobs:
PYTHONPATH: "."
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
TOGETHER_API_KEY: ${{ secrets.TOGETHER_API_KEY }}
GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
JUDGMENT_DEV: true

steps:
@@ -49,7 +51,7 @@ jobs:
cd src
export JUDGMENT_API_KEY="$JUDGEVAL_GH_JUDGMENT_API_KEY"
export JUDGMENT_ORG_ID="$JUDGEVAL_GH_JUDGMENT_ORG_ID"
uv run pytest tests
uv run pytest tests -n auto

run-e2e-tests:
needs: [validate-branch]
4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
@@ -1,11 +1,11 @@
repos:
- repo: https://github.com/astral-sh/uv-pre-commit
rev: 0.8.23
rev: 0.9.2
hooks:
- id: uv-lock

- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.13.3
rev: v0.14.0
hooks:
- id: ruff
name: ruff (linter)
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -19,7 +19,7 @@ license-files = ["LICENSE.md"]
dependencies = [
"dotenv",
"httpx>=0.28.1",
"litellm>=1.75.0",
"litellm>=1.75.0",
"opentelemetry-exporter-otlp>=1.36.0",
"opentelemetry-sdk>=1.36.0",
"orjson>=3.9.0",
72 changes: 0 additions & 72 deletions src/e2etests/test_tracer.py
@@ -4,7 +4,6 @@
import time
from openai import OpenAI, AsyncOpenAI
from anthropic import Anthropic, AsyncAnthropic
from groq import Groq, AsyncGroq
from together import Together, AsyncTogether
from google import genai
from e2etests.utils import (
@@ -43,14 +42,12 @@ def teardown_module(module):
# Wrap clients
openai_client = wrap(OpenAI())
anthropic_client = wrap(Anthropic())
groq_client = wrap(Groq(api_key=os.getenv("GROQ_API_KEY")))
together_client = wrap(Together(api_key=os.getenv("TOGETHER_API_KEY")))
google_client = wrap(genai.Client(api_key=os.getenv("GOOGLE_API_KEY")))

# Async clients
openai_client_async = wrap(AsyncOpenAI())
anthropic_client_async = wrap(AsyncAnthropic())
groq_client_async = wrap(AsyncGroq(api_key=os.getenv("GROQ_API_KEY")))
together_client_async = wrap(AsyncTogether(api_key=os.getenv("TOGETHER_API_KEY")))

QUERY_RETRY = 15
@@ -107,18 +104,6 @@ def anthropic_llm_call():
return format(judgment.get_current_span().get_span_context().trace_id, "032x")


@judgment.observe()
def groq_llm_call():
groq_client.chat.completions.create(
model="llama-3.1-8b-instant",
messages=[
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": PROMPT},
],
)
return format(judgment.get_current_span().get_span_context().trace_id, "032x")


@judgment.observe()
def together_llm_call():
together_client.chat.completions.create(
@@ -174,25 +159,6 @@ def anthropic_streaming_llm_call():
return format(judgment.get_current_span().get_span_context().trace_id, "032x")


@judgment.observe()
def groq_streaming_llm_call():
stream = groq_client.chat.completions.create(
model="llama-3.1-8b-instant",
messages=[
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": PROMPT},
],
stream=True,
)

accumulated_content = ""
for chunk in stream:
if chunk.choices and chunk.choices[0].delta.content:
accumulated_content += chunk.choices[0].delta.content

return format(judgment.get_current_span().get_span_context().trace_id, "032x")


@judgment.observe()
def together_streaming_llm_call():
stream = together_client.chat.completions.create(
@@ -231,14 +197,6 @@ async def anthropic_async_llm_call():
return format(judgment.get_current_span().get_span_context().trace_id, "032x")


@judgment.observe()
async def groq_async_llm_call():
await groq_client_async.chat.completions.create(
model="llama-3.1-8b-instant", messages=[{"role": "user", "content": PROMPT}]
)
return format(judgment.get_current_span().get_span_context().trace_id, "032x")


@judgment.observe()
async def together_async_llm_call():
await together_client_async.chat.completions.create(
@@ -285,25 +243,6 @@ async def anthropic_async_streaming_llm_call():
return format(judgment.get_current_span().get_span_context().trace_id, "032x")


@judgment.observe()
async def groq_async_streaming_llm_call():
stream = await groq_client_async.chat.completions.create(
model="llama-3.1-8b-instant",
messages=[
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": PROMPT},
],
stream=True,
)

accumulated_content = ""
async for chunk in stream:
if chunk.choices and chunk.choices[0].delta.content:
accumulated_content += chunk.choices[0].delta.content

return format(judgment.get_current_span().get_span_context().trace_id, "032x")


@judgment.observe()
async def together_async_streaming_llm_call():
stream = await together_client_async.chat.completions.create(
@@ -411,11 +350,6 @@ def test_anthropic_llm_cost():
retrieve_llm_cost_helper(trace_id)


def test_groq_llm_cost():
trace_id = groq_llm_call()
retrieve_llm_cost_helper(trace_id)


def test_together_llm_cost():
trace_id = together_llm_call()
retrieve_llm_cost_helper(trace_id)
@@ -438,12 +372,6 @@ async def test_anthropic_async_llm_cost():
retrieve_llm_cost_helper(trace_id)


@pytest.mark.asyncio
async def test_groq_async_llm_cost():
trace_id = await groq_async_llm_call()
retrieve_llm_cost_helper(trace_id)


@pytest.mark.asyncio
async def test_together_async_llm_cost():
trace_id = await together_async_llm_call()
3 changes: 1 addition & 2 deletions src/judgeval/api/api_types.py
@@ -1,6 +1,6 @@
# generated by datamodel-codegen:
# filename: .openapi.json
# timestamp: 2025-10-09T00:16:42+00:00
# timestamp: 2025-10-15T19:25:00+00:00

from __future__ import annotations
from typing import Any, Dict, List, Literal, Optional, TypedDict, Union
@@ -94,7 +94,6 @@ class ResolveProjectNameRequest(TypedDict):

class ResolveProjectNameResponse(TypedDict):
project_id: str
project_created: bool


class TraceIdRequest(TypedDict):
3 changes: 1 addition & 2 deletions src/judgeval/data/judgment_types.py
@@ -1,6 +1,6 @@
# generated by datamodel-codegen:
# filename: .openapi.json
# timestamp: 2025-10-09T00:16:41+00:00
# timestamp: 2025-10-15T19:24:59+00:00

from __future__ import annotations
from typing import Annotated, Any, Dict, List, Optional, Union
@@ -101,7 +101,6 @@ class ResolveProjectNameRequest(BaseModel):

class ResolveProjectNameResponse(BaseModel):
project_id: Annotated[str, Field(title="Project Id")]
project_created: Annotated[bool, Field(title="Project Created")]


class TraceIdRequest(BaseModel):
59 changes: 7 additions & 52 deletions src/judgeval/tracer/__init__.py
@@ -66,7 +66,6 @@
from judgeval.api import JudgmentSyncClient
from judgeval.tracer.llm import wrap_provider
from judgeval.utils.url import url_for
from judgeval.tracer.local_eval_queue import LocalEvaluationQueue
from judgeval.tracer.processors import (
JudgmentSpanProcessor,
NoOpJudgmentSpanProcessor,
@@ -99,7 +98,6 @@ class Tracer(metaclass=SingletonMeta):
"enable_evaluation",
"resource_attributes",
"api_client",
"local_eval_queue",
"judgment_processor",
"tracer",
"agent_context",
@@ -113,7 +111,6 @@
enable_evaluation: bool
resource_attributes: Optional[Dict[str, Any]]
api_client: JudgmentSyncClient
local_eval_queue: LocalEvaluationQueue
judgment_processor: JudgmentSpanProcessor
tracer: ABCTracer
agent_context: ContextVar[Optional[AgentContext]]
@@ -148,7 +145,6 @@ def __init__(
api_key=self.api_key,
organization_id=self.organization_id,
)
self.local_eval_queue = LocalEvaluationQueue()

if initialize:
self.initialize()
@@ -159,14 +155,10 @@ def initialize(self) -> Tracer:

self.judgment_processor = NoOpJudgmentSpanProcessor()
if self.enable_monitoring:
project_id, project_created = Tracer._resolve_project_id(
project_id = Tracer._resolve_project_id(
self.project_name, self.api_key, self.organization_id
) or (None, False)
)
if project_id:
if project_created:
judgeval_logger.info(
f"Project {self.project_name} was autocreated successfully."
)
self.judgment_processor = self.get_processor(
tracer=self,
project_name=self.project_name,
@@ -190,9 +182,6 @@
get_version(),
)

if self.enable_evaluation and self.enable_monitoring:
self.local_eval_queue.start_workers()

self._initialized = True
atexit.register(self._atexit_flush)
return self
@@ -240,14 +229,14 @@ def get_processor(
@staticmethod
def _resolve_project_id(
project_name: str, api_key: str, organization_id: str
) -> Tuple[str, bool]:
) -> str:
Reviewer comment (Contributor) on src/judgeval/tracer/__init__.py, line 232:

**style:** Return type annotation should be `Optional[str]` since the `@dont_throw` decorator can cause this to return None on exceptions. (A hedged sketch of the suggested fix appears after this hunk.)
"""Resolve project_id from project_name using the API."""
client = JudgmentSyncClient(
api_key=api_key,
organization_id=organization_id,
)
response = client.projects_resolve({"project_name": project_name})
return response["project_id"], response["project_created"]
return response["project_id"]

def get_current_span(self):
return get_current_span()
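As a follow-up to the reviewer comment above, here is a minimal sketch of the suggested annotation change. The method body is copied from this hunk inside the `Tracer` class; the `@dont_throw` decoration is assumed from the reviewer's note (it is not visible in this hunk), and only the `typing` import and the `Optional[str]` return type are new.

```python
from typing import Optional


@staticmethod
@dont_throw  # assumed from the reviewer's note: swallows exceptions and yields None
def _resolve_project_id(
    project_name: str, api_key: str, organization_id: str
) -> Optional[str]:
    """Resolve project_id from project_name using the API."""
    client = JudgmentSyncClient(
        api_key=api_key,
        organization_id=organization_id,
    )
    response = client.projects_resolve({"project_name": project_name})
    # May effectively return None when @dont_throw intercepts an exception,
    # hence the Optional[str] annotation.
    return response["project_id"]
```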
@@ -299,6 +288,7 @@ def add_agent_attributes_to_span(self, span):
)
current_agent_context["is_agent_entry_point"] = False

@dont_throw
def record_instance_state(self, record_point: Literal["before", "after"], span):
current_agent_context = self.agent_context.get()

@@ -955,45 +945,10 @@ def async_evaluate(
eval_run.model_dump(warnings=False) # type: ignore
)
else:
# Enqueue the evaluation run to the local evaluation queue
self.local_eval_queue.enqueue(eval_run)

def wait_for_completion(self, timeout: Optional[float] = 30.0) -> bool:
"""Wait for all evaluations and span processing to complete.

This method blocks until all queued evaluations are processed and
all pending spans are flushed to the server.

Args:
timeout: Maximum time to wait in seconds. Defaults to 30 seconds.
None means wait indefinitely.

Returns:
True if all processing completed within the timeout, False otherwise.

"""
try:
judgeval_logger.debug(
"Waiting for all evaluations and spans to complete..."
judgeval_logger.warning(
"The scorer provided is not hosted, skipping evaluation."
)
Reviewer comment (Contributor, medium) on lines +948 to 950:

This warning is helpful, but it could be more actionable for the user. Consider suggesting which types of scorers are supported for async_evaluate (e.g., ExampleAPIScorerConfig or server-hosted scorers) so the user knows how to fix their code. (A hedged sketch of a more actionable warning appears at the end of this diff.)
# Wait for all queued evaluation work to complete
eval_completed = self.local_eval_queue.wait_for_completion()
if not eval_completed:
judgeval_logger.warning(
f"Local evaluation queue did not complete within {timeout} seconds"
)
return False

self.force_flush()

judgeval_logger.debug("All evaluations and spans completed successfully")
return True

except Exception as e:
judgeval_logger.warning(f"Error while waiting for completion: {e}")
return False


def wrap(client: ApiClient) -> ApiClient:
try:
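Picking up the reviewer's suggestion on the new warning in async_evaluate, this is a minimal sketch of what a more actionable message could look like. The function name, logger setup, and exact wording are illustrative and not part of this PR; only the `ExampleAPIScorerConfig` / server-hosted framing comes from the review comment itself.

```python
import logging

# Stand-in for the package's own judgeval_logger used in the diff above.
logger = logging.getLogger("judgeval")


def warn_unsupported_scorer(scorer_name: str) -> None:
    """Illustrative, more actionable variant of the warning added in this PR."""
    logger.warning(
        "Scorer '%s' is not hosted, skipping evaluation. "
        "async_evaluate only runs server-hosted scorers (for example, "
        "ExampleAPIScorerConfig or other hosted scorer configs); switch to one "
        "of those if you need this evaluation recorded.",
        scorer_name,
    )
```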