Commit 5a25a52

feat: add embedding hiding configuration and align spec with instrumentation
1 parent d5069b8 commit 5a25a52

File tree: 29 files changed, +1105 -371 lines

python/instrumentation/openinference-instrumentation-beeai/pyproject.toml

Lines changed: 5 additions & 1 deletion
@@ -38,7 +38,11 @@ instruments = [
 test = [
   "beeai-framework >= 0.1.36",
   "opentelemetry-sdk",
-  "opentelemetry-exporter-otlp"
+  "opentelemetry-exporter-otlp",
+  "pytest",
+  "pytest-vcr",
+  "pytest-asyncio",
+  "vcrpy"
 ]

 [project.entry-points.opentelemetry_instrumentor]

python/instrumentation/openinference-instrumentation-beeai/src/openinference/instrumentation/beeai/processors/base.py

Lines changed: 5 additions & 3 deletions
@@ -1,4 +1,4 @@
-from typing import TYPE_CHECKING, Any, ClassVar
+from typing import TYPE_CHECKING, Any, ClassVar, Optional

 if TYPE_CHECKING:
     from beeai_framework.context import RunContextFinishEvent, RunContextStartEvent
@@ -18,7 +18,9 @@
 class Processor:
     kind: ClassVar[OpenInferenceSpanKindValues] = OpenInferenceSpanKindValues.UNKNOWN

-    def __init__(self, event: "RunContextStartEvent", meta: "EventMeta"):
+    def __init__(
+        self, event: "RunContextStartEvent", meta: "EventMeta", span_name: Optional[str] = None
+    ):
         from beeai_framework.context import RunContext

         assert isinstance(meta.creator, RunContext)
@@ -27,7 +29,7 @@ def __init__(self, event: "RunContextStartEvent", meta: "EventMeta"):
         assert meta.trace is not None
         self.run_id = meta.trace.run_id

-        self.span = SpanWrapper(name=target_cls.__name__, kind=type(self).kind)
+        self.span = SpanWrapper(name=span_name or target_cls.__name__, kind=type(self).kind)
         self.span.started_at = meta.created_at
         self.span.attributes.update(
             {
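To illustrate the new optional `span_name` parameter: a subclass can now override the default span name, which otherwise falls back to the wrapped target class name. The processor and name below are hypothetical and not part of this commit; the real usage added here is `span_name="CreateEmbeddings"` in `EmbeddingModelProcessor` (next file).

```python
# Hypothetical subclass sketch: "SearchToolProcessor" and "RunSearchTool" are
# illustrative names only, not part of this commit.
from typing import ClassVar

from openinference.instrumentation.beeai.processors.base import Processor
from openinference.semconv.trace import OpenInferenceSpanKindValues


class SearchToolProcessor(Processor):
    kind: ClassVar[OpenInferenceSpanKindValues] = OpenInferenceSpanKindValues.TOOL

    def __init__(self, event: "RunContextStartEvent", meta: "EventMeta") -> None:
        # With span_name omitted, the span would be named after the target class.
        super().__init__(event, meta, span_name="RunSearchTool")
```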

python/instrumentation/openinference-instrumentation-beeai/src/openinference/instrumentation/beeai/processors/embedding.py

Lines changed: 23 additions & 3 deletions
@@ -12,19 +12,23 @@
 from beeai_framework.context import RunContext
 from typing_extensions import override

+from openinference.instrumentation import safe_json_dumps
 from openinference.instrumentation.beeai.processors.base import Processor
 from openinference.semconv.trace import (
     EmbeddingAttributes,
     OpenInferenceSpanKindValues,
     SpanAttributes,
 )

+# TODO: Update to use SpanAttributes.EMBEDDING_INVOCATION_PARAMETERS when released in semconv
+_EMBEDDING_INVOCATION_PARAMETERS = "embedding.invocation_parameters"
+

 class EmbeddingModelProcessor(Processor):
     kind: ClassVar[OpenInferenceSpanKindValues] = OpenInferenceSpanKindValues.EMBEDDING

     def __init__(self, event: "RunContextStartEvent", meta: "EventMeta"):
-        super().__init__(event, meta)
+        super().__init__(event, meta, span_name="CreateEmbeddings")

         assert isinstance(meta.creator, RunContext)
         assert isinstance(meta.creator.instance, EmbeddingModel)
@@ -34,6 +38,7 @@ def __init__(self, event: "RunContextStartEvent", meta: "EventMeta"):
             {
                 SpanAttributes.EMBEDDING_MODEL_NAME: llm.model_id,
                 SpanAttributes.LLM_PROVIDER: llm.provider_id,
+                SpanAttributes.LLM_SYSTEM: "beeai",
             }
         )

@@ -45,20 +50,35 @@ async def update(
     ) -> None:
         await super().update(event, meta)

+        # Add event to the span but don't create child spans
         self.span.add_event(f"{meta.name} ({meta.path})", timestamp=meta.created_at)
-        self.span.child(meta.name, event=(event, meta))

         if isinstance(event, EmbeddingModelStartEvent):
+            # Extract invocation parameters
+            invocation_params = {}
+            if hasattr(event.input, "__dict__"):
+                input_dict = vars(event.input)
+                # Remove the actual text values from invocation parameters
+                invocation_params = {k: v for k, v in input_dict.items() if k != "values"}
+            if invocation_params:
+                self.span.set_attribute(
+                    _EMBEDDING_INVOCATION_PARAMETERS,
+                    safe_json_dumps(invocation_params),
+                )
+
             for idx, txt in enumerate(event.input.values):
                 self.span.set_attribute(
                     f"{SpanAttributes.EMBEDDING_EMBEDDINGS}.{idx}.{EmbeddingAttributes.EMBEDDING_TEXT}",
                     txt,
                 )
         elif isinstance(event, EmbeddingModelSuccessEvent):
             for idx, embedding in enumerate(event.value.embeddings):
+                # Ensure the embedding vector is a list, not a tuple
+                # Always convert to list to handle tuples from BeeAI framework
+                vector = list(embedding)
                 self.span.set_attribute(
                     f"{SpanAttributes.EMBEDDING_EMBEDDINGS}.{idx}.{EmbeddingAttributes.EMBEDDING_VECTOR}",
-                    embedding,
+                    vector,
                 )

             if event.value.usage:
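For context, a standalone sketch of the invocation-parameter handling added above: everything on the embedding input except the raw `values` texts is serialized to JSON. The `EmbeddingModelInput` stand-in below is hypothetical (its field names mirror those asserted in the test later in this commit), and plain `json.dumps` stands in for `safe_json_dumps`.

```python
import json
from dataclasses import dataclass
from typing import Any, List, Optional


@dataclass
class EmbeddingModelInput:  # hypothetical stand-in for the BeeAI input object
    values: List[str]
    abort_signal: Optional[Any] = None
    max_retries: int = 0


def invocation_parameters(inp: EmbeddingModelInput) -> str:
    # Drop the raw texts ("values"); keep everything else as invocation parameters.
    params = {k: v for k, v in vars(inp).items() if k != "values"}
    return json.dumps(params, default=str)


print(invocation_parameters(EmbeddingModelInput(values=["Hello world", "Test embedding"])))
# {"abort_signal": null, "max_retries": 0}
```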
Lines changed: 27 additions & 0 deletions (new file)
@@ -0,0 +1,27 @@
+# BeeAI Instrumentation Tests
+
+## Re-recording VCR Cassettes
+
+When tests fail due to outdated VCR cassettes (e.g., API authentication errors or changed responses), follow these steps to re-record:
+
+### Prerequisites
+1. Ensure `OPENAI_API_KEY` is set in your environment with a valid API key
+2. The `passenv = OPENAI_API_KEY` directive must be present in the root `tox.ini` file
+
+### Steps to Re-record
+
+1. Delete the existing cassette file:
+   ```bash
+   rm tests/cassettes/test_openai_embeddings.yaml
+   ```
+
+2. Run the tests with VCR in record mode using tox:
+   ```bash
+   OPENAI_API_KEY=$OPENAI_API_KEY uvx --with tox-uv tox -r -e py313-ci-beeai -- tests/test_instrumentor.py::test_openai_embeddings -xvs --vcr-record=once
+   ```
+
+### Important Notes
+- The test reads `OPENAI_API_KEY` from the environment, falling back to "sk-test" if not set
+- VCR will cache responses including authentication errors (401), so always delete the cassette before re-recording
+- The `--vcr-record=once` flag ensures the cassette is only recorded when it doesn't exist
+- Use the `-r` flag with tox to ensure a clean environment when re-recording
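As a complement to the notes above, here is a minimal sketch, not part of this commit, of how one could keep the real `OPENAI_API_KEY` out of recorded cassettes using pytest-vcr's `vcr_config` fixture and vcrpy's `filter_headers` option (the test in this commit instead clears headers via `before_record_request`).

```python
# Hypothetical conftest.py addition: pytest-vcr passes this dict as keyword
# arguments to vcrpy, so the Authorization header (which carries the OpenAI
# API key) is stripped before the cassette is written to disk.
import pytest


@pytest.fixture(scope="module")
def vcr_config():
    return {
        "filter_headers": ["authorization"],
        "decode_compressed_response": True,
    }
```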

python/instrumentation/openinference-instrumentation-beeai/tests/cassettes/test_openai_embeddings.yaml

Lines changed: 31 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 36 additions & 0 deletions (new file)
@@ -0,0 +1,36 @@
+from typing import Generator
+
+import pytest
+from opentelemetry import trace as trace_api
+from opentelemetry.sdk import trace as trace_sdk
+from opentelemetry.sdk.trace.export import SimpleSpanProcessor
+from opentelemetry.sdk.trace.export.in_memory_span_exporter import InMemorySpanExporter
+
+from openinference.instrumentation.beeai import BeeAIInstrumentor
+
+
+@pytest.fixture(scope="session")
+def in_memory_span_exporter() -> InMemorySpanExporter:
+    return InMemorySpanExporter()
+
+
+@pytest.fixture(scope="session")
+def tracer_provider(
+    in_memory_span_exporter: InMemorySpanExporter,
+) -> trace_api.TracerProvider:
+    tracer_provider = trace_sdk.TracerProvider()
+    span_processor = SimpleSpanProcessor(span_exporter=in_memory_span_exporter)
+    tracer_provider.add_span_processor(span_processor=span_processor)
+    return tracer_provider
+
+
+@pytest.fixture(autouse=True)
+def instrument(
+    tracer_provider: trace_api.TracerProvider,
+    in_memory_span_exporter: InMemorySpanExporter,
+) -> Generator[None, None, None]:
+    BeeAIInstrumentor().instrument(tracer_provider=tracer_provider)
+    in_memory_span_exporter.clear()
+    yield
+    BeeAIInstrumentor().uninstrument()
+    in_memory_span_exporter.clear()

python/instrumentation/openinference-instrumentation-beeai/tests/test_dummy.py

Lines changed: 0 additions & 2 deletions
This file was deleted.
Lines changed: 109 additions & 0 deletions (new file)
@@ -0,0 +1,109 @@
+import json
+import os
+from typing import Mapping, cast
+
+import pytest
+from beeai_framework.adapters.openai import OpenAIEmbeddingModel
+from opentelemetry.sdk.trace.export.in_memory_span_exporter import InMemorySpanExporter
+from opentelemetry.util.types import AttributeValue
+
+from openinference.semconv.trace import (
+    EmbeddingAttributes,
+    OpenInferenceSpanKindValues,
+    SpanAttributes,
+)
+
+
+@pytest.mark.vcr(
+    decode_compressed_response=True,
+    before_record_request=lambda _: _.headers.clear() or _,
+    before_record_response=lambda _: {**_, "headers": {}},
+)
+@pytest.mark.asyncio
+async def test_openai_embeddings(in_memory_span_exporter: InMemorySpanExporter) -> None:
+    """Test that BeeAI OpenAI embeddings are properly traced."""
+    # API key from environment - only used when re-recording the cassette
+    # When using the cassette, the key is not needed
+    api_key = os.getenv("OPENAI_API_KEY", "sk-test")
+
+    # Create an embedding model
+    embedding_model = OpenAIEmbeddingModel(
+        model_id="text-embedding-3-small",
+        api_key=api_key,
+    )
+
+    # Create embeddings for test texts
+    texts = ["Hello world", "Test embedding"]
+
+    # Run the embedding request
+    response = await embedding_model.create(texts)
+
+    # Verify we got embeddings back
+    assert response is not None
+    assert response.embeddings is not None
+    assert len(response.embeddings) == 2
+
+    # Get the spans
+    spans = in_memory_span_exporter.get_finished_spans()
+    assert len(spans) == 1
+
+    # Get the single span
+    openinference_span = spans[0]
+    assert openinference_span is not None
+
+    # Verify span attributes
+    attributes = dict(cast(Mapping[str, AttributeValue], openinference_span.attributes))
+
+    # Check basic attributes as per spec
+    assert (
+        attributes.get(SpanAttributes.OPENINFERENCE_SPAN_KIND)
+        == OpenInferenceSpanKindValues.EMBEDDING.value
+    )
+    assert attributes.get(SpanAttributes.EMBEDDING_MODEL_NAME) == "text-embedding-3-small"
+    assert attributes.get(SpanAttributes.LLM_SYSTEM) == "beeai"
+    assert attributes.get(SpanAttributes.LLM_PROVIDER) == "openai"
+
+    # Check embedding texts
+    assert (
+        attributes.get(
+            f"{SpanAttributes.EMBEDDING_EMBEDDINGS}.0.{EmbeddingAttributes.EMBEDDING_TEXT}"
+        )
+        == "Hello world"
+    )
+    assert (
+        attributes.get(
+            f"{SpanAttributes.EMBEDDING_EMBEDDINGS}.1.{EmbeddingAttributes.EMBEDDING_TEXT}"
+        )
+        == "Test embedding"
+    )
+
+    # Check embedding vectors exist and have correct structure
+    vector_0 = attributes.get(
+        f"{SpanAttributes.EMBEDDING_EMBEDDINGS}.0.{EmbeddingAttributes.EMBEDDING_VECTOR}"
+    )
+    vector_1 = attributes.get(
+        f"{SpanAttributes.EMBEDDING_EMBEDDINGS}.1.{EmbeddingAttributes.EMBEDDING_VECTOR}"
+    )
+
+    assert vector_0 is not None
+    assert vector_1 is not None
+    # Vectors are tuples in the cassette, check exact length from recorded data
+    assert isinstance(vector_0, (list, tuple))
+    assert isinstance(vector_1, (list, tuple))
+    assert len(vector_0) == 1536  # text-embedding-3-small dimension
+    assert len(vector_1) == 1536  # text-embedding-3-small dimension
+    # Check first few values are correct floats from cassette
+    assert vector_0[0] == pytest.approx(-0.002078542485833168)
+    assert vector_0[1] == pytest.approx(-0.04908587411046028)
+    assert vector_1[0] == pytest.approx(-0.005330947693437338)
+    assert vector_1[1] == pytest.approx(-0.03916504979133606)
+
+    # Check invocation parameters
+    invocation_params = attributes.get("embedding.invocation_parameters")
+    assert isinstance(invocation_params, str)
+    assert json.loads(invocation_params) == {"abort_signal": None, "max_retries": 0}
+
+    # Check token counts
+    assert attributes.get(SpanAttributes.LLM_TOKEN_COUNT_TOTAL) == 4
+    assert attributes.get(SpanAttributes.LLM_TOKEN_COUNT_PROMPT) == 4
+    assert attributes.get(SpanAttributes.LLM_TOKEN_COUNT_COMPLETION) == 0
