feat: OTLP Metrics Support (#239)

mabdinur · web-flow · commit 2b779b16c2bf · 2025-08-15T15:08:57.000-04:00
* feat: add support for receiving metrics

* rm

* fix docs

* refactor common fixtures

* consolidate fixtures

* Update README.md

* clean up

* add rn
diff --git a/README.md b/README.md
@@ -391,6 +391,15 @@ Return OpenTelemetry logs that have been received by the agent for the given ses
 
 Logs are returned as a JSON list of the OTLP logs payloads received. The logs are in the standard OpenTelemetry Protocol (OTLP) v1.7.0 format, decoded from protobuf into JSON.
 
+### /test/session/metrics
+
+Return OpenTelemetry metrics that have been received by the agent for the given session token.
+
+#### [optional] `?test_session_token=`
+#### [optional] `X-Datadog-Test-Session-Token`
+
+Metrics are returned as a JSON list of the OTLP metrics payloads received. The metrics are in the standard OpenTelemetry Protocol (OTLP) v1.7.0 format, decoded from protobuf into JSON.
+
 ### /test/session/responses/config (POST)
 Create a Remote Config payload to retrieve in endpoint `/v0.7/config`
 
@@ -495,24 +504,30 @@ Mimics the pipeline_stats endpoint of the agent, but always returns OK, and logs
 
 Accepts OpenTelemetry Protocol (OTLP) v1.7.0 logs in protobuf format via HTTP. This endpoint validates and decodes OTLP logs payloads for testing OpenTelemetry logs exporters and libraries.
 
-The HTTP endpoint accepts `POST` requests with `Content-Type: application/x-protobuf` and stores the decoded logs for retrieval via the `/test/session/logs` endpoint.
+The HTTP endpoint accepts `POST` requests with `Content-Type: application/x-protobuf` or `Content-Type: application/json`, and stores the decoded logs for retrieval via the `/test/session/logs` endpoint.
+
+### /v1/metrics (HTTP)
+
+Accepts OpenTelemetry Protocol (OTLP) v1.7.0 metrics in protobuf or JSON format via HTTP. This endpoint validates and decodes OTLP metrics payloads for testing OpenTelemetry metrics exporters and libraries.
+
+The HTTP endpoint accepts `POST` requests with `Content-Type: application/x-protobuf` or `Content-Type: application/json`, and stores the decoded metrics for retrieval via the `/test/session/metrics` endpoint.
 
-### OTLP Logs via GRPC
+### OTLP Logs and Metrics via GRPC
 
-OTLP logs can also be sent via GRPC using the OpenTelemetry `LogsService.Export` method. The GRPC server implements the standard OTLP logs service interface and forwards all requests to the HTTP server, ensuring consistent processing and session management.
+OTLP logs and metrics can also be sent via GRPC using the OpenTelemetry `LogsService.Export` and `MetricsService.Export` methods respectively. The GRPC server implements the standard OTLP service interfaces and forwards all requests to the HTTP server, ensuring consistent processing and session management.
 
-**Note:** OTLP logs are served on separate ports from the main APM endpoints (default: 8126):
+**Note:** OTLP endpoints are served on separate ports from the main APM endpoints (default: 8126):
 - **HTTP**: Port 4318 (default) - Use `--otlp-http-port` to configure
 - **GRPC**: Port 4317 (default) - Use `--otlp-grpc-port` to configure
 
-Both protocols store decoded logs for retrieval via the `/test/session/logs` HTTP endpoint.
+Both protocols store the decoded payloads for later retrieval: logs via the `/test/session/logs` HTTP endpoint and metrics via the `/test/session/metrics` HTTP endpoint.
 
 GRPC Client → GRPC Server → HTTP POST → HTTP Server → Agent Storage
                     ↓                                      ↓
             (forwards protobuf)                    (session management)
                     ↓                                      ↓
                    HTTP                              Retrievable via
-                Response                          /test/session/logs
+                Response                     /test/session/{logs,metrics}
 
 ### /tracer_flare/v1
 
diff --git a/ddapm_test_agent/agent.py b/ddapm_test_agent/agent.py
@@ -38,6 +38,7 @@
 from msgpack.exceptions import ExtraData as MsgPackExtraDataException
 from multidict import CIMultiDict
 from opentelemetry.proto.collector.logs.v1.logs_service_pb2_grpc import add_LogsServiceServicer_to_server
+from opentelemetry.proto.collector.metrics.v1.metrics_service_pb2_grpc import add_MetricsServiceServicer_to_server
 
 from . import _get_version
 from . import trace_snapshot
@@ -51,6 +52,9 @@
 from .logs import LOGS_ENDPOINT
 from .logs import OTLPLogsGRPCServicer
 from .logs import decode_logs_request
+from .metrics import METRICS_ENDPOINT
+from .metrics import OTLPMetricsGRPCServicer
+from .metrics import decode_metrics_request
 from .remoteconfig import RemoteConfigServer
 from .trace import Span
 from .trace import Trace
@@ -513,6 +517,20 @@ async def _logs_by_session(self, token: Optional[str]) -> List[Dict[str, Any]]:
                 logs.append(logs_data)
         return logs
 
+    async def _metrics_by_session(self, token: Optional[str]) -> List[Dict[str, Any]]:
+        """Collect the decoded OTLP metrics payloads recorded for a session.
+
+        The requests to inspect come from ``_requests_by_session(token)``, which
+        decides what belongs to the session (including how a ``None`` token is
+        treated). Only requests routed to ``handle_v1_metrics`` are kept, and
+        each of them is decoded again with ``_decode_v1_metrics``.
+
+        Returns:
+            One decoded payload dict per matching request, in iteration order.
+        """
+        return [
+            self._decode_v1_metrics(stored)
+            for stored in self._requests_by_session(token)
+            if stored.match_info.handler == self.handle_v1_metrics
+        ]
+
     async def _integration_requests_by_session(
         self,
         token: Optional[str],
@@ -593,6 +611,14 @@ def _decode_v1_logs(self, request: Request) -> Dict[str, Any]:
         except Exception as e:
             raise web.HTTPBadRequest(text=str(e))
 
+    def _decode_v1_metrics(self, request: Request) -> Dict[str, Any]:
+        """Decode the body of a ``/v1/metrics`` request into a dict.
+
+        The ``Content-Type`` header (empty string when absent) is lower-cased
+        and stripped before it is handed to ``decode_metrics_request`` together
+        with the request body.
+
+        Raises:
+            web.HTTPBadRequest: if decoding raises; the response text is the
+                message of the underlying exception.
+        """
+        body = self._request_data(request)
+        declared_type = request.headers.get("Content-Type", "")
+        media_type = declared_type.lower().strip()
+        try:
+            decoded = decode_metrics_request(body, media_type)
+        except Exception as exc:
+            raise web.HTTPBadRequest(text=str(exc))
+        return decoded
+
     async def handle_v04_traces(self, request: Request) -> web.Response:
         return await self._handle_traces(request, version="v0.4")
 
@@ -631,6 +657,21 @@ async def handle_v1_logs(self, request: Request) -> web.Response:
         )
         return web.HTTPOk()
 
+    async def handle_v1_metrics(self, request: Request) -> web.Response:
+        """Decode an OTLP metrics payload, log how much it carries and reply 200 OK.
+
+        Decoding goes through ``_decode_v1_metrics``, so a payload that cannot
+        be decoded results in ``web.HTTPBadRequest`` instead of a 200 response.
+        """
+        payload = self._decode_v1_metrics(request)
+        # The counts are informational only. They rely on the snake_case keys
+        # "resource_metrics" / "scope_metrics" / "metrics"; a payload keyed
+        # differently is simply counted as empty.
+        resource_entries = payload.get("resource_metrics", [])
+        resource_count = len(resource_entries)
+        metric_count = 0
+        for resource_entry in resource_entries:
+            for scope_entry in resource_entry.get("scope_metrics", []):
+                metric_count += len(scope_entry.get("metrics", []))
+        log.info(
+            "received /v1/metrics payload with %r resource metric(s) containing %r metric(s)",
+            resource_count,
+            metric_count,
+        )
+        return web.HTTPOk()
+
     async def handle_v07_remoteconfig(self, request: Request) -> web.Response:
         """Emulates Remote Config endpoint: /v0.7/config"""
         token = _session_token(request)
@@ -996,6 +1037,11 @@ async def handle_session_logs(self, request: Request) -> web.Response:
         logs = await self._logs_by_session(token)
         return web.json_response(logs)
 
+    async def handle_session_metrics(self, request: Request) -> web.Response:
+        """Respond with the session's decoded OTLP metrics payloads as a JSON list.
+
+        The session token is read from ``request["session_token"]`` and the
+        payloads are gathered by ``_metrics_by_session``.
+        """
+        session_metrics = await self._metrics_by_session(request["session_token"])
+        return web.json_response(session_metrics)
+
     async def handle_session_requests(self, request: Request) -> web.Response:
         token = request["session_token"]
         resp = []
@@ -1013,6 +1059,7 @@ async def handle_session_requests(self, request: Request) -> web.Response:
                 self.handle_evp_proxy_v2_api_v2_llmobs,
                 self.handle_evp_proxy_v2_llmobs_eval_metric,
                 self.handle_v1_logs,
+                self.handle_v1_metrics,
             ):
                 continue
             resp.append(
@@ -1250,8 +1297,10 @@ async def otlp_store_request_middleware(request: Request, handler: _Handler) ->
     app.add_routes(
         [
             web.post(LOGS_ENDPOINT, agent.handle_v1_logs),
+            web.post(METRICS_ENDPOINT, agent.handle_v1_metrics),
             web.get("/test/session/requests", agent.handle_session_requests),
             web.get("/test/session/logs", agent.handle_session_logs),
+            web.get("/test/session/metrics", agent.handle_session_metrics),
             web.get("/test/session/clear", agent.handle_session_clear),
             web.get("/test/session/start", agent.handle_session_start),
         ]
@@ -1269,6 +1318,10 @@ async def make_otlp_grpc_server_async(agent: Agent, http_port: int, grpc_port: i
     logs_servicer = OTLPLogsGRPCServicer(http_port)
     add_LogsServiceServicer_to_server(logs_servicer, server)
 
+    # Add the OTLP metrics servicer
+    metrics_servicer = OTLPMetricsGRPCServicer(http_port)
+    add_MetricsServiceServicer_to_server(metrics_servicer, server)
+
     # Setup and start the server
     listen_addr = f"[::]:{grpc_port}"
     server.add_insecure_port(listen_addr)
diff --git a/ddapm_test_agent/client.py b/ddapm_test_agent/client.py
@@ -163,3 +163,20 @@ def wait_for_num_logs(self, num: int, clear: bool = False, wait_loops: int = 30)
                 return logs
             time.sleep(0.1)
         raise ValueError("Number (%r) of logs not available from test agent, got %r" % (num, len(logs)))
+
+    def metrics(self, clear: bool = False, **kwargs: Any) -> List[Any]:
+        """Fetch the payloads served by ``/test/session/metrics``.
+
+        Args:
+            clear: when true, call ``self.clear()`` after the GET request has
+                returned and before the response body is parsed.
+            **kwargs: forwarded unchanged to the session's ``get`` call.
+
+        Returns:
+            The JSON-decoded response body.
+        """
+        endpoint = self._url("/test/session/metrics")
+        response = self._session.get(endpoint, **kwargs)
+        if clear:
+            self.clear()
+        payloads = response.json()
+        return cast(List[Any], payloads)
+
+    def wait_for_num_metrics(self, num: int, clear: bool = False, wait_loops: int = 30) -> List[Any]:
+        """Wait for exactly `num` metrics payloads to be available from the test agent.
+
+        Polls ``self.metrics()`` up to ``wait_loops`` times, sleeping 0.1 seconds
+        after every attempt that did not return exactly ``num`` payloads.
+
+        Args:
+            num: number of payloads that must be present for the wait to succeed.
+            clear: when true, call ``self.clear()`` once the expected number has
+                been observed (never while still polling).
+            wait_loops: maximum number of polling attempts.
+
+        Returns:
+            The list of payloads from the successful poll.
+
+        Raises:
+            ValueError: if no attempt returned exactly ``num`` payloads.
+        """
+        # Start from an empty list so the error below is still a ValueError
+        # (not an UnboundLocalError) when ``wait_loops`` is zero or negative.
+        metrics: List[Any] = []
+        for _ in range(wait_loops):
+            metrics = self.metrics(clear=False)
+            if len(metrics) == num:
+                if clear:
+                    self.clear()
+                return metrics
+            time.sleep(0.1)
+        raise ValueError("Number (%r) of metrics not available from test agent, got %r" % (num, len(metrics)))
diff --git a/ddapm_test_agent/logs.py b/ddapm_test_agent/logs.py
@@ -20,7 +20,6 @@
 
 
 def decode_logs_request(request_body: bytes, content_type: str) -> Dict[str, Any]:
-    """Decode the protobuf request body into an ExportLogsServiceRequest object."""
     if content_type == "application/json":
         parsed_json = json.loads(request_body)
         if not isinstance(parsed_json, dict):
@@ -35,27 +34,23 @@ def decode_logs_request(request_body: bytes, content_type: str) -> Dict[str, Any
 
 
 def protobuf_to_dict(pb_obj: Any) -> Dict[str, Any]:
-    """Convert a protobuf object to a dictionary."""
     return MessageToDict(pb_obj, preserving_proto_field_name=True)
 
 
 class OTLPLogsGRPCServicer(LogsServiceServicer):
-    """GRPC servicer that forwards OTLP logs to HTTP server."""
 
     def __init__(self, http_port: int):
         self.http_url = f"http://127.0.0.1:{http_port}"
 
     async def Export(
         self, request: ExportLogsServiceRequest, context: grpc_aio.ServicerContext
     ) -> ExportLogsServiceResponse:
-        """Export logs by forwarding to HTTP server."""
         try:
             protobuf_data = request.SerializeToString()
             headers = {"Content-Type": "application/x-protobuf"}
             metadata = dict(context.invocation_metadata())
             if "session-token" in metadata:
                 headers["Session-Token"] = metadata["session-token"]
-            # Forward to OTLP HTTP server
             async with ClientSession(self.http_url) as session:
                 async with session.post(LOGS_ENDPOINT, headers=headers, data=protobuf_data) as resp:
                     context.set_trailing_metadata([("http-status", str(resp.status))])
diff --git a/ddapm_test_agent/metrics.py b/ddapm_test_agent/metrics.py
@@ -0,0 +1,94 @@
+"""OTLP Metrics handling for the test agent."""
+
+import json
+import logging
+from typing import Any
+from typing import Dict
+
+from aiohttp import ClientSession
+from google.protobuf.json_format import MessageToDict
+from grpc import aio as grpc_aio
+from opentelemetry.proto.collector.metrics.v1.metrics_service_pb2 import ExportMetricsServiceRequest
+from opentelemetry.proto.collector.metrics.v1.metrics_service_pb2 import ExportMetricsServiceResponse
+from opentelemetry.proto.collector.metrics.v1.metrics_service_pb2_grpc import MetricsServiceServicer
+
+
+METRICS_ENDPOINT = "/v1/metrics"
+
+
+log = logging.getLogger(__name__)
+
+
+def decode_metrics_request(request_body: bytes, content_type: str) -> Dict[str, Any]:
+    """Decode an OTLP metrics export request body into a dictionary.
+
+    Args:
+        request_body: raw HTTP request body.
+        content_type: value of the ``Content-Type`` header. Only the media type
+            is compared; parameters (for example ``; charset=utf-8``),
+            surrounding whitespace and letter case are ignored.
+
+    Returns:
+        For ``application/json`` the parsed JSON object, returned as-is. For
+        ``application/x-protobuf`` the parsed ``ExportMetricsServiceRequest``
+        converted with ``protobuf_to_dict``.
+
+    Raises:
+        ValueError: if the media type is not supported or the JSON payload is
+            not an object. Errors raised by ``json.loads`` or by
+            ``ParseFromString`` propagate unchanged.
+    """
+    # Drop media-type parameters so "application/json; charset=utf-8" is accepted.
+    media_type = content_type.split(";", 1)[0].strip().lower()
+    if media_type == "application/json":
+        parsed_json = json.loads(request_body)
+        if not isinstance(parsed_json, dict):
+            raise ValueError("JSON payload must be an object")
+        return parsed_json
+    if media_type == "application/x-protobuf":
+        export_request = ExportMetricsServiceRequest()
+        export_request.ParseFromString(request_body)
+        return protobuf_to_dict(export_request)
+    raise ValueError(f"Content-Type must be application/x-protobuf or application/json, got {content_type}")
+
+
+def protobuf_to_dict(pb_obj: Any) -> Dict[str, Any]:
+    """Convert a protobuf message to a dict via ``MessageToDict``.
+
+    ``preserving_proto_field_name=True`` is passed so that keys keep the field
+    names used in the protobuf definition.
+    """
+    as_dict = MessageToDict(pb_obj, preserving_proto_field_name=True)
+    return as_dict
+
+
+class OTLPMetricsGRPCServicer(MetricsServiceServicer):
+    """gRPC metrics servicer that forwards every export to the HTTP ``/v1/metrics`` endpoint."""
+
+    def __init__(self, http_port: int):
+        """Remember the loopback base URL (``127.0.0.1:<http_port>``) that exports are forwarded to."""
+        self.http_url = f"http://127.0.0.1:{http_port}"
+
+    def _count_data_points(self, request: ExportMetricsServiceRequest) -> int:
+        """Return the total number of data points carried by ``request``.
+
+        Every metric holds its points in one of gauge, sum, histogram,
+        exponential_histogram or summary; a metric with none of them set
+        contributes nothing to the total.
+        """
+        total = 0
+        for resource_metrics in request.resource_metrics:
+            for scope_metrics in resource_metrics.scope_metrics:
+                for metric in scope_metrics.metrics:
+                    # "data" is the oneof group of the five metric kinds;
+                    # WhichOneof returns the populated field's name or None.
+                    data_kind = metric.WhichOneof("data")
+                    if data_kind is not None:
+                        total += len(getattr(metric, data_kind).data_points)
+        return total
+
+    def _rejected_response(
+        self, request: ExportMetricsServiceRequest, error_message: str
+    ) -> ExportMetricsServiceResponse:
+        """Build a response that reports every data point of ``request`` as rejected."""
+        response = ExportMetricsServiceResponse()
+        response.partial_success.rejected_data_points = self._count_data_points(request)
+        response.partial_success.error_message = error_message
+        return response
+
+    async def Export(
+        self, request: ExportMetricsServiceRequest, context: grpc_aio.ServicerContext
+    ) -> ExportMetricsServiceResponse:
+        """Forward ``request`` to the HTTP metrics endpoint and translate the outcome.
+
+        The request is re-serialized and POSTed as ``application/x-protobuf``.
+        A ``session-token`` entry in the gRPC invocation metadata, if present,
+        is passed on as the ``Session-Token`` HTTP header. The HTTP status is
+        reported back in the ``http-status`` trailing metadata.
+
+        Returns:
+            An empty response when the HTTP status is below 400. Otherwise, or
+            when forwarding raises, a response whose ``partial_success`` marks
+            all data points as rejected and carries an error message.
+        """
+        try:
+            protobuf_data = request.SerializeToString()
+            headers = {"Content-Type": "application/x-protobuf"}
+            metadata = dict(context.invocation_metadata())
+            if "session-token" in metadata:
+                headers["Session-Token"] = metadata["session-token"]
+            async with ClientSession(self.http_url) as session:
+                async with session.post(METRICS_ENDPOINT, headers=headers, data=protobuf_data) as resp:
+                    context.set_trailing_metadata([("http-status", str(resp.status))])
+                    if resp.status >= 400:
+                        return self._rejected_response(request, f"HTTP {resp.status}: {await resp.text()}")
+                    return ExportMetricsServiceResponse()
+        except Exception as e:
+            # Forwarding is best effort: report the failure to the gRPC client
+            # instead of letting the exception escape the servicer.
+            context.set_trailing_metadata([("http-status", "500"), ("error", str(e))])
+            return self._rejected_response(request, f"Forward failed: {str(e)}")
diff --git a/releasenotes/notes/add-otel-metrics-support-a2ebeb28cae2f0ba.yaml b/releasenotes/notes/add-otel-metrics-support-a2ebeb28cae2f0ba.yaml
@@ -0,0 +1,7 @@
+---
+features:
+  - |
+    OTLP: Added OpenTelemetry Protocol (OTLP) metrics support via HTTP endpoint ``/v1/metrics`` on port 4318 and GRPC server on port 4317.
+    Supports all OTLP metric types including Gauge, Sum, Histogram, ExponentialHistogram, and Summary.
+    The GRPC server forwards requests to the OTLP HTTP server for validation and storage, enabling testing of applications that send metrics via either protocol.
+    Session endpoint ``/test/session/metrics`` allows retrieval of captured metrics for test validation.
diff --git a/tests/conftest.py b/tests/conftest.py
diff --git a/tests/test_logs.py b/tests/test_logs.py
diff --git a/tests/test_metrics.py b/tests/test_metrics.py