diff --git a/cmd/synthetic-monitoring-agent/main.go b/cmd/synthetic-monitoring-agent/main.go index e0da6741b..e2abb9974 100644 --- a/cmd/synthetic-monitoring-agent/main.go +++ b/cmd/synthetic-monitoring-agent/main.go @@ -20,6 +20,13 @@ import ( "github.com/jpillora/backoff" "github.com/prometheus/client_golang/prometheus" "github.com/rs/zerolog" + otelGlobal "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp" + "go.opentelemetry.io/otel/sdk/resource" + sdktrace "go.opentelemetry.io/otel/sdk/trace" + semconv "go.opentelemetry.io/otel/semconv/v1.26.0" + "go.opentelemetry.io/otel/trace" + "go.opentelemetry.io/otel/trace/noop" "golang.org/x/sync/errgroup" "google.golang.org/grpc/grpclog" @@ -73,6 +80,7 @@ func run(args []string, stdout io.Writer) error { AutoMemLimit bool MemLimitRatio float64 DisableK6 bool + OTLPEndpoint string }{ GrpcApiServerAddr: "localhost:4031", HttpListenAddr: "localhost:4050", @@ -104,6 +112,7 @@ func run(args []string, stdout io.Writer) error { flags.BoolVar(&config.DisableK6, "disable-k6", config.DisableK6, "disables running k6 checks on this probe") flags.Float64Var(&config.MemLimitRatio, "memlimit-ratio", config.MemLimitRatio, "fraction of available memory to use") flags.Var(&features, "features", "optional feature flags") + flags.StringVar(&config.OTLPEndpoint, "otlp-endpoint", config.OTLPEndpoint, "OTLP endpoint for OTEL exporter") if err := flags.Parse(args[1:]); err != nil { return err @@ -152,13 +161,7 @@ func run(args []string, stdout io.Writer) error { return fmt.Errorf("invalid API token") } - baseCtx, cancel := context.WithCancel(context.Background()) - defer cancel() - - g, ctx := errgroup.WithContext(baseCtx) - zerolog.TimeFieldFormat = zerolog.TimeFormatUnixMs - zl := zerolog.New(stdout).With().Timestamp().Str("program", filepath.Base(args[0])).Logger() switch { @@ -176,6 +179,11 @@ func run(args []string, stdout io.Writer) error { zerolog.SetGlobalLevel(zerolog.ErrorLevel) } + baseCtx, cancel := context.WithCancel(context.Background()) + defer cancel() + + g, ctx := errgroup.WithContext(baseCtx) + g.Go(func() error { return signalHandler(ctx, zl.With().Str("subsystem", "signal handler").Logger()) }) @@ -208,6 +216,15 @@ func run(args []string, stdout io.Writer) error { zl.Info().Msg("disabling k6 checks") } + tracerProvider, err := newTracerProvider(ctx, config.OTLPEndpoint, flags.Name()) + if err != nil { + zl.Warn().Err(err).Msg("could not configure tracer provider, disabling tracing functionality") + tracerProvider = noop.NewTracerProvider() + } + + // Set the global tracer provider to no-op, in case some library tries to obtain one that way. + otelGlobal.SetTracerProvider(noop.NewTracerProvider()) + promRegisterer := prometheus.NewRegistry() if err := registerMetrics(promRegisterer); err != nil { @@ -299,6 +316,7 @@ func run(args []string, stdout io.Writer) error { checksUpdater, err := checks.NewUpdater(checks.UpdaterOptions{ Conn: conn, Logger: zl.With().Str("subsystem", "updater").Logger(), + TracerProvider: tracerProvider, Backoff: newConnectionBackoff(), Publisher: publisher, TenantCh: tenantCh, @@ -321,6 +339,7 @@ func run(args []string, stdout io.Writer) error { adhocHandler, err := adhoc.NewHandler(adhoc.HandlerOpts{ Conn: conn, Logger: zl.With().Str("subsystem", "adhoc").Logger(), + TracerProvider: tracerProvider, Backoff: newConnectionBackoff(), Publisher: publisher, TenantCh: tenantCh, @@ -439,3 +458,49 @@ func setupGoMemLimit(ratio float64) error { return nil } + +// newTracerProvider creates a TracerProvider configured with the relevant +// resource attributes, configured to export traces over OTLP+HTTP to the +// specified endpoint. The provided name will be used to identify the service +// in traces. +// +// The context is needed so that the exporter can be stopped when the context +// is cancelled, so it's probably a good idea to use a context that is +// cancelled when the program is shutting down. +// +// If no endpoint is provided, a no-op tracer provider is returned and no error +// is reported. +// +// After this function returns successfully, the tracer exporter would have +// been started. +func newTracerProvider(ctx context.Context, endpoint, name string) (trace.TracerProvider, error) { + if endpoint == "" { + return noop.NewTracerProvider(), nil + } + + if _, err := url.Parse(endpoint); err != nil { + return nil, fmt.Errorf("invalid OTLP endpoint URL: %w", err) + } + + res, err := resource.Merge( + resource.Default(), + resource.NewWithAttributes( + semconv.SchemaURL, + semconv.ServiceName(name), + semconv.ServiceVersion(version.Short()), + ), + ) + if err != nil { + return nil, fmt.Errorf("creating otel resources: %w", err) + } + + te, err := otlptracehttp.New(ctx, otlptracehttp.WithEndpointURL(endpoint)) + if err != nil { + return nil, fmt.Errorf("starting otelhttp trace exporter: %w", err) + } + + return sdktrace.NewTracerProvider( + sdktrace.WithBatcher(te), + sdktrace.WithResource(res), + ), nil +} diff --git a/go.mod b/go.mod index cf55a303b..8c4446b37 100644 --- a/go.mod +++ b/go.mod @@ -38,7 +38,12 @@ require ( github.com/mccutchen/go-httpbin/v2 v2.15.0 github.com/quasilyte/go-ruleguard/dsl v0.3.22 github.com/spf13/afero v1.11.0 - golang.org/x/exp v0.0.0-20241004190924-225e2abe05e6 + go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.53.0 + go.opentelemetry.io/otel v1.31.0 + go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.30.0 + go.opentelemetry.io/otel/sdk v1.30.0 + go.opentelemetry.io/otel/trace v1.31.0 + golang.org/x/exp v0.0.0-20240909161429-701f63a606c0 gopkg.in/yaml.v3 v3.0.1 kernel.org/pub/linux/libs/security/libcap/cap v1.2.70 ) @@ -47,6 +52,7 @@ require ( github.com/andybalholm/brotli v1.1.0 // indirect github.com/beorn7/perks v1.0.1 // indirect github.com/buger/goterm v1.0.4 // indirect + github.com/cenkalti/backoff/v4 v4.3.0 // indirect github.com/cespare/xxhash/v2 v2.3.0 // indirect github.com/cilium/ebpf v0.11.0 // indirect github.com/containerd/cgroups/v3 v3.0.3 // indirect @@ -54,9 +60,12 @@ require ( github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect github.com/dennwc/varint v1.0.0 // indirect github.com/docker/go-units v0.5.0 // indirect + github.com/go-logr/logr v1.4.2 // indirect + github.com/go-logr/stdr v1.2.2 // indirect github.com/godbus/dbus/v5 v5.0.4 // indirect github.com/golang/protobuf v1.5.4 // indirect github.com/grafana/regexp v0.0.0-20240518133315-a468a5bfb3bc // indirect + github.com/grpc-ecosystem/grpc-gateway/v2 v2.22.0 // indirect github.com/klauspost/compress v1.17.9 // indirect github.com/kylelemons/godebug v1.1.0 // indirect github.com/mattn/go-colorable v0.1.13 // indirect @@ -67,13 +76,17 @@ require ( github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect github.com/prometheus/procfs v0.15.1 // indirect github.com/sirupsen/logrus v1.9.3 // indirect + go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.30.0 // indirect + go.opentelemetry.io/otel/metric v1.31.0 // indirect + go.opentelemetry.io/proto/otlp v1.3.1 // indirect go.uber.org/atomic v1.11.0 // indirect golang.org/x/mod v0.21.0 // indirect golang.org/x/oauth2 v0.23.0 // indirect golang.org/x/sys v0.26.0 // indirect golang.org/x/text v0.19.0 // indirect golang.org/x/tools v0.26.0 // indirect - google.golang.org/genproto/googleapis/rpc v0.0.0-20240814211410-ddb44dafa142 // indirect + google.golang.org/genproto/googleapis/api v0.0.0-20240903143218-8af14fe29dc1 // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20240903143218-8af14fe29dc1 // indirect google.golang.org/protobuf v1.34.2 // indirect gopkg.in/yaml.v2 v2.4.0 // indirect kernel.org/pub/linux/libs/security/libcap/psx v1.2.70 // indirect diff --git a/go.sum b/go.sum index ae78c1b3a..eb7350331 100644 --- a/go.sum +++ b/go.sum @@ -21,6 +21,8 @@ github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6r github.com/buger/goterm v0.0.0-20181115115552-c206103e1f37/go.mod h1:u9UyCz2eTrSGy6fbupqJ54eY5c4IC8gREQ1053dK12U= github.com/buger/goterm v1.0.4 h1:Z9YvGmOih81P0FbVtEYTFF6YsSgxSUKEhf/f9bTMXbY= github.com/buger/goterm v1.0.4/go.mod h1:HiFWV3xnkolgrBV3mY8m0X0Pumt4zg4QhbdOzQtB8tE= +github.com/cenkalti/backoff/v4 v4.3.0 h1:MyRJ/UdXutAwSAT+s3wNd7MfTIcy71VQueUuFK343L8= +github.com/cenkalti/backoff/v4 v4.3.0/go.mod h1:Y3VNntkOUPxTVeUxJ/G5vcM//AlwfmyYozVcomhLiZE= github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/cilium/ebpf v0.11.0 h1:V8gS/bTCCjX9uUnkUFUpPsksM8n1lXBAvHcpiFk1X2Y= @@ -48,8 +50,11 @@ github.com/go-kit/log v0.2.1 h1:MRVx0/zhvdseW+Gza6N9rVzU/IVzaeE1SFI4raAhmBU= github.com/go-kit/log v0.2.1/go.mod h1:NwTd00d/i8cPZ3xOwwiv2PO5MOcx78fFErGNcVmBjv0= github.com/go-logfmt/logfmt v0.6.0 h1:wGYYu3uicYdqXVgoYbvnkrPVXkuLM1p1ifugDMEdRi4= github.com/go-logfmt/logfmt v0.6.0/go.mod h1:WYhtIu8zTZfxdn5+rREduYbwxfcBr/Vr6KEVveWlfTs= +github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= github.com/go-logr/logr v1.4.2 h1:6pFjapn8bFcIbiKo3XT4j/BhANplGihG6tvd+8rYgrY= github.com/go-logr/logr v1.4.2/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= +github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag= +github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE= github.com/go-ping/ping v1.1.0 h1:3MCGhVX4fyEUuhsfwPrsEdQw6xspHkv5zHsiSoDFZYw= github.com/go-ping/ping v1.1.0/go.mod h1:xIFjORFzTxqIV/tDVGO4eDy/bLuSyawEeojSm3GfRGk= github.com/godbus/dbus/v5 v5.0.4 h1:9349emZab16e7zQvpmsbtjc18ykshndd8y2PG3sgJbA= @@ -79,6 +84,8 @@ github.com/grafana/mtr v0.1.1-0.20221107202107-a9806fdda166 h1:COSDtVDArtLKK9p+m github.com/grafana/mtr v0.1.1-0.20221107202107-a9806fdda166/go.mod h1:qDO1rp1hUZzunyD0f38VNbY0j6k+ZFRNZkHl3jbn/MU= github.com/grafana/regexp v0.0.0-20240518133315-a468a5bfb3bc h1:GN2Lv3MGO7AS6PrRoT6yV5+wkrOpcszoIsO4+4ds248= github.com/grafana/regexp v0.0.0-20240518133315-a468a5bfb3bc/go.mod h1:+JKpmjMGhpgPL+rXZ5nsZieVzvarn86asRlBg4uNGnk= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.22.0 h1:asbCHRVmodnJTuQ3qamDwqVOIjwqUPTYmYuemVOx+Ys= +github.com/grpc-ecosystem/grpc-gateway/v2 v2.22.0/go.mod h1:ggCgvZ2r7uOoQjOyu2Y1NhHmEPPzzuhWgcza5M1Ji1I= github.com/hokaccha/go-prettyjson v0.0.0-20180920040306-f579f869bbfe/go.mod h1:pFlLw2CfqZiIBOx6BuCeRLCrfxBJipTY0nIOF/VbGcI= github.com/inconshreveable/mousetrap v1.0.0/go.mod h1:PxqpIevigyE2G7u3NXJIT2ANytuPF1OarO4DADm73n8= github.com/jmespath/go-jmespath v0.4.0 h1:BEgLn5cpjn8UN1mAw4NjwDrS35OdebyEtFe+9YPoQUg= @@ -142,8 +149,8 @@ github.com/prometheus/prometheus v0.54.1 h1:vKuwQNjnYN2/mDoWfHXDhAsz/68q/dQDb+Yb github.com/prometheus/prometheus v0.54.1/go.mod h1:xlLByHhk2g3ycakQGrMaU8K7OySZx98BzeCR99991NY= github.com/quasilyte/go-ruleguard/dsl v0.3.22 h1:wd8zkOhSNr+I+8Qeciml08ivDt1pSXe60+5DqOpCjPE= github.com/quasilyte/go-ruleguard/dsl v0.3.22/go.mod h1:KeCP03KrjuSO0H1kTuZQCWlQPulDV6YMIXmpQss17rU= -github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjRBZyWFQ= -github.com/rogpeppe/go-internal v1.10.0/go.mod h1:UQnix2H7Ngw/k4C5ijL5+65zddjncjaFoBhdsK/akog= +github.com/rogpeppe/go-internal v1.12.0 h1:exVL4IDcn6na9z1rAb56Vxr+CgyK3nn3O+epU5NdKM8= +github.com/rogpeppe/go-internal v1.12.0/go.mod h1:E+RYuTGaKKdloAfM02xzb0FW3Paa99yedzYV+kq4uf4= github.com/rs/xid v1.5.0/go.mod h1:trrq9SKmegXys3aeAKXMUTdJsYXVwGY3RLcfgqegfbg= github.com/rs/zerolog v1.33.0 h1:1cU2KZkvPxNyfgEmhHAz/1A9Bz+llsdYzklWFzgp0r8= github.com/rs/zerolog v1.33.0/go.mod h1:/7mN4D5sKwJLZQ2b/znpjC3/GQWY/xaDXUM0kKWRHss= @@ -165,6 +172,22 @@ github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsT github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= +go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.53.0 h1:4K4tsIXefpVJtvA/8srF4V4y0akAoPHkIslgAkjixJA= +go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.53.0/go.mod h1:jjdQuTGVsXV4vSs+CJ2qYDeDPf9yIJV23qlIzBm73Vg= +go.opentelemetry.io/otel v1.31.0 h1:NsJcKPIW0D0H3NgzPDHmo0WW6SptzPdqg/L1zsIm2hY= +go.opentelemetry.io/otel v1.31.0/go.mod h1:O0C14Yl9FgkjqcCZAsE053C13OaddMYr/hz6clDkEJE= +go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.30.0 h1:lsInsfvhVIfOI6qHVyysXMNDnjO9Npvl7tlDPJFBVd4= +go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.30.0/go.mod h1:KQsVNh4OjgjTG0G6EiNi1jVpnaeeKsKMRwbLN+f1+8M= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.30.0 h1:umZgi92IyxfXd/l4kaDhnKgY8rnN/cZcF1LKc6I8OQ8= +go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.30.0/go.mod h1:4lVs6obhSVRb1EW5FhOuBTyiQhtRtAnnva9vD3yRfq8= +go.opentelemetry.io/otel/metric v1.31.0 h1:FSErL0ATQAmYHUIzSezZibnyVlft1ybhy4ozRPcF2fE= +go.opentelemetry.io/otel/metric v1.31.0/go.mod h1:C3dEloVbLuYoX41KpmAhOqNriGbA+qqH6PQ5E5mUfnY= +go.opentelemetry.io/otel/sdk v1.30.0 h1:cHdik6irO49R5IysVhdn8oaiR9m8XluDaJAs4DfOrYE= +go.opentelemetry.io/otel/sdk v1.30.0/go.mod h1:p14X4Ok8S+sygzblytT1nqG98QG2KYKv++HE0LY/mhg= +go.opentelemetry.io/otel/trace v1.31.0 h1:ffjsj1aRouKewfr85U2aGagJ46+MvodynlQ1HYdmJys= +go.opentelemetry.io/otel/trace v1.31.0/go.mod h1:TXZkRk7SM2ZQLtR6eoAWQFIHPvzQ06FJAsO1tJg480A= +go.opentelemetry.io/proto/otlp v1.3.1 h1:TrMUixzpM0yuc/znrFTP9MMRh8trP93mkCiDVeXrui0= +go.opentelemetry.io/proto/otlp v1.3.1/go.mod h1:0X1WI4de4ZsLrrJNLAQbFeLCm3T7yBkR0XqQ7niQU+8= go.uber.org/atomic v1.11.0 h1:ZvwS0R+56ePWxUNi+Atn9dWONBPp/AUETXlHW0DxSjE= go.uber.org/atomic v1.11.0/go.mod h1:LUxbIzbOniOlMKjJjyPfpl4v+PKK2cNJn91OQbhoJI0= go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= @@ -174,8 +197,8 @@ golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8U golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= golang.org/x/crypto v0.28.0 h1:GBDwsMXVQi34v5CCYUm2jkJvu4cbtru2U4TN2PSyQnw= golang.org/x/crypto v0.28.0/go.mod h1:rmgy+3RHxRZMyY0jjAJShp2zgEdOqj2AO7U0pYmeQ7U= -golang.org/x/exp v0.0.0-20241004190924-225e2abe05e6 h1:1wqE9dj9NpSm04INVsJhhEUzhuDVjbcyKH91sVyPATw= -golang.org/x/exp v0.0.0-20241004190924-225e2abe05e6/go.mod h1:NQtJDoLvd6faHhE7m4T/1IY708gDefGGjR/iUW8yQQ8= +golang.org/x/exp v0.0.0-20240909161429-701f63a606c0 h1:e66Fs6Z+fZTbFBAxKfP3PALWBtpfqks2bwGcexMxgtk= +golang.org/x/exp v0.0.0-20240909161429-701f63a606c0/go.mod h1:2TbTHSBQa924w8M6Xs1QcRcFwyucIwBGpK1p2f1YFFY= golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.21.0 h1:vvrHzRwRfVKSiLrG+d4FMl/Qi4ukBCE6kZlTUkDYRT0= @@ -226,8 +249,10 @@ golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8T golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= google.golang.org/genproto v0.0.0-20180518175338-11a468237815/go.mod h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc= -google.golang.org/genproto/googleapis/rpc v0.0.0-20240814211410-ddb44dafa142 h1:e7S5W7MGGLaSu8j3YjdezkZ+m1/Nm0uRVRMEMGk26Xs= -google.golang.org/genproto/googleapis/rpc v0.0.0-20240814211410-ddb44dafa142/go.mod h1:UqMtugtsSgubUsoxbuAoiCXvqvErP7Gf0so0mK9tHxU= +google.golang.org/genproto/googleapis/api v0.0.0-20240903143218-8af14fe29dc1 h1:hjSy6tcFQZ171igDaN5QHOw2n6vx40juYbC/x67CEhc= +google.golang.org/genproto/googleapis/api v0.0.0-20240903143218-8af14fe29dc1/go.mod h1:qpvKtACPCQhAdu3PyQgV4l3LMXZEtft7y8QcarRsp9I= +google.golang.org/genproto/googleapis/rpc v0.0.0-20240903143218-8af14fe29dc1 h1:pPJltXNxVzT4pK9yD8vR9X75DaWYYmLGMsEvBfFQZzQ= +google.golang.org/genproto/googleapis/rpc v0.0.0-20240903143218-8af14fe29dc1/go.mod h1:UqMtugtsSgubUsoxbuAoiCXvqvErP7Gf0so0mK9tHxU= google.golang.org/grpc v1.12.0/go.mod h1:yo6s7OP7yaDglbqo1J04qKzAhqBH6lvTonzMVmEdcZw= google.golang.org/grpc v1.67.1 h1:zWnc1Vrcno+lHZCOofnIMvycFcc0QRGIzm9dhnDX68E= google.golang.org/grpc v1.67.1/go.mod h1:1gLDyUQU7CTLJI90u3nXZ9ekeghjeM7pTDZlqFNg2AA= diff --git a/internal/adhoc/adhoc.go b/internal/adhoc/adhoc.go index 72c0ebf89..e925eb3f3 100644 --- a/internal/adhoc/adhoc.go +++ b/internal/adhoc/adhoc.go @@ -11,6 +11,8 @@ import ( "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/prometheus/prompb" "github.com/rs/zerolog" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/trace" "google.golang.org/grpc" "google.golang.org/grpc/codes" "google.golang.org/grpc/connectivity" @@ -32,6 +34,7 @@ import ( type Handler struct { api apiInfo logger zerolog.Logger + tracerProvider trace.TracerProvider features feature.Collection backoff Backoffer probe *sm.Probe @@ -101,6 +104,7 @@ func (b constantBackoff) Duration() time.Duration { return time.Duration(b) } type HandlerOpts struct { Conn ClientConn Logger zerolog.Logger + TracerProvider trace.TracerProvider Backoff Backoffer Publisher pusher.Publisher TenantCh chan<- sm.Tenant @@ -133,6 +137,7 @@ func NewHandler(opts HandlerOpts) (*Handler, error) { h := &Handler{ logger: opts.Logger, + tracerProvider: opts.TracerProvider, features: opts.Features, backoff: opts.Backoff, publisher: opts.Publisher, @@ -349,6 +354,15 @@ func (h *Handler) processAdHocChecks(ctx context.Context, client sm.AdHocChecks_ } func (h *Handler) handleAdHocCheck(ctx context.Context, ahReq *sm.AdHocRequest) error { + ctx, adHocSpan := h.tracerProvider.Tracer("").Start(ctx, "adHoc check") + defer adHocSpan.End() + adHocSpan.SetAttributes( + attribute.String("type", ahReq.AdHocCheck.Type().String()), + attribute.Int64("tenantId", ahReq.AdHocCheck.TenantId), + attribute.String("id", ahReq.AdHocCheck.Id), + ) + adHocSpan.SetAttributes(attribute.String("type", ahReq.AdHocCheck.Type().String())) + h.logger.Debug().Interface("request", ahReq).Msg("got ad-hoc check request") h.metrics.opsCounter.WithLabelValues(ahReq.AdHocCheck.Type().String()).Inc() @@ -479,10 +493,14 @@ func (r *runner) Run(ctx context.Context, tenantId model.GlobalID, publisher pus start := time.Now() + ctx, runSpan := trace.SpanFromContext(ctx).TracerProvider().Tracer("").Start(ctx, "adHoc run") + defer runSpan.End() + rCtx, cancel := context.WithTimeout(ctx, r.timeout) defer cancel() success, duration := r.prober.Probe(rCtx, r.target, registry, logger) + runSpan.SetAttributes(attribute.Bool("success", success), attribute.Float64("durationSeconds", duration)) if success { successGauge.Set(1) diff --git a/internal/adhoc/adhoc_test.go b/internal/adhoc/adhoc_test.go index 1d350cefb..5fee195df 100644 --- a/internal/adhoc/adhoc_test.go +++ b/internal/adhoc/adhoc_test.go @@ -10,6 +10,7 @@ import ( "github.com/prometheus/client_golang/prometheus" "github.com/rs/zerolog" "github.com/stretchr/testify/require" + "go.opentelemetry.io/otel/trace/noop" "google.golang.org/grpc" "google.golang.org/grpc/codes" "google.golang.org/grpc/connectivity" @@ -29,6 +30,7 @@ func TestNewHandler(t *testing.T) { opts := HandlerOpts{ Conn: nil, Logger: zerolog.New(io.Discard), + TracerProvider: noop.NewTracerProvider(), Publisher: channelPublisher(make(chan pusher.Payload)), TenantCh: make(chan sm.Tenant), PromRegisterer: prometheus.NewPedanticRegistry(), @@ -63,6 +65,7 @@ func TestHandlerRun(t *testing.T) { opts := HandlerOpts{ Logger: logger, + TracerProvider: noop.NewTracerProvider(), Publisher: channelPublisher(publishCh), TenantCh: make(chan sm.Tenant), PromRegisterer: prometheus.NewPedanticRegistry(), @@ -132,8 +135,7 @@ func (testBackoff) Duration() time.Duration { func (testBackoff) Reset() {} -type grpcTestConn struct { -} +type grpcTestConn struct{} func (grpcTestConn) GetState() connectivity.State { return connectivity.Ready @@ -189,6 +191,7 @@ func TestHandlerRunErrors(t *testing.T) { opts := HandlerOpts{ Conn: &grpcTestConn{}, Logger: logger, + TracerProvider: noop.NewTracerProvider(), Publisher: channelPublisher(publishCh), Backoff: testBackoff{}, TenantCh: make(chan sm.Tenant), diff --git a/internal/checks/checks.go b/internal/checks/checks.go index 1b9232708..18a7ff90b 100644 --- a/internal/checks/checks.go +++ b/internal/checks/checks.go @@ -16,6 +16,7 @@ import ( "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/prometheus/prompb" "github.com/rs/zerolog" + "go.opentelemetry.io/otel/trace" "golang.org/x/sync/errgroup" "google.golang.org/grpc" "google.golang.org/grpc/codes" @@ -67,6 +68,7 @@ type Backoffer interface { type Updater struct { api apiInfo logger zerolog.Logger + tracerProvider trace.TracerProvider features feature.Collection backoff Backoffer publisher pusher.Publisher @@ -104,6 +106,7 @@ type ( type UpdaterOptions struct { Conn *grpc.ClientConn Logger zerolog.Logger + TracerProvider trace.TracerProvider Backoff Backoffer Publisher pusher.Publisher TenantCh chan<- sm.Tenant @@ -226,6 +229,7 @@ func NewUpdater(opts UpdaterOptions) (*Updater, error) { conn: opts.Conn, }, logger: opts.Logger, + tracerProvider: opts.TracerProvider, features: opts.Features, backoff: opts.Backoff, publisher: opts.Publisher, @@ -894,6 +898,7 @@ func (c *Updater) addAndStartScraperWithLock(ctx context.Context, check model.Ch ctx, check, c.publisher, *c.probe, c.features, c.logger, + c.tracerProvider, metrics, c.k6Runner, c.tenantLimits, c.telemeter, diff --git a/internal/checks/checks_test.go b/internal/checks/checks_test.go index bebc064a2..2f0f83212 100644 --- a/internal/checks/checks_test.go +++ b/internal/checks/checks_test.go @@ -12,6 +12,7 @@ import ( "github.com/prometheus/client_golang/prometheus/testutil" "github.com/rs/zerolog" "github.com/stretchr/testify/require" + "go.opentelemetry.io/otel/trace" "google.golang.org/grpc" "github.com/grafana/synthetic-monitoring-agent/internal/feature" @@ -467,6 +468,7 @@ func (l testLabelsLimiter) LogLabels(ctx context.Context, tenantID model.GlobalI func testScraperFactory(ctx context.Context, check model.Check, publisher pusher.Publisher, _ sm.Probe, _ feature.Collection, logger zerolog.Logger, + tracerProvider trace.TracerProvider, metrics scraper.Metrics, k6Runner k6runner.Runner, labelsLimiter scraper.LabelsLimiter, diff --git a/internal/k6runner/http.go b/internal/k6runner/http.go index e3dfd6b81..ae185bb29 100644 --- a/internal/k6runner/http.go +++ b/internal/k6runner/http.go @@ -10,6 +10,9 @@ import ( "time" "github.com/rs/zerolog" + "go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp" + "go.opentelemetry.io/otel/propagation" + "go.opentelemetry.io/otel/trace" "golang.org/x/exp/rand" ) @@ -110,8 +113,7 @@ func (r HttpRunner) Run(ctx context.Context, script Script) (*RunResponse, error select { case <-ctx.Done(): waitTimer.Stop() - // TODO: Log the returned error in the Processor instead. - r.logger.Error().Err(err).Msg("retries exhausted") + r.logger.Error().Err(err).Msg("retries exhausted") // TODO: Log the returned error in the Processor instead. return nil, fmt.Errorf("cannot retry further: %w", errors.Join(err, ctx.Err())) case <-waitTimer.C: } @@ -165,7 +167,21 @@ func (r HttpRunner) request(ctx context.Context, script Script) (*RunResponse, e req.Header.Add("content-type", "application/json") - resp, err := http.DefaultClient.Do(req) + // Build a tracing-enabled http client. + httpClient := http.Client{ + Transport: otelhttp.NewTransport( + http.DefaultTransport, + otelhttp.WithTracerProvider(trace.SpanFromContext(ctx).TracerProvider()), + // Span names do not include method and path by default to avoid cardinality explosion with paths containing + // IDs. As this is not the case with this endpoint, we use a custom formatter that includes both. + otelhttp.WithSpanNameFormatter(func(_ string, r *http.Request) string { + return fmt.Sprintf("%s %s", r.Method, r.URL.Path) + }), + otelhttp.WithPropagators(propagation.TraceContext{}), // Send TraceIDs in outgoing requests. + ), + } + + resp, err := httpClient.Do(req) if err != nil { r.logger.Error().Err(err).Msg("sending request") diff --git a/internal/k6runner/k6runner.go b/internal/k6runner/k6runner.go index 0ec4c222a..3f1f72edc 100644 --- a/internal/k6runner/k6runner.go +++ b/internal/k6runner/k6runner.go @@ -16,6 +16,8 @@ import ( "github.com/prometheus/common/model" "github.com/rs/zerolog" "github.com/spf13/afero" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/trace" ) // Script is a k6 script that a runner is able to run, with some added instructions for that runner to act on. @@ -92,15 +94,27 @@ var ( func (r Processor) Run(ctx context.Context, registry *prometheus.Registry, logger logger.Logger, internalLogger zerolog.Logger) (bool, error) { k6runner := r.runner.WithLogger(&internalLogger) - // TODO: This error message is okay to be Debug for local k6 execution, but should be Error for remote runners. - result, err := k6runner.Run(ctx, r.script) + k6Ctx, k6Span := trace.SpanFromContext(ctx).TracerProvider().Tracer("").Start(ctx, "k6 processor") + result, err := k6runner.Run(k6Ctx, r.script) if err != nil { + // TODO: This error message is okay to be Debug for local k6 execution, but should be Error for remote runners. internalLogger.Debug(). Err(err). Msg("k6 script exited with error code") + k6Span.RecordError(err) + k6Span.End() return false, err } + k6Span.SetAttributes( + attribute.String("error", result.Error), + attribute.String("errorCode", result.ErrorCode), + attribute.Int("metricsSizeBytes", len(result.Metrics)), + attribute.Int("logsSizeBytes", len(result.Logs)), + ) + + k6Span.End() + // If only one of Error and ErrorCode are non-empty, the proxy is misbehaving. switch { case result.Error == "" && result.ErrorCode != "": @@ -126,11 +140,15 @@ func (r Processor) Run(ctx context.Context, registry *prometheus.Registry, logge }() } + _, reportTelemetrySpan := trace.SpanFromContext(ctx).TracerProvider().Tracer("").Start(ctx, "report telemetry") + // Send logs before metrics to make sure logs are submitted even if the metrics output is not parsable. if err := k6LogsToLogger(result.Logs, logger); err != nil { internalLogger.Debug(). Err(err). Msg("cannot load logs to logger") + reportTelemetrySpan.RecordError(err) + reportTelemetrySpan.End() return false, err } @@ -143,6 +161,8 @@ func (r Processor) Run(ctx context.Context, registry *prometheus.Registry, logge internalLogger.Debug(). Err(err). Msg("cannot extract metric samples") + reportTelemetrySpan.RecordError(err) + reportTelemetrySpan.End() return false, err } @@ -150,9 +170,13 @@ func (r Processor) Run(ctx context.Context, registry *prometheus.Registry, logge internalLogger.Error(). Err(err). Msg("cannot register collector") + reportTelemetrySpan.RecordError(err) + reportTelemetrySpan.End() return false, err } + reportTelemetrySpan.End() + // https://github.com/grafana/sm-k6-runner/blob/b811839d444a7e69fd056b0a4e6ccf7e914197f3/internal/mq/runner.go#L51 switch result.ErrorCode { case "": diff --git a/internal/scraper/scraper.go b/internal/scraper/scraper.go index 08221d38d..4a102f143 100644 --- a/internal/scraper/scraper.go +++ b/internal/scraper/scraper.go @@ -20,6 +20,8 @@ import ( prom "github.com/prometheus/common/model" "github.com/prometheus/prometheus/prompb" "github.com/rs/zerolog" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/trace" logproto "github.com/grafana/loki/pkg/push" "github.com/grafana/synthetic-monitoring-agent/internal/feature" @@ -61,26 +63,28 @@ type Telemeter interface { } type Scraper struct { - publisher pusher.Publisher - cancel context.CancelFunc - checkName string - target string - logger zerolog.Logger - check model.Check - probe sm.Probe - prober prober.Prober - labelsLimiter LabelsLimiter - stop chan struct{} - metrics Metrics - summaries map[uint64]prometheus.Summary - histograms map[uint64]prometheus.Histogram - telemeter Telemeter + publisher pusher.Publisher + cancel context.CancelFunc + checkName string + target string + logger zerolog.Logger + tracerProvider trace.TracerProvider + check model.Check + probe sm.Probe + prober prober.Prober + labelsLimiter LabelsLimiter + stop chan struct{} + metrics Metrics + summaries map[uint64]prometheus.Summary + histograms map[uint64]prometheus.Histogram + telemeter Telemeter } type Factory func( ctx context.Context, check model.Check, publisher pusher.Publisher, probe sm.Probe, features feature.Collection, logger zerolog.Logger, + tracerProvider trace.TracerProvider, metrics Metrics, k6runner k6runner.Runner, labelsLimiter LabelsLimiter, @@ -114,32 +118,35 @@ func New( ctx context.Context, check model.Check, publisher pusher.Publisher, probe sm.Probe, features feature.Collection, logger zerolog.Logger, + tracerProvider trace.TracerProvider, metrics Metrics, k6runner k6runner.Runner, labelsLimiter LabelsLimiter, telemeter *telemetry.Telemeter, ) (*Scraper, error) { return NewWithOpts(ctx, check, ScraperOpts{ - Probe: probe, - Publisher: publisher, - Logger: logger, - Metrics: metrics, - ProbeFactory: prober.NewProberFactory(k6runner, probe.Id, features), - LabelsLimiter: labelsLimiter, - Telemeter: telemeter, + Probe: probe, + Publisher: publisher, + Logger: logger, + TracerProvider: tracerProvider, + Metrics: metrics, + ProbeFactory: prober.NewProberFactory(k6runner, probe.Id, features), + LabelsLimiter: labelsLimiter, + Telemeter: telemeter, }) } var _ Factory = New type ScraperOpts struct { - Probe sm.Probe - Publisher pusher.Publisher - Logger zerolog.Logger - Metrics Metrics - ProbeFactory prober.ProberFactory - LabelsLimiter LabelsLimiter - Telemeter Telemeter + Probe sm.Probe + Publisher pusher.Publisher + Logger zerolog.Logger + TracerProvider trace.TracerProvider + Metrics Metrics + ProbeFactory prober.ProberFactory + LabelsLimiter LabelsLimiter + Telemeter Telemeter } func NewWithOpts(ctx context.Context, check model.Check, opts ScraperOpts) (*Scraper, error) { @@ -163,20 +170,21 @@ func NewWithOpts(ctx context.Context, check model.Check, opts ScraperOpts) (*Scr } return &Scraper{ - publisher: opts.Publisher, - cancel: cancel, - checkName: checkName, - target: target, - logger: logger, - check: check, - probe: opts.Probe, - prober: smProber, - labelsLimiter: opts.LabelsLimiter, - stop: make(chan struct{}), - metrics: opts.Metrics, - summaries: make(map[uint64]prometheus.Summary), - histograms: make(map[uint64]prometheus.Histogram), - telemeter: opts.Telemeter, + publisher: opts.Publisher, + cancel: cancel, + checkName: checkName, + target: target, + logger: logger, + tracerProvider: opts.TracerProvider, + check: check, + probe: opts.Probe, + prober: smProber, + labelsLimiter: opts.LabelsLimiter, + stop: make(chan struct{}), + metrics: opts.Metrics, + summaries: make(map[uint64]prometheus.Summary), + histograms: make(map[uint64]prometheus.Histogram), + telemeter: opts.Telemeter, }, nil } @@ -524,6 +532,16 @@ func (s Scraper) collectData(ctx context.Context, t time.Time) (*probeData, time timeout = time.Duration(s.check.Timeout) * time.Millisecond } + ctx, checkSpan := s.tracerProvider.Tracer("").Start(ctx, "check") + defer checkSpan.End() + checkSpan.SetAttributes( + attribute.String("type", s.prober.Name()), + attribute.Int("regionId", s.check.RegionId), + attribute.Int64("tenantId", s.check.TenantId), + attribute.String("instance", s.check.Target), + attribute.String("job", s.check.Job), + ) + success, mfs, err := getProbeMetrics( ctx, s.prober, @@ -662,6 +680,9 @@ func runProber( logger kitlog.Logger, ) bool { start := time.Now() + ctx, runSpan := trace.SpanFromContext(ctx).TracerProvider().Tracer("").Start(ctx, "check run") + defer runSpan.End() + runSpan.SetAttributes(attribute.String("type", prober.Name())) _ = level.Info(logger).Log("msg", "Beginning check", "type", prober.Name(), "timeout_seconds", timeout.Seconds()) @@ -669,6 +690,7 @@ func runProber( defer cancel() success, duration := prober.Probe(checkCtx, target, registry, logger) + runSpan.SetAttributes(attribute.Bool("success", success), attribute.Float64("durationSeconds", duration)) probeDuration := time.Since(start).Seconds() diff --git a/internal/scraper/scraper_test.go b/internal/scraper/scraper_test.go index 009176d20..f7848f5bd 100644 --- a/internal/scraper/scraper_test.go +++ b/internal/scraper/scraper_test.go @@ -49,6 +49,7 @@ import ( "github.com/prometheus/prometheus/promql/parser" "github.com/rs/zerolog" "github.com/stretchr/testify/require" + "go.opentelemetry.io/otel/trace/noop" "google.golang.org/grpc" "google.golang.org/grpc/credentials" grpchealth "google.golang.org/grpc/health/grpc_health_v1" @@ -927,10 +928,11 @@ func TestValidateLabels(t *testing.T) { check.AlertSensitivity = "low" // Force sm_check_info metric to include alert sensitivity label s := Scraper{ - checkName: "check name", - target: check.Target, - logger: zerolog.Nop(), - prober: prober, + checkName: "check name", + target: check.Target, + logger: zerolog.Nop(), + tracerProvider: noop.NewTracerProvider(), + prober: prober, labelsLimiter: testLabelsLimiter{ maxMetricLabels: 100, maxLogLabels: 100, @@ -1538,10 +1540,11 @@ func TestScraperCollectData(t *testing.T) { tc := tc t.Run(name, func(t *testing.T) { s := Scraper{ - checkName: checkName, - target: "test target", - logger: zerolog.Nop(), - prober: testProber{}, + checkName: checkName, + target: "test target", + logger: zerolog.Nop(), + tracerProvider: noop.NewTracerProvider(), + prober: testProber{}, labelsLimiter: testLabelsLimiter{ maxMetricLabels: tc.maxMetricLabels, maxLogLabels: tc.maxLogLabels, @@ -1842,10 +1845,11 @@ func TestScraperRun(t *testing.T) { metrics := NewMetrics(&counter, &errCounter) s, err := NewWithOpts(ctx, check, ScraperOpts{ - Metrics: metrics, - ProbeFactory: testProbeFactory{builder: func() prober.Prober { return testProber }}, - Logger: zerolog.New(zerolog.NewTestWriter(t)), - Publisher: &testPublisher{}, + Metrics: metrics, + ProbeFactory: testProbeFactory{builder: func() prober.Prober { return testProber }}, + Logger: zerolog.New(zerolog.NewTestWriter(t)), + TracerProvider: noop.NewTracerProvider(), + Publisher: &testPublisher{}, LabelsLimiter: testLabelsLimiter{ maxMetricLabels: 20, maxLogLabels: 15,