metrics: Implement pushing via OLTP (#7664)

This commit is contained in:
Kévin Dunglas 2026-04-25 12:52:08 +02:00 committed by GitHub
parent 355c178213
commit 2a3ed96f8c
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
7 changed files with 183 additions and 8 deletions

View file

@ -484,6 +484,8 @@ func unmarshalCaddyfileMetricsOptions(d *caddyfile.Dispenser) (any, error) {
metrics.PerHost = true
case "observe_catchall_hosts":
metrics.ObserveCatchallHosts = true
case "otlp":
metrics.OTLP = true
default:
return nil, d.Errf("unrecognized servers option '%s'", d.Val())
}

View file

@ -0,0 +1,35 @@
{
metrics {
otlp
}
}
:80 {
respond "Hello"
}
----------
{
"apps": {
"http": {
"servers": {
"srv0": {
"listen": [
":80"
],
"routes": [
{
"handle": [
{
"body": "Hello",
"handler": "static_response"
}
]
}
]
}
},
"metrics": {
"otlp": true
}
}
}
}

4
go.mod
View file

@ -30,11 +30,13 @@ require (
github.com/tailscale/tscert v0.0.0-20251216020129-aea342f6d747
github.com/yuin/goldmark v1.8.2
github.com/yuin/goldmark-highlighting/v2 v2.0.0-20230729083705-37449abec8cc
go.opentelemetry.io/contrib/bridges/prometheus v0.68.0
go.opentelemetry.io/contrib/exporters/autoexport v0.65.0
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.65.0
go.opentelemetry.io/contrib/propagators/autoprop v0.65.0
go.opentelemetry.io/otel v1.43.0
go.opentelemetry.io/otel/sdk v1.43.0
go.opentelemetry.io/otel/sdk/metric v1.43.0
go.step.sm/crypto v0.77.1
go.uber.org/automaxprocs v1.6.0
go.uber.org/zap v1.27.1
@ -87,7 +89,6 @@ require (
github.com/x448/float16 v0.8.4 // indirect
github.com/zeebo/blake3 v0.2.4 // indirect
go.opentelemetry.io/auto/sdk v1.2.1 // indirect
go.opentelemetry.io/contrib/bridges/prometheus v0.68.0 // indirect
go.opentelemetry.io/contrib/propagators/aws v1.43.0 // indirect
go.opentelemetry.io/contrib/propagators/b3 v1.43.0 // indirect
go.opentelemetry.io/contrib/propagators/jaeger v1.43.0 // indirect
@ -104,7 +105,6 @@ require (
go.opentelemetry.io/otel/exporters/stdout/stdouttrace v1.43.0 // indirect
go.opentelemetry.io/otel/log v0.19.0 // indirect
go.opentelemetry.io/otel/sdk/log v0.19.0 // indirect
go.opentelemetry.io/otel/sdk/metric v1.43.0 // indirect
go.yaml.in/yaml/v2 v2.4.4 // indirect
go.yaml.in/yaml/v3 v3.0.4 // indirect
golang.org/x/exp v0.0.0-20251023183803-a4bb9ffd2546 // indirect

View file

@ -208,6 +208,9 @@ func (app *App) Provision(ctx caddy.Context) error {
app.Metrics.httpMetrics = &httpMetrics{}
// Scan config for allowed hosts to prevent cardinality explosion
app.Metrics.scanConfigForHosts(app)
if err := app.Metrics.provisionOTLP(ctx); err != nil {
return err
}
}
// prepare each server
oldContext := ctx.Context
@ -817,6 +820,12 @@ func (app *App) Stop() error {
}
}
// flush and shut down the OTLP metrics exporter (if configured) so any
// last data point reaches the collector before the process exits
if err := app.Metrics.shutdown(ctx); err != nil {
app.logger.Error("shutting down OTLP metrics", zap.Error(err))
}
app.stopped = true
return nil
}

View file

@ -3,6 +3,7 @@ package caddyhttp
import (
"context"
"errors"
"fmt"
"net/http"
"strings"
"sync"
@ -10,9 +11,14 @@ import (
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
otelprom "go.opentelemetry.io/contrib/bridges/prometheus"
"go.opentelemetry.io/contrib/exporters/autoexport"
sdkmetric "go.opentelemetry.io/otel/sdk/metric"
"go.opentelemetry.io/otel/sdk/resource"
semconv "go.opentelemetry.io/otel/semconv/v1.26.0"
"github.com/caddyserver/caddy/v2"
"github.com/caddyserver/caddy/v2/internal/metrics"
caddymetrics "github.com/caddyserver/caddy/v2/internal/metrics"
)
// Metrics configures metrics observations.
@ -67,10 +73,20 @@ type Metrics struct {
// for production environments exposed to the internet).
ObserveCatchallHosts bool `json:"observe_catchall_hosts,omitempty"`
// Enable pushing metrics via OTLP in addition to the existing Prometheus
// scrape endpoints. When set, a PeriodicReader is attached to the shared
// Prometheus registry (via a Prometheus -> OpenTelemetry bridge), and the
// exporter is autoconfigured from the standard OTEL_* environment
// variables (OTEL_EXPORTER_OTLP_ENDPOINT, OTEL_EXPORTER_OTLP_PROTOCOL,
// OTEL_METRICS_EXPORTER, ...). Set OTEL_METRICS_EXPORTER=none or simply
// keep this field false to disable OTLP export.
OTLP bool `json:"otlp,omitempty"`
init sync.Once
httpMetrics *httpMetrics
allowedHosts map[string]struct{}
hasHTTPSServer bool
meterProvider *sdkmetric.MeterProvider
}
type httpMetrics struct {
@ -147,6 +163,70 @@ func initHTTPMetrics(ctx caddy.Context, metrics *Metrics) {
}, httpLabels)
}
// provisionOTLP wires a MeterProvider that periodically reads the process-wide
// Prometheus registry and pushes the result via OTLP. The exporter and reader
// are autoconfigured from the standard OTEL_* environment variables, matching
// the ergonomics of the existing `tracing` directive. It is a no-op when
// m.OTLP is false, and honors OTEL_METRICS_EXPORTER=none (autoexport
// short-circuits to a no-op reader in that case).
func (m *Metrics) provisionOTLP(ctx caddy.Context) error {
if !m.OTLP {
return nil
}
// Register a Prometheus -> OpenTelemetry bridge against the process-wide
// Prometheus registry as the *default* source the NewMetricReader below
// will read from.
//
// NB: despite the "With*" naming, autoexport.WithFallbackMetricProducer is
// a package-level setter (it returns nothing) — it mutates autoexport's
// internal producer registry and takes effect on the very next call to
// NewMetricReader. It is NOT a MetricOption and must not be passed as one.
// Users can still override the source by setting OTEL_METRICS_PRODUCERS.
reg := ctx.GetMetricsRegistry()
autoexport.WithFallbackMetricProducer(func(context.Context) (sdkmetric.Producer, error) {
return otelprom.NewMetricProducer(otelprom.WithGatherer(reg)), nil
})
reader, err := autoexport.NewMetricReader(ctx)
if err != nil {
return fmt.Errorf("creating OTLP metric reader: %w", err)
}
version, _ := caddy.Version()
res, err := resource.Merge(resource.Default(), resource.NewSchemaless(
semconv.WebEngineName(ServerHeader),
semconv.WebEngineVersion(version),
))
if err != nil {
return fmt.Errorf("building OTLP metrics resource: %w", err)
}
m.meterProvider = sdkmetric.NewMeterProvider(
sdkmetric.WithResource(res),
sdkmetric.WithReader(reader),
)
return nil
}
// shutdown flushes and tears down the OTLP MeterProvider if one was provisioned.
// Both ForceFlush and Shutdown are always attempted so that a flush failure
// does not prevent the reader goroutines from being stopped; errors from both
// are returned joined.
func (m *Metrics) shutdown(ctx context.Context) error {
if m == nil || m.meterProvider == nil {
return nil
}
// ForceFlush gives the final collection a chance to reach the collector
// before the reader goroutine is stopped by Shutdown.
return errors.Join(
m.meterProvider.ForceFlush(ctx),
m.meterProvider.Shutdown(ctx),
)
}
// scanConfigForHosts scans the HTTP app configuration to build a set of allowed hosts
// for metrics collection, similar to how auto-HTTPS scans for domain names.
func (m *Metrics) scanConfigForHosts(app *App) {
@ -234,7 +314,7 @@ func newMetricsInstrumentedRoute(ctx caddy.Context, handler string, next Handler
func (h *metricsInstrumentedRoute) ServeHTTP(w http.ResponseWriter, r *http.Request) error {
server := serverNameFromContext(r.Context())
labels := prometheus.Labels{"server": server, "handler": h.handler}
method := metrics.SanitizeMethod(r.Method)
method := caddymetrics.SanitizeMethod(r.Method)
// the "code" value is set later, but initialized here to eliminate the possibility
// of a panic
statusLabels := prometheus.Labels{"server": server, "handler": h.handler, "method": method, "code": ""}
@ -264,7 +344,7 @@ func (h *metricsInstrumentedRoute) ServeHTTP(w http.ResponseWriter, r *http.Requ
// being called when the headers are written.
// Effectively the same behaviour as promhttp.InstrumentHandlerTimeToWriteHeader.
writeHeaderRecorder := ShouldBufferFunc(func(status int, header http.Header) bool {
statusLabels["code"] = metrics.SanitizeCode(status)
statusLabels["code"] = caddymetrics.SanitizeCode(status)
ttfb := time.Since(start).Seconds()
h.metrics.httpMetrics.responseDuration.With(statusLabels).Observe(ttfb)
return false
@ -280,7 +360,7 @@ func (h *metricsInstrumentedRoute) ServeHTTP(w http.ResponseWriter, r *http.Requ
if statusLabels["code"] == "" {
// we still sanitize it, even though it's likely to be 0. A 200 is
// returned on fallthrough so we want to reflect that.
statusLabels["code"] = metrics.SanitizeCode(status)
statusLabels["code"] = caddymetrics.SanitizeCode(status)
}
h.metrics.httpMetrics.requestDuration.With(statusLabels).Observe(dur)

View file

@ -523,6 +523,56 @@ func TestMetricsInstrumentedRoute(t *testing.T) {
}
}
func TestMetricsProvisionOTLPDisabled(t *testing.T) {
ctx, _ := caddy.NewContext(caddy.Context{Context: context.Background()})
m := &Metrics{OTLP: false}
if err := m.provisionOTLP(ctx); err != nil {
t.Fatalf("provisionOTLP returned unexpected error: %v", err)
}
if m.meterProvider != nil {
t.Fatalf("meterProvider should remain nil when OTLP is disabled")
}
// shutdown must be safe on a never-provisioned Metrics.
if err := m.shutdown(context.Background()); err != nil {
t.Fatalf("shutdown returned unexpected error: %v", err)
}
}
func TestMetricsProvisionOTLPNoopExporter(t *testing.T) {
// OTEL_METRICS_EXPORTER=none makes autoexport return its built-in
// no-op reader, which avoids any network I/O while still exercising
// the full provisionOTLP -> shutdown lifecycle.
t.Setenv("OTEL_METRICS_EXPORTER", "none")
ctx, _ := caddy.NewContext(caddy.Context{Context: context.Background()})
m := &Metrics{OTLP: true}
if err := m.provisionOTLP(ctx); err != nil {
t.Fatalf("provisionOTLP returned unexpected error: %v", err)
}
if m.meterProvider == nil {
t.Fatalf("provisionOTLP did not create a MeterProvider")
}
if err := m.shutdown(context.Background()); err != nil {
t.Fatalf("shutdown returned unexpected error: %v", err)
}
}
// shutdown on a nil receiver is a convenience so App.Stop can call it
// without guarding against app.Metrics being unset.
func TestMetricsShutdownNilReceiver(t *testing.T) {
var m *Metrics
if err := m.shutdown(context.Background()); err != nil {
t.Fatalf("shutdown on nil Metrics returned unexpected error: %v", err)
}
}
func BenchmarkMetricsInstrumentedRoute(b *testing.B) {
ctx, _ := caddy.NewContext(caddy.Context{Context: context.Background()})
m := &Metrics{

View file

@ -21,7 +21,6 @@ import (
)
const (
webEngineName = "Caddy"
defaultSpanName = "handler"
nextCallCtxKey caddy.CtxKey = "nextCall"
)
@ -58,7 +57,7 @@ func newOpenTelemetryWrapper(
}
version, _ := caddy.Version()
res, err := ot.newResource(webEngineName, version)
res, err := ot.newResource(caddyhttp.ServerHeader, version)
if err != nil {
return ot, fmt.Errorf("creating resource error: %w", err)
}