mirror of
https://github.com/caddyserver/caddy.git
synced 2026-05-13 09:06:41 +00:00
metrics: Implement pushing via OTLP (#7664)
This commit is contained in:
parent
355c178213
commit
2a3ed96f8c
7 changed files with 183 additions and 8 deletions
|
|
@ -484,6 +484,8 @@ func unmarshalCaddyfileMetricsOptions(d *caddyfile.Dispenser) (any, error) {
|
|||
metrics.PerHost = true
|
||||
case "observe_catchall_hosts":
|
||||
metrics.ObserveCatchallHosts = true
|
||||
case "otlp":
|
||||
metrics.OTLP = true
|
||||
default:
|
||||
return nil, d.Errf("unrecognized servers option '%s'", d.Val())
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,35 @@
|
|||
{
|
||||
metrics {
|
||||
otlp
|
||||
}
|
||||
}
|
||||
:80 {
|
||||
respond "Hello"
|
||||
}
|
||||
----------
|
||||
{
|
||||
"apps": {
|
||||
"http": {
|
||||
"servers": {
|
||||
"srv0": {
|
||||
"listen": [
|
||||
":80"
|
||||
],
|
||||
"routes": [
|
||||
{
|
||||
"handle": [
|
||||
{
|
||||
"body": "Hello",
|
||||
"handler": "static_response"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"metrics": {
|
||||
"otlp": true
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
4
go.mod
4
go.mod
|
|
@ -30,11 +30,13 @@ require (
|
|||
github.com/tailscale/tscert v0.0.0-20251216020129-aea342f6d747
|
||||
github.com/yuin/goldmark v1.8.2
|
||||
github.com/yuin/goldmark-highlighting/v2 v2.0.0-20230729083705-37449abec8cc
|
||||
go.opentelemetry.io/contrib/bridges/prometheus v0.68.0
|
||||
go.opentelemetry.io/contrib/exporters/autoexport v0.65.0
|
||||
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.65.0
|
||||
go.opentelemetry.io/contrib/propagators/autoprop v0.65.0
|
||||
go.opentelemetry.io/otel v1.43.0
|
||||
go.opentelemetry.io/otel/sdk v1.43.0
|
||||
go.opentelemetry.io/otel/sdk/metric v1.43.0
|
||||
go.step.sm/crypto v0.77.1
|
||||
go.uber.org/automaxprocs v1.6.0
|
||||
go.uber.org/zap v1.27.1
|
||||
|
|
@ -87,7 +89,6 @@ require (
|
|||
github.com/x448/float16 v0.8.4 // indirect
|
||||
github.com/zeebo/blake3 v0.2.4 // indirect
|
||||
go.opentelemetry.io/auto/sdk v1.2.1 // indirect
|
||||
go.opentelemetry.io/contrib/bridges/prometheus v0.68.0 // indirect
|
||||
go.opentelemetry.io/contrib/propagators/aws v1.43.0 // indirect
|
||||
go.opentelemetry.io/contrib/propagators/b3 v1.43.0 // indirect
|
||||
go.opentelemetry.io/contrib/propagators/jaeger v1.43.0 // indirect
|
||||
|
|
@ -104,7 +105,6 @@ require (
|
|||
go.opentelemetry.io/otel/exporters/stdout/stdouttrace v1.43.0 // indirect
|
||||
go.opentelemetry.io/otel/log v0.19.0 // indirect
|
||||
go.opentelemetry.io/otel/sdk/log v0.19.0 // indirect
|
||||
go.opentelemetry.io/otel/sdk/metric v1.43.0 // indirect
|
||||
go.yaml.in/yaml/v2 v2.4.4 // indirect
|
||||
go.yaml.in/yaml/v3 v3.0.4 // indirect
|
||||
golang.org/x/exp v0.0.0-20251023183803-a4bb9ffd2546 // indirect
|
||||
|
|
|
|||
|
|
@ -208,6 +208,9 @@ func (app *App) Provision(ctx caddy.Context) error {
|
|||
app.Metrics.httpMetrics = &httpMetrics{}
|
||||
// Scan config for allowed hosts to prevent cardinality explosion
|
||||
app.Metrics.scanConfigForHosts(app)
|
||||
if err := app.Metrics.provisionOTLP(ctx); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
// prepare each server
|
||||
oldContext := ctx.Context
|
||||
|
|
@ -817,6 +820,12 @@ func (app *App) Stop() error {
|
|||
}
|
||||
}
|
||||
|
||||
// flush and shut down the OTLP metrics exporter (if configured) so any
|
||||
// last data point reaches the collector before the process exits
|
||||
if err := app.Metrics.shutdown(ctx); err != nil {
|
||||
app.logger.Error("shutting down OTLP metrics", zap.Error(err))
|
||||
}
|
||||
|
||||
app.stopped = true
|
||||
return nil
|
||||
}
|
||||
|
|
|
|||
|
|
@ -3,6 +3,7 @@ package caddyhttp
|
|||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"strings"
|
||||
"sync"
|
||||
|
|
@ -10,9 +11,14 @@ import (
|
|||
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
"github.com/prometheus/client_golang/prometheus/promauto"
|
||||
otelprom "go.opentelemetry.io/contrib/bridges/prometheus"
|
||||
"go.opentelemetry.io/contrib/exporters/autoexport"
|
||||
sdkmetric "go.opentelemetry.io/otel/sdk/metric"
|
||||
"go.opentelemetry.io/otel/sdk/resource"
|
||||
semconv "go.opentelemetry.io/otel/semconv/v1.26.0"
|
||||
|
||||
"github.com/caddyserver/caddy/v2"
|
||||
"github.com/caddyserver/caddy/v2/internal/metrics"
|
||||
caddymetrics "github.com/caddyserver/caddy/v2/internal/metrics"
|
||||
)
|
||||
|
||||
// Metrics configures metrics observations.
|
||||
|
|
@ -67,10 +73,20 @@ type Metrics struct {
|
|||
// for production environments exposed to the internet).
|
||||
ObserveCatchallHosts bool `json:"observe_catchall_hosts,omitempty"`
|
||||
|
||||
// Enable pushing metrics via OTLP in addition to the existing Prometheus
|
||||
// scrape endpoints. When set, a PeriodicReader is attached to the shared
|
||||
// Prometheus registry (via a Prometheus -> OpenTelemetry bridge), and the
|
||||
// exporter is autoconfigured from the standard OTEL_* environment
|
||||
// variables (OTEL_EXPORTER_OTLP_ENDPOINT, OTEL_EXPORTER_OTLP_PROTOCOL,
|
||||
// OTEL_METRICS_EXPORTER, ...). Set OTEL_METRICS_EXPORTER=none or simply
|
||||
// keep this field false to disable OTLP export.
|
||||
OTLP bool `json:"otlp,omitempty"`
|
||||
|
||||
init sync.Once
|
||||
httpMetrics *httpMetrics
|
||||
allowedHosts map[string]struct{}
|
||||
hasHTTPSServer bool
|
||||
meterProvider *sdkmetric.MeterProvider
|
||||
}
|
||||
|
||||
type httpMetrics struct {
|
||||
|
|
@ -147,6 +163,70 @@ func initHTTPMetrics(ctx caddy.Context, metrics *Metrics) {
|
|||
}, httpLabels)
|
||||
}
|
||||
|
||||
// provisionOTLP wires a MeterProvider that periodically reads the process-wide
|
||||
// Prometheus registry and pushes the result via OTLP. The exporter and reader
|
||||
// are autoconfigured from the standard OTEL_* environment variables, matching
|
||||
// the ergonomics of the existing `tracing` directive. It is a no-op when
|
||||
// m.OTLP is false, and honors OTEL_METRICS_EXPORTER=none (autoexport
|
||||
// short-circuits to a no-op reader in that case).
|
||||
func (m *Metrics) provisionOTLP(ctx caddy.Context) error {
|
||||
if !m.OTLP {
|
||||
return nil
|
||||
}
|
||||
|
||||
// Register a Prometheus -> OpenTelemetry bridge against the process-wide
|
||||
// Prometheus registry as the *default* source the NewMetricReader below
|
||||
// will read from.
|
||||
//
|
||||
// NB: despite the "With*" naming, autoexport.WithFallbackMetricProducer is
|
||||
// a package-level setter (it returns nothing) — it mutates autoexport's
|
||||
// internal producer registry and takes effect on the very next call to
|
||||
// NewMetricReader. It is NOT a MetricOption and must not be passed as one.
|
||||
// Users can still override the source by setting OTEL_METRICS_PRODUCERS.
|
||||
reg := ctx.GetMetricsRegistry()
|
||||
autoexport.WithFallbackMetricProducer(func(context.Context) (sdkmetric.Producer, error) {
|
||||
return otelprom.NewMetricProducer(otelprom.WithGatherer(reg)), nil
|
||||
})
|
||||
|
||||
reader, err := autoexport.NewMetricReader(ctx)
|
||||
if err != nil {
|
||||
return fmt.Errorf("creating OTLP metric reader: %w", err)
|
||||
}
|
||||
|
||||
version, _ := caddy.Version()
|
||||
res, err := resource.Merge(resource.Default(), resource.NewSchemaless(
|
||||
semconv.WebEngineName(ServerHeader),
|
||||
semconv.WebEngineVersion(version),
|
||||
))
|
||||
if err != nil {
|
||||
return fmt.Errorf("building OTLP metrics resource: %w", err)
|
||||
}
|
||||
|
||||
m.meterProvider = sdkmetric.NewMeterProvider(
|
||||
sdkmetric.WithResource(res),
|
||||
sdkmetric.WithReader(reader),
|
||||
)
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// shutdown flushes and tears down the OTLP MeterProvider if one was provisioned.
|
||||
// Both ForceFlush and Shutdown are always attempted so that a flush failure
|
||||
// does not prevent the reader goroutines from being stopped; errors from both
|
||||
// are returned joined.
|
||||
func (m *Metrics) shutdown(ctx context.Context) error {
|
||||
if m == nil || m.meterProvider == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
// ForceFlush gives the final collection a chance to reach the collector
|
||||
// before the reader goroutine is stopped by Shutdown.
|
||||
return errors.Join(
|
||||
m.meterProvider.ForceFlush(ctx),
|
||||
m.meterProvider.Shutdown(ctx),
|
||||
)
|
||||
}
|
||||
|
||||
// scanConfigForHosts scans the HTTP app configuration to build a set of allowed hosts
|
||||
// for metrics collection, similar to how auto-HTTPS scans for domain names.
|
||||
func (m *Metrics) scanConfigForHosts(app *App) {
|
||||
|
|
@ -234,7 +314,7 @@ func newMetricsInstrumentedRoute(ctx caddy.Context, handler string, next Handler
|
|||
func (h *metricsInstrumentedRoute) ServeHTTP(w http.ResponseWriter, r *http.Request) error {
|
||||
server := serverNameFromContext(r.Context())
|
||||
labels := prometheus.Labels{"server": server, "handler": h.handler}
|
||||
method := metrics.SanitizeMethod(r.Method)
|
||||
method := caddymetrics.SanitizeMethod(r.Method)
|
||||
// the "code" value is set later, but initialized here to eliminate the possibility
|
||||
// of a panic
|
||||
statusLabels := prometheus.Labels{"server": server, "handler": h.handler, "method": method, "code": ""}
|
||||
|
|
@ -264,7 +344,7 @@ func (h *metricsInstrumentedRoute) ServeHTTP(w http.ResponseWriter, r *http.Requ
|
|||
// being called when the headers are written.
|
||||
// Effectively the same behaviour as promhttp.InstrumentHandlerTimeToWriteHeader.
|
||||
writeHeaderRecorder := ShouldBufferFunc(func(status int, header http.Header) bool {
|
||||
statusLabels["code"] = metrics.SanitizeCode(status)
|
||||
statusLabels["code"] = caddymetrics.SanitizeCode(status)
|
||||
ttfb := time.Since(start).Seconds()
|
||||
h.metrics.httpMetrics.responseDuration.With(statusLabels).Observe(ttfb)
|
||||
return false
|
||||
|
|
@ -280,7 +360,7 @@ func (h *metricsInstrumentedRoute) ServeHTTP(w http.ResponseWriter, r *http.Requ
|
|||
if statusLabels["code"] == "" {
|
||||
// we still sanitize it, even though it's likely to be 0. A 200 is
|
||||
// returned on fallthrough so we want to reflect that.
|
||||
statusLabels["code"] = metrics.SanitizeCode(status)
|
||||
statusLabels["code"] = caddymetrics.SanitizeCode(status)
|
||||
}
|
||||
|
||||
h.metrics.httpMetrics.requestDuration.With(statusLabels).Observe(dur)
|
||||
|
|
|
|||
|
|
@ -523,6 +523,56 @@ func TestMetricsInstrumentedRoute(t *testing.T) {
|
|||
}
|
||||
}
|
||||
|
||||
func TestMetricsProvisionOTLPDisabled(t *testing.T) {
|
||||
ctx, _ := caddy.NewContext(caddy.Context{Context: context.Background()})
|
||||
|
||||
m := &Metrics{OTLP: false}
|
||||
|
||||
if err := m.provisionOTLP(ctx); err != nil {
|
||||
t.Fatalf("provisionOTLP returned unexpected error: %v", err)
|
||||
}
|
||||
if m.meterProvider != nil {
|
||||
t.Fatalf("meterProvider should remain nil when OTLP is disabled")
|
||||
}
|
||||
|
||||
// shutdown must be safe on a never-provisioned Metrics.
|
||||
if err := m.shutdown(context.Background()); err != nil {
|
||||
t.Fatalf("shutdown returned unexpected error: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMetricsProvisionOTLPNoopExporter(t *testing.T) {
|
||||
// OTEL_METRICS_EXPORTER=none makes autoexport return its built-in
|
||||
// no-op reader, which avoids any network I/O while still exercising
|
||||
// the full provisionOTLP -> shutdown lifecycle.
|
||||
t.Setenv("OTEL_METRICS_EXPORTER", "none")
|
||||
|
||||
ctx, _ := caddy.NewContext(caddy.Context{Context: context.Background()})
|
||||
|
||||
m := &Metrics{OTLP: true}
|
||||
|
||||
if err := m.provisionOTLP(ctx); err != nil {
|
||||
t.Fatalf("provisionOTLP returned unexpected error: %v", err)
|
||||
}
|
||||
if m.meterProvider == nil {
|
||||
t.Fatalf("provisionOTLP did not create a MeterProvider")
|
||||
}
|
||||
|
||||
if err := m.shutdown(context.Background()); err != nil {
|
||||
t.Fatalf("shutdown returned unexpected error: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// shutdown on a nil receiver is a convenience so App.Stop can call it
|
||||
// without guarding against app.Metrics being unset.
|
||||
func TestMetricsShutdownNilReceiver(t *testing.T) {
|
||||
var m *Metrics
|
||||
|
||||
if err := m.shutdown(context.Background()); err != nil {
|
||||
t.Fatalf("shutdown on nil Metrics returned unexpected error: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkMetricsInstrumentedRoute(b *testing.B) {
|
||||
ctx, _ := caddy.NewContext(caddy.Context{Context: context.Background()})
|
||||
m := &Metrics{
|
||||
|
|
|
|||
|
|
@ -21,7 +21,6 @@ import (
|
|||
)
|
||||
|
||||
const (
|
||||
webEngineName = "Caddy"
|
||||
defaultSpanName = "handler"
|
||||
nextCallCtxKey caddy.CtxKey = "nextCall"
|
||||
)
|
||||
|
|
@ -58,7 +57,7 @@ func newOpenTelemetryWrapper(
|
|||
}
|
||||
|
||||
version, _ := caddy.Version()
|
||||
res, err := ot.newResource(webEngineName, version)
|
||||
res, err := ot.newResource(caddyhttp.ServerHeader, version)
|
||||
if err != nil {
|
||||
return ot, fmt.Errorf("creating resource error: %w", err)
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue