mirror of
https://github.com/caddyserver/caddy.git
synced 2026-05-13 09:06:41 +00:00
reverseproxy: fix active health check counter tracking, add initially_unhealthy option
Fix three bugs in active health checks: - countHealthPass/countHealthFail now reset the opposite counter, ensuring only truly consecutive results accumulate toward thresholds. Previously, interleaved pass/fail results could incorrectly trip the threshold. - Context cancellation (e.g. during shutdown/reload) is no longer counted as a health check failure. - Initial health state on provision uses pass/fail counters (which survive reloads via the global hosts pool), so proven-healthy hosts stay healthy across config reloads. Also add the health_initially_unhealthy option, which marks backends as unhealthy until they pass active health checks. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
4f50458866
commit
33a2036240
4 changed files with 219 additions and 16 deletions
182
modules/caddyhttp/reverseproxy/active_health_test.go
Normal file
182
modules/caddyhttp/reverseproxy/active_health_test.go
Normal file
|
|
@ -0,0 +1,182 @@
|
|||
// Copyright 2015 Matthew Holt and The Caddy Authors
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package reverseproxy
|
||||
|
||||
import (
|
||||
"testing"
|
||||
)
|
||||
|
||||
// TestConsecutiveCounterResetOnPass verifies that a health check pass
|
||||
// resets the consecutive failure counter to zero. Without this, non-
|
||||
// consecutive failures could accumulate and incorrectly trip the threshold.
|
||||
func TestConsecutiveCounterResetOnPass(t *testing.T) {
|
||||
host := new(Host)
|
||||
|
||||
// Simulate: fail, fail, then pass
|
||||
host.countHealthFail(1)
|
||||
host.countHealthFail(1)
|
||||
if host.activeHealthFails() != 2 {
|
||||
t.Fatalf("expected 2 fails, got %d", host.activeHealthFails())
|
||||
}
|
||||
|
||||
// A pass should reset the fail counter
|
||||
host.countHealthPass(1)
|
||||
if host.activeHealthFails() != 0 {
|
||||
t.Errorf("expected fail counter to reset to 0 after a pass, got %d", host.activeHealthFails())
|
||||
}
|
||||
if host.activeHealthPasses() != 1 {
|
||||
t.Errorf("expected 1 pass, got %d", host.activeHealthPasses())
|
||||
}
|
||||
}
|
||||
|
||||
// TestConsecutiveCounterResetOnFail verifies that a health check failure
|
||||
// resets the consecutive pass counter to zero.
|
||||
func TestConsecutiveCounterResetOnFail(t *testing.T) {
|
||||
host := new(Host)
|
||||
|
||||
// Simulate: pass, pass, then fail
|
||||
host.countHealthPass(1)
|
||||
host.countHealthPass(1)
|
||||
if host.activeHealthPasses() != 2 {
|
||||
t.Fatalf("expected 2 passes, got %d", host.activeHealthPasses())
|
||||
}
|
||||
|
||||
// A fail should reset the pass counter
|
||||
host.countHealthFail(1)
|
||||
if host.activeHealthPasses() != 0 {
|
||||
t.Errorf("expected pass counter to reset to 0 after a fail, got %d", host.activeHealthPasses())
|
||||
}
|
||||
if host.activeHealthFails() != 1 {
|
||||
t.Errorf("expected 1 fail, got %d", host.activeHealthFails())
|
||||
}
|
||||
}
|
||||
|
||||
// TestNonConsecutiveFailuresDoNotTripThreshold is a regression test:
|
||||
// interleaved pass/fail results must NOT accumulate toward the threshold.
|
||||
// Before the fix, fail-pass-fail-pass-fail would reach Fails=3 even
|
||||
// though there were zero consecutive failures.
|
||||
func TestNonConsecutiveFailuresDoNotTripThreshold(t *testing.T) {
|
||||
host := new(Host)
|
||||
|
||||
// Interleave: fail, pass, fail, pass, fail
|
||||
for i := 0; i < 3; i++ {
|
||||
host.countHealthFail(1)
|
||||
if i < 2 {
|
||||
host.countHealthPass(1)
|
||||
}
|
||||
}
|
||||
|
||||
// With correct consecutive tracking, we should have only 1 consecutive fail
|
||||
if host.activeHealthFails() != 1 {
|
||||
t.Errorf("expected 1 consecutive fail, got %d", host.activeHealthFails())
|
||||
}
|
||||
}
|
||||
|
||||
// TestConsecutiveFailuresDoTripThreshold verifies that truly consecutive
|
||||
// failures correctly accumulate and trip the threshold.
|
||||
func TestConsecutiveFailuresDoTripThreshold(t *testing.T) {
|
||||
host := new(Host)
|
||||
|
||||
const failThreshold = 3
|
||||
|
||||
host.countHealthFail(1)
|
||||
host.countHealthFail(1)
|
||||
host.countHealthFail(1)
|
||||
|
||||
if host.activeHealthFails() != 3 {
|
||||
t.Errorf("expected 3 consecutive fails, got %d", host.activeHealthFails())
|
||||
}
|
||||
if host.activeHealthFails() < failThreshold {
|
||||
t.Error("3 consecutive failures should trip threshold of 3")
|
||||
}
|
||||
// Pass counter should be 0 (reset by the first fail)
|
||||
if host.activeHealthPasses() != 0 {
|
||||
t.Errorf("expected 0 passes after consecutive fails, got %d", host.activeHealthPasses())
|
||||
}
|
||||
}
|
||||
|
||||
// TestInitiallyUnhealthy verifies that when InitiallyUnhealthy is true
|
||||
// and there are no prior health check passes, the upstream starts unhealthy.
|
||||
func TestInitiallyUnhealthy(t *testing.T) {
|
||||
upstream := &Upstream{
|
||||
Dial: "10.4.0.1:80",
|
||||
Host: new(Host),
|
||||
}
|
||||
|
||||
// Simulate what Provision does when InitiallyUnhealthy=true and
|
||||
// passes=0 (fresh host, no prior health checks)
|
||||
passes := 1 // default Passes threshold
|
||||
upstream.setHealthy(upstream.activeHealthPasses() >= passes)
|
||||
|
||||
if upstream.healthy() {
|
||||
t.Error("upstream should be unhealthy when InitiallyUnhealthy=true and no passes recorded")
|
||||
}
|
||||
}
|
||||
|
||||
// TestInitiallyUnhealthyWithPriorPasses verifies that when InitiallyUnhealthy
|
||||
// is true but the host already has enough passes (e.g., across a reload),
|
||||
// it starts healthy.
|
||||
func TestInitiallyUnhealthyWithPriorPasses(t *testing.T) {
|
||||
host := new(Host)
|
||||
host.countHealthPass(1) // simulate a prior health check pass
|
||||
|
||||
upstream := &Upstream{
|
||||
Dial: "10.4.0.2:80",
|
||||
Host: host,
|
||||
}
|
||||
|
||||
passes := 1
|
||||
upstream.setHealthy(upstream.activeHealthPasses() >= passes)
|
||||
|
||||
if !upstream.healthy() {
|
||||
t.Error("upstream should be healthy when it has enough prior passes, even with InitiallyUnhealthy=true")
|
||||
}
|
||||
}
|
||||
|
||||
// TestInitiallyHealthyDefault verifies the default behavior: upstreams
|
||||
// start healthy unless they have accumulated enough failures.
|
||||
func TestInitiallyHealthyDefault(t *testing.T) {
|
||||
upstream := &Upstream{
|
||||
Dial: "10.4.0.3:80",
|
||||
Host: new(Host),
|
||||
}
|
||||
|
||||
// Default behavior: healthy unless fails >= threshold
|
||||
fails := 1
|
||||
upstream.setHealthy(upstream.activeHealthFails() < fails)
|
||||
|
||||
if !upstream.healthy() {
|
||||
t.Error("upstream should be healthy by default when no failures recorded")
|
||||
}
|
||||
}
|
||||
|
||||
// TestInitiallyHealthyDefaultWithPriorFails verifies that an upstream
|
||||
// with prior failures (e.g., from before a reload) starts unhealthy.
|
||||
func TestInitiallyHealthyDefaultWithPriorFails(t *testing.T) {
|
||||
host := new(Host)
|
||||
host.countHealthFail(1) // simulate a prior failure
|
||||
|
||||
upstream := &Upstream{
|
||||
Dial: "10.4.0.4:80",
|
||||
Host: host,
|
||||
}
|
||||
|
||||
fails := 1
|
||||
upstream.setHealthy(upstream.activeHealthFails() < fails)
|
||||
|
||||
if upstream.healthy() {
|
||||
t.Error("upstream should be unhealthy when it has prior failures >= threshold")
|
||||
}
|
||||
}
|
||||
|
|
@ -514,6 +514,18 @@ func (h *Handler) UnmarshalCaddyfile(d *caddyfile.Dispenser) error {
|
|||
}
|
||||
h.HealthChecks.Active.FollowRedirects = true
|
||||
|
||||
case "health_initially_unhealthy":
|
||||
if d.NextArg() {
|
||||
return d.ArgErr()
|
||||
}
|
||||
if h.HealthChecks == nil {
|
||||
h.HealthChecks = new(HealthChecks)
|
||||
}
|
||||
if h.HealthChecks.Active == nil {
|
||||
h.HealthChecks.Active = new(ActiveHealthChecks)
|
||||
}
|
||||
h.HealthChecks.Active.InitiallyUnhealthy = true
|
||||
|
||||
case "health_passes":
|
||||
if !d.NextArg() {
|
||||
return d.ArgErr()
|
||||
|
|
|
|||
|
|
@ -16,6 +16,7 @@ package reverseproxy
|
|||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"net"
|
||||
|
|
@ -127,6 +128,9 @@ type ActiveHealthChecks struct {
|
|||
// body of a healthy backend.
|
||||
ExpectBody string `json:"expect_body,omitempty"`
|
||||
|
||||
// Whether backends are initially considered unhealthy.
|
||||
InitiallyUnhealthy bool `json:"initially_unhealthy,omitempty"`
|
||||
|
||||
uri *url.URL
|
||||
httpClient *http.Client
|
||||
bodyRegexp *regexp.Regexp
|
||||
|
|
@ -185,7 +189,21 @@ func (a *ActiveHealthChecks) Provision(ctx caddy.Context, h *Handler) error {
|
|||
},
|
||||
}
|
||||
|
||||
if a.Passes < 1 {
|
||||
a.Passes = 1
|
||||
}
|
||||
|
||||
if a.Fails < 1 {
|
||||
a.Fails = 1
|
||||
}
|
||||
|
||||
for _, upstream := range h.Upstreams {
|
||||
if a.InitiallyUnhealthy {
|
||||
upstream.setHealthy(upstream.activeHealthPasses() >= a.Passes)
|
||||
} else {
|
||||
upstream.setHealthy(upstream.activeHealthFails() < a.Fails)
|
||||
}
|
||||
|
||||
// if there's an alternative upstream for health-check provided in the config,
|
||||
// then use it, otherwise use the upstream's dial address. if upstream is used,
|
||||
// then the port is ignored.
|
||||
|
|
@ -210,14 +228,6 @@ func (a *ActiveHealthChecks) Provision(ctx caddy.Context, h *Handler) error {
|
|||
}
|
||||
}
|
||||
|
||||
if a.Passes < 1 {
|
||||
a.Passes = 1
|
||||
}
|
||||
|
||||
if a.Fails < 1 {
|
||||
a.Fails = 1
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
|
|
@ -477,7 +487,6 @@ func (h *Handler) doActiveHealthCheck(dialInfo DialInfo, hostAddr string, networ
|
|||
// dispatch an event that the host newly became unhealthy
|
||||
if upstream.setHealthy(false) {
|
||||
h.events.Emit(h.ctx, "unhealthy", map[string]any{"host": hostAddr})
|
||||
upstream.Host.resetHealth()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -500,7 +509,6 @@ func (h *Handler) doActiveHealthCheck(dialInfo DialInfo, hostAddr string, networ
|
|||
c.Write(zap.String("host", hostAddr))
|
||||
}
|
||||
h.events.Emit(h.ctx, "healthy", map[string]any{"host": hostAddr})
|
||||
upstream.Host.resetHealth()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -508,6 +516,11 @@ func (h *Handler) doActiveHealthCheck(dialInfo DialInfo, hostAddr string, networ
|
|||
// do the request, being careful to tame the response body
|
||||
resp, err := h.HealthChecks.Active.httpClient.Do(req) //nolint:gosec // no SSRF
|
||||
if err != nil {
|
||||
if errors.Is(err, context.Canceled) {
|
||||
// context was canceled, so don't count this as a failure
|
||||
return nil
|
||||
}
|
||||
|
||||
if c := h.HealthChecks.Active.logger.Check(zapcore.InfoLevel, "HTTP request failed"); c != nil {
|
||||
c.Write(
|
||||
zap.String("host", hostAddr),
|
||||
|
|
|
|||
|
|
@ -227,6 +227,7 @@ func (h *Host) countHealthPass(delta int) error {
|
|||
if result < 0 {
|
||||
return fmt.Errorf("count below 0: %d", result)
|
||||
}
|
||||
h.activeFails.Store(0)
|
||||
return nil
|
||||
}
|
||||
|
||||
|
|
@ -237,13 +238,8 @@ func (h *Host) countHealthFail(delta int) error {
|
|||
if result < 0 {
|
||||
return fmt.Errorf("count below 0: %d", result)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// resetHealth resets the health check counters.
|
||||
func (h *Host) resetHealth() {
|
||||
h.activePasses.Store(0)
|
||||
h.activeFails.Store(0)
|
||||
return nil
|
||||
}
|
||||
|
||||
// healthy returns true if the upstream is not actively marked as unhealthy.
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue