feat(M48): continuous TLS health monitoring — endpoint state machine, shared tlsprobe, 8 API endpoints, GUI

Adds continuous TLS endpoint health monitoring that closes the deploy→verify→monitor loop.
After M25 verifies a deployment succeeded once, M48 continuously confirms it stays healthy.

Key components:
- Shared `internal/tlsprobe/` package extracted from network scanner for reuse
- Health status state machine: healthy → degraded (2 consecutive failures) → down (5 consecutive failures),
  plus cert_mismatch when a successful probe serves a fingerprint that differs from the expected one
- 8th scheduler loop (60s tick, per-endpoint configurable intervals)
- PostgreSQL migration 000011: endpoint_health_checks + endpoint_health_history tables
- 8 REST API endpoints (CRUD, history, acknowledge, summary)
- Health Monitor GUI page with summary bar, status table, create modal, auto-refresh
- 38 new tests (5 tlsprobe + 11 domain + 10 service + 8 handler + 4 frontend)
- All coverage thresholds maintained (service 68%, handler 83%, domain 87%, middleware 63%)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
shankar0123
2026-04-15 21:45:45 -04:00
parent f2e60b93a3
commit 596d86a206
29 changed files with 3540 additions and 30 deletions
+109
View File
@@ -0,0 +1,109 @@
package domain
import "time"
// HealthStatus represents the current health state of a monitored endpoint.
// It is a string type, so values serialize as the literals declared below.
type HealthStatus string
// The set of health states. IsValidHealthStatus accepts exactly these strings
// (case-sensitive).
const (
// HealthStatusHealthy is what TransitionStatus returns for a successful probe
// whose fingerprint matches, or when no expected fingerprint is configured.
HealthStatusHealthy HealthStatus = "healthy"
// HealthStatusDegraded is what TransitionStatus returns once the failure count
// reaches DegradedThreshold but is still below DownThreshold.
HealthStatusDegraded HealthStatus = "degraded"
// HealthStatusDown is what TransitionStatus returns once the failure count
// reaches DownThreshold.
HealthStatusDown HealthStatus = "down"
// HealthStatusCertMismatch is what TransitionStatus returns when a probe
// succeeds but the observed fingerprint differs from a non-empty expected one.
HealthStatusCertMismatch HealthStatus = "cert_mismatch"
// HealthStatusUnknown is never produced by a threshold or fingerprint rule in
// TransitionStatus.
// NOTE(review): presumably the initial state before any probe has run — confirm
// against the code that creates health checks.
HealthStatusUnknown HealthStatus = "unknown"
)
// IsValidHealthStatus reports whether s is exactly one of the declared
// HealthStatus values. The match is case-sensitive, and the empty string is
// not a valid status.
func IsValidHealthStatus(s string) bool {
	known := [...]HealthStatus{
		HealthStatusHealthy,
		HealthStatusDegraded,
		HealthStatusDown,
		HealthStatusCertMismatch,
		HealthStatusUnknown,
	}
	candidate := HealthStatus(s)
	for _, status := range known {
		if candidate == status {
			return true
		}
	}
	return false
}
// EndpointHealthCheck represents a monitored TLS endpoint: its identity, the
// values recorded from probing it, the thresholds TransitionStatus applies,
// and acknowledgement and timestamp metadata. Pointer fields tagged omitempty
// are left out of the JSON encoding when nil.
type EndpointHealthCheck struct {
ID string `json:"id"`
Endpoint string `json:"endpoint"`
// Optional references; nil when unset.
CertificateID *string `json:"certificate_id,omitempty"`
NetworkScanTargetID *string `json:"network_scan_target_id,omitempty"`
// ExpectedFingerprint, when non-empty, is compared by TransitionStatus against
// the fingerprint observed on a successful probe. An empty value skips the
// comparison.
ExpectedFingerprint string `json:"expected_fingerprint"`
ObservedFingerprint string `json:"observed_fingerprint"`
// Status is the current state; TransitionStatus compares its result against
// this value to decide whether a transition occurred.
Status HealthStatus `json:"status"`
// ConsecutiveFailures is the failure count before the probe being evaluated.
// TransitionStatus reads it but does not update it.
ConsecutiveFailures int `json:"consecutive_failures"`
// NOTE(review): presumably milliseconds, per the name — confirm against the prober.
ResponseTimeMs int `json:"response_time_ms"`
TLSVersion string `json:"tls_version"`
CipherSuite string `json:"cipher_suite"`
CertSubject string `json:"cert_subject"`
CertIssuer string `json:"cert_issuer"`
// Optional timestamps; nil when unset.
CertExpiry *time.Time `json:"cert_expiry,omitempty"`
LastCheckedAt *time.Time `json:"last_checked_at,omitempty"`
LastSuccessAt *time.Time `json:"last_success_at,omitempty"`
LastFailureAt *time.Time `json:"last_failure_at,omitempty"`
LastTransitionAt *time.Time `json:"last_transition_at,omitempty"`
FailureReason string `json:"failure_reason"`
// DegradedThreshold and DownThreshold are the failure counts (including the
// probe being evaluated) at which TransitionStatus reports degraded and down.
// DownThreshold is checked first.
DegradedThreshold int `json:"degraded_threshold"`
DownThreshold int `json:"down_threshold"`
// CheckIntervalSecs is serialized under the key "check_interval_seconds".
CheckIntervalSecs int `json:"check_interval_seconds"`
Enabled bool `json:"enabled"`
// Acknowledgement metadata; AcknowledgedBy and AcknowledgedAt are omitted from
// JSON when empty or nil.
Acknowledged bool `json:"acknowledged"`
AcknowledgedBy string `json:"acknowledged_by,omitempty"`
AcknowledgedAt *time.Time `json:"acknowledged_at,omitempty"`
CreatedAt time.Time `json:"created_at"`
UpdatedAt time.Time `json:"updated_at"`
}
// TransitionStatus computes the health status that results from a single probe
// and reports whether it differs from the current h.Status.
//
// probeSuccess says whether the probe succeeded. observedFingerprint is the
// fingerprint the probe saw; it is only consulted when probeSuccess is true.
//
// Rules, evaluated in order:
//   - success with a non-empty ExpectedFingerprint that differs from
//     observedFingerprint: cert_mismatch
//   - any other success: healthy
//   - failure with ConsecutiveFailures+1 >= DownThreshold: down
//   - failure with ConsecutiveFailures+1 >= DegradedThreshold: degraded
//   - failure below both thresholds: the current status is kept
//
// The method only reads h. Storing the returned status and the incremented
// failure count is left to the caller.
func (h *EndpointHealthCheck) TransitionStatus(probeSuccess bool, observedFingerprint string) (HealthStatus, bool) {
	// Failure count as it will stand once this probe is recorded. Only the
	// failure branches below use it.
	failures := h.ConsecutiveFailures + 1

	var newStatus HealthStatus
	switch {
	case probeSuccess && h.ExpectedFingerprint != "" && observedFingerprint != h.ExpectedFingerprint:
		newStatus = HealthStatusCertMismatch
	case probeSuccess:
		newStatus = HealthStatusHealthy
	case failures >= h.DownThreshold:
		newStatus = HealthStatusDown
	case failures >= h.DegradedThreshold:
		newStatus = HealthStatusDegraded
	default:
		// Grace period: keep whatever state the endpoint is in. A failed probe
		// must never improve the status, so an endpoint that is still Unknown
		// stays Unknown instead of being reported as healthy.
		newStatus = h.Status
	}
	return newStatus, newStatus != h.Status
}
// HealthHistoryEntry represents a single probe record tied to a health check
// through HealthCheckID.
type HealthHistoryEntry struct {
ID string `json:"id"`
HealthCheckID string `json:"health_check_id"`
// Status is a plain string here, not a HealthStatus.
// NOTE(review): presumably holds one of the HealthStatus values — confirm
// against the code that writes history rows.
Status string `json:"status"`
// NOTE(review): presumably milliseconds, per the name — confirm.
ResponseTimeMs int `json:"response_time_ms"`
Fingerprint string `json:"fingerprint"`
FailureReason string `json:"failure_reason"`
CheckedAt time.Time `json:"checked_at"`
}
// HealthCheckSummary contains aggregate counts by status: one counter for each
// declared HealthStatus value, plus Total.
type HealthCheckSummary struct {
Healthy int `json:"healthy"`
Degraded int `json:"degraded"`
Down int `json:"down"`
CertMismatch int `json:"cert_mismatch"`
Unknown int `json:"unknown"`
// NOTE(review): presumably the sum of the per-status counters; nothing in this
// type enforces that — confirm against the code that fills it in.
Total int `json:"total"`
}
+237
View File
@@ -0,0 +1,237 @@
package domain
import (
"testing"
"time"
)
// TestIsValidHealthStatus checks that every declared status string is accepted
// and that unknown, empty, and wrongly-cased strings are rejected.
func TestIsValidHealthStatus(t *testing.T) {
	type testCase struct {
		input string
		want  bool
	}
	cases := []testCase{
		{input: "healthy", want: true},
		{input: "degraded", want: true},
		{input: "down", want: true},
		{input: "cert_mismatch", want: true},
		{input: "unknown", want: true},
		{input: "invalid", want: false},
		{input: "", want: false},
		{input: "HEALTHY", want: false},
	}

	for i := range cases {
		tc := cases[i]
		t.Run(tc.input, func(t *testing.T) {
			if got := IsValidHealthStatus(tc.input); got != tc.want {
				t.Errorf("IsValidHealthStatus(%q) = %v, want %v", tc.input, got, tc.want)
			}
		})
	}
}
// TestTransitionStatus_HealthyProbe checks that a successful probe with a
// matching fingerprint moves an unknown endpoint to healthy and reports a
// transition.
func TestTransitionStatus_HealthyProbe(t *testing.T) {
	const fingerprint = "abc123"
	check := EndpointHealthCheck{
		Status:              HealthStatusUnknown,
		ConsecutiveFailures: 0,
		DegradedThreshold:   2,
		DownThreshold:       5,
		ExpectedFingerprint: fingerprint,
	}

	got, changed := check.TransitionStatus(true, fingerprint)

	if got != HealthStatusHealthy {
		t.Errorf("expected HealthStatusHealthy, got %s", got)
	}
	if !changed {
		t.Errorf("expected transition=true, got false")
	}
}
// TestTransitionStatus_CertMismatch checks that a successful probe serving a
// fingerprint other than the expected one yields cert_mismatch and reports a
// transition.
func TestTransitionStatus_CertMismatch(t *testing.T) {
	var check EndpointHealthCheck
	check.Status = HealthStatusHealthy
	check.ConsecutiveFailures = 0
	check.DegradedThreshold = 2
	check.DownThreshold = 5
	check.ExpectedFingerprint = "abc123"

	got, changed := check.TransitionStatus(true, "xyz789")

	if want := HealthStatusCertMismatch; got != want {
		t.Errorf("expected HealthStatusCertMismatch, got %s", got)
	}
	if !changed {
		t.Errorf("expected transition=true, got false")
	}
}
// TestTransitionStatus_FirstFailure_BelowThreshold checks the grace period: one
// failed probe against a degraded threshold of 2 leaves a healthy endpoint
// healthy, with no transition reported.
func TestTransitionStatus_FirstFailure_BelowThreshold(t *testing.T) {
	check := EndpointHealthCheck{
		Status:              HealthStatusHealthy,
		ConsecutiveFailures: 0,
		DegradedThreshold:   2,
		DownThreshold:       5,
	}

	got, changed := check.TransitionStatus(false, "")

	// One failure is below the degraded threshold of two.
	if got != HealthStatusHealthy {
		t.Errorf("expected HealthStatusHealthy (grace period), got %s", got)
	}
	if changed {
		t.Errorf("expected transition=false (still healthy), got true")
	}
}
// TestTransitionStatus_DegradedThreshold checks that the failure which brings
// the count up to the degraded threshold moves a healthy endpoint to degraded.
func TestTransitionStatus_DegradedThreshold(t *testing.T) {
	const degradedAt = 2
	check := EndpointHealthCheck{
		Status:              HealthStatusHealthy,
		ConsecutiveFailures: degradedAt - 1, // this probe is the failure that reaches the threshold
		DegradedThreshold:   degradedAt,
		DownThreshold:       5,
	}

	got, changed := check.TransitionStatus(false, "")

	if got != HealthStatusDegraded {
		t.Errorf("expected HealthStatusDegraded, got %s", got)
	}
	if !changed {
		t.Errorf("expected transition=true, got false")
	}
}
// TestTransitionStatus_DownThreshold checks that the failure which brings the
// count up to the down threshold moves a degraded endpoint to down.
func TestTransitionStatus_DownThreshold(t *testing.T) {
	const downAt = 5
	check := EndpointHealthCheck{
		Status:              HealthStatusDegraded,
		ConsecutiveFailures: downAt - 1, // this probe is the failure that reaches the threshold
		DegradedThreshold:   2,
		DownThreshold:       downAt,
	}

	got, changed := check.TransitionStatus(false, "")

	if got != HealthStatusDown {
		t.Errorf("expected HealthStatusDown, got %s", got)
	}
	if !changed {
		t.Errorf("expected transition=true, got false")
	}
}
// TestTransitionStatus_Recovery checks that a successful probe with a matching
// fingerprint returns a down endpoint to healthy, whatever its failure count.
func TestTransitionStatus_Recovery(t *testing.T) {
	const fingerprint = "abc123"
	check := &EndpointHealthCheck{
		ExpectedFingerprint: fingerprint,
		Status:              HealthStatusDown,
		ConsecutiveFailures: 10,
		DegradedThreshold:   2,
		DownThreshold:       5,
	}

	got, changed := check.TransitionStatus(true, fingerprint)

	if want := HealthStatusHealthy; got != want {
		t.Errorf("expected HealthStatusHealthy (recovery), got %s", got)
	}
	if !changed {
		t.Errorf("expected transition=true (from down to healthy), got false")
	}
}
// TestTransitionStatus_NoFingerprint checks that with no expected fingerprint
// configured, a successful probe is healthy regardless of what fingerprint was
// observed, and an already-healthy endpoint reports no transition.
func TestTransitionStatus_NoFingerprint(t *testing.T) {
	var check EndpointHealthCheck
	check.Status = HealthStatusHealthy
	check.ConsecutiveFailures = 0
	check.DegradedThreshold = 2
	check.DownThreshold = 5
	check.ExpectedFingerprint = "" // fingerprint comparison disabled

	got, changed := check.TransitionStatus(true, "anything")

	// The observed fingerprint is ignored when nothing is expected.
	if got != HealthStatusHealthy {
		t.Errorf("expected HealthStatusHealthy (no fingerprint check), got %s", got)
	}
	if changed {
		t.Errorf("expected transition=false (already healthy), got true")
	}
}
// TestTransitionStatus_UnknownToHealthy checks that a successful probe moves an
// unknown endpoint with no expected fingerprint to healthy and reports a
// transition.
func TestTransitionStatus_UnknownToHealthy(t *testing.T) {
	check := EndpointHealthCheck{
		Status:              HealthStatusUnknown,
		ConsecutiveFailures: 0,
		DegradedThreshold:   2,
		DownThreshold:       5,
	}

	got, changed := check.TransitionStatus(true, "")

	if want := HealthStatusHealthy; got != want {
		t.Errorf("expected HealthStatusHealthy, got %s", got)
	}
	if !changed {
		t.Errorf("expected transition=true (from unknown to healthy), got false")
	}
}
// TestTransitionStatus_NoTransitionWhenSame checks that a successful probe on an
// already-healthy endpoint returns healthy without reporting a transition.
func TestTransitionStatus_NoTransitionWhenSame(t *testing.T) {
	check := &EndpointHealthCheck{
		DownThreshold:       5,
		DegradedThreshold:   2,
		ConsecutiveFailures: 0,
		Status:              HealthStatusHealthy,
	}

	got, changed := check.TransitionStatus(true, "")

	if got != HealthStatusHealthy {
		t.Errorf("expected HealthStatusHealthy, got %s", got)
	}
	if changed {
		t.Errorf("expected transition=false (already healthy), got true")
	}
}
// TestHealthCheckSummary builds a HealthCheckSummary literal and checks that the
// Total and Healthy fields read back the values they were given.
func TestHealthCheckSummary(t *testing.T) {
	var summary HealthCheckSummary
	summary.Healthy = 5
	summary.Degraded = 2
	summary.Down = 1
	summary.CertMismatch = 1
	summary.Unknown = 0
	summary.Total = 9

	if got := summary.Total; got != 9 {
		t.Errorf("expected Total=9, got %d", got)
	}
	if got := summary.Healthy; got != 5 {
		t.Errorf("expected Healthy=5, got %d", got)
	}
}
// TestHealthHistoryEntry builds a HealthHistoryEntry literal and checks that the
// ID and ResponseTimeMs fields read back the values they were given.
func TestHealthHistoryEntry(t *testing.T) {
	const (
		entryID    = "hh-test-123"
		responseMs = 42
	)
	checkedAt := time.Now()

	entry := HealthHistoryEntry{
		ID:             entryID,
		HealthCheckID:  "hc-test-123",
		Status:         "healthy",
		ResponseTimeMs: responseMs,
		Fingerprint:    "abc123def456",
		FailureReason:  "",
		CheckedAt:      checkedAt,
	}

	if got := entry.ID; got != entryID {
		t.Errorf("expected ID='hh-test-123', got %q", got)
	}
	if got := entry.ResponseTimeMs; got != responseMs {
		t.Errorf("expected ResponseTimeMs=42, got %d", got)
	}
}