feat(M48): continuous TLS health monitoring — endpoint state machine, shared tlsprobe, 8 API endpoints, GUI

Adds continuous TLS endpoint health monitoring that closes the deploy→verify→monitor loop.
After M25 verifies a deployment succeeded once, M48 continuously confirms it stays healthy.

Key components:
- Shared `internal/tlsprobe/` package extracted from network scanner for reuse
- Health status state machine: healthy → degraded (2 failures) → down (5 failures),
  plus cert_mismatch when served fingerprint differs from expected
- 8th scheduler loop (60s tick, per-endpoint configurable intervals)
- PostgreSQL migration 000011: endpoint_health_checks + endpoint_health_history tables
- 8 REST API endpoints (CRUD, history, acknowledge, summary)
- Health Monitor GUI page with summary bar, status table, create modal, auto-refresh
- 38 new tests (5 tlsprobe + 11 domain + 10 service + 8 handler + 4 frontend)
- All coverage thresholds maintained (service 68%, handler 83%, domain 87%, middleware 63%)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
shankar0123
2026-04-15 21:45:45 -04:00
parent f2e60b93a3
commit 596d86a206
29 changed files with 3540 additions and 30 deletions
+69
View File
@@ -40,6 +40,11 @@ type DigestServicer interface {
ProcessDigest(ctx context.Context) error
}
// HealthCheckServicer defines the interface for endpoint TLS health monitoring used by the scheduler.
type HealthCheckServicer interface {
RunHealthChecks(ctx context.Context) error
}
// Scheduler manages background jobs and periodic tasks for the certificate control plane.
// It runs multiple concurrent loops for renewal checks, job processing, agent health checks,
// and notification processing.
@@ -50,6 +55,7 @@ type Scheduler struct {
notificationService NotificationServicer
networkScanService NetworkScanServicer
digestService DigestServicer
healthCheckService HealthCheckServicer
logger *slog.Logger
// Configurable tick intervals
@@ -60,6 +66,7 @@ type Scheduler struct {
shortLivedExpiryCheckInterval time.Duration
networkScanInterval time.Duration
digestInterval time.Duration
healthCheckInterval time.Duration
// Idempotency guards: prevent duplicate execution of slow jobs
renewalCheckRunning atomic.Bool
@@ -69,6 +76,7 @@ type Scheduler struct {
shortLivedExpiryCheckRunning atomic.Bool
networkScanRunning atomic.Bool
digestRunning atomic.Bool
healthCheckRunning atomic.Bool
// Graceful shutdown: wait for in-flight work to complete
wg sync.WaitGroup
@@ -99,6 +107,7 @@ func NewScheduler(
shortLivedExpiryCheckInterval: 30 * time.Second,
networkScanInterval: 6 * time.Hour,
digestInterval: 24 * time.Hour,
healthCheckInterval: 60 * time.Second,
}
}
@@ -143,6 +152,17 @@ func (s *Scheduler) SetShortLivedExpiryCheckInterval(d time.Duration) {
s.shortLivedExpiryCheckInterval = d
}
// SetHealthCheckService sets the health check service for the 8th scheduler loop.
// Called after construction since health monitoring is optional.
func (s *Scheduler) SetHealthCheckService(hcs HealthCheckServicer) {
s.healthCheckService = hcs
}
// SetHealthCheckInterval configures the interval for endpoint TLS health checks.
func (s *Scheduler) SetHealthCheckInterval(d time.Duration) {
s.healthCheckInterval = d
}
// Start initiates all background scheduler loops. It returns a channel that signals
// when the scheduler has started all loops. The scheduler runs until the context is cancelled.
func (s *Scheduler) Start(ctx context.Context) <-chan struct{} {
@@ -160,6 +180,9 @@ func (s *Scheduler) Start(ctx context.Context) <-chan struct{} {
if s.digestService != nil {
loopCount++
}
if s.healthCheckService != nil {
loopCount++
}
s.wg.Add(loopCount)
go func() { defer s.wg.Done(); s.renewalCheckLoop(ctx) }()
@@ -173,6 +196,9 @@ func (s *Scheduler) Start(ctx context.Context) <-chan struct{} {
if s.digestService != nil {
go func() { defer s.wg.Done(); s.digestLoop(ctx) }()
}
if s.healthCheckService != nil {
go func() { defer s.wg.Done(); s.healthCheckLoop(ctx) }()
}
// Signal that all loops are launched
close(startedChan)
@@ -517,6 +543,49 @@ func (s *Scheduler) runDigest(ctx context.Context) {
}
}
// healthCheckLoop runs every healthCheckInterval and performs endpoint TLS health checks.
// Do NOT run immediately on start — health checks are frequent (60s default) and may be
// resource-intensive. Wait for the first tick.
// Uses atomic.Bool to prevent duplicate execution if the previous check is still running.
func (s *Scheduler) healthCheckLoop(ctx context.Context) {
ticker := time.NewTicker(s.healthCheckInterval)
defer ticker.Stop()
// Do NOT run immediately on start for health checks — wait for the first tick.
// Health checks are frequent and shouldn't fire on every restart.
for {
select {
case <-ctx.Done():
return
case <-ticker.C:
if !s.healthCheckRunning.CompareAndSwap(false, true) {
s.logger.Debug("health check still running, skipping tick")
continue
}
s.wg.Add(1)
go func() {
defer s.wg.Done()
defer s.healthCheckRunning.Store(false)
s.runHealthCheck(ctx)
}()
}
}
}
// runHealthCheck executes a single health check cycle with error recovery.
func (s *Scheduler) runHealthCheck(ctx context.Context) {
opCtx, cancel := context.WithTimeout(ctx, 5*time.Minute)
defer cancel()
if err := s.healthCheckService.RunHealthChecks(opCtx); err != nil {
s.logger.Error("health check run failed",
"error", err,
"interval", s.healthCheckInterval.String())
} else {
s.logger.Debug("health check completed")
}
}
// WaitForCompletion waits for all in-flight scheduler work to complete.
// It respects the provided timeout and returns an error if work is still in progress after timeout.
// Call this after the scheduler context has been cancelled to ensure graceful shutdown.