feat(M48): continuous TLS health monitoring — endpoint state machine, shared tlsprobe, 8 API endpoints, GUI

Adds continuous TLS endpoint health monitoring that closes the deploy→verify→monitor loop.
After M25 verifies a deployment succeeded once, M48 continuously confirms it stays healthy.

Key components:
- Shared `internal/tlsprobe/` package extracted from network scanner for reuse
- Health status state machine: healthy → degraded (2 failures) → down (5 failures),
  plus cert_mismatch when served fingerprint differs from expected
- 8th scheduler loop (60s tick, per-endpoint configurable intervals)
- PostgreSQL migration 000011: endpoint_health_checks + endpoint_health_history tables
- 8 REST API endpoints (CRUD, history, acknowledge, summary)
- Health Monitor GUI page with summary bar, status table, create modal, auto-refresh
- 38 new tests (5 tlsprobe + 11 domain + 10 service + 8 handler + 4 frontend)
- All coverage thresholds maintained (service 68%, handler 83%, domain 87%, middleware 63%)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
shankar0123
2026-04-15 21:45:45 -04:00
parent f2e60b93a3
commit 596d86a206
29 changed files with 3540 additions and 30 deletions
+50
View File
@@ -32,6 +32,7 @@ type Config struct {
GoogleCAS GoogleCASConfig
AWSACMPCA AWSACMPCAConfig
Digest DigestConfig
HealthCheck HealthCheckConfig
Encryption EncryptionConfig
}
@@ -319,6 +320,46 @@ type DigestConfig struct {
Recipients []string
}
// HealthCheckConfig contains configuration for continuous TLS health monitoring (M48).
type HealthCheckConfig struct {
// Enabled controls whether health checks are enabled.
// Default: false.
// Setting: CERTCTL_HEALTH_CHECK_ENABLED environment variable.
Enabled bool
// CheckInterval is the main scheduler loop interval for polling due checks.
// Default: 60 seconds. Each endpoint has its own check_interval_seconds.
// Setting: CERTCTL_HEALTH_CHECK_INTERVAL environment variable.
CheckInterval time.Duration
// DefaultInterval is the default probe interval in seconds for each endpoint (per-endpoint basis).
// Default: 300 seconds (5 minutes).
// Setting: CERTCTL_HEALTH_CHECK_DEFAULT_INTERVAL environment variable.
DefaultInterval int
// DefaultTimeout is the default TLS connection timeout in milliseconds.
// Default: 5000 milliseconds (5 seconds).
// Setting: CERTCTL_HEALTH_CHECK_DEFAULT_TIMEOUT environment variable.
DefaultTimeout int
// MaxConcurrent is the maximum number of concurrent TLS probes.
// Default: 20.
// Setting: CERTCTL_HEALTH_CHECK_MAX_CONCURRENT environment variable.
MaxConcurrent int
// HistoryRetention controls how long probe history records are kept.
// Default: 30 days. Older records are purged by the scheduler.
// Setting: CERTCTL_HEALTH_CHECK_HISTORY_RETENTION environment variable.
HistoryRetention time.Duration
// AutoCreate controls whether health checks are auto-created when:
// - A deployment job completes with verification success
// - A network scan target has health_check_enabled=true
// Default: true.
// Setting: CERTCTL_HEALTH_CHECK_AUTO_CREATE environment variable.
AutoCreate bool
}
// ACMEConfig contains ACME issuer connector configuration.
type ACMEConfig struct {
// DirectoryURL is the ACME directory URL for certificate issuance.
@@ -678,6 +719,15 @@ func Load() (*Config, error) {
Interval: getEnvDuration("CERTCTL_DIGEST_INTERVAL", 24*time.Hour),
Recipients: getEnvList("CERTCTL_DIGEST_RECIPIENTS", nil),
},
HealthCheck: HealthCheckConfig{
Enabled: getEnvBool("CERTCTL_HEALTH_CHECK_ENABLED", false),
CheckInterval: getEnvDuration("CERTCTL_HEALTH_CHECK_INTERVAL", 60*time.Second),
DefaultInterval: getEnvInt("CERTCTL_HEALTH_CHECK_DEFAULT_INTERVAL", 300),
DefaultTimeout: getEnvInt("CERTCTL_HEALTH_CHECK_DEFAULT_TIMEOUT", 5000),
MaxConcurrent: getEnvInt("CERTCTL_HEALTH_CHECK_MAX_CONCURRENT", 20),
HistoryRetention: getEnvDuration("CERTCTL_HEALTH_CHECK_HISTORY_RETENTION", 30*24*time.Hour),
AutoCreate: getEnvBool("CERTCTL_HEALTH_CHECK_AUTO_CREATE", true),
},
Encryption: EncryptionConfig{
ConfigEncryptionKey: getEnv("CERTCTL_CONFIG_ENCRYPTION_KEY", ""),
},