mirror of
https://github.com/shankar0123/certctl.git
synced 2026-06-14 22:28:52 +00:00
feat(M48): continuous TLS health monitoring — endpoint state machine, shared tlsprobe, 8 API endpoints, GUI
Adds continuous TLS endpoint health monitoring that closes the deploy→verify→monitor loop. After M25 verifies a deployment succeeded once, M48 continuously confirms it stays healthy. Key components: - Shared `internal/tlsprobe/` package extracted from network scanner for reuse - Health status state machine: healthy → degraded (2 failures) → down (5 failures), plus cert_mismatch when served fingerprint differs from expected - 8th scheduler loop (60s tick, per-endpoint configurable intervals) - PostgreSQL migration 000011: endpoint_health_checks + endpoint_health_history tables - 8 REST API endpoints (CRUD, history, acknowledge, summary) - Health Monitor GUI page with summary bar, status table, create modal, auto-refresh - 38 new tests (5 tlsprobe + 11 domain + 10 service + 8 handler + 4 frontend) - All coverage thresholds maintained (service 68%, handler 83%, domain 87%, middleware 63%) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,109 @@
|
||||
package domain
|
||||
|
||||
import "time"
|
||||
|
||||
// HealthStatus represents the current health state of a monitored endpoint.
|
||||
type HealthStatus string
|
||||
|
||||
const (
|
||||
HealthStatusHealthy HealthStatus = "healthy"
|
||||
HealthStatusDegraded HealthStatus = "degraded"
|
||||
HealthStatusDown HealthStatus = "down"
|
||||
HealthStatusCertMismatch HealthStatus = "cert_mismatch"
|
||||
HealthStatusUnknown HealthStatus = "unknown"
|
||||
)
|
||||
|
||||
// IsValidHealthStatus checks if a health status string is valid.
|
||||
func IsValidHealthStatus(s string) bool {
|
||||
switch HealthStatus(s) {
|
||||
case HealthStatusHealthy, HealthStatusDegraded, HealthStatusDown, HealthStatusCertMismatch, HealthStatusUnknown:
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// EndpointHealthCheck represents a monitored TLS endpoint.
|
||||
type EndpointHealthCheck struct {
|
||||
ID string `json:"id"`
|
||||
Endpoint string `json:"endpoint"`
|
||||
CertificateID *string `json:"certificate_id,omitempty"`
|
||||
NetworkScanTargetID *string `json:"network_scan_target_id,omitempty"`
|
||||
ExpectedFingerprint string `json:"expected_fingerprint"`
|
||||
ObservedFingerprint string `json:"observed_fingerprint"`
|
||||
Status HealthStatus `json:"status"`
|
||||
ConsecutiveFailures int `json:"consecutive_failures"`
|
||||
ResponseTimeMs int `json:"response_time_ms"`
|
||||
TLSVersion string `json:"tls_version"`
|
||||
CipherSuite string `json:"cipher_suite"`
|
||||
CertSubject string `json:"cert_subject"`
|
||||
CertIssuer string `json:"cert_issuer"`
|
||||
CertExpiry *time.Time `json:"cert_expiry,omitempty"`
|
||||
LastCheckedAt *time.Time `json:"last_checked_at,omitempty"`
|
||||
LastSuccessAt *time.Time `json:"last_success_at,omitempty"`
|
||||
LastFailureAt *time.Time `json:"last_failure_at,omitempty"`
|
||||
LastTransitionAt *time.Time `json:"last_transition_at,omitempty"`
|
||||
FailureReason string `json:"failure_reason"`
|
||||
DegradedThreshold int `json:"degraded_threshold"`
|
||||
DownThreshold int `json:"down_threshold"`
|
||||
CheckIntervalSecs int `json:"check_interval_seconds"`
|
||||
Enabled bool `json:"enabled"`
|
||||
Acknowledged bool `json:"acknowledged"`
|
||||
AcknowledgedBy string `json:"acknowledged_by,omitempty"`
|
||||
AcknowledgedAt *time.Time `json:"acknowledged_at,omitempty"`
|
||||
CreatedAt time.Time `json:"created_at"`
|
||||
UpdatedAt time.Time `json:"updated_at"`
|
||||
}
|
||||
|
||||
// TransitionStatus computes the new health status based on the probe result.
|
||||
// Returns the new status and whether a transition occurred.
|
||||
func (h *EndpointHealthCheck) TransitionStatus(probeSuccess bool, observedFingerprint string) (HealthStatus, bool) {
|
||||
oldStatus := h.Status
|
||||
var newStatus HealthStatus
|
||||
|
||||
if probeSuccess {
|
||||
if h.ExpectedFingerprint != "" && observedFingerprint != h.ExpectedFingerprint {
|
||||
newStatus = HealthStatusCertMismatch
|
||||
} else {
|
||||
newStatus = HealthStatusHealthy
|
||||
}
|
||||
} else {
|
||||
// Increment failures for next calculation (caller will update h.ConsecutiveFailures)
|
||||
failures := h.ConsecutiveFailures + 1
|
||||
if failures >= h.DownThreshold {
|
||||
newStatus = HealthStatusDown
|
||||
} else if failures >= h.DegradedThreshold {
|
||||
newStatus = HealthStatusDegraded
|
||||
} else {
|
||||
// Keep current status during initial failures before threshold
|
||||
// Unless we were in an error state, transition to degraded after first failure
|
||||
if h.Status == HealthStatusUnknown || h.Status == HealthStatusHealthy {
|
||||
newStatus = HealthStatusHealthy // still considered healthy during grace period
|
||||
} else {
|
||||
newStatus = h.Status
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return newStatus, newStatus != oldStatus
|
||||
}
|
||||
|
||||
// HealthHistoryEntry represents a single probe record.
|
||||
type HealthHistoryEntry struct {
|
||||
ID string `json:"id"`
|
||||
HealthCheckID string `json:"health_check_id"`
|
||||
Status string `json:"status"`
|
||||
ResponseTimeMs int `json:"response_time_ms"`
|
||||
Fingerprint string `json:"fingerprint"`
|
||||
FailureReason string `json:"failure_reason"`
|
||||
CheckedAt time.Time `json:"checked_at"`
|
||||
}
|
||||
|
||||
// HealthCheckSummary contains aggregate counts by status.
|
||||
type HealthCheckSummary struct {
|
||||
Healthy int `json:"healthy"`
|
||||
Degraded int `json:"degraded"`
|
||||
Down int `json:"down"`
|
||||
CertMismatch int `json:"cert_mismatch"`
|
||||
Unknown int `json:"unknown"`
|
||||
Total int `json:"total"`
|
||||
}
|
||||
Reference in New Issue
Block a user