feat(M48): continuous TLS health monitoring — endpoint state machine, shared tlsprobe, 8 API endpoints, GUI

Adds continuous TLS endpoint health monitoring that closes the deploy→verify→monitor loop.
After M25 verifies a deployment succeeded once, M48 continuously confirms it stays healthy.

Key components:
- Shared `internal/tlsprobe/` package extracted from network scanner for reuse
- Health status state machine: healthy → degraded (2 failures) → down (5 failures),
  plus cert_mismatch when served fingerprint differs from expected
- 8th scheduler loop (60s tick, per-endpoint configurable intervals)
- PostgreSQL migration 000011: endpoint_health_checks + endpoint_health_history tables
- 8 REST API endpoints (CRUD, history, acknowledge, summary)
- Health Monitor GUI page with summary bar, status table, create modal, auto-refresh
- 38 new tests (5 tlsprobe + 11 domain + 10 service + 8 handler + 4 frontend)
- All coverage thresholds maintained (service 68%, handler 83%, domain 87%, middleware 63%)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
shankar0123
2026-04-15 21:45:45 -04:00
parent f2e60b93a3
commit 596d86a206
29 changed files with 3540 additions and 30 deletions
+42
View File
@@ -277,3 +277,45 @@ type OwnerRepository interface {
// Delete removes an owner.
Delete(ctx context.Context, id string) error
}
// HealthCheckRepository is the persistence contract for endpoint health
// checks and for the probe history recorded against them.
type HealthCheckRepository interface {
	// --- Single-record operations ---

	// Get looks up one health check by its ID.
	Get(ctx context.Context, id string) (*domain.EndpointHealthCheck, error)
	// GetByEndpoint looks up one health check by its endpoint address.
	GetByEndpoint(ctx context.Context, endpoint string) (*domain.EndpointHealthCheck, error)
	// Create persists a new health check.
	Create(ctx context.Context, check *domain.EndpointHealthCheck) error
	// Update persists changes to a health check that already exists.
	Update(ctx context.Context, check *domain.EndpointHealthCheck) error
	// Delete removes the health check with the given ID.
	Delete(ctx context.Context, id string) error

	// --- Multi-record queries ---

	// List returns the paginated set of health checks selected by filter.
	// NOTE(review): the int result is presumably the total number of matches
	// across all pages; confirm against the implementation.
	List(ctx context.Context, filter *HealthCheckFilter) ([]*domain.EndpointHealthCheck, int, error)
	// ListDueForCheck returns the health checks whose probe interval has been
	// exceeded and that therefore need to be probed.
	ListDueForCheck(ctx context.Context) ([]*domain.EndpointHealthCheck, error)
	// GetSummary returns aggregate counts broken down by health status.
	GetSummary(ctx context.Context) (*domain.HealthCheckSummary, error)

	// --- Probe history ---

	// RecordHistory stores the result of a single probe as a history entry.
	RecordHistory(ctx context.Context, entry *domain.HealthHistoryEntry) error
	// GetHistory returns recent probe history for the health check identified
	// by healthCheckID; limit bounds how many entries are requested.
	GetHistory(ctx context.Context, healthCheckID string, limit int) ([]*domain.HealthHistoryEntry, error)
	// PurgeHistory deletes history entries older than olderThan.
	// NOTE(review): the int64 result is presumably the number of entries
	// deleted; confirm against the implementation.
	PurgeHistory(ctx context.Context, olderThan time.Time) (int64, error)
}
// HealthCheckFilter carries the filtering and pagination parameters used when
// querying health checks.
type HealthCheckFilter struct {
	// Status selects health checks by health status. Documented values:
	// healthy, degraded, down, cert_mismatch, unknown.
	Status string

	// CertificateID selects health checks by managed certificate ID.
	CertificateID string

	// NetworkScanTargetID selects health checks by network scan target ID.
	NetworkScanTargetID string

	// Enabled selects health checks by enabled/disabled state. A nil pointer
	// means no filtering on this field (all health checks).
	Enabled *bool

	// Page is the requested page number; numbering starts at 1.
	Page int

	// PerPage is the number of results returned per page.
	PerPage int
}