mirror of
https://github.com/shankar0123/certctl.git
synced 2026-06-07 12:21:31 +00:00
21aeed4f4e
Phase 0 closure (Path B2, post-rewrite):
addlicense sweep — adds the canonical certctl LLC copyright + BUSL-1.1
SPDX header to every production Go file. Template:
// Copyright 2026 certctl LLC. All rights reserved.
// SPDX-License-Identifier: BUSL-1.1
Coverage: 338 / 338 production Go files (cmd/ + internal/, excluding
*_test.go and **/testdata/**). Pre-sweep coverage was 22 / 338 (6.5%);
post-sweep is 338 / 338 (100%).
Normalized 22 pre-existing legacy headers (`// Copyright (c) certctl`
+ `// SPDX-License-Identifier: BSL-1.1`) and 1 file using a
`Certctl Contributors` attribution. The legacy SPDX ID `BSL-1.1`
is non-standard; the official SPDX identifier for Business Source
License 1.1 is `BUSL-1.1` (capital U). All 338 files now share the
canonical form.
Generated via:
addlicense -c "certctl LLC" -y 2026 \
-f cowork/legal/copyright-header.tpl \
-ignore '**/testdata/**' -ignore '**/*_test.go' \
cmd/ internal/
Verification:
find cmd internal -name '*.go' -not -name '*_test.go' \
-not -path '*/testdata/*' \
-exec grep -L '^// Copyright 2026 certctl LLC' {} \; | wc -l
Returns: 0
gofmt clean. Header additions are comments only, no compile impact.
Closes: cowork/certctl-architecture-diligence-audit.html#fix-RED-4
317 lines
8.9 KiB
Go
317 lines
8.9 KiB
Go
// Copyright 2026 certctl LLC. All rights reserved.
|
|
// SPDX-License-Identifier: BUSL-1.1
|
|
|
|
package service
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"log/slog"
|
|
"sync"
|
|
"time"
|
|
|
|
"github.com/certctl-io/certctl/internal/domain"
|
|
"github.com/certctl-io/certctl/internal/repository"
|
|
"github.com/certctl-io/certctl/internal/tlsprobe"
|
|
)
|
|
|
|
// HealthCheckService manages endpoint TLS health monitoring.
|
|
type HealthCheckService struct {
|
|
repo repository.HealthCheckRepository
|
|
auditService *AuditService
|
|
notifService *NotificationService
|
|
logger *slog.Logger
|
|
maxConcurrent int
|
|
defaultTimeout time.Duration
|
|
historyRetention time.Duration
|
|
autoCreate bool
|
|
}
|
|
|
|
// NewHealthCheckService creates a new HealthCheckService.
|
|
func NewHealthCheckService(
|
|
repo repository.HealthCheckRepository,
|
|
auditService *AuditService,
|
|
logger *slog.Logger,
|
|
maxConcurrent int,
|
|
defaultTimeout time.Duration,
|
|
historyRetention time.Duration,
|
|
autoCreate bool,
|
|
) *HealthCheckService {
|
|
return &HealthCheckService{
|
|
repo: repo,
|
|
auditService: auditService,
|
|
logger: logger,
|
|
maxConcurrent: maxConcurrent,
|
|
defaultTimeout: defaultTimeout,
|
|
historyRetention: historyRetention,
|
|
autoCreate: autoCreate,
|
|
}
|
|
}
|
|
|
|
// SetNotificationService sets the notification service for sending status transition alerts.
|
|
func (s *HealthCheckService) SetNotificationService(ns *NotificationService) {
|
|
s.notifService = ns
|
|
}
|
|
|
|
// RunHealthChecks is the scheduler entry point for continuous TLS health monitoring.
|
|
// Fetches endpoints due for check, probes concurrently with semaphore control,
|
|
// updates health status with state transitions, records history, and sends notifications.
|
|
func (s *HealthCheckService) RunHealthChecks(ctx context.Context) error {
|
|
// Fetch all endpoints due for check
|
|
checks, err := s.repo.ListDueForCheck(ctx)
|
|
if err != nil {
|
|
return fmt.Errorf("failed to list endpoints due for check: %w", err)
|
|
}
|
|
|
|
if len(checks) == 0 {
|
|
s.logger.Debug("no endpoints due for health check")
|
|
return nil
|
|
}
|
|
|
|
s.logger.Debug("running health checks", "endpoint_count", len(checks))
|
|
|
|
// Concurrent probing with semaphore
|
|
sem := make(chan struct{}, s.maxConcurrent)
|
|
var wg sync.WaitGroup
|
|
probeResults := make(map[string]tlsprobe.ProbeResult)
|
|
var mu sync.Mutex
|
|
|
|
for _, check := range checks {
|
|
wg.Add(1)
|
|
go func(c *domain.EndpointHealthCheck) {
|
|
defer wg.Done()
|
|
sem <- struct{}{} // acquire
|
|
defer func() { <-sem }() // release
|
|
|
|
result := tlsprobe.ProbeTLS(ctx, c.Endpoint, s.defaultTimeout)
|
|
mu.Lock()
|
|
probeResults[c.ID] = result
|
|
mu.Unlock()
|
|
}(check)
|
|
}
|
|
|
|
wg.Wait()
|
|
|
|
// Process results and update health status
|
|
successCount := 0
|
|
failureCount := 0
|
|
transitionCount := 0
|
|
|
|
for _, check := range checks {
|
|
result := probeResults[check.ID]
|
|
|
|
// Determine old status for transition detection
|
|
oldStatus := check.Status
|
|
|
|
// Update probe result fields
|
|
check.LastCheckedAt = timePtr(time.Now())
|
|
check.ResponseTimeMs = result.ResponseTimeMs
|
|
|
|
if result.Success {
|
|
successCount++
|
|
check.ObservedFingerprint = result.Fingerprint
|
|
check.TLSVersion = result.TLSVersion
|
|
check.CipherSuite = result.CipherSuite
|
|
check.CertSubject = result.Subject
|
|
check.CertIssuer = result.Issuer
|
|
check.CertExpiry = timePtr(result.NotAfter)
|
|
check.FailureReason = ""
|
|
check.LastSuccessAt = timePtr(time.Now())
|
|
check.ConsecutiveFailures = 0
|
|
} else {
|
|
failureCount++
|
|
check.LastFailureAt = timePtr(time.Now())
|
|
check.ConsecutiveFailures++
|
|
check.FailureReason = result.Error
|
|
}
|
|
|
|
// Transition state based on consecutive failures and fingerprint match
|
|
newStatus, transitioned := check.TransitionStatus(result.Success, result.Fingerprint)
|
|
|
|
if transitioned {
|
|
transitionCount++
|
|
check.Status = newStatus
|
|
check.LastTransitionAt = timePtr(time.Now())
|
|
// Reset acknowledged on transition
|
|
check.Acknowledged = false
|
|
|
|
// Log transition
|
|
s.logger.Info("health check status transition",
|
|
"endpoint", check.Endpoint,
|
|
"old_status", string(oldStatus),
|
|
"new_status", string(newStatus))
|
|
|
|
// Record audit event
|
|
if s.auditService != nil {
|
|
_ = s.auditService.RecordEvent(ctx, "system", domain.ActorTypeSystem,
|
|
"health_check_status_transition", "health_check", check.ID,
|
|
map[string]interface{}{
|
|
"endpoint": check.Endpoint,
|
|
"old_status": string(oldStatus),
|
|
"new_status": string(newStatus),
|
|
})
|
|
}
|
|
}
|
|
|
|
// Update health check record
|
|
if err := s.repo.Update(ctx, check); err != nil {
|
|
s.logger.Error("failed to update health check",
|
|
"endpoint", check.Endpoint,
|
|
"error", err)
|
|
continue
|
|
}
|
|
|
|
// Record probe result in history
|
|
if err := s.repo.RecordHistory(ctx, &domain.HealthHistoryEntry{
|
|
HealthCheckID: check.ID,
|
|
Status: string(check.Status),
|
|
ResponseTimeMs: check.ResponseTimeMs,
|
|
Fingerprint: check.ObservedFingerprint,
|
|
FailureReason: check.FailureReason,
|
|
CheckedAt: time.Now(),
|
|
}); err != nil {
|
|
s.logger.Warn("failed to record health check history",
|
|
"endpoint", check.Endpoint,
|
|
"error", err)
|
|
}
|
|
}
|
|
|
|
// Purge old history entries once per run
|
|
if err := s.PurgeOldHistory(ctx); err != nil {
|
|
s.logger.Warn("failed to purge old health check history", "error", err)
|
|
}
|
|
|
|
s.logger.Debug("health check run completed",
|
|
"total", len(checks),
|
|
"success", successCount,
|
|
"failure", failureCount,
|
|
"transitions", transitionCount)
|
|
|
|
return nil
|
|
}
|
|
|
|
// Create creates a new health check endpoint.
|
|
func (s *HealthCheckService) Create(ctx context.Context, check *domain.EndpointHealthCheck) error {
|
|
if check.ID == "" {
|
|
check.ID = generateID("hc")
|
|
}
|
|
check.CreatedAt = time.Now()
|
|
check.UpdatedAt = time.Now()
|
|
|
|
if err := s.repo.Create(ctx, check); err != nil {
|
|
return fmt.Errorf("failed to create health check: %w", err)
|
|
}
|
|
|
|
if s.auditService != nil {
|
|
_ = s.auditService.RecordEvent(ctx, "system", domain.ActorTypeSystem,
|
|
"health_check_created", "health_check", check.ID,
|
|
map[string]interface{}{
|
|
"endpoint": check.Endpoint,
|
|
})
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// Get retrieves a health check by ID.
|
|
func (s *HealthCheckService) Get(ctx context.Context, id string) (*domain.EndpointHealthCheck, error) {
|
|
return s.repo.Get(ctx, id)
|
|
}
|
|
|
|
// Update updates an existing health check.
|
|
func (s *HealthCheckService) Update(ctx context.Context, check *domain.EndpointHealthCheck) error {
|
|
check.UpdatedAt = time.Now()
|
|
|
|
if err := s.repo.Update(ctx, check); err != nil {
|
|
return fmt.Errorf("failed to update health check: %w", err)
|
|
}
|
|
|
|
if s.auditService != nil {
|
|
_ = s.auditService.RecordEvent(ctx, "system", domain.ActorTypeSystem,
|
|
"health_check_updated", "health_check", check.ID,
|
|
map[string]interface{}{
|
|
"endpoint": check.Endpoint,
|
|
})
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// Delete deletes a health check.
|
|
func (s *HealthCheckService) Delete(ctx context.Context, id string) error {
|
|
if err := s.repo.Delete(ctx, id); err != nil {
|
|
return fmt.Errorf("failed to delete health check: %w", err)
|
|
}
|
|
|
|
if s.auditService != nil {
|
|
_ = s.auditService.RecordEvent(ctx, "system", domain.ActorTypeSystem,
|
|
"health_check_deleted", "health_check", id,
|
|
map[string]interface{}{})
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// List lists health checks with optional filtering.
|
|
func (s *HealthCheckService) List(ctx context.Context, filter *repository.HealthCheckFilter) ([]*domain.EndpointHealthCheck, int, error) {
|
|
if filter == nil {
|
|
filter = &repository.HealthCheckFilter{}
|
|
}
|
|
return s.repo.List(ctx, filter)
|
|
}
|
|
|
|
// GetHistory retrieves health check history for an endpoint.
|
|
func (s *HealthCheckService) GetHistory(ctx context.Context, healthCheckID string, limit int) ([]*domain.HealthHistoryEntry, error) {
|
|
if limit <= 0 {
|
|
limit = 100
|
|
}
|
|
if limit > 1000 {
|
|
limit = 1000
|
|
}
|
|
return s.repo.GetHistory(ctx, healthCheckID, limit)
|
|
}
|
|
|
|
// AcknowledgeIncident marks a health check incident as acknowledged.
|
|
func (s *HealthCheckService) AcknowledgeIncident(ctx context.Context, id string, actor string) error {
|
|
check, err := s.repo.Get(ctx, id)
|
|
if err != nil {
|
|
return fmt.Errorf("failed to get health check: %w", err)
|
|
}
|
|
|
|
check.Acknowledged = true
|
|
check.AcknowledgedBy = actor
|
|
check.AcknowledgedAt = timePtr(time.Now())
|
|
|
|
if err := s.repo.Update(ctx, check); err != nil {
|
|
return fmt.Errorf("failed to update health check: %w", err)
|
|
}
|
|
|
|
if s.auditService != nil {
|
|
_ = s.auditService.RecordEvent(ctx, actor, domain.ActorTypeUser,
|
|
"health_check_acknowledged", "health_check", id,
|
|
map[string]interface{}{
|
|
"endpoint": check.Endpoint,
|
|
})
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// GetSummary returns aggregated health check status counts.
|
|
func (s *HealthCheckService) GetSummary(ctx context.Context) (*domain.HealthCheckSummary, error) {
|
|
return s.repo.GetSummary(ctx)
|
|
}
|
|
|
|
// PurgeOldHistory removes health check history entries older than the retention period.
|
|
func (s *HealthCheckService) PurgeOldHistory(ctx context.Context) error {
|
|
cutoff := time.Now().Add(-s.historyRetention)
|
|
_, err := s.repo.PurgeHistory(ctx, cutoff)
|
|
return err
|
|
}
|
|
|
|
// Helper functions
|
|
|
|
func timePtr(t time.Time) *time.Time {
|
|
return &t
|
|
}
|