Files
certctl/internal/service/health_check.go
T
shankar0123 8b75e0311b chore: rename Go module path to github.com/certctl-io/certctl
Mechanical sed across the main go.mod's module declaration, the f5-mock-icontrol
sub-module's go.mod, every Go file's import path (361 files), and a rebuild of
the checked-in f5-mock-icontrol binary so its embedded build-info reflects the
new module path. No behavior change.

Choice B from cowork/transfer-certctl-to-org.md, executed 2026-05-04. Choice A
(keep module path declared as github.com/shankar0123/certctl regardless of
repo URL) shipped on the day of the org transfer (2026-05-03) since we had no
external Go consumers; this commit closes that deferral.

Backward-compat: GitHub HTTP redirects continue to forward
github.com/shankar0123/certctl → github.com/certctl-io/certctl at the URL
level, but Go's module proxy uses the path declared in go.mod as the
canonical name. Pre-fix, anyone trying `go get github.com/certctl-io/certctl/...`
hit a "module path mismatch" error because go.mod said
github.com/shankar0123/certctl and the URL they fetched it from said
certctl-io/certctl. Post-fix, the canonical name and the URL agree, so
go get / go install / external Go consumers / Go-tooling integrations
work cleanly via either the new path (preferred) or the old path (which
redirects and Go follows the redirect for source fetch).

Anyone still importing the old path inside their own code must migrate to
keep building against new versions: the module path declared in go.mod is
the authoritative import name, so the consumer-side migration is to update
the `require` line in their go.mod and run a mass sed across their import
statements. No external consumers exist today.

Diff shape:
  361 *.go files  — import path replacement only
    2 go.mod     — module declaration replacement only
    1 binary     — deploy/test/f5-mock-icontrol/f5-mock-icontrol rebuilt
                   so embedded build-info reflects the new path (8618965 vs
                   8618933 bytes; 32-byte diff is the build-info change)

  Total: 364 files, 730 insertions / 730 deletions, net-zero size, pure
  mechanical substitution.

Verification:
  gofmt: 17 files needed re-alignment after sed (the new path is one char
    shorter than the old, so column-aligned import groups drifted). Applied
    `gofmt -w` to fix.
  go mod tidy: clean exit on both modules.
  go vet ./...: clean exit.
  go build ./...: clean exit.
  go test -short -count=1 on representative packages: all green
    (internal/domain, internal/validation, internal/crypto, internal/crypto/signer,
    cmd/agent). Test output now reads `ok github.com/certctl-io/certctl/...`
    confirming the module path resolves correctly.
  binary: f5-mock-icontrol rebuilt; `strings | grep shankar0123` returns
    nothing; `strings | grep certctl-io/certctl` shows the new module path
    embedded in build-info.

Files intentionally NOT touched in this commit:
  README.md / CHANGELOG.md / docs/ / etc. — already swept to certctl-io
    URLs in commit 0729ee4 (the post-transfer URL refresh). This commit is
    purely the Go-tooling layer.
  Scarf pixels (`shankar0123.docker.scarf.sh/...`) — Scarf-account
    namespace, not a Go import or GitHub repo URL. Stays.

This is a non-blocking, non-customer-impacting change. Operators pulling
container images, running `make verify`, hitting the API, or installing the
agent see no functional difference. Only Go-tooling consumers (none today)
are affected, and they're enabled — not broken — by this commit.
2026-05-04 00:30:29 +00:00

314 lines
8.8 KiB
Go

package service
import (
"context"
"fmt"
"log/slog"
"sync"
"time"
"github.com/certctl-io/certctl/internal/domain"
"github.com/certctl-io/certctl/internal/repository"
"github.com/certctl-io/certctl/internal/tlsprobe"
)
// HealthCheckService manages endpoint TLS health monitoring.
//
// It combines the health-check repository, optional audit recording, and the
// settings that drive a scheduled probe run (see RunHealthChecks).
type HealthCheckService struct {
// repo stores health check records and their probe history.
repo repository.HealthCheckRepository
// auditService records audit events; every use in this file is nil-guarded,
// so it may be left nil.
auditService *AuditService
// notifService is attached via SetNotificationService. It is not read
// anywhere else in this file.
notifService *NotificationService
// logger receives debug, info, warn, and error output from the service.
logger *slog.Logger
// maxConcurrent is the capacity of the semaphore that bounds concurrent
// probes in RunHealthChecks.
maxConcurrent int
// defaultTimeout is the timeout passed to tlsprobe.ProbeTLS for each probe.
defaultTimeout time.Duration
// historyRetention is subtracted from the current time to compute the purge
// cutoff in PurgeOldHistory.
historyRetention time.Duration
// autoCreate is stored by the constructor; it is not read in this file.
// NOTE(review): presumably consumed elsewhere in the package — confirm.
autoCreate bool
}
// NewHealthCheckService builds a HealthCheckService from its dependencies and
// tuning parameters.
//
// The notification service is not a constructor argument: it starts out nil
// and can be attached afterwards with SetNotificationService.
func NewHealthCheckService(
	repo repository.HealthCheckRepository,
	auditService *AuditService,
	logger *slog.Logger,
	maxConcurrent int,
	defaultTimeout time.Duration,
	historyRetention time.Duration,
	autoCreate bool,
) *HealthCheckService {
	svc := new(HealthCheckService)

	// Collaborators.
	svc.repo = repo
	svc.auditService = auditService
	svc.logger = logger

	// Tuning parameters.
	svc.maxConcurrent = maxConcurrent
	svc.defaultTimeout = defaultTimeout
	svc.historyRetention = historyRetention
	svc.autoCreate = autoCreate

	return svc
}
// SetNotificationService attaches the notification service intended for
// status transition alerts. It is injected after construction because
// NewHealthCheckService does not take it as an argument.
func (s *HealthCheckService) SetNotificationService(ns *NotificationService) {
	s.notifService = ns
}
// RunHealthChecks is the scheduler entry point for continuous TLS health monitoring.
//
// One run does the following:
//  1. Loads the endpoints the repository reports as due for a check.
//  2. Probes them concurrently, with at most maxConcurrent probes in flight.
//  3. Applies each probe result to its record, detects status transitions
//     (logging and auditing each one), and persists the record.
//  4. Appends a history entry for every record that was persisted.
//  5. Purges history older than the retention period.
//
// Only a failure to list the due endpoints is returned as an error. Failures
// for individual endpoints, history writes, and the purge are logged and do
// not stop the run.
//
// NOTE(review): this method does not send any notification itself, even
// though a notification service can be attached to the service — confirm
// whether transition alerts are dispatched elsewhere.
func (s *HealthCheckService) RunHealthChecks(ctx context.Context) error {
	checks, err := s.repo.ListDueForCheck(ctx)
	if err != nil {
		return fmt.Errorf("failed to list endpoints due for check: %w", err)
	}
	if len(checks) == 0 {
		s.logger.Debug("no endpoints due for health check")
		return nil
	}
	s.logger.Debug("running health checks", "endpoint_count", len(checks))

	// results[i] is the probe outcome for checks[i].
	results := s.probeAll(ctx, checks)

	successCount := 0
	failureCount := 0
	transitionCount := 0
	for i, check := range checks {
		result := results[i]

		// Remember the status before mutation so a transition can be reported.
		oldStatus := check.Status

		// Use one instant for every timestamp written for this record so the
		// record and its history entry agree with each other.
		now := time.Now()

		check.LastCheckedAt = timePtr(now)
		check.ResponseTimeMs = result.ResponseTimeMs
		if result.Success {
			successCount++
			check.ObservedFingerprint = result.Fingerprint
			check.TLSVersion = result.TLSVersion
			check.CipherSuite = result.CipherSuite
			check.CertSubject = result.Subject
			check.CertIssuer = result.Issuer
			check.CertExpiry = timePtr(result.NotAfter)
			check.FailureReason = ""
			check.LastSuccessAt = timePtr(now)
			check.ConsecutiveFailures = 0
		} else {
			failureCount++
			check.LastFailureAt = timePtr(now)
			check.ConsecutiveFailures++
			check.FailureReason = result.Error
		}

		// TransitionStatus must run after the counters above are updated,
		// matching the original statement order.
		newStatus, transitioned := check.TransitionStatus(result.Success, result.Fingerprint)
		if transitioned {
			transitionCount++
			check.Status = newStatus
			check.LastTransitionAt = timePtr(now)
			// A new status is a new incident: clear any earlier acknowledgement.
			check.Acknowledged = false

			s.logger.Info("health check status transition",
				"endpoint", check.Endpoint,
				"old_status", string(oldStatus),
				"new_status", string(newStatus))

			if s.auditService != nil {
				// The audit result is explicitly discarded, so an audit
				// failure does not interrupt the health check run.
				_ = s.auditService.RecordEvent(ctx, "system", domain.ActorTypeSystem,
					"health_check_status_transition", "health_check", check.ID,
					map[string]any{
						"endpoint":   check.Endpoint,
						"old_status": string(oldStatus),
						"new_status": string(newStatus),
					})
			}
		}

		if err := s.repo.Update(ctx, check); err != nil {
			s.logger.Error("failed to update health check",
				"endpoint", check.Endpoint,
				"error", err)
			// No history entry is written for a record that was not persisted.
			continue
		}

		if err := s.repo.RecordHistory(ctx, &domain.HealthHistoryEntry{
			HealthCheckID:  check.ID,
			Status:         string(check.Status),
			ResponseTimeMs: check.ResponseTimeMs,
			Fingerprint:    check.ObservedFingerprint,
			FailureReason:  check.FailureReason,
			CheckedAt:      now,
		}); err != nil {
			s.logger.Warn("failed to record health check history",
				"endpoint", check.Endpoint,
				"error", err)
		}
	}

	// Purge old history entries once per run.
	if err := s.PurgeOldHistory(ctx); err != nil {
		s.logger.Warn("failed to purge old health check history", "error", err)
	}

	s.logger.Debug("health check run completed",
		"total", len(checks),
		"success", successCount,
		"failure", failureCount,
		"transitions", transitionCount)
	return nil
}

// probeAll probes every endpoint in checks and returns the results in the
// same order as checks. At most maxConcurrent probes run at once; a
// non-positive maxConcurrent is treated as 1, because a zero-capacity
// semaphore would block every probe forever and a negative capacity would
// panic in make.
func (s *HealthCheckService) probeAll(ctx context.Context, checks []*domain.EndpointHealthCheck) []tlsprobe.ProbeResult {
	limit := s.maxConcurrent
	if limit < 1 {
		limit = 1
	}

	results := make([]tlsprobe.ProbeResult, len(checks))
	sem := make(chan struct{}, limit)
	var wg sync.WaitGroup
	for i, check := range checks {
		// Acquire before spawning so the number of live goroutines is
		// bounded by limit, not by the number of endpoints.
		sem <- struct{}{}
		wg.Add(1)
		go func(i int, c *domain.EndpointHealthCheck) {
			defer wg.Done()
			defer func() { <-sem }() // release
			// Each goroutine writes only its own slot, so no lock is needed.
			results[i] = tlsprobe.ProbeTLS(ctx, c.Endpoint, s.defaultTimeout)
		}(i, check)
	}
	wg.Wait()
	return results
}
// Create persists a new health check endpoint.
//
// When check.ID is empty an ID is obtained from generateID("hc"). CreatedAt
// and UpdatedAt are both set to the same instant, so a freshly created record
// never appears to have been modified after its creation. After a successful
// insert, a "health_check_created" audit event is recorded if an audit
// service is configured.
//
// Returns a wrapped error if the repository fails to create the record.
func (s *HealthCheckService) Create(ctx context.Context, check *domain.EndpointHealthCheck) error {
	if check.ID == "" {
		check.ID = generateID("hc")
	}

	now := time.Now()
	check.CreatedAt = now
	check.UpdatedAt = now

	if err := s.repo.Create(ctx, check); err != nil {
		return fmt.Errorf("failed to create health check: %w", err)
	}

	if s.auditService != nil {
		// The audit result is explicitly discarded, so an audit failure does
		// not fail a create that already succeeded.
		_ = s.auditService.RecordEvent(ctx, "system", domain.ActorTypeSystem,
			"health_check_created", "health_check", check.ID,
			map[string]any{
				"endpoint": check.Endpoint,
			})
	}
	return nil
}
// Get looks up a single health check by its ID, returning whatever the
// repository returns for that ID without wrapping the error.
func (s *HealthCheckService) Get(ctx context.Context, id string) (*domain.EndpointHealthCheck, error) {
	check, err := s.repo.Get(ctx, id)
	return check, err
}
// Update saves changes to an existing health check.
//
// UpdatedAt is refreshed to the current time before the record is written.
// After a successful write, a "health_check_updated" audit event is recorded
// if an audit service is configured; its result is discarded.
//
// Returns a wrapped error if the repository update fails.
func (s *HealthCheckService) Update(ctx context.Context, check *domain.EndpointHealthCheck) error {
	check.UpdatedAt = time.Now()

	err := s.repo.Update(ctx, check)
	if err != nil {
		return fmt.Errorf("failed to update health check: %w", err)
	}

	if s.auditService == nil {
		return nil
	}
	details := map[string]any{"endpoint": check.Endpoint}
	_ = s.auditService.RecordEvent(ctx, "system", domain.ActorTypeSystem,
		"health_check_updated", "health_check", check.ID, details)
	return nil
}
// Delete removes the health check with the given ID.
//
// After a successful delete, a "health_check_deleted" audit event with empty
// details is recorded if an audit service is configured; its result is
// discarded.
//
// Returns a wrapped error if the repository delete fails.
func (s *HealthCheckService) Delete(ctx context.Context, id string) error {
	err := s.repo.Delete(ctx, id)
	if err != nil {
		return fmt.Errorf("failed to delete health check: %w", err)
	}

	if s.auditService == nil {
		return nil
	}
	noDetails := map[string]any{}
	_ = s.auditService.RecordEvent(ctx, "system", domain.ActorTypeSystem,
		"health_check_deleted", "health_check", id, noDetails)
	return nil
}
// List returns health checks matching filter. A nil filter is replaced with
// an empty one before the repository is queried.
//
// The results are passed through from the repository unchanged.
// NOTE(review): the int result is presumably the total match count — confirm
// against the repository implementation.
func (s *HealthCheckService) List(ctx context.Context, filter *repository.HealthCheckFilter) ([]*domain.EndpointHealthCheck, int, error) {
	effective := filter
	if effective == nil {
		effective = &repository.HealthCheckFilter{}
	}
	return s.repo.List(ctx, effective)
}
// GetHistory returns history entries for the given health check.
//
// limit is normalised before the repository is queried: a value of zero or
// less becomes 100, and a value above 1000 is capped at 1000.
func (s *HealthCheckService) GetHistory(ctx context.Context, healthCheckID string, limit int) ([]*domain.HealthHistoryEntry, error) {
	const (
		fallbackLimit = 100
		ceilingLimit  = 1000
	)

	switch {
	case limit <= 0:
		limit = fallbackLimit
	case limit > ceilingLimit:
		limit = ceilingLimit
	}

	entries, err := s.repo.GetHistory(ctx, healthCheckID, limit)
	return entries, err
}
// AcknowledgeIncident flags the health check identified by id as
// acknowledged by actor.
//
// The record is loaded, stamped with the acknowledging actor and the current
// time, and written back. After a successful write, a
// "health_check_acknowledged" audit event attributed to actor (as a user) is
// recorded if an audit service is configured; its result is discarded.
//
// Returns a wrapped error if the record cannot be loaded or saved.
func (s *HealthCheckService) AcknowledgeIncident(ctx context.Context, id string, actor string) error {
	check, err := s.repo.Get(ctx, id)
	if err != nil {
		return fmt.Errorf("failed to get health check: %w", err)
	}

	check.Acknowledged = true
	check.AcknowledgedBy = actor
	check.AcknowledgedAt = timePtr(time.Now())

	err = s.repo.Update(ctx, check)
	if err != nil {
		return fmt.Errorf("failed to update health check: %w", err)
	}

	if s.auditService == nil {
		return nil
	}
	details := map[string]any{"endpoint": check.Endpoint}
	_ = s.auditService.RecordEvent(ctx, actor, domain.ActorTypeUser,
		"health_check_acknowledged", "health_check", id, details)
	return nil
}
// GetSummary fetches the health check summary from the repository and
// returns it unchanged, along with any repository error.
func (s *HealthCheckService) GetSummary(ctx context.Context) (*domain.HealthCheckSummary, error) {
	summary, err := s.repo.GetSummary(ctx)
	return summary, err
}
// PurgeOldHistory asks the repository to purge history entries using a
// cutoff of the current time minus the configured retention period. The
// first value returned by the repository is ignored; only the error is
// reported to the caller.
func (s *HealthCheckService) PurgeOldHistory(ctx context.Context) error {
	cutoff := time.Now().Add(-s.historyRetention)
	if _, err := s.repo.PurgeHistory(ctx, cutoff); err != nil {
		return err
	}
	return nil
}
// Helper functions

// timePtr returns a pointer to a fresh copy of t, for populating optional
// *time.Time fields. Each call yields a distinct pointer.
func timePtr(t time.Time) *time.Time {
	p := new(time.Time)
	*p = t
	return p
}