mirror of
https://github.com/shankar0123/certctl.git
synced 2026-06-07 15:11:29 +00:00
8b75e0311b
Mechanical sed across the main go.mod's module declaration, the f5-mock-icontrol
sub-module's go.mod, every Go file's import path (361 files), and a rebuild of
the checked-in f5-mock-icontrol binary so its embedded build-info reflects the
new module path. No behavior change.
Choice B from cowork/transfer-certctl-to-org.md, executed 2026-05-04. Choice A
(keep module path declared as github.com/shankar0123/certctl regardless of
repo URL) shipped on the day of the org transfer (2026-05-03) since we had no
external Go consumers; this commit closes that deferral.
Backward-compat: GitHub HTTP redirects continue to forward
github.com/shankar0123/certctl → github.com/certctl-io/certctl at the URL
level, but Go's module proxy uses the path declared in go.mod as the
canonical name. Pre-fix, anyone trying `go get github.com/certctl-io/certctl/...`
hit a "module path mismatch" error because go.mod said
github.com/shankar0123/certctl and the URL they fetched it from said
certctl-io/certctl. Post-fix, the canonical name and the URL agree, so
go get / go install / external Go consumers / Go-tooling integrations
work cleanly via either the new path (preferred) or the old path (which
redirects and Go follows the redirect for source fetch).
Anyone still importing the old path inside their own code needs a small
migration to keep working: update the `require` line in their go.mod (and the
matching go.sum entries) to the new module path, then run a mass sed across
their import statements so they match — the module path declared in go.mod is
the authoritative import name. No external consumers exist today.
Diff shape:
361 *.go files — import path replacement only
2 go.mod — module declaration replacement only
1 binary — deploy/test/f5-mock-icontrol/f5-mock-icontrol rebuilt
so embedded build-info reflects the new path (8618965 vs
8618933 bytes; 32-byte diff is the build-info change)
Total: 364 files, 730 insertions / 730 deletions, net-zero size, pure
mechanical substitution.
Verification:
gofmt: 17 files needed re-alignment after sed (the new path is one char
shorter than the old, so column-aligned import groups drifted). Applied
`gofmt -w` to fix.
go mod tidy: clean exit on both modules.
go vet ./...: clean exit.
go build ./...: clean exit.
go test -short -count=1 on representative packages: all green
(internal/domain, internal/validation, internal/crypto, internal/crypto/signer,
cmd/agent). Test output now reads `ok github.com/certctl-io/certctl/...`
confirming the module path resolves correctly.
binary: f5-mock-icontrol rebuilt; `strings | grep shankar0123` returns
nothing; `strings | grep certctl-io/certctl` shows the new module path
embedded in build-info.
Files intentionally NOT touched in this commit:
README.md / CHANGELOG.md / docs/ / etc. — already swept to certctl-io
URLs in commit 0729ee4 (the post-transfer URL refresh). This commit is
purely the Go-tooling layer.
Scarf pixels (`shankar0123.docker.scarf.sh/...`) — Scarf-account
namespace, not a Go import or GitHub repo URL. Stays.
This is a non-blocking, non-customer-impacting change. Operators pulling
container images, running `make verify`, hitting the API, or installing the
agent see no functional difference. Only Go-tooling consumers (none today)
are affected, and they're enabled — not broken — by this commit.
314 lines
8.8 KiB
Go
314 lines
8.8 KiB
Go
package service
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"log/slog"
|
|
"sync"
|
|
"time"
|
|
|
|
"github.com/certctl-io/certctl/internal/domain"
|
|
"github.com/certctl-io/certctl/internal/repository"
|
|
"github.com/certctl-io/certctl/internal/tlsprobe"
|
|
)
|
|
|
|
// HealthCheckService manages endpoint TLS health monitoring.
|
|
type HealthCheckService struct {
|
|
repo repository.HealthCheckRepository
|
|
auditService *AuditService
|
|
notifService *NotificationService
|
|
logger *slog.Logger
|
|
maxConcurrent int
|
|
defaultTimeout time.Duration
|
|
historyRetention time.Duration
|
|
autoCreate bool
|
|
}
|
|
|
|
// NewHealthCheckService creates a new HealthCheckService.
|
|
func NewHealthCheckService(
|
|
repo repository.HealthCheckRepository,
|
|
auditService *AuditService,
|
|
logger *slog.Logger,
|
|
maxConcurrent int,
|
|
defaultTimeout time.Duration,
|
|
historyRetention time.Duration,
|
|
autoCreate bool,
|
|
) *HealthCheckService {
|
|
return &HealthCheckService{
|
|
repo: repo,
|
|
auditService: auditService,
|
|
logger: logger,
|
|
maxConcurrent: maxConcurrent,
|
|
defaultTimeout: defaultTimeout,
|
|
historyRetention: historyRetention,
|
|
autoCreate: autoCreate,
|
|
}
|
|
}
|
|
|
|
// SetNotificationService sets the notification service for sending status transition alerts.
|
|
func (s *HealthCheckService) SetNotificationService(ns *NotificationService) {
|
|
s.notifService = ns
|
|
}
|
|
|
|
// RunHealthChecks is the scheduler entry point for continuous TLS health monitoring.
|
|
// Fetches endpoints due for check, probes concurrently with semaphore control,
|
|
// updates health status with state transitions, records history, and sends notifications.
|
|
func (s *HealthCheckService) RunHealthChecks(ctx context.Context) error {
|
|
// Fetch all endpoints due for check
|
|
checks, err := s.repo.ListDueForCheck(ctx)
|
|
if err != nil {
|
|
return fmt.Errorf("failed to list endpoints due for check: %w", err)
|
|
}
|
|
|
|
if len(checks) == 0 {
|
|
s.logger.Debug("no endpoints due for health check")
|
|
return nil
|
|
}
|
|
|
|
s.logger.Debug("running health checks", "endpoint_count", len(checks))
|
|
|
|
// Concurrent probing with semaphore
|
|
sem := make(chan struct{}, s.maxConcurrent)
|
|
var wg sync.WaitGroup
|
|
probeResults := make(map[string]tlsprobe.ProbeResult)
|
|
var mu sync.Mutex
|
|
|
|
for _, check := range checks {
|
|
wg.Add(1)
|
|
go func(c *domain.EndpointHealthCheck) {
|
|
defer wg.Done()
|
|
sem <- struct{}{} // acquire
|
|
defer func() { <-sem }() // release
|
|
|
|
result := tlsprobe.ProbeTLS(ctx, c.Endpoint, s.defaultTimeout)
|
|
mu.Lock()
|
|
probeResults[c.ID] = result
|
|
mu.Unlock()
|
|
}(check)
|
|
}
|
|
|
|
wg.Wait()
|
|
|
|
// Process results and update health status
|
|
successCount := 0
|
|
failureCount := 0
|
|
transitionCount := 0
|
|
|
|
for _, check := range checks {
|
|
result := probeResults[check.ID]
|
|
|
|
// Determine old status for transition detection
|
|
oldStatus := check.Status
|
|
|
|
// Update probe result fields
|
|
check.LastCheckedAt = timePtr(time.Now())
|
|
check.ResponseTimeMs = result.ResponseTimeMs
|
|
|
|
if result.Success {
|
|
successCount++
|
|
check.ObservedFingerprint = result.Fingerprint
|
|
check.TLSVersion = result.TLSVersion
|
|
check.CipherSuite = result.CipherSuite
|
|
check.CertSubject = result.Subject
|
|
check.CertIssuer = result.Issuer
|
|
check.CertExpiry = timePtr(result.NotAfter)
|
|
check.FailureReason = ""
|
|
check.LastSuccessAt = timePtr(time.Now())
|
|
check.ConsecutiveFailures = 0
|
|
} else {
|
|
failureCount++
|
|
check.LastFailureAt = timePtr(time.Now())
|
|
check.ConsecutiveFailures++
|
|
check.FailureReason = result.Error
|
|
}
|
|
|
|
// Transition state based on consecutive failures and fingerprint match
|
|
newStatus, transitioned := check.TransitionStatus(result.Success, result.Fingerprint)
|
|
|
|
if transitioned {
|
|
transitionCount++
|
|
check.Status = newStatus
|
|
check.LastTransitionAt = timePtr(time.Now())
|
|
// Reset acknowledged on transition
|
|
check.Acknowledged = false
|
|
|
|
// Log transition
|
|
s.logger.Info("health check status transition",
|
|
"endpoint", check.Endpoint,
|
|
"old_status", string(oldStatus),
|
|
"new_status", string(newStatus))
|
|
|
|
// Record audit event
|
|
if s.auditService != nil {
|
|
_ = s.auditService.RecordEvent(ctx, "system", domain.ActorTypeSystem,
|
|
"health_check_status_transition", "health_check", check.ID,
|
|
map[string]interface{}{
|
|
"endpoint": check.Endpoint,
|
|
"old_status": string(oldStatus),
|
|
"new_status": string(newStatus),
|
|
})
|
|
}
|
|
}
|
|
|
|
// Update health check record
|
|
if err := s.repo.Update(ctx, check); err != nil {
|
|
s.logger.Error("failed to update health check",
|
|
"endpoint", check.Endpoint,
|
|
"error", err)
|
|
continue
|
|
}
|
|
|
|
// Record probe result in history
|
|
if err := s.repo.RecordHistory(ctx, &domain.HealthHistoryEntry{
|
|
HealthCheckID: check.ID,
|
|
Status: string(check.Status),
|
|
ResponseTimeMs: check.ResponseTimeMs,
|
|
Fingerprint: check.ObservedFingerprint,
|
|
FailureReason: check.FailureReason,
|
|
CheckedAt: time.Now(),
|
|
}); err != nil {
|
|
s.logger.Warn("failed to record health check history",
|
|
"endpoint", check.Endpoint,
|
|
"error", err)
|
|
}
|
|
}
|
|
|
|
// Purge old history entries once per run
|
|
if err := s.PurgeOldHistory(ctx); err != nil {
|
|
s.logger.Warn("failed to purge old health check history", "error", err)
|
|
}
|
|
|
|
s.logger.Debug("health check run completed",
|
|
"total", len(checks),
|
|
"success", successCount,
|
|
"failure", failureCount,
|
|
"transitions", transitionCount)
|
|
|
|
return nil
|
|
}
|
|
|
|
// Create creates a new health check endpoint.
|
|
func (s *HealthCheckService) Create(ctx context.Context, check *domain.EndpointHealthCheck) error {
|
|
if check.ID == "" {
|
|
check.ID = generateID("hc")
|
|
}
|
|
check.CreatedAt = time.Now()
|
|
check.UpdatedAt = time.Now()
|
|
|
|
if err := s.repo.Create(ctx, check); err != nil {
|
|
return fmt.Errorf("failed to create health check: %w", err)
|
|
}
|
|
|
|
if s.auditService != nil {
|
|
_ = s.auditService.RecordEvent(ctx, "system", domain.ActorTypeSystem,
|
|
"health_check_created", "health_check", check.ID,
|
|
map[string]interface{}{
|
|
"endpoint": check.Endpoint,
|
|
})
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// Get retrieves a health check by ID.
|
|
func (s *HealthCheckService) Get(ctx context.Context, id string) (*domain.EndpointHealthCheck, error) {
|
|
return s.repo.Get(ctx, id)
|
|
}
|
|
|
|
// Update updates an existing health check.
|
|
func (s *HealthCheckService) Update(ctx context.Context, check *domain.EndpointHealthCheck) error {
|
|
check.UpdatedAt = time.Now()
|
|
|
|
if err := s.repo.Update(ctx, check); err != nil {
|
|
return fmt.Errorf("failed to update health check: %w", err)
|
|
}
|
|
|
|
if s.auditService != nil {
|
|
_ = s.auditService.RecordEvent(ctx, "system", domain.ActorTypeSystem,
|
|
"health_check_updated", "health_check", check.ID,
|
|
map[string]interface{}{
|
|
"endpoint": check.Endpoint,
|
|
})
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// Delete deletes a health check.
|
|
func (s *HealthCheckService) Delete(ctx context.Context, id string) error {
|
|
if err := s.repo.Delete(ctx, id); err != nil {
|
|
return fmt.Errorf("failed to delete health check: %w", err)
|
|
}
|
|
|
|
if s.auditService != nil {
|
|
_ = s.auditService.RecordEvent(ctx, "system", domain.ActorTypeSystem,
|
|
"health_check_deleted", "health_check", id,
|
|
map[string]interface{}{})
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// List lists health checks with optional filtering.
|
|
func (s *HealthCheckService) List(ctx context.Context, filter *repository.HealthCheckFilter) ([]*domain.EndpointHealthCheck, int, error) {
|
|
if filter == nil {
|
|
filter = &repository.HealthCheckFilter{}
|
|
}
|
|
return s.repo.List(ctx, filter)
|
|
}
|
|
|
|
// GetHistory retrieves health check history for an endpoint.
|
|
func (s *HealthCheckService) GetHistory(ctx context.Context, healthCheckID string, limit int) ([]*domain.HealthHistoryEntry, error) {
|
|
if limit <= 0 {
|
|
limit = 100
|
|
}
|
|
if limit > 1000 {
|
|
limit = 1000
|
|
}
|
|
return s.repo.GetHistory(ctx, healthCheckID, limit)
|
|
}
|
|
|
|
// AcknowledgeIncident marks a health check incident as acknowledged.
|
|
func (s *HealthCheckService) AcknowledgeIncident(ctx context.Context, id string, actor string) error {
|
|
check, err := s.repo.Get(ctx, id)
|
|
if err != nil {
|
|
return fmt.Errorf("failed to get health check: %w", err)
|
|
}
|
|
|
|
check.Acknowledged = true
|
|
check.AcknowledgedBy = actor
|
|
check.AcknowledgedAt = timePtr(time.Now())
|
|
|
|
if err := s.repo.Update(ctx, check); err != nil {
|
|
return fmt.Errorf("failed to update health check: %w", err)
|
|
}
|
|
|
|
if s.auditService != nil {
|
|
_ = s.auditService.RecordEvent(ctx, actor, domain.ActorTypeUser,
|
|
"health_check_acknowledged", "health_check", id,
|
|
map[string]interface{}{
|
|
"endpoint": check.Endpoint,
|
|
})
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// GetSummary returns aggregated health check status counts.
|
|
func (s *HealthCheckService) GetSummary(ctx context.Context) (*domain.HealthCheckSummary, error) {
|
|
return s.repo.GetSummary(ctx)
|
|
}
|
|
|
|
// PurgeOldHistory removes health check history entries older than the retention period.
|
|
func (s *HealthCheckService) PurgeOldHistory(ctx context.Context) error {
|
|
cutoff := time.Now().Add(-s.historyRetention)
|
|
_, err := s.repo.PurgeHistory(ctx, cutoff)
|
|
return err
|
|
}
|
|
|
|
// Helper functions
|
|
|
|
// timePtr returns a pointer to a fresh copy of t. Every call yields a distinct
// pointer, so the results never alias one another or the caller's variable.
func timePtr(t time.Time) *time.Time {
	p := new(time.Time)
	*p = t
	return p
}
|