mirror of
https://github.com/shankar0123/certctl.git
synced 2026-06-07 12:21:31 +00:00
21aeed4f4e
Phase 0 closure (Path B2, post-rewrite):
addlicense sweep — adds the canonical certctl LLC copyright + BUSL-1.1
SPDX header to every production Go file. Template:
// Copyright 2026 certctl LLC. All rights reserved.
// SPDX-License-Identifier: BUSL-1.1
Coverage: 338 / 338 production Go files (cmd/ + internal/, excluding
*_test.go and **/testdata/**). Pre-sweep coverage was 22 / 338 (6.5%);
post-sweep is 338 / 338 (100%).
Normalized 22 pre-existing legacy headers (`// Copyright (c) certctl`
+ `// SPDX-License-Identifier: BSL-1.1`) and 1 file using a
`Certctl Contributors` attribution. The legacy SPDX ID `BSL-1.1`
is non-standard; the official SPDX identifier for Business Source
License 1.1 is `BUSL-1.1` (capital U). All 338 files now share the
canonical form.
Generated via:
addlicense -c "certctl LLC" -y 2026 \
-f cowork/legal/copyright-header.tpl \
-ignore '**/testdata/**' -ignore '**/*_test.go' \
cmd/ internal/
Verification:
find cmd internal -name '*.go' -not -name '*_test.go' \
-not -path '*/testdata/*' \
-exec grep -L '^// Copyright 2026 certctl LLC' {} \; | wc -l
Returns: 0
gofmt clean. Header additions are comments only, no compile impact.
Closes: cowork/certctl-architecture-diligence-audit.html#fix-RED-4
321 lines
14 KiB
Go
321 lines
14 KiB
Go
// Copyright 2026 certctl LLC. All rights reserved.
|
|
// SPDX-License-Identifier: BUSL-1.1
|
|
|
|
package service
|
|
|
|
import (
|
|
"context"
|
|
"errors"
|
|
"fmt"
|
|
"log/slog"
|
|
"time"
|
|
|
|
"github.com/certctl-io/certctl/internal/domain"
|
|
)
|
|
|
|
// I-004 coverage-gap closure: the agent retirement surface.
//
// Before 000015, DELETE /api/v1/agents/{id} hard-deleted the agents row, and
// the deployment_targets.agent_id FK CASCADE cleaned up downstream rows with
// no preflight, no archival, and no knowledge of in-flight jobs. Any cert
// still rotating through one of those targets would observe half-migrated
// state. I-004 closes that gap with a preflight + soft-retire + optional
// forced-cascade contract; the symbols in this file are the service-layer
// surface that the handler and operator UI bind against.
|
|
|
|
// Sentinel errors of the agent retirement surface. Callers compare against
// them with errors.Is; the message text is emitted verbatim in logs.
var (
	// ErrAgentIsSentinel rejects an attempt to retire one of the four
	// reserved sentinel agent IDs (server-scanner, cloud-aws-sm,
	// cloud-azure-kv, cloud-gcp-sm). Those rows back the network scanner and
	// the three cloud secret-manager discovery sources, so retiring one would
	// orphan its subsystem. The guard is unconditional: force=true does not
	// bypass it, because a sentinel is a structural invariant of the
	// deployment rather than fleet state the operator owns. The handler maps
	// this to HTTP 403.
	ErrAgentIsSentinel = errors.New("agent is a reserved sentinel and cannot be retired")

	// ErrBlockedByDependencies is the sentinel behind a refused retirement:
	// RetireAgent produces it when force=false and at least one dependency
	// bucket (active targets, active certificates, pending jobs) is
	// non-zero. It is always delivered wrapped in a
	// *BlockedByDependenciesError, so handlers can use errors.As to put the
	// per-bucket counts in the 409 body while tests match with errors.Is.
	ErrBlockedByDependencies = errors.New("agent has active downstream dependencies")

	// ErrForceReasonRequired is returned when force=true arrives with an
	// empty reason. The force escape hatch is deliberately chatty: an
	// operator pulling the emergency cord must leave an auditable breadcrumb
	// explaining why a cascade was justified. The handler maps this to HTTP
	// 400 so the operator retries with --reason instead of silently skipping
	// the guard. It is checked before any DB mutation, so the rejected path
	// stays transactionally clean.
	ErrForceReasonRequired = errors.New("force=true requires a non-empty reason")

	// ErrAgentRetired is returned by Heartbeat (and any future
	// agent-authenticated call site) when a retired agent is still polling.
	// The handler layer maps it to HTTP 410 Gone so the cmd/agent
	// sendHeartbeat loop can detect the condition deterministically and shut
	// the agent process down instead of looping forever on a soft-retired
	// identity. IsRetired() on the domain model remains the single source of
	// truth; this sentinel exists so service and handler callers have one
	// symbol to errors.Is against.
	ErrAgentRetired = errors.New("agent has been retired")
)
|
|
|
|
// BlockedByDependenciesError wraps ErrBlockedByDependencies and carries the
|
|
// per-bucket dependency snapshot the preflight pass captured. The embedded
|
|
// AgentDependencyCounts is the same struct the repo returns from the three
|
|
// CountActive* calls, so the handler can marshal it directly into the 409
|
|
// body without reshaping fields. Unwrap() satisfies errors.Is against the
|
|
// sentinel; Error() includes the counts so logs are diagnostic on their own.
|
|
type BlockedByDependenciesError struct {
|
|
Counts domain.AgentDependencyCounts
|
|
}
|
|
|
|
// Error formats the wrapped error with the per-bucket counts. Kept short so
|
|
// it reads cleanly in slog output.
|
|
func (e *BlockedByDependenciesError) Error() string {
|
|
return fmt.Sprintf(
|
|
"%s (active_targets=%d, active_certificates=%d, pending_jobs=%d)",
|
|
ErrBlockedByDependencies.Error(),
|
|
e.Counts.ActiveTargets,
|
|
e.Counts.ActiveCertificates,
|
|
e.Counts.PendingJobs,
|
|
)
|
|
}
|
|
|
|
// Unwrap lets errors.Is(err, ErrBlockedByDependencies) match the wrapped
|
|
// struct — the test contract (agent_retire_test.go:167) depends on it.
|
|
func (e *BlockedByDependenciesError) Unwrap() error { return ErrBlockedByDependencies }
|
|
|
|
// AgentRetirementResult is the outcome surface the handler returns to the
|
|
// operator. It discriminates the three happy paths the endpoint can take —
|
|
// idempotent no-op (AlreadyRetired), clean soft-retire (Cascade=false), and
|
|
// forced cascade (Cascade=true) — and always carries the retired_at timestamp
|
|
// and the dependency-count snapshot so the 200/204 response body can echo
|
|
// what was (or would have been) affected.
|
|
//
|
|
// AlreadyRetired=true → agent was already retired; no new audit
|
|
// event was emitted; RetiredAt is the
|
|
// original stamp, not the current time.
|
|
// Cascade=false → clean soft-retire; Counts is all zeros.
|
|
// Cascade=true → force=true retired agent + downstream
|
|
// targets; Counts is the PRE-cascade
|
|
// snapshot (so the operator sees what
|
|
// they just retired).
|
|
type AgentRetirementResult struct {
|
|
AlreadyRetired bool
|
|
Cascade bool
|
|
RetiredAt time.Time
|
|
Counts domain.AgentDependencyCounts
|
|
}
|
|
|
|
// RetireAgent implements the I-004 retirement contract. Ordering matters —
|
|
// every guard fires before the one that would mutate state, so a rejected
|
|
// retire leaves zero trace (no audit event, no partial DB write):
|
|
//
|
|
// 1. Sentinel check (unconditional; force does not bypass).
|
|
// 2. Fetch agent (404 surfaces as-is from the repo).
|
|
// 3. Already-retired idempotency: return AlreadyRetired=true with NO new
|
|
// audit event — the original retire already recorded one.
|
|
// 4. Preflight count pass via the three CountActive* repo methods.
|
|
// 5. Force-reason guard: force=true with empty reason is rejected here,
|
|
// after the counts are known but before any mutation.
|
|
// 6. Default no-force path: any non-zero count returns
|
|
// *BlockedByDependenciesError with counts attached.
|
|
// 7. Mutation: SoftRetire (no cascade) or RetireAgentWithCascade, with
|
|
// a single retiredAt timestamp pinned BEFORE the repo call so the
|
|
// audit event and the DB row agree to the nanosecond.
|
|
// 8. Audit: agent_retired always; agent_retirement_cascaded additionally
|
|
// on the force=true cascade path.
|
|
//
|
|
// Actor comes from the handler's resolveActor (API key → user, agent key →
|
|
// agent-<id>, unauthenticated → "anonymous"); the service does not second-
|
|
// guess it. Audit emission is best-effort: a failed RecordEvent logs a
|
|
// warning but does not fail the overall retirement, consistent with how
|
|
// the rest of the codebase treats audit as an observability concern
|
|
// rather than a correctness barrier.
|
|
func (s *AgentService) RetireAgent(ctx context.Context, id string, actor string, force bool, reason string) (*AgentRetirementResult, error) {
|
|
// Step 1 — reserved-sentinel guard. Applies even under force=true.
|
|
if domain.IsSentinelAgent(id) {
|
|
return nil, ErrAgentIsSentinel
|
|
}
|
|
|
|
// Step 2 — existence check. Missing agent surfaces the repo's not-found
|
|
// error verbatim so the handler can map it to 404 via its existing
|
|
// detection path (the handler layer already has "not found" mapping
|
|
// logic inherited from the pre-I-004 Delete endpoint).
|
|
agent, err := s.agentRepo.Get(ctx, id)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to fetch agent: %w", err)
|
|
}
|
|
|
|
// Step 3 — idempotency. A retired agent returns AlreadyRetired=true
|
|
// WITHOUT emitting a fresh audit event. Handler maps this to HTTP 204.
|
|
// Guarding here (before preflight) means a re-retire of an agent that
|
|
// now has zero deps doesn't spuriously "succeed again" and double-log.
|
|
if agent.IsRetired() {
|
|
return &AgentRetirementResult{
|
|
AlreadyRetired: true,
|
|
RetiredAt: *agent.RetiredAt,
|
|
}, nil
|
|
}
|
|
|
|
// Step 4 — preflight counts. All three run even when force=true: we
|
|
// need them to populate AgentRetirementResult.Counts (the pre-cascade
|
|
// snapshot). A repo failure here aborts the whole operation — partial
|
|
// preflight is worse than no preflight.
|
|
counts, err := s.collectAgentDependencyCounts(ctx, id)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to collect agent dependency counts: %w", err)
|
|
}
|
|
|
|
// Step 5 — force-reason guard. Positioned AFTER preflight so operators
|
|
// who forgot --reason still see accurate counts when they retry. The
|
|
// empty-reason rejection fires before any mutation, so the rejected
|
|
// attempt leaves no audit noise.
|
|
if force && reason == "" {
|
|
return nil, ErrForceReasonRequired
|
|
}
|
|
|
|
// Step 6 — default path: block on any non-zero bucket. Wrapping the
|
|
// sentinel in *BlockedByDependenciesError lets the handler use errors.As
|
|
// to surface counts in the 409 body while tests use errors.Is against
|
|
// the sentinel. Both callers are satisfied by the single Unwrap chain.
|
|
if !force && counts.HasDependencies() {
|
|
return nil, &BlockedByDependenciesError{Counts: counts}
|
|
}
|
|
|
|
// Step 7 — mutation. Pin retiredAt once so the audit event, the agent
|
|
// row, and (on cascade) every deployment_targets row share the same
|
|
// timestamp. Callers querying "what happened at T?" can correlate
|
|
// retirement rows across tables without clock-skew tie-breaking.
|
|
retiredAt := time.Now()
|
|
cascade := force && counts.HasDependencies()
|
|
|
|
if cascade {
|
|
if err := s.agentRepo.RetireAgentWithCascade(ctx, id, retiredAt, reason); err != nil {
|
|
return nil, fmt.Errorf("failed to retire agent with cascade: %w", err)
|
|
}
|
|
} else {
|
|
if err := s.agentRepo.SoftRetire(ctx, id, retiredAt, reason); err != nil {
|
|
return nil, fmt.Errorf("failed to soft-retire agent: %w", err)
|
|
}
|
|
}
|
|
|
|
// Step 8 — audit. Two events on the cascade path so forensics can
|
|
// distinguish "agent was retired" (agent_retired) from "downstream
|
|
// targets were flipped" (agent_retirement_cascaded). Details on the
|
|
// cascaded event carry the pre-cascade counts so a reviewer looking
|
|
// only at the audit log knows how much state was affected. Emission
|
|
// is best-effort — audit is observability, not a correctness barrier.
|
|
actorType := s.resolveActorType(actor)
|
|
details := map[string]interface{}{
|
|
"actor": actor,
|
|
"reason": reason,
|
|
"force": force,
|
|
"active_targets": counts.ActiveTargets,
|
|
"active_certificates": counts.ActiveCertificates,
|
|
"pending_jobs": counts.PendingJobs,
|
|
}
|
|
if err := s.auditService.RecordEvent(ctx, actor, actorType,
|
|
"agent_retired", "agent", id, details); err != nil {
|
|
slog.Error("failed to record agent_retired audit event", "agent_id", id, "error", err)
|
|
}
|
|
if cascade {
|
|
cascadeDetails := map[string]interface{}{
|
|
"actor": actor,
|
|
"reason": reason,
|
|
"active_targets": counts.ActiveTargets,
|
|
"active_certificates": counts.ActiveCertificates,
|
|
"pending_jobs": counts.PendingJobs,
|
|
}
|
|
if err := s.auditService.RecordEvent(ctx, actor, actorType,
|
|
"agent_retirement_cascaded", "agent", id, cascadeDetails); err != nil {
|
|
slog.Error("failed to record agent_retirement_cascaded audit event", "agent_id", id, "error", err)
|
|
}
|
|
}
|
|
|
|
return &AgentRetirementResult{
|
|
AlreadyRetired: false,
|
|
Cascade: cascade,
|
|
RetiredAt: retiredAt,
|
|
Counts: counts,
|
|
}, nil
|
|
}
|
|
|
|
// ListRetiredAgents returns the paginated list of retired agents in
|
|
// retired_at DESC order. This is the companion to ListAgents — which
|
|
// hides retired rows — so the operator UI can render a dedicated
|
|
// "Retired" tab without leaking retired rows into every other listing.
|
|
// Pagination defaults (page<1→1, perPage<1→50) are applied here as
|
|
// well as in the repo, so callers can pass 0s when they want defaults.
|
|
//
|
|
// Return shape harmonizes with handler.AgentService: a value slice
|
|
// (not pointer slice) and int64 total. The repo returns []*domain.Agent;
|
|
// this method dereferences into a value slice so the handler's
|
|
// PagedResponse marshals straight objects and so the compile-time
|
|
// interface assertion in agent_retire_handler_test.go:387 is satisfied.
|
|
// Nil repo entries are skipped defensively — the repo should never
|
|
// return them, but the handler contract is more important than the
|
|
// repo's (pointer-slice) convenience.
|
|
func (s *AgentService) ListRetiredAgents(ctx context.Context, page, perPage int) ([]domain.Agent, int64, error) {
|
|
if page < 1 {
|
|
page = 1
|
|
}
|
|
if perPage < 1 {
|
|
perPage = 50
|
|
}
|
|
agents, total, err := s.agentRepo.ListRetired(ctx, page, perPage)
|
|
if err != nil {
|
|
return nil, 0, fmt.Errorf("failed to list retired agents: %w", err)
|
|
}
|
|
out := make([]domain.Agent, 0, len(agents))
|
|
for _, a := range agents {
|
|
if a == nil {
|
|
continue
|
|
}
|
|
out = append(out, *a)
|
|
}
|
|
return out, int64(total), nil
|
|
}
|
|
|
|
// collectAgentDependencyCounts runs the three preflight COUNT queries in
|
|
// sequence and bundles the result. Sequential (not parallel) because the
|
|
// queries are cheap (<1ms each on the indexed columns added in 000015) and
|
|
// sequential keeps error handling simple. Any repo error short-circuits
|
|
// — we prefer to refuse the retire than make a half-informed decision.
|
|
func (s *AgentService) collectAgentDependencyCounts(ctx context.Context, id string) (domain.AgentDependencyCounts, error) {
|
|
var counts domain.AgentDependencyCounts
|
|
|
|
targets, err := s.agentRepo.CountActiveTargets(ctx, id)
|
|
if err != nil {
|
|
return counts, fmt.Errorf("count active targets: %w", err)
|
|
}
|
|
counts.ActiveTargets = targets
|
|
|
|
certs, err := s.agentRepo.CountActiveCertificates(ctx, id)
|
|
if err != nil {
|
|
return counts, fmt.Errorf("count active certificates: %w", err)
|
|
}
|
|
counts.ActiveCertificates = certs
|
|
|
|
jobs, err := s.agentRepo.CountPendingJobs(ctx, id)
|
|
if err != nil {
|
|
return counts, fmt.Errorf("count pending jobs: %w", err)
|
|
}
|
|
counts.PendingJobs = jobs
|
|
|
|
return counts, nil
|
|
}
|
|
|
|
// resolveActorType maps an opaque actor string into the typed ActorType
|
|
// used by the audit schema. Matches the conventions the rest of the
|
|
// service layer uses: "system" → System, anything that looks like an
|
|
// agent identity → Agent, everything else → User.
|
|
func (s *AgentService) resolveActorType(actor string) domain.ActorType {
|
|
switch {
|
|
case actor == "system":
|
|
return domain.ActorTypeSystem
|
|
case len(actor) > 6 && actor[:6] == "agent-":
|
|
return domain.ActorTypeAgent
|
|
default:
|
|
return domain.ActorTypeUser
|
|
}
|
|
}
|