WIP: M-1 handler sentinel error mapping (checkpoint before branch cleanup)

Uncommitted migration work at the time of branch cleanup. Tagged as
checkpoint/m1-migration-wip so the commit survives git gc --prune=now.

Session context: Phase 3 Part B+C of the M-1 sentinel error migration
was in progress. 38 modified files, 4 new files (errors.go + errors_test.go
in internal/service/ and internal/api/handler/). Resume from this commit
via 'git checkout checkpoint/m1-migration-wip'.
This commit is contained in:
shankar0123
2026-04-24 00:35:12 +00:00
parent d6959a75c1
commit 36e722ba12
42 changed files with 1319 additions and 294 deletions
+12
View File
@@ -6,10 +6,12 @@ import (
"crypto/sha256"
"encoding/base64"
"encoding/hex"
"errors"
"fmt"
"log/slog"
"time"
"github.com/lib/pq"
"github.com/shankar0123/certctl/internal/domain"
"github.com/shankar0123/certctl/internal/repository"
)
@@ -409,6 +411,16 @@ func (s *AgentService) RegisterAgent(ctx context.Context, agent domain.Agent) (*
agent.LastHeartbeatAt = &now
if err := s.agentRepo.Create(ctx, &agent); err != nil {
// M-1 (P2): explicit pg-23505 dispatch wraps unique-constraint
// violations on the agent name with ErrConflict so the handler's
// errors.Is(err, service.ErrConflict) arm fires 409 deterministically
// instead of relying on the errToStatus *pq.Error SQLSTATE fallback.
// Mirrors the duplicate-name paths in IssuerService and
// RenewalPolicyService.
var pgErr *pq.Error
if errors.As(err, &pgErr) && pgErr.Code == "23505" {
return nil, fmt.Errorf("%w: agent name already exists", ErrConflict)
}
return nil, fmt.Errorf("failed to register agent: %w", err)
}
return &agent, nil
+10 -2
View File
@@ -143,12 +143,20 @@ func (s *AgentGroupService) ListMembers(ctx context.Context, id string) ([]domai
}
// validateAgentGroup checks that an agent group's configuration is valid.
//
// M-1 (P2): every return wraps ErrValidation via %w so the handler's
// errToStatus choke point dispatches these to HTTP 400 via errors.Is without
// the substring-matching on "invalid"/"required" that the pre-M-1
// agent_groups handler relied on at handler/agent_groups.go:126. The composed
// Error() string still contains the original human-readable text, so the
// handler safely passes err.Error() through to the response body on the 400
// arm. Mirrors validateProfile.
func validateAgentGroup(g *domain.AgentGroup) error {
if g.Name == "" {
return fmt.Errorf("agent group name is required")
return fmt.Errorf("%w: agent group name is required", ErrValidation)
}
if len(g.Name) > 255 {
return fmt.Errorf("agent group name exceeds 255 characters")
return fmt.Errorf("%w: agent group name exceeds 255 characters", ErrValidation)
}
return nil
}
+24 -7
View File
@@ -26,8 +26,12 @@ import (
// three cloud secret-manager discovery sources; retiring any of them orphans
// its subsystem. The guard fires unconditionally — force=true does not bypass
// it, because a sentinel is a structural invariant of the deployment, not
// a piece of fleet state the operator owns. Handler maps this to HTTP 403.
var ErrAgentIsSentinel = errors.New("agent is a reserved sentinel and cannot be retired")
// a piece of fleet state the operator owns.
//
// M-1 (P2): wraps [ErrForbidden] so the handler-layer errToStatus choke point
// resolves HTTP 403 via errors.Is. Call sites can still errors.Is against
// ErrAgentIsSentinel for domain-specific branches.
var ErrAgentIsSentinel = fmt.Errorf("%w: agent is a reserved sentinel and cannot be retired", ErrForbidden)
// ErrBlockedByDependencies is returned by RetireAgent when at least one of
// (active targets, active certificates, pending jobs) referencing the agent
@@ -35,15 +39,22 @@ var ErrAgentIsSentinel = errors.New("agent is a reserved sentinel and cannot be
// a *BlockedByDependenciesError (see below), so handlers doing errors.As
// can surface the per-bucket counts in the 409 body for operator
// troubleshooting. Tests use errors.Is; handlers use errors.As.
var ErrBlockedByDependencies = errors.New("agent has active downstream dependencies")
//
// M-1 (P2): wraps [ErrConflict] so errToStatus resolves HTTP 409 via
// errors.Is, keeping the per-bucket errors.As path unchanged for the
// detailed 409 body. BlockedByDependenciesError.Unwrap still returns the
// domain-specific sentinel for chained Is checks.
var ErrBlockedByDependencies = fmt.Errorf("%w: agent has active downstream dependencies", ErrConflict)
// ErrForceReasonRequired is returned when force=true is supplied without a
// non-empty reason. The force escape hatch is deliberately chatty: operators
// pulling the emergency cord must leave an auditable breadcrumb explaining
// why a cascade was justified. Handler maps this to HTTP 400 so the operator
// retries with --reason rather than silently skipping the guard. Checked
// before any DB mutation to keep the no-reason path transactionally clean.
var ErrForceReasonRequired = errors.New("force=true requires a non-empty reason")
// why a cascade was justified. Operators must retry with --reason rather
// than silently skipping the guard. Checked before any DB mutation to keep
// the no-reason path transactionally clean.
//
// M-1 (P2): wraps [ErrValidation] so errToStatus resolves HTTP 400.
var ErrForceReasonRequired = fmt.Errorf("%w: force=true requires a non-empty reason", ErrValidation)
// ErrAgentRetired is returned by Heartbeat (and any future agent-authenticated
// call site) when a retired agent is still polling. The handler layer maps
@@ -52,6 +63,12 @@ var ErrForceReasonRequired = errors.New("force=true requires a non-empty reason"
// forever on a soft-retired identity. IsRetired() on the domain model is
// the single source of truth; the sentinel exists so service and handler
// callers can errors.Is against one symbol.
//
// M-1 (P2): deliberately NOT wrapped under any generic sentinel. 410 Gone is
// semantically distinct from 403/404/409 — it signals a permanently-terminated
// resource identity and drives deterministic agent-binary shutdown (see
// cmd/agent/main.go). errToStatus tests it FIRST so it cannot be demoted by
// the generic cascade.
var ErrAgentRetired = errors.New("agent has been retired")
// BlockedByDependenciesError wraps ErrBlockedByDependencies and carries the
+10 -1
View File
@@ -143,6 +143,15 @@ func (s *AuditService) ListAuditEvents(ctx context.Context, page, perPage int) (
}
// GetAuditEvent returns a single audit event (handler interface method).
//
// M-1 (P2): the pre-M-1 zero-events path returned a bare
// `fmt.Errorf("audit event not found")` and the handler dispatched via a
// blanket `any error → 404 Audit event not found` shortcut. That silently
// demoted transient DB failures from the auditRepo.List wrap (line 154 above)
// to 404 Not Found. Now the zero-events path wraps ErrNotFound via %w so
// errors.Is(err, service.ErrNotFound) picks up the real 404 at the handler's
// errToStatus choke point, and the repo.List wrap surfaces as 500 with
// server-side slog.Error capture (F-002 redacted-500 pattern preserved).
func (s *AuditService) GetAuditEvent(ctx context.Context, id string) (*domain.AuditEvent, error) {
filter := &repository.AuditFilter{
ResourceID: id,
@@ -155,7 +164,7 @@ func (s *AuditService) GetAuditEvent(ctx context.Context, id string) (*domain.Au
}
if len(events) == 0 {
return nil, fmt.Errorf("audit event not found")
return nil, fmt.Errorf("%w: audit event not found", ErrNotFound)
}
return events[0], nil
+16 -2
View File
@@ -43,6 +43,15 @@ func (s *CAOperationsSvc) SetIssuerRegistry(registry *IssuerRegistry) {
// GenerateDERCRL generates a DER-encoded X.509 CRL for the given issuer.
// Short-lived certificates (profile TTL < 1 hour) are excluded from the CRL.
//
// M-1 (P2): the pre-M-1 issuer-not-found path returned a bare
// `fmt.Errorf("issuer not found: %s", issuerID)` with no sentinel wrap. A
// pre-M-1 handler strings.Contains(err.Error(), "not found") classifier would
// sweep both this path and any transient error whose text happened to mention
// "not found" into 404. Post-M-1 this path wraps service.ErrNotFound via %w so
// errors.Is picks up the genuine 404 at errToStatus, and truly-unexpected
// errors (e.g., registry misconfigured) surface as 500 via the generic-error
// fallback.
func (s *CAOperationsSvc) GenerateDERCRL(ctx context.Context, issuerID string) ([]byte, error) {
if s.revocationRepo == nil {
return nil, fmt.Errorf("revocation repository not configured")
@@ -53,7 +62,7 @@ func (s *CAOperationsSvc) GenerateDERCRL(ctx context.Context, issuerID string) (
issuerConn, ok := s.issuerRegistry.Get(issuerID)
if !ok {
return nil, fmt.Errorf("issuer not found: %s", issuerID)
return nil, fmt.Errorf("%w: issuer %s", ErrNotFound, issuerID)
}
// Scope the query to this issuer so the migration 000012 composite index
@@ -98,6 +107,11 @@ func (s *CAOperationsSvc) GenerateDERCRL(ctx context.Context, issuerID string) (
}
// GetOCSPResponse generates a signed OCSP response for the given certificate serial.
//
// M-1 (P2): see GenerateDERCRL above — same sentinel-wrap rationale applies.
// The issuer-not-found path wraps service.ErrNotFound via %w so errors.Is
// picks up the genuine 404 at errToStatus; transient/misconfiguration errors
// surface as 500.
func (s *CAOperationsSvc) GetOCSPResponse(ctx context.Context, issuerID string, serialHex string) ([]byte, error) {
if s.revocationRepo == nil {
return nil, fmt.Errorf("revocation repository not configured")
@@ -108,7 +122,7 @@ func (s *CAOperationsSvc) GetOCSPResponse(ctx context.Context, issuerID string,
issuerConn, ok := s.issuerRegistry.Get(issuerID)
if !ok {
return nil, fmt.Errorf("issuer not found: %s", issuerID)
return nil, fmt.Errorf("%w: issuer %s", ErrNotFound, issuerID)
}
serial := new(big.Int)
+19 -4
View File
@@ -376,10 +376,21 @@ func (s *CertificateService) CreateCertificate(ctx context.Context, cert domain.
// UpdateCertificate modifies a certificate (handler interface method).
func (s *CertificateService) UpdateCertificate(ctx context.Context, id string, patch domain.ManagedCertificate) (*domain.ManagedCertificate, error) {
// Fetch existing certificate so partial updates don't zero out fields
// Fetch existing certificate so partial updates don't zero out fields.
//
// M-1 (P2): the pre-M-1 wrap was `"certificate not found: %w"` on every
// certRepo.Get error — which coupled to the handler's strings.Contains
// substring classifier on "not found" and gave false positives on transient
// DB failures (connection refused, context deadline, etc.), demoting a 500
// to a 404. Now the repo wraps only the genuine sql.ErrNoRows path with
// repository.ErrNotFound (certificate.go Get), so the errors.Is walk
// through the handler's errToStatus choke point discriminates correctly:
// truly-missing → 404, everything else → 500 (the intended outcome). The
// wrap text is changed from "certificate not found" to "failed to get
// certificate" to match the semantic.
existing, err := s.certRepo.Get(ctx, id)
if err != nil {
return nil, fmt.Errorf("certificate not found: %w", err)
return nil, fmt.Errorf("failed to get certificate: %w", err)
}
// Merge non-zero fields from patch into existing
@@ -501,10 +512,14 @@ func (s *CertificateService) GetOCSPResponse(ctx context.Context, issuerID strin
// GetCertificateDeployments returns all deployment targets for a certificate (M20).
func (s *CertificateService) GetCertificateDeployments(ctx context.Context, certID string) ([]domain.DeploymentTarget, error) {
// Verify certificate exists
// Verify certificate exists. See M-1 (P2) note on UpdateCertificate for
// the wrap-text correction rationale — same treatment applies here: the
// repo's sql.ErrNoRows wrap with repository.ErrNotFound escapes cleanly
// through fmt.Errorf("%w", ...) so errors.Is picks up the genuine 404
// while transient DB errors correctly surface as 500.
_, err := s.certRepo.Get(ctx, certID)
if err != nil {
return nil, fmt.Errorf("certificate not found: %w", err)
return nil, fmt.Errorf("failed to get certificate: %w", err)
}
if s.targetRepo == nil {
+13 -2
View File
@@ -166,9 +166,20 @@ func (s *DiscoveryService) ClaimDiscovered(ctx context.Context, id string, manag
return fmt.Errorf("failed to get discovered certificate: %w", err)
}
// Verify the managed cert exists
// Verify the managed cert exists.
//
// M-1 (P2): the pre-M-1 wrap was a bare `fmt.Errorf("managed certificate not
// found: %s", managedCertID)` with no %w and no sentinel. That threw away the
// underlying error entirely — a transient DB failure from certRepo.Get (e.g.,
// connection dropped, query timeout) would be flattened into the same "not
// found" string as a genuine sql.ErrNoRows, which the pre-M-1 handler
// strings.Contains classifier then swept into 404. Post-M-1 we propagate the
// underlying error via %w so errors.Is walks the chain correctly:
// certRepo.Get wraps genuine sql.ErrNoRows with repository.ErrNotFound (see
// postgres/certificate.go Get), and errToStatus dispatches that to 404;
// transient errors surface as 500 via the generic-error fallback.
if _, err := s.certRepo.Get(ctx, managedCertID); err != nil {
return fmt.Errorf("managed certificate not found: %s", managedCertID)
return fmt.Errorf("failed to get managed certificate %s: %w", managedCertID, err)
}
if err := s.discoveryRepo.UpdateDiscoveredStatus(ctx, id, domain.DiscoveryStatusManaged, managedCertID); err != nil {
+73
View File
@@ -0,0 +1,73 @@
package service
import "errors"
// M-1 (P2) coverage-gap closure: generic service-layer error sentinels.
//
// Before M-1, API handlers classified service errors by substring-matching the
// wrapped message text (`strings.Contains(err.Error(), "not found")` and
// friends). That made every HTTP status mapping one `fmt.Errorf` reword away
// from silently regressing — including the M-003 self-approval privilege
// boundary, where `handler/jobs.go:174/:220` ignored the already-defined
// ErrSelfApproval sentinel and relied on the literal string "cannot approve".
//
// These six generic sentinels form the type-safe surface the handler layer
// dispatches against via `errors.Is`. Domain-specific sentinels (ErrSelfApproval,
// ErrAgentIsSentinel, ErrBlockedByDependencies, ErrForceReasonRequired,
// ErrAgentNotFound) are declared in their own topical files (job.go,
// agent_retire.go, target.go) and wrap one of these generics via
// `fmt.Errorf("%w: ...", ErrForbidden)`. The wrap chain lets call sites continue
// to `errors.Is(err, ErrSelfApproval)` for domain-specific logic while the
// handler's single choke point in `api/handler/errors.go` can match on the
// generic sentinel to pick the HTTP status.
//
// Dispatch order in errToStatus matters — see the doc block at the top of
// `internal/api/handler/errors.go`.
//
// ErrAgentRetired is deliberately NOT wrapped here. 410 Gone is semantically
// distinct from 403/404/409 and must short-circuit the generic dispatch. Keep
// its standalone declaration in agent_retire.go untouched; errToStatus tests
// it first.
var (
// ErrNotFound indicates a lookup for a resource that does not exist.
// Handlers translate this to HTTP 404.
ErrNotFound = errors.New("not found")
// ErrValidation indicates malformed, missing, or out-of-range input from
// the caller. Handlers translate this to HTTP 400.
ErrValidation = errors.New("validation failed")
// ErrConflict indicates a state conflict: unique-constraint violation,
// foreign-key dependency, or a state machine transition that is not
// allowed from the current state. Handlers translate this to HTTP 409.
ErrConflict = errors.New("conflict")
// ErrForbidden indicates an authorization / privilege-boundary denial.
// The caller is authenticated but is not permitted to perform the action.
// Handlers translate this to HTTP 403.
ErrForbidden = errors.New("forbidden")
// ErrUnauthenticated indicates the caller failed to authenticate — most
// commonly a SCEP challenge-password mismatch, where the transport itself
// is valid but the application-layer credential is wrong. Handlers
// translate this to HTTP 401.
ErrUnauthenticated = errors.New("unauthenticated")
// ErrNotImplemented indicates the requested operation is defined but not
// yet wired up — reserved for feature-flag-gated code paths. Handlers
// translate this to HTTP 501.
ErrNotImplemented = errors.New("not implemented")
// ErrUnprocessable indicates the request was well-formed and the
// referenced resource exists, but server-side stored data could not be
// processed — e.g., a certificate PEM in inventory that fails X.509
// decoding because the stored blob is corrupt or was inserted with the
// wrong encoding. Distinct from ErrValidation: ErrValidation means the
// caller sent bad input (400), while ErrUnprocessable means the caller's
// input was fine but our own data cannot satisfy the operation (422
// Unprocessable Entity). Today the only call site is ExportPKCS12's parse
// path in internal/service/export.go; keeping the sentinel generic so
// other "stored-data-unparseable" paths can reuse it without inventing a
// second variant.
ErrUnprocessable = errors.New("unprocessable")
)
+110
View File
@@ -0,0 +1,110 @@
package service
import (
"errors"
"fmt"
"testing"
)
// TestGenericSentinels_IdentityDistinct guards against an accidental
// `var ErrX = ErrY` alias where two generic sentinels share identity. Each
// must be a distinct error value so errors.Is dispatch in errToStatus can
// route them to different HTTP status codes.
func TestGenericSentinels_IdentityDistinct(t *testing.T) {
sentinels := []struct {
name string
err error
}{
{"ErrNotFound", ErrNotFound},
{"ErrValidation", ErrValidation},
{"ErrConflict", ErrConflict},
{"ErrForbidden", ErrForbidden},
{"ErrUnauthenticated", ErrUnauthenticated},
{"ErrNotImplemented", ErrNotImplemented},
}
for i := range sentinels {
for j := range sentinels {
if i == j {
continue
}
if errors.Is(sentinels[i].err, sentinels[j].err) {
t.Errorf("%s and %s alias the same error value — each generic sentinel must be distinct",
sentinels[i].name, sentinels[j].name)
}
}
}
}
// TestWrappedSentinels_ChainWalk is the core M-1 invariant: wrapping a
// domain-specific sentinel under a generic sentinel via fmt.Errorf("%w: ...")
// must preserve BOTH identities on the wrap chain. Call sites that check
// errors.Is(err, ErrSelfApproval) for domain logic AND the handler-layer
// errToStatus that checks errors.Is(err, ErrForbidden) for the HTTP status
// both need to succeed on the same error value.
//
// If this test fails, every handler dispatch that routes through errToStatus
// is silently broken.
func TestWrappedSentinels_ChainWalk(t *testing.T) {
cases := []struct {
name string
err error
generic error
}{
{"ErrSelfApproval wraps ErrForbidden", ErrSelfApproval, ErrForbidden},
{"ErrAgentIsSentinel wraps ErrForbidden", ErrAgentIsSentinel, ErrForbidden},
{"ErrBlockedByDependencies wraps ErrConflict", ErrBlockedByDependencies, ErrConflict},
{"ErrForceReasonRequired wraps ErrValidation", ErrForceReasonRequired, ErrValidation},
{"ErrAgentNotFound wraps ErrValidation", ErrAgentNotFound, ErrValidation},
}
for _, c := range cases {
t.Run(c.name, func(t *testing.T) {
if !errors.Is(c.err, c.generic) {
t.Errorf("errors.Is(%v, %v) = false; want true", c.err, c.generic)
}
if !errors.Is(c.err, c.err) {
t.Errorf("errors.Is(err, err) = false; want true — domain sentinel lost self-identity after wrap")
}
})
}
}
// TestErrAgentRetired_StandaloneGone locks the 410 Gone semantics in place.
// ErrAgentRetired MUST NOT wrap any generic sentinel — 410 is semantically
// distinct from 403/404/409 (permanently-terminated resource identity) and
// the errToStatus dispatch tests it FIRST before any generic check. If this
// test goes red because someone wrapped it under ErrForbidden or ErrNotFound,
// the agent-binary shutdown behavior at cmd/agent/main.go:1291 silently
// regresses.
func TestErrAgentRetired_StandaloneGone(t *testing.T) {
if errors.Is(ErrAgentRetired, ErrForbidden) {
t.Error("ErrAgentRetired must NOT wrap ErrForbidden — 410 Gone would demote to 403")
}
if errors.Is(ErrAgentRetired, ErrNotFound) {
t.Error("ErrAgentRetired must NOT wrap ErrNotFound — 410 Gone would demote to 404")
}
if errors.Is(ErrAgentRetired, ErrConflict) {
t.Error("ErrAgentRetired must NOT wrap ErrConflict — 410 Gone would demote to 409")
}
if errors.Is(ErrAgentRetired, ErrValidation) {
t.Error("ErrAgentRetired must NOT wrap ErrValidation — 410 Gone would demote to 400")
}
if !errors.Is(ErrAgentRetired, ErrAgentRetired) {
t.Error("ErrAgentRetired lost self-identity")
}
}
// TestSentinels_SurviveDoubleWrap verifies that wrapping a sentinel-wrapped
// error a SECOND time (e.g., a call site doing fmt.Errorf("%w: ctx-info",
// ErrSelfApproval)) preserves both the domain sentinel and the generic
// sentinel. This is critical because the errToStatus dispatch order places
// ErrForbidden BEFORE ErrValidation — if double-wrapping broke the chain,
// the M-003 gate would demote to 400.
func TestSentinels_SurviveDoubleWrap(t *testing.T) {
doubled := fmt.Errorf("%w: additional context from call site", ErrSelfApproval)
if !errors.Is(doubled, ErrSelfApproval) {
t.Error("double-wrapped ErrSelfApproval lost domain identity")
}
if !errors.Is(doubled, ErrForbidden) {
t.Error("double-wrapped ErrSelfApproval lost ErrForbidden wrap — M-003 gate would demote to 500")
}
}
+37 -11
View File
@@ -38,16 +38,31 @@ type ExportPEMResult struct {
// ExportPEM returns the PEM-encoded certificate and chain for the latest version.
func (s *ExportService) ExportPEM(ctx context.Context, certID string) (*ExportPEMResult, error) {
// Verify certificate exists
// Verify certificate exists.
//
// M-1 (P2): the pre-M-1 wrap was `"certificate not found: %w"` on every
// certRepo.Get error — which gave the handler's strings.Contains(err.Error(),
// "not found") check a false positive on transient DB failures (connection
// refused, context deadline, etc.), demoting a 500 to a 404. Now the repo
// wraps only the genuine sql.ErrNoRows path with repository.ErrNotFound
// (certificate.go Get), so the errors.Is walk through the handler's
// errToStatus discriminates correctly: truly-missing → 404, everything else
// → 500 (the intended outcome). The wrap text is changed from "certificate
// not found" to "failed to get certificate" to match the semantic.
cert, err := s.certRepo.Get(ctx, certID)
if err != nil {
return nil, fmt.Errorf("certificate not found: %w", err)
return nil, fmt.Errorf("failed to get certificate: %w", err)
}
// Get latest version (contains the PEM chain)
// Get latest version (contains the PEM chain).
//
// M-1 (P2): same wrap-text correction as above — pre-M-1 any
// GetLatestVersion error surfaced as "no certificate version found",
// which bled into the handler's substring classifier. Now the repo wraps
// sql.ErrNoRows with repository.ErrNotFound; the wrap chain walks cleanly.
version, err := s.certRepo.GetLatestVersion(ctx, certID)
if err != nil {
return nil, fmt.Errorf("no certificate version found: %w", err)
return nil, fmt.Errorf("failed to get latest certificate version: %w", err)
}
// Split PEM chain into leaf cert + chain
@@ -73,26 +88,37 @@ func (s *ExportService) ExportPEM(ctx context.Context, certID string) (*ExportPE
// The private key is NOT included — it lives on the agent and never touches the control plane.
// The PKCS#12 bundle is encrypted with the provided password (can be empty for cert-only bundles).
func (s *ExportService) ExportPKCS12(ctx context.Context, certID string, password string) ([]byte, error) {
// Verify certificate exists
// Verify certificate exists. See M-1 (P2) note on ExportPEM for the wrap-text
// correction — same rationale applies here.
cert, err := s.certRepo.Get(ctx, certID)
if err != nil {
return nil, fmt.Errorf("certificate not found: %w", err)
return nil, fmt.Errorf("failed to get certificate: %w", err)
}
// Get latest version
// Get latest version. Same wrap-text correction as ExportPEM.
version, err := s.certRepo.GetLatestVersion(ctx, certID)
if err != nil {
return nil, fmt.Errorf("no certificate version found: %w", err)
return nil, fmt.Errorf("failed to get latest certificate version: %w", err)
}
// Parse PEM chain into x509.Certificate objects
// Parse PEM chain into x509.Certificate objects.
//
// M-1 (P2): wrap both parse-failure paths with ErrUnprocessable so the
// handler's errToStatus choke point dispatches to 422 Unprocessable Entity
// via errors.Is instead of the pre-M-1 two-term substring net
// (`"cannot be parsed"|"no certificates found"`) at handler/export.go:101.
// 422 is the correct status here — the caller's request is syntactically
// fine; the stored PEM chain is what can't be processed. The composed
// Error() string still carries the "certificate data cannot be parsed as
// X.509"/"no certificates found in PEM chain" wording so server-side
// slog.Error capture and any future 422 body propagation stay readable.
certs, err := parsePEMCertificates(version.PEMChain)
if err != nil {
return nil, fmt.Errorf("certificate data cannot be parsed as X.509: %w", err)
return nil, fmt.Errorf("%w: certificate data cannot be parsed as X.509: %v", ErrUnprocessable, err)
}
if len(certs) == 0 {
return nil, fmt.Errorf("no certificates found in PEM chain")
return nil, fmt.Errorf("%w: no certificates found in PEM chain", ErrUnprocessable)
}
// Build PKCS#12 bundle: leaf cert + CA chain (no private key)
+9 -1
View File
@@ -262,10 +262,18 @@ func (s *IssuerService) Delete(ctx context.Context, id string, actor string) err
// TestConnection tests the connection to an issuer by instantiating a throwaway
// connector and calling ValidateConfig. Records the result in the database.
//
// M-1 (P2): the pre-M-1 wrap was `"issuer not found: %w"` on every
// issuerRepo.Get error — which coupled to handler substring classifiers on
// "not found" and could have demoted transient DB failures to 404. Now the
// repo wraps only the genuine sql.ErrNoRows path with repository.ErrNotFound
// (postgres/issuer.go Get), so errors.Is walks the wrap chain correctly:
// truly-missing → 404, everything else → 500. The wrap text is changed from
// "issuer not found" to "failed to get issuer" to match the semantic.
func (s *IssuerService) TestConnection(ctx context.Context, id string) error {
iss, err := s.issuerRepo.Get(ctx, id)
if err != nil {
return fmt.Errorf("issuer not found: %w", err)
return fmt.Errorf("failed to get issuer: %w", err)
}
// Get the decrypted config
+24 -5
View File
@@ -3,7 +3,6 @@ package service
import (
"time"
"context"
"errors"
"fmt"
"log/slog"
"strings"
@@ -16,8 +15,16 @@ import (
// approve a renewal job is the same person listed as the owner of the
// underlying certificate. M-003 enforces separation of duties: the owner who
// requested (or benefits from) the renewal must not be the same identity that
// approves it. Handlers map this sentinel to HTTP 403 Forbidden.
var ErrSelfApproval = errors.New("self-approval forbidden: actor is the owner of the certificate")
// approves it.
//
// M-1 (P2): wraps [ErrForbidden] via fmt.Errorf("%w", ...) so the single
// handler-layer errToStatus choke point resolves HTTP 403 via
// errors.Is(err, ErrForbidden). Call sites can continue to
// errors.Is(err, ErrSelfApproval) for domain-specific logging — both succeed
// because %w builds a wrap chain. Pre-M-1, handler/jobs.go classified the
// failure by substring-matching "cannot approve" in the message; a reword
// would have silently demoted 403 to 500 with no observable signal.
var ErrSelfApproval = fmt.Errorf("%w: self-approval forbidden: actor is the owner of the certificate", ErrForbidden)
// JobService manages job processing and status tracking.
// It coordinates between the scheduler and various job-specific services.
@@ -403,9 +410,17 @@ func (s *JobService) GetJob(ctx context.Context, id string) (*domain.Job, error)
// duties. Callers must pass a non-empty actor; empty actor is treated as an
// anonymous system caller and permitted (internal/system paths).
func (s *JobService) ApproveJob(ctx context.Context, id, actor string) error {
// M-1 (P2): the pre-M-1 wrap was `"job not found: %w"` on every jobRepo.Get
// error — which coupled to the handler's strings.Contains substring classifier
// on "not found" and gave false positives on transient DB failures, demoting a
// 500 to a 404. Now the repo wraps only the genuine sql.ErrNoRows path with
// repository.ErrNotFound (postgres/job.go Get), so errors.Is walks the wrap
// chain correctly: truly-missing → 404, everything else → 500. The wrap text
// is changed from "job not found" to "failed to get job" to match the
// semantic.
job, err := s.jobRepo.Get(ctx, id)
if err != nil {
return fmt.Errorf("job not found: %w", err)
return fmt.Errorf("failed to get job: %w", err)
}
if job.Status != domain.JobStatusAwaitingApproval {
@@ -435,9 +450,13 @@ func (s *JobService) ApproveJob(ctx context.Context, id, actor string) error {
// owner is permitted to cancel their own pending renewal. actor is recorded
// on the log line for audit attribution.
func (s *JobService) RejectJob(ctx context.Context, id, reason, actor string) error {
// M-1 (P2): wrap-text correction — see ApproveJob for rationale. Same
// repo.Get sql.ErrNoRows → repository.ErrNotFound propagation; text changed
// from "job not found" to "failed to get job" so transient DB errors stay
// 500 while genuine 404s are picked up via errors.Is at errToStatus.
job, err := s.jobRepo.Get(ctx, id)
if err != nil {
return fmt.Errorf("job not found: %w", err)
return fmt.Errorf("failed to get job: %w", err)
}
if job.Status != domain.JobStatusAwaitingApproval {
+17 -1
View File
@@ -383,6 +383,15 @@ func (s *NotificationService) ListNotifications(ctx context.Context, page, perPa
}
// GetNotification returns a single notification (handler interface method).
//
// M-1 (P2): the not-found branch now wraps service.ErrNotFound via %w so the
// handler's errToStatus choke point dispatches to 404 via errors.Is. Pre-M-1
// this returned a raw fmt.Errorf("notification not found") and the handler
// relied on a blanket "any error from this service call → 404" shortcut —
// which silently demoted transient DB failures (the List() error wrap above)
// to 404. Post-M-1 the handler uses errToStatus; the List() error wrap still
// surfaces as 500 (no sentinel attached), and only this explicit not-found
// branch maps to 404.
func (s *NotificationService) GetNotification(ctx context.Context, id string) (*domain.NotificationEvent, error) {
filter := &repository.NotificationFilter{
PerPage: 1,
@@ -400,7 +409,7 @@ func (s *NotificationService) GetNotification(ctx context.Context, id string) (*
}
}
return nil, fmt.Errorf("notification not found")
return nil, fmt.Errorf("%w: notification %s", ErrNotFound, id)
}
// MarkAsRead marks a notification as read (handler interface method).
@@ -568,6 +577,13 @@ func (s *NotificationService) RetryFailedNotifications(ctx context.Context) erro
// "pg: deadlock detected" surfaces in the handler response and the
// operator UI. The service has no fallback — a silent "success" that
// didn't actually mutate the row would be worse than a loud error.
//
// M-1 (P2): the repo's Requeue (postgres/notification.go) now wraps
// repository.ErrNotFound on zero-rows-affected. Because this method wraps
// the repo error with %w, errors.Is(err, repository.ErrNotFound) walks
// cleanly through both wrap layers — so the handler's errToStatus choke
// point dispatches a genuine missing-row Requeue to 404 and a transient
// "pg: deadlock detected" to 500 without substring matching.
func (s *NotificationService) RequeueNotification(ctx context.Context, id string) error {
if err := s.notifRepo.Requeue(ctx, id); err != nil {
return fmt.Errorf("failed to requeue notification: %w", err)
+17 -8
View File
@@ -139,42 +139,51 @@ func (s *ProfileService) Get(ctx context.Context, id string) (*domain.Certificat
}
// validateProfile checks that a profile's configuration is valid.
//
// M-1: every return here wraps ErrValidation via %w so the handler's
// errToStatus choke point dispatches these to HTTP 400 via errors.Is without
// the substring-matching on "invalid"/"required"/"must be"/"cannot" that the
// pre-M-1 profile handler relied on. The composed Error() string still
// contains the original human-readable text (e.g. "RSA minimum key size must
// be at least 2048"), so the handler safely passes err.Error() through to the
// response body on the 400 arm. Substring assertions in profile_test.go
// continue to match for the same reason.
func validateProfile(p *domain.CertificateProfile) error {
if p.Name == "" {
return fmt.Errorf("profile name is required")
return fmt.Errorf("%w: profile name is required", ErrValidation)
}
if len(p.Name) > 255 {
return fmt.Errorf("profile name exceeds 255 characters")
return fmt.Errorf("%w: profile name exceeds 255 characters", ErrValidation)
}
// Validate key algorithms
for _, alg := range p.AllowedKeyAlgorithms {
if !domain.ValidKeyAlgorithms[alg.Algorithm] {
return fmt.Errorf("invalid key algorithm: %s (allowed: RSA, ECDSA, Ed25519)", alg.Algorithm)
return fmt.Errorf("%w: invalid key algorithm: %s (allowed: RSA, ECDSA, Ed25519)", ErrValidation, alg.Algorithm)
}
if alg.Algorithm == domain.KeyAlgorithmRSA && alg.MinSize < 2048 {
return fmt.Errorf("RSA minimum key size must be at least 2048, got %d", alg.MinSize)
return fmt.Errorf("%w: RSA minimum key size must be at least 2048, got %d", ErrValidation, alg.MinSize)
}
if alg.Algorithm == domain.KeyAlgorithmECDSA && alg.MinSize < 256 {
return fmt.Errorf("ECDSA minimum key size must be at least 256, got %d", alg.MinSize)
return fmt.Errorf("%w: ECDSA minimum key size must be at least 256, got %d", ErrValidation, alg.MinSize)
}
}
// Validate EKUs
for _, eku := range p.AllowedEKUs {
if !domain.ValidEKUs[eku] {
return fmt.Errorf("invalid EKU: %s", eku)
return fmt.Errorf("%w: invalid EKU: %s", ErrValidation, eku)
}
}
// Validate max TTL
if p.MaxTTLSeconds < 0 {
return fmt.Errorf("max_ttl_seconds cannot be negative")
return fmt.Errorf("%w: max_ttl_seconds cannot be negative", ErrValidation)
}
// Validate short-lived consistency
if p.AllowShortLived && p.MaxTTLSeconds >= 3600 {
return fmt.Errorf("allow_short_lived is true but max_ttl_seconds (%d) is >= 3600; short-lived certs must have TTL under 1 hour", p.MaxTTLSeconds)
return fmt.Errorf("%w: allow_short_lived is true but max_ttl_seconds (%d) is >= 3600; short-lived certs must have TTL under 1 hour", ErrValidation, p.MaxTTLSeconds)
}
return nil
+14 -2
View File
@@ -84,10 +84,22 @@ func (s *SCEPService) PKCSReq(ctx context.Context, csrPEM string, challengePassw
// this branch also protects future call sites (tests, library reuse, a
// future REST-over-HTTPS wrapper) from silently accepting unauthenticated
// CSRs.
//
// M-1 (P2): both failure modes now wrap service.ErrUnauthenticated via %w so
// the handler's errToStatus choke point dispatches them to HTTP 401 via
// errors.Is instead of relying on a `strings.Contains(err.Error(), "challenge
// password")` substring branch at handler/scep.go:174. This is a deliberate
// semantic correction: the pre-M-1 substring branch returned 403 Forbidden,
// but SCEP challenge-password failure is an authentication failure (the
// caller has no valid credential at the application layer), not an
// authorization denial (the caller has a credential but is not permitted),
// and 401 Unauthorized is the correct RFC 7235 status. The errToStatus doc
// block explicitly cites this site as the canonical ErrUnauthenticated use
// case.
if s.challengePassword == "" {
s.logger.Warn("SCEP enrollment rejected: server has no challenge password configured",
"transaction_id", transactionID)
return nil, fmt.Errorf("SCEP challenge password not configured on server")
return nil, fmt.Errorf("%w: SCEP challenge password not configured on server", ErrUnauthenticated)
}
// Constant-time compare avoids leaking the configured secret through
// response-time variance. ConstantTimeCompare returns 1 only when both
@@ -96,7 +108,7 @@ func (s *SCEPService) PKCSReq(ctx context.Context, csrPEM string, challengePassw
if subtle.ConstantTimeCompare([]byte(challengePassword), []byte(s.challengePassword)) != 1 {
s.logger.Warn("SCEP enrollment rejected: invalid challenge password",
"transaction_id", transactionID)
return nil, fmt.Errorf("invalid challenge password")
return nil, fmt.Errorf("%w: invalid challenge password", ErrUnauthenticated)
}
return s.processEnrollment(ctx, csrPEM, transactionID, "scep_pkcsreq")
+19
View File
@@ -148,6 +148,13 @@ func TestSCEPService_PKCSReq_ChallengePassword_Invalid(t *testing.T) {
if err == nil {
t.Fatal("expected error for invalid challenge password")
}
// M-1 (P2): pin the sentinel wrap so the handler's errToStatus dispatch
// (errors.Is → 401 Unauthorized) stays contract-stable. The parallel
// substring check is retained because it also pins the user-facing reason
// that service.ErrUnauthenticated composes with via fmt.Errorf("%w: ...").
if !errors.Is(err, ErrUnauthenticated) {
t.Errorf("expected errors.Is(err, ErrUnauthenticated), got: %v", err)
}
if !strings.Contains(err.Error(), "challenge password") {
t.Errorf("expected 'challenge password' in error, got: %v", err)
}
@@ -171,6 +178,12 @@ func TestSCEPService_PKCSReq_ChallengePassword_EmptyServerConfigRejected(t *test
if err == nil {
t.Fatalf("expected rejection when server challenge password is empty (client=%q)", clientPassword)
}
// M-1 (P2): pin the sentinel wrap on the "no shared secret configured"
// branch so both H-2 failure modes route to the same 401 arm at the
// handler. Preserves the explanatory substring assertion alongside.
if !errors.Is(err, ErrUnauthenticated) {
t.Errorf("expected errors.Is(err, ErrUnauthenticated) for client=%q, got: %v", clientPassword, err)
}
if !strings.Contains(err.Error(), "not configured") {
t.Errorf("expected 'not configured' in error, got: %v", err)
}
@@ -193,6 +206,12 @@ func TestSCEPService_PKCSReq_ChallengePassword_ConstantTimeLengthIndependence(t
if err == nil {
t.Fatalf("expected rejection for bad password %q", bad)
}
// M-1 (P2): pin sentinel-wrap on every bad-password input so both the
// length-mismatch and content-mismatch ConstantTimeCompare paths map to
// the same 401 arm.
if !errors.Is(err, ErrUnauthenticated) {
t.Errorf("expected errors.Is(err, ErrUnauthenticated) for %q, got: %v", bad, err)
}
if !strings.Contains(err.Error(), "invalid challenge password") {
t.Errorf("expected 'invalid challenge password' for %q, got: %v", bad, err)
}
+21 -4
View File
@@ -3,7 +3,6 @@ package service
import (
"context"
"encoding/json"
"errors"
"fmt"
"log/slog"
"time"
@@ -18,7 +17,15 @@ import (
// agent. The handler layer maps this to HTTP 400 via [errors.Is]. See C-002 in
// cowork/certctl-coverage-gap-audit.md — this sentinel replaces a silent
// Postgres FK violation (23503 → HTTP 500) with a deterministic 400.
var ErrAgentNotFound = errors.New("referenced agent does not exist")
//
// M-1 (P2): wraps [ErrValidation] via fmt.Errorf("%w", ...) so the single
// handler-layer errToStatus choke point resolves HTTP 400 via
// errors.Is(err, ErrValidation). "Referenced FK does not exist in POST body"
// is invalid input — not a missing-resource lookup — which is why this wraps
// ErrValidation (400) rather than ErrNotFound (404). Pre-M-1, target handlers
// classified this via strings.Contains(err.Error(), "not found"); a message
// reword would have silently promoted 400 to 500 with no observable signal.
var ErrAgentNotFound = fmt.Errorf("%w: referenced agent does not exist", ErrValidation)
// validTargetTypes is the set of allowed target types for validation.
var validTargetTypes = map[domain.TargetType]bool{
@@ -215,10 +222,20 @@ func (s *TargetService) Delete(ctx context.Context, id string, actor string) err
// TestConnection tests a target's connectivity by checking the assigned agent's heartbeat status.
// Target connectors run on agents, not on the server, so we can't instantiate a connector here.
// Instead, we verify the agent is online and reachable.
//
// M-1 (P2): the pre-M-1 wraps were `"target not found: %w"` and
// `"assigned agent not found: %w"` on every targetRepo.Get / agentRepo.Get error
// — which coupled to pre-M-1 handler strings.Contains substring classifiers on
// "not found" and gave false positives on transient DB failures. Now both
// repos wrap only the genuine sql.ErrNoRows path with repository.ErrNotFound,
// so errors.Is walks the chain correctly: truly-missing → 404, everything else
// → 500. The wrap text is changed to "failed to get target" / "failed to get
// agent" to match the semantic (agent-misconfigured stays 500 — "assigned
// agent" is a data-integrity issue, not a consumer 404).
func (s *TargetService) TestConnection(ctx context.Context, id string) error {
target, err := s.targetRepo.Get(ctx, id)
if err != nil {
return fmt.Errorf("target not found: %w", err)
return fmt.Errorf("failed to get target: %w", err)
}
if target.AgentID == "" {
@@ -229,7 +246,7 @@ func (s *TargetService) TestConnection(ctx context.Context, id string) error {
agent, err := s.agentRepo.Get(ctx, target.AgentID)
if err != nil {
s.updateTestStatus(ctx, target, "failed")
return fmt.Errorf("assigned agent not found: %w", err)
return fmt.Errorf("failed to get assigned agent: %w", err)
}
// I-004: AgentRepository.Get intentionally surfaces retired rows (the banner