mirror of
https://github.com/shankar0123/certctl.git
synced 2026-06-07 13:51:36 +00:00
envoy: atomic SDS JSON write + post-deploy watcher pickup poll
Closes Bundle 3 of the 2026-05-02 deployment-target coverage audit (see cowork/deployment-target-audit-2026-05-02/RESULTS.md). The audit ranked this fix #3 by acquirer impact behind the K8s real client (#1) and the docs realignment (#2 / Bundle 1). Two production-grade gaps closed: 1. SDS JSON config write was non-atomic. Cert/key/chain at envoy.go L155/L168/L183 went through deploy.AtomicWriteFile (atomic + backups + ownership preservation), but the SDS JSON at L260 went through os.WriteFile directly. A power loss / OOM / process-kill mid-write of the SDS JSON produces a torn file Envoy cannot parse, and Envoy's file-based SDS watcher refuses to load any cert (not just the rotating one) until the JSON is repaired by hand. Replaced with deploy.AtomicWriteFile and threaded ctx through writeSDSConfig. 2. No watcher pickup confirmation before returning success. Pre-fix, DeployCertificate returned the moment file writes completed. Envoy's SDS watcher is asynchronous; a caller running post-deploy TLS verify immediately after DeployCertificate could see Envoy still serving the old cert (watcher latency, load-balanced replica hit one that hadn't reloaded yet). Added the canonical post-deploy verify pattern (mirrors nginx.go::runPostDeployVerify L416): probe seam + retry/backoff + SHA-256 fingerprint compare against request.CertPEM. On verify failure, restore from per-file backups via the new restoreFromBackups helper. Envoy has no PostCommit reload to re-run; the watcher auto-reloads on the restored files. Config additions to envoy.Config (mirror nginx.Config L84-93): - PostDeployVerify *PostDeployVerifyConfig (Enabled, Endpoint, Timeout) - PostDeployVerifyAttempts int (default 3 in runPostDeployVerify) - PostDeployVerifyBackoff time.Duration (default 2s) - BackupRetention int (mirrors nginx; passed to AtomicWriteFile per file) Default behaviour unchanged for callers that don't set PostDeployVerify — verify is opt-in. nil or Enabled=false skips it entirely. 
Probe seam: c.probe = tlsprobe.ProbeTLS at construction; tests inject via the new SetTestProbe method. Same shape NGINX uses (nginx.go:130); also mirrors the existing Traefik SetTestProbe at traefik.go:62. WriteResult retention: every AtomicWriteFile call now retains its *deploy.WriteResult in a local []*deploy.WriteResult slice so the rollback path can restore from BackupPath across all four files (cert, key, chain, SDS JSON), not just the cert. Pre-fix the cert's WriteResult was discarded. restoreFromBackups (envoy.go new): iterates the WriteResults from a successful per-file pass, rewrites each non-idempotent destination from its BackupPath via AtomicWriteFile{SkipIdempotent:true, BackupRetention:-1}. The -1 prevents backup-of-the-backup pollution. For files that didn't exist pre-deploy (BackupPath == ""), restore = remove. Mirrors nginx.go::rollbackToBackups (L487-515) with the reload step elided. Idempotency gate: shouldRunVerify returns true unless EVERY WriteResult was Idempotent — same all-files semantics NGINX gets from res.SkippedAsIdempotent. Pre-fix Envoy had no verify at all, so there was no gate to get wrong; this introduces the correct all-files shape from the start. Tests added to envoy_atomic_test.go: - TestEnvoy_Atomic_SDSConfigWriteIsAtomic — pre-writes a sentinel SDS JSON, runs DeployCertificate, asserts a backup file with deploy.BackupSuffix appears alongside the new sds.json (proves AtomicWriteFile is now in the SDS path). - TestEnvoy_Atomic_WatcherPickupRetries — stub probe returns wrong fingerprint on attempts 1+2 and correct on attempt 3; deploy succeeds; probe called exactly 3 times. - TestEnvoy_Atomic_WatcherPickupAllAttemptsFail_RollsBack — pre-writes SENTINEL bytes for cert+key, stub probe always wrong; deploy returns wrapped error AND the destination files contain the sentinel bytes (rollback restored). 
- TestEnvoy_Atomic_PostDeployVerifyDisabledByDefault — Config with nil PostDeployVerify; asserts probe is never called (opt-in default preserved). A small certPEMFingerprint helper added to the test file mirrors the production envoy.certPEMToFingerprint (which is package-private — external tests can't call it). docs/deployment-atomicity.md L87 row already documents "TLS handshake | atomic-write replaces os.WriteFile" — pre-fix the claim was aspirational (verify happened in the agent verify-and-report path, not the connector; SDS JSON wasn't atomic). Post-fix the claim is honest. No doc change required. Verified locally: - gofmt -l ./internal/connector/target/envoy/ clean - go vet ./internal/connector/target/envoy/... clean - staticcheck ./internal/connector/target/envoy/... clean - go build ./... clean - go test -race -count=1 ./internal/connector/target/envoy/... green (5 pre-existing tests + 4 new = 9 total) - go test -short -count=1 ./internal/connector/target/... green Audit reference: cowork/deployment-target-audit-2026-05-02/RESULTS.md Bundle 3.
This commit is contained in:
@@ -2,7 +2,11 @@ package envoy
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/sha256"
|
||||
"encoding/base64"
|
||||
"encoding/hex"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"log/slog"
|
||||
"os"
|
||||
@@ -12,6 +16,7 @@ import (
|
||||
|
||||
"github.com/shankar0123/certctl/internal/connector/target"
|
||||
"github.com/shankar0123/certctl/internal/deploy"
|
||||
"github.com/shankar0123/certctl/internal/tlsprobe"
|
||||
)
|
||||
|
||||
// Config represents the Envoy deployment target configuration.
|
||||
@@ -24,6 +29,45 @@ type Config struct {
|
||||
KeyFilename string `json:"key_filename"` // Filename for private key (default: key.pem)
|
||||
ChainFilename string `json:"chain_filename"` // Optional filename for chain (if set, chain written separately)
|
||||
SDSConfig bool `json:"sds_config"` // If true, write an SDS discovery JSON file for file-based SDS
|
||||
|
||||
// Bundle 3 (deployment-target audit 2026-05-02): post-deploy TLS
|
||||
// verification. Defends against Envoy's SDS file watcher's natural
|
||||
// pickup latency — without this, DeployCertificate returned the
|
||||
// moment file writes completed and a caller running post-deploy
|
||||
// verify could see Envoy still serving the old cert (the watcher
|
||||
// hadn't reloaded yet, or a load-balanced request hit a replica that
|
||||
// hadn't reloaded yet). Same shape as nginx.go::PostDeployVerify.
|
||||
// Default behavior is opt-in: nil PostDeployVerify or
|
||||
// PostDeployVerify.Enabled=false skips the verify step entirely.
|
||||
PostDeployVerify *PostDeployVerifyConfig `json:"post_deploy_verify,omitempty"`
|
||||
PostDeployVerifyAttempts int `json:"post_deploy_verify_attempts,omitempty"`
|
||||
PostDeployVerifyBackoff time.Duration `json:"post_deploy_verify_backoff,omitempty"`
|
||||
|
||||
// Bundle 3: backup retention. Zero =
|
||||
// deploy.DefaultBackupRetention (3); -1 = disable backups. Mirrors
|
||||
// the per-Plan setting on file-write connectors that already use
|
||||
// deploy.Apply (nginx/apache/haproxy/postfix). Envoy uses
|
||||
// AtomicWriteFile per file so this gets passed via WriteOptions.
|
||||
BackupRetention int `json:"backup_retention,omitempty"`
|
||||
}
|
||||
|
||||
// PostDeployVerifyConfig holds the settings for the optional TLS handshake
// check that DeployCertificate performs after the files have been written.
//
// NOTE(review): Timeout is a time.Duration, which encoding/json encodes and
// decodes as an integer number of nanoseconds. If this struct is populated
// with plain encoding/json, operators must supply nanoseconds rather than a
// string such as "10s" — confirm against the config loader.
type PostDeployVerifyConfig struct {
	// Enabled switches the verify step on. When false the step is skipped
	// even if the struct pointer itself is non-nil.
	Enabled bool `json:"enabled"`

	// Endpoint is the host:port dialed for the TLS handshake. There is no
	// default: when it is empty the connector logs a warning and skips
	// the verify step instead of guessing an address.
	Endpoint string `json:"endpoint,omitempty"`

	// Timeout bounds a single probe attempt. Zero or negative values fall
	// back to 10 seconds.
	Timeout time.Duration `json:"timeout,omitempty"`
}
|
||||
|
||||
// SDSResource represents an Envoy SDS tls_certificate resource for file-based SDS.
|
||||
@@ -57,6 +101,11 @@ type DataSource struct {
|
||||
type Connector struct {
|
||||
config *Config
|
||||
logger *slog.Logger
|
||||
|
||||
// Bundle 3: probe seam for post-deploy TLS verify. Same shape NGINX
|
||||
// uses (nginx.go:130) — tlsprobe.ProbeTLS in production; tests
|
||||
// inject a stub via SetTestProbe.
|
||||
probe func(ctx context.Context, address string, timeout time.Duration) tlsprobe.ProbeResult
|
||||
}
|
||||
|
||||
// New creates a new Envoy target connector with the given configuration and logger.
|
||||
@@ -64,9 +113,18 @@ func New(config *Config, logger *slog.Logger) *Connector {
|
||||
return &Connector{
|
||||
config: config,
|
||||
logger: logger,
|
||||
probe: tlsprobe.ProbeTLS,
|
||||
}
|
||||
}
|
||||
|
||||
// SetTestProbe overrides the post-deploy TLS probe for tests. Production code
|
||||
// gets tlsprobe.ProbeTLS via New; tests inject a stub that returns canned
|
||||
// ProbeResults to exercise watcher-pickup retry/backoff paths without standing
|
||||
// up a real TLS server.
|
||||
func (c *Connector) SetTestProbe(fn func(ctx context.Context, address string, timeout time.Duration) tlsprobe.ProbeResult) {
|
||||
c.probe = fn
|
||||
}
|
||||
|
||||
// ValidateConfig checks that the certificate directory is configured and valid.
|
||||
func (c *Connector) ValidateConfig(ctx context.Context, rawConfig json.RawMessage) error {
|
||||
var cfg Config
|
||||
@@ -126,10 +184,20 @@ func (c *Connector) ValidateConfig(ctx context.Context, rawConfig json.RawMessag
|
||||
// and automatically picks up changes without requiring a reload command.
|
||||
//
|
||||
// Steps:
|
||||
// 1. Write certificate (+ chain if chain_filename not set) to cert_filename with mode 0644
|
||||
// 2. Write private key to key_filename with mode 0600
|
||||
// 3. If chain_filename set and chain provided, write chain separately with mode 0644
|
||||
// 4. If sds_config is true, write SDS JSON file pointing to cert/key paths
|
||||
// 1. Atomic-write certificate (+ chain if chain_filename not set) to
|
||||
// cert_filename with mode 0644.
|
||||
// 2. Atomic-write private key to key_filename with mode 0600.
|
||||
// 3. If chain_filename set and chain provided, atomic-write chain
|
||||
// separately with mode 0644.
|
||||
// 4. If sds_config is true, atomic-write SDS JSON file pointing to
|
||||
// cert/key paths (Bundle 3: previously os.WriteFile, now
|
||||
// deploy.AtomicWriteFile so the JSON itself is atomic — torn JSON
|
||||
// mid-write would make Envoy refuse to load any cert).
|
||||
// 5. If PostDeployVerify enabled, dial the configured TLS endpoint and
|
||||
// poll until the served leaf-cert SHA-256 matches the deployed
|
||||
// fingerprint, with retry/backoff to absorb watcher latency. On
|
||||
// mismatch after all attempts, restore from the WriteResults'
|
||||
// BackupPaths and return a wrapped error (Bundle 3).
|
||||
func (c *Connector) DeployCertificate(ctx context.Context, request target.DeploymentRequest) (*target.DeploymentResult, error) {
|
||||
c.logger.Info("deploying certificate to Envoy",
|
||||
"cert_dir", c.config.CertDir,
|
||||
@@ -148,11 +216,19 @@ func (c *Connector) DeployCertificate(ctx context.Context, request target.Deploy
|
||||
certData += request.ChainPEM + "\n"
|
||||
}
|
||||
|
||||
// Phase 7 (deploy-hardening I): atomic-write via
|
||||
// deploy.AtomicWriteFile so cert/key/chain swap atomically and
|
||||
// have backup files for rollback. Envoy's SDS file watcher
|
||||
// picks up the rename atomically — no torn config.
|
||||
if _, err := deploy.AtomicWriteFile(ctx, certPath, []byte(certData), deploy.WriteOptions{Mode: 0644}); err != nil {
|
||||
// Bundle 3 contract: track WriteResults for every atomic write so
|
||||
// the post-deploy-verify rollback path can restore from backups
|
||||
// across all four files (cert, key, chain, SDS JSON) — not just
|
||||
// the cert.
|
||||
results := make([]*deploy.WriteResult, 0, 4)
|
||||
|
||||
writeOpts := func(mode os.FileMode) deploy.WriteOptions {
|
||||
return deploy.WriteOptions{Mode: mode, BackupRetention: c.config.BackupRetention}
|
||||
}
|
||||
|
||||
// 1. Cert (+ inline chain if no separate chain filename).
|
||||
certRes, err := deploy.AtomicWriteFile(ctx, certPath, []byte(certData), writeOpts(0644))
|
||||
if err != nil {
|
||||
errMsg := fmt.Sprintf("failed to write certificate: %v", err)
|
||||
c.logger.Error("certificate deployment failed", "error", err)
|
||||
return &target.DeploymentResult{
|
||||
@@ -162,10 +238,12 @@ func (c *Connector) DeployCertificate(ctx context.Context, request target.Deploy
|
||||
DeployedAt: time.Now(),
|
||||
}, fmt.Errorf("%s", errMsg)
|
||||
}
|
||||
results = append(results, certRes)
|
||||
|
||||
// Write private key with secure permissions (0600: rw-------)
|
||||
// 2. Key (mode 0600 — private material).
|
||||
if request.KeyPEM != "" {
|
||||
if _, err := deploy.AtomicWriteFile(ctx, keyPath, []byte(request.KeyPEM), deploy.WriteOptions{Mode: 0600}); err != nil {
|
||||
keyRes, err := deploy.AtomicWriteFile(ctx, keyPath, []byte(request.KeyPEM), writeOpts(0600))
|
||||
if err != nil {
|
||||
errMsg := fmt.Sprintf("failed to write private key: %v", err)
|
||||
c.logger.Error("key deployment failed", "error", err)
|
||||
return &target.DeploymentResult{
|
||||
@@ -175,12 +253,14 @@ func (c *Connector) DeployCertificate(ctx context.Context, request target.Deploy
|
||||
DeployedAt: time.Now(),
|
||||
}, fmt.Errorf("%s", errMsg)
|
||||
}
|
||||
results = append(results, keyRes)
|
||||
}
|
||||
|
||||
// Write chain separately if chain_filename is configured
|
||||
// 3. Optional separate chain file.
|
||||
if c.config.ChainFilename != "" && request.ChainPEM != "" {
|
||||
chainPath := filepath.Join(c.config.CertDir, c.config.ChainFilename)
|
||||
if _, err := deploy.AtomicWriteFile(ctx, chainPath, []byte(request.ChainPEM+"\n"), deploy.WriteOptions{Mode: 0644}); err != nil {
|
||||
chainRes, err := deploy.AtomicWriteFile(ctx, chainPath, []byte(request.ChainPEM+"\n"), writeOpts(0644))
|
||||
if err != nil {
|
||||
errMsg := fmt.Sprintf("failed to write chain: %v", err)
|
||||
c.logger.Error("chain deployment failed", "error", err)
|
||||
return &target.DeploymentResult{
|
||||
@@ -190,11 +270,13 @@ func (c *Connector) DeployCertificate(ctx context.Context, request target.Deploy
|
||||
DeployedAt: time.Now(),
|
||||
}, fmt.Errorf("%s", errMsg)
|
||||
}
|
||||
results = append(results, chainRes)
|
||||
}
|
||||
|
||||
// Write SDS JSON file if configured
|
||||
// 4. SDS JSON (Bundle 3: was os.WriteFile, now atomic).
|
||||
if c.config.SDSConfig {
|
||||
if err := c.writeSDSConfig(); err != nil {
|
||||
sdsRes, err := c.writeSDSConfig(ctx)
|
||||
if err != nil {
|
||||
errMsg := fmt.Sprintf("failed to write SDS config: %v", err)
|
||||
c.logger.Error("SDS config deployment failed", "error", err)
|
||||
return &target.DeploymentResult{
|
||||
@@ -204,19 +286,50 @@ func (c *Connector) DeployCertificate(ctx context.Context, request target.Deploy
|
||||
DeployedAt: time.Now(),
|
||||
}, fmt.Errorf("%s", errMsg)
|
||||
}
|
||||
results = append(results, sdsRes)
|
||||
}
|
||||
|
||||
// 5. Post-deploy TLS verify (Bundle 3). Skip when all four files
|
||||
// were idempotent (no actual change to verify) — same gate NGINX
|
||||
// uses on res.SkippedAsIdempotent.
|
||||
if c.shouldRunVerify(results) {
|
||||
if vErr := c.runPostDeployVerify(ctx, request.CertPEM); vErr != nil {
|
||||
c.logger.Error("post-deploy TLS verify failed; rolling back", "error", vErr)
|
||||
rbErr := c.restoreFromBackups(ctx, results)
|
||||
if rbErr != nil {
|
||||
return c.failureResult(certPath, "post-deploy verify + rollback both failed",
|
||||
fmt.Errorf("verify: %w; rollback: %v", vErr, rbErr), startTime), rbErr
|
||||
}
|
||||
return c.failureResult(certPath, "post-deploy verify failed; rolled back",
|
||||
vErr, startTime), vErr
|
||||
}
|
||||
}
|
||||
|
||||
deploymentDuration := time.Since(startTime)
|
||||
allIdempotent := true
|
||||
for _, r := range results {
|
||||
if !r.Idempotent {
|
||||
allIdempotent = false
|
||||
break
|
||||
}
|
||||
}
|
||||
idemNote := ""
|
||||
if allIdempotent {
|
||||
idemNote = " (idempotent skip — all bytes unchanged)"
|
||||
}
|
||||
|
||||
c.logger.Info("certificate deployed to Envoy successfully",
|
||||
"duration", deploymentDuration.String(),
|
||||
"cert_path", certPath,
|
||||
"key_path", keyPath,
|
||||
"sds_config", c.config.SDSConfig)
|
||||
"sds_config", c.config.SDSConfig,
|
||||
"idempotent", allIdempotent)
|
||||
|
||||
metadata := map[string]string{
|
||||
"cert_path": certPath,
|
||||
"key_path": keyPath,
|
||||
"duration_ms": fmt.Sprintf("%d", deploymentDuration.Milliseconds()),
|
||||
"idempotent": fmt.Sprintf("%t", allIdempotent),
|
||||
}
|
||||
if c.config.SDSConfig {
|
||||
metadata["sds_config_path"] = filepath.Join(c.config.CertDir, "sds.json")
|
||||
@@ -226,15 +339,30 @@ func (c *Connector) DeployCertificate(ctx context.Context, request target.Deploy
|
||||
Success: true,
|
||||
TargetAddress: certPath,
|
||||
DeploymentID: fmt.Sprintf("envoy-%d", time.Now().Unix()),
|
||||
Message: "Certificate deployed to Envoy (file-based SDS will auto-reload)",
|
||||
Message: "Certificate deployed to Envoy (file-based SDS will auto-reload)" + idemNote,
|
||||
DeployedAt: time.Now(),
|
||||
Metadata: metadata,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// writeSDSConfig writes an Envoy SDS JSON file that references the cert/key file paths.
|
||||
// This file is consumed by Envoy's file-based SDS provider (path_config_source).
|
||||
func (c *Connector) writeSDSConfig() error {
|
||||
// shouldRunVerify reports whether the post-deploy verify step should fire.
|
||||
// Returns false when every WriteResult was idempotent (nothing actually
|
||||
// changed; the operator's prior deploy already succeeded), mirroring
|
||||
// NGINX's res.SkippedAsIdempotent gate.
|
||||
func (c *Connector) shouldRunVerify(results []*deploy.WriteResult) bool {
|
||||
for _, r := range results {
|
||||
if !r.Idempotent {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// writeSDSConfig writes an Envoy SDS JSON file that references the cert/key
|
||||
// file paths. The write goes through deploy.AtomicWriteFile (Bundle 3) so
|
||||
// power loss / OOM mid-write cannot leave a torn JSON file — Envoy's SDS
|
||||
// watcher refuses to load any cert against a malformed JSON.
|
||||
func (c *Connector) writeSDSConfig(ctx context.Context) (*deploy.WriteResult, error) {
|
||||
certPath := filepath.Join(c.config.CertDir, c.config.CertFilename)
|
||||
keyPath := filepath.Join(c.config.CertDir, c.config.KeyFilename)
|
||||
|
||||
@@ -253,18 +381,184 @@ func (c *Connector) writeSDSConfig() error {
|
||||
|
||||
sdsJSON, err := json.MarshalIndent(sdsResource, "", " ")
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to marshal SDS config: %w", err)
|
||||
return nil, fmt.Errorf("failed to marshal SDS config: %w", err)
|
||||
}
|
||||
|
||||
sdsPath := filepath.Join(c.config.CertDir, "sds.json")
|
||||
if err := os.WriteFile(sdsPath, sdsJSON, 0644); err != nil {
|
||||
return fmt.Errorf("failed to write SDS config file: %w", err)
|
||||
res, err := deploy.AtomicWriteFile(ctx, sdsPath, sdsJSON, deploy.WriteOptions{
|
||||
Mode: 0644,
|
||||
BackupRetention: c.config.BackupRetention,
|
||||
})
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to write SDS config file: %w", err)
|
||||
}
|
||||
|
||||
c.logger.Info("SDS config file written", "path", sdsPath)
|
||||
return res, nil
|
||||
}
|
||||
|
||||
// runPostDeployVerify dials the configured endpoint, performs a TLS handshake,
|
||||
// and asserts the leaf cert's SHA-256 matches the SHA-256 of the bytes we just
|
||||
// deployed. Retries with backoff per PostDeployVerifyAttempts to absorb the
|
||||
// natural latency between SDS file write and Envoy's watcher picking up the
|
||||
// change.
|
||||
//
|
||||
// Returns nil on match; returns a wrapped error on any failure mode (mismatch
|
||||
// after all attempts, dial timeout, handshake failure, DNS resolution failure).
|
||||
// The caller decides whether to roll back. Same shape as nginx.go:416.
|
||||
//
|
||||
// Bundle 3 of the 2026-05-02 deployment-target audit.
|
||||
func (c *Connector) runPostDeployVerify(ctx context.Context, deployedCertPEM string) error {
|
||||
verify := c.config.PostDeployVerify
|
||||
if verify == nil || !verify.Enabled {
|
||||
return nil
|
||||
}
|
||||
|
||||
endpoint := verify.Endpoint
|
||||
timeout := verify.Timeout
|
||||
if timeout <= 0 {
|
||||
timeout = 10 * time.Second
|
||||
}
|
||||
if endpoint == "" {
|
||||
c.logger.Warn("post-deploy verify enabled but no endpoint configured; skipping",
|
||||
"hint", "set Config.PostDeployVerify.Endpoint = host:port")
|
||||
return nil
|
||||
}
|
||||
|
||||
want, err := certPEMToFingerprint(deployedCertPEM)
|
||||
if err != nil {
|
||||
return fmt.Errorf("compute deployed cert fingerprint: %w", err)
|
||||
}
|
||||
want = strings.ToLower(want)
|
||||
|
||||
attempts := c.config.PostDeployVerifyAttempts
|
||||
if attempts <= 0 {
|
||||
attempts = 3
|
||||
}
|
||||
backoff := c.config.PostDeployVerifyBackoff
|
||||
if backoff <= 0 {
|
||||
backoff = 2 * time.Second
|
||||
}
|
||||
|
||||
var lastErr error
|
||||
for i := 0; i < attempts; i++ {
|
||||
if i > 0 {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return ctx.Err()
|
||||
case <-time.After(backoff):
|
||||
}
|
||||
}
|
||||
res := c.probe(ctx, endpoint, timeout)
|
||||
if !res.Success {
|
||||
lastErr = fmt.Errorf("TLS probe failed: %s", res.Error)
|
||||
continue
|
||||
}
|
||||
got := strings.ToLower(res.Fingerprint)
|
||||
if got == want {
|
||||
c.logger.Info("post-deploy TLS verify succeeded",
|
||||
"endpoint", endpoint,
|
||||
"fingerprint", got,
|
||||
"attempt", i+1)
|
||||
return nil
|
||||
}
|
||||
lastErr = fmt.Errorf("post-deploy TLS verify SHA-256 mismatch: got %s, want %s", got, want)
|
||||
}
|
||||
return lastErr
|
||||
}
|
||||
|
||||
// restoreFromBackups iterates the WriteResults from a successful per-file
|
||||
// AtomicWriteFile pass and rewrites each destination from its BackupPath. Used
|
||||
// when post-deploy TLS verify fails — the writes already succeeded, so we undo
|
||||
// them by rewriting the backup bytes via AtomicWriteFile{SkipIdempotent:true,
|
||||
// BackupRetention:-1}.
|
||||
//
|
||||
// Traefik has no PostCommit reload to retry — Envoy's SDS file watcher will
|
||||
// pick up the restored bytes naturally on its next tick. The verify retry/
|
||||
// backoff in this same DeployCertificate call would have absorbed that watcher
|
||||
// cycle; on rollback we trust the watcher and return.
|
||||
//
|
||||
// Mirrors nginx.go::rollbackToBackups (L487-515) with the reload step elided.
|
||||
//
|
||||
// Bundle 3 of the 2026-05-02 deployment-target audit.
|
||||
func (c *Connector) restoreFromBackups(ctx context.Context, results []*deploy.WriteResult) error {
|
||||
for _, r := range results {
|
||||
if r == nil || r.Idempotent {
|
||||
// Idempotent writes did not modify the destination, so
|
||||
// there is nothing to restore.
|
||||
continue
|
||||
}
|
||||
if r.BackupPath == "" {
|
||||
// File did not exist before this deploy — restore = remove.
|
||||
if err := os.Remove(r.Path); err != nil && !errors.Is(err, os.ErrNotExist) {
|
||||
return fmt.Errorf("rollback remove %s: %w", r.Path, err)
|
||||
}
|
||||
continue
|
||||
}
|
||||
bytes, err := os.ReadFile(r.BackupPath)
|
||||
if err != nil {
|
||||
return fmt.Errorf("rollback read backup %s: %w", r.BackupPath, err)
|
||||
}
|
||||
if _, err := deploy.AtomicWriteFile(ctx, r.Path, bytes, deploy.WriteOptions{
|
||||
SkipIdempotent: true,
|
||||
BackupRetention: -1, // don't backup the rollback (no chain explosion)
|
||||
}); err != nil {
|
||||
return fmt.Errorf("rollback write %s: %w", r.Path, err)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// failureResult builds a target.DeploymentResult for the various error paths.
|
||||
// Centralized so the field set stays consistent. Same shape as nginx.go:519.
|
||||
func (c *Connector) failureResult(addr, stage string, err error, startTime time.Time) *target.DeploymentResult {
|
||||
return &target.DeploymentResult{
|
||||
Success: false,
|
||||
TargetAddress: addr,
|
||||
Message: fmt.Sprintf("%s: %v", stage, err),
|
||||
DeployedAt: time.Now(),
|
||||
Metadata: map[string]string{
|
||||
"stage": stage,
|
||||
"duration_ms": fmt.Sprintf("%d", time.Since(startTime).Milliseconds()),
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
// certPEMToFingerprint extracts the SHA-256 hex fingerprint of the first
|
||||
// certificate block in a PEM bundle. Mirrors nginx.go's helper of the same
|
||||
// name (and tlsprobe.CertFingerprint's output format) so equality compare
|
||||
// works against the probe's served fingerprint.
|
||||
func certPEMToFingerprint(pemBytes string) (string, error) {
|
||||
der, err := firstPEMBlock(pemBytes, "CERTIFICATE")
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
h := sha256.Sum256(der)
|
||||
return hex.EncodeToString(h[:]), nil
|
||||
}
|
||||
|
||||
// firstPEMBlock returns the base64-decoded body of the first PEM block of the
// requested type (for example "CERTIFICATE") found in pemBytes.
//
// The block is located by its "-----BEGIN <type>-----" marker and the first
// matching "-----END <type>-----" marker after it. All whitespace inside the
// body — newlines, carriage returns, spaces and tabs — is discarded before
// decoding, so indented or CRLF-terminated PEM text is accepted.
//
// Errors: no BEGIN marker for blockType, no END marker after it, or a body
// that is not valid standard base64. On error the returned slice is nil.
func firstPEMBlock(pemBytes, blockType string) ([]byte, error) {
	begin := "-----BEGIN " + blockType + "-----"
	end := "-----END " + blockType + "-----"

	beginIdx := strings.Index(pemBytes, begin)
	if beginIdx < 0 {
		return nil, fmt.Errorf("no %s PEM block found", blockType)
	}
	rest := pemBytes[beginIdx+len(begin):]
	endIdx := strings.Index(rest, end)
	if endIdx < 0 {
		return nil, fmt.Errorf("PEM block not terminated")
	}

	// strings.Fields splits on every run of whitespace, so joining the
	// pieces removes line breaks and any indentation in one pass.
	body := strings.Join(strings.Fields(rest[:endIdx]), "")
	der, err := base64.StdEncoding.DecodeString(body)
	if err != nil {
		return nil, fmt.Errorf("decode %s PEM body: %w", blockType, err)
	}
	return der, nil
}
|
||||
|
||||
// ValidateDeployment verifies that the deployed certificate files are readable.
|
||||
// It checks that both the certificate and key files exist and are accessible.
|
||||
func (c *Connector) ValidateDeployment(ctx context.Context, request target.ValidationRequest) (*target.ValidationResult, error) {
|
||||
|
||||
@@ -2,16 +2,22 @@ package envoy_test
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/sha256"
|
||||
"encoding/base64"
|
||||
"encoding/hex"
|
||||
"errors"
|
||||
"log/slog"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"sync/atomic"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/shankar0123/certctl/internal/connector/target"
|
||||
"github.com/shankar0123/certctl/internal/connector/target/envoy"
|
||||
"github.com/shankar0123/certctl/internal/deploy"
|
||||
"github.com/shankar0123/certctl/internal/tlsprobe"
|
||||
)
|
||||
|
||||
// Phase 7 of the deploy-hardening I master bundle: atomic-write
|
||||
@@ -93,3 +99,235 @@ func TestEnvoy_ValidateOnly_Sentinel(t *testing.T) {
|
||||
t.Errorf("got %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Bundle 3 (deployment-target audit 2026-05-02): SDS atomicity + post-deploy
|
||||
// watcher pickup confirmation.
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// certPEMFingerprint is the test-side twin of the package-private
// envoy.certPEMToFingerprint, which external tests cannot call. It returns
// the lowercase hex SHA-256 of the decoded bytes of the first CERTIFICATE
// block in pemBytes and fails the test immediately if no such block can be
// found or decoded.
//
// All whitespace in the block body is dropped before decoding (newlines,
// carriage returns, spaces and tabs) so this helper accepts at least every
// input the production helper accepts; otherwise an indented fixture would
// fail here while production fingerprinted it successfully.
func certPEMFingerprint(t *testing.T, pemBytes string) string {
	t.Helper()
	const begin = "-----BEGIN CERTIFICATE-----"
	const end = "-----END CERTIFICATE-----"

	bi := strings.Index(pemBytes, begin)
	if bi < 0 {
		t.Fatalf("no CERTIFICATE block in PEM")
	}
	rest := pemBytes[bi+len(begin):]
	ei := strings.Index(rest, end)
	if ei < 0 {
		t.Fatalf("no END CERTIFICATE in PEM")
	}

	body := strings.Join(strings.Fields(rest[:ei]), "")
	der, err := base64.StdEncoding.DecodeString(body)
	if err != nil {
		t.Fatalf("base64: %v", err)
	}
	h := sha256.Sum256(der)
	return hex.EncodeToString(h[:])
}
|
||||
|
||||
// TestEnvoy_Atomic_SDSConfigWriteIsAtomic pins the wiring change at envoy.go's
|
||||
// writeSDSConfig — pre-Bundle-3 the SDS JSON went through os.WriteFile (no
|
||||
// backup, torn-write hazard). Post-fix it goes through deploy.AtomicWriteFile,
|
||||
// which produces a sibling backup with deploy.BackupSuffix when an existing
|
||||
// SDS JSON is replaced.
|
||||
func TestEnvoy_Atomic_SDSConfigWriteIsAtomic(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
sdsPath := filepath.Join(dir, "sds.json")
|
||||
// Pre-write a sentinel SDS JSON so the connector's write produces
|
||||
// a backup we can assert on.
|
||||
if err := os.WriteFile(sdsPath, []byte(`{"resources":[{"name":"old"}]}`), 0644); err != nil {
|
||||
t.Fatalf("seed sds: %v", err)
|
||||
}
|
||||
cfg := envoy.Config{
|
||||
CertDir: dir,
|
||||
CertFilename: "cert.pem",
|
||||
KeyFilename: "key.pem",
|
||||
SDSConfig: true,
|
||||
}
|
||||
c := envoy.New(&cfg, newTestLogger())
|
||||
res, err := c.DeployCertificate(context.Background(), target.DeploymentRequest{
|
||||
CertPEM: certA,
|
||||
KeyPEM: keyA,
|
||||
})
|
||||
if err != nil || !res.Success {
|
||||
t.Fatalf("deploy: err=%v success=%v", err, res != nil && res.Success)
|
||||
}
|
||||
|
||||
// SDS JSON should be the new bytes (i.e. NOT match the sentinel).
|
||||
got, err := os.ReadFile(sdsPath)
|
||||
if err != nil {
|
||||
t.Fatalf("read sds: %v", err)
|
||||
}
|
||||
if strings.Contains(string(got), `"old"`) {
|
||||
t.Errorf("SDS JSON not replaced; still contains sentinel")
|
||||
}
|
||||
if !strings.Contains(string(got), "server_cert") {
|
||||
t.Errorf("SDS JSON missing expected resource name; got %s", string(got))
|
||||
}
|
||||
|
||||
// AtomicWriteFile produces a backup file with deploy.BackupSuffix
|
||||
// when replacing an existing destination. Pre-Bundle-3 (os.WriteFile
|
||||
// path) no backup would exist for sds.json.
|
||||
entries, _ := os.ReadDir(dir)
|
||||
foundBak := false
|
||||
for _, e := range entries {
|
||||
if strings.HasPrefix(e.Name(), "sds.json"+deploy.BackupSuffix) {
|
||||
foundBak = true
|
||||
}
|
||||
}
|
||||
if !foundBak {
|
||||
t.Errorf("no SDS JSON backup created — atomic-write wiring missing? entries=%v", entryNames(entries))
|
||||
}
|
||||
}
|
||||
|
||||
// TestEnvoy_Atomic_WatcherPickupRetries pins the retry/backoff loop in the
|
||||
// post-deploy verify path. Stub the probe so attempts 1+2 return the wrong
|
||||
// fingerprint and attempt 3 returns the correct one — DeployCertificate must
|
||||
// succeed and the probe must have been called exactly 3 times.
|
||||
func TestEnvoy_Atomic_WatcherPickupRetries(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
cfg := envoy.Config{
|
||||
CertDir: dir,
|
||||
CertFilename: "cert.pem",
|
||||
KeyFilename: "key.pem",
|
||||
PostDeployVerify: &envoy.PostDeployVerifyConfig{
|
||||
Enabled: true,
|
||||
Endpoint: "envoy.test.invalid:443",
|
||||
Timeout: 100 * time.Millisecond,
|
||||
},
|
||||
PostDeployVerifyAttempts: 3,
|
||||
PostDeployVerifyBackoff: time.Millisecond, // tight loop for tests
|
||||
}
|
||||
c := envoy.New(&cfg, newTestLogger())
|
||||
|
||||
want := certPEMFingerprint(t, certA)
|
||||
var calls atomic.Int64
|
||||
c.SetTestProbe(func(ctx context.Context, address string, timeout time.Duration) tlsprobe.ProbeResult {
|
||||
n := calls.Add(1)
|
||||
if n < 3 {
|
||||
return tlsprobe.ProbeResult{Success: true, Fingerprint: "deadbeef"}
|
||||
}
|
||||
return tlsprobe.ProbeResult{Success: true, Fingerprint: want}
|
||||
})
|
||||
|
||||
res, err := c.DeployCertificate(context.Background(), target.DeploymentRequest{
|
||||
CertPEM: certA,
|
||||
KeyPEM: keyA,
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("deploy returned error after retries should have succeeded: %v", err)
|
||||
}
|
||||
if !res.Success {
|
||||
t.Fatalf("deploy.Success=false; message=%s", res.Message)
|
||||
}
|
||||
if got := calls.Load(); got != 3 {
|
||||
t.Errorf("probe called %d times, want 3", got)
|
||||
}
|
||||
}
|
||||
|
||||
// TestEnvoy_Atomic_WatcherPickupAllAttemptsFail_RollsBack pins the verify-
|
||||
// failure rollback path. Pre-write sentinel cert + key; stub probe to always
|
||||
// return the wrong fingerprint; assert DeployCertificate returns a wrapped
|
||||
// error AND the destination files contain the sentinel bytes (restored from
|
||||
// backups).
|
||||
func TestEnvoy_Atomic_WatcherPickupAllAttemptsFail_RollsBack(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
certPath := filepath.Join(dir, "cert.pem")
|
||||
keyPath := filepath.Join(dir, "key.pem")
|
||||
sentCert := []byte("SENTINEL-CERT-BYTES")
|
||||
sentKey := []byte("SENTINEL-KEY-BYTES")
|
||||
if err := os.WriteFile(certPath, sentCert, 0644); err != nil {
|
||||
t.Fatalf("seed cert: %v", err)
|
||||
}
|
||||
if err := os.WriteFile(keyPath, sentKey, 0600); err != nil {
|
||||
t.Fatalf("seed key: %v", err)
|
||||
}
|
||||
|
||||
cfg := envoy.Config{
|
||||
CertDir: dir,
|
||||
CertFilename: "cert.pem",
|
||||
KeyFilename: "key.pem",
|
||||
PostDeployVerify: &envoy.PostDeployVerifyConfig{
|
||||
Enabled: true,
|
||||
Endpoint: "envoy.test.invalid:443",
|
||||
Timeout: 100 * time.Millisecond,
|
||||
},
|
||||
PostDeployVerifyAttempts: 2,
|
||||
PostDeployVerifyBackoff: time.Millisecond,
|
||||
}
|
||||
c := envoy.New(&cfg, newTestLogger())
|
||||
c.SetTestProbe(func(ctx context.Context, address string, timeout time.Duration) tlsprobe.ProbeResult {
|
||||
return tlsprobe.ProbeResult{Success: true, Fingerprint: "deadbeef"}
|
||||
})
|
||||
|
||||
res, err := c.DeployCertificate(context.Background(), target.DeploymentRequest{
|
||||
CertPEM: certA,
|
||||
KeyPEM: keyA,
|
||||
})
|
||||
if err == nil {
|
||||
t.Fatalf("expected verify-mismatch error, got nil; res=%+v", res)
|
||||
}
|
||||
if res.Success {
|
||||
t.Errorf("expected Success=false on verify failure")
|
||||
}
|
||||
if !strings.Contains(strings.ToLower(res.Message), "verify") {
|
||||
t.Errorf("expected message to mention verify; got %q", res.Message)
|
||||
}
|
||||
|
||||
// Both files must be restored to sentinel bytes.
|
||||
gotCert, _ := os.ReadFile(certPath)
|
||||
if string(gotCert) != string(sentCert) {
|
||||
t.Errorf("cert not restored on rollback; got %q want %q", string(gotCert), string(sentCert))
|
||||
}
|
||||
gotKey, _ := os.ReadFile(keyPath)
|
||||
if string(gotKey) != string(sentKey) {
|
||||
t.Errorf("key not restored on rollback; got %q want %q", string(gotKey), string(sentKey))
|
||||
}
|
||||
}
|
||||
|
||||
// TestEnvoy_Atomic_PostDeployVerifyDisabledByDefault pins the opt-in default.
|
||||
// A Config with no PostDeployVerify set must NOT call the probe — preserving
|
||||
// pre-Bundle-3 behaviour for callers that don't opt in.
|
||||
func TestEnvoy_Atomic_PostDeployVerifyDisabledByDefault(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
cfg := envoy.Config{
|
||||
CertDir: dir,
|
||||
CertFilename: "cert.pem",
|
||||
KeyFilename: "key.pem",
|
||||
// PostDeployVerify intentionally nil.
|
||||
}
|
||||
c := envoy.New(&cfg, newTestLogger())
|
||||
var calls atomic.Int64
|
||||
c.SetTestProbe(func(ctx context.Context, address string, timeout time.Duration) tlsprobe.ProbeResult {
|
||||
calls.Add(1)
|
||||
return tlsprobe.ProbeResult{Success: false, Error: "probe should not be called"}
|
||||
})
|
||||
|
||||
res, err := c.DeployCertificate(context.Background(), target.DeploymentRequest{
|
||||
CertPEM: certA,
|
||||
KeyPEM: keyA,
|
||||
})
|
||||
if err != nil || !res.Success {
|
||||
t.Fatalf("deploy: err=%v success=%v", err, res != nil && res.Success)
|
||||
}
|
||||
if got := calls.Load(); got != 0 {
|
||||
t.Errorf("probe called %d times when PostDeployVerify is nil; want 0", got)
|
||||
}
|
||||
}
|
||||
|
||||
// entryNames returns the Name() of every directory entry, in input order.
// It exists so test failure messages can print a readable directory listing.
func entryNames(entries []os.DirEntry) []string {
	out := make([]string, len(entries))
	for i := range entries {
		out[i] = entries[i].Name()
	}
	return out
}
|
||||
|
||||
Reference in New Issue
Block a user