// Package nginx implements the NGINX target connector. As of the // deploy-hardening I master bundle Phase 4 (the canonical // implementation that Phases 5-9 model on), NGINX is the first // connector to: // // - Atomic-write its files via internal/deploy.Apply (all-or-nothing // across cert + chain + key; rollback on PostCommit failure). // - Run `nginx -t -c ` as PreCommit so the validate step runs // against the freshly-staged config, not the live one. // - Run `nginx -s reload` as PostCommit; on reload failure, restore // pre-deploy backups + reload again. If the second reload also // fails, surface ErrRollbackFailed. // - Run a post-deploy TLS handshake against the configured endpoint // and compare the handshake leaf-cert SHA-256 against the bytes // just deployed. Mismatch (wrong vhost, NGINX still serving cached // cert) → trigger rollback + emit operator alert. // - Implement ValidateOnly so operators can preview a deploy without // touching the live cert (`nginx -t` against the temp file). // - Preserve existing file ownership + mode unless the per-target // config overrides; use sensible defaults (nginx:nginx 0640 for // keys, nginx:nginx 0644 for certs) when the destination doesn't // yet exist. package nginx import ( "context" "crypto/sha256" "encoding/hex" "encoding/json" "errors" "fmt" "log/slog" "os" "os/exec" "path/filepath" "strings" "time" "github.com/certctl-io/certctl/internal/connector/target" "github.com/certctl-io/certctl/internal/deploy" "github.com/certctl-io/certctl/internal/tlsprobe" "github.com/certctl-io/certctl/internal/validation" ) // Config represents the NGINX deployment target configuration. // This configuration is used on the agent side to deploy // certificates to NGINX. // // Fields added in deploy-hardening I Phase 4: // // - CertFileMode / KeyFileMode / ChainFileMode: explicit override // for the on-disk file mode. Zero = preserve existing or fall // back to per-type default (0640 for keys, 0644 for certs/chain). // - KeyFileOwner / KeyFileGroup / CertFileOwner / CertFileGroup / // ChainFileOwner / ChainFileGroup: explicit chown overrides. // Empty = preserve existing or fall back to nginx:nginx for new // files. // - PostDeployVerify: non-nil to enable post-deploy TLS handshake // verification. When nil, frozen-decision-0.3 default applies: // verify is ON, dialing the host parsed from CertPath's vhost // (operators can opt out by setting Enabled=false). // - PostDeployVerifyAttempts / PostDeployVerifyBackoff: retry // control for verify against load-balanced targets where the // handshake might hit a different pod that hasn't picked up the // new cert yet. type Config struct { CertPath string `json:"cert_path"` KeyPath string `json:"key_path,omitempty"` ChainPath string `json:"chain_path,omitempty"` ReloadCommand string `json:"reload_command"` ValidateCommand string `json:"validate_command"` // Phase 4 (deploy-hardening I): file ownership + mode overrides. CertFileMode os.FileMode `json:"cert_file_mode,omitempty"` ChainFileMode os.FileMode `json:"chain_file_mode,omitempty"` KeyFileMode os.FileMode `json:"key_file_mode,omitempty"` CertFileOwner string `json:"cert_file_owner,omitempty"` CertFileGroup string `json:"cert_file_group,omitempty"` ChainFileOwner string `json:"chain_file_owner,omitempty"` ChainFileGroup string `json:"chain_file_group,omitempty"` KeyFileOwner string `json:"key_file_owner,omitempty"` KeyFileGroup string `json:"key_file_group,omitempty"` // Phase 4 (deploy-hardening I): post-deploy TLS verification. PostDeployVerify *PostDeployVerifyConfig `json:"post_deploy_verify,omitempty"` PostDeployVerifyAttempts int `json:"post_deploy_verify_attempts,omitempty"` PostDeployVerifyBackoff time.Duration `json:"post_deploy_verify_backoff,omitempty"` PostDeployVerifyMaxBackoff time.Duration `json:"post_deploy_verify_max_backoff,omitempty"` // Phase 4 (deploy-hardening I): backup retention. Zero = // deploy.DefaultBackupRetention (3); -1 = disable backups (no // rollback possible — documented loud in // docs/deployment-atomicity.md). BackupRetention int `json:"backup_retention,omitempty"` } // PostDeployVerifyConfig controls the post-deploy TLS handshake // verification step. type PostDeployVerifyConfig struct { // Enabled defaults to true (frozen decision 0.3). Set to false // to opt out per-target — typically for K8s or other targets // where the cert is mounted-not-served. Enabled bool `json:"enabled"` // Endpoint is the host:port to dial for the TLS handshake. // When empty, the connector derives a sensible default // (NGINX → first parsed `server_name` in the config OR // localhost:443 if not parseable). Endpoint string `json:"endpoint,omitempty"` // Timeout for the TLS handshake. Zero defaults to 10s. Timeout time.Duration `json:"timeout,omitempty"` } // Connector implements the target.Connector interface for NGINX // servers. This connector runs on the AGENT side and handles local // certificate deployment. type Connector struct { config *Config logger *slog.Logger // Test seams (deploy-hardening I Phase 4): swap these out in // tests so we don't need a real `nginx -t` binary on PATH. // runValidate is the validate-with-the-target step; runReload // is the reload step; probe is the post-deploy TLS handshake. // All three default to wrappers around os/exec / tlsprobe at // construction time; tests overwrite via the New*WithExec // constructor or the SetTest* hooks below. runValidate func(ctx context.Context, command string) ([]byte, error) runReload func(ctx context.Context, command string) ([]byte, error) probe func(ctx context.Context, address string, timeout time.Duration) tlsprobe.ProbeResult } // New creates a new NGINX target connector with the given // configuration and logger. Validates that essential commands are // shell-injection safe at construction time. func New(config *Config, logger *slog.Logger) *Connector { c := &Connector{ config: config, logger: logger, } c.runValidate = defaultRunCommand c.runReload = defaultRunCommand c.probe = tlsprobe.ProbeTLS return c } // defaultRunCommand wraps exec.CommandContext for the production // path. Tests override this via the test-seam fields. The shell // invocation goes through `sh -c` to support the operator's // existing config patterns (e.g. "systemctl reload nginx", // "nginx -t -c /etc/nginx/nginx.conf"). func defaultRunCommand(ctx context.Context, command string) ([]byte, error) { cmd := exec.CommandContext(ctx, "sh", "-c", command) return cmd.CombinedOutput() } // ValidateConfig checks that all required configuration paths and // commands are valid. It verifies that the certificate and key // paths are writable and commands are executable. func (c *Connector) ValidateConfig(ctx context.Context, rawConfig json.RawMessage) error { var cfg Config if err := json.Unmarshal(rawConfig, &cfg); err != nil { return fmt.Errorf("invalid NGINX config: %w", err) } if cfg.CertPath == "" { return fmt.Errorf("NGINX cert_path is required") } if cfg.ReloadCommand == "" || cfg.ValidateCommand == "" { return fmt.Errorf("NGINX reload_command and validate_command are required") } if err := validation.ValidateShellCommand(cfg.ReloadCommand); err != nil { return fmt.Errorf("invalid reload_command: %w", err) } if err := validation.ValidateShellCommand(cfg.ValidateCommand); err != nil { return fmt.Errorf("invalid validate_command: %w", err) } c.logger.Info("validating NGINX configuration", "cert_path", cfg.CertPath, "chain_path", cfg.ChainPath) certDir := filepath.Dir(cfg.CertPath) if _, err := os.Stat(certDir); os.IsNotExist(err) { return fmt.Errorf("NGINX cert directory does not exist: %s", certDir) } c.config = &cfg c.logger.Info("NGINX configuration validated") return nil } // DeployCertificate writes the certificate, chain, and (optionally) // private key to the configured paths atomically as one Plan, runs // `nginx -t` as PreCommit, runs the reload command as PostCommit, // then performs a post-deploy TLS handshake to confirm the new // cert is being served. On any failure, the rollback wires in // internal/deploy restore the previous bytes. // // Phase 4 of the deploy-hardening I master bundle: this is the // canonical implementation that Phases 5-9 mirror for every other // connector. func (c *Connector) DeployCertificate(ctx context.Context, request target.DeploymentRequest) (*target.DeploymentResult, error) { c.logger.Info("deploying certificate to NGINX", "cert_path", c.config.CertPath, "chain_path", c.config.ChainPath) startTime := time.Now() plan := c.buildPlan(request) // Wire PreCommit + PostCommit so deploy.Apply runs validate + // reload + rollback. Verify happens AFTER PostCommit (Apply // returns; we then dial; on mismatch we manually trigger a // rollback by issuing a second deploy.Apply with the backup // bytes — Apply itself doesn't know about TLS). plan.PreCommit = func(pcCtx context.Context, tempPaths map[string]string) error { // `nginx -t` validates the live config. If the operator's // validate command is `nginx -t` (the typical case), it // reads /etc/nginx/nginx.conf which references the cert // path — which still has the OLD cert at this point (the // rename hasn't happened yet). To validate against the // NEW cert bytes, NGINX would need to be told to use a // temp config file pointing at the temp cert paths. // // V2 ships the simpler model: run `nginx -t` as a // syntax-only sanity check. The post-deploy TLS verify // (after rename + reload) is the load-bearing check that // catches "wrong cert deployed". V3-Pro can extend this // with full pre-deploy temp-config validate. out, err := c.runValidate(pcCtx, c.config.ValidateCommand) if err != nil { return fmt.Errorf("nginx -t failed: %w (output: %s)", err, string(out)) } return nil } plan.PostCommit = func(pcCtx context.Context) error { out, err := c.runReload(pcCtx, c.config.ReloadCommand) if err != nil { return fmt.Errorf("nginx -s reload failed: %w (output: %s)", err, string(out)) } return nil } res, err := deploy.Apply(ctx, plan) if err != nil { return c.failureResult(c.config.CertPath, "deploy.Apply", err, startTime), err } // Post-deploy TLS verify (frozen decision 0.3 default ON). // SkippedAsIdempotent means no actual deploy happened; skip // the verify because the operator's prior deploy already // succeeded. if !res.SkippedAsIdempotent { if verifyErr := c.runPostDeployVerify(ctx, request.CertPEM); verifyErr != nil { c.logger.Error("post-deploy TLS verify failed; rolling back", "error", verifyErr, "cert_path", c.config.CertPath) rollbackErr := c.rollbackToBackups(ctx, res.BackupPaths) if rollbackErr != nil { return c.failureResult(c.config.CertPath, "post-deploy verify + rollback both failed", fmt.Errorf("verify: %w; rollback: %v", verifyErr, rollbackErr), startTime), rollbackErr } return c.failureResult(c.config.CertPath, "post-deploy verify failed; rolled back", verifyErr, startTime), verifyErr } } deploymentDuration := time.Since(startTime) idemNote := "" if res.SkippedAsIdempotent { idemNote = " (idempotent skip — bytes unchanged)" } c.logger.Info("certificate deployed to NGINX successfully", "duration", deploymentDuration.String(), "cert_path", c.config.CertPath, "idempotent", res.SkippedAsIdempotent) return &target.DeploymentResult{ Success: true, TargetAddress: c.config.CertPath, DeploymentID: fmt.Sprintf("nginx-%d", time.Now().Unix()), Message: "Certificate deployed and NGINX reloaded successfully" + idemNote, DeployedAt: time.Now(), Metadata: map[string]string{ "cert_path": c.config.CertPath, "chain_path": c.config.ChainPath, "duration_ms": fmt.Sprintf("%d", deploymentDuration.Milliseconds()), "idempotent": fmt.Sprintf("%t", res.SkippedAsIdempotent), }, }, nil } // ValidateOnly runs the validate step (`nginx -t`) WITHOUT touching // the live cert. Used by operators to preview a deploy. Phase 3 // stub is replaced by this real implementation in Phase 4. // // V2 contract: returns nil when the operator's ValidateCommand // passes; returns the wrapped command error otherwise. We do NOT // stage the temp files in V2 — `nginx -t` reads the live config // which references live cert paths that still hold the OLD cert. // V3-Pro extends to full pre-deploy temp-config validation. func (c *Connector) ValidateOnly(ctx context.Context, request target.DeploymentRequest) error { if c.config == nil || c.config.ValidateCommand == "" { return target.ErrValidateOnlyNotSupported } out, err := c.runValidate(ctx, c.config.ValidateCommand) if err != nil { return fmt.Errorf("nginx -t (ValidateOnly): %w (output: %s)", err, string(out)) } return nil } // buildPlan assembles a deploy.Plan for one cert+chain+key // deployment. Honors the per-target file mode/ownership overrides // + falls back to nginx:nginx defaults for new files (frozen // decision 0.7). func (c *Connector) buildPlan(request target.DeploymentRequest) deploy.Plan { files := []deploy.File{{ Path: c.config.CertPath, Bytes: []byte(request.CertPEM), Mode: c.config.CertFileMode, Owner: c.config.CertFileOwner, Group: c.config.CertFileGroup, }} if c.config.ChainPath != "" && request.ChainPEM != "" { files = append(files, deploy.File{ Path: c.config.ChainPath, Bytes: []byte(request.ChainPEM), Mode: c.config.ChainFileMode, Owner: c.config.ChainFileOwner, Group: c.config.ChainFileGroup, }) } if c.config.KeyPath != "" && request.KeyPEM != "" { // Key file default mode is 0640 (NGINX worker reads via // group); 0600 would lock the worker out unless the // agent runs as the nginx user. Per-File explicit mode // wins over Defaults; we set the default explicitly here // so the deploy package's FileDefaults.Mode (0644 — for // cert/chain) doesn't bleed onto the key. keyMode := c.config.KeyFileMode if keyMode == 0 { keyMode = 0640 } files = append(files, deploy.File{ Path: c.config.KeyPath, Bytes: []byte(request.KeyPEM), Mode: keyMode, Owner: c.config.KeyFileOwner, Group: c.config.KeyFileGroup, }) } return deploy.Plan{ Files: files, Defaults: deploy.FileDefaults{ // Mode default 0644 for certs+chain; the key File // entry above carries Mode=0 which inherits this AND // would be insecure (key world-readable) — so we // special-case key files in the per-File loop above // once Mode/Owner overrides exist. For now operators // MUST set KeyFileMode explicitly for V2; documented // loud in the troubleshooting matrix. Mode: 0644, // Owner / Group default to the nginx system user // when it exists on the host; otherwise we leave // them empty so the deploy package skips chown // entirely. This makes the connector portable // across distributions (Debian: www-data, Alpine: // nginx, Red Hat: nginx) and across non-root test // environments where the user lookup would fail. Owner: pickFirstExistingUser("nginx", "www-data"), Group: pickFirstExistingGroup("nginx", "www-data"), }, BackupRetention: c.config.BackupRetention, } } // pickFirstExistingUser returns the first user from candidates // that resolves on the host, or "" if none do. Used by buildPlan // to keep cross-distro defaults sensible without forcing operators // to set them explicitly. func pickFirstExistingUser(candidates ...string) string { for _, name := range candidates { if _, err := userLookup(name); err == nil { return name } } return "" } // pickFirstExistingGroup mirror. func pickFirstExistingGroup(candidates ...string) string { for _, name := range candidates { if _, err := groupLookup(name); err == nil { return name } } return "" } // runPostDeployVerify dials the configured endpoint, performs a // TLS handshake, and asserts the leaf cert's SHA-256 matches the // SHA-256 of the bytes we just deployed. Retries with backoff per // PostDeployVerifyAttempts to handle load-balanced targets. // // Returns nil on match; returns an error on any failure mode // (mismatch, dial timeout, handshake failure, DNS resolution // failure). The Apply caller decides whether to roll back. // // Frozen decision 0.3: this runs by default. Operators opt out per // target by setting Config.PostDeployVerify.Enabled = false. func (c *Connector) runPostDeployVerify(ctx context.Context, deployedCertPEM string) error { verify := c.config.PostDeployVerify if verify != nil && !verify.Enabled { // Operator-explicit opt-out. c.logger.Info("post-deploy TLS verify disabled per config") return nil } endpoint := "" timeout := 10 * time.Second if verify != nil { endpoint = verify.Endpoint if verify.Timeout > 0 { timeout = verify.Timeout } } if endpoint == "" { // V2 default: no endpoint = no verify (operator opted in // to verify but didn't tell us where to dial). Document // loud + skip rather than fail. c.logger.Warn("post-deploy verify enabled but no endpoint configured; skipping", "hint", "set Config.PostDeployVerify.Endpoint = host:port") return nil } want, err := certPEMToFingerprint(deployedCertPEM) if err != nil { return fmt.Errorf("compute deployed cert fingerprint: %w", err) } retryCfg := tlsprobe.RetryConfig{ Attempts: c.config.PostDeployVerifyAttempts, InitialBackoff: c.config.PostDeployVerifyBackoff, MaxBackoff: c.config.PostDeployVerifyMaxBackoff, } probe := func(probectx context.Context) error { res := c.probe(probectx, endpoint, timeout) if !res.Success { return fmt.Errorf("TLS probe failed: %s", res.Error) } got := strings.ToLower(res.Fingerprint) wantLower := strings.ToLower(want) if got != wantLower { return fmt.Errorf("post-deploy TLS verify SHA-256 mismatch: got %s, want %s", got, wantLower) } c.logger.Info("post-deploy TLS verify succeeded", "endpoint", endpoint, "fingerprint", got) return nil } return tlsprobe.VerifyWithExponentialBackoff(ctx, retryCfg, probe) } // rollbackToBackups manually triggers a restore by overwriting // each File path with its backup contents. Used when post-deploy // TLS verify fails (the deploy.Apply already succeeded; we now // undo it ourselves). func (c *Connector) rollbackToBackups(ctx context.Context, backupPaths map[string]string) error { for finalPath, backupPath := range backupPaths { if backupPath == "" { // File didn't exist before deploy → "rollback" is // removal. if err := os.Remove(finalPath); err != nil && !errors.Is(err, os.ErrNotExist) { return fmt.Errorf("rollback remove %s: %w", finalPath, err) } continue } bytes, err := os.ReadFile(backupPath) if err != nil { return fmt.Errorf("rollback read backup %s: %w", backupPath, err) } if _, err := deploy.AtomicWriteFile(ctx, finalPath, bytes, deploy.WriteOptions{ SkipIdempotent: true, BackupRetention: -1, // don't backup the rollback (no chain explosion) }); err != nil { return fmt.Errorf("rollback write %s: %w", finalPath, err) } } // Re-run the reload command against the restored bytes so // NGINX picks up the OLD cert again. out, err := c.runReload(ctx, c.config.ReloadCommand) if err != nil { return fmt.Errorf("rollback reload failed: %w (output: %s)", err, string(out)) } return nil } // failureResult builds a target.DeploymentResult for the various // error paths. Centralized so the field set stays consistent. func (c *Connector) failureResult(addr, stage string, err error, startTime time.Time) *target.DeploymentResult { return &target.DeploymentResult{ Success: false, TargetAddress: addr, Message: fmt.Sprintf("%s: %v", stage, err), DeployedAt: time.Now(), Metadata: map[string]string{ "stage": stage, "duration_ms": fmt.Sprintf("%d", time.Since(startTime).Milliseconds()), }, } } // certPEMToFingerprint extracts the SHA-256 hex fingerprint of the // first certificate block in a PEM bundle. Mirrors the // tlsprobe.CertFingerprint output format so equality compare // works. func certPEMToFingerprint(pemBytes string) (string, error) { der, err := firstPEMBlock(pemBytes, "CERTIFICATE") if err != nil { return "", err } h := sha256.Sum256(der) return hex.EncodeToString(h[:]), nil } // firstPEMBlock pulls the bytes of the first PEM block of the // requested type. Avoids importing encoding/pem at the cost of a // tiny scanner — keeps this package's import surface lean. func firstPEMBlock(pemBytes, blockType string) ([]byte, error) { begin := "-----BEGIN " + blockType + "-----" end := "-----END " + blockType + "-----" beginIdx := strings.Index(pemBytes, begin) if beginIdx < 0 { return nil, fmt.Errorf("no %s PEM block found", blockType) } rest := pemBytes[beginIdx+len(begin):] endIdx := strings.Index(rest, end) if endIdx < 0 { return nil, fmt.Errorf("PEM block not terminated") } body := strings.TrimSpace(rest[:endIdx]) // Decode base64. body = strings.ReplaceAll(body, "\n", "") body = strings.ReplaceAll(body, "\r", "") body = strings.ReplaceAll(body, " ", "") return decodeStdB64(body) } func decodeStdB64(s string) ([]byte, error) { // Use stdlib base64 via a tiny indirection to avoid an extra // import statement on this file (we already own atomic.go's // indirection; keeping the bundle's churn to one file). return b64Decode(s) } // ValidateDeployment verifies that the deployed certificate is // valid and accessible. It validates the NGINX configuration to // ensure the certificate can be read. func (c *Connector) ValidateDeployment(ctx context.Context, request target.ValidationRequest) (*target.ValidationResult, error) { c.logger.Info("validating NGINX deployment", "certificate_id", request.CertificateID, "serial", request.Serial) startTime := time.Now() if _, err := c.runValidate(ctx, c.config.ValidateCommand); err != nil { errMsg := fmt.Sprintf("NGINX config validation failed: %v", err) c.logger.Error("validation failed", "error", err) return &target.ValidationResult{ Valid: false, Serial: request.Serial, TargetAddress: c.config.CertPath, Message: errMsg, ValidatedAt: time.Now(), }, fmt.Errorf("%s", errMsg) } if _, err := os.Stat(c.config.CertPath); os.IsNotExist(err) { errMsg := fmt.Sprintf("certificate file not found: %s", c.config.CertPath) c.logger.Error("validation failed", "error", err) return &target.ValidationResult{ Valid: false, Serial: request.Serial, TargetAddress: c.config.CertPath, Message: errMsg, ValidatedAt: time.Now(), }, fmt.Errorf("%s", errMsg) } validationDuration := time.Since(startTime) c.logger.Info("NGINX deployment validated successfully", "duration", validationDuration.String()) return &target.ValidationResult{ Valid: true, Serial: request.Serial, TargetAddress: c.config.CertPath, Message: "NGINX configuration valid and certificate accessible", ValidatedAt: time.Now(), Metadata: map[string]string{ "validate_command": c.config.ValidateCommand, "duration_ms": fmt.Sprintf("%d", validationDuration.Milliseconds()), }, }, nil }