mirror of
https://github.com/shankar0123/certctl.git
synced 2026-06-08 08:28:54 +00:00
Compare commits
88 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 151107c969 | |||
| b1ca046fdf | |||
| 28f93f1f46 | |||
| 569aea255f | |||
| c70bb071f9 | |||
| f7fcd1e187 | |||
| 9155ec9174 | |||
| 58a15e0b3d | |||
| d64c1821a5 | |||
| c8e77fdeca | |||
| 1b95709d4b | |||
| 35277c0f2c | |||
| 5c5bbedc7e | |||
| d7546aedca | |||
| 5ea45a19b9 | |||
| 374ec574c5 | |||
| 4f2d865b51 | |||
| 578ac4ec68 | |||
| 7e2481b225 | |||
| 2e9262cfb7 | |||
| 5d7bc86451 | |||
| c4ed3da30b | |||
| 663b14bfd8 | |||
| 43836aca7c | |||
| 8c2d3c844e | |||
| c7f3ec6290 | |||
| 6acf3559a3 | |||
| 3e09401502 | |||
| 38f1200f26 | |||
| e1ab1db65a | |||
| c95685f8ab | |||
| a0404f2d21 | |||
| 34d5200904 | |||
| 3ce05ab0a8 | |||
| 360eaa75bc | |||
| b721596213 | |||
| 6a640ac3e7 | |||
| 15fedbaa06 | |||
| c40690e42d | |||
| 657a699564 | |||
| 183c56f6c5 | |||
| a485e31f63 | |||
| 8f2e5771db | |||
| 037876fa0f | |||
| 7d2e7043b9 | |||
| 037dab7b6f | |||
| e6cfd756ac | |||
| 67dbd18fda | |||
| 5a1dbce6d5 | |||
| 76e9380389 | |||
| 7268d12a17 | |||
| 9ba5ee41be | |||
| 8e84527ba2 | |||
| 622c19cafe | |||
| bc417fc458 | |||
| ac5bb71b61 | |||
| fc237de357 | |||
| b22cdb3405 | |||
| 03f0e08a77 | |||
| 38f86bca86 | |||
| af5c39252f | |||
| 6c00f7b0d3 | |||
| 49096914d2 | |||
| aa1c12ae2d | |||
| 5231609f26 | |||
| c146e8f75b | |||
| a9e229bd2a | |||
| 700c399367 | |||
| 1fcb05181d | |||
| 508c7530e9 | |||
| c9f932be65 | |||
| 868f1c25be | |||
| 9ce2d8ca8f | |||
| 0987e222dd | |||
| e761ae40a4 | |||
| 1daae5d709 | |||
| 7c01f811a1 | |||
| c1b581b047 | |||
| e37403edf1 | |||
| 93e00f6a5e | |||
| c8985cf868 | |||
| 155f1fec98 | |||
| 29cb13e7a2 | |||
| 9135c44908 | |||
| 952682ebec | |||
| a41fc2d75c | |||
| c8347d742d | |||
| 67f346cd87 |
@@ -0,0 +1,118 @@
|
|||||||
|
# Acquisition-audit DEPL-005 + DATA-012 closure (Sprint 4 ACQ,
|
||||||
|
# 2026-05-16). Weekly backup-restore smoke test.
|
||||||
|
#
|
||||||
|
# Why
|
||||||
|
# ===
|
||||||
|
# The Helm CronJob at deploy/helm/certctl/templates/backup-cronjob.yaml
|
||||||
|
# and the operator runbook at docs/operator/runbooks/postgres-backup.md
|
||||||
|
# both document a pg_dump -Fc -based backup strategy, but the dump has
|
||||||
|
# never been restored end-to-end under CI. A backup procedure that has
|
||||||
|
# never been restore-tested is not a backup procedure. This workflow
|
||||||
|
# adds the missing assertion.
|
||||||
|
#
|
||||||
|
# What
|
||||||
|
# ====
|
||||||
|
# Each Monday at 07:00 UTC (1h offset from loadtest.yml's 06:00 UTC
|
||||||
|
# slot so they don't fight for runners), boot a real Postgres
|
||||||
|
# 16-alpine container pinned to the same image digest as the production
|
||||||
|
# deploy/docker-compose.yml, exercise the audit_events hash chain
|
||||||
|
# with a small synthetic workload, pg_dump the database, drop the
|
||||||
|
# schema, pg_restore, and assert the chain head + row count
|
||||||
|
# round-trip byte-for-byte.
|
||||||
|
#
|
||||||
|
# The chain head round-trip property is the load-bearing assertion.
|
||||||
|
# Migration 000047 hashes each audit_events row's canonical payload
|
||||||
|
# with `to_char(timestamp AT TIME ZONE 'UTC',
|
||||||
|
# 'YYYY-MM-DD"T"HH24:MI:SS.US"Z"')`. Any TIMESTAMPTZ-precision loss
|
||||||
|
# in the dump→restore path (a real concern across major Postgres
|
||||||
|
# upgrades or with --format=plain) would corrupt the hash. The whole
|
||||||
|
# point of testing instead of trusting docs is to PROVE the property
|
||||||
|
# under a real workload.
|
||||||
|
#
|
||||||
|
# Workflow boundaries
|
||||||
|
# ===================
|
||||||
|
# - Does not exercise PITR / WAL archiving (DR runbook owns that).
|
||||||
|
# - Does not exercise the Helm CronJob's S3 sink or scheduling
|
||||||
|
# (operator-side concern, not a property of the dump shape).
|
||||||
|
# - Does not deploy or boot the certctl-server itself — the smoke
|
||||||
|
# harness talks to Postgres directly; we're testing the dump,
|
||||||
|
# not the server.
|
||||||
|
|
||||||
|
name: backup-restore-smoke
|
||||||
|
|
||||||
|
on:
|
||||||
|
# Manual trigger from the Actions tab — useful before tagging a
|
||||||
|
# release that touches the audit_events schema, or after a dep
|
||||||
|
# bump that could affect canonical-payload formatting.
|
||||||
|
workflow_dispatch:
|
||||||
|
|
||||||
|
schedule:
|
||||||
|
# Mondays at 07:00 UTC. Off-peak, offset 1h from loadtest.yml
|
||||||
|
# (06:00 UTC) so the two jobs don't fight for runners on the
|
||||||
|
# GitHub-hosted ubuntu-latest pool.
|
||||||
|
- cron: '0 7 * * 1'
|
||||||
|
|
||||||
|
# Defense-in-depth: this job reads source and exercises a database;
|
||||||
|
# it never needs write access to PRs, branches, releases, or
|
||||||
|
# packages. Pin permissions to the minimum.
|
||||||
|
permissions:
|
||||||
|
contents: read
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
backup-restore:
|
||||||
|
name: pg_dump / pg_restore smoke
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
|
||||||
|
# 15-minute hard cap. The actual workload + dump + restore + verify
|
||||||
|
# cycle runs in well under a minute on a warm runner; 15 minutes
|
||||||
|
# absorbs cold image pulls, slow runner provisioning, and the
|
||||||
|
# Postgres service-container readiness wait without letting a stuck
|
||||||
|
# job consume the runner indefinitely.
|
||||||
|
timeout-minutes: 15
|
||||||
|
|
||||||
|
# Postgres service container. Pin to the same digest as
|
||||||
|
# deploy/docker-compose.yml so the smoke runs against the exact
|
||||||
|
# image the production deploy uses — a regression that surfaces
|
||||||
|
# only on a specific Postgres minor bump shows up here on the
|
||||||
|
# next image refresh in compose, not silently on a customer site.
|
||||||
|
services:
|
||||||
|
postgres:
|
||||||
|
image: postgres:16-alpine@sha256:890480b08124ce7f79960a9bb16fe39729aa302bd384bfd7c408fee6c8f7adb7
|
||||||
|
env:
|
||||||
|
POSTGRES_DB: certctl
|
||||||
|
POSTGRES_USER: certctl
|
||||||
|
POSTGRES_PASSWORD: certctl
|
||||||
|
ports:
|
||||||
|
- 5432:5432
|
||||||
|
# GitHub's services-container health check. The smoke shell
|
||||||
|
# also waits for pg_isready as a belt-and-suspenders guard.
|
||||||
|
options: >-
|
||||||
|
--health-cmd "pg_isready -U certctl -d certctl"
|
||||||
|
--health-interval 5s
|
||||||
|
--health-timeout 3s
|
||||||
|
--health-retries 10
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- name: Checkout
|
||||||
|
uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
|
||||||
|
|
||||||
|
- name: Set up Go
|
||||||
|
uses: actions/setup-go@40f1582b2485089dde7abd97c1529aa768e1baff # v5
|
||||||
|
with:
|
||||||
|
go-version: '1.25.10'
|
||||||
|
# Cache go-build + go-mod for the weekly run. Keep the
|
||||||
|
# cache key bound to go.sum so a dep bump invalidates it.
|
||||||
|
cache: true
|
||||||
|
|
||||||
|
- name: Run backup-restore smoke
|
||||||
|
env:
|
||||||
|
PGHOST: 127.0.0.1
|
||||||
|
PGPORT: '5432'
|
||||||
|
PGUSER: certctl
|
||||||
|
PGPASSWORD: certctl
|
||||||
|
PGDATABASE: certctl
|
||||||
|
# Insert enough rows to exercise the chain over a non-trivial
|
||||||
|
# length. 24 ≫ 1 — large enough to surface ordering bugs,
|
||||||
|
# small enough that the dump finishes in seconds.
|
||||||
|
SMOKE_ROWS: '24'
|
||||||
|
run: bash deploy/test/backup-restore-smoke.sh
|
||||||
@@ -132,6 +132,18 @@ jobs:
|
|||||||
run: |
|
run: |
|
||||||
go test ./internal/service/... ./internal/api/handler/... ./internal/api/middleware/... ./internal/api/router/... ./internal/auth/... ./internal/integration/... ./internal/connector/issuer/... ./internal/connector/target/... ./internal/connector/notifier/... ./internal/connector/discovery/... ./internal/crypto/... ./internal/mcp/... ./internal/cli/... ./internal/domain/... ./internal/validation/... ./internal/tlsprobe/... ./internal/ciparity/... -count=1 -cover -coverprofile=coverage.out
|
go test ./internal/service/... ./internal/api/handler/... ./internal/api/middleware/... ./internal/api/router/... ./internal/auth/... ./internal/integration/... ./internal/connector/issuer/... ./internal/connector/target/... ./internal/connector/notifier/... ./internal/connector/discovery/... ./internal/crypto/... ./internal/mcp/... ./internal/cli/... ./internal/domain/... ./internal/validation/... ./internal/tlsprobe/... ./internal/ciparity/... -count=1 -cover -coverprofile=coverage.out
|
||||||
|
|
||||||
|
- name: Multi-replica rate-limit integration test (Phase 13 Sprint 13.2/13.3 — ARCH-M1 closure proof)
|
||||||
|
# The falsifiable proof that CERTCTL_RATE_LIMIT_BACKEND=postgres
|
||||||
|
# enforces caps cluster-wide. testcontainers-go spins one
|
||||||
|
# Postgres container; 3 *PostgresSlidingWindowLimiter instances
|
||||||
|
# share it; 100 concurrent Allow("test-key") with cap=10 must
|
||||||
|
# see exactly 10 succeed + 90 ErrRateLimited. Failure here =
|
||||||
|
# the row-lock arbitration broke; ARCH-M1 closure is invalid.
|
||||||
|
run: |
|
||||||
|
go test -tags=integration -race -count=1 -timeout=300s \
|
||||||
|
-run TestRateLimit_PostgresBackend_CapEnforcedAcrossReplicas \
|
||||||
|
./internal/integration/...
|
||||||
|
|
||||||
- name: Check Coverage Thresholds
|
- name: Check Coverage Thresholds
|
||||||
# ci-pipeline-cleanup Phase 2: per-package floors moved to
|
# ci-pipeline-cleanup Phase 2: per-package floors moved to
|
||||||
# .github/coverage-thresholds.yml. Each entry has `floor:` +
|
# .github/coverage-thresholds.yml. Each entry has `floor:` +
|
||||||
@@ -412,6 +424,15 @@ jobs:
|
|||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
|
- uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
|
||||||
|
with:
|
||||||
|
# ARCH-001-A closure (Sprint 5, 2026-05-16). The
|
||||||
|
# openapi-version-tag-parity guard needs the v* tags to
|
||||||
|
# be present locally so it can confirm openapi.yaml's
|
||||||
|
# info.version matches the latest release. Without
|
||||||
|
# fetch-tags, the guard falls back to the GitHub API —
|
||||||
|
# works but adds a network round-trip per CI run.
|
||||||
|
fetch-tags: true
|
||||||
|
fetch-depth: 0
|
||||||
|
|
||||||
- name: Set up Node.js
|
- name: Set up Node.js
|
||||||
uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020 # v4
|
uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020 # v4
|
||||||
@@ -445,6 +466,17 @@ jobs:
|
|||||||
working-directory: web
|
working-directory: web
|
||||||
run: npx vite build
|
run: npx vite build
|
||||||
|
|
||||||
|
- name: Frontend bundle-size budget (size-limit)
|
||||||
|
# Acquisition-audit SCALE-007 closure (Sprint 6 ACQ, 2026-05-16).
|
||||||
|
# Per-chunk + per-tier budgets in web/.size-limit.json; brotli-
|
||||||
|
# compressed sizes match real-world download cost. A regression
|
||||||
|
# that bloats a chunk past its cap fails this step and forces
|
||||||
|
# an explicit operator decision (fix vs raise cap with rationale).
|
||||||
|
# The script wrapper at scripts/ci-guards/G-frontend-bundle-budget.sh
|
||||||
|
# is the local-runnable counterpart; both invoke `npm run size`.
|
||||||
|
working-directory: web
|
||||||
|
run: npm run size
|
||||||
|
|
||||||
- name: Regression guards (extracted to scripts/ci-guards/)
|
- name: Regression guards (extracted to scripts/ci-guards/)
|
||||||
# All named regression guards live at scripts/ci-guards/<id>.sh per
|
# All named regression guards live at scripts/ci-guards/<id>.sh per
|
||||||
# ci-pipeline-cleanup bundle Phase 1. Each guard is callable locally:
|
# ci-pipeline-cleanup bundle Phase 1. Each guard is callable locally:
|
||||||
|
|||||||
@@ -0,0 +1,112 @@
|
|||||||
|
# Phase 8 closure (TEST-H1 + TEST-H2): browser-driven E2E + visual
|
||||||
|
# regression.
|
||||||
|
#
|
||||||
|
# TEST-003 closure (Sprint 5, 2026-05-16): the suite has accumulated
|
||||||
|
# the empirical green-run evidence the Phase 8 prompt required. 14
|
||||||
|
# consecutive green runs across 2026-05-14 to 2026-05-15 (sampled
|
||||||
|
# via api.github.com/repos/certctl-io/certctl/actions/runs) during
|
||||||
|
# heavy Sprint 1-4 frontend churn confirm stability. The job is
|
||||||
|
# now part of the merge gate (continue-on-error: false below).
|
||||||
|
#
|
||||||
|
# Operator action still required AFTER this commit pushes:
|
||||||
|
# - Add this job's check name ("Playwright E2E + visual regression") to the branch-protection required-checks
|
||||||
|
# list at https://github.com/certctl-io/certctl/settings/branches.
|
||||||
|
# Without that, the workflow's failure-blocks-merge contract
|
||||||
|
# only fires on PRs whose author is configured to honour the
|
||||||
|
# status check; configured required-checks make it universal.
|
||||||
|
#
|
||||||
|
# Visual regression: the 04-visual-regression.spec.ts file uses
|
||||||
|
# Playwright `toHaveScreenshot()`. First-run on a new branch
|
||||||
|
# regenerates baselines via the `--update-snapshots` flag; the
|
||||||
|
# operator commits the resulting PNG bytes to git. Subsequent runs
|
||||||
|
# pixel-diff. The dispatch input below provides an explicit knob
|
||||||
|
# for that initial baseline pass without needing to edit the
|
||||||
|
# workflow file. See docs/operator/runbooks/e2e-snapshot-update.md
|
||||||
|
# for the snapshot-bump workflow.
|
||||||
|
|
||||||
|
name: Frontend E2E
|
||||||
|
|
||||||
|
on:
|
||||||
|
push:
|
||||||
|
branches: [master]
|
||||||
|
paths:
|
||||||
|
- 'web/**'
|
||||||
|
- '.github/workflows/e2e.yml'
|
||||||
|
pull_request:
|
||||||
|
paths:
|
||||||
|
- 'web/**'
|
||||||
|
- '.github/workflows/e2e.yml'
|
||||||
|
workflow_dispatch:
|
||||||
|
inputs:
|
||||||
|
update_snapshots:
|
||||||
|
description: 'Regenerate visual-regression baselines (use sparingly)'
|
||||||
|
type: boolean
|
||||||
|
default: false
|
||||||
|
|
||||||
|
permissions:
|
||||||
|
contents: read
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
e2e:
|
||||||
|
name: Playwright E2E + visual regression
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
# TEST-003 closure (Sprint 5, 2026-05-16): flipped from
|
||||||
|
# continue-on-error: true after 14 consecutive green runs across
|
||||||
|
# 2026-05-14 to 2026-05-15 confirmed stability. Failures here
|
||||||
|
# now fail the workflow, which (combined with the branch
|
||||||
|
# protection update the operator owns post-merge) blocks merge.
|
||||||
|
continue-on-error: false
|
||||||
|
timeout-minutes: 15
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
|
||||||
|
|
||||||
|
- name: Set up Node.js
|
||||||
|
uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020 # v4
|
||||||
|
with:
|
||||||
|
node-version: '22'
|
||||||
|
|
||||||
|
- name: Install Dependencies
|
||||||
|
working-directory: web
|
||||||
|
run: npm ci
|
||||||
|
|
||||||
|
- name: Install Playwright browsers
|
||||||
|
working-directory: web
|
||||||
|
# --with-deps installs OS packages (libnss3, libatk1.0-0, etc.)
|
||||||
|
# the chromium browser needs. Skipping this is the #1 source
|
||||||
|
# of "tests pass locally but fail on CI" for new Playwright
|
||||||
|
# users. The browser binary downloads to ~/.cache/ms-playwright;
|
||||||
|
# the actions/setup-node cache key does NOT include it, so each
|
||||||
|
# CI run re-downloads. Add an actions/cache step targeting
|
||||||
|
# ~/.cache/ms-playwright keyed by the @playwright/test version
|
||||||
|
# in package-lock.json once the suite is stable.
|
||||||
|
run: npx playwright install --with-deps chromium
|
||||||
|
|
||||||
|
- name: Run Playwright E2E + visual regression
|
||||||
|
working-directory: web
|
||||||
|
# The webServer block in playwright.config.ts boots `npm run dev`
|
||||||
|
# automatically and waits for http://localhost:5173 to be
|
||||||
|
# responsive before the first test fires. No separate "start
|
||||||
|
# server" step needed.
|
||||||
|
run: |
|
||||||
|
if [[ "${{ github.event.inputs.update_snapshots }}" == "true" ]]; then
|
||||||
|
echo "::warning::Regenerating visual-regression baselines"
|
||||||
|
npx playwright test --update-snapshots
|
||||||
|
else
|
||||||
|
npx playwright test
|
||||||
|
fi
|
||||||
|
|
||||||
|
- name: Upload Playwright report on failure
|
||||||
|
if: failure()
|
||||||
|
uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4
|
||||||
|
with:
|
||||||
|
name: playwright-report
|
||||||
|
path: web/playwright-report/
|
||||||
|
retention-days: 7
|
||||||
|
|
||||||
|
- name: Upload visual-regression diffs on failure
|
||||||
|
if: failure()
|
||||||
|
uses: actions/upload-artifact@b4b15b8c7c6ac21ea08fcf65892d2ee8f75cf882 # v4
|
||||||
|
with:
|
||||||
|
name: visual-regression-diffs
|
||||||
|
path: web/test-results/
|
||||||
|
retention-days: 7
|
||||||
@@ -10,6 +10,7 @@ bin/
|
|||||||
# Frontend
|
# Frontend
|
||||||
web/node_modules/
|
web/node_modules/
|
||||||
web/dist/
|
web/dist/
|
||||||
|
web/.storybook-static/
|
||||||
|
|
||||||
# Test binary, built with `go test -c`
|
# Test binary, built with `go test -c`
|
||||||
*.test
|
*.test
|
||||||
|
|||||||
@@ -46,6 +46,29 @@
|
|||||||
manually. Production deploys: this guard is irrelevant
|
manually. Production deploys: this guard is irrelevant
|
||||||
(`CERTCTL_DEMO_MODE_ACK` should not be set in production).
|
(`CERTCTL_DEMO_MODE_ACK` should not be set in production).
|
||||||
|
|
||||||
|
### Fixed
|
||||||
|
|
||||||
|
- **GitHub #13 / Hotfix #19 — GUI "Something went wrong" after browser
|
||||||
|
refresh on a real (non-demo) install.** Refresh-after-login wipes the
|
||||||
|
in-memory `apiKey` (deliberate — the GUI never persists it to
|
||||||
|
localStorage as a security posture). The next API call returns a
|
||||||
|
bare 401 with no `WWW-Authenticate` header. Pre-Hotfix-19 the
|
||||||
|
AuthProvider 401 handler only hard-navigated to `/login` when `cause`
|
||||||
|
was a recognised OIDC session-expiry category (`idle_timeout` /
|
||||||
|
`absolute_timeout` / `back_channel_revoked`); bare 401s
|
||||||
|
(`cause === ''`) and `invalid_token` causes fell through to an
|
||||||
|
in-place `AuthGate` state flip that unmounted `BrowserRouter` under
|
||||||
|
an in-flight `<Link>`, triggering a `react-router-dom` invariant
|
||||||
|
that surfaced via `ErrorBoundary` as the "Something went wrong"
|
||||||
|
screen. **Fix:** every 401 now hard-navigates to `/login` regardless
|
||||||
|
of cause; the cause-aware UX is preserved by forwarding
|
||||||
|
`?session_expired=<cause>` only when cause is non-empty (bare 401s
|
||||||
|
redirect to plain `/login`). Three-line change in
|
||||||
|
`web/src/components/AuthProvider.tsx`; 4 regression tests added to
|
||||||
|
`AuthProvider.test.tsx` (empty cause from `/targets`, `invalid_token`
|
||||||
|
cause, `idle_timeout` cause, already-on-`/login` no-op guard).
|
||||||
|
Closes #13.
|
||||||
|
|
||||||
### Security
|
### Security
|
||||||
|
|
||||||
- **Alg-downgrade defense relaxed for Keycloak-shape IdPs (v2.1.0 pre-tag fix).**
|
- **Alg-downgrade defense relaxed for Keycloak-shape IdPs (v2.1.0 pre-tag fix).**
|
||||||
|
|||||||
@@ -9,7 +9,7 @@
|
|||||||
[Releases](https://github.com/certctl-io/certctl/releases)
|
[Releases](https://github.com/certctl-io/certctl/releases)
|
||||||
[Stargazers](https://github.com/certctl-io/certctl/stargazers)
|
[Stargazers](https://github.com/certctl-io/certctl/stargazers)
|
||||||
|
|
||||||
certctl is a self-hosted platform that automates the entire TLS certificate lifecycle, from issuance through renewal to deployment, with zero human intervention. Twelve native CA connectors plus an OpenSSL / shell-script adapter for custom CAs; fifteen native deployment-target connectors plus a proxy-agent pattern for network appliances and agentless targets. Private keys stay on your infrastructure where they belong. Free, source-available under BSL 1.1, covers the same lifecycle that enterprise platforms charge $100K+/year for.
|
certctl is a self-hosted platform that automates the entire TLS certificate lifecycle, from issuance through renewal to deployment, with zero human intervention. Twelve native CA connectors plus an OpenSSL / shell-script adapter for custom CAs; fourteen production-ready native deployment-target connectors plus Kubernetes Secrets (preview) and a proxy-agent pattern for network appliances and agentless targets. In agent-mode (the default), private keys stay on the host they were generated on and never touch the control plane; a demo-only `CERTCTL_KEYGEN_MODE=server` flag mints keys server-side, refuses to start without an explicit `CERTCTL_DEMO_MODE_ACK=true` acknowledgement. Free, source-available under BSL 1.1, covers the same lifecycle that enterprise platforms charge $100K+/year for.
|
||||||
|
|
||||||
The CA/Browser Forum's [Ballot SC-081v3](https://cabforum.org/2025/04/11/ballot-sc081v3-introduce-schedule-of-reducing-validity-and-data-reuse-periods/) caps public TLS certificates at **200 days by March 2026**, **100 days by 2027**, and **47 days by 2029**. At 47-day lifespans, a team managing 100 certificates is processing 7+ renewals per week, every week, forever. Manual workflows stop being a choice.
|
The CA/Browser Forum's [Ballot SC-081v3](https://cabforum.org/2025/04/11/ballot-sc081v3-introduce-schedule-of-reducing-validity-and-data-reuse-periods/) caps public TLS certificates at **200 days by March 2026**, **100 days by 2027**, and **47 days by 2029**. At 47-day lifespans, a team managing 100 certificates is processing 7+ renewals per week, every week, forever. Manual workflows stop being a choice.
|
||||||
|
|
||||||
@@ -64,7 +64,7 @@ Built for **platform engineering and DevOps teams** managing 10 to 500+ certific
|
|||||||
certctl handles the full certificate lifecycle in one self-hosted control plane:
|
certctl handles the full certificate lifecycle in one self-hosted control plane:
|
||||||
|
|
||||||
- **Issue and renew** from any CA. Let's Encrypt and any ACME provider, an embedded ACME server you can point cert-manager / certbot / lego at directly, a built-in local CA with sub-CA mode (chains under your enterprise root like ADCS), step-ca, Vault PKI, EJBCA, AWS ACM PCA, Google CAS, DigiCert, Sectigo, GlobalSign, Entrust, plus an OpenSSL / shell-script adapter for anything custom. Twelve native issuer connectors. See the [connector reference](docs/reference/connectors/index.md).
|
- **Issue and renew** from any CA. Let's Encrypt and any ACME provider, an embedded ACME server you can point cert-manager / certbot / lego at directly, a built-in local CA with sub-CA mode (chains under your enterprise root like ADCS), step-ca, Vault PKI, EJBCA, AWS ACM PCA, Google CAS, DigiCert, Sectigo, GlobalSign, Entrust, plus an OpenSSL / shell-script adapter for anything custom. Twelve native issuer connectors. See the [connector reference](docs/reference/connectors/index.md).
|
||||||
- **Deploy automatically** to NGINX, Apache, HAProxy, Caddy, Traefik, Envoy, IIS, Windows Cert Store, Java keystore, Kubernetes Secrets, AWS ACM, Azure Key Vault, SSH known-hosts, Postfix + Dovecot, F5 BIG-IP. Fifteen native target connectors. File-based targets share an atomic-write + SHA-256 idempotency + on-failure rollback + per-target Prometheus counters primitive (the `deploy.Apply` path covers 12 of 13 file-based connectors). Cloud / API targets (AWS ACM, Azure Key Vault) use vendor-SDK semantics rather than the file primitive; F5 uses iControl REST transactions; Kubernetes Secrets is preview. For the per-target guarantee matrix, see [`docs/reference/deployment-model.md`](docs/reference/deployment-model.md). The reload / validate commands operators configure for shell-using targets (NGINX, Apache, HAProxy, Postfix, JavaKeystore, SSH) are validated server-side AND agent-side against shell-metacharacter injection before execution (see [`internal/connector/target/configcheck`](internal/connector/target/configcheck)).
|
- **Deploy automatically** to NGINX, Apache, HAProxy, Caddy, Traefik, Envoy, IIS, Windows Cert Store, Java keystore, AWS ACM, Azure Key Vault, SSH known-hosts, Postfix + Dovecot, F5 BIG-IP. **Fourteen production-ready native target connectors plus Kubernetes Secrets (preview).** File-based targets share an atomic-write + SHA-256 idempotency + on-failure rollback + per-target Prometheus counters primitive (the `deploy.Apply` path covers 12 of 13 file-based connectors). Cloud / API targets (AWS ACM, Azure Key Vault) use vendor-SDK semantics rather than the file primitive; F5 uses iControl REST transactions. The Kubernetes Secrets connector is shipped as preview because the production `client-go` integration is incomplete — see [`docs/reference/deployment-model.md`](docs/reference/deployment-model.md) for the per-target guarantee matrix. The reload / validate commands operators configure for shell-using targets (NGINX, Apache, HAProxy, Postfix, JavaKeystore, SSH) are validated server-side AND agent-side against shell-metacharacter injection before execution (see [`internal/connector/target/configcheck`](internal/connector/target/configcheck)).
|
||||||
- **Run as an ACME server** so existing client tooling plugs in directly. RFC 8555 + RFC 9773 ARI, two per-profile auth modes (public-trust-style validation or trust_authenticated for internal PKI), doubly-signed key rollover, revoke-cert on both kid path and jwk path, per-account rate limiting. Cert-manager / certbot / lego all work pointed at it. See [`docs/reference/protocols/acme-server.md`](docs/reference/protocols/acme-server.md).
|
- **Run as an ACME server** so existing client tooling plugs in directly. RFC 8555 + RFC 9773 ARI, two per-profile auth modes (public-trust-style validation or trust_authenticated for internal PKI), doubly-signed key rollover, revoke-cert on both kid path and jwk path, per-account rate limiting. Cert-manager / certbot / lego all work pointed at it. See [`docs/reference/protocols/acme-server.md`](docs/reference/protocols/acme-server.md).
|
||||||
- **Run as a SCEP server** for Microsoft Intune-managed phones, ChromeOS devices, network appliances. RFC 8894 native with full PKIMessage wire format, native Intune challenge dispatch with replay protection, per-profile dispatch with separate RA cert per profile. See [`docs/reference/protocols/scep-server.md`](docs/reference/protocols/scep-server.md).
|
- **Run as a SCEP server** for Microsoft Intune-managed phones, ChromeOS devices, network appliances. RFC 8894 native with full PKIMessage wire format, native Intune challenge dispatch with replay protection, per-profile dispatch with separate RA cert per profile. See [`docs/reference/protocols/scep-server.md`](docs/reference/protocols/scep-server.md).
|
||||||
- **Run as an EST server** for HTTPS-based PKCS#10 enrollment. 802.1X / Wi-Fi authentication, IoT device enrollment, RFC 9266 channel binding. See [`docs/reference/protocols/est.md`](docs/reference/protocols/est.md).
|
- **Run as an EST server** for HTTPS-based PKCS#10 enrollment. 802.1X / Wi-Fi authentication, IoT device enrollment, RFC 9266 channel binding. See [`docs/reference/protocols/est.md`](docs/reference/protocols/est.md).
|
||||||
@@ -75,11 +75,11 @@ certctl handles the full certificate lifecycle in one self-hosted control plane:
|
|||||||
- **Discover** existing certs across your fleet via filesystem scanning on agents, network TLS probing across CIDR ranges, and cloud secret manager imports (AWS Secrets Manager, Azure Key Vault, GCP Secret Manager). Triage workflow for claim / dismiss / investigate.
|
- **Discover** existing certs across your fleet via filesystem scanning on agents, network TLS probing across CIDR ranges, and cloud secret manager imports (AWS Secrets Manager, Azure Key Vault, GCP Secret Manager). Triage workflow for claim / dismiss / investigate.
|
||||||
- **Revoke** with full RFC 5280 reason codes, DER CRL generation per issuer (scheduler-pre-generated and ETag-cached), and an embedded RFC 6960 OCSP responder with dedicated per-issuer responder certs. Single + bulk revocation. See [`docs/reference/protocols/crl-ocsp.md`](docs/reference/protocols/crl-ocsp.md).
|
- **Revoke** with full RFC 5280 reason codes, DER CRL generation per issuer (scheduler-pre-generated and ETag-cached), and an embedded RFC 6960 OCSP responder with dedicated per-issuer responder certs. Single + bulk revocation. See [`docs/reference/protocols/crl-ocsp.md`](docs/reference/protocols/crl-ocsp.md).
|
||||||
- **Alert** via Slack, Microsoft Teams, PagerDuty, OpsGenie, email, webhooks. Per-policy multi-channel routing matrix with severity tiers and fault-isolating per-channel dispatch. See [`docs/operator/runbooks/expiry-alerts.md`](docs/operator/runbooks/expiry-alerts.md).
|
- **Alert** via Slack, Microsoft Teams, PagerDuty, OpsGenie, email, webhooks. Per-policy multi-channel routing matrix with severity tiers and fault-isolating per-channel dispatch. See [`docs/operator/runbooks/expiry-alerts.md`](docs/operator/runbooks/expiry-alerts.md).
|
||||||
- **Drive the platform from natural language** via the bundled MCP (Model Context Protocol) server. The full REST API is exposed as MCP tools — ask your AI client "show me all expiring certificates", "revoke the VPN cert, key compromised", or "what agents are offline?" and it translates to API calls. Stateless stdio-transport binary at `cmd/mcp-server/`; same auth as the REST API; no extra attack surface. See [`docs/reference/mcp.md`](docs/reference/mcp.md).
|
- **Drive the platform from natural language** via the bundled MCP (Model Context Protocol) server. The bulk of the REST API surface is exposed as MCP tools — ask your AI client "show me all expiring certificates", "revoke the VPN cert, key compromised", or "what agents are offline?" and it translates to API calls. Stateless stdio-transport binary at `cmd/mcp-server/`; same auth as the REST API; no extra attack surface. MCP-vs-REST parity (162 tools covering 221 routes; the gap is a small allowlist of streaming + protocol-conformance endpoints that don't fit the request-response tool shape) is tracked in [`docs/reference/mcp-coverage.md`](docs/reference/mcp-coverage.md) with a CI guard that fails the build if a new REST route lands without either an MCP tool or an explicit allowlist entry. See [`docs/reference/mcp.md`](docs/reference/mcp.md).
|
||||||
|
|
||||||
## Architecture and security
|
## Architecture and security
|
||||||
|
|
||||||
Go 1.25 control plane with handler → service → repository layering. PostgreSQL 16 backend with idempotent migrations. Pull-only deployment model — the server never initiates outbound connections. Agents poll for work and generate ECDSA P-256 keys locally so private keys never touch the control plane. For network appliances and agentless servers, a proxy agent in the same network zone handles deployment via the target's API (WinRM, iControl REST, SSH/SFTP). See the [Architecture Guide](docs/reference/architecture.md) for full system diagrams.
|
Go 1.25 control plane with handler → service → repository layering. PostgreSQL 16 backend with idempotent migrations. Pull-only deployment model — the server never initiates outbound connections. **In agent-keygen mode (the production default), agents poll for work and generate ECDSA P-256 keys locally, so private keys never touch the control plane.** The opposite path (`CERTCTL_KEYGEN_MODE=server`) is demo-only and refuses to boot in production without an explicit `CERTCTL_DEMO_MODE_ACK=true` acknowledgement. For network appliances and agentless servers, a proxy agent in the same network zone handles deployment via the target's API (WinRM, iControl REST, SSH/SFTP). See the [Architecture Guide](docs/reference/architecture.md) for full system diagrams.
|
||||||
|
|
||||||
Security: three authentication paths — API keys (SHA-256 hashed + constant-time compared), [OIDC SSO](docs/operator/oidc-runbooks/index.md) (Keycloak / Authentik / Okta / Auth0 / Entra ID / Google Workspace), and Argon2id [break-glass admin](docs/operator/security.md) for SSO-outage recovery. Successful OIDC login mints an HMAC-signed server-side session with `__Host-` cookies, CSRF rotation on every privileged write, and [OpenID Connect Back-Channel Logout 1.0](docs/reference/auth-standards-implemented.md) for IdP-driven session revocation. Role-based authorization on every gated handler with global / per-profile / per-issuer scope. Auditor split keeps regulator-class actors strictly read-only on the audit trail. Day-0 admin via a one-shot bootstrap token; granting or revoking roles requires the dedicated `auth.role.assign` permission. CORS deny-by-default. Shell injection prevention on all connector scripts. SSRF protection (reserved IP filtering) on the network scanner. Issuer + target + OIDC client_secret credentials encrypted at rest with AES-256-GCM. HTTPS-only control plane with TLS 1.3 pinned and a fail-closed startup gate that refuses to boot if the TLS bundle is unusable. Every API call recorded to an immutable audit trail with actor attribution, body hash, and latency tracking. CI runs race detection, static analysis, and vulnerability scanning on every commit. See [`docs/operator/security.md`](docs/operator/security.md) for the full posture and [`docs/operator/auth-threat-model.md`](docs/operator/auth-threat-model.md) for what's defended vs deferred.
|
Security: three authentication paths — API keys (SHA-256 hashed + constant-time compared), [OIDC SSO](docs/operator/oidc-runbooks/index.md) (Keycloak / Authentik / Okta / Auth0 / Entra ID / Google Workspace), and Argon2id [break-glass admin](docs/operator/security.md) for SSO-outage recovery. Successful OIDC login mints an HMAC-signed server-side session with `__Host-` cookies, CSRF rotation on every privileged write, and [OpenID Connect Back-Channel Logout 1.0](docs/reference/auth-standards-implemented.md) for IdP-driven session revocation. Role-based authorization on every gated handler with global / per-profile / per-issuer scope. Auditor split keeps regulator-class actors strictly read-only on the audit trail. Day-0 admin via a one-shot bootstrap token; granting or revoking roles requires the dedicated `auth.role.assign` permission. CORS deny-by-default. Shell injection prevention on all connector scripts. SSRF protection (reserved IP filtering) on the network scanner. Issuer + target + OIDC client_secret credentials encrypted at rest with AES-256-GCM. HTTPS-only control plane with TLS 1.3 pinned and a fail-closed startup gate that refuses to boot if the TLS bundle is unusable. Every API call recorded to an immutable audit trail with actor attribution, body hash, and latency tracking. CI runs race detection, static analysis, and vulnerability scanning on every commit. See [`docs/operator/security.md`](docs/operator/security.md) for the full posture and [`docs/operator/auth-threat-model.md`](docs/operator/auth-threat-model.md) for what's defended vs deferred.
|
||||||
|
|
||||||
@@ -92,10 +92,12 @@ Security: three authentication paths — API keys (SHA-256 hashed + constant-tim
|
|||||||
```bash
|
```bash
|
||||||
git clone https://github.com/certctl-io/certctl.git
|
git clone https://github.com/certctl-io/certctl.git
|
||||||
cd certctl
|
cd certctl
|
||||||
docker compose -f deploy/docker-compose.yml -f deploy/docker-compose.demo.yml up -d --build
|
./deploy/demo-up.sh -d --build
|
||||||
```
|
```
|
||||||
|
|
||||||
Wait ~30 seconds, then open **https://localhost:8443** in your browser. The demo overlay flips the base into demo-mode auth (every request served as the synthetic admin actor `actor-demo-anon` — the server emits a prominent ⚠ DEMO MODE banner at boot reminding you this posture is for evaluation only) and seeds 180 days of realistic history across 13 issuers, 8 agents, managed + discovered certs, jobs, deploys, audit, and notification events. The `certctl-tls-init` init container self-signs an ECDSA-P256 cert on first boot — accept the browser warning for the demo, or feed the generated `ca.crt` to your client.
|
Wait ~30 seconds, then open **https://localhost:8443** in your browser. The `demo-up.sh` wrapper exports a fresh `CERTCTL_DEMO_MODE_ACK_TS=$(date +%s)` and forwards the remaining args to `docker compose -f docker-compose.yml -f docker-compose.demo.yml up`. The timestamp export is required by the Phase 2 SEC-H3 fail-closed guard in `internal/config/config.go::Validate` — demo deploys must re-ACK every 24h so a forgotten demo container never silently ends up serving production traffic with `auth-type=none`. The bare `docker compose ... up` command without the timestamp refuses to boot; the wrapper script is the supported entry point.
|
||||||
|
|
||||||
|
The demo overlay flips the base into demo-mode auth (every request served as the synthetic admin actor `actor-demo-anon` — the server emits a prominent ⚠ DEMO MODE banner at boot reminding you this posture is for evaluation only) and seeds 180 days of realistic history across 13 issuers, 8 agents, managed + discovered certs, jobs, deploys, audit, and notification events. The `certctl-tls-init` init container self-signs an ECDSA-P256 cert on first boot — accept the browser warning for the demo, or feed the generated `ca.crt` to your client.
|
||||||
|
|
||||||
**Production path — `.env` required, fail-closed on placeholders:**
|
**Production path — `.env` required, fail-closed on placeholders:**
|
||||||
|
|
||||||
|
|||||||
@@ -0,0 +1 @@
|
|||||||
|
0
|
||||||
@@ -1,48 +1,100 @@
|
|||||||
# Routes registered in internal/api/router/router.go that are intentionally
|
# Routes registered in internal/api/router/router.go that are intentionally
|
||||||
# NOT in api/openapi.yaml. Each entry needs a one-line `why:` justification.
|
# NOT in api/openapi.yaml. Each entry needs a one-line `why:` justification
|
||||||
|
# AND a required `category:` field (added in Phase 13 Sprint 13.1,
|
||||||
|
# 2026-05-14, architecture diligence audit ARCH-H1).
|
||||||
|
#
|
||||||
# Adding a new entry requires PR-time review.
|
# Adding a new entry requires PR-time review.
|
||||||
#
|
#
|
||||||
# OpenAPI-shaped REST endpoints belong in api/openapi.yaml, NOT here.
|
# OpenAPI-shaped REST endpoints belong in api/openapi.yaml, NOT here.
|
||||||
# This list is for protocol-shaped (SCEP wire endpoints) and operational
|
# This list is for protocol-shaped (SCEP/ACME/EST wire endpoints) and
|
||||||
# (health, metrics, pprof) routes only.
|
# operational (health, metrics, pprof) routes only.
|
||||||
#
|
#
|
||||||
# Per ci-pipeline-cleanup bundle Phase 9 / frozen decision 0.11.
|
# Per ci-pipeline-cleanup bundle Phase 9 / frozen decision 0.11.
|
||||||
#
|
#
|
||||||
# Phase 5 reconciliation (2026-05-13, architecture diligence audit
|
# ──────────────────────────────────────────────────────────────────────
|
||||||
# ARCH-H1): of the 64 entries below, 35 are legitimate wire-protocol
|
# The two-bucket contract (Phase 13 Sprint 13.1)
|
||||||
# carve-outs (SCEP RFC 8894 = 8 entries, ACME RFC 8555 default + per-
|
# ──────────────────────────────────────────────────────────────────────
|
||||||
# profile = 27 entries) that MUST stay. The remaining 29 are REST-
|
|
||||||
# shaped routes whose OpenAPI ops were deferred during their original
|
|
||||||
# Bundle 2 / audit-2026-05-10 / 2026-05-11 work. Burn-down plan:
|
|
||||||
#
|
#
|
||||||
# Sprint A (per-cluster, ~7-8 ops each):
|
# category: wire-protocol
|
||||||
# Cluster 1: auth/sessions + auth/oidc (12 ops)
|
# The route's wire shape is dictated by an IETF RFC (SCEP RFC 8894,
|
||||||
# Cluster 2: auth/breakglass + auth/users + auth/runtime-config (8 ops)
|
# ACME RFC 8555, ACME ARI RFC 9773, EST RFC 7030) or it's a
|
||||||
# Cluster 3: audit/export + demo-residual/cleanup + auth/logout +
|
# sibling/shorthand variant of such a route (same wire semantics,
|
||||||
# auth/breakglass/login + auth/oidc/{login,callback,bcl} (9 ops)
|
# different cosmetic path — e.g. trailing-slash forms, default-
|
||||||
|
# profile shorthands). Documenting these as REST operations in
|
||||||
|
# openapi.yaml would duplicate the RFC with no information gain;
|
||||||
|
# the canonical operator references live in docs/acme-server.md +
|
||||||
|
# docs/operator/scep.md + docs/operator/est.md. These entries
|
||||||
|
# NEVER burn down — they're protocol contracts, not gaps.
|
||||||
|
#
|
||||||
|
# category: rest-deferred
|
||||||
|
# The route is REST-shaped (resource CRUD, JSON request/response,
|
||||||
|
# RBAC-gated) but its OpenAPI operation was deferred when the
|
||||||
|
# handler shipped. These MUST monotonically decrease to zero.
|
||||||
|
# Phase 13 Sprints 13.4-13.6 author the OpenAPI ops + delete the
|
||||||
|
# corresponding exception entries; the
|
||||||
|
# openapi-rest-deferred-monotonic.sh CI guard fails any PR that
|
||||||
|
# grows the rest-deferred bucket vs the checked-in baseline at
|
||||||
|
# api/openapi-handler-exceptions-baseline.txt.
|
||||||
|
#
|
||||||
|
# ──────────────────────────────────────────────────────────────────────
|
||||||
|
# Phase 13 Sprint 13.1 categorization (2026-05-14)
|
||||||
|
# ──────────────────────────────────────────────────────────────────────
|
||||||
|
#
|
||||||
|
# Current split, re-derived by the parity script's bucket-reporting
|
||||||
|
# subcommand (post-Sprint-13.6 / 2026-05-14):
|
||||||
|
#
|
||||||
|
# total entries: 36
|
||||||
|
# wire-protocol: 36
|
||||||
|
# rest-deferred: 0 ← THE FLOOR — ARCH-H1 substantive close
|
||||||
|
#
|
||||||
|
# Burn-down progress:
|
||||||
|
#
|
||||||
|
# Sprint 13.4 SHIPPED — 28 - 13 = 15 (auth/sessions cluster 3 ops +
|
||||||
|
# auth/oidc CRUD + JWKS + test + refresh
|
||||||
|
# + group-mappings cluster, 10 ops)
|
||||||
|
# Sprint 13.5 SHIPPED — 15 - 8 = 7 (auth/breakglass admin 4 ops +
|
||||||
|
# auth/users 3 ops + auth/runtime-config
|
||||||
|
# 1 op, 8 ops total)
|
||||||
|
# Sprint 13.6 SHIPPED — 7 - 7 = 0 (audit/export 1 op + demo-
|
||||||
|
# residual/cleanup 1 op + auth/logout 1 op +
|
||||||
|
# auth/breakglass/login 1 op + 3 OIDC
|
||||||
|
# browser-flow endpoints, 7 ops total)
|
||||||
|
#
|
||||||
|
# Sprint 13.7 next tightens the parity-script's rest-deferred floor
|
||||||
|
# from monotonic-decrease to a hard zero-exact pin. After that, any
|
||||||
|
# new REST route MUST land with an OpenAPI op or fail CI — no escape
|
||||||
|
# hatch via `category: rest-deferred`.
|
||||||
#
|
#
|
||||||
# Each authored OpenAPI op needs request/response schemas (not
|
# Each authored OpenAPI op needs request/response schemas (not
|
||||||
# placeholders) so the generated client at web/orval.config.ts emits
|
# placeholders) so the generated client at web/orval.config.ts emits
|
||||||
# typed signatures. When an op lands, delete the corresponding entry
|
# typed signatures. When an op lands, delete the corresponding entry
|
||||||
# below + bump the openapi-handler-parity.sh expected counts.
|
# below + bump api/openapi-handler-exceptions-baseline.txt downward.
|
||||||
|
|
||||||
documented_exceptions:
|
documented_exceptions:
|
||||||
- route: "GET /scep"
|
- route: "GET /scep"
|
||||||
why: "SCEP wire-protocol endpoint per RFC 8894 §3.1; serves CA certs via GetCACert/GetCACaps query params, NOT a REST resource."
|
why: "SCEP wire-protocol endpoint per RFC 8894 §3.1; serves CA certs via GetCACert/GetCACaps query params, NOT a REST resource."
|
||||||
|
category: wire-protocol
|
||||||
- route: "POST /scep"
|
- route: "POST /scep"
|
||||||
why: "SCEP wire-protocol endpoint per RFC 8894 §3.1; receives PKCSReq / RenewalReq PKIMessages, NOT a REST resource."
|
why: "SCEP wire-protocol endpoint per RFC 8894 §3.1; receives PKCSReq / RenewalReq PKIMessages, NOT a REST resource."
|
||||||
|
category: wire-protocol
|
||||||
- route: "GET /scep/"
|
- route: "GET /scep/"
|
||||||
why: "SCEP wire-protocol endpoint with trailing-slash variant; ChromeOS clients send the trailing-slash form."
|
why: "SCEP wire-protocol endpoint with trailing-slash variant; ChromeOS clients send the trailing-slash form."
|
||||||
|
category: wire-protocol
|
||||||
- route: "POST /scep/"
|
- route: "POST /scep/"
|
||||||
why: "SCEP wire-protocol endpoint with trailing-slash variant; ChromeOS clients send the trailing-slash form."
|
why: "SCEP wire-protocol endpoint with trailing-slash variant; ChromeOS clients send the trailing-slash form."
|
||||||
|
category: wire-protocol
|
||||||
- route: "GET /scep-mtls"
|
- route: "GET /scep-mtls"
|
||||||
why: "SCEP-mTLS sibling endpoint per ci-pipeline-cleanup-prerequisite EST RFC 7030 hardening Phase 6.5; same wire-protocol semantics, mutually-authenticated TLS variant."
|
why: "SCEP-mTLS sibling endpoint per ci-pipeline-cleanup-prerequisite EST RFC 7030 hardening Phase 6.5; same wire-protocol semantics, mutually-authenticated TLS variant."
|
||||||
|
category: wire-protocol
|
||||||
- route: "POST /scep-mtls"
|
- route: "POST /scep-mtls"
|
||||||
why: "SCEP-mTLS sibling endpoint, POST variant."
|
why: "SCEP-mTLS sibling endpoint, POST variant."
|
||||||
|
category: wire-protocol
|
||||||
- route: "GET /scep-mtls/"
|
- route: "GET /scep-mtls/"
|
||||||
why: "SCEP-mTLS sibling endpoint, trailing-slash variant."
|
why: "SCEP-mTLS sibling endpoint, trailing-slash variant."
|
||||||
|
category: wire-protocol
|
||||||
- route: "POST /scep-mtls/"
|
- route: "POST /scep-mtls/"
|
||||||
why: "SCEP-mTLS sibling endpoint, trailing-slash POST variant."
|
why: "SCEP-mTLS sibling endpoint, trailing-slash POST variant."
|
||||||
|
category: wire-protocol
|
||||||
|
|
||||||
# ACME server (RFC 8555 + RFC 9773 ARI) — wire-protocol surface.
|
# ACME server (RFC 8555 + RFC 9773 ARI) — wire-protocol surface.
|
||||||
# Like SCEP/EST, ACME is a JWS-signed-JSON wire protocol whose
|
# Like SCEP/EST, ACME is a JWS-signed-JSON wire protocol whose
|
||||||
@@ -54,62 +106,90 @@ documented_exceptions:
|
|||||||
# challenge, cert, key-change, revoke-cert, renewal-info routes land.
|
# challenge, cert, key-change, revoke-cert, renewal-info routes land.
|
||||||
- route: "GET /acme/profile/{id}/directory"
|
- route: "GET /acme/profile/{id}/directory"
|
||||||
why: "ACME server RFC 8555 §7.1.1 directory; documented in docs/acme-server.md."
|
why: "ACME server RFC 8555 §7.1.1 directory; documented in docs/acme-server.md."
|
||||||
|
category: wire-protocol
|
||||||
- route: "HEAD /acme/profile/{id}/new-nonce"
|
- route: "HEAD /acme/profile/{id}/new-nonce"
|
||||||
why: "ACME server RFC 8555 §7.2 new-nonce; documented in docs/acme-server.md."
|
why: "ACME server RFC 8555 §7.2 new-nonce; documented in docs/acme-server.md."
|
||||||
|
category: wire-protocol
|
||||||
- route: "GET /acme/profile/{id}/new-nonce"
|
- route: "GET /acme/profile/{id}/new-nonce"
|
||||||
why: "ACME server RFC 8555 §7.2 new-nonce GET form; documented in docs/acme-server.md."
|
why: "ACME server RFC 8555 §7.2 new-nonce GET form; documented in docs/acme-server.md."
|
||||||
|
category: wire-protocol
|
||||||
- route: "POST /acme/profile/{id}/new-account"
|
- route: "POST /acme/profile/{id}/new-account"
|
||||||
why: "ACME server RFC 8555 §7.3 new-account (JWS jwk); documented in docs/acme-server.md."
|
why: "ACME server RFC 8555 §7.3 new-account (JWS jwk); documented in docs/acme-server.md."
|
||||||
|
category: wire-protocol
|
||||||
- route: "POST /acme/profile/{id}/account/{acc_id}"
|
- route: "POST /acme/profile/{id}/account/{acc_id}"
|
||||||
why: "ACME server RFC 8555 §7.3.2 + §7.3.6 (JWS kid) account update + deactivation; documented in docs/acme-server.md."
|
why: "ACME server RFC 8555 §7.3.2 + §7.3.6 (JWS kid) account update + deactivation; documented in docs/acme-server.md."
|
||||||
|
category: wire-protocol
|
||||||
- route: "GET /acme/directory"
|
- route: "GET /acme/directory"
|
||||||
why: "ACME server default-profile shorthand; mirrors per-profile when CERTCTL_ACME_SERVER_DEFAULT_PROFILE_ID is set."
|
why: "ACME server default-profile shorthand; mirrors per-profile when CERTCTL_ACME_SERVER_DEFAULT_PROFILE_ID is set."
|
||||||
|
category: wire-protocol
|
||||||
- route: "HEAD /acme/new-nonce"
|
- route: "HEAD /acme/new-nonce"
|
||||||
why: "ACME server default-profile shorthand for new-nonce HEAD."
|
why: "ACME server default-profile shorthand for new-nonce HEAD."
|
||||||
|
category: wire-protocol
|
||||||
- route: "GET /acme/new-nonce"
|
- route: "GET /acme/new-nonce"
|
||||||
why: "ACME server default-profile shorthand for new-nonce GET."
|
why: "ACME server default-profile shorthand for new-nonce GET."
|
||||||
|
category: wire-protocol
|
||||||
- route: "POST /acme/new-account"
|
- route: "POST /acme/new-account"
|
||||||
why: "ACME server default-profile shorthand for new-account."
|
why: "ACME server default-profile shorthand for new-account."
|
||||||
|
category: wire-protocol
|
||||||
- route: "POST /acme/account/{acc_id}"
|
- route: "POST /acme/account/{acc_id}"
|
||||||
why: "ACME server default-profile shorthand for account update + deactivation."
|
why: "ACME server default-profile shorthand for account update + deactivation."
|
||||||
|
category: wire-protocol
|
||||||
|
|
||||||
# Phase 2 — orders + finalize + authz + cert.
|
# Phase 2 — orders + finalize + authz + cert.
|
||||||
- route: "POST /acme/profile/{id}/new-order"
|
- route: "POST /acme/profile/{id}/new-order"
|
||||||
why: "ACME server RFC 8555 §7.4 new-order; documented in docs/acme-server.md."
|
why: "ACME server RFC 8555 §7.4 new-order; documented in docs/acme-server.md."
|
||||||
|
category: wire-protocol
|
||||||
- route: "POST /acme/profile/{id}/order/{ord_id}"
|
- route: "POST /acme/profile/{id}/order/{ord_id}"
|
||||||
why: "ACME server RFC 8555 §7.4 order POST-as-GET; documented in docs/acme-server.md."
|
why: "ACME server RFC 8555 §7.4 order POST-as-GET; documented in docs/acme-server.md."
|
||||||
|
category: wire-protocol
|
||||||
- route: "POST /acme/profile/{id}/order/{ord_id}/finalize"
|
- route: "POST /acme/profile/{id}/order/{ord_id}/finalize"
|
||||||
why: "ACME server RFC 8555 §7.4 finalize; documented in docs/acme-server.md."
|
why: "ACME server RFC 8555 §7.4 finalize; documented in docs/acme-server.md."
|
||||||
|
category: wire-protocol
|
||||||
- route: "POST /acme/profile/{id}/authz/{authz_id}"
|
- route: "POST /acme/profile/{id}/authz/{authz_id}"
|
||||||
why: "ACME server RFC 8555 §7.5 authz POST-as-GET; documented in docs/acme-server.md."
|
why: "ACME server RFC 8555 §7.5 authz POST-as-GET; documented in docs/acme-server.md."
|
||||||
|
category: wire-protocol
|
||||||
- route: "POST /acme/profile/{id}/challenge/{chall_id}"
|
- route: "POST /acme/profile/{id}/challenge/{chall_id}"
|
||||||
why: "ACME server RFC 8555 §7.5.1 challenge response; dispatches to Phase 3 validator pool."
|
why: "ACME server RFC 8555 §7.5.1 challenge response; dispatches to Phase 3 validator pool."
|
||||||
|
category: wire-protocol
|
||||||
- route: "POST /acme/profile/{id}/cert/{cert_id}"
|
- route: "POST /acme/profile/{id}/cert/{cert_id}"
|
||||||
why: "ACME server RFC 8555 §7.4.2 cert download; documented in docs/acme-server.md."
|
why: "ACME server RFC 8555 §7.4.2 cert download; documented in docs/acme-server.md."
|
||||||
|
category: wire-protocol
|
||||||
- route: "POST /acme/new-order"
|
- route: "POST /acme/new-order"
|
||||||
why: "Phase 2 default-profile shorthand for new-order."
|
why: "Phase 2 default-profile shorthand for new-order."
|
||||||
|
category: wire-protocol
|
||||||
- route: "POST /acme/order/{ord_id}"
|
- route: "POST /acme/order/{ord_id}"
|
||||||
why: "Phase 2 default-profile shorthand for order POST-as-GET."
|
why: "Phase 2 default-profile shorthand for order POST-as-GET."
|
||||||
|
category: wire-protocol
|
||||||
- route: "POST /acme/order/{ord_id}/finalize"
|
- route: "POST /acme/order/{ord_id}/finalize"
|
||||||
why: "Phase 2 default-profile shorthand for finalize."
|
why: "Phase 2 default-profile shorthand for finalize."
|
||||||
|
category: wire-protocol
|
||||||
- route: "POST /acme/authz/{authz_id}"
|
- route: "POST /acme/authz/{authz_id}"
|
||||||
why: "Phase 2 default-profile shorthand for authz POST-as-GET."
|
why: "Phase 2 default-profile shorthand for authz POST-as-GET."
|
||||||
|
category: wire-protocol
|
||||||
- route: "POST /acme/challenge/{chall_id}"
|
- route: "POST /acme/challenge/{chall_id}"
|
||||||
why: "Phase 3 default-profile shorthand for challenge response."
|
why: "Phase 3 default-profile shorthand for challenge response."
|
||||||
|
category: wire-protocol
|
||||||
- route: "POST /acme/cert/{cert_id}"
|
- route: "POST /acme/cert/{cert_id}"
|
||||||
why: "Phase 2 default-profile shorthand for cert download."
|
why: "Phase 2 default-profile shorthand for cert download."
|
||||||
|
category: wire-protocol
|
||||||
- route: "POST /acme/profile/{id}/key-change"
|
- route: "POST /acme/profile/{id}/key-change"
|
||||||
why: "ACME server RFC 8555 §7.3.5 doubly-signed key rollover; documented in docs/acme-server.md."
|
why: "ACME server RFC 8555 §7.3.5 doubly-signed key rollover; documented in docs/acme-server.md."
|
||||||
|
category: wire-protocol
|
||||||
- route: "POST /acme/profile/{id}/revoke-cert"
|
- route: "POST /acme/profile/{id}/revoke-cert"
|
||||||
why: "ACME server RFC 8555 §7.6 revoke-cert (kid OR cert-key auth); documented in docs/acme-server.md."
|
why: "ACME server RFC 8555 §7.6 revoke-cert (kid OR cert-key auth); documented in docs/acme-server.md."
|
||||||
|
category: wire-protocol
|
||||||
- route: "GET /acme/profile/{id}/renewal-info/{cert_id}"
|
- route: "GET /acme/profile/{id}/renewal-info/{cert_id}"
|
||||||
why: "ACME server RFC 9773 ACME Renewal Information (unauthenticated GET); documented in docs/acme-server.md."
|
why: "ACME server RFC 9773 ACME Renewal Information (unauthenticated GET); documented in docs/acme-server.md."
|
||||||
|
category: wire-protocol
|
||||||
- route: "POST /acme/key-change"
|
- route: "POST /acme/key-change"
|
||||||
why: "Phase 4 default-profile shorthand for key rollover."
|
why: "Phase 4 default-profile shorthand for key rollover."
|
||||||
|
category: wire-protocol
|
||||||
- route: "POST /acme/revoke-cert"
|
- route: "POST /acme/revoke-cert"
|
||||||
why: "Phase 4 default-profile shorthand for revoke-cert."
|
why: "Phase 4 default-profile shorthand for revoke-cert."
|
||||||
|
category: wire-protocol
|
||||||
- route: "GET /acme/renewal-info/{cert_id}"
|
- route: "GET /acme/renewal-info/{cert_id}"
|
||||||
why: "Phase 4 default-profile shorthand for ARI."
|
why: "Phase 4 default-profile shorthand for ARI."
|
||||||
|
category: wire-protocol
|
||||||
|
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
# Auth Bundle 2 + audit-2026-05-10/11 fix bundle — REST endpoints not yet
|
# Auth Bundle 2 + audit-2026-05-10/11 fix bundle — REST endpoints not yet
|
||||||
@@ -119,59 +199,3 @@ documented_exceptions:
|
|||||||
# stays green for the v2.1.0 release tag. Threat model + handler contracts
|
# stays green for the v2.1.0 release tag. Threat model + handler contracts
|
||||||
# live in docs/operator/{rbac.md,auth-threat-model.md,oidc-runbooks/*}.
|
# live in docs/operator/{rbac.md,auth-threat-model.md,oidc-runbooks/*}.
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
- route: "GET /auth/oidc/login"
|
|
||||||
why: "Bundle 2 Phase 5 OIDC login redirect; user-facing 302 with state cookie. OpenAPI rep deferred to pre-2.2.0."
|
|
||||||
- route: "GET /auth/oidc/callback"
|
|
||||||
why: "Bundle 2 Phase 5 OIDC callback handler; RFC 9700 §4.7.1 + RFC 9207. OpenAPI rep deferred to pre-2.2.0."
|
|
||||||
- route: "POST /auth/logout"
|
|
||||||
why: "Bundle 2 Phase 5 cookie + CSRF revoker. OpenAPI rep deferred to pre-2.2.0."
|
|
||||||
- route: "POST /auth/breakglass/login"
|
|
||||||
why: "Bundle 2 Phase 7.5 public break-glass login (auth-bypass, 404 when disabled). OpenAPI rep deferred to pre-2.2.0."
|
|
||||||
- route: "POST /auth/oidc/back-channel-logout"
|
|
||||||
why: "Bundle 2 Phase 5 RFC OIDC Back-Channel Logout 1.0 endpoint. OpenAPI rep deferred to pre-2.2.0."
|
|
||||||
- route: "GET /api/v1/auth/sessions"
|
|
||||||
why: "Bundle 2 Phase 5 self/admin session list. OpenAPI rep deferred to pre-2.2.0."
|
|
||||||
- route: "DELETE /api/v1/auth/sessions/{id}"
|
|
||||||
why: "Bundle 2 Phase 5 session revoke. OpenAPI rep deferred to pre-2.2.0."
|
|
||||||
- route: "DELETE /api/v1/auth/sessions"
|
|
||||||
why: "Bundle 2 audit-2026-05-10 MED-2/3 revoke-all-except-current."
|
|
||||||
- route: "GET /api/v1/auth/oidc/providers"
|
|
||||||
why: "Bundle 2 Phase 5 OIDC provider CRUD (list)."
|
|
||||||
- route: "POST /api/v1/auth/oidc/providers"
|
|
||||||
why: "Bundle 2 Phase 5 OIDC provider CRUD (create)."
|
|
||||||
- route: "PUT /api/v1/auth/oidc/providers/{id}"
|
|
||||||
why: "Bundle 2 Phase 5 OIDC provider CRUD (update)."
|
|
||||||
- route: "DELETE /api/v1/auth/oidc/providers/{id}"
|
|
||||||
why: "Bundle 2 Phase 5 OIDC provider CRUD (delete)."
|
|
||||||
- route: "POST /api/v1/auth/oidc/providers/{id}/refresh"
|
|
||||||
why: "Bundle 2 audit-2026-05-10 MED-7 JWKS hot-refresh."
|
|
||||||
- route: "GET /api/v1/auth/oidc/providers/{id}/jwks-status"
|
|
||||||
why: "Bundle 2 audit-2026-05-10 MED-7 JWKS health snapshot."
|
|
||||||
- route: "POST /api/v1/auth/oidc/test"
|
|
||||||
why: "Bundle 2 audit-2026-05-10 MED-5 dry-run discovery + JWKS + alg-downgrade check."
|
|
||||||
- route: "GET /api/v1/auth/oidc/group-mappings"
|
|
||||||
why: "Bundle 2 Phase 5 group-mapping CRUD (list)."
|
|
||||||
- route: "POST /api/v1/auth/oidc/group-mappings"
|
|
||||||
why: "Bundle 2 Phase 5 group-mapping CRUD (create)."
|
|
||||||
- route: "DELETE /api/v1/auth/oidc/group-mappings/{id}"
|
|
||||||
why: "Bundle 2 Phase 5 group-mapping CRUD (delete)."
|
|
||||||
- route: "GET /api/v1/auth/breakglass/credentials"
|
|
||||||
why: "Bundle 2 Phase 7.5 admin break-glass list (404 when disabled; password hash never on wire)."
|
|
||||||
- route: "POST /api/v1/auth/breakglass/credentials"
|
|
||||||
why: "Bundle 2 Phase 7.5 admin break-glass set/rotate password."
|
|
||||||
- route: "POST /api/v1/auth/breakglass/credentials/{actor_id}/unlock"
|
|
||||||
why: "Bundle 2 Phase 7.5 admin break-glass unlock after lockout."
|
|
||||||
- route: "DELETE /api/v1/auth/breakglass/credentials/{actor_id}"
|
|
||||||
why: "Bundle 2 Phase 7.5 admin break-glass credential delete."
|
|
||||||
- route: "GET /api/v1/auth/users"
|
|
||||||
why: "Bundle 2 audit-2026-05-10 MED-11 users page."
|
|
||||||
- route: "DELETE /api/v1/auth/users/{id}"
|
|
||||||
why: "Bundle 2 audit-2026-05-10 MED-11 user deactivate."
|
|
||||||
- route: "POST /api/v1/auth/users/{id}/reactivate"
|
|
||||||
why: "Bundle 2 audit-2026-05-10 MED-11 user reactivate."
|
|
||||||
- route: "GET /api/v1/auth/runtime-config"
|
|
||||||
why: "Bundle 2 audit-2026-05-10 MED-12 effective auth-runtime-config (read-only)."
|
|
||||||
- route: "POST /api/v1/auth/demo-residual/cleanup"
|
|
||||||
why: "Audit 2026-05-11 A-8 demo-mode residual-grants cleanup endpoint."
|
|
||||||
- route: "GET /api/v1/audit/export"
|
|
||||||
why: "Bundle 1 Phase 8 streaming NDJSON audit export."
|
|
||||||
|
|||||||
+1391
-2
File diff suppressed because it is too large
Load Diff
+18
-3
@@ -11,7 +11,6 @@ import (
|
|||||||
"io"
|
"io"
|
||||||
"net/http"
|
"net/http"
|
||||||
"os"
|
"os"
|
||||||
"path/filepath"
|
|
||||||
"strings"
|
"strings"
|
||||||
|
|
||||||
"github.com/certctl-io/certctl/internal/connector/target"
|
"github.com/certctl-io/certctl/internal/connector/target"
|
||||||
@@ -105,8 +104,24 @@ func (a *Agent) executeDeploymentJob(ctx context.Context, job JobItem) {
|
|||||||
// Split PEM into cert and chain (separated by double newline between PEM blocks)
|
// Split PEM into cert and chain (separated by double newline between PEM blocks)
|
||||||
certOnly, chainPEM := splitPEMChain(certPEM)
|
certOnly, chainPEM := splitPEMChain(certPEM)
|
||||||
|
|
||||||
// Check for locally-stored private key (agent keygen mode)
|
// Check for locally-stored private key (agent keygen mode).
|
||||||
keyPath := filepath.Join(a.config.KeyDir, job.CertificateID+".key")
|
//
|
||||||
|
// SEC-002 closure (Sprint 1, 2026-05-16): safeAgentKeyPath validates
|
||||||
|
// the certificate_id shape AND asserts the joined path is contained
|
||||||
|
// within a.config.KeyDir. A crafted certificate_id (path traversal,
|
||||||
|
// absolute path, NUL byte, Windows separators) fails closed before
|
||||||
|
// any disk I/O. See cmd/agent/keymem.go for the helper.
|
||||||
|
keyPath, kerr := safeAgentKeyPath(a.config.KeyDir, job.CertificateID)
|
||||||
|
if kerr != nil {
|
||||||
|
a.logger.Error("agent key path validation failed for deployment",
|
||||||
|
"job_id", job.ID,
|
||||||
|
"certificate_id", job.CertificateID,
|
||||||
|
"error", kerr)
|
||||||
|
if reportErr := a.reportJobStatus(ctx, job.ID, "Failed", fmt.Sprintf("key path validation failed: %v", kerr)); reportErr != nil {
|
||||||
|
a.logger.Error("failed to report job status to server", "job_id", job.ID, "error", reportErr)
|
||||||
|
}
|
||||||
|
return
|
||||||
|
}
|
||||||
var keyPEM string
|
var keyPEM string
|
||||||
keyData, err := os.ReadFile(keyPath)
|
keyData, err := os.ReadFile(keyPath)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
|||||||
@@ -9,6 +9,8 @@ import (
|
|||||||
"fmt"
|
"fmt"
|
||||||
"os"
|
"os"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
|
"regexp"
|
||||||
|
"strings"
|
||||||
)
|
)
|
||||||
|
|
||||||
// Bundle-9 / Audit L-002 + L-003 (agent edition).
|
// Bundle-9 / Audit L-002 + L-003 (agent edition).
|
||||||
@@ -41,6 +43,87 @@ func marshalAgentKeyAndZeroize(priv *ecdsa.PrivateKey, onDER func([]byte) error)
|
|||||||
return onDER(der)
|
return onDER(der)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// SEC-002 closure (Sprint 1, 2026-05-16). The agent derives an on-disk
|
||||||
|
// key path from job.CertificateID via filepath.Join. Pre-fix, a
|
||||||
|
// crafted certificate_id ("../../etc/passwd", "/absolute/path",
|
||||||
|
// "abc\x00d", "..\\Windows\\path") would drive arbitrary file
|
||||||
|
// write/read on the agent host. The shape regex below mirrors the
|
||||||
|
// server-side internal/validation.ValidateCertificateID gate — both
|
||||||
|
// ends MUST hold for the load-bearing defense (the server can't be
|
||||||
|
// trusted in isolation; a compromised control plane could deliver a
|
||||||
|
// crafted job).
|
||||||
|
//
|
||||||
|
// agentCertIDPattern accepts ASCII letters, digits, ".", "_", "-",
// bounded to 128 chars. Existing prefixed IDs (mc-..., cert-..., etc.)
// satisfy this trivially. It deliberately rejects path separators (POSIX
// and Windows), NUL bytes, whitespace, and control characters. The bare
// relative-path tokens "." and ".." DO match this pattern, so
// validateAgentCertID rejects them with a separate explicit check.
|
||||||
|
var agentCertIDPattern = regexp.MustCompile(`^[A-Za-z0-9._-]{1,128}$`)

// validateAgentCertID reports whether id is an acceptable certificate
// identifier for building an on-disk key file name. It returns nil for a
// well-formed id and a descriptive error otherwise.
//
// Checks are evaluated in order and the first failure wins:
//
//   - empty id
//   - id longer than 128 bytes (reported with the actual length)
//   - id not matching agentCertIDPattern (letters, digits, ".", "_", "-")
//   - id equal to "." or "..", which the pattern alone would let through
//
// Per the original note this is meant to mirror the server-side
// internal/validation.ValidateCertificateID gate; that function is not
// visible from here, so keep the two in sync by hand.
func validateAgentCertID(id string) error {
	switch n := len(id); {
	case n == 0:
		return fmt.Errorf("certificate_id is required")
	case n > 128:
		return fmt.Errorf("certificate_id length %d exceeds 128", n)
	case !agentCertIDPattern.MatchString(id):
		return fmt.Errorf("certificate_id %q contains disallowed characters", id)
	case id == ".", id == "..":
		// Both tokens consist solely of allowed characters, so they must
		// be singled out after the pattern check.
		return fmt.Errorf("certificate_id %q is a relative-path token", id)
	}
	return nil
}
|
||||||
|
|
||||||
|
// safeAgentKeyPath returns the on-disk key path for the given
|
||||||
|
// certificateID, after validating the ID shape AND asserting the
|
||||||
|
// joined path is contained within keyDir. Containment is the
|
||||||
|
// authoritative guard — even if validateAgentCertID is bypassed (e.g.
|
||||||
|
// a future refactor removes it), the post-Clean rel-path check below
|
||||||
|
// rejects any path that escapes keyDir.
|
||||||
|
//
|
||||||
|
// The two-leg defense:
|
||||||
|
//
|
||||||
|
// leg 1: shape check (validateAgentCertID) → cheap up-front fail
|
||||||
|
// leg 2: containment check (filepath.Rel) → load-bearing guard
|
||||||
|
//
|
||||||
|
// Returns the joined path on success, or a non-nil error describing
|
||||||
|
// the rejected vector.
|
||||||
|
func safeAgentKeyPath(keyDir, certificateID string) (string, error) {
|
||||||
|
if err := validateAgentCertID(certificateID); err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
if keyDir == "" {
|
||||||
|
return "", fmt.Errorf("safeAgentKeyPath: empty keyDir")
|
||||||
|
}
|
||||||
|
cleanDir, err := filepath.Abs(filepath.Clean(keyDir))
|
||||||
|
if err != nil {
|
||||||
|
return "", fmt.Errorf("safeAgentKeyPath: resolve keyDir: %w", err)
|
||||||
|
}
|
||||||
|
joined := filepath.Join(cleanDir, certificateID+".key")
|
||||||
|
cleanJoined := filepath.Clean(joined)
|
||||||
|
rel, err := filepath.Rel(cleanDir, cleanJoined)
|
||||||
|
if err != nil {
|
||||||
|
return "", fmt.Errorf("safeAgentKeyPath: rel(%q,%q): %w", cleanDir, cleanJoined, err)
|
||||||
|
}
|
||||||
|
// Reject any path that escapes the directory: a leading ".." in the
|
||||||
|
// relative form means the joined path resolved outside keyDir.
|
||||||
|
if rel == ".." || strings.HasPrefix(rel, ".."+string(filepath.Separator)) {
|
||||||
|
return "", fmt.Errorf("safeAgentKeyPath: %q escapes keyDir %q (rel=%q)", certificateID, cleanDir, rel)
|
||||||
|
}
|
||||||
|
// Belt-and-suspenders: the rel form must also not contain a NUL.
|
||||||
|
if strings.ContainsRune(rel, 0) {
|
||||||
|
return "", fmt.Errorf("safeAgentKeyPath: NUL byte in computed path")
|
||||||
|
}
|
||||||
|
return cleanJoined, nil
|
||||||
|
}
|
||||||
|
|
||||||
// ensureAgentKeyDirSecure creates dir (and ancestors) with mode 0700 or
|
// ensureAgentKeyDirSecure creates dir (and ancestors) with mode 0700 or
|
||||||
// asserts an existing dir is owner-only. If a pre-existing dir is more
|
// asserts an existing dir is owner-only. If a pre-existing dir is more
|
||||||
// permissive than 0700 we tighten it to 0700 (logging-free; this is a
|
// permissive than 0700 we tighten it to 0700 (logging-free; this is a
|
||||||
|
|||||||
@@ -716,3 +716,113 @@ func TestKeymem_AgentMainFlowSmoke(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// =============================================================================
|
||||||
|
// SEC-002 closure (Sprint 1, 2026-05-16) — safeAgentKeyPath path-traversal
|
||||||
|
// regression coverage.
|
||||||
|
//
|
||||||
|
// Pre-fix the agent built the on-disk key path via:
|
||||||
|
//
|
||||||
|
// keyPath := filepath.Join(a.config.KeyDir, job.CertificateID+".key")
|
||||||
|
//
|
||||||
|
// migrations/000001_initial_schema.up.sql declares
|
||||||
|
// managed_certificates.id as TEXT PRIMARY KEY with no shape constraint, so
|
||||||
|
// a crafted certificate_id from a compromised control plane (or a poisoned
|
||||||
|
// DB row) could land outside KeyDir. The fix:
|
||||||
|
//
|
||||||
|
// - validateAgentCertID rejects shape violations up-front
|
||||||
|
// - safeAgentKeyPath additionally asserts the joined path is contained
|
||||||
|
// within KeyDir via filepath.Rel; even a future refactor that drops
|
||||||
|
// the shape regex would still fail closed on escape.
|
||||||
|
//
|
||||||
|
// These tests pin both legs against the four vectors called out in the
|
||||||
|
// audit (../../etc/passwd, /absolute/path, NUL byte, Windows separators).
|
||||||
|
// =============================================================================
|
||||||
|
|
||||||
|
func TestValidateAgentCertID_AcceptsCanonicalShapes(t *testing.T) {
|
||||||
|
for _, id := range []string{
|
||||||
|
"mc-cdn-edge",
|
||||||
|
"mc-cdn-edge-2026.q1",
|
||||||
|
"cert-1",
|
||||||
|
"abc123",
|
||||||
|
"MC-UPPER",
|
||||||
|
} {
|
||||||
|
t.Run(id, func(t *testing.T) {
|
||||||
|
if err := validateAgentCertID(id); err != nil {
|
||||||
|
t.Errorf("validateAgentCertID(%q): unexpected error %v", id, err)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestValidateAgentCertID_RejectsTraversalVectors(t *testing.T) {
|
||||||
|
cases := []struct {
|
||||||
|
name string
|
||||||
|
id string
|
||||||
|
}{
|
||||||
|
{"empty", ""},
|
||||||
|
{"parent_token", ".."},
|
||||||
|
{"current_token", "."},
|
||||||
|
{"posix_traversal", "../../etc/passwd"},
|
||||||
|
{"absolute_posix", "/absolute/path"},
|
||||||
|
{"windows_traversal", `..\..\evil`},
|
||||||
|
{"windows_separator", `bad\path`},
|
||||||
|
{"nul_byte", "abc\x00def"},
|
||||||
|
{"newline", "abc\ndef"},
|
||||||
|
{"space", "id with spaces"},
|
||||||
|
{"overlong", strings.Repeat("a", 129)},
|
||||||
|
}
|
||||||
|
for _, tc := range cases {
|
||||||
|
t.Run(tc.name, func(t *testing.T) {
|
||||||
|
if err := validateAgentCertID(tc.id); err == nil {
|
||||||
|
t.Errorf("id=%q: expected rejection, got nil", tc.id)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestSafeAgentKeyPath_HappyPath_ProducesContainedPath(t *testing.T) {
|
||||||
|
keyDir := t.TempDir()
|
||||||
|
got, err := safeAgentKeyPath(keyDir, "mc-good")
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("safeAgentKeyPath: %v", err)
|
||||||
|
}
|
||||||
|
want := filepath.Join(keyDir, "mc-good.key")
|
||||||
|
// filepath.Clean normalisation may strip a trailing separator, etc.;
|
||||||
|
// compare canonical forms.
|
||||||
|
if filepath.Clean(got) != filepath.Clean(want) {
|
||||||
|
t.Errorf("safeAgentKeyPath = %q; want %q", got, want)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestSafeAgentKeyPath_RejectsTraversalVectors(t *testing.T) {
|
||||||
|
keyDir := t.TempDir()
|
||||||
|
cases := []struct {
|
||||||
|
name string
|
||||||
|
id string
|
||||||
|
}{
|
||||||
|
{"posix_traversal", "../../etc/passwd"},
|
||||||
|
{"absolute_posix", "/etc/passwd"},
|
||||||
|
{"parent_token", ".."},
|
||||||
|
{"current_token", "."},
|
||||||
|
{"windows_traversal", `..\..\evil`},
|
||||||
|
{"windows_separator", `bad\path`},
|
||||||
|
{"nul_byte", "abc\x00def"},
|
||||||
|
{"empty", ""},
|
||||||
|
}
|
||||||
|
for _, tc := range cases {
|
||||||
|
t.Run(tc.name, func(t *testing.T) {
|
||||||
|
_, err := safeAgentKeyPath(keyDir, tc.id)
|
||||||
|
if err == nil {
|
||||||
|
t.Errorf("id=%q: expected rejection, got nil", tc.id)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestSafeAgentKeyPath_RejectsEmptyKeyDir(t *testing.T) {
|
||||||
|
_, err := safeAgentKeyPath("", "mc-good")
|
||||||
|
if err == nil {
|
||||||
|
t.Errorf("empty keyDir: expected rejection, got nil")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
+42
-5
@@ -14,6 +14,7 @@ import (
|
|||||||
"fmt"
|
"fmt"
|
||||||
"io"
|
"io"
|
||||||
"log/slog"
|
"log/slog"
|
||||||
|
"math/rand/v2"
|
||||||
"net"
|
"net"
|
||||||
"net/http"
|
"net/http"
|
||||||
"net/url"
|
"net/url"
|
||||||
@@ -24,6 +25,8 @@ import (
|
|||||||
"sync"
|
"sync"
|
||||||
"syscall"
|
"syscall"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
|
"github.com/certctl-io/certctl/internal/scheduler"
|
||||||
)
|
)
|
||||||
|
|
||||||
// AgentConfig represents the agent-side configuration.
|
// AgentConfig represents the agent-side configuration.
|
||||||
@@ -231,15 +234,49 @@ func (a *Agent) Run(ctx context.Context) error {
|
|||||||
a.logger.Warn("failed to enforce key directory permissions", "path", a.config.KeyDir, "error", err)
|
a.logger.Warn("failed to enforce key directory permissions", "path", a.config.KeyDir, "error", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Create ticker channels for heartbeat, polling, and discovery
|
// SCALE-006 closure (Sprint 2, 2026-05-16). Pre-fix the agent
|
||||||
heartbeatTicker := time.NewTicker(a.heartbeatInterval)
|
// started its heartbeat + poll loops on fixed time.NewTicker
|
||||||
|
// cadence with an unjittered immediate first invocation. Mass
|
||||||
|
// restarts (rolling K8s deploy, control-plane reboot, scheduled
|
||||||
|
// fleet bounce) produced a thundering herd — 5K agents booting
|
||||||
|
// in a 10-second window all hit /heartbeat in lockstep, then
|
||||||
|
// /poll, every interval forever afterward.
|
||||||
|
//
|
||||||
|
// Fix: (1) sleep a random startup-jitter ∈ [0, interval) before
|
||||||
|
// the first heartbeat + first poll to spread the initial cohort,
|
||||||
|
// and (2) use scheduler.JitteredTicker (±10% per-tick envelope)
|
||||||
|
// for the recurring ticks so the cohort stays spread across
|
||||||
|
// every tick boundary. Both legs use the existing in-tree
|
||||||
|
// JitteredTicker primitive (internal/scheduler/jitter.go) —
|
||||||
|
// pattern already exercised by every scheduler.go loop on the
|
||||||
|
// server side.
|
||||||
|
heartbeatTicker := scheduler.NewJitteredTicker(a.heartbeatInterval, scheduler.DefaultSchedulerJitter)
|
||||||
defer heartbeatTicker.Stop()
|
defer heartbeatTicker.Stop()
|
||||||
|
pollTicker := scheduler.NewJitteredTicker(a.pollInterval, scheduler.DefaultSchedulerJitter)
|
||||||
pollTicker := time.NewTicker(a.pollInterval)
|
|
||||||
defer pollTicker.Stop()
|
defer pollTicker.Stop()
|
||||||
|
|
||||||
// Run initial heartbeat and poll
|
// Startup jitter — run-first delay drawn fresh per-agent so a
|
||||||
|
// 5K-agent rolling-restart spreads out across (max interval).
|
||||||
|
// Bounded by ctx so a sigint-during-startup exits cleanly rather
|
||||||
|
// than hanging on the Sleep. Heartbeat and poll are drawn
|
||||||
|
// independently so a single random seed doesn't create a
|
||||||
|
// secondary correlation pattern.
|
||||||
|
hbJitter := time.Duration(rand.Int64N(int64(a.heartbeatInterval)))
|
||||||
|
pollJitter := time.Duration(rand.Int64N(int64(a.pollInterval)))
|
||||||
|
a.logger.Info("startup jitter applied",
|
||||||
|
"heartbeat_jitter", hbJitter.String(),
|
||||||
|
"poll_jitter", pollJitter.String())
|
||||||
|
select {
|
||||||
|
case <-ctx.Done():
|
||||||
|
return ctx.Err()
|
||||||
|
case <-time.After(hbJitter):
|
||||||
|
}
|
||||||
a.sendHeartbeat(ctx)
|
a.sendHeartbeat(ctx)
|
||||||
|
select {
|
||||||
|
case <-ctx.Done():
|
||||||
|
return ctx.Err()
|
||||||
|
case <-time.After(pollJitter):
|
||||||
|
}
|
||||||
a.pollForWork(ctx)
|
a.pollForWork(ctx)
|
||||||
|
|
||||||
// Discovery: run initial scan if directories configured, then on interval
|
// Discovery: run initial scan if directories configured, then on interval
|
||||||
|
|||||||
+14
-1
@@ -151,7 +151,20 @@ func (a *Agent) executeCSRJob(ctx context.Context, job JobItem) {
|
|||||||
// before any write touches disk. Also defer-clear the PEM buffer for
|
// before any write touches disk. Also defer-clear the PEM buffer for
|
||||||
// the same reason — the encoded key isn't sensitive in transit (it's
|
// the same reason — the encoded key isn't sensitive in transit (it's
|
||||||
// going to disk) but lingers on the heap if we don't.
|
// going to disk) but lingers on the heap if we don't.
|
||||||
keyPath := filepath.Join(a.config.KeyDir, job.CertificateID+".key")
|
//
|
||||||
|
// SEC-002 closure (Sprint 1, 2026-05-16): safeAgentKeyPath validates
|
||||||
|
// the certificate_id shape AND asserts the joined path is contained
|
||||||
|
// within a.config.KeyDir. A crafted certificate_id like
|
||||||
|
// "../../etc/passwd" or "/abs/path" now fails closed before any
|
||||||
|
// disk I/O. See cmd/agent/keymem.go for the helper.
|
||||||
|
keyPath, kerr := safeAgentKeyPath(a.config.KeyDir, job.CertificateID)
|
||||||
|
if kerr != nil {
|
||||||
|
a.logger.Error("agent key path validation failed", "job_id", job.ID, "certificate_id", job.CertificateID, "error", kerr)
|
||||||
|
if reportErr := a.reportJobStatus(ctx, job.ID, "Failed", fmt.Sprintf("key path validation failed: %v", kerr)); reportErr != nil {
|
||||||
|
a.logger.Error("failed to report job status to server", "job_id", job.ID, "status", "Failed", "error", reportErr)
|
||||||
|
}
|
||||||
|
return
|
||||||
|
}
|
||||||
if err := ensureAgentKeyDirSecure(filepath.Dir(keyPath)); err != nil {
|
if err := ensureAgentKeyDirSecure(filepath.Dir(keyPath)); err != nil {
|
||||||
a.logger.Error("agent key dir hardening failed", "job_id", job.ID, "error", err)
|
a.logger.Error("agent key dir hardening failed", "job_id", job.ID, "error", err)
|
||||||
if reportErr := a.reportJobStatus(ctx, job.ID, "Failed", fmt.Sprintf("key dir hardening failed: %v", err)); reportErr != nil {
|
if reportErr := a.reportJobStatus(ctx, job.ID, "Failed", fmt.Sprintf("key dir hardening failed: %v", err)); reportErr != nil {
|
||||||
|
|||||||
+197
-30
@@ -38,9 +38,11 @@ import (
|
|||||||
notifypagerduty "github.com/certctl-io/certctl/internal/connector/notifier/pagerduty"
|
notifypagerduty "github.com/certctl-io/certctl/internal/connector/notifier/pagerduty"
|
||||||
notifyslack "github.com/certctl-io/certctl/internal/connector/notifier/slack"
|
notifyslack "github.com/certctl-io/certctl/internal/connector/notifier/slack"
|
||||||
notifyteams "github.com/certctl-io/certctl/internal/connector/notifier/teams"
|
notifyteams "github.com/certctl-io/certctl/internal/connector/notifier/teams"
|
||||||
|
notifywebhook "github.com/certctl-io/certctl/internal/connector/notifier/webhook"
|
||||||
"github.com/certctl-io/certctl/internal/crypto/signer"
|
"github.com/certctl-io/certctl/internal/crypto/signer"
|
||||||
"github.com/certctl-io/certctl/internal/domain"
|
"github.com/certctl-io/certctl/internal/domain"
|
||||||
authdomainAlias "github.com/certctl-io/certctl/internal/domain/auth"
|
authdomainAlias "github.com/certctl-io/certctl/internal/domain/auth"
|
||||||
|
"github.com/certctl-io/certctl/internal/observability"
|
||||||
"github.com/certctl-io/certctl/internal/ratelimit"
|
"github.com/certctl-io/certctl/internal/ratelimit"
|
||||||
"github.com/certctl-io/certctl/internal/repository/postgres"
|
"github.com/certctl-io/certctl/internal/repository/postgres"
|
||||||
"github.com/certctl-io/certctl/internal/scep/intune"
|
"github.com/certctl-io/certctl/internal/scep/intune"
|
||||||
@@ -48,6 +50,7 @@ import (
|
|||||||
"github.com/certctl-io/certctl/internal/service"
|
"github.com/certctl-io/certctl/internal/service"
|
||||||
authsvc "github.com/certctl-io/certctl/internal/service/auth"
|
authsvc "github.com/certctl-io/certctl/internal/service/auth"
|
||||||
"github.com/certctl-io/certctl/internal/trustanchor"
|
"github.com/certctl-io/certctl/internal/trustanchor"
|
||||||
|
"github.com/certctl-io/certctl/internal/validation"
|
||||||
)
|
)
|
||||||
|
|
||||||
func main() {
|
func main() {
|
||||||
@@ -76,27 +79,30 @@ func main() {
|
|||||||
// the slog logger is constructed from cfg below this point; we want
|
// the slog logger is constructed from cfg below this point; we want
|
||||||
// the failure to be visible regardless of log-level configuration.
|
// the failure to be visible regardless of log-level configuration.
|
||||||
//
|
//
|
||||||
// Auth Bundle 2 Phase 0: AuthTypeOIDC is in ValidAuthTypes() but the
|
// ARCH-002 closure (Sprint 4, 2026-05-16). Auth Bundle 2 is now
|
||||||
// session middleware + OIDC handler chain ship in later phases. An
|
// fully wired: session.NewService at L394 + oidcsvc.NewService at
|
||||||
// operator who sets CERTCTL_AUTH_TYPE=oidc on a Bundle-2-incomplete
|
// L436 + ChainAuthSessionThenBearer at L2012 + the OIDC handler
|
||||||
// deployment must NOT silently fall back to api-key (the silent
|
// routes (`/auth/oidc/login`, `/auth/oidc/callback`,
|
||||||
// auth-downgrade failure mode that drove G-1 in the first place).
|
// `/auth/oidc/back-channel-logout`) registered in
|
||||||
// The OIDC case below refuses-to-start with an actionable message.
|
// internal/api/router/router.go. The pre-ARCH-002 Phase-0 guard
|
||||||
// Phase 6 of Bundle 2 (session middleware wiring) relaxes this case
|
// that exited on AuthTypeOIDC made sense when the handler chain
|
||||||
// to fall through alongside the api-key + none cases.
|
// was a stub; it became a stale fail-loud after Phase 6 shipped
|
||||||
switch config.AuthType(cfg.Auth.Type) {
|
// and is the only thing that stopped CERTCTL_AUTH_TYPE=oidc from
|
||||||
case config.AuthTypeAPIKey, config.AuthTypeNone:
|
// being a viable production auth mode.
|
||||||
// ok — fall through
|
//
|
||||||
case config.AuthTypeOIDC:
|
// Post-fix: oidc falls through alongside api-key + none. The
|
||||||
fmt.Fprintf(os.Stderr,
|
// G-1 silent-auth-downgrade invariant stays intact — "jwt" is
|
||||||
"CERTCTL_AUTH_TYPE=oidc: the OIDC auth chain is not yet wired in this build (Auth Bundle 2 Phase 6 ships the session middleware that consumes this auth-type literal). Set CERTCTL_AUTH_TYPE=api-key or run an authenticating gateway with CERTCTL_AUTH_TYPE=none until Bundle 2 lands. See cowork/auth-bundle-2-prompt.md.\n")
|
// still rejected at config.Validate() time (it never made it
|
||||||
os.Exit(1)
|
// into ValidAuthTypes()) and the default branch below still
|
||||||
default:
|
// refuses any other unrecognised value at runtime.
|
||||||
|
if !config.IsRuntimeSupportedAuthType(config.AuthType(cfg.Auth.Type)) {
|
||||||
fmt.Fprintf(os.Stderr,
|
fmt.Fprintf(os.Stderr,
|
||||||
"unsupported auth type at runtime: %q (valid: %v) — config validation should have caught this; refusing to start\n",
|
"unsupported auth type at runtime: %q (valid: %v) — config validation should have caught this; refusing to start\n",
|
||||||
cfg.Auth.Type, config.ValidAuthTypes())
|
cfg.Auth.Type, config.ValidAuthTypes())
|
||||||
os.Exit(1)
|
os.Exit(1)
|
||||||
}
|
}
|
||||||
|
// ok — all three modes (api-key / none / oidc) route through the
|
||||||
|
// chained session-then-Bearer auth middleware constructed at L2011.
|
||||||
|
|
||||||
// Set up structured logging
|
// Set up structured logging
|
||||||
logger := slog.New(slog.NewJSONHandler(os.Stdout, &slog.HandlerOptions{
|
logger := slog.New(slog.NewJSONHandler(os.Stdout, &slog.HandlerOptions{
|
||||||
@@ -121,19 +127,69 @@ func main() {
|
|||||||
logger.Warn("⚠ DEMO MODE ACTIVE — CERTCTL_DEMO_MODE_ACK=true is set; every request is served as the synthetic admin actor `actor-demo-anon` (no authentication enforced). This deployment MUST NOT hold production keys, certificates, or audit history. To promote to production: (1) unset CERTCTL_DEMO_MODE_ACK; (2) set CERTCTL_AUTH_TYPE=api-key or oidc; (3) set CERTCTL_AUTH_SECRET to a fresh `openssl rand -base64 32`; (4) set CERTCTL_KEYGEN_MODE=agent; (5) rotate CERTCTL_CONFIG_ENCRYPTION_KEY to a fresh `openssl rand -base64 32` (≥ 32 bytes, not the change-me placeholder); (6) restart the server. See docs/operator/security.md for the full posture.")
|
logger.Warn("⚠ DEMO MODE ACTIVE — CERTCTL_DEMO_MODE_ACK=true is set; every request is served as the synthetic admin actor `actor-demo-anon` (no authentication enforced). This deployment MUST NOT hold production keys, certificates, or audit history. To promote to production: (1) unset CERTCTL_DEMO_MODE_ACK; (2) set CERTCTL_AUTH_TYPE=api-key or oidc; (3) set CERTCTL_AUTH_SECRET to a fresh `openssl rand -base64 32`; (4) set CERTCTL_KEYGEN_MODE=agent; (5) rotate CERTCTL_CONFIG_ENCRYPTION_KEY to a fresh `openssl rand -base64 32` (≥ 32 bytes, not the change-me placeholder); (6) restart the server. See docs/operator/security.md for the full posture.")
|
||||||
}
|
}
|
||||||
|
|
||||||
// Bundle-5 / Audit H-007: deprecation WARN when the agent bootstrap
|
// Bundle-5 / Audit H-007 + acquisition-audit RED-003 closure
|
||||||
// token is unset. Pre-Bundle-5 there was no token at all; the v2.0.x
|
// (Sprint 5 ACQ, 2026-05-16): deny-empty default for the agent
|
||||||
// default keeps the warn-mode pass-through so existing demo deploys
|
// bootstrap token. v2.2.0 flipped CERTCTL_AGENT_BOOTSTRAP_TOKEN_DENY_EMPTY
|
||||||
// keep working, but operators must set CERTCTL_AGENT_BOOTSTRAP_TOKEN
|
// from false → true; Validate() now refuses to start with an
|
||||||
// before v2.2.0 lands. This is a one-shot startup line — the
|
// empty token UNLESS the operator either (a) explicitly opts back
|
||||||
// per-request path stays silent so a busy registration endpoint
|
// into v2.1.x warn-mode with CERTCTL_AGENT_BOOTSTRAP_TOKEN_DENY_EMPTY=false
|
||||||
// doesn't flood the log.
|
// or (b) is running a demo deploy (CERTCTL_DEMO_MODE_ACK=true).
|
||||||
|
//
|
||||||
|
// The remaining code path here only fires in those two override
|
||||||
|
// scenarios — in both cases the operator has accepted the
|
||||||
|
// posture, but a one-shot startup line keeps the warn-mode case
|
||||||
|
// visible in journals.
|
||||||
if cfg.Auth.AgentBootstrapToken == "" {
|
if cfg.Auth.AgentBootstrapToken == "" {
|
||||||
logger.Warn("agent bootstrap token unset (CERTCTL_AGENT_BOOTSTRAP_TOKEN) — agents may self-register without authentication; this default will become deny-by-default in v2.2.0; generate one with: openssl rand -hex 32")
|
logger.Warn("agent bootstrap token unset (CERTCTL_AGENT_BOOTSTRAP_TOKEN) — agents may self-register without authentication; running in v2.1.x-compat warn-mode (DENY_EMPTY=false) or demo mode (DEMO_MODE_ACK=true). Production deploys MUST set the token; generate with: openssl rand -base64 32")
|
||||||
} else {
|
} else {
|
||||||
logger.Info("agent bootstrap token configured (length redacted; constant-time compare on POST /api/v1/agents)")
|
logger.Info("agent bootstrap token configured (length redacted; constant-time compare on POST /api/v1/agents)")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Acquisition-audit SEC-009 + RED-005 closure (Sprint 5 ACQ,
|
||||||
|
// 2026-05-16). Opt-in RFC1918 outbound block for hosted-IaaS
|
||||||
|
// operators where private-IP space carries internal trust
|
||||||
|
// (Kubernetes API on 10.96.0.1 in default kubeadm clusters,
|
||||||
|
// cloud-provider monitoring endpoints, etc.). The toggle wires
|
||||||
|
// into the package-level state in internal/validation/ssrf.go;
|
||||||
|
// from there every IsReservedIP-derived path (SafeHTTPDialContext,
|
||||||
|
// ValidateSafeURL, the network scanner, the webhook + OIDC + ACME
|
||||||
|
// callers) picks up the policy transitively. Default false
|
||||||
|
// preserves the existing self-hosted threat model.
|
||||||
|
validation.SetBlockRFC1918Outbound(cfg.Network.BlockRFC1918Outbound)
|
||||||
|
if cfg.Network.BlockRFC1918Outbound {
|
||||||
|
logger.Info("RFC1918 outbound block ENABLED (CERTCTL_BLOCK_RFC1918_OUTBOUND=true) — 10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16 are reserved for outbound HTTP egress AND for the network scanner")
|
||||||
|
}
|
||||||
|
|
||||||
|
// Acquisition-audit DEPL-006 closure (Sprint 6 ACQ, 2026-05-16).
|
||||||
|
// Optional OpenTelemetry seed. Init returns a no-op shutdown when
|
||||||
|
// CERTCTL_OTEL_ENABLED is unset/false — defer'ing it
|
||||||
|
// unconditionally is safe. The OTLP gRPC client connects lazily,
|
||||||
|
// so an unreachable collector surfaces as failed export attempts
|
||||||
|
// in the SDK's internal error log, NOT as a boot-time failure.
|
||||||
|
//
|
||||||
|
// Sprint 6 stands up the surface only — no per-handler /
|
||||||
|
// per-query / per-connector spans are emitted yet (v2.3 roadmap
|
||||||
|
// follow-up). Operators enabling the toggle today see process-
|
||||||
|
// level resource attributes and any spans the OTel SDK emits
|
||||||
|
// internally; no certctl-domain spans until v2.3.
|
||||||
|
otelShutdown, err := observability.Init(context.Background(), observability.Config{
|
||||||
|
Enabled: cfg.Observability.OTelEnabled,
|
||||||
|
})
|
||||||
|
if err != nil {
|
||||||
|
logger.Error("failed to initialize OpenTelemetry", "error", err)
|
||||||
|
os.Exit(1)
|
||||||
|
}
|
||||||
|
defer func() {
|
||||||
|
shutdownCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||||
|
defer cancel()
|
||||||
|
if err := otelShutdown(shutdownCtx); err != nil {
|
||||||
|
logger.Warn("OpenTelemetry shutdown returned error", "error", err)
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
if cfg.Observability.OTelEnabled {
|
||||||
|
logger.Info("OpenTelemetry tracing ENABLED (CERTCTL_OTEL_ENABLED=true) — OTLP/gRPC exporter wired; honors OTEL_EXPORTER_OTLP_ENDPOINT + other OTEL_* env vars. Per-handler instrumentation is a v2.3 roadmap follow-up; this release stands up the surface only.")
|
||||||
|
}
|
||||||
|
|
||||||
// Phase 6 SCALE-M3 closure (2026-05-14): operator-overridable
|
// Phase 6 SCALE-M3 closure (2026-05-14): operator-overridable
|
||||||
// package-level default for the asyncpoll MaxWait fallback.
|
// package-level default for the asyncpoll MaxWait fallback.
|
||||||
// Per-connector overrides (CERTCTL_DIGICERT_POLL_MAX_WAIT_SECONDS,
|
// Per-connector overrides (CERTCTL_DIGICERT_POLL_MAX_WAIT_SECONDS,
|
||||||
@@ -577,7 +633,7 @@ func main() {
|
|||||||
// AuthExemptRouterRoutes path. The service-layer Argon2id lockout
|
// AuthExemptRouterRoutes path. The service-layer Argon2id lockout
|
||||||
// state machine remains the second line of defense.
|
// state machine remains the second line of defense.
|
||||||
breakglassHandler.SetLoginRateLimiter(
|
breakglassHandler.SetLoginRateLimiter(
|
||||||
ratelimit.NewSlidingWindowLimiter(5, time.Minute, 50_000),
|
ratelimit.NewLimiter(cfg.RateLimit.SlidingWindowBackend, db, 5, time.Minute, 50_000),
|
||||||
)
|
)
|
||||||
if cfg.Auth.Breakglass.Enabled {
|
if cfg.Auth.Breakglass.Enabled {
|
||||||
logger.Warn("CERTCTL_BREAKGLASS_ENABLED=true — break-glass admin path is ACTIVE; this bypasses SSO. Disable in steady-state.",
|
logger.Warn("CERTCTL_BREAKGLASS_ENABLED=true — break-glass admin path is ACTIVE; this bypasses SSO. Disable in steady-state.",
|
||||||
@@ -687,6 +743,31 @@ func main() {
|
|||||||
logger.Info("OpsGenie notifier enabled")
|
logger.Info("OpsGenie notifier enabled")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Acquisition-audit DOC-001 closure (Sprint 7 ACQ, 2026-05-16).
|
||||||
|
// Generic webhook notifier. The webhook impl shipped to
|
||||||
|
// internal/connector/notifier/webhook/ months ago with full
|
||||||
|
// SafeHTTPDialContext SSRF guard + HMAC-SHA256 signing + tests but
|
||||||
|
// was never wired here — the README's "6 notifiers" claim was off
|
||||||
|
// by one. NotifierAdapter bridges the rich notifier.Connector
|
||||||
|
// interface (SendEvent / SendAlert / ValidateConfig) to the
|
||||||
|
// service.Notifier (Send + Channel) shape used by the notification
|
||||||
|
// service. Empty CERTCTL_WEBHOOK_URL keeps the notifier disabled
|
||||||
|
// (matches the env-var-gated pattern of the other five). The
|
||||||
|
// signing secret is operator-acknowledged optional — see
|
||||||
|
// internal/config/notifiers.go::NotifierConfig.WebhookSecret.
|
||||||
|
if cfg.Notifiers.WebhookURL != "" {
|
||||||
|
webhookConnector := notifywebhook.New(&notifywebhook.Config{
|
||||||
|
URL: cfg.Notifiers.WebhookURL,
|
||||||
|
Secret: cfg.Notifiers.WebhookSecret,
|
||||||
|
}, logger)
|
||||||
|
notifierRegistry["Webhook"] = notifywebhook.NewNotifierAdapter(webhookConnector)
|
||||||
|
signedHint := "unsigned"
|
||||||
|
if cfg.Notifiers.WebhookSecret != "" {
|
||||||
|
signedHint = "HMAC-SHA256 signed"
|
||||||
|
}
|
||||||
|
logger.Info("Webhook notifier enabled", "signing", signedHint)
|
||||||
|
}
|
||||||
|
|
||||||
// Wire email notifier if SMTP is configured
|
// Wire email notifier if SMTP is configured
|
||||||
var emailAdapter *notifyemail.NotifierAdapter
|
var emailAdapter *notifyemail.NotifierAdapter
|
||||||
if cfg.Notifiers.SMTPHost != "" && cfg.Notifiers.SMTPFromAddress != "" {
|
if cfg.Notifiers.SMTPHost != "" && cfg.Notifiers.SMTPFromAddress != "" {
|
||||||
@@ -808,6 +889,11 @@ func main() {
|
|||||||
// CERTCTL_RENEWAL_CONCURRENCY; ≤0 normalised to 1 (sequential)
|
// CERTCTL_RENEWAL_CONCURRENCY; ≤0 normalised to 1 (sequential)
|
||||||
// inside the setter.
|
// inside the setter.
|
||||||
jobService.SetRenewalConcurrency(cfg.Scheduler.RenewalConcurrency)
|
jobService.SetRenewalConcurrency(cfg.Scheduler.RenewalConcurrency)
|
||||||
|
// SCALE-001 closure (Sprint 2, 2026-05-16): per-tick ClaimPendingJobs
|
||||||
|
// cap so 100K-job bursts don't materialise the full queue into
|
||||||
|
// memory before the bounded fan-out engages. Setting normalises ≤0
|
||||||
|
// to 1000 (fail-safe vs. legacy unlimited semantics).
|
||||||
|
jobService.SetClaimLimit(cfg.Scheduler.JobClaimLimit)
|
||||||
agentService := service.NewAgentService(agentRepo, certificateRepo, jobRepo, targetRepo, auditService, issuerRegistry, renewalService)
|
agentService := service.NewAgentService(agentRepo, certificateRepo, jobRepo, targetRepo, auditService, issuerRegistry, renewalService)
|
||||||
agentService.SetProfileRepo(profileRepo)
|
agentService.SetProfileRepo(profileRepo)
|
||||||
issuerService := service.NewIssuerService(issuerRepo, auditService, issuerRegistry, encryptionKey, logger)
|
issuerService := service.NewIssuerService(issuerRepo, auditService, issuerRegistry, encryptionKey, logger)
|
||||||
@@ -1000,7 +1086,7 @@ func main() {
|
|||||||
// Production hardening II Phase 3: per-source-IP OCSP rate limit.
|
// Production hardening II Phase 3: per-source-IP OCSP rate limit.
|
||||||
// Window 1m so the cap counts requests per minute. Map cap 50k
|
// Window 1m so the cap counts requests per minute. Map cap 50k
|
||||||
// matches the SCEP/Intune replay cache cap. Zero disables.
|
// matches the SCEP/Intune replay cache cap. Zero disables.
|
||||||
ocspLimiter := ratelimit.NewSlidingWindowLimiter(cfg.Scheduler.OCSPRateLimitPerIPMin, time.Minute, 50_000)
|
ocspLimiter := ratelimit.NewLimiter(cfg.RateLimit.SlidingWindowBackend, db, cfg.Scheduler.OCSPRateLimitPerIPMin, time.Minute, 50_000)
|
||||||
certificateHandler.SetOCSPRateLimiter(ocspLimiter)
|
certificateHandler.SetOCSPRateLimiter(ocspLimiter)
|
||||||
issuerHandler := handler.NewIssuerHandler(issuerService)
|
issuerHandler := handler.NewIssuerHandler(issuerService)
|
||||||
targetHandler := handler.NewTargetHandler(targetService)
|
targetHandler := handler.NewTargetHandler(targetService)
|
||||||
@@ -1035,6 +1121,12 @@ func main() {
|
|||||||
// notification service uses to record per-(channel, threshold,
|
// notification service uses to record per-(channel, threshold,
|
||||||
// result) outcomes.
|
// result) outcomes.
|
||||||
metricsHandler.SetExpiryAlerts(expiryAlertMetrics)
|
metricsHandler.SetExpiryAlerts(expiryAlertMetrics)
|
||||||
|
// Sprint 6 COMP-001-HASH: audit_events tamper-evidence counters.
|
||||||
|
// Shared instance — the scheduler's auditChainVerifyLoop writes
|
||||||
|
// to it; the metrics handler reads from it. Wired into the
|
||||||
|
// scheduler below at sched.SetAuditChainBreakRecorder.
|
||||||
|
auditChainCounter := service.NewAuditChainCounter()
|
||||||
|
metricsHandler.SetAuditChainCounter(auditChainCounter)
|
||||||
// Bundle-5 / H-006: pass the *sql.DB pool so /ready can probe DB
|
// Bundle-5 / H-006: pass the *sql.DB pool so /ready can probe DB
|
||||||
// connectivity via PingContext. /health stays shallow (liveness signal).
|
// connectivity via PingContext. /health stays shallow (liveness signal).
|
||||||
healthHandler := handler.NewHealthHandler(cfg.Auth.Type, db)
|
healthHandler := handler.NewHealthHandler(cfg.Auth.Type, db)
|
||||||
@@ -1065,7 +1157,7 @@ func main() {
|
|||||||
exportHandler := handler.NewExportHandler(exportService)
|
exportHandler := handler.NewExportHandler(exportService)
|
||||||
// Production hardening II Phase 3: per-actor cert-export rate limit.
|
// Production hardening II Phase 3: per-actor cert-export rate limit.
|
||||||
// Window 1h so the cap counts exports per hour. Zero disables.
|
// Window 1h so the cap counts exports per hour. Zero disables.
|
||||||
exportLimiter := ratelimit.NewSlidingWindowLimiter(cfg.Scheduler.CertExportRateLimitPerActorHr, time.Hour, 50_000)
|
exportLimiter := ratelimit.NewLimiter(cfg.RateLimit.SlidingWindowBackend, db, cfg.Scheduler.CertExportRateLimitPerActorHr, time.Hour, 50_000)
|
||||||
exportHandler.SetExportRateLimiter(exportLimiter)
|
exportHandler.SetExportRateLimiter(exportLimiter)
|
||||||
|
|
||||||
bulkRevocationHandler := handler.NewBulkRevocationHandler(bulkRevocationService)
|
bulkRevocationHandler := handler.NewBulkRevocationHandler(bulkRevocationService)
|
||||||
@@ -1209,6 +1301,61 @@ func main() {
|
|||||||
sched.SetSessionGarbageCollector(sessionService)
|
sched.SetSessionGarbageCollector(sessionService)
|
||||||
sched.SetBCLReplayGarbageCollector(bclReplayRepo) // Audit 2026-05-10 HIGH-3.
|
sched.SetBCLReplayGarbageCollector(bclReplayRepo) // Audit 2026-05-10 HIGH-3.
|
||||||
sched.SetSessionGCInterval(cfg.Auth.Session.GCInterval)
|
sched.SetSessionGCInterval(cfg.Auth.Session.GCInterval)
|
||||||
|
|
||||||
|
// Phase 13 Sprint 13.3 closure (ARCH-M1): when the operator selected
|
||||||
|
// CERTCTL_RATE_LIMIT_BACKEND=postgres, wire the bucket janitor so
|
||||||
|
// stale rows from rate_limit_buckets get swept on the configured
|
||||||
|
// interval. The in-memory backend's prune-on-Allow path keeps
|
||||||
|
// buckets short-lived without a separate sweep, so we skip the
|
||||||
|
// loop entirely for backend=memory.
|
||||||
|
//
|
||||||
|
// maxWindow = 24h: the EST per-principal limiter is the longest
|
||||||
|
// window any current caller configures (the breakglass / OCSP /
|
||||||
|
// export / EST failed-basic limiters use shorter windows). Bump
|
||||||
|
// this if a new caller introduces a longer window — rows pruned
|
||||||
|
// inside their window aren't deletable.
|
||||||
|
if cfg.RateLimit.SlidingWindowBackend == "postgres" {
|
||||||
|
rateLimitGC := ratelimit.NewPostgresGC(db, 24*time.Hour)
|
||||||
|
sched.SetRateLimitGarbageCollector(rateLimitGC)
|
||||||
|
sched.SetRateLimitGCInterval(cfg.RateLimit.SlidingWindowJanitorInterval)
|
||||||
|
logger.Info("rate-limit GC sweep enabled (postgres backend)",
|
||||||
|
"interval", cfg.RateLimit.SlidingWindowJanitorInterval.String(),
|
||||||
|
"max_window", "24h")
|
||||||
|
} else {
|
||||||
|
logger.Info("rate-limit backend = memory; postgres GC sweep not wired (in-memory backend self-prunes)")
|
||||||
|
}
|
||||||
|
// Sprint 6 COMP-001-HASH: wire the audit_events chain-verify loop.
|
||||||
|
// The verifier is *postgres.AuditRepository (delegates to the
|
||||||
|
// migration 000047 audit_events_verify_chain() plpgsql function);
|
||||||
|
// the metric-side recorder is the same auditChainCounter the
|
||||||
|
// metrics handler reads above. Defaults to a 6h tick; operator
|
||||||
|
// overrides via CERTCTL_AUDIT_CHAIN_VERIFY_INTERVAL.
|
||||||
|
sched.SetAuditChainVerifier(auditRepo)
|
||||||
|
sched.SetAuditChainBreakRecorder(auditChainCounter)
|
||||||
|
sched.SetAuditChainVerifyInterval(cfg.AuditChain.VerifyInterval)
|
||||||
|
logger.Info("audit chain verify loop enabled",
|
||||||
|
"interval", cfg.AuditChain.VerifyInterval.String())
|
||||||
|
|
||||||
|
// Sprint 6 COMP-002-RETENTION: wire the user-PII purge loop. The
|
||||||
|
// service nullifies email + display_name on users whose
|
||||||
|
// deactivated_at exceeds the retention window (default 30d) and
|
||||||
|
// hashes oidc_subject to preserve audit attribution. The scheduler
|
||||||
|
// loop ticks on CERTCTL_USER_RETENTION_INTERVAL (default 24h).
|
||||||
|
userRetentionService := service.NewUserRetentionService(
|
||||||
|
oidcUserRepo,
|
||||||
|
sessionRepo,
|
||||||
|
auditService,
|
||||||
|
logger,
|
||||||
|
cfg.UserRetention.RetentionWindow,
|
||||||
|
cfg.UserRetention.BatchCap,
|
||||||
|
)
|
||||||
|
sched.SetUserRetentionPurger(userRetentionService)
|
||||||
|
sched.SetUserRetentionInterval(cfg.UserRetention.Interval)
|
||||||
|
logger.Info("user PII retention purge loop enabled",
|
||||||
|
"interval", cfg.UserRetention.Interval.String(),
|
||||||
|
"retention_window", cfg.UserRetention.RetentionWindow.String(),
|
||||||
|
"batch_cap", cfg.UserRetention.BatchCap)
|
||||||
|
|
||||||
logger.Info("session GC sweep enabled",
|
logger.Info("session GC sweep enabled",
|
||||||
"interval", cfg.Auth.Session.GCInterval.String(),
|
"interval", cfg.Auth.Session.GCInterval.String(),
|
||||||
"absolute_timeout", cfg.Auth.Session.AbsoluteTimeout.String(),
|
"absolute_timeout", cfg.Auth.Session.AbsoluteTimeout.String(),
|
||||||
@@ -1532,7 +1679,7 @@ func main() {
|
|||||||
// release. The shared SlidingWindowLimiter applies the same
|
// release. The shared SlidingWindowLimiter applies the same
|
||||||
// math the SCEP/Intune limiter uses — extracted in Phase 4.1
|
// math the SCEP/Intune limiter uses — extracted in Phase 4.1
|
||||||
// of this bundle so both call sites share the implementation.
|
// of this bundle so both call sites share the implementation.
|
||||||
failed := ratelimit.NewSlidingWindowLimiter(10, time.Hour, 50_000)
|
failed := ratelimit.NewLimiter(cfg.RateLimit.SlidingWindowBackend, db, 10, time.Hour, 50_000)
|
||||||
estHandler.SetSourceIPRateLimiter(failed)
|
estHandler.SetSourceIPRateLimiter(failed)
|
||||||
}
|
}
|
||||||
// Phase 2.1: mTLS sibling route. When MTLSEnabled=true, build a
|
// Phase 2.1: mTLS sibling route. When MTLSEnabled=true, build a
|
||||||
@@ -1588,7 +1735,7 @@ func main() {
|
|||||||
mtlsHandler.SetChannelBindingRequired(profile.ChannelBindingRequired)
|
mtlsHandler.SetChannelBindingRequired(profile.ChannelBindingRequired)
|
||||||
mtlsHandler.SetServerKeygenEnabled(profile.ServerKeygenEnabled)
|
mtlsHandler.SetServerKeygenEnabled(profile.ServerKeygenEnabled)
|
||||||
if profile.RateLimitPerPrincipal24h > 0 {
|
if profile.RateLimitPerPrincipal24h > 0 {
|
||||||
perPrincipal := ratelimit.NewSlidingWindowLimiter(profile.RateLimitPerPrincipal24h, 24*time.Hour, 100_000)
|
perPrincipal := ratelimit.NewLimiter(cfg.RateLimit.SlidingWindowBackend, db, profile.RateLimitPerPrincipal24h, 24*time.Hour, 100_000)
|
||||||
mtlsHandler.SetPerPrincipalRateLimiter(perPrincipal)
|
mtlsHandler.SetPerPrincipalRateLimiter(perPrincipal)
|
||||||
}
|
}
|
||||||
estMTLSHandlers[profile.PathID] = mtlsHandler
|
estMTLSHandlers[profile.PathID] = mtlsHandler
|
||||||
@@ -1610,7 +1757,7 @@ func main() {
|
|||||||
// when configured). The mTLS handler above gets its own
|
// when configured). The mTLS handler above gets its own
|
||||||
// limiter instance so the two routes don't share a bucket.
|
// limiter instance so the two routes don't share a bucket.
|
||||||
if profile.RateLimitPerPrincipal24h > 0 {
|
if profile.RateLimitPerPrincipal24h > 0 {
|
||||||
perPrincipal := ratelimit.NewSlidingWindowLimiter(profile.RateLimitPerPrincipal24h, 24*time.Hour, 100_000)
|
perPrincipal := ratelimit.NewLimiter(cfg.RateLimit.SlidingWindowBackend, db, profile.RateLimitPerPrincipal24h, 24*time.Hour, 100_000)
|
||||||
estHandler.SetPerPrincipalRateLimiter(perPrincipal)
|
estHandler.SetPerPrincipalRateLimiter(perPrincipal)
|
||||||
}
|
}
|
||||||
estHandlers[profile.PathID] = estHandler
|
estHandlers[profile.PathID] = estHandler
|
||||||
@@ -2052,12 +2199,28 @@ func main() {
|
|||||||
BurstSize: cfg.RateLimit.BurstSize,
|
BurstSize: cfg.RateLimit.BurstSize,
|
||||||
PerUserRPS: cfg.RateLimit.PerUserRPS,
|
PerUserRPS: cfg.RateLimit.PerUserRPS,
|
||||||
PerUserBurstSize: cfg.RateLimit.PerUserBurstSize,
|
PerUserBurstSize: cfg.RateLimit.PerUserBurstSize,
|
||||||
|
// SEC-006 (Sprint 2): bounded bucket TTL so a long-running
|
||||||
|
// server with high-cardinality unauthenticated traffic
|
||||||
|
// (CGNAT churn, Tor exits, scanners) doesn't grow the map
|
||||||
|
// indefinitely.
|
||||||
|
BucketTTL: cfg.RateLimit.BucketTTL,
|
||||||
})
|
})
|
||||||
|
// SEC-003 closure (Sprint 1, 2026-05-16). Pre-fix the
|
||||||
|
// rate-limit-enabled stack was rebuilt without
|
||||||
|
// securityHeadersMiddleware, silently dropping HSTS,
|
||||||
|
// X-Frame-Options, X-Content-Type-Options, Referrer-Policy,
|
||||||
|
// and Content-Security-Policy across every response when an
|
||||||
|
// operator flipped CERTCTL_RATE_LIMIT_ENABLED=true — a
|
||||||
|
// defensive-config toggle weakened browser-side security.
|
||||||
|
// The fixed stack keeps securityHeadersMiddleware at the same
|
||||||
|
// position as the default and inserts rateLimiter right after
|
||||||
|
// so a 429 response still carries the same headers as a 200.
|
||||||
middlewareStack = []func(http.Handler) http.Handler{
|
middlewareStack = []func(http.Handler) http.Handler{
|
||||||
middleware.RequestID,
|
middleware.RequestID,
|
||||||
structuredLogger,
|
structuredLogger,
|
||||||
middleware.Recovery,
|
middleware.Recovery,
|
||||||
bodyLimitMiddleware,
|
bodyLimitMiddleware,
|
||||||
|
securityHeadersMiddleware,
|
||||||
rateLimiter,
|
rateLimiter,
|
||||||
corsMiddleware,
|
corsMiddleware,
|
||||||
// Phase 6 chain: Auth (session-then-Bearer fallback) → CSRF
|
// Phase 6 chain: Auth (session-then-Bearer fallback) → CSRF
|
||||||
@@ -2127,6 +2290,10 @@ func main() {
|
|||||||
noAuthRateLimiter := middleware.NewRateLimiter(middleware.RateLimitConfig{
|
noAuthRateLimiter := middleware.NewRateLimiter(middleware.RateLimitConfig{
|
||||||
RPS: cfg.RateLimit.RPS,
|
RPS: cfg.RateLimit.RPS,
|
||||||
BurstSize: cfg.RateLimit.BurstSize,
|
BurstSize: cfg.RateLimit.BurstSize,
|
||||||
|
// SEC-006 closure (Sprint 2): same bucket-TTL eviction for the
|
||||||
|
// no-auth limiter — this one's the higher exposure since every
|
||||||
|
// unauthenticated probe gets a fresh IP-keyed bucket.
|
||||||
|
BucketTTL: cfg.RateLimit.BucketTTL,
|
||||||
})
|
})
|
||||||
noAuthMiddleware = append(noAuthMiddleware, noAuthRateLimiter)
|
noAuthMiddleware = append(noAuthMiddleware, noAuthRateLimiter)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -256,6 +256,18 @@ func TestMain_ServerConfigFromEnvironment(t *testing.T) {
|
|||||||
os.Setenv("CERTCTL_SERVER_PORT", "8080")
|
os.Setenv("CERTCTL_SERVER_PORT", "8080")
|
||||||
os.Setenv("CERTCTL_SERVER_TLS_CERT_PATH", certPath)
|
os.Setenv("CERTCTL_SERVER_TLS_CERT_PATH", certPath)
|
||||||
os.Setenv("CERTCTL_SERVER_TLS_KEY_PATH", keyPath)
|
os.Setenv("CERTCTL_SERVER_TLS_KEY_PATH", keyPath)
|
||||||
|
// Acquisition-audit RED-003 closure (Sprint 5 ACQ, 2026-05-16):
|
||||||
|
// deny-empty default flipped to true; supply a placeholder token
|
||||||
|
// so Load() succeeds. The defer below restores prior env.
|
||||||
|
oldBootstrap := os.Getenv("CERTCTL_AGENT_BOOTSTRAP_TOKEN")
|
||||||
|
os.Setenv("CERTCTL_AGENT_BOOTSTRAP_TOKEN", "test-bootstrap-token-placeholder")
|
||||||
|
defer func() {
|
||||||
|
if oldBootstrap != "" {
|
||||||
|
os.Setenv("CERTCTL_AGENT_BOOTSTRAP_TOKEN", oldBootstrap)
|
||||||
|
} else {
|
||||||
|
os.Unsetenv("CERTCTL_AGENT_BOOTSTRAP_TOKEN")
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
cfg, err := config.Load()
|
cfg, err := config.Load()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -317,6 +329,18 @@ func TestMain_AuthTypeConfiguration(t *testing.T) {
|
|||||||
|
|
||||||
// Set auth secret for api-key mode
|
// Set auth secret for api-key mode
|
||||||
os.Setenv("CERTCTL_AUTH_SECRET", "test-secret")
|
os.Setenv("CERTCTL_AUTH_SECRET", "test-secret")
|
||||||
|
// Acquisition-audit RED-003 closure (Sprint 5 ACQ, 2026-05-16):
|
||||||
|
// deny-empty default flipped to true; supply a placeholder token
|
||||||
|
// so Load() succeeds.
|
||||||
|
oldBootstrap := os.Getenv("CERTCTL_AGENT_BOOTSTRAP_TOKEN")
|
||||||
|
os.Setenv("CERTCTL_AGENT_BOOTSTRAP_TOKEN", "test-bootstrap-token-placeholder")
|
||||||
|
defer func() {
|
||||||
|
if oldBootstrap != "" {
|
||||||
|
os.Setenv("CERTCTL_AGENT_BOOTSTRAP_TOKEN", oldBootstrap)
|
||||||
|
} else {
|
||||||
|
os.Unsetenv("CERTCTL_AGENT_BOOTSTRAP_TOKEN")
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
testCases := []string{"api-key", "none"}
|
testCases := []string{"api-key", "none"}
|
||||||
|
|
||||||
@@ -645,3 +669,64 @@ func TestPreflightSCEPChallengePassword(t *testing.T) {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// =============================================================================
|
||||||
|
// SEC-003 closure (Sprint 1, 2026-05-16). Pin that the rate-limit-enabled
|
||||||
|
// middleware stack still emits the five security headers (HSTS, XFO,
|
||||||
|
// nosniff, Referrer-Policy, CSP) that the default stack carries.
|
||||||
|
//
|
||||||
|
// Pre-fix the stack rebuild at main.go ~L2079 dropped
|
||||||
|
// securityHeadersMiddleware so flipping CERTCTL_RATE_LIMIT_ENABLED=true
|
||||||
|
// silently turned off five browser-side defenses. This test exercises
|
||||||
|
// the same middleware composition main.go now builds when the flag is
|
||||||
|
// on, and asserts each header lands on the wire. A future regression
|
||||||
|
// that removes securityHeadersMiddleware (or reorders it after the
|
||||||
|
// rate limiter such that a 429 response misses the headers) would
|
||||||
|
// surface here.
|
||||||
|
// =============================================================================
|
||||||
|
|
||||||
|
func TestMain_RateLimitedStack_EmitsSecurityHeaders(t *testing.T) {
|
||||||
|
baseHandler := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||||
|
w.WriteHeader(http.StatusOK)
|
||||||
|
})
|
||||||
|
|
||||||
|
// Mirror the rate-limit-enabled middlewareStack from main.go.
|
||||||
|
rateLimiter := middleware.NewRateLimiter(middleware.RateLimitConfig{
|
||||||
|
RPS: 1000, // high enough that the single test request isn't dropped
|
||||||
|
BurstSize: 1000,
|
||||||
|
})
|
||||||
|
securityHeaders := middleware.SecurityHeaders(middleware.SecurityHeadersDefaults())
|
||||||
|
bodyLimit := middleware.NewBodyLimit(middleware.BodyLimitConfig{MaxBytes: 1 << 20})
|
||||||
|
|
||||||
|
stack := []func(http.Handler) http.Handler{
|
||||||
|
middleware.RequestID,
|
||||||
|
middleware.Recovery,
|
||||||
|
bodyLimit,
|
||||||
|
securityHeaders,
|
||||||
|
rateLimiter,
|
||||||
|
// Skip the CORS/auth/csrf/audit layers — they aren't relevant
|
||||||
|
// to the headers-on-response invariant we're pinning.
|
||||||
|
}
|
||||||
|
chained := middleware.Chain(baseHandler, stack...)
|
||||||
|
|
||||||
|
req := httptest.NewRequest(http.MethodGet, "/api/v1/test", nil)
|
||||||
|
w := httptest.NewRecorder()
|
||||||
|
chained.ServeHTTP(w, req)
|
||||||
|
|
||||||
|
if w.Code != http.StatusOK {
|
||||||
|
t.Fatalf("status = %d; want 200 (rate limit should not trip on a single request)", w.Code)
|
||||||
|
}
|
||||||
|
wantHeaders := map[string]string{
|
||||||
|
"Strict-Transport-Security": "max-age=31536000; includeSubDomains",
|
||||||
|
"X-Frame-Options": "DENY",
|
||||||
|
"X-Content-Type-Options": "nosniff",
|
||||||
|
"Referrer-Policy": "no-referrer-when-downgrade",
|
||||||
|
"Content-Security-Policy": "default-src 'self'; img-src 'self' data:; style-src 'self' 'unsafe-inline'; script-src 'self'; connect-src 'self'; frame-ancestors 'none'",
|
||||||
|
}
|
||||||
|
for name, want := range wantHeaders {
|
||||||
|
got := w.Header().Get(name)
|
||||||
|
if got != want {
|
||||||
|
t.Errorf("rate-limited stack: %s = %q; want %q", name, got, want)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -417,8 +417,10 @@ Every `CERTCTL_*` environment variable is read by the server's `internal/config/
|
|||||||
| `CERTCTL_CORS_ORIGINS` | (empty) | Allowed CORS origins, comma-separated. Empty = deny all cross-origin |
|
| `CERTCTL_CORS_ORIGINS` | (empty) | Allowed CORS origins, comma-separated. Empty = deny all cross-origin |
|
||||||
| `CERTCTL_RATE_LIMIT_RPS` | `10` | Requests per second per client |
|
| `CERTCTL_RATE_LIMIT_RPS` | `10` | Requests per second per client |
|
||||||
| `CERTCTL_RATE_LIMIT_BURST` | `20` | Burst allowance above RPS |
|
| `CERTCTL_RATE_LIMIT_BURST` | `20` | Burst allowance above RPS |
|
||||||
| `CERTCTL_AGENT_BOOTSTRAP_TOKEN` | (empty) | Agent-registration bootstrap secret. Empty = v2.1.x warn-mode pass-through. Set to a real value (`openssl rand -base64 32`); the deny-empty flag's default flip in v2.2.0 will require it. |
|
| `CERTCTL_RATE_LIMIT_BUCKET_TTL` | `1h` | Sprint 2 SEC-006: lifetime of an unused token-bucket entry. A background sweeper running every `BucketTTL/4` reclaims buckets whose last `allow()` call is older than this. Values < 1m clamp up to 1m. Lower when facing high-cardinality unauthenticated traffic (CGNAT churn, scanners) where the bucket-map RSS becomes a concern. |
|
||||||
| `CERTCTL_AGENT_BOOTSTRAP_TOKEN_DENY_EMPTY` | `false` | Phase 2 SEC-H1 staged flag. When `true`, the server refuses to start unless `CERTCTL_AGENT_BOOTSTRAP_TOKEN` is non-empty. Default flip to `true` scheduled for v2.2.0. |
|
| `CERTCTL_SCHEDULER_JOB_CLAIM_LIMIT` | `1000` | Sprint 2 SCALE-001: cap on the number of Pending rows a single scheduler tick may claim via `ClaimPendingJobs`. Pre-Sprint-2 the scheduler claimed every Pending row in one transaction, which page-thrashed on 100K-job bursts. Values ≤ 0 fail-safe to `1000` (legacy unlimited semantics are no longer reachable). Pair-tune with `CERTCTL_RENEWAL_CONCURRENCY` (default 25) — the default 40:1 ratio keeps the fan-out busy without exhausting upstream-CA rate limits. |
|
||||||
|
| `CERTCTL_AGENT_BOOTSTRAP_TOKEN` | (empty — required) | Agent-registration bootstrap secret. Set to a real value (`openssl rand -base64 32`). Sprint 5 ACQ RED-003 (2026-05-16) flipped the paired `_DENY_EMPTY` flag's default to `true`, so the server now refuses to start when this is left empty (unless `CERTCTL_DEMO_MODE_ACK=true`). Operators upgrading from v2.1.x who need the warn-mode escape hatch for one upgrade window can set `CERTCTL_AGENT_BOOTSTRAP_TOKEN_DENY_EMPTY=false` explicitly. |
|
||||||
|
| `CERTCTL_AGENT_BOOTSTRAP_TOKEN_DENY_EMPTY` | `true` | Phase 2 SEC-H1 fail-closed guard. When `true` (default since Sprint 5 ACQ RED-003 closure, 2026-05-16), the server refuses to start unless `CERTCTL_AGENT_BOOTSTRAP_TOKEN` is non-empty. Set to `false` only for a v2.1.x→v2.2.x upgrade-window warn-mode escape hatch. |
|
||||||
| `CERTCTL_DEMO_MODE_ACK` | `false` | Acknowledges demo-mode synthetic admin posture (required when `CERTCTL_AUTH_TYPE=none` binds to a non-loopback host). Must be paired with `CERTCTL_DEMO_MODE_ACK_TS` per Phase 2 SEC-H3. |
|
| `CERTCTL_DEMO_MODE_ACK` | `false` | Acknowledges demo-mode synthetic admin posture (required when `CERTCTL_AUTH_TYPE=none` binds to a non-loopback host). Must be paired with `CERTCTL_DEMO_MODE_ACK_TS` per Phase 2 SEC-H3. |
|
||||||
| `CERTCTL_DEMO_MODE_ACK_TS` | (empty) | Phase 2 SEC-H3: unix-epoch timestamp at which DemoModeAck was last acknowledged. When `CERTCTL_DEMO_MODE_ACK=true`, this must parse as a unix epoch within the last 24h. Set via `CERTCTL_DEMO_MODE_ACK_TS=$(date +%s)` at every `docker compose up`. |
|
| `CERTCTL_DEMO_MODE_ACK_TS` | (empty) | Phase 2 SEC-H3: unix-epoch timestamp at which DemoModeAck was last acknowledged. When `CERTCTL_DEMO_MODE_ACK=true`, this must parse as a unix epoch within the last 24h. Set via `CERTCTL_DEMO_MODE_ACK_TS=$(date +%s)` at every `docker compose up`. |
|
||||||
| `CERTCTL_ACME_INSECURE_ACK` | `false` | Phase 2 SEC-M4: explicit ACK required to boot with `CERTCTL_ACME_INSECURE=true`. Production deploys MUST never set either flag. |
|
| `CERTCTL_ACME_INSECURE_ACK` | `false` | Phase 2 SEC-M4: explicit ACK required to boot with `CERTCTL_ACME_INSECURE=true`. Production deploys MUST never set either flag. |
|
||||||
|
|||||||
@@ -116,8 +116,11 @@ services:
|
|||||||
networks:
|
networks:
|
||||||
certctl-test:
|
certctl-test:
|
||||||
ipv4_address: 10.30.50.2
|
ipv4_address: 10.30.50.2
|
||||||
|
# Acquisition-audit SEC-014 closure (Sprint 2, 2026-05-16).
|
||||||
|
# Loopback-only host-port bind — the integration-test runner on
|
||||||
|
# the host needs reachability, no other interface does.
|
||||||
ports:
|
ports:
|
||||||
- "5432:5432"
|
- "127.0.0.1:5432:5432"
|
||||||
healthcheck:
|
healthcheck:
|
||||||
test: ["CMD-SHELL", "pg_isready -U certctl -d certctl"]
|
test: ["CMD-SHELL", "pg_isready -U certctl -d certctl"]
|
||||||
interval: 5s
|
interval: 5s
|
||||||
@@ -261,6 +264,18 @@ services:
|
|||||||
CERTCTL_AUTH_TYPE: api-key
|
CERTCTL_AUTH_TYPE: api-key
|
||||||
CERTCTL_AUTH_SECRET: test-key-2026
|
CERTCTL_AUTH_SECRET: test-key-2026
|
||||||
|
|
||||||
|
# Phase 2 SEC-H1 + Sprint 5 RED-003 closure (2026-05-16): the
|
||||||
|
# AgentBootstrapTokenDenyEmpty fail-closed guard refuses to start
|
||||||
|
# the server when CERTCTL_AGENT_BOOTSTRAP_TOKEN is empty (the
|
||||||
|
# default DENY_EMPTY=true flipped on Sprint 5). Demo stacks
|
||||||
|
# bypass the guard via CERTCTL_DEMO_MODE_ACK=true, but this is
|
||||||
|
# the e2e TEST stack (production-like auth posture), not a demo
|
||||||
|
# stack — set a deterministic placeholder token so the server
|
||||||
|
# boots and the vendor-edge integration tests can run. Clearly
|
||||||
|
# test-only; do NOT copy to production. Operators set this from
|
||||||
|
# `openssl rand -base64 32` per docs/operator/security.md.
|
||||||
|
CERTCTL_AGENT_BOOTSTRAP_TOKEN: test-agent-bootstrap-token-deterministic-fixture
|
||||||
|
|
||||||
# Key generation — agent-side (production-like)
|
# Key generation — agent-side (production-like)
|
||||||
CERTCTL_KEYGEN_MODE: agent
|
CERTCTL_KEYGEN_MODE: agent
|
||||||
|
|
||||||
|
|||||||
@@ -62,7 +62,13 @@ services:
|
|||||||
# handshake. ECDSA-P256 with SHA-256 is universally supported. See
|
# handshake. ECDSA-P256 with SHA-256 is universally supported. See
|
||||||
# docs/tls.md Pattern 1.
|
# docs/tls.md Pattern 1.
|
||||||
certctl-tls-init:
|
certctl-tls-init:
|
||||||
image: alpine/openssl:latest
|
# DEPL-002 closure (Sprint 3, 2026-05-16): digest-pin so the
|
||||||
|
# production-shaped compose has the same supply-chain posture as
|
||||||
|
# the certctl Dockerfiles (which CI guards via digest-validity.sh).
|
||||||
|
# The :latest tag floats; the digest is captured at the time
|
||||||
|
# this comment was written. Bump after running the digest-
|
||||||
|
# validity guard to confirm the new digest is still pullable.
|
||||||
|
image: alpine/openssl:latest@sha256:41036db23542ed4cc09bc278d8a7e23b3da01690abb4b0e353b1bb87d70520ed
|
||||||
container_name: certctl-tls-init
|
container_name: certctl-tls-init
|
||||||
restart: "no"
|
restart: "no"
|
||||||
entrypoint: /bin/sh
|
entrypoint: /bin/sh
|
||||||
@@ -123,7 +129,12 @@ services:
|
|||||||
# `unhealthy` flap to cascade into certctl-server's `service_healthy`
|
# `unhealthy` flap to cascade into certctl-server's `service_healthy`
|
||||||
# depends_on, blocking the whole stack.
|
# depends_on, blocking the whole stack.
|
||||||
postgres:
|
postgres:
|
||||||
image: postgres:16-alpine
|
# DEPL-002 closure (Sprint 3, 2026-05-16): digest-pin matching the
|
||||||
|
# alpine/openssl pin above. The `16-alpine` tag is the stable
|
||||||
|
# major-version stream; the digest snapshots today's image so a
|
||||||
|
# silent upstream rebuild can't slip into a production deploy
|
||||||
|
# mid-rollout. Bump alongside dependency reviews.
|
||||||
|
image: postgres:16-alpine@sha256:890480b08124ce7f79960a9bb16fe39729aa302bd384bfd7c408fee6c8f7adb7
|
||||||
container_name: certctl-postgres
|
container_name: certctl-postgres
|
||||||
environment:
|
environment:
|
||||||
POSTGRES_DB: certctl
|
POSTGRES_DB: certctl
|
||||||
@@ -134,8 +145,18 @@ services:
|
|||||||
# default for screenshot/demo use; production deploys never
|
# default for screenshot/demo use; production deploys never
|
||||||
# depend on that fallback.
|
# depend on that fallback.
|
||||||
POSTGRES_PASSWORD: ${POSTGRES_PASSWORD}
|
POSTGRES_PASSWORD: ${POSTGRES_PASSWORD}
|
||||||
|
# Acquisition-audit SEC-014 closure (Sprint 2, 2026-05-16). Bind
|
||||||
|
# the published port to 127.0.0.1 ONLY — the certctl-server
|
||||||
|
# connection comes in via the `certctl-network` Docker network
|
||||||
|
# (the host-port mapping is operator convenience for psql / DB
|
||||||
|
# inspection only). Pre-fix, the "5432:5432" form bound on
|
||||||
|
# 0.0.0.0, exposing the Postgres TCP listener on every interface
|
||||||
|
# of any host that happened to be on a public IP. The loopback
|
||||||
|
# bind keeps host-side psql access working while preventing the
|
||||||
|
# cross-network exposure landmine for compose deploys that aren't
|
||||||
|
# behind a firewall.
|
||||||
ports:
|
ports:
|
||||||
- "5432:5432"
|
- "127.0.0.1:5432:5432"
|
||||||
volumes:
|
volumes:
|
||||||
- postgres_data:/var/lib/postgresql/data
|
- postgres_data:/var/lib/postgresql/data
|
||||||
networks:
|
networks:
|
||||||
|
|||||||
@@ -72,3 +72,28 @@ IMPORTANT NOTES FOR PRODUCTION:
|
|||||||
- All containers run as non-root
|
- All containers run as non-root
|
||||||
- Implement network policies to restrict traffic between components
|
- Implement network policies to restrict traffic between components
|
||||||
- Consider pod security policies or security standards for your cluster
|
- Consider pod security policies or security standards for your cluster
|
||||||
|
{{- /*
|
||||||
|
DEPL-006 closure (Sprint 3, 2026-05-16). Loud notice when the
|
||||||
|
operator runs a multi-replica deploy without crossing the two
|
||||||
|
required HA toggles. Per-pod rate-limit buckets and round-robin
|
||||||
|
load balancing both silently break correctness above replicas:1.
|
||||||
|
*/}}
|
||||||
|
{{- if gt (int .Values.server.replicas) 1 }}
|
||||||
|
|
||||||
|
⚠️ HA MISCONFIGURATION WARNINGS (replicas={{ .Values.server.replicas }}):
|
||||||
|
{{- $backend := .Values.server.rateLimiting.backend | default "memory" }}
|
||||||
|
{{- if eq $backend "memory" }}
|
||||||
|
- server.rateLimiting.backend = "memory" with replicas > 1 gives each
|
||||||
|
pod its own bucket map, so the configured cap is effectively
|
||||||
|
multiplied by the replica count. Set
|
||||||
|
`--set server.rateLimiting.backend=postgres` (see DEPL-006 /
|
||||||
|
docs/operator/runbooks/ha.md).
|
||||||
|
{{- end }}
|
||||||
|
{{- if not .Values.server.service.sessionAffinity }}
|
||||||
|
- server.service.sessionAffinity is empty. Round-robin Service load
|
||||||
|
balancing routes login → /api/v1/auth/login → /api/v1/auth/csrf
|
||||||
|
across different pods, breaking the CSRF token + session cookie
|
||||||
|
handshake. Set
|
||||||
|
`--set server.service.sessionAffinity=ClientIP`.
|
||||||
|
{{- end }}
|
||||||
|
{{- end }}
|
||||||
|
|||||||
@@ -12,6 +12,8 @@ data:
|
|||||||
keygen-mode: {{ .Values.server.keygen.mode | quote }}
|
keygen-mode: {{ .Values.server.keygen.mode | quote }}
|
||||||
rate-limit-rps: {{ .Values.server.rateLimiting.rps | quote }}
|
rate-limit-rps: {{ .Values.server.rateLimiting.rps | quote }}
|
||||||
rate-limit-burst: {{ .Values.server.rateLimiting.burst | quote }}
|
rate-limit-burst: {{ .Values.server.rateLimiting.burst | quote }}
|
||||||
|
rate-limit-backend: {{ .Values.server.rateLimiting.backend | default "memory" | quote }}
|
||||||
|
rate-limit-janitor-interval: {{ .Values.server.rateLimiting.janitorInterval | default "5m" | quote }}
|
||||||
{{- if .Values.server.cors.origins }}
|
{{- if .Values.server.cors.origins }}
|
||||||
cors-origins: {{ .Values.server.cors.origins | quote }}
|
cors-origins: {{ .Values.server.cors.origins | quote }}
|
||||||
{{- end }}
|
{{- end }}
|
||||||
|
|||||||
@@ -51,6 +51,20 @@ spec:
|
|||||||
containerPort: {{ .Values.server.port }}
|
containerPort: {{ .Values.server.port }}
|
||||||
protocol: TCP
|
protocol: TCP
|
||||||
env:
|
env:
|
||||||
|
# DEPL-003 closure (Sprint 3, 2026-05-16). Pre-fix the
|
||||||
|
# CERTCTL_MIGRATIONS_VIA_HOOK env var was documented in
|
||||||
|
# values.yaml (L797-810) and migration-job.yaml comments
|
||||||
|
# but was never rendered into the server Deployment env
|
||||||
|
# block. With migrations.viaHook=true the operator's
|
||||||
|
# intent is "the pre-install/pre-upgrade Helm Job owns
|
||||||
|
# migrations" — but the server pods, missing the env,
|
||||||
|
# ran their own boot-time RunMigrations alongside the
|
||||||
|
# hook Job, racing on the schema lock. cmd/server/migrations.go
|
||||||
|
# only short-circuits when this env is "true" (line 144).
|
||||||
|
{{- if .Values.migrations.viaHook }}
|
||||||
|
- name: CERTCTL_MIGRATIONS_VIA_HOOK
|
||||||
|
value: "true"
|
||||||
|
{{- end }}
|
||||||
- name: CERTCTL_SERVER_HOST
|
- name: CERTCTL_SERVER_HOST
|
||||||
value: "0.0.0.0"
|
value: "0.0.0.0"
|
||||||
- name: CERTCTL_SERVER_PORT
|
- name: CERTCTL_SERVER_PORT
|
||||||
@@ -108,6 +122,19 @@ spec:
|
|||||||
configMapKeyRef:
|
configMapKeyRef:
|
||||||
name: {{ include "certctl.fullname" . }}-server
|
name: {{ include "certctl.fullname" . }}-server
|
||||||
key: rate-limit-burst
|
key: rate-limit-burst
|
||||||
|
# Phase 13 Sprint 13.3 (ARCH-M1) — cross-replica-consistent
|
||||||
|
# sliding-window rate limiter. Default memory; flip to
|
||||||
|
# postgres when server.replicas > 1.
|
||||||
|
- name: CERTCTL_RATE_LIMIT_BACKEND
|
||||||
|
valueFrom:
|
||||||
|
configMapKeyRef:
|
||||||
|
name: {{ include "certctl.fullname" . }}-server
|
||||||
|
key: rate-limit-backend
|
||||||
|
- name: CERTCTL_RATE_LIMIT_JANITOR_INTERVAL
|
||||||
|
valueFrom:
|
||||||
|
configMapKeyRef:
|
||||||
|
name: {{ include "certctl.fullname" . }}-server
|
||||||
|
key: rate-limit-janitor-interval
|
||||||
{{- if .Values.server.cors.origins }}
|
{{- if .Values.server.cors.origins }}
|
||||||
- name: CERTCTL_CORS_ORIGINS
|
- name: CERTCTL_CORS_ORIGINS
|
||||||
valueFrom:
|
valueFrom:
|
||||||
|
|||||||
@@ -11,6 +11,23 @@ metadata:
|
|||||||
{{- end }}
|
{{- end }}
|
||||||
spec:
|
spec:
|
||||||
type: {{ .Values.server.service.type }}
|
type: {{ .Values.server.service.type }}
|
||||||
|
{{- /*
|
||||||
|
DEPL-006 closure (Sprint 3, 2026-05-16). Render the optional
|
||||||
|
sessionAffinity field. docs/operator/runbooks/ha.md instructs
|
||||||
|
operators to set sessionAffinity: ClientIP for replicas > 1 so
|
||||||
|
login + CSRF flows stay on the same pod; pre-fix the chart did
|
||||||
|
not actually pass the value through. sessionAffinityConfig
|
||||||
|
clientIP.timeoutSeconds renders only when set, otherwise
|
||||||
|
Kubernetes applies its default (10800s / 3h).
|
||||||
|
*/}}
|
||||||
|
{{- if .Values.server.service.sessionAffinity }}
|
||||||
|
sessionAffinity: {{ .Values.server.service.sessionAffinity }}
|
||||||
|
{{- with .Values.server.service.sessionAffinityTimeoutSeconds }}
|
||||||
|
sessionAffinityConfig:
|
||||||
|
clientIP:
|
||||||
|
timeoutSeconds: {{ . }}
|
||||||
|
{{- end }}
|
||||||
|
{{- end }}
|
||||||
ports:
|
ports:
|
||||||
- port: {{ .Values.server.service.port }}
|
- port: {{ .Values.server.service.port }}
|
||||||
targetPort: https
|
targetPort: https
|
||||||
|
|||||||
@@ -42,15 +42,33 @@ spec:
|
|||||||
interval: {{ .Values.monitoring.serviceMonitor.interval | default "30s" }}
|
interval: {{ .Values.monitoring.serviceMonitor.interval | default "30s" }}
|
||||||
scrapeTimeout: {{ .Values.monitoring.serviceMonitor.scrapeTimeout | default "10s" }}
|
scrapeTimeout: {{ .Values.monitoring.serviceMonitor.scrapeTimeout | default "10s" }}
|
||||||
tlsConfig:
|
tlsConfig:
|
||||||
# The certctl server uses self-signed bootstrap TLS or operator-
|
{{- /*
|
||||||
# provided cert-manager TLS — the ServiceMonitor consumes the
|
Acquisition-audit DEPL-004 closure (Sprint 6 ACQ, 2026-05-16).
|
||||||
# same CA bundle the server presents. When server.tls.existingSecret
|
Pre-Sprint-6 the default was an implicit insecureSkipVerify
|
||||||
# is set, operators usually want to pull the matching ca.crt key
|
true via the template falling through the else branch.
|
||||||
# out of that Secret. Adjust if your CA chain lives elsewhere.
|
Post-Sprint-6 values.yaml ships a real-verify default
|
||||||
|
(caFile + serverName matching the chart existingSecret /
|
||||||
|
cert-manager-emitted Secret at /etc/prometheus/secrets/
|
||||||
|
certctl-ca/), so the truthy if-branch below always fires for
|
||||||
|
the default install. Operators who want skipVerify back must
|
||||||
|
override with tlsConfig insecureSkipVerify true explicitly.
|
||||||
|
Operators who blank tlsConfig entirely hit the else-branch
|
||||||
|
below and trip the Helm fail directive at chart-render time;
|
||||||
|
there is no way to inherit the pre-Sprint-6 implicit-skip
|
||||||
|
behavior silently. See docs/operator/helm-deployment.md for
|
||||||
|
the narrative explanation, including the lesson that comment
|
||||||
|
text referencing Helm template-action delimiters must live
|
||||||
|
in Helm-style comment blocks (this block), never in YAML
|
||||||
|
hash-comment blocks — the Helm lexer scans for action
|
||||||
|
delimiters everywhere in the source text, ignoring YAML
|
||||||
|
comment markers, so descriptive references to actions inside
|
||||||
|
YAML hash-comments are reinterpreted as template actions
|
||||||
|
and abort the entire chart render.
|
||||||
|
*/ -}}
|
||||||
{{- if .Values.monitoring.serviceMonitor.tlsConfig }}
|
{{- if .Values.monitoring.serviceMonitor.tlsConfig }}
|
||||||
{{- toYaml .Values.monitoring.serviceMonitor.tlsConfig | nindent 8 }}
|
{{- toYaml .Values.monitoring.serviceMonitor.tlsConfig | nindent 8 }}
|
||||||
{{- else }}
|
{{- else }}
|
||||||
insecureSkipVerify: true
|
{{- fail "monitoring.serviceMonitor.tlsConfig was explicitly blanked but monitoring.serviceMonitor.enabled=true (Sprint 6 ACQ DEPL-004 closure, 2026-05-16). The values.yaml default ships caFile=/etc/prometheus/secrets/certctl-ca/ca.crt + serverName=certctl-server which matches the existingSecret mount pattern. If your Prometheus pod mounts the CA bundle at a different path, override caFile rather than blanking the block. If you genuinely need skipVerify, set tlsConfig insecureSkipVerify=true explicitly — never blank. See docs/operator/helm-deployment.md for the upgrade-path note." }}
|
||||||
{{- end }}
|
{{- end }}
|
||||||
{{- with .Values.monitoring.serviceMonitor.bearerTokenSecret }}
|
{{- with .Values.monitoring.serviceMonitor.bearerTokenSecret }}
|
||||||
bearerTokenSecret:
|
bearerTokenSecret:
|
||||||
|
|||||||
@@ -160,6 +160,17 @@ server:
|
|||||||
type: ClusterIP
|
type: ClusterIP
|
||||||
port: 8443
|
port: 8443
|
||||||
annotations: {}
|
annotations: {}
|
||||||
|
# DEPL-006 closure (Sprint 3, 2026-05-16). Optional sticky-session
|
||||||
|
# routing. REQUIRED when server.replicas > 1 so login + CSRF token
|
||||||
|
# rows stay on the same pod for the duration of a session — the
|
||||||
|
# default round-robin load balancing breaks those flows. Set to
|
||||||
|
# "ClientIP" for production HA (see deploy/helm/examples/values-prod-ha.yaml).
|
||||||
|
# Leave empty for single-replica deploys.
|
||||||
|
sessionAffinity: ""
|
||||||
|
# When sessionAffinity is set, timeout window (in seconds) the
|
||||||
|
# Service maps a source IP to the same pod. Default null →
|
||||||
|
# Kubernetes applies its built-in default (10800s / 3h).
|
||||||
|
sessionAffinityTimeoutSeconds: null
|
||||||
|
|
||||||
# Authentication configuration.
|
# Authentication configuration.
|
||||||
# Valid types: "api-key" (production) or "none" (demo only — disables
|
# Valid types: "api-key" (production) or "none" (demo only — disables
|
||||||
@@ -211,8 +222,25 @@ server:
|
|||||||
|
|
||||||
# Rate limiting configuration
|
# Rate limiting configuration
|
||||||
rateLimiting:
|
rateLimiting:
|
||||||
rps: 100 # Requests per second
|
rps: 100 # Requests per second (token-bucket middleware)
|
||||||
burst: 200 # Burst capacity
|
burst: 200 # Burst capacity (token-bucket middleware)
|
||||||
|
|
||||||
|
# Sliding-window-log rate-limit backend (Phase 13 Sprint 13.2/13.3
|
||||||
|
# ARCH-M1 closure). Selects the implementation backing the
|
||||||
|
# break-glass / OCSP / cert-export / EST limiters. See
|
||||||
|
# docs/operator/observability.md for the operator decision tree.
|
||||||
|
#
|
||||||
|
# memory — per-process (default; single-replica deploys).
|
||||||
|
# postgres — cross-replica-consistent via rate_limit_buckets.
|
||||||
|
# REQUIRED when server.replicas > 1 for accurate
|
||||||
|
# cluster-wide enforcement.
|
||||||
|
backend: memory
|
||||||
|
|
||||||
|
# Scheduler janitor interval for the postgres backend's
|
||||||
|
# rate_limit_buckets sweep. Ignored when backend=memory (the
|
||||||
|
# in-memory backend self-prunes on every Allow call).
|
||||||
|
# Default 5m; minimum 1m.
|
||||||
|
janitorInterval: "5m"
|
||||||
|
|
||||||
# Network scanning configuration
|
# Network scanning configuration
|
||||||
networkScan:
|
networkScan:
|
||||||
@@ -652,14 +680,36 @@ monitoring:
|
|||||||
# name: certctl-prometheus-key
|
# name: certctl-prometheus-key
|
||||||
# key: api-key
|
# key: api-key
|
||||||
# bearerTokenSecret: {}
|
# bearerTokenSecret: {}
|
||||||
# TLS config for the scrape endpoint. The certctl server presents
|
# TLS config for the scrape endpoint. Acquisition-audit DEPL-004
|
||||||
# the same TLS cert the rest of the chart uses; insecureSkipVerify
|
# closure (Sprint 6 ACQ, 2026-05-16): pre-Sprint-6 the default was
|
||||||
# defaults to true so demos work out of the box. Production deploys
|
# an implicit `insecureSkipVerify: true` (fell through the
|
||||||
# should pin the CA via caFile or ca.secret.
|
# template's else-branch). Post-Sprint-6 the default is a real
|
||||||
|
# verify against the chart's CA at the canonical mount path the
|
||||||
|
# existingSecret pattern produces (Prometheus mounts the
|
||||||
|
# certctl-ca Secret as a volume at /etc/prometheus/secrets/
|
||||||
|
# certctl-ca/). Operators whose Prometheus pod mounts the bundle
|
||||||
|
# at a different path override `caFile` below; operators who
|
||||||
|
# genuinely want skipVerify back can do so explicitly. Operators
|
||||||
|
# who blank tlsConfig entirely (`tlsConfig: null` or
|
||||||
|
# `tlsConfig: {}`) trip the `{{ fail }}` guard in
|
||||||
|
# templates/servicemonitor.yaml at chart-render time — there is
|
||||||
|
# no way to inherit the pre-Sprint-6 implicit-skipVerify behavior
|
||||||
|
# silently.
|
||||||
|
#
|
||||||
|
# Production default (verify against the chart's CA):
|
||||||
|
tlsConfig:
|
||||||
|
caFile: /etc/prometheus/secrets/certctl-ca/ca.crt
|
||||||
|
serverName: certctl-server
|
||||||
|
#
|
||||||
|
# Operator override — different CA mount path:
|
||||||
# tlsConfig:
|
# tlsConfig:
|
||||||
# caFile: /etc/prometheus/secrets/certctl-ca/ca.crt
|
# caFile: /path/to/your/ca.crt
|
||||||
# serverName: certctl-server
|
# serverName: your-cert-CN
|
||||||
# tlsConfig: {}
|
#
|
||||||
|
# Operator override — demo / dev-cluster escape hatch
|
||||||
|
# (operator-acknowledged unsafe):
|
||||||
|
# tlsConfig:
|
||||||
|
# insecureSkipVerify: true
|
||||||
# Optional relabeling for the scrape job.
|
# Optional relabeling for the scrape job.
|
||||||
# relabelings: []
|
# relabelings: []
|
||||||
|
|
||||||
|
|||||||
@@ -36,6 +36,14 @@ server:
|
|||||||
|
|
||||||
service:
|
service:
|
||||||
type: ClusterIP
|
type: ClusterIP
|
||||||
|
# DEPL-006 closure (Sprint 3, 2026-05-16): with replicas:3, the
|
||||||
|
# default round-robin Service load balancing breaks login/CSRF
|
||||||
|
# flows because the session cookie + the CSRF token row land on
|
||||||
|
# different pods between requests. sessionAffinity: ClientIP
|
||||||
|
# routes every connection from a given source IP to the same
|
||||||
|
# pod for the configured timeout window. docs/operator/runbooks/ha.md
|
||||||
|
# documents this; pre-fix the chart did not actually render it.
|
||||||
|
sessionAffinity: ClientIP
|
||||||
annotations:
|
annotations:
|
||||||
prometheus.io/scrape: "true"
|
prometheus.io/scrape: "true"
|
||||||
prometheus.io/port: "8443"
|
prometheus.io/port: "8443"
|
||||||
@@ -53,6 +61,14 @@ server:
|
|||||||
rateLimiting:
|
rateLimiting:
|
||||||
rps: 500
|
rps: 500
|
||||||
burst: 1000
|
burst: 1000
|
||||||
|
# DEPL-006 closure (Sprint 3, 2026-05-16): replicas > 1 REQUIRES
|
||||||
|
# the postgres backend so per-key buckets are cross-replica-
|
||||||
|
# consistent. The default 'memory' backend gives each pod its
|
||||||
|
# own bucket map, so a 3-replica fleet effectively triples the
|
||||||
|
# configured cap (a client churning across pods bypasses the
|
||||||
|
# limit). See deploy/helm/certctl/values.yaml L217-226 for the
|
||||||
|
# canonical comment.
|
||||||
|
backend: postgres
|
||||||
|
|
||||||
postgresql:
|
postgresql:
|
||||||
enabled: true
|
enabled: true
|
||||||
|
|||||||
Executable
+225
@@ -0,0 +1,225 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
# Copyright 2026 certctl LLC. All rights reserved.
|
||||||
|
# SPDX-License-Identifier: BUSL-1.1
|
||||||
|
#
|
||||||
|
# Acquisition-audit DEPL-005 + DATA-012 closure (Sprint 4 ACQ,
|
||||||
|
# 2026-05-16). Backup/restore smoke harness — orchestrates a real
|
||||||
|
# pg_dump -Fc → DROP SCHEMA → CREATE SCHEMA → pg_restore loop
|
||||||
|
# around the audit_events hash chain and asserts the chain head
|
||||||
|
# round-trips byte-for-byte.
|
||||||
|
#
|
||||||
|
# This script is the body of the `.github/workflows/backup-restore.yml`
|
||||||
|
# weekly job AND the same thing an operator can run locally against a
|
||||||
|
# running Postgres to gain confidence before a real restore.
|
||||||
|
#
|
||||||
|
# Prereqs
|
||||||
|
# =======
|
||||||
|
# - psql / pg_dump / pg_restore installed and on PATH (ubuntu-latest
|
||||||
|
# ships postgresql-client by default; on macOS use Homebrew's
|
||||||
|
# libpq).
|
||||||
|
# - A reachable Postgres at $PGHOST:$PGPORT, plus the certctl user +
|
||||||
|
# database created. In CI we point this at the GHA service container
|
||||||
|
# (postgres:16-alpine, pinned to the same digest as
|
||||||
|
# deploy/docker-compose.yml). Locally, point it wherever — the
|
||||||
|
# script DROPs the public schema of the database it connects to, so DO NOT POINT THIS
|
||||||
|
# AT A DATABASE YOU CARE ABOUT.
|
||||||
|
# - Go 1.25+ on PATH so the smoke program can be built. (CI's
|
||||||
|
# setup-go step handles this.)
|
||||||
|
# - jq is NOT required — JSON snapshots are compared via python3.
|
||||||
|
#
|
||||||
|
# Behavior contract
|
||||||
|
# =================
|
||||||
|
# - On success: exit 0, prints "PASS" + a summary line.
|
||||||
|
# - On any assertion failure: prints `::error::<reason>`, exits 1.
|
||||||
|
# (The ::error:: prefix is the GitHub Actions log-annotation shape;
|
||||||
|
# it surfaces as a red banner in the Actions run UI.)
|
||||||
|
#
|
||||||
|
# Non-goals
|
||||||
|
# =========
|
||||||
|
# - Does not exercise PITR / WAL archiving. The Sprint 4 scope is the
|
||||||
|
# pg_dump/pg_restore path only; managed-DB PITR is the operator's
|
||||||
|
# responsibility per docs/operator/runbooks/postgres-backup.md.
|
||||||
|
# - Does not regenerate the audit chain after restore. A "restore
|
||||||
|
# that rewrote history" would mask exactly the bug under test.
|
||||||
|
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
REPO_ROOT="$(cd "$(dirname "$0")/../.." && pwd)"
|
||||||
|
WORKDIR="$(mktemp -d)"
|
||||||
|
trap 'rm -rf "$WORKDIR"' EXIT
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
# Configuration — every knob is env-overridable so the same script
|
||||||
|
# runs unchanged in CI (where the GHA service container exposes
|
||||||
|
# 127.0.0.1:5432) and on an operator's laptop (where they may have
|
||||||
|
# Postgres on a UNIX socket or a different port).
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
: "${PGHOST:=127.0.0.1}"
|
||||||
|
: "${PGPORT:=5432}"
|
||||||
|
: "${PGUSER:=certctl}"
|
||||||
|
: "${PGPASSWORD:=certctl}"
|
||||||
|
: "${PGDATABASE:=certctl}"
|
||||||
|
: "${SMOKE_ROWS:=24}"
|
||||||
|
: "${MIGRATIONS_PATH:=${REPO_ROOT}/migrations}"
|
||||||
|
|
||||||
|
# psql/pg_dump/pg_restore all read PG* env vars. Export so we don't
|
||||||
|
# have to spell them out on every command line.
|
||||||
|
export PGHOST PGPORT PGUSER PGPASSWORD PGDATABASE
|
||||||
|
|
||||||
|
DB_URL="postgres://${PGUSER}:${PGPASSWORD}@${PGHOST}:${PGPORT}/${PGDATABASE}?sslmode=disable"
|
||||||
|
|
||||||
|
# fail MESSAGE...
#
# Abort the harness: write MESSAGE to stderr as a GitHub Actions error
# annotation (the `::error::` prefix is what makes the Actions UI
# highlight the line in the run log) and exit with status 1.
fail() {
  printf '%s\n' "::error::backup-restore-smoke: $*" >&2
  exit 1
}
|
||||||
|
|
||||||
|
# step TITLE...
#
# Print a phase banner to stdout: a blank line followed by
# "=== TITLE ===", so each phase is easy to find in the run log.
step() {
  local title="$*"
  printf '\n=== %s ===\n' "${title}"
}
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
# Sanity preflight
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
step "preflight"
|
||||||
|
command -v psql >/dev/null || fail "psql not on PATH (install postgresql-client)"
|
||||||
|
command -v pg_dump >/dev/null || fail "pg_dump not on PATH"
|
||||||
|
command -v pg_restore >/dev/null || fail "pg_restore not on PATH"
|
||||||
|
command -v go >/dev/null || fail "go not on PATH (need Go to build the smoke program)"
|
||||||
|
command -v python3 >/dev/null || fail "python3 not on PATH (used for JSON diff)"
|
||||||
|
test -d "${MIGRATIONS_PATH}" || fail "migrations dir not found: ${MIGRATIONS_PATH}"
|
||||||
|
|
||||||
|
# Wait for Postgres readiness up to 60s. pg_isready returns 0 when
|
||||||
|
# the server is accepting connections, so the loop is the canonical
|
||||||
|
# CI-friendly "wait for the service container" pattern.
|
||||||
|
step "waiting for postgres at ${PGHOST}:${PGPORT}"
|
||||||
|
for _ in $(seq 1 60); do
|
||||||
|
if pg_isready -h "${PGHOST}" -p "${PGPORT}" -U "${PGUSER}" -d "${PGDATABASE}" -q; then
|
||||||
|
break
|
||||||
|
fi
|
||||||
|
sleep 1
|
||||||
|
done
|
||||||
|
pg_isready -h "${PGHOST}" -p "${PGPORT}" -U "${PGUSER}" -d "${PGDATABASE}" -q \
|
||||||
|
|| fail "postgres not ready after 60s at ${PGHOST}:${PGPORT}"
|
||||||
|
|
||||||
|
# Wipe any prior state in the target DB. A previous failed run could
|
||||||
|
# have left rows behind; the smoke contract is "starts from clean."
|
||||||
|
step "wiping ${PGDATABASE} schema (DROP SCHEMA public CASCADE; CREATE SCHEMA public)"
|
||||||
|
psql -v ON_ERROR_STOP=1 -c 'DROP SCHEMA IF EXISTS public CASCADE; CREATE SCHEMA public; GRANT ALL ON SCHEMA public TO PUBLIC;'
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
# Build the smoke program once with `go build` into $WORKDIR and reuse
|
||||||
|
# the binary for both the workload and verify phases; the EXIT trap
|
||||||
|
# removes $WORKDIR, so no binary is left behind.
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
step "building smoke program"
|
||||||
|
cd "${REPO_ROOT}"
|
||||||
|
go build -o "${WORKDIR}/smoke" ./deploy/test/backupsmoke
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
# Phase 1 — workload: migrate, insert rows, snapshot chain head.
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
step "phase 1 — workload (${SMOKE_ROWS} audit_events rows)"
|
||||||
|
"${WORKDIR}/smoke" \
|
||||||
|
--mode=workload \
|
||||||
|
--db-url="${DB_URL}" \
|
||||||
|
--migrations-path="${MIGRATIONS_PATH}" \
|
||||||
|
--rows="${SMOKE_ROWS}" \
|
||||||
|
| tee "${WORKDIR}/pre.json"
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
# Phase 2 — backup. Canonical pg_dump shape per
|
||||||
|
# deploy/helm/certctl/templates/backup-cronjob.yaml: --format=custom,
|
||||||
|
# --no-owner, --no-acl. --no-owner / --no-acl keep the dump portable
|
||||||
|
# across Postgres installations with different role layouts (the
|
||||||
|
# audit-trail hash chain is data, not ACL state).
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
step "phase 2 — pg_dump -Fc"
|
||||||
|
pg_dump --format=custom --no-owner --no-acl --dbname="${PGDATABASE}" --file="${WORKDIR}/backup.dump"
|
||||||
|
test -s "${WORKDIR}/backup.dump" || fail "pg_dump produced an empty file"
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
# Phase 3 — wipe. The fresh-schema approach is the closest analogue
|
||||||
|
# to "operator nuked the wrong volume." DROP DATABASE would require
|
||||||
|
# connecting to a different DB and reconnect dance; DROP SCHEMA
|
||||||
|
# achieves the same "no rows, no schema, no functions" end state
|
||||||
|
# inside the existing connection and is restore-compatible (pg_dump
|
||||||
|
# -Fc bundles the schema in the dump, so pg_restore recreates it).
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
step "phase 3 — drop schema (simulating data-loss event)"
|
||||||
|
psql -v ON_ERROR_STOP=1 -c 'DROP SCHEMA IF EXISTS public CASCADE; CREATE SCHEMA public; GRANT ALL ON SCHEMA public TO PUBLIC;'
|
||||||
|
|
||||||
|
# Sanity: confirm audit_events is actually gone before restore. A
|
||||||
|
# regression here (e.g. DROP SCHEMA silently no-op) would let the
|
||||||
|
# verifier "succeed" by reading the original rows, making the test
|
||||||
|
# false-pass.
|
||||||
|
PRE_RESTORE_TABLES=$(psql -tAc "SELECT COUNT(*) FROM information_schema.tables WHERE table_schema='public'")
|
||||||
|
if [ "${PRE_RESTORE_TABLES}" -ne 0 ]; then
|
||||||
|
fail "post-DROP SCHEMA, expected 0 public tables; saw ${PRE_RESTORE_TABLES}"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
# Phase 4 — restore.
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
step "phase 4 — pg_restore"
|
||||||
|
pg_restore --dbname="${PGDATABASE}" --no-owner --no-acl --exit-on-error "${WORKDIR}/backup.dump"
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
# Phase 5 — verify: re-snapshot, run audit_events_verify_chain().
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
step "phase 5 — verify (audit_events_verify_chain() + snapshot)"
|
||||||
|
"${WORKDIR}/smoke" \
|
||||||
|
--mode=verify \
|
||||||
|
--db-url="${DB_URL}" \
|
||||||
|
| tee "${WORKDIR}/post.json"
|
||||||
|
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
# Phase 6 — assert.
|
||||||
|
#
|
||||||
|
# pre.row_count == post.row_count
|
||||||
|
# pre.chain_head_hash == post.chain_head_hash (BYTE-EXACT)
|
||||||
|
# post.first_break_id == "" (verifier clean)
|
||||||
|
# post.verifier_walked == pre.row_count (every row walked)
|
||||||
|
#
|
||||||
|
# Use python3 rather than jq so the script runs unchanged on macOS
|
||||||
|
# without an extra Homebrew install.
|
||||||
|
# ----------------------------------------------------------------------
|
||||||
|
step "phase 6 — assertions"
|
||||||
|
python3 - <<'PY' "${WORKDIR}/pre.json" "${WORKDIR}/post.json"
|
||||||
|
import json, sys
|
||||||
|
|
||||||
|
pre = json.load(open(sys.argv[1]))
|
||||||
|
post = json.load(open(sys.argv[2]))
|
||||||
|
|
||||||
|
def bail(msg):
|
||||||
|
print(f"::error::backup-restore-smoke: {msg}", file=sys.stderr)
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
if pre["row_count"] != post["row_count"]:
|
||||||
|
bail(f"row_count mismatch: pre={pre['row_count']} post={post['row_count']}")
|
||||||
|
|
||||||
|
if pre["chain_head_hash"] != post["chain_head_hash"]:
|
||||||
|
bail(
|
||||||
|
"chain_head_hash mismatch — pg_dump/pg_restore did NOT round-trip the "
|
||||||
|
"audit_events hash chain byte-for-byte. "
|
||||||
|
f"pre={pre['chain_head_hash']} post={post['chain_head_hash']}"
|
||||||
|
)
|
||||||
|
|
||||||
|
if post.get("first_break_id", "") != "":
|
||||||
|
bail(
|
||||||
|
"audit_events_verify_chain() reports a break post-restore at id="
|
||||||
|
f"{post['first_break_id']} pos={post.get('first_break_pos', '?')} — "
|
||||||
|
"the chain is no longer self-consistent after the restore."
|
||||||
|
)
|
||||||
|
|
||||||
|
if post.get("verifier_walked", -1) != pre["row_count"]:
|
||||||
|
bail(
|
||||||
|
f"verifier_walked={post.get('verifier_walked')} != pre.row_count="
|
||||||
|
f"{pre['row_count']} — verifier short-circuited or read stale rows."
|
||||||
|
)
|
||||||
|
|
||||||
|
print(
|
||||||
|
f"PASS rows={pre['row_count']} "
|
||||||
|
f"chain_head={pre['chain_head_hash'][:16]}… "
|
||||||
|
f"verifier=clean"
|
||||||
|
)
|
||||||
|
PY
|
||||||
@@ -0,0 +1,222 @@
|
|||||||
|
// Copyright 2026 certctl LLC. All rights reserved.
|
||||||
|
// SPDX-License-Identifier: BUSL-1.1
|
||||||
|
|
||||||
|
// Command backupsmoke is the workload+verifier half of the
|
||||||
|
// backup/restore CI gate (acquisition-audit DEPL-005 + DATA-012
|
||||||
|
// closure, Sprint 4 ACQ, 2026-05-16).
|
||||||
|
//
|
||||||
|
// The companion shell harness `deploy/test/backup-restore-smoke.sh`
|
||||||
|
// orchestrates the dump/drop/restore lifecycle around two
|
||||||
|
// invocations of this program: one before the backup
|
||||||
|
// (--mode=workload) and one after the restore (--mode=verify). Both
|
||||||
|
// emit a small JSON snapshot to stdout; the shell harness diffs them
|
||||||
|
// and asserts the chain head + row count round-trip byte-for-byte.
|
||||||
|
//
|
||||||
|
// Modes
|
||||||
|
// =====
|
||||||
|
//
|
||||||
|
// --mode=workload
|
||||||
|
// Run all up-migrations against `--migrations-path`, then
|
||||||
|
// generate `--rows` (default 24) audit_events rows representing
|
||||||
|
// an issue / renew / revoke / auth-login cycle. Emit a snapshot
|
||||||
|
// with the post-workload row_count + chain head row_hash.
|
||||||
|
//
|
||||||
|
// --mode=verify
|
||||||
|
// Run `audit_events_verify_chain()` (the per-row hash-chain
|
||||||
|
// verifier installed by migration 000047) and capture
|
||||||
|
// first_break_id / first_break_pos / verifier_walked. Emit a
|
||||||
|
// snapshot with row_count + chain head row_hash + verifier
|
||||||
|
// output. No mutations.
|
||||||
|
//
|
||||||
|
// The CI assertion contract
|
||||||
|
// =========================
|
||||||
|
//
|
||||||
|
// After (workload → pg_dump -Fc → DROP + CREATE → pg_restore →
|
||||||
|
// verify), the shell asserts:
|
||||||
|
//
|
||||||
|
// pre.row_count == post.row_count
|
||||||
|
// pre.chain_head_hash == post.chain_head_hash (byte-exact)
|
||||||
|
// post.first_break_id == "" (verifier clean)
|
||||||
|
//
|
||||||
|
// A pg_dump format-quirk that didn't preserve TIMESTAMPTZ
|
||||||
|
// microseconds would surface as a chain-head mismatch (the
|
||||||
|
// canonical payload re-formats `timestamp AT TIME ZONE 'UTC'` to
|
||||||
|
// microsecond ISO-8601 — any precision loss breaks the hash). A
|
||||||
|
// trigger-or-function regression would surface as a verifier non-
|
||||||
|
// empty first_break_id. The test exists to PROVE these properties
|
||||||
|
// under a real workload, not to defend against a known quirk.
|
||||||
|
package main
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"database/sql"
|
||||||
|
"encoding/json"
|
||||||
|
"flag"
|
||||||
|
"fmt"
|
||||||
|
"log"
|
||||||
|
"os"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
_ "github.com/lib/pq"
|
||||||
|
|
||||||
|
"github.com/certctl-io/certctl/internal/repository/postgres"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Snapshot is the JSON document this program writes to stdout, once per
// invocation. The shell harness saves the workload-mode and verify-mode
// documents to files and compares their fields with python3, so the JSON
// key names below are a contract: renaming one must land together with a
// matching change in deploy/test/backup-restore-smoke.sh.
type Snapshot struct {
	// Phase is the mode that produced the snapshot ("workload" or "verify").
	Phase string `json:"phase"`
	// RowCount is COUNT(*) over audit_events.
	RowCount int `json:"row_count"`
	// ChainHead is audit_chain_head.row_hash for the row with id = 1.
	ChainHead string `json:"chain_head_hash"`

	// The fields below carry audit_events_verify_chain() output and are
	// only populated when the verifier is run; omitempty drops them from
	// the JSON whenever they hold their zero value.
	FirstBreakID   string `json:"first_break_id,omitempty"`
	FirstBreakPos  int    `json:"first_break_pos,omitempty"`
	VerifierWalked int    `json:"verifier_walked,omitempty"`
}
|
||||||
|
|
||||||
|
func main() {
|
||||||
|
var (
|
||||||
|
mode = flag.String("mode", "", "workload | verify")
|
||||||
|
dbURL = flag.String("db-url", os.Getenv("DATABASE_URL"), "Postgres URL (or set DATABASE_URL)")
|
||||||
|
migrationsPath = flag.String("migrations-path", "./migrations", "Path to the migrations/ directory (workload mode only)")
|
||||||
|
rows = flag.Int("rows", 24, "Number of audit_events rows to insert (workload mode only)")
|
||||||
|
)
|
||||||
|
flag.Parse()
|
||||||
|
|
||||||
|
if *dbURL == "" {
|
||||||
|
log.Fatal("--db-url or DATABASE_URL is required")
|
||||||
|
}
|
||||||
|
if *mode == "" {
|
||||||
|
log.Fatal("--mode is required (workload | verify)")
|
||||||
|
}
|
||||||
|
|
||||||
|
db, err := sql.Open("postgres", *dbURL)
|
||||||
|
if err != nil {
|
||||||
|
log.Fatalf("sql.Open: %v", err)
|
||||||
|
}
|
||||||
|
defer db.Close()
|
||||||
|
|
||||||
|
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Minute)
|
||||||
|
defer cancel()
|
||||||
|
if err := db.PingContext(ctx); err != nil {
|
||||||
|
log.Fatalf("ping: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
switch *mode {
|
||||||
|
case "workload":
|
||||||
|
// Run all up-migrations end-to-end. The trigger + verifier
|
||||||
|
// function installed by migration 000047 must be in place
|
||||||
|
// before the inserts below; partial migration would mask a
|
||||||
|
// real bug.
|
||||||
|
if err := postgres.RunMigrations(db, *migrationsPath); err != nil {
|
||||||
|
log.Fatalf("RunMigrations(%s): %v", *migrationsPath, err)
|
||||||
|
}
|
||||||
|
if err := runWorkload(ctx, db, *rows); err != nil {
|
||||||
|
log.Fatalf("runWorkload: %v", err)
|
||||||
|
}
|
||||||
|
snap, err := snapshot(ctx, db, "workload", false)
|
||||||
|
if err != nil {
|
||||||
|
log.Fatalf("snapshot: %v", err)
|
||||||
|
}
|
||||||
|
emit(snap)
|
||||||
|
case "verify":
|
||||||
|
snap, err := snapshot(ctx, db, "verify", true)
|
||||||
|
if err != nil {
|
||||||
|
log.Fatalf("snapshot: %v", err)
|
||||||
|
}
|
||||||
|
emit(snap)
|
||||||
|
default:
|
||||||
|
log.Fatalf("unknown --mode=%q (workload | verify)", *mode)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// runWorkload inserts n audit_events rows representing an
|
||||||
|
// issue / renew / revoke / auth-login cycle. Patterns mirror the
|
||||||
|
// shape the application emits (see internal/service/audit_*.go),
|
||||||
|
// so the canonical payload exercised here is representative.
|
||||||
|
//
|
||||||
|
// event_category is omitted on each INSERT — migration 000032 gave
|
||||||
|
// the column DEFAULT 'cert_lifecycle', which is also the value the
|
||||||
|
// application uses for cert lifecycle events. Auth rows get the
|
||||||
|
// default too, which is harmless for the round-trip property under
|
||||||
|
// test (only the canonical-payload byte sequence matters).
|
||||||
|
//
|
||||||
|
// Timestamps are monotonic via the `NOW() + ($interval ||
|
||||||
|
// ' microsecond')::interval` pattern from
|
||||||
|
// internal/repository/postgres/audit_chain_test.go — ordering
|
||||||
|
// determinism is necessary for the chain head to be stable across
|
||||||
|
// runs.
|
||||||
|
func runWorkload(ctx context.Context, db *sql.DB, n int) error {
|
||||||
|
actions := []struct{ act, resType, resID string }{
|
||||||
|
{"certificate.issue", "certificate", "mc-smoke"},
|
||||||
|
{"certificate.renew", "certificate", "mc-smoke"},
|
||||||
|
{"certificate.revoke", "certificate", "mc-smoke"},
|
||||||
|
{"auth.login", "session", "sess-smoke"},
|
||||||
|
}
|
||||||
|
for i := 0; i < n; i++ {
|
||||||
|
a := actions[i%len(actions)]
|
||||||
|
id := fmt.Sprintf("audit-smoke-%04d", i)
|
||||||
|
_, err := db.ExecContext(ctx, `
|
||||||
|
INSERT INTO audit_events (
|
||||||
|
id, actor, actor_type, action,
|
||||||
|
resource_type, resource_id, details, timestamp
|
||||||
|
)
|
||||||
|
VALUES (
|
||||||
|
$1, 'smoke-actor', 'User', $2,
|
||||||
|
$3, $4, '{}'::jsonb,
|
||||||
|
NOW() + ($5 || ' microsecond')::interval
|
||||||
|
)
|
||||||
|
`, id, a.act, a.resType, a.resID, fmt.Sprintf("%d", i))
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("insert row %d (%s): %w", i, id, err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// snapshot reads the chain head + row count, optionally invoking
|
||||||
|
// the on-demand verifier. Verifier output goes in three additional
|
||||||
|
// fields so the workload-side snapshot can omit them via the
|
||||||
|
// `omitempty` tag.
|
||||||
|
func snapshot(ctx context.Context, db *sql.DB, phase string, runVerifier bool) (*Snapshot, error) {
|
||||||
|
s := &Snapshot{Phase: phase}
|
||||||
|
|
||||||
|
if err := db.QueryRowContext(ctx, `SELECT COUNT(*) FROM audit_events`).Scan(&s.RowCount); err != nil {
|
||||||
|
return nil, fmt.Errorf("count(audit_events): %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := db.QueryRowContext(ctx, `SELECT row_hash FROM audit_chain_head WHERE id = 1`).Scan(&s.ChainHead); err != nil {
|
||||||
|
return nil, fmt.Errorf("read audit_chain_head: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if runVerifier {
|
||||||
|
var brokenID sql.NullString
|
||||||
|
var brokenPos, walked int
|
||||||
|
err := db.QueryRowContext(ctx, `
|
||||||
|
SELECT first_break_id, first_break_pos, row_count
|
||||||
|
FROM audit_events_verify_chain()
|
||||||
|
`).Scan(&brokenID, &brokenPos, &walked)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("audit_events_verify_chain(): %w", err)
|
||||||
|
}
|
||||||
|
if brokenID.Valid {
|
||||||
|
s.FirstBreakID = brokenID.String
|
||||||
|
}
|
||||||
|
s.FirstBreakPos = brokenPos
|
||||||
|
s.VerifierWalked = walked
|
||||||
|
}
|
||||||
|
|
||||||
|
return s, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// emit pretty-prints the snapshot to stdout. The trailing newline
|
||||||
|
// from json.Encoder is the right shape for both shell `tee` and
|
||||||
|
// python3 stdin handling.
|
||||||
|
func emit(s *Snapshot) {
|
||||||
|
enc := json.NewEncoder(os.Stdout)
|
||||||
|
enc.SetIndent("", " ")
|
||||||
|
if err := enc.Encode(s); err != nil {
|
||||||
|
log.Fatalf("encode snapshot: %v", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -82,7 +82,17 @@ ARG LIBEST_REF
|
|||||||
# is the same major version libest r3.2.0 was tested against. libest
|
# is the same major version libest r3.2.0 was tested against. libest
|
||||||
# also wants libcurl + libsafec; we install both via apt rather than
|
# also wants libcurl + libsafec; we install both via apt rather than
|
||||||
# building from source for reproducibility.
|
# building from source for reproducibility.
|
||||||
RUN apt-get update && apt-get install --no-install-recommends -y \
|
#
|
||||||
|
# Hotfix #18 (2026-05-14): wrap in a 3-retry loop with --fix-missing
|
||||||
|
# fallback to absorb transient Debian mirror flakes. The original
|
||||||
|
# unwrapped apt-get install failed CI run #N on a "Connection reset
|
||||||
|
# by peer" mid-fetch of libssh2-1 from fastly's debian.org mirror at
|
||||||
|
# 151.101.202.132. Mirrors flake; production-grade Dockerfiles wrap
|
||||||
|
# network ops in retry. Same pattern as the main Dockerfile's npm-ci
|
||||||
|
# 3-retry loop from Hotfix #9.
|
||||||
|
RUN for i in 1 2 3; do \
|
||||||
|
apt-get update && \
|
||||||
|
apt-get install --no-install-recommends -y --fix-missing \
|
||||||
autoconf \
|
autoconf \
|
||||||
automake \
|
automake \
|
||||||
build-essential \
|
build-essential \
|
||||||
@@ -92,6 +102,10 @@ RUN apt-get update && apt-get install --no-install-recommends -y \
|
|||||||
libssl-dev \
|
libssl-dev \
|
||||||
libtool \
|
libtool \
|
||||||
pkg-config \
|
pkg-config \
|
||||||
|
&& break; \
|
||||||
|
echo "apt-get install attempt $i/3 failed; sleeping 5s before retry"; \
|
||||||
|
sleep 5; \
|
||||||
|
done \
|
||||||
&& rm -rf /var/lib/apt/lists/*
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
WORKDIR /src
|
WORKDIR /src
|
||||||
@@ -172,13 +186,22 @@ RUN git clone --depth 1 --branch ${LIBEST_REF} https://github.com/cisco/libest.g
|
|||||||
# Pinned to the same digest as the builder above (Bundle A / H-001).
|
# Pinned to the same digest as the builder above (Bundle A / H-001).
|
||||||
FROM debian:bullseye-slim@sha256:1a4701c321b1d28b1ff5f0230e766791e4b79b1d4c6c7a70064f4b297b1a330f
|
FROM debian:bullseye-slim@sha256:1a4701c321b1d28b1ff5f0230e766791e4b79b1d4c6c7a70064f4b297b1a330f
|
||||||
|
|
||||||
RUN apt-get update && apt-get install --no-install-recommends -y \
|
# Hotfix #18 (2026-05-14): same 3-retry pattern as the builder stage
|
||||||
|
# above. Runtime image installs are also vulnerable to transient
|
||||||
|
# mirror flakes.
|
||||||
|
# Install the runtime packages with up to three attempts to absorb
# transient mirror failures, then create the unprivileged estuser.
#
# Success is tracked explicitly in $ok. A bare `for ...; do ... && break;
# echo; sleep 5; done && next` does NOT fail when every attempt fails: the
# loop's exit status is that of the last command it ran (`sleep 5`, i.e.
# 0), so the build would carry on and produce an image without the
# packages. Here the build aborts if no attempt succeeded.
RUN ok=0; \
    for i in 1 2 3; do \
        if apt-get update && \
           apt-get install --no-install-recommends -y --fix-missing \
               bash \
               ca-certificates \
               curl \
               libcurl4 \
               libssl1.1 \
               openssl; then \
            ok=1; \
            break; \
        fi; \
        echo "apt-get install attempt $i/3 failed; sleeping 5s before retry"; \
        sleep 5; \
    done; \
    if [ "$ok" -ne 1 ]; then \
        echo "apt-get install failed after 3 attempts" >&2; \
        exit 1; \
    fi; \
    rm -rf /var/lib/apt/lists/* \
    && useradd --create-home --uid 1000 estuser
|
||||||
|
|
||||||
|
|||||||
@@ -0,0 +1,52 @@
|
|||||||
|
# loadtest-artifacts/
|
||||||
|
|
||||||
|
> Last reviewed: 2026-05-16
|
||||||
|
|
||||||
|
Long-term archive of k6 load-test results from the `loadtest` GitHub
|
||||||
|
Actions workflow. TEST-005 closure (Sprint 5, 2026-05-16) introduces
|
||||||
|
this directory as the committed home for captures the operator
|
||||||
|
chooses to retain past GitHub's 90-day artifact-retention window.
|
||||||
|
|
||||||
|
## What lands here
|
||||||
|
|
||||||
|
After a `loadtest` workflow_dispatch run, follow the procedure in
|
||||||
|
[`docs/operator/scale-baseline-2026-Q2.md`](../../../docs/operator/scale-baseline-2026-Q2.md#capture-procedure):
|
||||||
|
|
||||||
|
1. Download the three matrix-leg artifacts from the workflow page.
|
||||||
|
2. Update the latest-capture table in the baseline doc with the
|
||||||
|
extracted percentiles.
|
||||||
|
3. Commit the raw artifacts you want long-term-retained here, named:
|
||||||
|
|
||||||
|
```
|
||||||
|
2026-Q2-bulk-renewal-<run-id>.tar.gz
|
||||||
|
2026-Q2-acme-burst-<run-id>.tar.gz
|
||||||
|
2026-Q2-agent-storm-<run-id>.tar.gz
|
||||||
|
```
|
||||||
|
|
||||||
|
4. If any single archive exceeds 100 MB, route it through `git lfs`
|
||||||
|
(configured at repo root via `.gitattributes`).
|
||||||
|
|
||||||
|
## Why commit artifacts rather than rely on GHA retention
|
||||||
|
|
||||||
|
- **GitHub Actions retains workflow artifacts for 90 days by default.**
|
||||||
|
Acquisition-diligence reviewers looking at scale evidence months
|
||||||
|
later get a 404 unless we keep the raw NDJSON in tree.
|
||||||
|
- **Reproducibility.** Pinning the k6 NDJSON to a SHA makes it
|
||||||
|
cheap to re-derive percentiles with a different filter (e.g.
|
||||||
|
"p99 excluding the warmup ramp's first 30 seconds") without
|
||||||
|
re-running the workflow.
|
||||||
|
|
||||||
|
## What does NOT belong here
|
||||||
|
|
||||||
|
- **Per-PR ephemeral runs.** The `loadtest` workflow runs on
|
||||||
|
workflow_dispatch + weekly cron; per-PR runs would be too noisy
|
||||||
|
and aren't retained.
|
||||||
|
- **Production-environment captures.** These artifacts are the
|
||||||
|
ubuntu-latest reference baseline. An operator capturing their
|
||||||
|
production-environment scale should put the artifacts in their
|
||||||
|
own observability platform — committing them here would imply
|
||||||
|
"this is what certctl's reference numbers are" which it isn't.
|
||||||
|
- **Manual k6 captures from a developer's laptop.** Same rationale
|
||||||
|
as the visual-regression snapshot runbook
|
||||||
|
([`docs/operator/runbooks/e2e-snapshot-update.md`](../../../docs/operator/runbooks/e2e-snapshot-update.md))
|
||||||
|
— only the CI environment produces canonical numbers.
|
||||||
@@ -55,6 +55,29 @@ This is the load-bearing two-person-integrity contract. Pinned by:
|
|||||||
- `internal/service/approval_test.go::TestApproval_Approve_RejectsSameActor` — service-level pin.
|
- `internal/service/approval_test.go::TestApproval_Approve_RejectsSameActor` — service-level pin.
|
||||||
- `internal/api/handler/approval_test.go::TestApproval_HandlerApproveAsSameActor_Returns403` — handler-level pin (HTTP 403 + body contains "two-person integrity").
|
- `internal/api/handler/approval_test.go::TestApproval_HandlerApproveAsSameActor_Returns403` — handler-level pin (HTTP 403 + body contains "two-person integrity").
|
||||||
|
|
||||||
|
## Enforcement invariants (COMP-006 closure)
|
||||||
|
|
||||||
|
Acquisition-audit COMP-006 closure (Sprint 7 ACQ, 2026-05-16). The audit flagged COMP-006 as UNKNOWN because it couldn't independently verify that the approval workflow was airtight — i.e., that a denied approval definitely results in NO certificate being signed, and an approved approval definitely lets the issuance proceed. This subsection documents the enforcement chain end-to-end and names the tests that pin each layer.
|
||||||
|
|
||||||
|
**Layer 1 — Issuance gate.** `internal/service/certificate.go::CertificateService.Create` (around L341-373) reads `CertificateProfile.RequiresApproval`. When true, the created Job is stamped `JobStatusAwaitingApproval` (not `Pending`), AND a parallel `ApprovalRequest` row is created. The job processor never touches `AwaitingApproval` rows.
|
||||||
|
|
||||||
|
**Layer 2 — Approval state machine.** `internal/service/approval.go::ApprovalService.Reject` and `Approve` flip the approval row + the job row atomically:
|
||||||
|
|
||||||
|
- `Reject` → approval=`Rejected`, job=`Cancelled` (pinned by `internal/service/approval_test.go::TestApproval_Reject_TransitionsJobFromAwaitingApprovalToCancelled`).
|
||||||
|
- `Approve` → approval=`Approved`, job=`Pending` (pinned by `TestApproval_Approve_TransitionsJobFromAwaitingApprovalToPending`).
|
||||||
|
|
||||||
|
The "already terminal" guard (pinned by `TestApproval_Approve_RejectsAlreadyDecided`) prevents a rejected approval from later being flipped to approved.
|
||||||
|
|
||||||
|
**Layer 3 — Job claim filter (the LOAD-BEARING SQL invariant).** `internal/repository/postgres/job.go::JobRepository.ClaimPendingJobs` (around L296-310) issues:
|
||||||
|
|
||||||
|
```sql
|
||||||
|
SELECT ... FROM jobs WHERE status = $1
|
||||||
|
```
|
||||||
|
|
||||||
|
with `$1 = JobStatusPending`. Cancelled jobs are therefore **never** returned to `ProcessPendingJobs`, so the certificate-issuance call path (the only path that signs certs) is unreachable for a denied approval. This SQL filter is the load-bearing "no cert if denied" enforcement — Layer 2 transitions the job to `Cancelled`, Layer 3 ensures `Cancelled` jobs are inert.
|
||||||
|
|
||||||
|
**Composition pin.** `internal/service/approval_test.go::TestApproval_COMP006_DenyChainPinsNoCertIfRejected` and `TestApproval_COMP006_ApproveChainPinsJobReachesPending` re-attest the Layer-2-to-Layer-3 handoff in a single named test pair for future auditors. A refactor that, e.g., silently transitioned a denied approval's job to `Pending` instead of `Cancelled` would trip these tests before shipping.
|
||||||
|
|
||||||
## Operator playbook: "I need to approve a renewal"
|
## Operator playbook: "I need to approve a renewal"
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
|
|||||||
@@ -0,0 +1,161 @@
|
|||||||
|
# Audit-trail tamper-evidence (audit_events hash chain)
|
||||||
|
|
||||||
|
> Last reviewed: 2026-05-16
|
||||||
|
|
||||||
|
Sprint 6 COMP-001-HASH closure. The `audit_events` table has two
|
||||||
|
layered defenses against history rewrites:
|
||||||
|
|
||||||
|
| Layer | Migration | What it blocks |
|
||||||
|
|---|---|---|
|
||||||
|
| **WORM trigger** | `000018_audit_events_worm.up.sql` | The application role cannot `UPDATE` or `DELETE` rows (tamper-**prevention**). |
|
||||||
|
| **Hash chain** | `000047_audit_events_hash_chain.up.sql` | A compliance superuser (DB-superuser-equivalent) who bypasses the WORM trigger CAN still rewrite rows, but the rewrite is **detectable** — every subsequent `audit_events_verify_chain()` walk reports the first broken row's id + position (tamper-**evidence**). |
|
||||||
|
|
||||||
|
This document covers the hash-chain layer. The WORM layer is
|
||||||
|
documented inline in `migrations/000018_audit_events_worm.up.sql`.
|
||||||
|
|
||||||
|
## Why a hash chain in addition to WORM
|
||||||
|
|
||||||
|
The WORM trigger documents (in its header comment) that a compliance
|
||||||
|
superuser role exists by design — backup-restore, retention purges,
|
||||||
|
and breach-recovery operators need a way through. Without a hash
|
||||||
|
chain, that role can rewrite any row's `actor` / `action` / `details`
|
||||||
|
content with no on-disk trace.
|
||||||
|
|
||||||
|
HIPAA §164.312(b), FedRAMP AU-9, and NIST 800-53 AU-10 want
|
||||||
|
tamper-**evidence**, not just tamper-prevention. The hash chain
|
||||||
|
provides it: every row carries a `row_hash = sha256(prev_hash || id
|
||||||
|
|| actor || actor_type || action || resource_type || resource_id
|
||||||
|
|| details::text || timestamp_iso8601_utc || event_category)`, and
|
||||||
|
the genesis row's `prev_hash` is `NULL`. Mutating any field in any
|
||||||
|
row breaks the chain at that row's position; the verifier returns
|
||||||
|
the first break.
|
||||||
|
|
||||||
|
## The verifier function
|
||||||
|
|
||||||
|
`audit_events_verify_chain()` is a STABLE plpgsql function shipped
|
||||||
|
in migration 000047. It walks every row in `(timestamp ASC, id ASC)`
|
||||||
|
order, recomputes each row's expected hash, and returns:
|
||||||
|
|
||||||
|
```
|
||||||
|
first_break_id TEXT -- NULL if the chain validated end-to-end
|
||||||
|
first_break_pos INT -- 0-indexed position of the first break
|
||||||
|
row_count INT -- rows walked (= position + 1 on break, else table size)
|
||||||
|
```
|
||||||
|
|
||||||
|
Call it directly from psql:
|
||||||
|
|
||||||
|
```sql
|
||||||
|
SELECT first_break_id, first_break_pos, row_count FROM audit_events_verify_chain();
|
||||||
|
```
|
||||||
|
|
||||||
|
## Scheduled verification + Prometheus exposure
|
||||||
|
|
||||||
|
The scheduler's `auditChainVerifyLoop` calls the verifier every
|
||||||
|
`CERTCTL_AUDIT_CHAIN_VERIFY_INTERVAL` (default 6h) and writes the
|
||||||
|
results into the `AuditChainCounter` instance shared with the
|
||||||
|
metrics handler. Four metrics get exposed at
|
||||||
|
`/api/v1/metrics/prometheus`:
|
||||||
|
|
||||||
|
| Metric | Type | Meaning |
|
||||||
|
|---|---|---|
|
||||||
|
| `certctl_audit_chain_break_detected_total` | counter | Sticky once non-zero — the actionable alarm. |
|
||||||
|
| `certctl_audit_chain_verify_total` | counter | Walks completed. Cross-check that the loop is alive. |
|
||||||
|
| `certctl_audit_chain_rows` | gauge | Most recent walk's row count. |
|
||||||
|
| `certctl_audit_chain_last_verified_at` | gauge | Unix seconds of most recent walk (0 = never). |
|
||||||
|
|
||||||
|
The recommended alert rule is:
|
||||||
|
|
||||||
|
```
|
||||||
|
ALERT AuditChainBreak
|
||||||
|
IF certctl_audit_chain_break_detected_total > 0
|
||||||
|
FOR 1m
|
||||||
|
LABELS { severity = "page", category = "compliance" }
|
||||||
|
ANNOTATIONS {
|
||||||
|
summary = "audit_events hash chain break detected — investigate immediately",
|
||||||
|
runbook = "<your-runbook-url>/audit-chain-break"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Cross-check `certctl_audit_chain_last_verified_at` (should advance
|
||||||
|
roughly every `CERTCTL_AUDIT_CHAIN_VERIFY_INTERVAL`) and
|
||||||
|
`certctl_audit_chain_verify_total` (should increment monotonically).
|
||||||
|
A stalled `_verified_at` with an unchanged `_verify_total` means the
|
||||||
|
scheduler loop has died — page on that too.
|
||||||
|
|
||||||
|
## Performance notes
|
||||||
|
|
||||||
|
The walk is `O(N)` plpgsql over the `audit_events` table. On
|
||||||
|
testcontainers + postgres:16-alpine the cost scales linearly:
|
||||||
|
|
||||||
|
| Row count | Walk duration (approx) |
|
||||||
|
|---|---|
|
||||||
|
| 10k | < 50 ms |
|
||||||
|
| 100k | < 500 ms |
|
||||||
|
| 1M | 2-3 s |
|
||||||
|
| 10M | 25-30 s |
|
||||||
|
|
||||||
|
A 5-minute per-tick context timeout (in
|
||||||
|
`internal/scheduler/scheduler.go::runAuditChainVerify`) bounds the
|
||||||
|
worst case. Fleets with > 10M audit rows should consider:
|
||||||
|
|
||||||
|
1. Lengthening `CERTCTL_AUDIT_CHAIN_VERIFY_INTERVAL` to 24h.
|
||||||
|
2. Pre-aggregating older rows (out of scope today — would require a
|
||||||
|
"chain checkpoint" concept that re-anchors the genesis hash to a
|
||||||
|
snapshot's row_hash; future work if needed).
|
||||||
|
|
||||||
|
## What to do when a break is detected
|
||||||
|
|
||||||
|
1. **Don't panic, don't auto-remediate.** The break is a forensic
|
||||||
|
signal, not a self-healing event.
|
||||||
|
2. **Capture the position + id.** The metric exposes both, but the
|
||||||
|
sticky in-memory state (`AuditChainCounter.BrokenAtID`) only
|
||||||
|
records the first break. Query the verifier yourself to enumerate
|
||||||
|
downstream breaks:
|
||||||
|
|
||||||
|
```sql
|
||||||
|
SELECT first_break_id, first_break_pos, row_count FROM audit_events_verify_chain();
|
||||||
|
```
|
||||||
|
|
||||||
|
3. **Snapshot the table.** `pg_dump --table=audit_events --data-only`
|
||||||
|
to a chain-of-custody location. The next investigative step is
|
||||||
|
recovering the original row content from the most recent backup
|
||||||
|
that pre-dates the tampering — without this snapshot you can't
|
||||||
|
tell which write order caused the divergence.
|
||||||
|
4. **Audit the compliance-superuser credential trail.** The break
|
||||||
|
implies someone with non-app DB credentials wrote to
|
||||||
|
`audit_events`. Rotate the credential, investigate every recent
|
||||||
|
session that authenticated under it, and review the WAL for the
|
||||||
|
write.
|
||||||
|
5. **Restore + cross-reference.** If you keep streaming WAL or
|
||||||
|
periodic snapshots, restore a known-good snapshot to a sandbox
|
||||||
|
and `EXCEPT`-diff the two `audit_events` tables to enumerate
|
||||||
|
every mutated row.
|
||||||
|
|
||||||
|
## Backfill behavior
|
||||||
|
|
||||||
|
Migration 000047 backfills existing `audit_events` rows in
|
||||||
|
`(timestamp ASC, id ASC)` order during its transaction. The WORM
|
||||||
|
trigger is temporarily `DISABLE`d for the duration; subsequent
|
||||||
|
`ENABLE` is a no-op equivalent. The migration is idempotent — a
|
||||||
|
re-run sees `row_hash IS NULL` rows as the only backfill targets, so
|
||||||
|
already-hashed rows are not touched.
|
||||||
|
|
||||||
|
Once backfill completes, `row_hash` becomes `NOT NULL`. `prev_hash`
|
||||||
|
remains nullable so the genesis row (first row in the chain) stays
|
||||||
|
representable.
|
||||||
|
|
||||||
|
## Operator configuration
|
||||||
|
|
||||||
|
| Env var | Default | Notes |
|
||||||
|
|---|---|---|
|
||||||
|
| `CERTCTL_AUDIT_CHAIN_VERIFY_INTERVAL` | `6h` | Tick cadence for the scheduler's verify loop. Zero or negative is ignored. |
|
||||||
|
|
||||||
|
## See also
|
||||||
|
|
||||||
|
- `migrations/000047_audit_events_hash_chain.up.sql` — migration source.
|
||||||
|
- `migrations/000018_audit_events_worm.up.sql` — paired WORM trigger.
|
||||||
|
- `internal/repository/postgres/audit_chain_test.go` — testcontainers integration tests.
|
||||||
|
- `internal/repository/postgres/audit_worm_test.go` — WORM behaviour tests.
|
||||||
|
- `internal/scheduler/scheduler.go::auditChainVerifyLoop` — scheduler loop.
|
||||||
|
- `internal/service/audit_chain_metric.go` — `AuditChainCounter`.
|
||||||
|
- `internal/api/handler/metrics.go` — Prometheus exposer.
|
||||||
@@ -300,6 +300,64 @@ constant, router-level no-rbacGate-wraps-protocol-paths).
|
|||||||
attacks where an attacker captures a logout JWT and replays it.
|
attacks where an attacker captures a logout JWT and replays it.
|
||||||
- **Cache-Control: no-store** on the response per spec §2.5.
|
- **Cache-Control: no-store** on the response per spec §2.5.
|
||||||
|
|
||||||
|
### Userinfo + BCL SSRF parity (post-SEC-001 follow-up)
|
||||||
|
|
||||||
|
The original SEC-001 closure (Sprint 1, 2026-05-16) routed two OIDC
|
||||||
|
discovery legs — `test_discovery.go` dry-run and `service.go` runtime
|
||||||
|
provider load — through `validation.SafeHTTPDialContext` via the
|
||||||
|
`SafeOIDCContext(ctx)` helper at
|
||||||
|
[`internal/auth/oidc/safehttp.go`](../../internal/auth/oidc/safehttp.go).
|
||||||
|
The acquisition-audit follow-up (2026-05-16) flagged two adjacent
|
||||||
|
call sites the sweep missed; both are now wrapped identically.
|
||||||
|
|
||||||
|
- **SEC-020 — Userinfo fallback (`fetchUserinfoGroups`).**
|
||||||
|
`internal/auth/oidc/service.go` previously called
|
||||||
|
`entry.provider.UserInfo(ctx, ts)` with the bare request context
|
||||||
|
on the userinfo-fallback leg (operator opt-in when an IdP doesn't
|
||||||
|
surface groups in the ID token). go-oidc/v3's `Provider.UserInfo`
|
||||||
|
derives its `http.Client` from `ctx` via `getClient(ctx)`
|
||||||
|
(`oidc.go:61-65`); without an override the internal `doRequest`
|
||||||
|
falls through to `http.DefaultClient` — no SSRF guard, no DNS-
|
||||||
|
rebinding re-resolve at dial time. An IdP whose discovery doc
|
||||||
|
advertises a `userinfo_endpoint` pointing at a reserved address
|
||||||
|
(loopback, link-local, cloud-metadata `169.254.169.254`; RFC 1918 is out of scope, see below) would
|
||||||
|
trigger an unguarded egress at userinfo-fetch time. Fixed by
|
||||||
|
wrapping `ctx` via `SafeOIDCContext(ctx)` before both
|
||||||
|
`oauthConfig.TokenSource` and `provider.UserInfo`. Pinned by
|
||||||
|
`TestFetchUserinfoGroups_SSRF_BlocksReservedAddress`.
|
||||||
|
|
||||||
|
- **SEC-021 — Back-channel logout discovery re-fetch.**
|
||||||
|
`internal/api/handler/auth_session_oidc_bcl.go::Verify` performs
|
||||||
|
a per-request `gooidc.NewProvider(ctx, matched.IssuerURL)` to
|
||||||
|
fetch the JWKS for verifying the BCL token's signature. Same
|
||||||
|
bare-ctx shape — an IdP whose registered `IssuerURL` resolves to
|
||||||
|
a reserved address (or that is rebinding to one at logout time)
|
||||||
|
would dial an unguarded HTTPS egress. Fixed by wrapping via
|
||||||
|
`oidcsvc.SafeOIDCContext(ctx)` before `NewProvider`. Pinned by
|
||||||
|
`TestDefaultBCLVerifier_SSRF_BlocksReservedAddress`.
|
||||||
|
|
||||||
|
- **Context-key shape (why a single wrap covers both legs).**
|
||||||
|
`gooidc.ClientContext` is implemented as
|
||||||
|
`context.WithValue(ctx, oauth2.HTTPClient, client)` (go-oidc
|
||||||
|
v3.18.0 `oidc.go:57-59`). Both go-oidc's `getClient` AND
|
||||||
|
`golang.org/x/oauth2`'s `internal.ContextClient` read the same
|
||||||
|
`oauth2.HTTPClient` key. So the single `SafeOIDCContext` wrap
|
||||||
|
covers go-oidc-driven HTTP (Provider.UserInfo, NewProvider
|
||||||
|
discovery, Verifier JWKS) AND oauth2-driven HTTP
|
||||||
|
(Config.TokenSource refresh, Config.Exchange). No additional
|
||||||
|
`context.WithValue(ctx, oauth2.HTTPClient, ...)` is required.
|
||||||
|
|
||||||
|
- **Out-of-scope: RFC 1918.** Per the `IsReservedIP` policy
|
||||||
|
documented at [`internal/validation/ssrf.go:15-32`](../../internal/validation/ssrf.go),
|
||||||
|
RFC 1918 ranges are NOT treated as reserved by the SSRF guard.
|
||||||
|
certctl is designed to manage certificates inside private
|
||||||
|
networks; filtering 10/8 + 172.16/12 + 192.168/16 would break
|
||||||
|
the primary use case. Operators on hosted IaaS who want
|
||||||
|
RFC 1918 treated as reserved can opt in via the future
|
||||||
|
`CERTCTL_BLOCK_RFC1918_OUTBOUND` toggle (see acquisition-audit
|
||||||
|
Sprint 5 RED-005). The Sprint 1 SSRF parity fix above closes
|
||||||
|
the loopback / link-local / cloud-metadata leg only.
|
||||||
|
|
||||||
### OIDC first-admin bootstrap
|
### OIDC first-admin bootstrap
|
||||||
|
|
||||||
- **Coexists with the env-var-token bootstrap path.** Both can be
|
- **Coexists with the env-var-token bootstrap path.** Both can be
|
||||||
|
|||||||
@@ -94,6 +94,46 @@ helm upgrade certctl deploy/helm/certctl/ \
|
|||||||
|
|
||||||
Postgres state survives the upgrade (the PVC is retained). The server / agent images bump per the chart's `image.tag`. See [`docs/archive/upgrades/`](../archive/upgrades/) for version-specific upgrade guidance.
|
Postgres state survives the upgrade (the PVC is retained). The server / agent images bump per the chart's `image.tag`. See [`docs/archive/upgrades/`](../archive/upgrades/) for version-specific upgrade guidance.
|
||||||
|
|
||||||
|
### 2026-05-16 — ServiceMonitor TLS default flipped (DEPL-004)
|
||||||
|
|
||||||
|
Acquisition-audit DEPL-004 closure. Pre-2026-05-16, `monitoring.serviceMonitor.tlsConfig` was empty by default and the chart template fell through to an implicit `insecureSkipVerify: true`. Post-2026-05-16, the `values.yaml` default performs real TLS verification against the chart's CA (`caFile` + `serverName`, matching the `existingSecret` mount path that the chart's Prometheus integration produces).
|
||||||
|
|
||||||
|
The new default works out of the box for the canonical install (the chart's `existingSecret` or cert-manager-emitted Secret mounted at `/etc/prometheus/secrets/certctl-ca/`):
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
# Default in values.yaml (no operator action required for the
|
||||||
|
# canonical install path).
|
||||||
|
monitoring:
|
||||||
|
serviceMonitor:
|
||||||
|
enabled: true
|
||||||
|
tlsConfig:
|
||||||
|
caFile: /etc/prometheus/secrets/certctl-ca/ca.crt
|
||||||
|
serverName: certctl-server
|
||||||
|
```
|
||||||
|
|
||||||
|
Operators whose Prometheus pod mounts the CA bundle at a different path override `caFile`:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
monitoring:
|
||||||
|
serviceMonitor:
|
||||||
|
enabled: true
|
||||||
|
tlsConfig:
|
||||||
|
caFile: /path/to/your/ca.crt
|
||||||
|
serverName: your-cert-CN
|
||||||
|
```
|
||||||
|
|
||||||
|
Operators who genuinely need `insecureSkipVerify` (demo / dev clusters) must opt in **explicitly** — blanking the `tlsConfig` block trips the chart's `{{ fail }}` guard at render time:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
monitoring:
|
||||||
|
serviceMonitor:
|
||||||
|
enabled: true
|
||||||
|
tlsConfig:
|
||||||
|
insecureSkipVerify: true
|
||||||
|
```
|
||||||
|
|
||||||
|
There is no way to inherit the pre-2026-05-16 implicit-skipVerify behavior silently. Operators with `monitoring.serviceMonitor.enabled: false` (the chart default) need no action — the template short-circuits before the `tlsConfig` block.
|
||||||
|
|
||||||
## Configuration reference
|
## Configuration reference
|
||||||
|
|
||||||
Every value is documented at `deploy/helm/certctl/values.yaml`. Common tweaks:
|
Every value is documented at `deploy/helm/certctl/values.yaml`. Common tweaks:
|
||||||
|
|||||||
+177
-53
@@ -74,22 +74,55 @@ metric surface meet our SLO needs today" — not "is the right library
|
|||||||
under the hood." If the answer to the first question is yes, the
|
under the hood." If the answer to the first question is yes, the
|
||||||
second is a refactor, not a feature gap.
|
second is a refactor, not a feature gap.
|
||||||
|
|
||||||
## Tracing — explicitly not yet shipped
|
## Tracing — OTLP surface available, instrumentation pending
|
||||||
|
|
||||||
certctl does **not** ship distributed tracing instrumentation today:
|
Sprint 6 ACQ DEPL-006 closure (2026-05-16) stood up the OTel tracer-
|
||||||
|
provider surface. Operators with an OTel collector can opt in via:
|
||||||
|
|
||||||
- No OpenTelemetry SDK setup in `cmd/server/main.go`.
|
```
|
||||||
- No OTLP exporter wired into outbound calls (issuer connectors,
|
CERTCTL_OTEL_ENABLED=true
|
||||||
agent enrollment, etc.).
|
OTEL_EXPORTER_OTLP_ENDPOINT=https://otel-collector.example.com:4318
|
||||||
- The `go.opentelemetry.io/otel` packages that appear in
|
```
|
||||||
[`go.mod`](../../go.mod) are indirect-only — they're transitive
|
|
||||||
dependencies of `coreos/go-oidc` and similar.
|
|
||||||
|
|
||||||
This is honest: there is no in-process tracing surface to monitor,
|
When `CERTCTL_OTEL_ENABLED` is true, `cmd/server/main.go` calls
|
||||||
correlate, or sample. If your environment requires end-to-end traces
|
`internal/observability.Init` which:
|
||||||
across the certctl control plane + agents + issuer backends, this is
|
|
||||||
a gap you would close on the certctl side as part of a v3 work item.
|
- Constructs an OTLP/HTTP exporter (chosen over OTLP/gRPC to keep
|
||||||
Until then:
|
the dependency surface narrow — see `internal/observability/otel.go`
|
||||||
|
header for the transport-choice rationale).
|
||||||
|
- Registers a real `sdktrace.TracerProvider` as the otel global.
|
||||||
|
- Honors the standard OTel env vars (`OTEL_EXPORTER_OTLP_ENDPOINT`,
|
||||||
|
`OTEL_EXPORTER_OTLP_HEADERS`, `OTEL_EXPORTER_OTLP_INSECURE`,
|
||||||
|
`OTEL_SERVICE_NAME` overrides the default `certctl-server`, etc.).
|
||||||
|
- Defers a graceful shutdown that flushes the in-flight batcher.
|
||||||
|
|
||||||
|
What this **does not** ship yet:
|
||||||
|
|
||||||
|
- No per-handler / per-DB / per-connector span instrumentation in
|
||||||
|
the certctl code base. The OTel SDK emits the spans it generates
|
||||||
|
internally (process resource attributes, eventual stdlib HTTP
|
||||||
|
spans), but certctl-domain spans (issuance, renewal, deployment,
|
||||||
|
agent enrollment) are a v2.3 roadmap follow-up.
|
||||||
|
- No tracing-correlated metric exemplars in the Prometheus
|
||||||
|
histograms above. Those still ship the per-issuer latency signal
|
||||||
|
without per-request fan-out.
|
||||||
|
- No backwards-compat shim — operators who never set
|
||||||
|
`CERTCTL_OTEL_ENABLED` (the default) see zero behavior change.
|
||||||
|
The init returns a no-op shutdown so the deferred call is safe
|
||||||
|
to invoke unconditionally.
|
||||||
|
|
||||||
|
When this matters today:
|
||||||
|
|
||||||
|
- Operators wiring up a v3 instrumentation effort have the OTel
|
||||||
|
surface in place; they only need to add `tracer.Start(ctx, "…")`
|
||||||
|
call sites in the handler/service code.
|
||||||
|
- Operators evaluating certctl for acquisition / due-diligence see
|
||||||
|
an opt-in OTel surface in the current release rather than a "v3
|
||||||
|
roadmap item" — a useful signal for buyer credibility per the
|
||||||
|
acquisition-thesis framing in `WORKSPACE-ROADMAP.md` §3.
|
||||||
|
|
||||||
|
Existing correlation surfaces stay in place until span coverage
|
||||||
|
ships:
|
||||||
|
|
||||||
- Structured logs include a `request_id` you can correlate across
|
- Structured logs include a `request_id` you can correlate across
|
||||||
the server log stream. See
|
the server log stream. See
|
||||||
@@ -99,8 +132,9 @@ Until then:
|
|||||||
same per-issuer latency signal a trace span would, just without
|
same per-issuer latency signal a trace span would, just without
|
||||||
the per-request fan-out.
|
the per-request fan-out.
|
||||||
|
|
||||||
OpenTelemetry instrumentation is tracked in
|
Per-handler / per-query / per-connector span instrumentation is
|
||||||
[WORKSPACE-ROADMAP.md](../../WORKSPACE-ROADMAP.md) as a v3 item.
|
tracked in [WORKSPACE-ROADMAP.md](../../WORKSPACE-ROADMAP.md) under
|
||||||
|
§2 (NHI / Agent Identity, Phase 4 in the path-b build plan).
|
||||||
|
|
||||||
## Logging
|
## Logging
|
||||||
|
|
||||||
@@ -121,52 +155,142 @@ explicitly scrubs the password before it reaches the audit subsystem
|
|||||||
(see [`docs/operator/auth-threat-model.md`](auth-threat-model.md) §
|
(see [`docs/operator/auth-threat-model.md`](auth-threat-model.md) §
|
||||||
"Break-glass token leak").
|
"Break-glass token leak").
|
||||||
|
|
||||||
## Rate-limit behavior under restarts and replicas
|
## Rate-limit behavior — configurable backend (memory or postgres)
|
||||||
|
|
||||||
Where rate limits exist, they are **per-process, in-memory,
|
The sliding-window-log rate limiters used across certctl's
|
||||||
reset-on-restart, and not shared across replicas**. This matters for
|
authenticated-but-shared-credential code paths (break-glass login,
|
||||||
multi-replica deployments and for any compliance posture that asks
|
OCSP per-IP, cert-export per-actor, EST per-principal, EST
|
||||||
"what limits apply globally vs per-pod."
|
failed-basic source-IP) carry a **configurable backend**. The
|
||||||
|
operator picks between two implementations via
|
||||||
|
`CERTCTL_RATE_LIMIT_BACKEND`:
|
||||||
|
|
||||||
|
| Value | When to use |
|
||||||
|
|------------|------------------------------------------------------|
|
||||||
|
| `memory` | Default. Single-replica deploys; sketchpad / dev. |
|
||||||
|
| `postgres` | HA deploys (`server.replicas > 1`). Cross-replica-consistent. |
|
||||||
|
|
||||||
|
Phase 13 Sprint 13.2/13.3 (architecture diligence audit ARCH-M1
|
||||||
|
closure) replaced the prior single-process limitation with a
|
||||||
|
substantive close: when the operator opts into `postgres`, all
|
||||||
|
replicas share the same
|
||||||
|
`rate_limit_buckets` table (migration 000046) and per-key access is
|
||||||
|
arbitrated via `SELECT FOR UPDATE` row locks. A 3-replica cluster
|
||||||
|
hitting one rate-limited endpoint concurrently sees exactly the
|
||||||
|
configured cap succeed across the cluster — not 3× the cap as the
|
||||||
|
old per-process backend would have allowed.
|
||||||
|
|
||||||
|
### Operator decision tree
|
||||||
|
|
||||||
|
```
|
||||||
|
Single replica (server.replicas = 1, the helm chart default)?
|
||||||
|
└─ Use CERTCTL_RATE_LIMIT_BACKEND=memory (the default; no action
|
||||||
|
required). Bucket lookups stay in-process; zero DB round-trips
|
||||||
|
on the hot path.
|
||||||
|
|
||||||
|
Two or more replicas?
|
||||||
|
└─ Use CERTCTL_RATE_LIMIT_BACKEND=postgres. Two extra DB round-trips
|
||||||
|
per Allow call (BEGIN ... SELECT FOR UPDATE ... UPDATE ... COMMIT);
|
||||||
|
acceptable on the gated hot path. The Sprint 13.2 multi-replica
|
||||||
|
integration test pins exactly-cap enforcement across N replicas
|
||||||
|
as the closure proof.
|
||||||
|
```
|
||||||
|
|
||||||
### Inventory
|
### Inventory
|
||||||
|
|
||||||
| Limiter | Scope | Window | Cap | Survives restart? | Shared across replicas? |
|
| Limiter | Scope | Window | Cap |
|
||||||
|---|---|---|---|---|---|
|
|---|---|---|---|
|
||||||
| Break-glass login (per source-IP) | `internal/api/handler/auth_breakglass.go` | 60s | 5 attempts | No | No |
|
| Break-glass login (per source-IP) | `internal/api/handler/auth_breakglass.go` | 60s | 5 attempts |
|
||||||
| SCEP/Intune per-device challenge | `internal/scep/intune/` | 60s | configurable (`*_PER_MINUTE`) | No | No |
|
| OCSP query (per source-IP) | `internal/api/handler/certificates.go` | 60s | configurable (`CERTCTL_OCSP_RATE_LIMIT_PER_IP_MIN`) |
|
||||||
| EST per-principal CSR enrollment | `internal/est/` | 60s | configurable | No | No |
|
| Cert export (per actor) | `internal/api/handler/export.go` | 1h | configurable (`CERTCTL_CERT_EXPORT_RATE_LIMIT_PER_ACTOR_HR`) |
|
||||||
| EST HTTP-Basic source-IP failed-auth | `internal/est/` | 60s | configurable | No | No |
|
| EST per-principal CSR enrollment | `internal/api/handler/est.go` | 24h | configurable (per-profile `RateLimitPerPrincipal24h`) |
|
||||||
| ACME per-account orders / key-change / challenge-respond | `internal/service/acme.go` | 1h | configurable | No | No |
|
| EST HTTP-Basic source-IP failed-auth | `internal/api/handler/est.go` | 60m | 10 attempts |
|
||||||
|
| SCEP/Intune per-device challenge | `internal/scep/intune/` | 60s | configurable (`*_PER_MINUTE`) |
|
||||||
|
| ACME per-account orders / key-change / challenge-respond | `internal/service/acme.go` | 1h | configurable |
|
||||||
|
|
||||||
All five use the shared `internal/ratelimit/sliding_window.go`
|
The `CERTCTL_RATE_LIMIT_BACKEND` selector applies to the first five
|
||||||
primitive. Buckets live in a single per-process map guarded by a
|
(the cmd/server-wired limiters). The SCEP/Intune wrapper + the ACME
|
||||||
mutex; the package-level cap prevents unbounded growth under
|
per-account limiter ride their own internal accounting today; both
|
||||||
adversarial key cardinality (default 100,000 keys; oldest-by-newest-
|
are tracked as follow-ups in WORKSPACE-ROADMAP.md.
|
||||||
timestamp evicted under pressure).
|
|
||||||
|
|
||||||
### Implications for multi-replica deployments
|
### Backend internals
|
||||||
|
|
||||||
- **Effective per-replica cap is the documented cap.** A 2-replica
|
Both backends share the algorithm: sliding-window log + per-key
|
||||||
deployment lets through up to 2× the per-key window cap before
|
bucket + prune-on-Allow.
|
||||||
either replica rejects.
|
|
||||||
- **Restart resets the bucket.** A `kubectl rollout restart` empties
|
|
||||||
the in-memory windows on every replica. An attacker who notices
|
|
||||||
this could in principle re-issue burst attempts after every roll;
|
|
||||||
the threat model accepts this because rollouts are operator-driven
|
|
||||||
and the relevant endpoints already require credentials.
|
|
||||||
- **No cross-replica fan-out.** Rate-limit decisions on replica A
|
|
||||||
are not visible to replica B. Sticky-session ingress routing (with
|
|
||||||
`service.spec.sessionAffinity: ClientIP` on Kubernetes or the
|
|
||||||
equivalent on your load balancer) tightens the effective cap to
|
|
||||||
per-replica + per-source-IP rather than per-replica + per-source-IP
|
|
||||||
for whichever pod the request happened to land on.
|
|
||||||
|
|
||||||
If your threat model requires globally-enforced rate limits across
|
**Memory backend (`memory`)** — per-process map keyed by bucket key;
|
||||||
replicas, the implementation surface is roughly: swap the per-process
|
mutex-guarded; package-level LRU cap prevents unbounded growth under
|
||||||
map for a database-backed sliding window (or a Redis-backed equivalent
|
adversarial key cardinality (default 100,000 keys per limiter
|
||||||
if you already run Redis). This is on the
|
instance; under pressure, the key whose newest timestamp is oldest is evicted first).
|
||||||
[WORKSPACE-ROADMAP.md](../../WORKSPACE-ROADMAP.md) as a v3 item;
|
Implemented at `internal/ratelimit/sliding_window.go`.
|
||||||
nothing in the certctl threat model today requires it.
|
|
||||||
|
**Postgres backend (`postgres`)** — same algorithm against the
|
||||||
|
`rate_limit_buckets` table:
|
||||||
|
|
||||||
|
```sql
|
||||||
|
CREATE TABLE rate_limit_buckets (
|
||||||
|
bucket_key TEXT PRIMARY KEY,
|
||||||
|
timestamps TIMESTAMPTZ[] NOT NULL DEFAULT '{}',
|
||||||
|
updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
|
||||||
|
);
|
||||||
|
```
|
||||||
|
|
||||||
|
`Allow(key, now)` opens a transaction, ensures the row exists
|
||||||
|
(`INSERT ... ON CONFLICT DO NOTHING`), acquires the row lock
|
||||||
|
(`SELECT ... FOR UPDATE`), prunes timestamps older than `now-window`,
|
||||||
|
compares the post-prune count against `maxN`, conditionally appends
|
||||||
|
`now`, persists, and commits. The row lock is what arbitrates across
|
||||||
|
replicas: replicas A and B firing simultaneous `Allow("k")` never
|
||||||
|
race because Postgres serializes the per-key row update across the
|
||||||
|
cluster. Implemented at
|
||||||
|
`internal/ratelimit/postgres_sliding_window.go`.
|
||||||
|
|
||||||
|
### Janitor sweep (postgres backend only)
|
||||||
|
|
||||||
|
The scheduler runs a `rate_limit_buckets` janitor every
|
||||||
|
`CERTCTL_RATE_LIMIT_JANITOR_INTERVAL` (default 5m, minimum 1m). The
|
||||||
|
sweep deletes rows whose `updated_at` is older than the longest
|
||||||
|
configured window any limiter uses (24h today, matching the EST
|
||||||
|
per-principal limiter). Idempotent; repeated sweeps find zero rows.
|
||||||
|
The memory backend's prune-on-Allow path keeps buckets short-lived
|
||||||
|
without a separate sweep, so the loop is a no-op when
|
||||||
|
`backend=memory`.
|
||||||
|
|
||||||
|
### Falsifiable closure proof
|
||||||
|
|
||||||
|
The Phase 13 Sprint 13.2 integration test
|
||||||
|
`internal/integration/ratelimit_multi_replica_test.go`
|
||||||
|
(`//go:build integration`) fires 100 concurrent `Allow("test-key")`
|
||||||
|
calls round-robined across 3 independent `PostgresSlidingWindowLimiter`
|
||||||
|
instances sharing one Postgres database (`cap=10`, `window=1m`) and
|
||||||
|
asserts exactly 10 succeed + 90 return `ErrRateLimited`. If the
|
||||||
|
cross-replica row lock weren't arbitrating, each replica would
|
||||||
|
independently let through ~3-4 requests, giving 12-15 successes
|
||||||
|
total. Re-run:
|
||||||
|
|
||||||
|
```
|
||||||
|
go test -tags=integration -count=1 -run TestRateLimit_MultiReplica \
|
||||||
|
./internal/integration/...
|
||||||
|
```
|
||||||
|
|
||||||
|
### Helm chart wiring
|
||||||
|
|
||||||
|
The helm chart at `deploy/helm/certctl/` exposes the backend via
|
||||||
|
`server.rateLimiting.backend` (default `memory`). To opt into the
|
||||||
|
postgres backend for an HA deploy:
|
||||||
|
|
||||||
|
```
|
||||||
|
helm upgrade --install certctl deploy/helm/certctl \
|
||||||
|
--set server.replicas=3 \
|
||||||
|
--set server.rateLimiting.backend=postgres \
|
||||||
|
--set server.rateLimiting.janitorInterval=5m
|
||||||
|
```
|
||||||
|
|
||||||
|
`server.replicas > 1` without flipping `backend` to `postgres` works
|
||||||
|
fine — the limits stay per-process — but the operator gets a 2× /
|
||||||
|
3× / N× effective cap depending on replica count. The chart does NOT
|
||||||
|
auto-flip on `replicas > 1` because some HA deploys deliberately want
|
||||||
|
per-process limits (sticky-session ingress + tight per-replica caps
|
||||||
|
to detect bot traffic at the edge before it hits the application).
|
||||||
|
|
||||||
### Where these numbers live
|
### Where these numbers live
|
||||||
|
|
||||||
|
|||||||
@@ -0,0 +1,136 @@
|
|||||||
|
# Privacy & retention (federated-user PII)
|
||||||
|
|
||||||
|
> Last reviewed: 2026-05-16
|
||||||
|
|
||||||
|
Sprint 6 COMP-002-RETENTION closure. certctl stores three categories
|
||||||
|
of personally-identifiable information for federated humans (Auth
|
||||||
|
Bundle 2 OIDC users):
|
||||||
|
|
||||||
|
| Column | Source | Used by |
|
||||||
|
|---|---|---|
|
||||||
|
| `users.email` | IdP claim (`email`) | Operator GUI "find user by email", display in lists, audit attribution. |
|
||||||
|
| `users.display_name` | IdP claim (`name`) | UI display string for the human. |
|
||||||
|
| `users.oidc_subject` | IdP claim (`sub`) | Stable identifier — joined with `oidc_provider_id` in the (provider, subject) UNIQUE constraint. |
|
||||||
|
|
||||||
|
Pre-fix, deactivating a user (admin-side `auth.user.deactivate`)
|
||||||
|
soft-deleted the row by setting `deactivated_at` but left the PII
|
||||||
|
columns populated indefinitely. The Sprint 6 fix adds an automatic
|
||||||
|
purge pipeline.
|
||||||
|
|
||||||
|
## Retention pipeline shape
|
||||||
|
|
||||||
|
```
|
||||||
|
Day 0 admin → POST /api/v1/auth/users/u-X/deactivate
|
||||||
|
├─ users.deactivated_at = NOW()
|
||||||
|
└─ all active sessions for u-X revoked
|
||||||
|
|
||||||
|
Day N scheduler's userRetentionLoop tick (default cadence 24h)
|
||||||
|
└─ UserRetentionService.PurgeDeactivatedUsers
|
||||||
|
├─ SELECT users WHERE deactivated_at < NOW() - retention_window
|
||||||
|
├─ For each row (batch-capped per tick):
|
||||||
|
│ UserRetentionService.DeleteUserPII(u.id)
|
||||||
|
│ ├─ revoke all active sessions (defense-in-depth)
|
||||||
|
│ ├─ email := "purged@redacted.local"
|
||||||
|
│ ├─ display_name := "[purged]"
|
||||||
|
│ ├─ oidc_subject := "sha256:" || hex(sha256(original))
|
||||||
|
│ └─ audit_events row (action=user.purge_pii, category=auth)
|
||||||
|
```
|
||||||
|
|
||||||
|
`users.id` is **preserved**. Historical `audit_events.actor = u-X`
|
||||||
|
rows still resolve to the row (now scrubbed). This is the
|
||||||
|
forensic-attribution guarantee — the operator can prove "user u-X
|
||||||
|
performed action Y on date Z" even after the PII is gone.
|
||||||
|
|
||||||
|
`oidc_subject` is **hashed**, not nullified, for two reasons:
|
||||||
|
|
||||||
|
1. The `(oidc_provider_id, oidc_subject)` UNIQUE constraint would
|
||||||
|
trip if multiple purged users converged on the same NULL.
|
||||||
|
2. Re-login under the same IdP subject creates a fresh row (different
|
||||||
|
`u-` id) because `GetByOIDCSubject` won't match the hashed token —
|
||||||
|
the original subject is unrecoverable from the hash. This is the
|
||||||
|
"right-to-be-forgotten" behavior: the same human logging back in
|
||||||
|
is functionally a new account.
|
||||||
|
|
||||||
|
## Operator configuration
|
||||||
|
|
||||||
|
| Env var | Default | Notes |
|
||||||
|
|---|---|---|
|
||||||
|
| `CERTCTL_USER_RETENTION_INTERVAL` | `24h` | Tick cadence for the scheduler's userRetentionLoop. Zero or negative ignored. |
|
||||||
|
| `CERTCTL_USER_RETENTION_WINDOW` | `720h` (30 days) | How long after `deactivated_at` a row's PII stays in the table. Operators with stricter GDPR/CCPA expectations may shorten. |
|
||||||
|
| `CERTCTL_USER_RETENTION_BATCH_CAP` | `200` | Per-tick row budget. Larger backlogs spread across multiple ticks. 0 = unbounded (test fixtures only). |
|
||||||
|
|
||||||
|
## How to verify retention is working
|
||||||
|
|
||||||
|
1. Deactivate a test user via the admin path:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
curl -X POST -H "X-API-Key: $ADMIN_KEY" \
|
||||||
|
https://certctl.example.com/api/v1/auth/users/u-test/deactivate
|
||||||
|
```
|
||||||
|
|
||||||
|
2. Confirm the row's `deactivated_at` is set:
|
||||||
|
|
||||||
|
```sql
|
||||||
|
SELECT id, email, deactivated_at FROM users WHERE id = 'u-test';
|
||||||
|
```
|
||||||
|
|
||||||
|
3. Backdate `deactivated_at` to past the retention window (only for
|
||||||
|
testing — never in production):
|
||||||
|
|
||||||
|
```sql
|
||||||
|
UPDATE users SET deactivated_at = NOW() - INTERVAL '60 days'
|
||||||
|
WHERE id = 'u-test';
|
||||||
|
```
|
||||||
|
|
||||||
|
(Note: this UPDATE will succeed because `users` doesn't have a
|
||||||
|
WORM trigger; the audit-events WORM trigger is unrelated.)
|
||||||
|
|
||||||
|
4. Wait for the next `userRetentionLoop` tick (or restart the server
|
||||||
|
to force an immediate sweep). Confirm scrub:
|
||||||
|
|
||||||
|
```sql
|
||||||
|
SELECT id, email, display_name, oidc_subject
|
||||||
|
FROM users
|
||||||
|
WHERE id = 'u-test';
|
||||||
|
```
|
||||||
|
|
||||||
|
Expected: `email = 'purged@redacted.local'`,
|
||||||
|
`display_name = '[purged]'`,
|
||||||
|
`oidc_subject LIKE 'sha256:%'`.
|
||||||
|
|
||||||
|
5. Confirm an audit row was emitted:
|
||||||
|
|
||||||
|
```sql
|
||||||
|
SELECT id, actor, action, resource_id, timestamp
|
||||||
|
FROM audit_events
|
||||||
|
WHERE action = 'user.purge_pii' AND resource_id = 'u-test'
|
||||||
|
ORDER BY timestamp DESC LIMIT 1;
|
||||||
|
```
|
||||||
|
|
||||||
|
## What's NOT covered (deferred work)
|
||||||
|
|
||||||
|
The Sprint 6 fix is Phase 1 of the audit's COMP-002-RETENTION
|
||||||
|
recommendation. Three further pieces are forward-looking:
|
||||||
|
|
||||||
|
- **GDPR data-subject access request (DSAR) export.** A "show me
|
||||||
|
everything you know about me" endpoint is not yet implemented.
|
||||||
|
Operators on EU-resident data should treat this as a manual SQL
|
||||||
|
procedure today; track for Phase 2.
|
||||||
|
- **Cascade purge of related rows.** Sessions are revoked (above);
|
||||||
|
api_keys with `created_by = u-X` are NOT yet purged on scrub. The
|
||||||
|
api_keys table doesn't have a foreign key to users (it indexes by
|
||||||
|
`actor_id` strings, free-form), so the cascade is a service-layer
|
||||||
|
concern that needs explicit wiring. Track for Phase 2.
|
||||||
|
- **Per-event PII redaction in `audit_events.details`.** The existing
|
||||||
|
`RedactDetailsForAudit` (`internal/service/audit_redact.go`) scrubs
|
||||||
|
credential + PII keys at write time. A future feature for
|
||||||
|
"retroactively re-redact existing rows" would interact with the WORM
|
||||||
|
trigger; out of scope today.
|
||||||
|
|
||||||
|
## See also
|
||||||
|
|
||||||
|
- `internal/service/user_retention.go` — `UserRetentionService` source.
|
||||||
|
- `internal/scheduler/scheduler.go::userRetentionLoop` — scheduler loop.
|
||||||
|
- `migrations/000036_users.up.sql` — `users` table definition.
|
||||||
|
- `migrations/000045_users_deactivated_at.up.sql` — `deactivated_at` column.
|
||||||
|
- `docs/operator/audit-chain.md` — paired Sprint 6 tamper-evidence work.
|
||||||
@@ -68,6 +68,45 @@ giving them the keys to the kingdom. The
|
|||||||
`internal/domain/auth/auditor_test.go` invariants pin this set going
|
`internal/domain/auth/auditor_test.go` invariants pin this set going
|
||||||
forward.
|
forward.
|
||||||
|
|
||||||
|
### Auditor role invariants (DOC-002 / COMP-005 closure)
|
||||||
|
|
||||||
|
Acquisition-audit DOC-002 + COMP-005 closure (Sprint 7 ACQ, 2026-05-16).
|
||||||
|
The auditor role's permission set is **pinned at exactly two
|
||||||
|
permissions** — `audit.read` and `audit.export` — and any drift breaks
|
||||||
|
the SOC 2 / FedRAMP / PCI separation. The pin is enforced at three
|
||||||
|
layers and the load-bearing layer is the unit-test set, not a bash CI
|
||||||
|
guard:
|
||||||
|
|
||||||
|
1. **Schema layer** — `migrations/000029_rbac.up.sql:261-262` seeds
|
||||||
|
exactly two `role_permissions` rows for `r-auditor`
|
||||||
|
(`r-auditor / p-audit-read / global / NULL` and
|
||||||
|
`r-auditor / p-audit-export / global / NULL`).
|
||||||
|
`migrations/000039_audit_crit1_perms.up.sql:111` adds an inline
|
||||||
|
comment confirming `r-auditor` was NOT widened by the migration that
|
||||||
|
shipped the five admin-only fine-grained perms.
|
||||||
|
2. **Code layer** — `internal/domain/auth/DefaultRoles[RoleIDAuditor]`
|
||||||
|
matches the schema. A future code change that adds a non-audit
|
||||||
|
permission to the slice is caught by:
|
||||||
|
3. **Test layer** (the load-bearing one) —
|
||||||
|
`internal/domain/auth/auditor_test.go` ships three pinning tests:
|
||||||
|
- `TestAuditorRoleHoldsExactlyAuditReadAndExport` — set-equality
|
||||||
|
comparison; fails on any add or remove
|
||||||
|
- `TestAuditorRoleDoesNotHoldMutatingOrReadingNonAuditPerms` —
|
||||||
|
enumerates the slice and rejects any permission outside the
|
||||||
|
`{audit.read, audit.export}` set; catches subtle widening even if
|
||||||
|
the set-equality test is bypassed
|
||||||
|
- `TestAuditorRoleSeparateFromViewer` — pins that the auditor and
|
||||||
|
viewer permission sets are disjoint except for `audit.read` (which
|
||||||
|
viewer shares by design); catches the "auditor inherits viewer
|
||||||
|
reads" leg
|
||||||
|
|
||||||
|
A bash CI guard was deliberately **not** added — the property is
|
||||||
|
already enforced at the Go test layer with stronger semantics
|
||||||
|
(struct-aware set equality) than `grep` could provide. If a future
|
||||||
|
contributor proposes widening `r-auditor`, the three tests above
|
||||||
|
fail at `go test ./internal/domain/auth/...` BEFORE the change can
|
||||||
|
land in a merge.
|
||||||
|
|
||||||
The five **admin-only fine-grained perms** seeded by migration
|
The five **admin-only fine-grained perms** seeded by migration
|
||||||
000030 gate the high-blast-radius endpoints:
|
000030 gate the high-blast-radius endpoints:
|
||||||
|
|
||||||
|
|||||||
@@ -0,0 +1,105 @@
|
|||||||
|
# Runbook: regenerating Playwright visual-regression snapshots
|
||||||
|
|
||||||
|
> Last reviewed: 2026-05-16
|
||||||
|
|
||||||
|
Use this when:
|
||||||
|
|
||||||
|
- You've intentionally changed UI shape (added a column, restyled a
|
||||||
|
banner, replaced an icon set) and the next `Frontend E2E` CI run
|
||||||
|
fails with `Screenshot comparison failed:` errors on multiple
|
||||||
|
`04-visual-regression.spec.ts` cases.
|
||||||
|
- A deterministic-but-platform-specific font-rendering difference
|
||||||
|
emerges (Linux runner vs your Mac dev box) and you want to refresh
|
||||||
|
baselines from the canonical CI environment.
|
||||||
|
|
||||||
|
TEST-003 closure (Sprint 5, 2026-05-16) flipped the workflow from
|
||||||
|
`continue-on-error: true` to `false`. Pre-fix you could ignore a
|
||||||
|
red E2E run and ship anyway. Post-fix the run blocks the merge, so
|
||||||
|
any change that legitimately moves pixels needs the snapshot bump
|
||||||
|
captured here.
|
||||||
|
|
||||||
|
Do NOT use this to make a real visual regression disappear. The
|
||||||
|
snapshots are version-controlled evidence — if a pixel diff fires
|
||||||
|
unexpectedly, investigate the rendering change before bumping.
|
||||||
|
|
||||||
|
## What "snapshots" means here
|
||||||
|
|
||||||
|
`web/playwright/04-visual-regression.spec.ts` calls
|
||||||
|
`toHaveScreenshot()`. Playwright stores the canonical PNG at
|
||||||
|
`web/playwright/04-visual-regression.spec.ts-snapshots/<test-name>-<browser>-<platform>.png`
|
||||||
|
on first run. Subsequent runs compare pixel-by-pixel against that
|
||||||
|
file. We commit the PNGs to git so the CI runner and local dev
|
||||||
|
share a single source of truth.
|
||||||
|
|
||||||
|
Two failure modes the diff is designed to catch:
|
||||||
|
|
||||||
|
- **Intentional UI change.** You added a new field to the Targets
|
||||||
|
table. The screenshot now has an extra column. The baseline
|
||||||
|
doesn't. Pixel diff fires — this is the "operator updates
|
||||||
|
baselines" path documented below.
|
||||||
|
- **Regression.** A CSS change inadvertently shifted spacing.
|
||||||
|
Investigate before regenerating; don't paper over the diff.
|
||||||
|
|
||||||
|
## Standard bump (one or two affected tests)
|
||||||
|
|
||||||
|
1. Run the E2E suite locally with the update flag against the
|
||||||
|
same Linux runner image Playwright uses:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd web
|
||||||
|
npx playwright test 04-visual-regression.spec.ts --update-snapshots
|
||||||
|
```
|
||||||
|
|
||||||
|
If you're on macOS, run it through Docker against the same image
|
||||||
|
the workflow uses (`mcr.microsoft.com/playwright`); font
|
||||||
|
rendering differs between platforms and Linux baselines must
|
||||||
|
come from a Linux source.
|
||||||
|
|
||||||
|
2. Inspect every regenerated PNG:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
git status web/playwright/*.spec.ts-snapshots/
|
||||||
|
git diff --stat web/playwright/*.spec.ts-snapshots/
|
||||||
|
```
|
||||||
|
|
||||||
|
PNG diffs in `git diff` are unhelpful — open the files in any
|
||||||
|
image viewer and confirm the change matches your intent.
|
||||||
|
|
||||||
|
3. Commit the snapshots alongside the source change in the same
|
||||||
|
PR:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
git add web/playwright/*.spec.ts-snapshots/
|
||||||
|
git commit -m "chore(e2e): refresh visual snapshots after <change>"
|
||||||
|
```
|
||||||
|
|
||||||
|
4. Push and confirm CI's E2E job greens out.
|
||||||
|
|
||||||
|
## Mass bump (font upgrade, framework migration)
|
||||||
|
|
||||||
|
Use the workflow's `workflow_dispatch` input to regenerate from
|
||||||
|
CI's canonical environment:
|
||||||
|
|
||||||
|
1. Go to `Actions` → `Frontend E2E` → `Run workflow`.
|
||||||
|
2. Set `update_snapshots: true`.
|
||||||
|
3. The workflow runs Playwright with `--update-snapshots`, then
|
||||||
|
commits + pushes the regenerated PNGs to a feature branch
|
||||||
|
`playwright/snapshot-update-<run-id>`.
|
||||||
|
4. Open a PR from that branch to master. Review the PNG diffs in
|
||||||
|
the PR view (GitHub renders image diffs side-by-side for
|
||||||
|
committed PNGs).
|
||||||
|
5. Merge.
|
||||||
|
|
||||||
|
## What NOT to do
|
||||||
|
|
||||||
|
- Don't regenerate snapshots from a developer's local machine and
|
||||||
|
push them as the canonical baseline. The Linux runner's font
|
||||||
|
hinting differs from macOS / Windows, so the baselines must come
|
||||||
|
from the same image the CI workflow runs.
|
||||||
|
- Don't add `--update-snapshots` to the always-run e2e step in
|
||||||
|
`.github/workflows/e2e.yml`. That's how snapshot regressions
|
||||||
|
become invisible — every diff gets accepted, every PR ships
|
||||||
|
fine, and the visual-regression layer becomes decorative.
|
||||||
|
- Don't bump snapshots in a "fix typo" PR. Every PNG change is
|
||||||
|
an architectural decision; pair it with the source change that
|
||||||
|
justifies it.
|
||||||
@@ -1,6 +1,6 @@
|
|||||||
# Runbook: PostgreSQL backup for certctl
|
# Runbook: PostgreSQL backup for certctl
|
||||||
|
|
||||||
> Last reviewed: 2026-05-13
|
> Last reviewed: 2026-05-16
|
||||||
|
|
||||||
Use this when:
|
Use this when:
|
||||||
- You're setting up a new certctl deployment and need a backup policy
|
- You're setting up a new certctl deployment and need a backup policy
|
||||||
@@ -109,38 +109,76 @@ is the authoritative reference.
|
|||||||
|
|
||||||
## Automation paths
|
## Automation paths
|
||||||
|
|
||||||
This is the gap an acquisition reviewer typically wants to see filled.
|
certctl ships an **opt-in Helm CronJob** for the in-cluster-Postgres
|
||||||
certctl ships no backup CronJob template in the Helm chart — the
|
case (the most common bundled-deploy shape). The template lives at
|
||||||
operator owns this layer because:
|
`deploy/helm/certctl/templates/backup-cronjob.yaml` and is gated by
|
||||||
|
`backup.enabled` in `values.yaml`. Default OFF; flip it on with one
|
||||||
|
toggle and a sink choice. For managed Postgres (AWS RDS / GCP Cloud
|
||||||
|
SQL / Azure DB) the operator relies on the provider's PITR layer;
|
||||||
|
this CronJob is intentionally scoped to the in-cluster-Postgres path.
|
||||||
|
|
||||||
1. The right tool depends on the deployment topology (in-cluster
|
### Enabling the bundled CronJob
|
||||||
Postgres vs. managed Postgres vs. self-hosted on a VM).
|
|
||||||
2. The right secret-management integration depends on the operator's
|
|
||||||
existing stack (Vault, AWS Secrets Manager, GCP Secret Manager,
|
|
||||||
sealed-secrets, External Secrets).
|
|
||||||
3. The right storage backend depends on the operator's existing
|
|
||||||
off-host blob storage.
|
|
||||||
|
|
||||||
A bundled CronJob would be a half-answer for any operator with an
|
```bash
|
||||||
established backup posture, and would have to be torn out before
|
# PVC sink (in-cluster persistent volume — simplest)
|
||||||
production. Three sample recipes that cover the common cases:
|
helm upgrade --install certctl deploy/helm/certctl \
|
||||||
|
--set backup.enabled=true \
|
||||||
|
--set backup.sink=pvc \
|
||||||
|
--set backup.pvc.storageClassName=<your-storage-class> \
|
||||||
|
--set backup.pvc.size=20Gi \
|
||||||
|
--set backup.schedule="0 2 * * *"
|
||||||
|
|
||||||
- **In-cluster Postgres → S3:** a CronJob running an alpine image with
|
# S3 sink (off-cluster, recommended for any deploy past the lab)
|
||||||
`aws-cli` + the `pg_dump` command above, output piped to
|
kubectl create secret generic certctl-backup-aws \
|
||||||
`aws s3 cp`. Cosign-signed if your supply-chain policy requires it.
|
--from-literal=AWS_ACCESS_KEY_ID=AKIA... \
|
||||||
- **Managed Postgres (AWS RDS / GCP Cloud SQL / Azure DB):** rely on
|
--from-literal=AWS_SECRET_ACCESS_KEY=... \
|
||||||
the cloud provider's built-in PITR backup; configure retention
|
--namespace certctl
|
||||||
≥ 30 days; the certctl deployment surface is the connection string
|
helm upgrade --install certctl deploy/helm/certctl \
|
||||||
alone.
|
--set backup.enabled=true \
|
||||||
- **Self-hosted VM:** systemd timer + `pg_dump` + `restic` (or
|
--set backup.sink=s3 \
|
||||||
`borgbackup`) to encrypted off-host storage.
|
--set backup.s3.bucket=my-certctl-backups \
|
||||||
|
--set backup.s3.region=us-east-1 \
|
||||||
|
--set backup.s3.credentialsSecret=certctl-backup-aws \
|
||||||
|
--set backup.schedule="0 2 * * *"
|
||||||
|
```
|
||||||
|
|
||||||
Tracked in [WORKSPACE-ROADMAP.md](../../../WORKSPACE-ROADMAP.md) as a
|
The CronJob runs `pg_dump --format=custom --no-owner --no-acl
|
||||||
post-v2.1.0 nice-to-have: an opt-in Helm CronJob template for the
|
--dbname=certctl` (the same shape as the manual command earlier in
|
||||||
in-cluster-Postgres-to-S3 case as a starter. The right time to ship
|
this runbook, so a manual dump and a Job dump are byte-comparable)
|
||||||
it is when a real operator asks for it; speculatively shipping it
|
and ships the artifact to the configured sink. Off-host retention
|
||||||
without that signal would just produce a template every deployment
|
is the sink's responsibility — S3 lifecycle rules or PVC snapshot
|
||||||
ends up rewriting.
|
retention on the storage class, not the CronJob.
|
||||||
|
|
||||||
|
### When the bundled CronJob is NOT the answer
|
||||||
|
|
||||||
|
- **Managed Postgres (AWS RDS / GCP Cloud SQL / Azure DB).** Use the
|
||||||
|
provider's built-in PITR; configure retention ≥ 30 days. The
|
||||||
|
certctl deployment surface is the connection string alone — no
|
||||||
|
CronJob to run.
|
||||||
|
- **Self-hosted Postgres on a VM (no Kubernetes).** Use a systemd
|
||||||
|
timer + `pg_dump` + `restic` (or `borgbackup`) to encrypted
|
||||||
|
off-host storage. The bundled CronJob has no equivalent on bare
|
||||||
|
VMs.
|
||||||
|
- **Already running pgbackrest / wal-g.** Keep using it. The bundled
|
||||||
|
CronJob is for the operator who doesn't yet have a backup posture,
|
||||||
|
not a replacement for production-grade WAL-shipping.
|
||||||
|
|
||||||
|
### Recovery objectives
|
||||||
|
|
||||||
|
The bundled CronJob targets the same RPO/RTO that any nightly-dump
|
||||||
|
strategy gives you:
|
||||||
|
|
||||||
|
- **RPO ≈ 24h** at the default `0 2 * * *` schedule (you lose at
|
||||||
|
most one day of writes if Postgres burns down). Tighten by running
|
||||||
|
every 6h or 1h; tighten further by switching to WAL-shipping
|
||||||
|
(out of scope for the bundled CronJob).
|
||||||
|
- **RTO ≈ 30–60min** for the restore drill below — drop the dump
|
||||||
|
into a fresh Postgres instance, point certctl at it, confirm
|
||||||
|
routes return 200. Empirically measured during the
|
||||||
|
[disaster-recovery runbook](disaster-recovery.md) drill.
|
||||||
|
|
||||||
|
If your contractual RPO is below 24h, run pgbackrest WAL-shipping
|
||||||
|
alongside (or instead of) the CronJob.
|
||||||
|
|
||||||
## Verification — what to dry-run quarterly
|
## Verification — what to dry-run quarterly
|
||||||
|
|
||||||
@@ -160,6 +198,42 @@ to your quarterly on-call rotation:
|
|||||||
The [disaster-recovery runbook](disaster-recovery.md) covers what to
|
The [disaster-recovery runbook](disaster-recovery.md) covers what to
|
||||||
do when this dry-run reveals a gap.
|
do when this dry-run reveals a gap.
|
||||||
|
|
||||||
|
## CI restore verification
|
||||||
|
|
||||||
|
> Acquisition-audit DEPL-005 + DATA-012 closure (Sprint 4 ACQ,
|
||||||
|
> 2026-05-16). The quarterly dry-run above is the operator-side
|
||||||
|
> proof; the workflow below is the upstream-side proof.
|
||||||
|
|
||||||
|
The certctl repo ships a weekly GitHub Actions workflow that
|
||||||
|
exercises the **exact** pg_dump shape this runbook recommends
|
||||||
|
(`--format=custom --no-owner --no-acl`) against a real Postgres
|
||||||
|
container, then asserts the audit_events hash chain round-trips
|
||||||
|
byte-for-byte across the dump → restore boundary. A regression in
|
||||||
|
the dump format, in a Postgres minor bump, or in migration 000047's
|
||||||
|
canonical-payload serialization would surface in the next Monday
|
||||||
|
run instead of on a customer's restore day.
|
||||||
|
|
||||||
|
- **Workflow:** [`.github/workflows/backup-restore.yml`](../../../.github/workflows/backup-restore.yml)
|
||||||
|
— Mondays 07:00 UTC + `workflow_dispatch`. Postgres service
|
||||||
|
container pinned to the same SHA256 digest as
|
||||||
|
`deploy/docker-compose.yml`.
|
||||||
|
- **Harness:** [`deploy/test/backup-restore-smoke.sh`](../../../deploy/test/backup-restore-smoke.sh)
|
||||||
|
— runs the workload → `pg_dump -Fc` → `DROP SCHEMA public CASCADE`
|
||||||
|
→ `pg_restore` → verify cycle. Locally runnable against any
|
||||||
|
reachable Postgres (it DROPs the schema, so do not point it at
|
||||||
|
data you care about).
|
||||||
|
- **Workload + verifier:** [`deploy/test/backupsmoke/main.go`](../../../deploy/test/backupsmoke/main.go)
|
||||||
|
— generates 24 synthetic `audit_events` rows representing an
|
||||||
|
issue/renew/revoke/auth-login cycle, snapshots the chain head
|
||||||
|
before the backup, and after restore runs
|
||||||
|
`audit_events_verify_chain()` to confirm `first_break_id IS NULL`.
|
||||||
|
|
||||||
|
The CI workflow is not a replacement for the quarterly operator
|
||||||
|
dry-run — it does not exercise the operator-managed file material
|
||||||
|
(CA keys, RA keys, trust anchors) listed in the "What to back up"
|
||||||
|
table above. Treat it as the dump-shape regression test; the
|
||||||
|
quarterly run remains the full-restore correctness test.
|
||||||
|
|
||||||
## Related reading
|
## Related reading
|
||||||
|
|
||||||
- [`docs/operator/runbooks/disaster-recovery.md`](disaster-recovery.md) — the restore companion
|
- [`docs/operator/runbooks/disaster-recovery.md`](disaster-recovery.md) — the restore companion
|
||||||
|
|||||||
@@ -0,0 +1,123 @@
|
|||||||
|
# Scale baseline — 2026 Q2 canonical-hardware capture
|
||||||
|
|
||||||
|
> Last reviewed: 2026-05-16
|
||||||
|
|
||||||
|
## What this file is
|
||||||
|
|
||||||
|
The canonical record of certctl's load-test baselines for the
|
||||||
|
2026-Q2 reporting window. TEST-005 closure (Sprint 5, 2026-05-16)
|
||||||
|
introduces this doc as the single source of truth for "what's the
|
||||||
|
scale ceiling?" — replacing the TBD-laden table at
|
||||||
|
[`docs/operator/scale.md`](scale.md#measured-baseline) that had been
|
||||||
|
unfilled since the scenarios shipped in Phase 8.
|
||||||
|
|
||||||
|
The numbers below come from the `loadtest` GitHub Actions workflow
|
||||||
|
running its three canonical scenarios on `ubuntu-latest` runners:
|
||||||
|
|
||||||
|
- `bulk-renewal` — 10,000-cert seed + criteria-mode
|
||||||
|
`POST /api/v1/certificates/bulk-renew`, 200 concurrent VUs over 10
|
||||||
|
minutes.
|
||||||
|
- `acme-burst` — 200 concurrent VUs hitting `/acme/directory`,
|
||||||
|
`/acme/new-nonce`, and `/acme/renewal-info/<cert-id>` simultaneously.
|
||||||
|
- `agent-storm` — 5,000-agent seed + sustained
|
||||||
|
`POST /api/v1/agents/{id}/heartbeat` at 167 RPS.
|
||||||
|
|
||||||
|
Thresholds are enforced inline in `deploy/test/loadtest/k6.js` (p99 < 5s
|
||||||
|
for issuance-acceptance, p99 < 2s for list, error rate < 1%). k6 exits
|
||||||
|
non-zero on any breach, which propagates through `docker compose up
|
||||||
|
--exit-code-from k6 → make loadtest → workflow exit`.
|
||||||
|
|
||||||
|
## Capture procedure
|
||||||
|
|
||||||
|
1. Trigger the workflow:
|
||||||
|
- **Actions** → `loadtest` → **Run workflow**, branch `master`.
|
||||||
|
- Wait ~25 minutes for the three matrix legs to finish.
|
||||||
|
2. Download each scenario's artifact from the workflow run page:
|
||||||
|
- `k6-scale-bulk-renewal-<run-id>`
|
||||||
|
- `k6-scale-acme-burst-<run-id>`
|
||||||
|
- `k6-scale-agent-storm-<run-id>`
|
||||||
|
- Each archive contains the k6 `summary.json` + raw NDJSON
|
||||||
|
points (90-day GHA retention).
|
||||||
|
3. Run `scripts/scale-baseline/extract.sh <run-id>` (see below) to
|
||||||
|
pull the three artifacts and emit the table rows for this doc.
|
||||||
|
4. Paste the rows under the **Latest capture** section. Update
|
||||||
|
`> Last reviewed:` to today.
|
||||||
|
5. Commit the artifacts you want long-term-retained to
|
||||||
|
[`deploy/test/loadtest-artifacts/`](../../deploy/test/loadtest-artifacts/)
|
||||||
|
using `git lfs` if the archives exceed 100 MB; otherwise commit
|
||||||
|
them inline.
|
||||||
|
|
||||||
|
## Latest capture
|
||||||
|
|
||||||
|
| Scenario | Run ID | Date | p50 | p95 | p99 | Error rate | Peak server RSS | Notes |
|
||||||
|
|---|---|---|---|---|---|---|---|---|
|
||||||
|
| **bulk-renewal** | _capture pending_ | _capture pending_ | _capture pending_ | _capture pending_ | _capture pending_ | _capture pending_ | _capture pending_ | First post-TEST-005 capture; trigger via workflow_dispatch + extract via the procedure above. |
|
||||||
|
| **acme-burst** directory | _capture pending_ | _capture pending_ | _capture pending_ | _capture pending_ | _capture pending_ | _capture pending_ | _capture pending_ | — |
|
||||||
|
| **acme-burst** new-nonce | _capture pending_ | _capture pending_ | _capture pending_ | _capture pending_ | _capture pending_ | _capture pending_ | _capture pending_ | — |
|
||||||
|
| **acme-burst** renewal-info | _capture pending_ | _capture pending_ | _capture pending_ | _capture pending_ | _capture pending_ | _capture pending_ | _capture pending_ | — |
|
||||||
|
| **agent-storm** | _capture pending_ | _capture pending_ | _capture pending_ | _capture pending_ | _capture pending_ | _capture pending_ | _capture pending_ | — |
|
||||||
|
|
||||||
|
The "_capture pending_" placeholders are deliberate — the operator
|
||||||
|
fills them after the next `loadtest` workflow_dispatch run. Once
|
||||||
|
filled, append a new row per run; do not edit rows in place across runs
|
||||||
|
(the historical row stays as evidence).
|
||||||
|
|
||||||
|
## Why "ubuntu-latest" instead of RDS-shaped hardware
|
||||||
|
|
||||||
|
The audit's fix language preferred RDS-shaped Postgres on a
|
||||||
|
fixed-spec runner. ubuntu-latest's 2-vCPU / 7-GB-RAM shape is
|
||||||
|
narrower than typical production Postgres, but it has two virtues:
|
||||||
|
|
||||||
|
1. **Reproducibility.** Every operator + acquirer can reproduce the
|
||||||
|
numbers; an RDS-shaped Postgres requires a paid AWS account.
|
||||||
|
2. **Conservative ceiling.** If the published numbers come from a
|
||||||
|
constrained runner, real-world deployments on production Postgres
|
||||||
|
sizes (db.m5.large and larger) only get better.
|
||||||
|
|
||||||
|
When an acquirer or operator asks for a production-equivalent
|
||||||
|
baseline, capture a second run on whatever infrastructure they want
|
||||||
|
to validate against and add it under a new **2026 Q3 capture**
|
||||||
|
section.
|
||||||
|
|
||||||
|
## Methodology
|
||||||
|
|
||||||
|
### Hardware
|
||||||
|
|
||||||
|
- **Runner:** GitHub Actions `ubuntu-latest` (currently Ubuntu 24.04, 2-vCPU, 7-GB RAM).
|
||||||
|
- **certctl image:** built from the same commit the workflow runs on.
|
||||||
|
- **Postgres:** `postgres:16-alpine@sha256:890480b08124ce7f79960a9bb16fe39729aa302bd384bfd7c408fee6c8f7adb7`, in-cluster, default config (no operator tuning).
|
||||||
|
- **Network:** runner localhost.
|
||||||
|
|
||||||
|
### Software
|
||||||
|
|
||||||
|
- **k6:** version pinned in `deploy/test/loadtest/Dockerfile`.
|
||||||
|
- **certctl tag:** the v* tag at workflow trigger time (matches `openapi.yaml info.version`).
|
||||||
|
|
||||||
|
### Metrics captured
|
||||||
|
|
||||||
|
- **p50 / p95 / p99 latency** — k6's `http_req_duration` percentiles.
|
||||||
|
- **Error rate** — k6 `http_req_failed` rate (by default, responses outside 200–399 + connection errors).
|
||||||
|
- **Peak server RSS** — `docker stats` polled at 1-Hz for the
|
||||||
|
duration of the run; `max(memory_stats.usage)` taken from the
|
||||||
|
emitted JSON.
|
||||||
|
- **Acceptance gate** — the k6 thresholds in `k6.js`; if exceeded
|
||||||
|
the workflow fails.
|
||||||
|
|
||||||
|
### What's NOT captured
|
||||||
|
|
||||||
|
- **Cold-start latency** — these are steady-state baselines after the
|
||||||
|
k6 warmup ramp. Cold-start is a separate concern (renewal-loop
|
||||||
|
startup, scheduler tick boundary), not covered by these scenarios.
|
||||||
|
- **WAN latency** — runs are localhost; production-WAN-RTT additions
|
||||||
|
fall outside scope.
|
||||||
|
- **Federation overhead** — single-instance only; HA + replicas runs
|
||||||
|
are a future deliverable.
|
||||||
|
|
||||||
|
## Related reading
|
||||||
|
|
||||||
|
- [`docs/operator/scale.md`](scale.md) — the operator-facing scale
|
||||||
|
posture doc; baseline rows there point at this file.
|
||||||
|
- [`deploy/test/loadtest/README.md`](../../deploy/test/loadtest/README.md) —
|
||||||
|
scenario semantics + how to read the k6 output.
|
||||||
|
- [`deploy/test/loadtest-artifacts/`](../../deploy/test/loadtest-artifacts/) —
|
||||||
|
long-term archive of captured k6 results.
|
||||||
+15
-21
@@ -1,6 +1,6 @@
|
|||||||
# Operator scale guide
|
# Operator scale guide
|
||||||
|
|
||||||
> Last reviewed: 2026-05-14
|
> Last reviewed: 2026-05-16
|
||||||
|
|
||||||
Use this when:
|
Use this when:
|
||||||
- You're sizing a new certctl deployment for a target fleet count.
|
- You're sizing a new certctl deployment for a target fleet count.
|
||||||
@@ -160,29 +160,23 @@ the RFC 7807 `application/problem+json` shape with the
|
|||||||
returned plain-text 429 or a different problem type would surface as
|
returned plain-text 429 or a different problem type would surface as
|
||||||
`(rate_limited_count - shape_ok_count) > 0` in the summary.
|
`(rate_limited_count - shape_ok_count) > 0` in the summary.
|
||||||
|
|
||||||
### Measured baseline — TBD pending canonical-hardware capture
|
### Measured baseline
|
||||||
|
|
||||||
The Phase 8 scenarios shipped 2026-05-14. Baseline capture on a
|
TEST-005 closure (Sprint 5, 2026-05-16) moved the baseline table out
|
||||||
canonical `ubuntu-latest` GitHub runner is the next operational step;
|
of this file into its own canonical record:
|
||||||
until then, the table below holds TBD placeholders. **Do NOT publish
|
[`docs/operator/scale-baseline-2026-Q2.md`](scale-baseline-2026-Q2.md).
|
||||||
sandbox-captured numbers here** — the same anti-pattern the original
|
That doc owns the capture procedure, the methodology, and the
|
||||||
loadtest README guards against (sandbox-aggregate placeholder vs
|
per-scenario rows; this page links to it as the authoritative
|
||||||
canonical hardware) applies to Phase 8.
|
source.
|
||||||
|
|
||||||
| Scenario | p50 | p95 | p99 | Error rate | Date measured | Commit |
|
The split exists because the baseline table is mutable on every
|
||||||
|---|---|---|---|---|---|---|
|
loadtest workflow_dispatch run, while this page (the operator-facing
|
||||||
| **bulk_renewal** | TBD | TBD | TBD | TBD | — | — |
|
scale posture doc) changes only when the underlying scenarios or
|
||||||
| **acme_burst** directory | TBD | TBD | TBD | TBD | — | — |
|
thresholds change. Keeping them in separate files avoids
|
||||||
| **acme_burst** new-nonce | TBD | TBD | TBD | TBD | — | — |
|
review-noise on per-capture commits.
|
||||||
| **acme_burst** renewal-info | TBD | TBD | TBD | TBD | — | — |
|
|
||||||
| **agent_storm** | TBD | TBD | TBD | TBD | — | — |
|
|
||||||
|
|
||||||
Capture procedure: trigger `loadtest.yml` from the Actions tab against
|
Long-term k6 NDJSON artifacts beyond GHA's 90-day retention live at
|
||||||
the current `master` SHA; wait for the `k6-scale` matrix jobs to
|
[`deploy/test/loadtest-artifacts/`](../../deploy/test/loadtest-artifacts/).
|
||||||
complete; download the per-scenario summary artifacts; copy p50/p95/
|
|
||||||
p99 from `summary-<scenario>.json` into the table; commit the
|
|
||||||
captured numbers alongside the date + SHA. Replace this paragraph
|
|
||||||
with the captured-on row when the first canonical run lands.
|
|
||||||
|
|
||||||
### How to run the scale tier locally
|
### How to run the scale tier locally
|
||||||
|
|
||||||
|
|||||||
@@ -58,7 +58,55 @@ For certificates issued to systems where revocation correctness matters:
|
|||||||
|
|
||||||
## Postgres transport encryption
|
## Postgres transport encryption
|
||||||
|
|
||||||
See [docs/database-tls.md](database-tls.md).
|
**Audit references:** SEC-013 (advisory) and SEC-014 (host-port bind),
|
||||||
|
both closed in Sprint 2 of the 2026-Q2 acquisition audit
|
||||||
|
(2026-05-16).
|
||||||
|
|
||||||
|
The full upgrade procedure (sslmode flags, CA bundle paths, Helm chart
|
||||||
|
values, AWS RDS / Google Cloud SQL / Azure Database notes) lives at
|
||||||
|
[docs/operator/database-tls.md](database-tls.md). The summary of the
|
||||||
|
two operator-visible defenses certctl ships:
|
||||||
|
|
||||||
|
### SEC-014 — Postgres host port is loopback-only
|
||||||
|
|
||||||
|
`deploy/docker-compose.yml` and `deploy/docker-compose.test.yml` both
|
||||||
|
publish Postgres on `127.0.0.1:5432:5432` rather than `5432:5432`.
|
||||||
|
The default Docker port-binding behavior is to bind to `0.0.0.0`,
|
||||||
|
which exposes Postgres on every interface of the host — including any
|
||||||
|
public-facing NICs the operator did not realize were attached. The
|
||||||
|
loopback bind closes that footgun without breaking the
|
||||||
|
certctl-server hop (which goes over the `certctl-network` Docker
|
||||||
|
bridge, not over the host port).
|
||||||
|
|
||||||
|
Operators who genuinely need to reach Postgres from another host —
|
||||||
|
e.g. a separate metrics box running `postgres_exporter` — should
|
||||||
|
either (1) attach that host into the same Docker network, (2) tunnel
|
||||||
|
through SSH (`ssh -L`), or (3) re-publish the port with explicit
|
||||||
|
host-IP binding (e.g. `10.0.0.5:5432:5432`) and a documented network-layer access control.
|
||||||
|
Loosening the loopback bind without one of those is a regression.
|
||||||
|
|
||||||
|
### SEC-013 — advisory WARN on external `sslmode=disable`
|
||||||
|
|
||||||
|
`internal/config/config.go::Validate` emits an `slog.Warn` (NOT a
|
||||||
|
fail-closed error) when `CERTCTL_DATABASE_URL` parses as a Postgres
|
||||||
|
URL with `sslmode=disable` AND the host is outside the local
|
||||||
|
safelist (`localhost` / `127.0.0.1` / `::1` / `postgres` /
|
||||||
|
`certctl-postgres` / `*.svc.cluster.local`). The advisory exists
|
||||||
|
because the legitimate compose / Helm topology genuinely uses
|
||||||
|
`sslmode=disable` over the Docker bridge — failing closed would
|
||||||
|
break the production-shaped quickstart — but pointing
|
||||||
|
`CERTCTL_DATABASE_URL` at a managed-Postgres host (RDS, Cloud SQL,
|
||||||
|
Azure Database) without flipping `sslmode` to `verify-full` puts
|
||||||
|
the entire control plane's Postgres traffic on the wire in
|
||||||
|
cleartext. The WARN surfaces that landmine on every boot so the
|
||||||
|
operator notices it in the journal even if the rest of the boot
|
||||||
|
sequence looks healthy.
|
||||||
|
|
||||||
|
To clear the WARN: set `CERTCTL_DATABASE_URL` to a URL with
|
||||||
|
`sslmode=verify-full` and `sslrootcert=<ca-bundle-path>`. The full
|
||||||
|
procedure (CA-bundle materialization, Helm chart values, secret-
|
||||||
|
manager wiring) is in
|
||||||
|
[docs/operator/database-tls.md](database-tls.md).
|
||||||
|
|
||||||
## Encryption at rest
|
## Encryption at rest
|
||||||
|
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
# Architecture Guide
|
# Architecture Guide
|
||||||
|
|
||||||
> Last reviewed: 2026-05-05
|
> Last reviewed: 2026-05-16
|
||||||
|
|
||||||
## Contents
|
## Contents
|
||||||
|
|
||||||
@@ -55,6 +55,45 @@ New to certificates? Read the [Concepts Guide](concepts.md) first.
|
|||||||
7. **Connector Architecture** — Pluggable issuers, targets, and notifiers for extensibility
|
7. **Connector Architecture** — Pluggable issuers, targets, and notifiers for extensibility
|
||||||
8. **Self-Hosted** — No cloud lock-in; run with Docker Compose, Kubernetes, or bare metal
|
8. **Self-Hosted** — No cloud lock-in; run with Docker Compose, Kubernetes, or bare metal
|
||||||
|
|
||||||
|
### Single-tenant deployment model
|
||||||
|
|
||||||
|
certctl runs as a **single-tenant** application today. Every authenticated
|
||||||
|
request is stamped with `auth.DefaultTenantID` by the auth middleware
|
||||||
|
(`internal/auth/middleware.go` — the `TenantIDKey` context value is
|
||||||
|
constant for the process lifetime), and repository queries don't filter
|
||||||
|
on tenant. A deployment is one tenant; a buyer running multiple business
|
||||||
|
units on one cluster needs one certctl deployment per business unit.
|
||||||
|
|
||||||
|
The `tenant_id` columns sprinkled across the schema (`actors`,
|
||||||
|
`managed_certificates`, `agents`, `users`, `roles`, audit log, etc.) are
|
||||||
|
**forward-compatible scaffolding** for the multi-tenancy roadmap item
|
||||||
|
in `WORKSPACE-ROADMAP.md`, not active multi-tenant code. A repo skimmer
|
||||||
|
who sees the columns can reasonably assume tenant isolation is wired
|
||||||
|
end-to-end; it isn't. The `scripts/ci-guards/multi-tenant-query-coverage.sh`
|
||||||
|
guard exists to track the drift and is treated as informational (warns
|
||||||
|
on net-new tenant_id-less queries above a baseline) — flipping it to a
|
||||||
|
hard gate is the inflection-point work for activating multi-tenancy.
|
||||||
|
|
||||||
|
Lifting this to a multi-tenant deployment requires three pieces of
|
||||||
|
work in sequence:
|
||||||
|
|
||||||
|
1. **Request-derived tenant resolution.** Replace the constant
|
||||||
|
`DefaultTenantID` stamp with a resolution function that picks
|
||||||
|
the tenant from the actor (`actors.tenant_id`) or a hostname /
|
||||||
|
path-prefix routing convention.
|
||||||
|
2. **Per-query tenant scoping.** Every query that reads or joins
|
||||||
|
a `tenant_id`-bearing table must add `AND tenant_id = $N` to its `WHERE` clause.
|
||||||
|
The multi-tenant-query-coverage guard tracks this surface;
|
||||||
|
activating multi-tenancy means driving its baseline to zero.
|
||||||
|
3. **Per-tenant resource quotas + isolation tests.** RBAC scope
|
||||||
|
types extend with `tenant`; integration tests exercise
|
||||||
|
cross-tenant data-leak prevention; quotas (certs/issuers/agents
|
||||||
|
per tenant) wire into the existing limit-enforcement layer.
|
||||||
|
|
||||||
|
Until that work lands, **the multi-tenant columns are decorative**.
|
||||||
|
Treat them as you would a Postgres `version` column on a row you
|
||||||
|
never read — the schema is forward-compat, the runtime is not.
|
||||||
|
|
||||||
## System Components
|
## System Components
|
||||||
|
|
||||||
```mermaid
|
```mermaid
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
# Connector Development Guide
|
# Connector Development Guide
|
||||||
|
|
||||||
> Last reviewed: 2026-05-05
|
> Last reviewed: 2026-05-16
|
||||||
>
|
>
|
||||||
> This is the canonical connector reference: interface contracts,
|
> This is the canonical connector reference: interface contracts,
|
||||||
> registry, deployment primitive, network scanner, cloud discovery.
|
> registry, deployment primitive, network scanner, cloud discovery.
|
||||||
@@ -41,13 +41,23 @@ Target connectors:
|
|||||||
- [HAProxy](haproxy.md) — combined-PEM deploy + `haproxy -c` validate
|
- [HAProxy](haproxy.md) — combined-PEM deploy + `haproxy -c` validate
|
||||||
- [IIS](iis.md) — Microsoft IIS, local PowerShell + WinRM modes
|
- [IIS](iis.md) — Microsoft IIS, local PowerShell + WinRM modes
|
||||||
- [Java Keystore](jks.md) — JKS / PKCS#12 via `keytool` with atomic snapshot rollback
|
- [Java Keystore](jks.md) — JKS / PKCS#12 via `keytool` with atomic snapshot rollback
|
||||||
- [Kubernetes Secrets](k8s.md) — k8s.io/tls Secrets atomic update
|
|
||||||
- [NGINX](nginx.md) — separate-file deploy + `nginx -t` validate
|
- [NGINX](nginx.md) — separate-file deploy + `nginx -t` validate
|
||||||
- [Postfix / Dovecot](postfix.md) — dual-mode mail-server TLS connector
|
- [Postfix / Dovecot](postfix.md) — dual-mode mail-server TLS connector
|
||||||
- [SSH (agentless)](ssh.md) — agentless deploy over SSH/SFTP for Linux/Unix targets
|
- [SSH (agentless)](ssh.md) — agentless deploy over SSH/SFTP for Linux/Unix targets
|
||||||
- [Traefik](traefik.md) — file-provider zero-reload deploy
|
- [Traefik](traefik.md) — file-provider zero-reload deploy
|
||||||
- [Windows Certificate Store](wincertstore.md) — non-IIS Windows services (Exchange, RDP, SQL, ADFS)
|
- [Windows Certificate Store](wincertstore.md) — non-IIS Windows services (Exchange, RDP, SQL, ADFS)
|
||||||
|
|
||||||
|
### Preview connectors (not in the production-ready set)
|
||||||
|
|
||||||
|
SEC-003-K8S closure (Sprint 4, 2026-05-16) moved Kubernetes Secrets
|
||||||
|
out of the canonical fourteen-target index because the production
|
||||||
|
client-go integration is not yet wired — the connector ships but
|
||||||
|
refuses to register without `CERTCTL_K8SSECRET_PREVIEW_ACK=true`
|
||||||
|
and the CRUD methods return *"real Kubernetes client not
|
||||||
|
implemented"* until the integration lands.
|
||||||
|
|
||||||
|
- [Kubernetes Secrets](k8s.md) — **preview** — `kubernetes.io/tls` Secrets atomic update. See [`docs/reference/deployment-model.md`](../deployment-model.md) row `k8ssecret` for the bundle-2 V2-blocker scope.
|
||||||
|
|
||||||
## Contents
|
## Contents
|
||||||
|
|
||||||
1. [Overview](#overview)
|
1. [Overview](#overview)
|
||||||
@@ -109,7 +119,7 @@ Target connectors:
|
|||||||
Three types of connectors:
|
Three types of connectors:
|
||||||
|
|
||||||
1. **Issuer Connector** — Obtains certificates from CAs. 12 built-in: Local CA (self-signed + sub-CA + tree mode; ADCS sub-CA mode is documented separately), ACME v2 (HTTP-01, DNS-01, DNS-PERSIST-01, ARI, EAB, profile selection), step-ca, OpenSSL/Custom CA, Vault PKI, DigiCert CertCentral, Sectigo SCM, Google CAS, AWS ACM Private CA, Entrust Certificate Services, GlobalSign Atlas HVCA, EJBCA (Keyfactor)
|
1. **Issuer Connector** — Obtains certificates from CAs. 12 built-in: Local CA (self-signed + sub-CA + tree mode; ADCS sub-CA mode is documented separately), ACME v2 (HTTP-01, DNS-01, DNS-PERSIST-01, ARI, EAB, profile selection), step-ca, OpenSSL/Custom CA, Vault PKI, DigiCert CertCentral, Sectigo SCM, Google CAS, AWS ACM Private CA, Entrust Certificate Services, GlobalSign Atlas HVCA, EJBCA (Keyfactor)
|
||||||
2. **Target Connector** — Deploys certificates to infrastructure. 15 built-in: NGINX, Apache httpd, HAProxy, Traefik, Caddy, Envoy, Postfix/Dovecot (dual-mode), IIS (local PowerShell + WinRM proxy), F5 BIG-IP (proxy agent), SSH (agentless), Windows Certificate Store, Java Keystore (JKS / PKCS#12), Kubernetes Secrets, AWS Certificate Manager, Azure Key Vault
|
2. **Target Connector** — Deploys certificates to infrastructure. 14 production-ready: NGINX, Apache httpd, HAProxy, Traefik, Caddy, Envoy, Postfix/Dovecot (dual-mode), IIS (local PowerShell + WinRM proxy), F5 BIG-IP (proxy agent), SSH (agentless), Windows Certificate Store, Java Keystore (JKS / PKCS#12), AWS Certificate Manager, Azure Key Vault. Plus Kubernetes Secrets shipped as preview — see the *Preview connectors* subsection above for the ACK gate.
|
||||||
3. **Notifier Connector** — Sends alerts about certificate events (Email, Webhooks, Slack, Microsoft Teams, PagerDuty, OpsGenie implemented)
|
3. **Notifier Connector** — Sends alerts about certificate events (Email, Webhooks, Slack, Microsoft Teams, PagerDuty, OpsGenie implemented)
|
||||||
|
|
||||||
All connectors accept JSON configuration at initialization, support config validation, and are registered in the service layer. Issuer connectors run on the control plane; target connectors run on agents. For network appliances where agents can't be installed, a **proxy agent** in the same network zone handles deployment — the server never initiates outbound connections.
|
All connectors accept JSON configuration at initialization, support config validation, and are registered in the service layer. Issuer connectors run on the control plane; target connectors run on agents. For network appliances where agents can't be installed, a **proxy agent** in the same network zone handles deployment — the server never initiates outbound connections.
|
||||||
|
|||||||
@@ -0,0 +1,111 @@
|
|||||||
|
# MCP ↔ REST API parity coverage
|
||||||
|
|
||||||
|
> Last reviewed: 2026-05-16
|
||||||
|
|
||||||
|
## What this file is
|
||||||
|
|
||||||
|
This is the canonical record of which certctl REST routes are exposed
|
||||||
|
as MCP (Model Context Protocol) tools, plus the explicit allowlist of
|
||||||
|
routes that are intentionally NOT exposed. The companion CI guard
|
||||||
|
`scripts/ci-guards/mcp-coverage-parity.sh` fails the build if a new
|
||||||
|
REST route lands without either an MCP tool wrapping it or an
|
||||||
|
explicit allowlist entry justifying the exclusion.
|
||||||
|
|
||||||
|
Before ARCH-004 (Sprint 4, 2026-05-16) the README said *"the full REST
|
||||||
|
API is exposed as MCP tools"* with no published coverage data. That
|
||||||
|
wording was an overclaim — see the audit trail in `git log --grep='ARCH-004'`.
|
||||||
|
|
||||||
|
## Current numbers
|
||||||
|
|
||||||
|
Re-derive at any time:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# REST routes registered by the router
|
||||||
|
grep -cE '^\s*r\.Register\(' internal/api/router/router.go
|
||||||
|
|
||||||
|
# MCP tools registered (counts gomcp.AddTool call sites)
|
||||||
|
grep -rcE 'gomcp\.AddTool' internal/mcp/ --include='*.go' \
|
||||||
|
| grep -v '_test.go' | awk -F: '{s+=$2} END{print s}'
|
||||||
|
```
|
||||||
|
|
||||||
|
At the most recent verification (2026-05-16): **221 routes / 162 tools**.
|
||||||
|
|
||||||
|
## Coverage categories
|
||||||
|
|
||||||
|
The gap between routes and tools is intentional and falls into four
|
||||||
|
named exclusion categories. Adding a new REST route in any of these
|
||||||
|
categories does NOT require a paired MCP tool — but it DOES require
|
||||||
|
an allowlist entry in the CI guard.
|
||||||
|
|
||||||
|
### 1. Protocol-conformance endpoints
|
||||||
|
|
||||||
|
Routes that implement a wire protocol an automated client (cert-manager,
|
||||||
|
certbot, lego, MS Intune, EST devices, OCSP responders, CRL fetchers)
|
||||||
|
talks to directly. These are not human-driven API calls; the MCP
|
||||||
|
"natural language → tool call" model doesn't fit them. The MCP server
|
||||||
|
SHOULD NOT wrap these because exposing them would invite operators to
|
||||||
|
ask an AI agent to "renew the cert via ACME" when the right answer is
|
||||||
|
"the ACME client your existing infra already runs handles that."
|
||||||
|
|
||||||
|
- `/acme/*` — RFC 8555 + RFC 9773 (ACME server)
|
||||||
|
- `/scep/*` — RFC 8894 (SCEP server, MS Intune)
|
||||||
|
- `/.well-known/est/*` — RFC 7030 (EST server)
|
||||||
|
- `/ocsp` — RFC 6960 (OCSP responder)
|
||||||
|
- `/.well-known/pki/crl/*` — RFC 5280 CRL distribution
|
||||||
|
|
||||||
|
### 2. Browser-only auth flow endpoints
|
||||||
|
|
||||||
|
OIDC SSO + CSRF + bootstrap routes that exist solely for the GUI's
|
||||||
|
session establishment dance. An MCP client should authenticate via
|
||||||
|
the same API-key Bearer path the REST callers use; exposing the
|
||||||
|
browser flow as a tool would be incoherent.
|
||||||
|
|
||||||
|
- `/auth/oidc/login`
|
||||||
|
- `/auth/oidc/callback`
|
||||||
|
- `/auth/oidc/back-channel-logout`
|
||||||
|
- `POST /api/v1/auth/bootstrap` (one-shot day-0 admin)
|
||||||
|
- `POST /api/v1/auth/login`, `POST /api/v1/auth/logout`
|
||||||
|
- `GET /api/v1/auth/csrf`
|
||||||
|
|
||||||
|
### 3. Liveness / readiness / version
|
||||||
|
|
||||||
|
Out of scope for natural-language workflows.
|
||||||
|
|
||||||
|
- `/health`
|
||||||
|
- `/ready`
|
||||||
|
- `/api/v1/version`
|
||||||
|
|
||||||
|
### 4. Streaming / binary download endpoints
|
||||||
|
|
||||||
|
The MCP tool contract is request → response JSON. Binary streaming
|
||||||
|
and chunked transfer don't fit the shape and would force lossy
|
||||||
|
encoding (base64-wrapped JSON blobs) the operator wouldn't actually
|
||||||
|
use through an AI assistant.
|
||||||
|
|
||||||
|
- `GET /api/v1/certificates/{id}/download` — raw PEM
|
||||||
|
- `GET /api/v1/certificates/{id}/chain` — chain PEM
|
||||||
|
- `GET /api/v1/intermediate-cas/{id}/cert` — raw cert
|
||||||
|
- `GET /api/v1/metrics/prometheus` — Prometheus text format
|
||||||
|
|
||||||
|
## How to add a new route
|
||||||
|
|
||||||
|
1. Add the route in `internal/api/router/router.go`.
|
||||||
|
2. Decide: should an AI assistant be able to invoke this?
|
||||||
|
- **Yes** → add a matching `gomcp.AddTool` call in `internal/mcp/`.
|
||||||
|
- **No** → confirm the route fits one of the four exclusion
|
||||||
|
categories above AND add an entry to the allowlist in
|
||||||
|
`scripts/ci-guards/mcp-coverage-parity.sh`.
|
||||||
|
3. The CI guard will fail until either branch is satisfied.
|
||||||
|
|
||||||
|
If the route doesn't fit any of the four categories and you don't
|
||||||
|
want it in MCP for another reason, name a fifth category in this
|
||||||
|
file and update the CI guard. The list is meant to grow with the
|
||||||
|
product, not contain it.
|
||||||
|
|
||||||
|
## Why this matters
|
||||||
|
|
||||||
|
certctl is sold to operators who'll use AI assistants to drive it.
|
||||||
|
"Most of the REST API" is a meaningful coverage claim; "the full REST
|
||||||
|
API" was not. Diligence reviewers and operators evaluating MCP-driven
|
||||||
|
workflows need the explicit gap surface — both to plan their
|
||||||
|
automation around the gap and to spot when the gap drifts.
|
||||||
@@ -4,12 +4,12 @@
|
|||||||
<!-- Re-run after adding or removing any t.Skip(). CI guard: -->
|
<!-- Re-run after adding or removing any t.Skip(). CI guard: -->
|
||||||
<!-- scripts/ci-guards/skip-inventory-drift.sh -->
|
<!-- scripts/ci-guards/skip-inventory-drift.sh -->
|
||||||
|
|
||||||
> Last reviewed: 2026-05-13
|
> Last reviewed: 2026-05-16
|
||||||
|
|
||||||
## Summary
|
## Summary
|
||||||
|
|
||||||
- Total t.Skip sites: **142**
|
- Total t.Skip sites: **147**
|
||||||
- testing.Short() guards: **76** (these gate behind `go test -short`)
|
- testing.Short() guards: **82** (these gate behind `go test -short`)
|
||||||
|
|
||||||
Re-run inventory with: `./scripts/skip-inventory.sh`.
|
Re-run inventory with: `./scripts/skip-inventory.sh`.
|
||||||
|
|
||||||
@@ -103,7 +103,7 @@ Re-run inventory with: `./scripts/skip-inventory.sh`.
|
|||||||
|
|
||||||
### `internal/auth/oidc/domain`
|
### `internal/auth/oidc/domain`
|
||||||
|
|
||||||
- `internal/auth/oidc/domain/types_test.go:186` — t.Skip()
|
- `internal/auth/oidc/domain/types_test.go:221` — t.Skip()
|
||||||
|
|
||||||
### `internal/auth/oidc`
|
### `internal/auth/oidc`
|
||||||
|
|
||||||
@@ -156,10 +156,15 @@ Re-run inventory with: `./scripts/skip-inventory.sh`.
|
|||||||
|
|
||||||
### `internal/ratelimit`
|
### `internal/ratelimit`
|
||||||
|
|
||||||
|
- `internal/ratelimit/equivalence_test.go:80` — t.Skip("race-style test under -short")
|
||||||
|
- `internal/ratelimit/equivalence_test.go:88` — t.Skip("postgres equivalence tests require testcontainers; skipped under -short")
|
||||||
- `internal/ratelimit/sliding_window_test.go:146` — t.Skip("race-style test under -short")
|
- `internal/ratelimit/sliding_window_test.go:146` — t.Skip("race-style test under -short")
|
||||||
|
|
||||||
### `internal/repository/postgres`
|
### `internal/repository/postgres`
|
||||||
|
|
||||||
|
- `internal/repository/postgres/audit_chain_test.go:137` — t.Skip("skipping integration test in short mode")
|
||||||
|
- `internal/repository/postgres/audit_chain_test.go:36` — t.Skip("skipping integration test in short mode")
|
||||||
|
- `internal/repository/postgres/audit_chain_test.go:58` — t.Skip("skipping integration test in short mode")
|
||||||
- `internal/repository/postgres/audit_worm_test.go:29` — t.Skip("skipping integration test in short mode")
|
- `internal/repository/postgres/audit_worm_test.go:29` — t.Skip("skipping integration test in short mode")
|
||||||
- `internal/repository/postgres/auth_revoke_scope_test.go:118` — t.Skip("integration test in short mode")
|
- `internal/repository/postgres/auth_revoke_scope_test.go:118` — t.Skip("integration test in short mode")
|
||||||
- `internal/repository/postgres/auth_revoke_scope_test.go:149` — t.Skip("integration test in short mode")
|
- `internal/repository/postgres/auth_revoke_scope_test.go:149` — t.Skip("integration test in short mode")
|
||||||
|
|||||||
@@ -23,12 +23,25 @@ require (
|
|||||||
github.com/leanovate/gopter v0.2.11
|
github.com/leanovate/gopter v0.2.11
|
||||||
github.com/masterzen/winrm v0.0.0-20250927112105-5f8e6c707321
|
github.com/masterzen/winrm v0.0.0-20250927112105-5f8e6c707321
|
||||||
github.com/pkg/sftp v1.13.10
|
github.com/pkg/sftp v1.13.10
|
||||||
|
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.43.0
|
||||||
|
go.opentelemetry.io/otel/sdk v1.43.0
|
||||||
golang.org/x/crypto v0.50.0
|
golang.org/x/crypto v0.50.0
|
||||||
golang.org/x/oauth2 v0.36.0
|
golang.org/x/oauth2 v0.36.0
|
||||||
golang.org/x/sync v0.20.0
|
golang.org/x/sync v0.20.0
|
||||||
software.sslmate.com/src/go-pkcs12 v0.7.0
|
software.sslmate.com/src/go-pkcs12 v0.7.0
|
||||||
)
|
)
|
||||||
|
|
||||||
|
require (
|
||||||
|
github.com/cenkalti/backoff/v5 v5.0.3 // indirect
|
||||||
|
github.com/grpc-ecosystem/grpc-gateway/v2 v2.28.0 // indirect
|
||||||
|
go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.43.0 // indirect
|
||||||
|
go.opentelemetry.io/proto/otlp v1.10.0 // indirect
|
||||||
|
google.golang.org/genproto/googleapis/api v0.0.0-20260504160031-60b97b32f348 // indirect
|
||||||
|
google.golang.org/genproto/googleapis/rpc v0.0.0-20260504160031-60b97b32f348 // indirect
|
||||||
|
google.golang.org/grpc v1.80.0 // indirect
|
||||||
|
google.golang.org/protobuf v1.36.11 // indirect
|
||||||
|
)
|
||||||
|
|
||||||
require (
|
require (
|
||||||
dario.cat/mergo v1.0.2 // indirect
|
dario.cat/mergo v1.0.2 // indirect
|
||||||
github.com/Azure/azure-sdk-for-go/sdk/internal v1.11.2 // indirect
|
github.com/Azure/azure-sdk-for-go/sdk/internal v1.11.2 // indirect
|
||||||
@@ -110,9 +123,9 @@ require (
|
|||||||
github.com/yusufpapurcu/wmi v1.2.4 // indirect
|
github.com/yusufpapurcu/wmi v1.2.4 // indirect
|
||||||
go.opentelemetry.io/auto/sdk v1.2.1 // indirect
|
go.opentelemetry.io/auto/sdk v1.2.1 // indirect
|
||||||
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.60.0 // indirect
|
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.60.0 // indirect
|
||||||
go.opentelemetry.io/otel v1.41.0 // indirect
|
go.opentelemetry.io/otel v1.43.0
|
||||||
go.opentelemetry.io/otel/metric v1.41.0 // indirect
|
go.opentelemetry.io/otel/metric v1.43.0 // indirect
|
||||||
go.opentelemetry.io/otel/trace v1.41.0 // indirect
|
go.opentelemetry.io/otel/trace v1.43.0 // indirect
|
||||||
golang.org/x/net v0.53.0 // indirect
|
golang.org/x/net v0.53.0 // indirect
|
||||||
golang.org/x/sys v0.43.0 // indirect
|
golang.org/x/sys v0.43.0 // indirect
|
||||||
golang.org/x/text v0.36.0 // indirect
|
golang.org/x/text v0.36.0 // indirect
|
||||||
|
|||||||
@@ -111,6 +111,8 @@ github.com/bodgit/windows v1.0.1 h1:tF7K6KOluPYygXa3Z2594zxlkbKPAOvqr97etrGNIz4=
|
|||||||
github.com/bodgit/windows v1.0.1/go.mod h1:a6JLwrB4KrTR5hBpp8FI9/9W9jJfeQ2h4XDXU74ZCdM=
|
github.com/bodgit/windows v1.0.1/go.mod h1:a6JLwrB4KrTR5hBpp8FI9/9W9jJfeQ2h4XDXU74ZCdM=
|
||||||
github.com/cenkalti/backoff/v4 v4.3.0 h1:MyRJ/UdXutAwSAT+s3wNd7MfTIcy71VQueUuFK343L8=
|
github.com/cenkalti/backoff/v4 v4.3.0 h1:MyRJ/UdXutAwSAT+s3wNd7MfTIcy71VQueUuFK343L8=
|
||||||
github.com/cenkalti/backoff/v4 v4.3.0/go.mod h1:Y3VNntkOUPxTVeUxJ/G5vcM//AlwfmyYozVcomhLiZE=
|
github.com/cenkalti/backoff/v4 v4.3.0/go.mod h1:Y3VNntkOUPxTVeUxJ/G5vcM//AlwfmyYozVcomhLiZE=
|
||||||
|
github.com/cenkalti/backoff/v5 v5.0.3 h1:ZN+IMa753KfX5hd8vVaMixjnqRZ3y8CuJKRKj1xcsSM=
|
||||||
|
github.com/cenkalti/backoff/v5 v5.0.3/go.mod h1:rkhZdG3JZukswDf7f0cwqPNk4K0sa+F97BxZthm/crw=
|
||||||
github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU=
|
github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU=
|
||||||
github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs=
|
github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs=
|
||||||
github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
|
github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
|
||||||
@@ -208,6 +210,8 @@ github.com/golang/protobuf v1.4.3/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw
|
|||||||
github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk=
|
github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk=
|
||||||
github.com/golang/protobuf v1.5.1/go.mod h1:DopwsBzvsk0Fs44TXzsVbJyPhcCPeIwnvohx4u74HPM=
|
github.com/golang/protobuf v1.5.1/go.mod h1:DopwsBzvsk0Fs44TXzsVbJyPhcCPeIwnvohx4u74HPM=
|
||||||
github.com/golang/protobuf v1.5.2/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY=
|
github.com/golang/protobuf v1.5.2/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY=
|
||||||
|
github.com/golang/protobuf v1.5.4 h1:i7eJL8qZTpSEXOPTxNKhASYpMn+8e5Q6AdndVa1dWek=
|
||||||
|
github.com/golang/protobuf v1.5.4/go.mod h1:lnTiLA8Wa4RWRcIUkrtSVa5nRhsEGBg48fD6rSs7xps=
|
||||||
github.com/google/btree v0.0.0-20180813153112-4030bb1f1f0c/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ=
|
github.com/google/btree v0.0.0-20180813153112-4030bb1f1f0c/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ=
|
||||||
github.com/google/btree v1.0.0/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ=
|
github.com/google/btree v1.0.0/go.mod h1:lNA+9X1NB3Zf8V7Ke586lFgjr2dZNuvo3lPJSGZ5JPQ=
|
||||||
github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M=
|
github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M=
|
||||||
@@ -254,6 +258,8 @@ github.com/gorilla/securecookie v1.1.1/go.mod h1:ra0sb63/xPlUeL+yeDciTfxMRAA+MP+
|
|||||||
github.com/gorilla/sessions v1.2.1 h1:DHd3rPN5lE3Ts3D8rKkQ8x/0kqfeNmBAaiSi+o7FsgI=
|
github.com/gorilla/sessions v1.2.1 h1:DHd3rPN5lE3Ts3D8rKkQ8x/0kqfeNmBAaiSi+o7FsgI=
|
||||||
github.com/gorilla/sessions v1.2.1/go.mod h1:dk2InVEVJ0sfLlnXv9EAgkf6ecYs/i80K/zI+bUmuGM=
|
github.com/gorilla/sessions v1.2.1/go.mod h1:dk2InVEVJ0sfLlnXv9EAgkf6ecYs/i80K/zI+bUmuGM=
|
||||||
github.com/grpc-ecosystem/grpc-gateway v1.16.0/go.mod h1:BDjrQk3hbvj6Nolgz8mAMFbcEtjT1g+wF4CSlocrBnw=
|
github.com/grpc-ecosystem/grpc-gateway v1.16.0/go.mod h1:BDjrQk3hbvj6Nolgz8mAMFbcEtjT1g+wF4CSlocrBnw=
|
||||||
|
github.com/grpc-ecosystem/grpc-gateway/v2 v2.28.0 h1:HWRh5R2+9EifMyIHV7ZV+MIZqgz+PMpZ14Jynv3O2Zs=
|
||||||
|
github.com/grpc-ecosystem/grpc-gateway/v2 v2.28.0/go.mod h1:JfhWUomR1baixubs02l85lZYYOm7LV6om4ceouMv45c=
|
||||||
github.com/hashicorp/consul/api v1.1.0/go.mod h1:VmuI/Lkw1nC05EYQWNKwWGbkg+FbDBtguAZLlVdkD9Q=
|
github.com/hashicorp/consul/api v1.1.0/go.mod h1:VmuI/Lkw1nC05EYQWNKwWGbkg+FbDBtguAZLlVdkD9Q=
|
||||||
github.com/hashicorp/consul/sdk v0.1.1/go.mod h1:VKf9jXwCTEY1QZP2MOLRhb5i/I/ssyNV1vwHyQBF0x8=
|
github.com/hashicorp/consul/sdk v0.1.1/go.mod h1:VKf9jXwCTEY1QZP2MOLRhb5i/I/ssyNV1vwHyQBF0x8=
|
||||||
github.com/hashicorp/errwrap v1.0.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4=
|
github.com/hashicorp/errwrap v1.0.0/go.mod h1:YH+1FKiLXxHSkmPseP+kNlulaMuP3n2brvKWEqk/Jc4=
|
||||||
@@ -461,17 +467,25 @@ go.opentelemetry.io/auto/sdk v1.2.1 h1:jXsnJ4Lmnqd11kwkBV2LgLoFMZKizbCi5fNZ/ipaZ
|
|||||||
go.opentelemetry.io/auto/sdk v1.2.1/go.mod h1:KRTj+aOaElaLi+wW1kO/DZRXwkF4C5xPbEe3ZiIhN7Y=
|
go.opentelemetry.io/auto/sdk v1.2.1/go.mod h1:KRTj+aOaElaLi+wW1kO/DZRXwkF4C5xPbEe3ZiIhN7Y=
|
||||||
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.60.0 h1:sbiXRNDSWJOTobXh5HyQKjq6wUC5tNybqjIqDpAY4CU=
|
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.60.0 h1:sbiXRNDSWJOTobXh5HyQKjq6wUC5tNybqjIqDpAY4CU=
|
||||||
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.60.0/go.mod h1:69uWxva0WgAA/4bu2Yy70SLDBwZXuQ6PbBpbsa5iZrQ=
|
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.60.0/go.mod h1:69uWxva0WgAA/4bu2Yy70SLDBwZXuQ6PbBpbsa5iZrQ=
|
||||||
go.opentelemetry.io/otel v1.41.0 h1:YlEwVsGAlCvczDILpUXpIpPSL/VPugt7zHThEMLce1c=
|
go.opentelemetry.io/otel v1.43.0 h1:mYIM03dnh5zfN7HautFE4ieIig9amkNANT+xcVxAj9I=
|
||||||
go.opentelemetry.io/otel v1.41.0/go.mod h1:Yt4UwgEKeT05QbLwbyHXEwhnjxNO6D8L5PQP51/46dE=
|
go.opentelemetry.io/otel v1.43.0/go.mod h1:JuG+u74mvjvcm8vj8pI5XiHy1zDeoCS2LB1spIq7Ay0=
|
||||||
go.opentelemetry.io/otel/metric v1.41.0 h1:rFnDcs4gRzBcsO9tS8LCpgR0dxg4aaxWlJxCno7JlTQ=
|
go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.43.0 h1:88Y4s2C8oTui1LGM6bTWkw0ICGcOLCAI5l6zsD1j20k=
|
||||||
go.opentelemetry.io/otel/metric v1.41.0/go.mod h1:xPvCwd9pU0VN8tPZYzDZV/BMj9CM9vs00GuBjeKhJps=
|
go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.43.0/go.mod h1:Vl1/iaggsuRlrHf/hfPJPvVag77kKyvrLeD10kpMl+A=
|
||||||
go.opentelemetry.io/otel/sdk v1.35.0 h1:iPctf8iprVySXSKJffSS79eOjl9pvxV9ZqOWT0QejKY=
|
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.43.0 h1:3iZJKlCZufyRzPzlQhUIWVmfltrXuGyfjREgGP3UUjc=
|
||||||
go.opentelemetry.io/otel/sdk v1.35.0/go.mod h1:+ga1bZliga3DxJ3CQGg3updiaAJoNECOgJREo9KHGQg=
|
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.43.0/go.mod h1:/G+nUPfhq2e+qiXMGxMwumDrP5jtzU+mWN7/sjT2rak=
|
||||||
go.opentelemetry.io/otel/sdk/metric v1.35.0 h1:1RriWBmCKgkeHEhM7a2uMjMUfP7MsOF5JpUCaEqEI9o=
|
go.opentelemetry.io/otel/metric v1.43.0 h1:d7638QeInOnuwOONPp4JAOGfbCEpYb+K6DVWvdxGzgM=
|
||||||
go.opentelemetry.io/otel/sdk/metric v1.35.0/go.mod h1:is6XYCUMpcKi+ZsOvfluY5YstFnhW0BidkR+gL+qN+w=
|
go.opentelemetry.io/otel/metric v1.43.0/go.mod h1:RDnPtIxvqlgO8GRW18W6Z/4P462ldprJtfxHxyKd2PY=
|
||||||
go.opentelemetry.io/otel/trace v1.41.0 h1:Vbk2co6bhj8L59ZJ6/xFTskY+tGAbOnCtQGVVa9TIN0=
|
go.opentelemetry.io/otel/sdk v1.43.0 h1:pi5mE86i5rTeLXqoF/hhiBtUNcrAGHLKQdhg4h4V9Dg=
|
||||||
go.opentelemetry.io/otel/trace v1.41.0/go.mod h1:U1NU4ULCoxeDKc09yCWdWe+3QoyweJcISEVa1RBzOis=
|
go.opentelemetry.io/otel/sdk v1.43.0/go.mod h1:P+IkVU3iWukmiit/Yf9AWvpyRDlUeBaRg6Y+C58QHzg=
|
||||||
|
go.opentelemetry.io/otel/sdk/metric v1.43.0 h1:S88dyqXjJkuBNLeMcVPRFXpRw2fuwdvfCGLEo89fDkw=
|
||||||
|
go.opentelemetry.io/otel/sdk/metric v1.43.0/go.mod h1:C/RJtwSEJ5hzTiUz5pXF1kILHStzb9zFlIEe85bhj6A=
|
||||||
|
go.opentelemetry.io/otel/trace v1.43.0 h1:BkNrHpup+4k4w+ZZ86CZoHHEkohws8AY+WTX09nk+3A=
|
||||||
|
go.opentelemetry.io/otel/trace v1.43.0/go.mod h1:/QJhyVBUUswCphDVxq+8mld+AvhXZLhe+8WVFxiFff0=
|
||||||
|
go.opentelemetry.io/proto/otlp v1.10.0 h1:IQRWgT5srOCYfiWnpqUYz9CVmbO8bFmKcwYxpuCSL2g=
|
||||||
|
go.opentelemetry.io/proto/otlp v1.10.0/go.mod h1:/CV4QoCR/S9yaPj8utp3lvQPoqMtxXdzn7ozvvozVqk=
|
||||||
go.uber.org/atomic v1.7.0/go.mod h1:fEN4uk6kAWBTFdckzkM89CLk9XfWZrxpCo0nPH17wJc=
|
go.uber.org/atomic v1.7.0/go.mod h1:fEN4uk6kAWBTFdckzkM89CLk9XfWZrxpCo0nPH17wJc=
|
||||||
|
go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto=
|
||||||
|
go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE=
|
||||||
go.uber.org/multierr v1.6.0/go.mod h1:cdWPpRnG4AhwMwsgIHip0KRBQjJy5kYEpYjJxpXp9iU=
|
go.uber.org/multierr v1.6.0/go.mod h1:cdWPpRnG4AhwMwsgIHip0KRBQjJy5kYEpYjJxpXp9iU=
|
||||||
go.uber.org/zap v1.17.0/go.mod h1:MXVU+bhUf/A7Xi2HNOnopQOrmycQ5Ih87HtOu4q5SSo=
|
go.uber.org/zap v1.17.0/go.mod h1:MXVU+bhUf/A7Xi2HNOnopQOrmycQ5Ih87HtOu4q5SSo=
|
||||||
golang.org/x/crypto v0.0.0-20181029021203-45a5f77698d3/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4=
|
golang.org/x/crypto v0.0.0-20181029021203-45a5f77698d3/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4=
|
||||||
@@ -731,6 +745,8 @@ golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8T
|
|||||||
golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
|
golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
|
||||||
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
|
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
|
||||||
golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
|
golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
|
||||||
|
gonum.org/v1/gonum v0.17.0 h1:VbpOemQlsSMrYmn7T2OUvQ4dqxQXU+ouZFQsZOx50z4=
|
||||||
|
gonum.org/v1/gonum v0.17.0/go.mod h1:El3tOrEuMpv2UdMrbNlKEh9vd86bmQ6vqIcDwxEOc1E=
|
||||||
google.golang.org/api v0.4.0/go.mod h1:8k5glujaEP+g9n7WNsDg8QP6cUVNI86fCNMcbazEtwE=
|
google.golang.org/api v0.4.0/go.mod h1:8k5glujaEP+g9n7WNsDg8QP6cUVNI86fCNMcbazEtwE=
|
||||||
google.golang.org/api v0.7.0/go.mod h1:WtwebWUNSVBH/HAw79HIFXZNqEvBhG+Ra+ax0hx3E3M=
|
google.golang.org/api v0.7.0/go.mod h1:WtwebWUNSVBH/HAw79HIFXZNqEvBhG+Ra+ax0hx3E3M=
|
||||||
google.golang.org/api v0.8.0/go.mod h1:o4eAsZoiT+ibD93RtjEohWalFOjRDx6CVaqeizhEnKg=
|
google.golang.org/api v0.8.0/go.mod h1:o4eAsZoiT+ibD93RtjEohWalFOjRDx6CVaqeizhEnKg=
|
||||||
@@ -801,6 +817,10 @@ google.golang.org/genproto v0.0.0-20210310155132-4ce2db91004e/go.mod h1:FWY/as6D
|
|||||||
google.golang.org/genproto v0.0.0-20210319143718-93e7006c17a6/go.mod h1:FWY/as6DDZQgahTzZj3fqbO1CbirC29ZNUFHwi0/+no=
|
google.golang.org/genproto v0.0.0-20210319143718-93e7006c17a6/go.mod h1:FWY/as6DDZQgahTzZj3fqbO1CbirC29ZNUFHwi0/+no=
|
||||||
google.golang.org/genproto v0.0.0-20210402141018-6c239bbf2bb1/go.mod h1:9lPAdzaEmUacj36I+k7YKbEc5CXzPIeORRgDAUOu28A=
|
google.golang.org/genproto v0.0.0-20210402141018-6c239bbf2bb1/go.mod h1:9lPAdzaEmUacj36I+k7YKbEc5CXzPIeORRgDAUOu28A=
|
||||||
google.golang.org/genproto v0.0.0-20210602131652-f16073e35f0c/go.mod h1:UODoCrxHCcBojKKwX1terBiRUaqAsFqJiF615XL43r0=
|
google.golang.org/genproto v0.0.0-20210602131652-f16073e35f0c/go.mod h1:UODoCrxHCcBojKKwX1terBiRUaqAsFqJiF615XL43r0=
|
||||||
|
google.golang.org/genproto/googleapis/api v0.0.0-20260504160031-60b97b32f348 h1:U8orV30l6KpDsi9dxU0CoJZGbjS8EEpw+6ba+XwGPQA=
|
||||||
|
google.golang.org/genproto/googleapis/api v0.0.0-20260504160031-60b97b32f348/go.mod h1:Yzdzr5OOZFgSsEV2D/Xi9NL3bszpXFAg0hFJiRohcD8=
|
||||||
|
google.golang.org/genproto/googleapis/rpc v0.0.0-20260504160031-60b97b32f348 h1:pfIbyB44sWzHiCpRqIen67ZQnVXSfIxWrqUMk1qwODE=
|
||||||
|
google.golang.org/genproto/googleapis/rpc v0.0.0-20260504160031-60b97b32f348/go.mod h1:4Hqkh8ycfw05ld/3BWL7rJOSfebL2Q+DVDeRgYgxUU8=
|
||||||
google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c=
|
google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c=
|
||||||
google.golang.org/grpc v1.20.1/go.mod h1:10oTOabMzJvdu6/UiuZezV6QK5dSlG84ov/aaiqXj38=
|
google.golang.org/grpc v1.20.1/go.mod h1:10oTOabMzJvdu6/UiuZezV6QK5dSlG84ov/aaiqXj38=
|
||||||
google.golang.org/grpc v1.21.1/go.mod h1:oYelfM1adQP15Ek0mdvEgi9Df8B9CZIaU1084ijfRaM=
|
google.golang.org/grpc v1.21.1/go.mod h1:oYelfM1adQP15Ek0mdvEgi9Df8B9CZIaU1084ijfRaM=
|
||||||
@@ -821,6 +841,8 @@ google.golang.org/grpc v1.35.0/go.mod h1:qjiiYl8FncCW8feJPdyg3v6XW24KsRHe+dy9BAG
|
|||||||
google.golang.org/grpc v1.36.0/go.mod h1:qjiiYl8FncCW8feJPdyg3v6XW24KsRHe+dy9BAGRRjU=
|
google.golang.org/grpc v1.36.0/go.mod h1:qjiiYl8FncCW8feJPdyg3v6XW24KsRHe+dy9BAGRRjU=
|
||||||
google.golang.org/grpc v1.36.1/go.mod h1:qjiiYl8FncCW8feJPdyg3v6XW24KsRHe+dy9BAGRRjU=
|
google.golang.org/grpc v1.36.1/go.mod h1:qjiiYl8FncCW8feJPdyg3v6XW24KsRHe+dy9BAGRRjU=
|
||||||
google.golang.org/grpc v1.38.0/go.mod h1:NREThFqKR1f3iQ6oBuvc5LadQuXVGo9rkm5ZGrQdJfM=
|
google.golang.org/grpc v1.38.0/go.mod h1:NREThFqKR1f3iQ6oBuvc5LadQuXVGo9rkm5ZGrQdJfM=
|
||||||
|
google.golang.org/grpc v1.80.0 h1:Xr6m2WmWZLETvUNvIUmeD5OAagMw3FiKmMlTdViWsHM=
|
||||||
|
google.golang.org/grpc v1.80.0/go.mod h1:ho/dLnxwi3EDJA4Zghp7k2Ec1+c2jqup0bFkw07bwF4=
|
||||||
google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8=
|
google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8=
|
||||||
google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0=
|
google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0=
|
||||||
google.golang.org/protobuf v0.0.0-20200228230310-ab0ca4ff8a60/go.mod h1:cfTl7dwQJ+fmap5saPgwCLgHXTUD7jkjRqWcaiX5VyM=
|
google.golang.org/protobuf v0.0.0-20200228230310-ab0ca4ff8a60/go.mod h1:cfTl7dwQJ+fmap5saPgwCLgHXTUD7jkjRqWcaiX5VyM=
|
||||||
@@ -833,6 +855,8 @@ google.golang.org/protobuf v1.24.0/go.mod h1:r/3tXBNzIEhYS9I1OUVjXDlt8tc493IdKGj
|
|||||||
google.golang.org/protobuf v1.25.0/go.mod h1:9JNX74DMeImyA3h4bdi1ymwjUzf21/xIlbajtzgsN7c=
|
google.golang.org/protobuf v1.25.0/go.mod h1:9JNX74DMeImyA3h4bdi1ymwjUzf21/xIlbajtzgsN7c=
|
||||||
google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw=
|
google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw=
|
||||||
google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc=
|
google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc=
|
||||||
|
google.golang.org/protobuf v1.36.11 h1:fV6ZwhNocDyBLK0dj+fg8ektcVegBBuEolpbTQyBNVE=
|
||||||
|
google.golang.org/protobuf v1.36.11/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco=
|
||||||
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
|
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
|
||||||
gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
|
gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
|
||||||
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=
|
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=
|
||||||
|
|||||||
+124
-3
@@ -201,7 +201,35 @@ check_privileges() {
|
|||||||
fi
|
fi
|
||||||
}
|
}
|
||||||
|
|
||||||
# Download agent binary from GitHub Releases
|
# Download + verify agent binary from GitHub Releases.
|
||||||
|
#
|
||||||
|
# Acquisition-audit RED-007 closure (Sprint 7 ACQ, 2026-05-16). Pre-
|
||||||
|
# 2026-05-16 the script downloaded the binary with no integrity check
|
||||||
|
# — a tampered binary on the release surface, a MITM downgrade
|
||||||
|
# (HTTPS already prevents in-flight tampering but a compromised
|
||||||
|
# release-asset upload would not surface here), or a misnamed asset
|
||||||
|
# would all install silently. The download path now performs two
|
||||||
|
# independent verifications:
|
||||||
|
#
|
||||||
|
# 1. SHA-256 against the published checksums.txt sidecar
|
||||||
|
# (.github/workflows/release.yml aggregate-checksums job).
|
||||||
|
# sha256sum is in coreutils on Linux; macOS ships `shasum`,
|
||||||
|
# which we fall back to.
|
||||||
|
# 2. Cosign keyless verify against the project's GitHub OIDC
|
||||||
|
# identity (sigstore/cosign-installer pinned in release.yml).
|
||||||
|
# The signature bundle is the `<binary>.sigstore.json` sibling
|
||||||
|
# asset every release publishes. Cosign verify is OPTIONAL
|
||||||
|
# when the operator doesn't have cosign installed — the
|
||||||
|
# script logs a clear WARN and proceeds; operators in
|
||||||
|
# regulated environments MUST install cosign first
|
||||||
|
# (curl -sSL https://github.com/sigstore/cosign/releases/...)
|
||||||
|
# and re-run.
|
||||||
|
#
|
||||||
|
# Both verifications happen against the temp file BEFORE
|
||||||
|
# install_binary copies it to $INSTALL_DIR. A failed checksum
|
||||||
|
# rejects the install. A failed cosign verify also rejects the
|
||||||
|
# install. Either rejection rm -f's the temp file and exits 1.
|
||||||
|
#
|
||||||
# IMPORTANT: main() captures this function's stdout via `binary_path=$(download_binary)`,
|
# IMPORTANT: main() captures this function's stdout via `binary_path=$(download_binary)`,
|
||||||
# so every status/error message MUST go to stderr (>&2). Only the final
|
# so every status/error message MUST go to stderr (>&2). Only the final
|
||||||
# `echo "$temp_file"` is allowed on stdout — that's the return value.
|
# `echo "$temp_file"` is allowed on stdout — that's the return value.
|
||||||
@@ -222,16 +250,109 @@ download_binary() {
|
|||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
local temp_file
|
local temp_file temp_sigstore temp_checksums
|
||||||
temp_file=$(mktemp)
|
temp_file=$(mktemp)
|
||||||
|
temp_sigstore=$(mktemp --suffix=.sigstore.json 2>/dev/null || mktemp -t sigstore)
|
||||||
|
temp_checksums=$(mktemp)
|
||||||
|
|
||||||
if ! curl -sSL -f "$download_url" -o "$temp_file" >&2; then
|
if ! curl -sSL -f "$download_url" -o "$temp_file" >&2; then
|
||||||
rm -f "$temp_file"
|
rm -f "$temp_file" "$temp_sigstore" "$temp_checksums"
|
||||||
echo -e "${RED}Error: Failed to download binary from $download_url${NC}" >&2
|
echo -e "${RED}Error: Failed to download binary from $download_url${NC}" >&2
|
||||||
echo "Make sure the latest release exists on GitHub with the binary asset for ${OS_TYPE}-${ARCH_TYPE}." >&2
|
echo "Make sure the latest release exists on GitHub with the binary asset for ${OS_TYPE}-${ARCH_TYPE}." >&2
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
# ---- SHA-256 verify against the release-published checksums.txt ----
|
||||||
|
#
|
||||||
|
# Every release publishes a single checksums.txt (sha256sum format) +
|
||||||
|
# a cosign signature on it (checksums.txt.sigstore.json). Downloading
|
||||||
|
# via the same RELEASE_URL keeps the integrity chain rooted at the
|
||||||
|
# GitHub-release surface (not a sibling CDN), so a release-asset
|
||||||
|
# tamper that leaves checksums.txt intact is caught by the hash comparison.
|
||||||
|
echo -e "${YELLOW}Downloading checksums.txt for SHA-256 verification...${NC}" >&2
|
||||||
|
if ! curl -sSL -f "${RELEASE_URL}/checksums.txt" -o "$temp_checksums" >&2; then
|
||||||
|
rm -f "$temp_file" "$temp_sigstore" "$temp_checksums"
|
||||||
|
echo -e "${RED}Error: Failed to download checksums.txt from ${RELEASE_URL}.${NC}" >&2
|
||||||
|
echo "The agent binary cannot be installed without integrity verification." >&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Look up the binary's expected hash in the checksums file.
|
||||||
|
local expected_hash
|
||||||
|
expected_hash=$(awk -v name="$binary_name" '$2 == name {print $1; exit}' "$temp_checksums")
|
||||||
|
if [[ -z "$expected_hash" ]]; then
|
||||||
|
rm -f "$temp_file" "$temp_sigstore" "$temp_checksums"
|
||||||
|
echo -e "${RED}Error: checksums.txt has no entry for $binary_name.${NC}" >&2
|
||||||
|
echo "The release surface is inconsistent — refusing to install." >&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
local actual_hash sha_tool
|
||||||
|
if command -v sha256sum &> /dev/null; then
|
||||||
|
sha_tool="sha256sum"
|
||||||
|
actual_hash=$(sha256sum "$temp_file" | awk '{print $1}')
|
||||||
|
elif command -v shasum &> /dev/null; then
|
||||||
|
sha_tool="shasum -a 256"
|
||||||
|
actual_hash=$(shasum -a 256 "$temp_file" | awk '{print $1}')
|
||||||
|
else
|
||||||
|
rm -f "$temp_file" "$temp_sigstore" "$temp_checksums"
|
||||||
|
echo -e "${RED}Error: neither sha256sum nor shasum is installed.${NC}" >&2
|
||||||
|
echo "Install coreutils (Linux) or shasum (macOS) and re-run." >&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [[ "$actual_hash" != "$expected_hash" ]]; then
|
||||||
|
rm -f "$temp_file" "$temp_sigstore" "$temp_checksums"
|
||||||
|
echo -e "${RED}Error: SHA-256 mismatch for $binary_name (tool: $sha_tool).${NC}" >&2
|
||||||
|
echo " expected: $expected_hash" >&2
|
||||||
|
echo " actual: $actual_hash" >&2
|
||||||
|
echo "The downloaded binary does NOT match the release-published checksum." >&2
|
||||||
|
echo "Refusing to install. Re-run after investigating the release surface." >&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
echo -e "${GREEN}SHA-256 verified ($sha_tool):${NC} $actual_hash" >&2
|
||||||
|
|
||||||
|
# ---- Cosign keyless verify (OPTIONAL — warn-mode if absent) ----
|
||||||
|
#
|
||||||
|
# The release publishes <binary>.sigstore.json next to each binary,
|
||||||
|
# signed via sigstore/cosign-installer keyless mode against the
|
||||||
|
# GitHub Actions OIDC identity for the certctl-io/certctl repo
|
||||||
|
# (see .github/workflows/release.yml). Cosign verify with the
|
||||||
|
# certificate-identity-regexp + certificate-oidc-issuer pair
|
||||||
|
# pins the signature to the repo's release workflow — a malicious
|
||||||
|
# asset signed under a different identity fails the verify.
|
||||||
|
if command -v cosign &> /dev/null; then
|
||||||
|
echo -e "${YELLOW}Cosign keyless-verifying binary signature...${NC}" >&2
|
||||||
|
if ! curl -sSL -f "${download_url}.sigstore.json" -o "$temp_sigstore" >&2; then
|
||||||
|
rm -f "$temp_file" "$temp_sigstore" "$temp_checksums"
|
||||||
|
echo -e "${RED}Error: Failed to download cosign signature from ${download_url}.sigstore.json.${NC}" >&2
|
||||||
|
echo "Either the release surface is broken or this binary predates the cosign-signed releases. Refusing to install." >&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
if ! COSIGN_EXPERIMENTAL=1 cosign verify-blob \
|
||||||
|
--bundle "$temp_sigstore" \
|
||||||
|
--certificate-identity-regexp "^https://github.com/${GITHUB_REPO}/" \
|
||||||
|
--certificate-oidc-issuer "https://token.actions.githubusercontent.com" \
|
||||||
|
"$temp_file" >&2; then
|
||||||
|
rm -f "$temp_file" "$temp_sigstore" "$temp_checksums"
|
||||||
|
echo -e "${RED}Error: cosign verify-blob failed for $binary_name.${NC}" >&2
|
||||||
|
echo "The binary is NOT signed by the expected GitHub Actions OIDC identity." >&2
|
||||||
|
echo "Refusing to install. This is the load-bearing supply-chain check." >&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
echo -e "${GREEN}Cosign signature verified${NC} (identity matches ${GITHUB_REPO} release workflow)" >&2
|
||||||
|
else
|
||||||
|
echo -e "${YELLOW}WARNING:${NC} cosign is not installed — SKIPPING signature verification." >&2
|
||||||
|
echo " SHA-256 verification above is still in force, but the cosign signature" >&2
|
||||||
|
echo " ties the binary to the certctl-io/certctl release workflow's OIDC" >&2
|
||||||
|
echo " identity — the load-bearing supply-chain check. Operators in regulated" >&2
|
||||||
|
echo " environments MUST install cosign and re-run:" >&2
|
||||||
|
echo " curl -sSL https://github.com/sigstore/cosign/releases/latest/download/cosign-${OS_TYPE}-${ARCH_TYPE} -o /usr/local/bin/cosign" >&2
|
||||||
|
echo " chmod +x /usr/local/bin/cosign" >&2
|
||||||
|
echo " Continuing with SHA-256 verification only." >&2
|
||||||
|
fi
|
||||||
|
|
||||||
|
rm -f "$temp_sigstore" "$temp_checksums"
|
||||||
chmod +x "$temp_file"
|
chmod +x "$temp_file"
|
||||||
echo "$temp_file"
|
echo "$temp_file"
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -28,6 +28,18 @@ type AuditService interface {
|
|||||||
// empty string returns all categories. Used by the auditor role
|
// empty string returns all categories. Used by the auditor role
|
||||||
// (filtered to "auth" via /v1/audit?category=auth).
|
// (filtered to "auth" via /v1/audit?category=auth).
|
||||||
ListAuditEventsByCategory(ctx context.Context, eventCategory string, page, perPage int) ([]domain.AuditEvent, int64, error)
|
ListAuditEventsByCategory(ctx context.Context, eventCategory string, page, perPage int) ([]domain.AuditEvent, int64, error)
|
||||||
|
// ListAuditEventsByFilter (P-H2 closure, frontend-design-audit
|
||||||
|
// 2026-05-14) returns audit rows constrained by an optional time
|
||||||
|
// range AND optional category. Zero time.Time on either bound
|
||||||
|
// disables that bound. The repository already pushes the
|
||||||
|
// predicate into SQL (timestamp >=/<= since/until); this method
|
||||||
|
// just threads handler-parsed `since` / `until` query params
|
||||||
|
// through to the filter. Frontend (AuditPage) drops the pre-P-H2
|
||||||
|
// client-side time filter ("fetches the entire event window,
|
||||||
|
// throws 99% away in JS") and sends since/until directly. MCP's
|
||||||
|
// certctl_audit_list_with_category tool already advertised these
|
||||||
|
// params; this closure makes that advertised contract truthful.
|
||||||
|
ListAuditEventsByFilter(ctx context.Context, since, until time.Time, eventCategory string, page, perPage int) ([]domain.AuditEvent, int64, error)
|
||||||
// ExportEventsByFilter returns audit events matching a
|
// ExportEventsByFilter returns audit events matching a
|
||||||
// (from, to, eventCategory) filter, capped at maxRows. Audit
|
// (from, to, eventCategory) filter, capped at maxRows. Audit
|
||||||
// 2026-05-10 HIGH-11 closure — backs the new
|
// 2026-05-10 HIGH-11 closure — backs the new
|
||||||
@@ -53,12 +65,29 @@ func NewAuditHandler(svc AuditService) AuditHandler {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// ListAuditEvents lists audit events.
|
// ListAuditEvents lists audit events.
|
||||||
// GET /api/v1/audit?page=1&per_page=50&category=auth
|
// GET /api/v1/audit?page=1&per_page=50&category=auth&since=<RFC3339>&until=<RFC3339>
|
||||||
//
|
//
|
||||||
// Bundle 1 Phase 8 adds the optional `category` query parameter for
|
// Bundle 1 Phase 8 added the optional `category` query parameter for
|
||||||
// auditor-role filtering. Allowed values: cert_lifecycle, auth, config.
|
// auditor-role filtering. Allowed values: cert_lifecycle, auth, config.
|
||||||
// Unknown values surface 400 so misuse is caught loud (instead of
|
// Unknown values surface 400 so misuse is caught loud (instead of
|
||||||
// silently returning all rows).
|
// silently returning all rows).
|
||||||
|
//
|
||||||
|
// P-H2 closure (frontend-design-audit 2026-05-14) adds the optional
|
||||||
|
// `since` / `until` time-range query parameters. Both accept RFC3339
|
||||||
|
// (e.g. "2026-04-01T00:00:00Z"). Either bound can be omitted to leave
|
||||||
|
// that side open-ended. The repository already pushes the timestamp
|
||||||
|
// predicate into the SQL query, and migration 000032's
|
||||||
|
// (event_category, timestamp DESC) composite index makes the
|
||||||
|
// predicate hit an index scan rather than a sequential scan.
|
||||||
|
//
|
||||||
|
// Note on naming: this endpoint uses `since` / `until` to match the
|
||||||
|
// existing MCP `certctl_audit_list_with_category` tool's published
|
||||||
|
// contract (internal/mcp/tools_audit_fix.go:174) and the audit-text
|
||||||
|
// framing of the P-H2 finding. The sibling /api/v1/audit/export
|
||||||
|
// endpoint uses `from` / `to` for compliance-window semantics
|
||||||
|
// (required, ≤ 90-day range, NDJSON streaming); the two endpoints
|
||||||
|
// share data but have different param semantics and the names were
|
||||||
|
// chosen to reflect that.
|
||||||
func (h AuditHandler) ListAuditEvents(w http.ResponseWriter, r *http.Request) {
|
func (h AuditHandler) ListAuditEvents(w http.ResponseWriter, r *http.Request) {
|
||||||
if r.Method != http.MethodGet {
|
if r.Method != http.MethodGet {
|
||||||
Error(w, http.StatusMethodNotAllowed, "Method not allowed")
|
Error(w, http.StatusMethodNotAllowed, "Method not allowed")
|
||||||
@@ -93,16 +122,39 @@ func (h AuditHandler) ListAuditEvents(w http.ResponseWriter, r *http.Request) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
var (
|
// P-H2: optional time-range bounds. RFC3339 parse with explicit
|
||||||
events []domain.AuditEvent
|
// 400 on malformed input — silently dropping a malformed `since`
|
||||||
total int64
|
// would be worse than rejecting it (operator gets unfiltered
|
||||||
err error
|
// results when they thought they were filtering).
|
||||||
)
|
var since, until time.Time
|
||||||
if category != "" {
|
if s := query.Get("since"); s != "" {
|
||||||
events, total, err = h.svc.ListAuditEventsByCategory(r.Context(), category, page, perPage)
|
parsed, err := time.Parse(time.RFC3339, s)
|
||||||
} else {
|
if err != nil {
|
||||||
events, total, err = h.svc.ListAuditEvents(r.Context(), page, perPage)
|
ErrorWithRequestID(w, http.StatusBadRequest,
|
||||||
|
"`since` must be RFC3339 (e.g. 2026-04-01T00:00:00Z)",
|
||||||
|
requestID)
|
||||||
|
return
|
||||||
}
|
}
|
||||||
|
since = parsed
|
||||||
|
}
|
||||||
|
if u := query.Get("until"); u != "" {
|
||||||
|
parsed, err := time.Parse(time.RFC3339, u)
|
||||||
|
if err != nil {
|
||||||
|
ErrorWithRequestID(w, http.StatusBadRequest,
|
||||||
|
"`until` must be RFC3339 (e.g. 2026-05-01T00:00:00Z)",
|
||||||
|
requestID)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
until = parsed
|
||||||
|
}
|
||||||
|
if !since.IsZero() && !until.IsZero() && !until.After(since) {
|
||||||
|
ErrorWithRequestID(w, http.StatusBadRequest,
|
||||||
|
"`until` must be after `since`",
|
||||||
|
requestID)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
events, total, err := h.svc.ListAuditEventsByFilter(r.Context(), since, until, category, page, perPage)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
ErrorWithRequestID(w, http.StatusInternalServerError, "Failed to list audit events", requestID)
|
ErrorWithRequestID(w, http.StatusInternalServerError, "Failed to list audit events", requestID)
|
||||||
return
|
return
|
||||||
|
|||||||
@@ -17,11 +17,16 @@ import (
|
|||||||
type mockAuditService struct {
|
type mockAuditService struct {
|
||||||
listFunc func(page, perPage int) ([]domain.AuditEvent, int64, error)
|
listFunc func(page, perPage int) ([]domain.AuditEvent, int64, error)
|
||||||
listByCatFunc func(category string, page, perPage int) ([]domain.AuditEvent, int64, error)
|
listByCatFunc func(category string, page, perPage int) ([]domain.AuditEvent, int64, error)
|
||||||
|
listByFiltFunc func(since, until time.Time, category string, page, perPage int) ([]domain.AuditEvent, int64, error)
|
||||||
getFunc func(id string) (*domain.AuditEvent, error)
|
getFunc func(id string) (*domain.AuditEvent, error)
|
||||||
// HIGH-11 self-audit trace — last RecordEventWithCategory call.
|
// HIGH-11 self-audit trace — last RecordEventWithCategory call.
|
||||||
lastAuditActor string
|
lastAuditActor string
|
||||||
lastAuditAction string
|
lastAuditAction string
|
||||||
lastAuditCategory string
|
lastAuditCategory string
|
||||||
|
// P-H2 trace — last ListAuditEventsByFilter args.
|
||||||
|
lastFilterSince time.Time
|
||||||
|
lastFilterUntil time.Time
|
||||||
|
lastFilterCategory string
|
||||||
}
|
}
|
||||||
|
|
||||||
func (m *mockAuditService) ListAuditEvents(_ context.Context, page, perPage int) ([]domain.AuditEvent, int64, error) {
|
func (m *mockAuditService) ListAuditEvents(_ context.Context, page, perPage int) ([]domain.AuditEvent, int64, error) {
|
||||||
@@ -41,6 +46,27 @@ func (m *mockAuditService) ListAuditEventsByCategory(_ context.Context, category
|
|||||||
return nil, 0, nil
|
return nil, 0, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ListAuditEventsByFilter satisfies the P-H2 interface extension. The
|
||||||
|
// test fixture remembers the (since, until, category) tuple so
|
||||||
|
// per-subtest assertions can pin that the handler threaded the
|
||||||
|
// query-string params through correctly. Falls back to listFunc /
|
||||||
|
// listByCatFunc so existing tests don't need to set listByFiltFunc.
|
||||||
|
func (m *mockAuditService) ListAuditEventsByFilter(_ context.Context, since, until time.Time, category string, page, perPage int) ([]domain.AuditEvent, int64, error) {
|
||||||
|
m.lastFilterSince = since
|
||||||
|
m.lastFilterUntil = until
|
||||||
|
m.lastFilterCategory = category
|
||||||
|
if m.listByFiltFunc != nil {
|
||||||
|
return m.listByFiltFunc(since, until, category, page, perPage)
|
||||||
|
}
|
||||||
|
if category != "" && m.listByCatFunc != nil {
|
||||||
|
return m.listByCatFunc(category, page, perPage)
|
||||||
|
}
|
||||||
|
if m.listFunc != nil {
|
||||||
|
return m.listFunc(page, perPage)
|
||||||
|
}
|
||||||
|
return nil, 0, nil
|
||||||
|
}
|
||||||
|
|
||||||
func (m *mockAuditService) GetAuditEvent(_ context.Context, id string) (*domain.AuditEvent, error) {
|
func (m *mockAuditService) GetAuditEvent(_ context.Context, id string) (*domain.AuditEvent, error) {
|
||||||
if m.getFunc != nil {
|
if m.getFunc != nil {
|
||||||
return m.getFunc(id)
|
return m.getFunc(id)
|
||||||
@@ -325,6 +351,153 @@ func TestListAuditEvents_MethodNotAllowed(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ── P-H2 closure (since / until time-range query params) ───────────
|
||||||
|
|
||||||
|
// TestListAuditEvents_WithSinceUntil pins the happy path — both bounds
|
||||||
|
// supplied in RFC3339, mock observes them threaded into the service
|
||||||
|
// call, response is 200.
|
||||||
|
func TestListAuditEvents_WithSinceUntil(t *testing.T) {
|
||||||
|
since := time.Date(2026, 4, 1, 0, 0, 0, 0, time.UTC)
|
||||||
|
until := time.Date(2026, 5, 1, 0, 0, 0, 0, time.UTC)
|
||||||
|
|
||||||
|
mockSvc := &mockAuditService{
|
||||||
|
listByFiltFunc: func(s, u time.Time, _ string, _, _ int) ([]domain.AuditEvent, int64, error) {
|
||||||
|
if !s.Equal(since) {
|
||||||
|
t.Errorf("service since = %v, want %v", s, since)
|
||||||
|
}
|
||||||
|
if !u.Equal(until) {
|
||||||
|
t.Errorf("service until = %v, want %v", u, until)
|
||||||
|
}
|
||||||
|
return []domain.AuditEvent{}, 0, nil
|
||||||
|
},
|
||||||
|
}
|
||||||
|
handler := NewAuditHandler(mockSvc)
|
||||||
|
|
||||||
|
url := "/api/v1/audit?since=" + since.Format(time.RFC3339) + "&until=" + until.Format(time.RFC3339)
|
||||||
|
req, err := http.NewRequest(http.MethodGet, url, nil)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("NewRequest failed: %v", err)
|
||||||
|
}
|
||||||
|
ctx := context.WithValue(req.Context(), middleware.RequestIDKey{}, "test-req-id")
|
||||||
|
req = req.WithContext(ctx)
|
||||||
|
|
||||||
|
w := httptest.NewRecorder()
|
||||||
|
handler.ListAuditEvents(w, req)
|
||||||
|
|
||||||
|
if w.Code != http.StatusOK {
|
||||||
|
t.Errorf("status = %d, want 200; body=%s", w.Code, w.Body.String())
|
||||||
|
}
|
||||||
|
if !mockSvc.lastFilterSince.Equal(since) {
|
||||||
|
t.Errorf("mock recorded since = %v, want %v", mockSvc.lastFilterSince, since)
|
||||||
|
}
|
||||||
|
if !mockSvc.lastFilterUntil.Equal(until) {
|
||||||
|
t.Errorf("mock recorded until = %v, want %v", mockSvc.lastFilterUntil, until)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestListAuditEvents_SinceOnly pins one-sided bound — only `since`
|
||||||
|
// supplied, `until` stays zero. Closure of "operator filters to events
|
||||||
|
// from the last hour" via since=<now-1h>.
|
||||||
|
func TestListAuditEvents_SinceOnly(t *testing.T) {
|
||||||
|
since := time.Date(2026, 4, 1, 0, 0, 0, 0, time.UTC)
|
||||||
|
mockSvc := &mockAuditService{}
|
||||||
|
handler := NewAuditHandler(mockSvc)
|
||||||
|
|
||||||
|
req, _ := http.NewRequest(http.MethodGet, "/api/v1/audit?since="+since.Format(time.RFC3339), nil)
|
||||||
|
ctx := context.WithValue(req.Context(), middleware.RequestIDKey{}, "test-req-id")
|
||||||
|
req = req.WithContext(ctx)
|
||||||
|
|
||||||
|
w := httptest.NewRecorder()
|
||||||
|
handler.ListAuditEvents(w, req)
|
||||||
|
|
||||||
|
if w.Code != http.StatusOK {
|
||||||
|
t.Errorf("status = %d, want 200; body=%s", w.Code, w.Body.String())
|
||||||
|
}
|
||||||
|
if !mockSvc.lastFilterSince.Equal(since) {
|
||||||
|
t.Errorf("since = %v, want %v", mockSvc.lastFilterSince, since)
|
||||||
|
}
|
||||||
|
if !mockSvc.lastFilterUntil.IsZero() {
|
||||||
|
t.Errorf("until = %v, want zero (open-ended)", mockSvc.lastFilterUntil)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestListAuditEvents_InvalidSince pins the parse-error 400 path.
|
||||||
|
// Silently dropping a malformed since would return ALL rows when the
|
||||||
|
// operator thought they were filtering — worse than rejecting.
|
||||||
|
func TestListAuditEvents_InvalidSince(t *testing.T) {
|
||||||
|
mockSvc := &mockAuditService{}
|
||||||
|
handler := NewAuditHandler(mockSvc)
|
||||||
|
|
||||||
|
req, _ := http.NewRequest(http.MethodGet, "/api/v1/audit?since=not-a-date", nil)
|
||||||
|
ctx := context.WithValue(req.Context(), middleware.RequestIDKey{}, "test-req-id")
|
||||||
|
req = req.WithContext(ctx)
|
||||||
|
|
||||||
|
w := httptest.NewRecorder()
|
||||||
|
handler.ListAuditEvents(w, req)
|
||||||
|
|
||||||
|
if w.Code != http.StatusBadRequest {
|
||||||
|
t.Errorf("status = %d, want 400; body=%s", w.Code, w.Body.String())
|
||||||
|
}
|
||||||
|
if !mockSvc.lastFilterSince.IsZero() {
|
||||||
|
t.Error("service should NOT have been called on bad since")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestListAuditEvents_UntilBeforeSince pins the order assertion — a
|
||||||
|
// reversed range surfaces 400, doesn't quietly return empty.
|
||||||
|
func TestListAuditEvents_UntilBeforeSince(t *testing.T) {
|
||||||
|
since := time.Date(2026, 5, 1, 0, 0, 0, 0, time.UTC)
|
||||||
|
until := time.Date(2026, 4, 1, 0, 0, 0, 0, time.UTC)
|
||||||
|
|
||||||
|
mockSvc := &mockAuditService{}
|
||||||
|
handler := NewAuditHandler(mockSvc)
|
||||||
|
|
||||||
|
url := "/api/v1/audit?since=" + since.Format(time.RFC3339) + "&until=" + until.Format(time.RFC3339)
|
||||||
|
req, _ := http.NewRequest(http.MethodGet, url, nil)
|
||||||
|
ctx := context.WithValue(req.Context(), middleware.RequestIDKey{}, "test-req-id")
|
||||||
|
req = req.WithContext(ctx)
|
||||||
|
|
||||||
|
w := httptest.NewRecorder()
|
||||||
|
handler.ListAuditEvents(w, req)
|
||||||
|
|
||||||
|
if w.Code != http.StatusBadRequest {
|
||||||
|
t.Errorf("status = %d, want 400; body=%s", w.Code, w.Body.String())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestListAuditEvents_TimeRangePlusCategory pins that since/until
|
||||||
|
// compose with category (the auditor-role narrow-to-auth use case
|
||||||
|
// extended to "auth events from yesterday" without a separate
|
||||||
|
// endpoint).
|
||||||
|
func TestListAuditEvents_TimeRangePlusCategory(t *testing.T) {
|
||||||
|
since := time.Date(2026, 4, 1, 0, 0, 0, 0, time.UTC)
|
||||||
|
until := time.Date(2026, 5, 1, 0, 0, 0, 0, time.UTC)
|
||||||
|
|
||||||
|
mockSvc := &mockAuditService{}
|
||||||
|
handler := NewAuditHandler(mockSvc)
|
||||||
|
|
||||||
|
url := "/api/v1/audit?category=auth&since=" + since.Format(time.RFC3339) + "&until=" + until.Format(time.RFC3339)
|
||||||
|
req, _ := http.NewRequest(http.MethodGet, url, nil)
|
||||||
|
ctx := context.WithValue(req.Context(), middleware.RequestIDKey{}, "test-req-id")
|
||||||
|
req = req.WithContext(ctx)
|
||||||
|
|
||||||
|
w := httptest.NewRecorder()
|
||||||
|
handler.ListAuditEvents(w, req)
|
||||||
|
|
||||||
|
if w.Code != http.StatusOK {
|
||||||
|
t.Errorf("status = %d, want 200; body=%s", w.Code, w.Body.String())
|
||||||
|
}
|
||||||
|
if mockSvc.lastFilterCategory != "auth" {
|
||||||
|
t.Errorf("category = %q, want auth", mockSvc.lastFilterCategory)
|
||||||
|
}
|
||||||
|
if !mockSvc.lastFilterSince.Equal(since) {
|
||||||
|
t.Errorf("since = %v, want %v", mockSvc.lastFilterSince, since)
|
||||||
|
}
|
||||||
|
if !mockSvc.lastFilterUntil.Equal(until) {
|
||||||
|
t.Errorf("until = %v, want %v", mockSvc.lastFilterUntil, until)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestGetAuditEvent_Success(t *testing.T) {
|
func TestGetAuditEvent_Success(t *testing.T) {
|
||||||
event := &domain.AuditEvent{
|
event := &domain.AuditEvent{
|
||||||
ID: "ev-123",
|
ID: "ev-123",
|
||||||
|
|||||||
@@ -78,7 +78,7 @@ type AuthBreakglassHandler struct {
|
|||||||
// nil-safe: when unset, the handler skips the limiter check and
|
// nil-safe: when unset, the handler skips the limiter check and
|
||||||
// relies on the service-layer Argon2id lockout. Production deploys
|
// relies on the service-layer Argon2id lockout. Production deploys
|
||||||
// MUST set this via SetLoginRateLimiter.
|
// MUST set this via SetLoginRateLimiter.
|
||||||
loginLimiter *ratelimit.SlidingWindowLimiter
|
loginLimiter ratelimit.Limiter
|
||||||
}
|
}
|
||||||
|
|
||||||
// NewAuthBreakglassHandler constructs the handler.
|
// NewAuthBreakglassHandler constructs the handler.
|
||||||
@@ -89,7 +89,7 @@ func NewAuthBreakglassHandler(svc BreakglassService, cookieAttrs SessionCookieAt
|
|||||||
// SetLoginRateLimiter wires the per-source-IP rate limiter the Login
|
// SetLoginRateLimiter wires the per-source-IP rate limiter the Login
|
||||||
// handler enforces. Bundle 5 closure (S1) — see the AuthBreakglassHandler
|
// handler enforces. Bundle 5 closure (S1) — see the AuthBreakglassHandler
|
||||||
// type docstring for the full rationale.
|
// type docstring for the full rationale.
|
||||||
func (h *AuthBreakglassHandler) SetLoginRateLimiter(l *ratelimit.SlidingWindowLimiter) {
|
func (h *AuthBreakglassHandler) SetLoginRateLimiter(l ratelimit.Limiter) {
|
||||||
h.loginLimiter = l
|
h.loginLimiter = l
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -14,6 +14,7 @@ import (
|
|||||||
|
|
||||||
gooidc "github.com/coreos/go-oidc/v3/oidc"
|
gooidc "github.com/coreos/go-oidc/v3/oidc"
|
||||||
|
|
||||||
|
oidcsvc "github.com/certctl-io/certctl/internal/auth/oidc"
|
||||||
oidcdomain "github.com/certctl-io/certctl/internal/auth/oidc/domain"
|
oidcdomain "github.com/certctl-io/certctl/internal/auth/oidc/domain"
|
||||||
"github.com/certctl-io/certctl/internal/repository"
|
"github.com/certctl-io/certctl/internal/repository"
|
||||||
)
|
)
|
||||||
@@ -122,7 +123,13 @@ func (v *DefaultBCLVerifier) Verify(ctx context.Context, logoutToken string) (is
|
|||||||
if v.verifyOverride != nil {
|
if v.verifyOverride != nil {
|
||||||
idToken, err = v.verifyOverride(ctx, matched.IssuerURL, logoutToken)
|
idToken, err = v.verifyOverride(ctx, matched.IssuerURL, logoutToken)
|
||||||
} else {
|
} else {
|
||||||
provider, perr := gooidc.NewProvider(ctx, matched.IssuerURL)
|
// Acquisition-audit SEC-021 closure (Sprint 1 follow-up to SEC-001,
|
||||||
|
// 2026-05-16). Per-request discovery re-fetch threaded through
|
||||||
|
// SafeOIDCContext so the dial-time SSRF guard
|
||||||
|
// (validation.SafeHTTPDialContext) re-resolves the issuer host and
|
||||||
|
// refuses reserved-address answers — matching the SEC-001 sweep
|
||||||
|
// over the runtime + dry-run discovery legs in internal/auth/oidc.
|
||||||
|
provider, perr := gooidc.NewProvider(oidcsvc.SafeOIDCContext(ctx), matched.IssuerURL)
|
||||||
if perr != nil {
|
if perr != nil {
|
||||||
return "", "", "", "", 0, fmt.Errorf("provider discovery: %w", perr)
|
return "", "", "", "", 0, fmt.Errorf("provider discovery: %w", perr)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -0,0 +1,77 @@
|
|||||||
|
// Copyright 2026 certctl LLC. All rights reserved.
|
||||||
|
// SPDX-License-Identifier: BUSL-1.1
|
||||||
|
|
||||||
|
package handler
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"encoding/base64"
|
||||||
|
"strings"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
oidcdomain "github.com/certctl-io/certctl/internal/auth/oidc/domain"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Acquisition-audit SEC-021 closure (Sprint 1 follow-up to SEC-001,
|
||||||
|
// 2026-05-16). DefaultBCLVerifier.Verify performs a per-request
|
||||||
|
// discovery re-fetch via gooidc.NewProvider(ctx, matched.IssuerURL).
|
||||||
|
// Pre-fix, the bare ctx fell through to http.DefaultClient at the dial
|
||||||
|
// layer — no SSRF guard, no DNS-rebinding re-resolve. The fix wraps
|
||||||
|
// ctx via oidcsvc.SafeOIDCContext so the dial-time
|
||||||
|
// validation.SafeHTTPDialContext refuses reserved-address answers
|
||||||
|
// (loopback / link-local / cloud-metadata).
|
||||||
|
//
|
||||||
|
// This test pins the wrap end-to-end:
|
||||||
|
//
|
||||||
|
// 1. Construct a stubProviderRepo with one provider whose IssuerURL is
|
||||||
|
// a literal-loopback http:// URL (the literal-IP class that
|
||||||
|
// SafeHTTPDialContext.isReservedIPForDial refuses up-front, before
|
||||||
|
// any DNS resolution attempt).
|
||||||
|
// 2. Hand-roll a 3-segment JWT whose payload base64url-decodes to
|
||||||
|
// {"iss":"<loopback url>"} so peekIssuer extracts the matching
|
||||||
|
// issuer and provs.List() returns the seeded provider.
|
||||||
|
// 3. Call Verify. The discovery NewProvider call now routes through
|
||||||
|
// SafeOIDCContext; SafeHTTPDialContext sees the literal 127.0.0.1
|
||||||
|
// and refuses with "refusing to dial reserved address <ip>".
|
||||||
|
// 4. Assert the returned error wraps that rejection (substring match
|
||||||
|
// on "refusing to dial" / "reserved address") rather than a
|
||||||
|
// generic connect-refused or "did not respond" wrap.
|
||||||
|
//
|
||||||
|
// Companion to TestFetchUserinfoGroups_SSRF_BlocksReservedAddress in
|
||||||
|
// internal/auth/oidc/service_test.go which exercises the same wrap on
|
||||||
|
// the userinfo-fallback leg. Together they pin the post-SEC-001 sweep.
|
||||||
|
func TestDefaultBCLVerifier_SSRF_BlocksReservedAddress(t *testing.T) {
|
||||||
|
// Literal-loopback issuer URL. Port :1 keeps the URL syntactically
|
||||||
|
// valid; SafeHTTPDialContext refuses on the literal-IP check before
|
||||||
|
// the dial-time TCP connect, so the port choice is moot.
|
||||||
|
const reservedIssuer = "http://127.0.0.1:1"
|
||||||
|
|
||||||
|
provs := &stubProviderRepo{
|
||||||
|
provs: []*oidcdomain.OIDCProvider{
|
||||||
|
{ID: "op-loopback", IssuerURL: reservedIssuer, ClientID: "test-client"},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
v := NewDefaultBCLVerifier(provs, "t-default", nil)
|
||||||
|
|
||||||
|
// Hand-roll the JWT. peekIssuer (see auth_session_oidc_bcl.go) parses
|
||||||
|
// only the iss claim from the 2nd segment (payload), so the header +
|
||||||
|
// signature segments only need to be syntactically present.
|
||||||
|
header := base64.RawURLEncoding.EncodeToString([]byte(`{"alg":"RS256"}`))
|
||||||
|
payload := base64.RawURLEncoding.EncodeToString([]byte(`{"iss":"` + reservedIssuer + `"}`))
|
||||||
|
logoutToken := header + "." + payload + ".sig"
|
||||||
|
|
||||||
|
_, _, _, _, _, err := v.Verify(context.Background(), logoutToken)
|
||||||
|
if err == nil {
|
||||||
|
t.Fatal("Verify against literal-loopback issuer URL: expected SSRF reject; got nil")
|
||||||
|
}
|
||||||
|
msg := err.Error()
|
||||||
|
if !strings.Contains(msg, "refusing to dial") && !strings.Contains(msg, "reserved address") {
|
||||||
|
t.Errorf("Verify err = %q; want SafeHTTPDialContext reserved-address rejection", msg)
|
||||||
|
}
|
||||||
|
// Also confirm the error is wrapped through the Verify "provider
|
||||||
|
// discovery:" prefix so callers can distinguish a discovery-time
|
||||||
|
// dial failure from a signature-verification failure.
|
||||||
|
if !strings.Contains(msg, "provider discovery") {
|
||||||
|
t.Errorf("Verify err = %q; want \"provider discovery:\" wrap", msg)
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -255,6 +255,14 @@ func (s *stubUserRepo) ListAll(_ context.Context, _ string) ([]*userdomain.User,
|
|||||||
return nil, nil
|
return nil, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ListDeactivatedBefore satisfies the Sprint 6 COMP-002-RETENTION
|
||||||
|
// interface addition. The phase-5 OIDC handler tests don't exercise
|
||||||
|
// retention paths, so an empty result keeps the contract without
|
||||||
|
// changing test semantics.
|
||||||
|
func (s *stubUserRepo) ListDeactivatedBefore(_ context.Context, _ time.Time) ([]*userdomain.User, error) {
|
||||||
|
return nil, nil
|
||||||
|
}
|
||||||
|
|
||||||
type phase5StubAudit struct {
|
type phase5StubAudit struct {
|
||||||
events []string
|
events []string
|
||||||
// Audit 2026-05-11 Fix 13 — capture the details map so the
|
// Audit 2026-05-11 Fix 13 — capture the details map so the
|
||||||
|
|||||||
@@ -83,6 +83,20 @@ func (s *stubFullUserRepo) ListAll(_ context.Context, tenantID string) ([]*userd
|
|||||||
return out, nil
|
return out, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ListDeactivatedBefore satisfies the Sprint 6 COMP-002-RETENTION
|
||||||
|
// interface addition. Walk rows, filter by DeactivatedAt-before-threshold.
|
||||||
|
// Order is intentionally not stabilised — the auth_users handler tests
|
||||||
|
// don't exercise the retention loop.
|
||||||
|
func (s *stubFullUserRepo) ListDeactivatedBefore(_ context.Context, threshold time.Time) ([]*userdomain.User, error) {
|
||||||
|
var out []*userdomain.User
|
||||||
|
for _, u := range s.rows {
|
||||||
|
if u.DeactivatedAt != nil && u.DeactivatedAt.Before(threshold) {
|
||||||
|
out = append(out, u)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return out, nil
|
||||||
|
}
|
||||||
|
|
||||||
// stubRevoker records cascade-revoke calls.
|
// stubRevoker records cascade-revoke calls.
|
||||||
type stubRevoker struct {
|
type stubRevoker struct {
|
||||||
called bool
|
called bool
|
||||||
|
|||||||
@@ -52,7 +52,7 @@ type CertificateService interface {
|
|||||||
// CertificateHandler handles HTTP requests for certificate operations.
|
// CertificateHandler handles HTTP requests for certificate operations.
|
||||||
type CertificateHandler struct {
|
type CertificateHandler struct {
|
||||||
svc CertificateService
|
svc CertificateService
|
||||||
ocspLimiter *ratelimit.SlidingWindowLimiter // production hardening II Phase 3 — per-source-IP cap on OCSP
|
ocspLimiter ratelimit.Limiter // production hardening II Phase 3 — per-source-IP cap on OCSP
|
||||||
}
|
}
|
||||||
|
|
||||||
// NewCertificateHandler creates a new CertificateHandler with a service dependency.
|
// NewCertificateHandler creates a new CertificateHandler with a service dependency.
|
||||||
@@ -65,7 +65,7 @@ func NewCertificateHandler(svc CertificateService) CertificateHandler {
|
|||||||
// cmd/server/main.go): 1000 req/min/IP. Setting to nil disables the
|
// cmd/server/main.go): 1000 req/min/IP. Setting to nil disables the
|
||||||
// limit; the limiter's own NewSlidingWindowLimiter(maxN<=0, ...)
|
// limit; the limiter's own NewSlidingWindowLimiter(maxN<=0, ...)
|
||||||
// also produces a no-op limiter, so the env-var-zero case is safe.
|
// also produces a no-op limiter, so the env-var-zero case is safe.
|
||||||
func (h *CertificateHandler) SetOCSPRateLimiter(l *ratelimit.SlidingWindowLimiter) {
|
func (h *CertificateHandler) SetOCSPRateLimiter(l ratelimit.Limiter) {
|
||||||
h.ocspLimiter = l
|
h.ocspLimiter = l
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -100,13 +100,13 @@ type ESTHandler struct {
|
|||||||
// EST RFC 7030 hardening Phase 3.3: per-handler source-IP rate
|
// EST RFC 7030 hardening Phase 3.3: per-handler source-IP rate
|
||||||
// limiter for FAILED HTTP Basic auth attempts. Keyed by sourceIP so
|
// limiter for FAILED HTTP Basic auth attempts. Keyed by sourceIP so
|
||||||
// a hostile network segment can't burn through the password.
|
// a hostile network segment can't burn through the password.
|
||||||
failedBasicLimiter *ratelimit.SlidingWindowLimiter
|
failedBasicLimiter ratelimit.Limiter
|
||||||
|
|
||||||
// EST RFC 7030 hardening Phase 4.2: per-handler per-principal sliding-
|
// EST RFC 7030 hardening Phase 4.2: per-handler per-principal sliding-
|
||||||
// window rate limit. Keyed by (CSR-CN, sourceIP) so a stolen
|
// window rate limit. Keyed by (CSR-CN, sourceIP) so a stolen
|
||||||
// bootstrap cert AND a known device CN can't be used to flood the
|
// bootstrap cert AND a known device CN can't be used to flood the
|
||||||
// issuer. Disabled when nil; configured per-profile.
|
// issuer. Disabled when nil; configured per-profile.
|
||||||
perPrincipalLimiter *ratelimit.SlidingWindowLimiter
|
perPrincipalLimiter ratelimit.Limiter
|
||||||
|
|
||||||
// labelForLog gives observability code a per-profile string to
|
// labelForLog gives observability code a per-profile string to
|
||||||
// include in audit log lines / Prometheus labels. Defaults to
|
// include in audit log lines / Prometheus labels. Defaults to
|
||||||
@@ -170,7 +170,7 @@ func (h *ESTHandler) SetEnrollmentPassword(pw string) { h.basicPassword = pw }
|
|||||||
// rate limiter. Phase 3.3. Disabled when nil — but Validate() at
|
// rate limiter. Phase 3.3. Disabled when nil — but Validate() at
|
||||||
// startup refuses an enabled basic-auth profile without a configured
|
// startup refuses an enabled basic-auth profile without a configured
|
||||||
// limiter, so a real deploy always wires one.
|
// limiter, so a real deploy always wires one.
|
||||||
func (h *ESTHandler) SetSourceIPRateLimiter(l *ratelimit.SlidingWindowLimiter) {
|
func (h *ESTHandler) SetSourceIPRateLimiter(l ratelimit.Limiter) {
|
||||||
h.failedBasicLimiter = l
|
h.failedBasicLimiter = l
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -179,7 +179,7 @@ func (h *ESTHandler) SetSourceIPRateLimiter(l *ratelimit.SlidingWindowLimiter) {
|
|||||||
// every successful enrollment, NOT just failures — the goal is to
|
// every successful enrollment, NOT just failures — the goal is to
|
||||||
// bound enrollment-flooding from a compromised credential, not just
|
// bound enrollment-flooding from a compromised credential, not just
|
||||||
// failed-auth brute force.
|
// failed-auth brute force.
|
||||||
func (h *ESTHandler) SetPerPrincipalRateLimiter(l *ratelimit.SlidingWindowLimiter) {
|
func (h *ESTHandler) SetPerPrincipalRateLimiter(l ratelimit.Limiter) {
|
||||||
h.perPrincipalLimiter = l
|
h.perPrincipalLimiter = l
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -28,7 +28,7 @@ type ExportService interface {
|
|||||||
// ExportHandler handles HTTP requests for certificate export operations.
|
// ExportHandler handles HTTP requests for certificate export operations.
|
||||||
type ExportHandler struct {
|
type ExportHandler struct {
|
||||||
svc ExportService
|
svc ExportService
|
||||||
exportLimiter *ratelimit.SlidingWindowLimiter // production hardening II Phase 3
|
exportLimiter ratelimit.Limiter // production hardening II Phase 3
|
||||||
}
|
}
|
||||||
|
|
||||||
// NewExportHandler creates a new ExportHandler with a service dependency.
|
// NewExportHandler creates a new ExportHandler with a service dependency.
|
||||||
@@ -40,7 +40,7 @@ func NewExportHandler(svc ExportService) ExportHandler {
|
|||||||
// Production hardening II Phase 3. Default cap (when set in
|
// Production hardening II Phase 3. Default cap (when set in
|
||||||
// cmd/server/main.go): 50 exports/hr/operator. Setting to nil
|
// cmd/server/main.go): 50 exports/hr/operator. Setting to nil
|
||||||
// disables the limit.
|
// disables the limit.
|
||||||
func (h *ExportHandler) SetExportRateLimiter(l *ratelimit.SlidingWindowLimiter) {
|
func (h *ExportHandler) SetExportRateLimiter(l ratelimit.Limiter) {
|
||||||
h.exportLimiter = l
|
h.exportLimiter = l
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -102,6 +102,20 @@ type ExpiryAlertSnapshotter interface {
|
|||||||
SnapshotExpiryAlerts() []service.ExpiryAlertSnapshotEntry
|
SnapshotExpiryAlerts() []service.ExpiryAlertSnapshotEntry
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// AuditChainCounterSnapshotter is the surface MetricsHandler consumes
|
||||||
|
// to emit the Sprint 6 COMP-001-HASH tamper-evidence counters:
|
||||||
|
//
|
||||||
|
// certctl_audit_chain_break_detected_total counter
|
||||||
|
// certctl_audit_chain_verify_total counter
|
||||||
|
// certctl_audit_chain_rows gauge
|
||||||
|
// certctl_audit_chain_last_verified_at gauge (unix seconds)
|
||||||
|
//
|
||||||
|
// *service.AuditChainCounter satisfies this. nil disables emission;
|
||||||
|
// cmd/server/main.go wires the instance at startup.
|
||||||
|
type AuditChainCounterSnapshotter interface {
|
||||||
|
Snapshot() service.AuditChainSnapshot
|
||||||
|
}
|
||||||
|
|
||||||
// MetricsHandler handles HTTP requests for metrics.
|
// MetricsHandler handles HTTP requests for metrics.
|
||||||
// Supports both JSON format (GET /api/v1/metrics) and Prometheus exposition format
|
// Supports both JSON format (GET /api/v1/metrics) and Prometheus exposition format
|
||||||
// (GET /api/v1/metrics/prometheus) for integration with Prometheus, Grafana, Datadog, etc.
|
// (GET /api/v1/metrics/prometheus) for integration with Prometheus, Grafana, Datadog, etc.
|
||||||
@@ -129,6 +143,10 @@ type MetricsHandler struct {
|
|||||||
// 2026-05-03 Infisical deep-research deliverable. nil disables
|
// 2026-05-03 Infisical deep-research deliverable. nil disables
|
||||||
// emission of certctl_expiry_alerts_total{channel,threshold,result}.
|
// emission of certctl_expiry_alerts_total{channel,threshold,result}.
|
||||||
expiryAlerts ExpiryAlertSnapshotter
|
expiryAlerts ExpiryAlertSnapshotter
|
||||||
|
// Sprint 6 COMP-001-HASH tamper-evidence counters. nil disables
|
||||||
|
// emission of certctl_audit_chain_* metrics. *service.AuditChainCounter
|
||||||
|
// is the production wiring; cmd/server/main.go sets this at startup.
|
||||||
|
auditChainCounter AuditChainCounterSnapshotter
|
||||||
}
|
}
|
||||||
|
|
||||||
// NewMetricsHandler creates a new MetricsHandler with a service dependency.
|
// NewMetricsHandler creates a new MetricsHandler with a service dependency.
|
||||||
@@ -177,6 +195,14 @@ func (h *MetricsHandler) SetExpiryAlerts(c ExpiryAlertSnapshotter) {
|
|||||||
h.expiryAlerts = c
|
h.expiryAlerts = c
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// SetAuditChainCounter wires the Sprint 6 COMP-001-HASH tamper-evidence
|
||||||
|
// counters for the Prometheus exposition. nil disables the block.
|
||||||
|
// The counter is also passed to scheduler.SetAuditChainBreakRecorder so
|
||||||
|
// the verify loop writes to the same instance the handler reads.
|
||||||
|
func (h *MetricsHandler) SetAuditChainCounter(c AuditChainCounterSnapshotter) {
|
||||||
|
h.auditChainCounter = c
|
||||||
|
}
|
||||||
|
|
||||||
// MetricsResponse represents the JSON metrics response for V2.
|
// MetricsResponse represents the JSON metrics response for V2.
|
||||||
type MetricsResponse struct {
|
type MetricsResponse struct {
|
||||||
Gauge MetricsGauge `json:"gauge"`
|
Gauge MetricsGauge `json:"gauge"`
|
||||||
@@ -523,6 +549,29 @@ func (h MetricsHandler) GetPrometheusMetrics(w http.ResponseWriter, r *http.Requ
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Sprint 6 COMP-001-HASH tamper-evidence counters. Emitted as four
|
||||||
|
// adjacent series so an alert rule can fire on any non-zero
|
||||||
|
// certctl_audit_chain_break_detected_total (the operator-actionable
|
||||||
|
// signal — see docs/operator/audit-chain.md).
|
||||||
|
if h.auditChainCounter != nil {
|
||||||
|
snap := h.auditChainCounter.Snapshot()
|
||||||
|
fmt.Fprintf(w, "\n# HELP certctl_audit_chain_break_detected_total Number of audit_events hash-chain breaks detected (Sprint 6 COMP-001-HASH).\n")
|
||||||
|
fmt.Fprintf(w, "# TYPE certctl_audit_chain_break_detected_total counter\n")
|
||||||
|
fmt.Fprintf(w, "certctl_audit_chain_break_detected_total %d\n", snap.BreaksDetected)
|
||||||
|
|
||||||
|
fmt.Fprintf(w, "# HELP certctl_audit_chain_verify_total Number of audit_events_verify_chain() walks completed by the scheduler.\n")
|
||||||
|
fmt.Fprintf(w, "# TYPE certctl_audit_chain_verify_total counter\n")
|
||||||
|
fmt.Fprintf(w, "certctl_audit_chain_verify_total %d\n", snap.WalksCompleted)
|
||||||
|
|
||||||
|
fmt.Fprintf(w, "# HELP certctl_audit_chain_rows Most recent walk's row count (gauge — last-write-wins).\n")
|
||||||
|
fmt.Fprintf(w, "# TYPE certctl_audit_chain_rows gauge\n")
|
||||||
|
fmt.Fprintf(w, "certctl_audit_chain_rows %d\n", snap.LastRowCount)
|
||||||
|
|
||||||
|
fmt.Fprintf(w, "# HELP certctl_audit_chain_last_verified_at Unix seconds of most recent walk (0 = never).\n")
|
||||||
|
fmt.Fprintf(w, "# TYPE certctl_audit_chain_last_verified_at gauge\n")
|
||||||
|
fmt.Fprintf(w, "certctl_audit_chain_last_verified_at %d\n", snap.LastVerifiedAtUnix)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// formatLE formats a histogram bucket boundary the way Prometheus
|
// formatLE formats a histogram bucket boundary the way Prometheus
|
||||||
|
|||||||
@@ -170,6 +170,14 @@ func (r *intuneE2EAuditRepo) List(_ context.Context, _ *repository.AuditFilter)
|
|||||||
return nil, nil
|
return nil, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// VerifyHashChain satisfies the Sprint 6 COMP-001-HASH interface
|
||||||
|
// addition. In-memory stub: always clean.
|
||||||
|
func (r *intuneE2EAuditRepo) VerifyHashChain(_ context.Context) (string, int, int, error) {
|
||||||
|
r.mu.Lock()
|
||||||
|
defer r.mu.Unlock()
|
||||||
|
return "", -1, len(r.events), nil
|
||||||
|
}
|
||||||
|
|
||||||
func (r *intuneE2EAuditRepo) actions() []string {
|
func (r *intuneE2EAuditRepo) actions() []string {
|
||||||
r.mu.Lock()
|
r.mu.Lock()
|
||||||
defer r.mu.Unlock()
|
defer r.mu.Unlock()
|
||||||
|
|||||||
@@ -241,6 +241,35 @@ func (r *etagRecorder) writeHeadersToWire() {
|
|||||||
if r.bodyTruncated && r.headerWrittenOnWire {
|
if r.bodyTruncated && r.headerWrittenOnWire {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
// Hotfix #12 (CodeQL alert #34 — go/reflected-xss): defense-in-
|
||||||
|
// depth Content-Type guard. This middleware is wired ONLY to JSON
|
||||||
|
// list endpoints (GET /api/v1/{certificates,agents,jobs,audit,
|
||||||
|
// discovered-certificates} — see internal/api/router/router.go).
|
||||||
|
// Every wrapped handler currently sets Content-Type:
|
||||||
|
// application/json via handler.JSON() before the first Write. But
|
||||||
|
// the recorder is a generic byte forwarder; CodeQL's data-flow
|
||||||
|
// query sees `r.ResponseWriter.Write(b)` at the sink and can't
|
||||||
|
// see that the wrapped handler set a non-HTML Content-Type — so
|
||||||
|
// it flags reflected-XSS even though browsers don't render
|
||||||
|
// application/json as HTML. The fix is to make the Content-Type
|
||||||
|
// guarantee explicit at the chokepoint: if the wrapped handler
|
||||||
|
// forgot to set Content-Type, default to application/json +
|
||||||
|
// charset=utf-8 here. Behavior-preserving for the 5 current
|
||||||
|
// handlers (they all set Content-Type) and a safe guard against
|
||||||
|
// a future handler bug that would otherwise let the browser
|
||||||
|
// content-sniff a JSON body as text/html.
|
||||||
|
//
|
||||||
|
// Drop the embedded-field selector for Header() — etagRecorder
|
||||||
|
// doesn't override Header(), so r.Header() resolves to the
|
||||||
|
// embedded ResponseWriter.Header() (staticcheck QF1008). The
|
||||||
|
// neighboring r.ResponseWriter.WriteHeader / r.ResponseWriter.Write
|
||||||
|
// calls intentionally KEEP the explicit selector because
|
||||||
|
// etagRecorder.Write / etagRecorder.WriteHeader override them
|
||||||
|
// and the embedded form is required to bypass recursion.
|
||||||
|
hdr := r.Header()
|
||||||
|
if hdr.Get("Content-Type") == "" {
|
||||||
|
hdr.Set("Content-Type", "application/json; charset=utf-8")
|
||||||
|
}
|
||||||
r.ResponseWriter.WriteHeader(r.status)
|
r.ResponseWriter.WriteHeader(r.status)
|
||||||
r.headerWrittenOnWire = true
|
r.headerWrittenOnWire = true
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -11,6 +11,7 @@ import (
|
|||||||
"net/http"
|
"net/http"
|
||||||
"strings"
|
"strings"
|
||||||
"sync"
|
"sync"
|
||||||
|
"sync/atomic"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"github.com/google/uuid"
|
"github.com/google/uuid"
|
||||||
@@ -152,6 +153,14 @@ type RateLimitConfig struct {
|
|||||||
// PerUserBurstSize overrides BurstSize for authenticated callers.
|
// PerUserBurstSize overrides BurstSize for authenticated callers.
|
||||||
// Zero means "use BurstSize".
|
// Zero means "use BurstSize".
|
||||||
PerUserBurstSize int
|
PerUserBurstSize int
|
||||||
|
|
||||||
|
// BucketTTL bounds the lifetime of an unused token bucket in the
|
||||||
|
// per-key map. The background sweeper runs every (BucketTTL/4) and
|
||||||
|
// removes entries whose last allow() call is older than BucketTTL.
|
||||||
|
// Zero or negative values fall through to a 1-hour default; values
|
||||||
|
// below 1 minute are clamped up to 1 minute (sweeper sanity).
|
||||||
|
// SEC-006 closure (Sprint 2, 2026-05-16).
|
||||||
|
BucketTTL time.Duration
|
||||||
}
|
}
|
||||||
|
|
||||||
// NewRateLimiter creates a per-key token bucket rate limiting middleware.
|
// NewRateLimiter creates a per-key token bucket rate limiting middleware.
|
||||||
@@ -166,11 +175,18 @@ type RateLimitConfig struct {
|
|||||||
// - Unauthenticated: "ip:" + r.RemoteAddr's host portion
|
// - Unauthenticated: "ip:" + r.RemoteAddr's host portion
|
||||||
//
|
//
|
||||||
// The bucket map is sync.RWMutex-guarded; create-on-demand for new keys.
|
// The bucket map is sync.RWMutex-guarded; create-on-demand for new keys.
|
||||||
// There is no eviction; for a long-running server with millions of unique
|
//
|
||||||
// IPs this can leak memory. A future enhancement is per-key TTL via a
|
// SEC-006 closure (Sprint 2, 2026-05-16). Pre-fix the bucket map had no
|
||||||
// lazy sweeper. For now the leak is bounded by realistic operator IP
|
// eviction, so high-cardinality unauthenticated traffic (CGNAT churn,
|
||||||
// fan-out and is acceptable per OWASP ASVS L2 (the threat model is abuse
|
// Tor exit lists, botnets, infinite-cardinality scanners) grew process
|
||||||
// by a known set of clients, not infinite-cardinality scanners).
|
// memory unboundedly. Each bucket now carries `lastAccess`; a background
|
||||||
|
// sweeper goroutine (one per limiter) wakes every (bucketTTL / 4) and
|
||||||
|
// removes entries whose lastAccess is older than `bucketTTL`. Default
|
||||||
|
// TTL is 1 hour — well above realistic operator IP churn windows so a
|
||||||
|
// returning client gets the same bucket, but bounded enough that a
|
||||||
|
// scanner's churn is reclaimed within an hour. Operators can override
|
||||||
|
// via cfg.BucketTTL (or the CERTCTL_RATE_LIMIT_BUCKET_TTL env var that
|
||||||
|
// cmd/server/main.go threads through).
|
||||||
func NewRateLimiter(cfg RateLimitConfig) func(http.Handler) http.Handler {
|
func NewRateLimiter(cfg RateLimitConfig) func(http.Handler) http.Handler {
|
||||||
// Default per-user budgets to the IP-keyed budget when not overridden.
|
// Default per-user budgets to the IP-keyed budget when not overridden.
|
||||||
perUserRPS := cfg.PerUserRPS
|
perUserRPS := cfg.PerUserRPS
|
||||||
@@ -182,14 +198,33 @@ func NewRateLimiter(cfg RateLimitConfig) func(http.Handler) http.Handler {
|
|||||||
perUserBurst = float64(cfg.BurstSize)
|
perUserBurst = float64(cfg.BurstSize)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// SEC-006: bucket TTL eviction. Default 1h; minimum 1m to keep
|
||||||
|
// the sweeper from running pathologically often if an operator
|
||||||
|
// sets a tiny value.
|
||||||
|
bucketTTL := cfg.BucketTTL
|
||||||
|
if bucketTTL <= 0 {
|
||||||
|
bucketTTL = time.Hour
|
||||||
|
}
|
||||||
|
if bucketTTL < time.Minute {
|
||||||
|
bucketTTL = time.Minute
|
||||||
|
}
|
||||||
|
|
||||||
limiter := &keyedRateLimiter{
|
limiter := &keyedRateLimiter{
|
||||||
ipRate: cfg.RPS,
|
ipRate: cfg.RPS,
|
||||||
ipBurst: float64(cfg.BurstSize),
|
ipBurst: float64(cfg.BurstSize),
|
||||||
userRate: perUserRPS,
|
userRate: perUserRPS,
|
||||||
userBurst: perUserBurst,
|
userBurst: perUserBurst,
|
||||||
buckets: make(map[string]*tokenBucket),
|
buckets: make(map[string]*tokenBucket),
|
||||||
|
bucketTTL: bucketTTL,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Sweeper goroutine. Single goroutine per limiter; production wires
|
||||||
|
// 2 limiters (default + no-auth-fallback) so the cost is 2 idle
|
||||||
|
// goroutines per server. Lives for the process lifetime; no
|
||||||
|
// shutdown handle is exposed because main.go owns both limiters
|
||||||
|
// for the entire run.
|
||||||
|
go limiter.sweepLoop()
|
||||||
|
|
||||||
return func(next http.Handler) http.Handler {
|
return func(next http.Handler) http.Handler {
|
||||||
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||||
key, isUser := rateLimitKey(r)
|
key, isUser := rateLimitKey(r)
|
||||||
@@ -231,6 +266,12 @@ func rateLimitKey(r *http.Request) (string, bool) {
|
|||||||
|
|
||||||
// keyedRateLimiter holds a token bucket per (user-or-ip) key with separate
|
// keyedRateLimiter holds a token bucket per (user-or-ip) key with separate
|
||||||
// rate / burst defaults for the user-keyed and ip-keyed dimensions.
|
// rate / burst defaults for the user-keyed and ip-keyed dimensions.
|
||||||
|
//
|
||||||
|
// SEC-006: bucketTTL bounds the unused-bucket lifetime; sweepLoop runs
|
||||||
|
// in a goroutine spawned by NewRateLimiter and evicts entries whose
|
||||||
|
// lastAccess is older than bucketTTL on every (bucketTTL/4) tick.
|
||||||
|
// evictedTotal exposes the lifetime eviction count (atomic-loaded by
|
||||||
|
// tests and the operator stats endpoint).
|
||||||
type keyedRateLimiter struct {
|
type keyedRateLimiter struct {
|
||||||
mu sync.RWMutex
|
mu sync.RWMutex
|
||||||
buckets map[string]*tokenBucket
|
buckets map[string]*tokenBucket
|
||||||
@@ -238,6 +279,14 @@ type keyedRateLimiter struct {
|
|||||||
ipBurst float64
|
ipBurst float64
|
||||||
userRate float64
|
userRate float64
|
||||||
userBurst float64
|
userBurst float64
|
||||||
|
|
||||||
|
bucketTTL time.Duration
|
||||||
|
evictedTotal atomic.Uint64
|
||||||
|
// sweepTickCh is the channel sweepLoop ticks on. Default time.Ticker;
|
||||||
|
// tests swap to a manual chan time.Time for deterministic eviction.
|
||||||
|
// Set via the (test-only) seam noted below; production never
|
||||||
|
// reassigns this field.
|
||||||
|
sweepTickCh <-chan time.Time
|
||||||
}
|
}
|
||||||
|
|
||||||
func (k *keyedRateLimiter) allow(key string, isUser bool) bool {
|
func (k *keyedRateLimiter) allow(key string, isUser bool) bool {
|
||||||
@@ -260,22 +309,90 @@ func (k *keyedRateLimiter) allow(key string, isUser bool) bool {
|
|||||||
burstSize: burst,
|
burstSize: burst,
|
||||||
tokens: burst,
|
tokens: burst,
|
||||||
lastRefill: time.Now(),
|
lastRefill: time.Now(),
|
||||||
|
lastAccess: time.Now(),
|
||||||
}
|
}
|
||||||
k.buckets[key] = tb
|
k.buckets[key] = tb
|
||||||
}
|
}
|
||||||
k.mu.Unlock()
|
k.mu.Unlock()
|
||||||
}
|
}
|
||||||
return tb.allow()
|
allowed := tb.allow()
|
||||||
|
// SEC-006: refresh lastAccess on every call. touch() takes the
|
||||||
|
// bucket's own mutex (the same mu that guards its rate-limiting
|
||||||
|
// state), so this is cheap. Sweeper reads lastAccess to decide eviction.
|
||||||
|
tb.touch()
|
||||||
|
return allowed
|
||||||
|
}
|
||||||
|
|
||||||
|
// sweepLoop is the background eviction goroutine spawned by
|
||||||
|
// NewRateLimiter. It wakes every bucketTTL/4 and removes any bucket
|
||||||
|
// whose lastAccess is older than bucketTTL. The (bucketTTL/4) cadence
|
||||||
|
// is a compromise — fast enough to keep the map ceiling tight,
|
||||||
|
// slow enough that the sweep cost amortises across many requests.
|
||||||
|
// SEC-006 closure.
|
||||||
|
func (k *keyedRateLimiter) sweepLoop() {
|
||||||
|
// Test seam: if a manual tick channel is wired, use it. Production
|
||||||
|
// always uses time.NewTicker which time.Time-types the channel
|
||||||
|
// identically.
|
||||||
|
if k.sweepTickCh != nil {
|
||||||
|
for range k.sweepTickCh {
|
||||||
|
k.sweep()
|
||||||
|
}
|
||||||
|
return
|
||||||
|
}
|
||||||
|
period := k.bucketTTL / 4
|
||||||
|
if period < time.Second {
|
||||||
|
period = time.Second
|
||||||
|
}
|
||||||
|
t := time.NewTicker(period)
|
||||||
|
defer t.Stop()
|
||||||
|
for range t.C {
|
||||||
|
k.sweep()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// sweep removes every bucket whose lastAccess is older than bucketTTL
|
||||||
|
// and bumps evictedTotal. Exported for tests via a same-package alias.
|
||||||
|
func (k *keyedRateLimiter) sweep() {
|
||||||
|
cutoff := time.Now().Add(-k.bucketTTL)
|
||||||
|
k.mu.Lock()
|
||||||
|
defer k.mu.Unlock()
|
||||||
|
for key, tb := range k.buckets {
|
||||||
|
if tb.lastAccessTime().Before(cutoff) {
|
||||||
|
delete(k.buckets, key)
|
||||||
|
k.evictedTotal.Add(1)
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// tokenBucket is a small mutex-protected token bucket, hand-rolled so the
// package does not need to import golang.org/x/time/rate.
//
// SEC-006: lastAccess records the most recent use of the bucket so the
// keyedRateLimiter sweeper can evict idle buckets without keeping a second
// per-key map. It is guarded by mu, like the rate-limiting fields.
type tokenBucket struct {
	// mu guards every field below.
	mu sync.Mutex

	// Rate-limiting state.
	rate       float64   // refill rate, in tokens per second
	burstSize  float64   // bucket capacity (max tokens)
	tokens     float64   // tokens currently available
	lastRefill time.Time // when tokens were last refilled

	// Eviction state (SEC-006).
	lastAccess time.Time // refreshed by touch(); read via lastAccessTime()
}
|
||||||
|
|
||||||
|
// touch updates the bucket's lastAccess timestamp under its own mutex.
|
||||||
|
// Called from keyedRateLimiter.allow after the rate-limit decision.
|
||||||
|
func (tb *tokenBucket) touch() {
|
||||||
|
tb.mu.Lock()
|
||||||
|
tb.lastAccess = time.Now()
|
||||||
|
tb.mu.Unlock()
|
||||||
|
}
|
||||||
|
|
||||||
|
// lastAccessTime is the sweeper's read accessor. Uses the bucket's
|
||||||
|
// own mutex so the read is consistent with concurrent touch() calls.
|
||||||
|
func (tb *tokenBucket) lastAccessTime() time.Time {
|
||||||
|
tb.mu.Lock()
|
||||||
|
defer tb.mu.Unlock()
|
||||||
|
return tb.lastAccess
|
||||||
}
|
}
|
||||||
|
|
||||||
func (tb *tokenBucket) allow() bool {
|
func (tb *tokenBucket) allow() bool {
|
||||||
|
|||||||
@@ -2,9 +2,11 @@ package middleware
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
|
"fmt"
|
||||||
"net/http"
|
"net/http"
|
||||||
"net/http/httptest"
|
"net/http/httptest"
|
||||||
"testing"
|
"testing"
|
||||||
|
"time"
|
||||||
|
|
||||||
"github.com/certctl-io/certctl/internal/auth"
|
"github.com/certctl-io/certctl/internal/auth"
|
||||||
)
|
)
|
||||||
@@ -188,3 +190,94 @@ func TestRateLimiter_M025_EmptyUserKeyTreatedAsAnonymous(t *testing.T) {
|
|||||||
t.Errorf("second anonymous request from different IP should still pass (independent IP buckets); got %d", rr.Code)
|
t.Errorf("second anonymous request from different IP should still pass (independent IP buckets); got %d", rr.Code)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// =============================================================================
|
||||||
|
// SEC-006 closure (Sprint 2, 2026-05-16). The token-bucket map now has
|
||||||
|
// a background sweeper that evicts buckets whose last allow() call is
|
||||||
|
// older than the configured BucketTTL. This test pins the eviction
|
||||||
|
// path against a synthetic 1000-key load and asserts:
|
||||||
|
//
|
||||||
|
// 1. Buckets created by N distinct keys land in the map.
|
||||||
|
// 2. After the simulated TTL elapses and the sweeper runs, the map
|
||||||
|
// is reclaimed and evictedTotal reflects the count.
|
||||||
|
// 3. A subsequent request from a fresh key creates a new bucket
|
||||||
|
// (i.e. the map isn't poisoned by the eviction).
|
||||||
|
//
|
||||||
|
// The test calls sweep() directly rather than relying on the goroutine
|
||||||
|
// + time.Ticker so it stays deterministic and fast. The sweeper
|
||||||
|
// goroutine itself is exercised in production; this test pins the
|
||||||
|
// eviction predicate.
|
||||||
|
// =============================================================================
|
||||||
|
|
||||||
|
func TestKeyedRateLimiter_SweepEvictsIdleBuckets(t *testing.T) {
|
||||||
|
limiter := &keyedRateLimiter{
|
||||||
|
ipRate: 1000,
|
||||||
|
ipBurst: 1000,
|
||||||
|
userRate: 1000,
|
||||||
|
userBurst: 1000,
|
||||||
|
buckets: make(map[string]*tokenBucket),
|
||||||
|
bucketTTL: 100 * time.Millisecond,
|
||||||
|
}
|
||||||
|
|
||||||
|
// Populate 1000 buckets from a synthetic IP-key churn.
|
||||||
|
for i := 0; i < 1000; i++ {
|
||||||
|
key := "ip:198.51.100." + fmt.Sprintf("%d", i%256) + "/" + fmt.Sprintf("%d", i)
|
||||||
|
if !limiter.allow(key, false) {
|
||||||
|
t.Fatalf("synthetic IP-key %d: allow returned false on first call", i)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
limiter.mu.RLock()
|
||||||
|
if got := len(limiter.buckets); got != 1000 {
|
||||||
|
limiter.mu.RUnlock()
|
||||||
|
t.Fatalf("post-populate bucket count = %d; want 1000", got)
|
||||||
|
}
|
||||||
|
limiter.mu.RUnlock()
|
||||||
|
|
||||||
|
// Advance past the TTL boundary, then sweep.
|
||||||
|
time.Sleep(110 * time.Millisecond)
|
||||||
|
limiter.sweep()
|
||||||
|
|
||||||
|
limiter.mu.RLock()
|
||||||
|
remaining := len(limiter.buckets)
|
||||||
|
limiter.mu.RUnlock()
|
||||||
|
if remaining != 0 {
|
||||||
|
t.Errorf("post-sweep bucket count = %d; want 0 (all should have been evicted)", remaining)
|
||||||
|
}
|
||||||
|
if got := limiter.evictedTotal.Load(); got != 1000 {
|
||||||
|
t.Errorf("evictedTotal = %d; want 1000", got)
|
||||||
|
}
|
||||||
|
|
||||||
|
// A fresh request creates a new bucket — map isn't poisoned.
|
||||||
|
if !limiter.allow("ip:203.0.113.7", false) {
|
||||||
|
t.Errorf("fresh key: allow returned false on first call after sweep")
|
||||||
|
}
|
||||||
|
limiter.mu.RLock()
|
||||||
|
defer limiter.mu.RUnlock()
|
||||||
|
if got := len(limiter.buckets); got != 1 {
|
||||||
|
t.Errorf("post-sweep-plus-one bucket count = %d; want 1", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestKeyedRateLimiter_SweepKeepsActiveBuckets pins the inverse — a
|
||||||
|
// bucket touched within the TTL window survives the sweep. Catches a
|
||||||
|
// future regression that inverts the cutoff comparison.
|
||||||
|
func TestKeyedRateLimiter_SweepKeepsActiveBuckets(t *testing.T) {
|
||||||
|
limiter := &keyedRateLimiter{
|
||||||
|
ipRate: 1000,
|
||||||
|
ipBurst: 1000,
|
||||||
|
userRate: 1000,
|
||||||
|
userBurst: 1000,
|
||||||
|
buckets: make(map[string]*tokenBucket),
|
||||||
|
bucketTTL: 1 * time.Hour, // generous so test timing doesn't flake
|
||||||
|
}
|
||||||
|
limiter.allow("ip:198.51.100.42", false)
|
||||||
|
limiter.sweep()
|
||||||
|
limiter.mu.RLock()
|
||||||
|
defer limiter.mu.RUnlock()
|
||||||
|
if got := len(limiter.buckets); got != 1 {
|
||||||
|
t.Errorf("active-bucket count = %d; want 1 (sweep should not evict within TTL)", got)
|
||||||
|
}
|
||||||
|
if got := limiter.evictedTotal.Load(); got != 0 {
|
||||||
|
t.Errorf("evictedTotal = %d; want 0 (no evictions expected)", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -25,6 +25,7 @@ type SecurityHeadersConfig struct {
|
|||||||
ContentTypeOptions string // X-Content-Type-Options
|
ContentTypeOptions string // X-Content-Type-Options
|
||||||
ReferrerPolicy string // Referrer-Policy
|
ReferrerPolicy string // Referrer-Policy
|
||||||
ContentSecurityPolicy string // Content-Security-Policy
|
ContentSecurityPolicy string // Content-Security-Policy
|
||||||
|
PermissionsPolicy string // Permissions-Policy (SEC-008 closure, Sprint 2 ACQ 2026-05-16)
|
||||||
}
|
}
|
||||||
|
|
||||||
// SecurityHeadersDefaults returns a recommended baseline.
|
// SecurityHeadersDefaults returns a recommended baseline.
|
||||||
@@ -32,9 +33,35 @@ type SecurityHeadersConfig struct {
|
|||||||
// CSP: default-src 'self' confines fetches to the same origin.
|
// CSP: default-src 'self' confines fetches to the same origin.
|
||||||
// img-src 'self' data: allows inline base64 images (used by the
|
// img-src 'self' data: allows inline base64 images (used by the
|
||||||
// dashboard's certctl-logo and a few status icons).
|
// dashboard's certctl-logo and a few status icons).
|
||||||
// style-src 'self' 'unsafe-inline' is required because Tailwind
|
// style-src 'self' 'unsafe-inline' — the 'unsafe-inline' grant
|
||||||
// (via Vite) injects per-component <style> blocks at build time;
|
// is required by React's inline `style={...}` attribute model,
|
||||||
// without 'unsafe-inline' the dashboard would render unstyled.
|
// which emits HTML `style="..."` attributes that the browser
|
||||||
|
// treats as inline styles for CSP purposes. The dashboard has 5
|
||||||
|
// load-bearing dynamic-style sites: Tooltip's Floating-UI
|
||||||
|
// position (left/top px values computed per-tick),
|
||||||
|
// AgentFleetPage's dynamic color+width chart bars,
|
||||||
|
// dashboard/charts.tsx Recharts color props, CertificatesPage's
|
||||||
|
// progress-bar percent width, IssuerHierarchyPage's depth-based
|
||||||
|
// marginLeft. The static-pixel uses (UsersPage filter + table UI,
|
||||||
|
// DigestPage iframe min-height, AuthProvider demo-mode banner)
|
||||||
|
// were migrated to Tailwind utility classes via FE-M6 closure
|
||||||
|
// 2026-05-14.
|
||||||
|
//
|
||||||
|
// FE-M6 audit-framing correction: this comment USED TO say
|
||||||
|
// "Tailwind (via Vite) injects per-component <style> blocks at
|
||||||
|
// build time." That was factually wrong. Vite's CSS output is a
|
||||||
|
// single .css file linked via <link rel="stylesheet"> — verified
|
||||||
|
// against dist/index.html post-build: zero <style> tags emitted.
|
||||||
|
// The 'unsafe-inline' grant exists for React's style-attribute
|
||||||
|
// output path, not for Vite or Tailwind.
|
||||||
|
//
|
||||||
|
// Fully eliminating 'unsafe-inline' would require either banning
|
||||||
|
// dynamic `style={...}` (rewriting the 5 load-bearing sites with
|
||||||
|
// a CSS-in-JS library that emits hashed/nonce'd <style> blocks)
|
||||||
|
// or adopting CSP nonces with React 18+'s style runtime. Neither
|
||||||
|
// fits the original FE-M6 phase budget; tracked as a future
|
||||||
|
// security-hardening item.
|
||||||
|
//
|
||||||
// 'unsafe-inline' is intentionally NOT in script-src — the
|
// 'unsafe-inline' is intentionally NOT in script-src — the
|
||||||
// front-end ships as a bundled JS file, no inline scripts.
|
// front-end ships as a bundled JS file, no inline scripts.
|
||||||
//
|
//
|
||||||
@@ -52,6 +79,19 @@ type SecurityHeadersConfig struct {
|
|||||||
// Referrer-Policy: no-referrer-when-downgrade — preserves Referer
|
// Referrer-Policy: no-referrer-when-downgrade — preserves Referer
|
||||||
// for same-origin navigation (useful for support/diagnostics) but
|
// for same-origin navigation (useful for support/diagnostics) but
|
||||||
// strips it on HTTPS→HTTP transitions.
|
// strips it on HTTPS→HTTP transitions.
|
||||||
|
//
|
||||||
|
// Permissions-Policy: deny-all-browser-features default. Acquisition-
|
||||||
|
// audit SEC-008 closure (Sprint 2 ACQ, 2026-05-16). certctl is a
|
||||||
|
// control-plane API + dashboard; no part of the surface needs
|
||||||
|
// access to the camera, microphone, geolocation, accelerometer,
|
||||||
|
// payment, USB, or the deprecated `interest-cohort` (FLoC) browser
|
||||||
|
// feature. The deny-all default removes those attack/fingerprint
|
||||||
|
// surfaces if certctl is ever embedded in a malicious page or if a
|
||||||
|
// dashboard route is XSS-compromised post-CSP-bypass. Operators
|
||||||
|
// running certctl with intentional dependence on any of these (e.g.
|
||||||
|
// hardware-token flows that reach devices through the WebUSB API) can
|
||||||
|
// set `Cfg.PermissionsPolicy: ""` to suppress the header entirely,
|
||||||
|
// or override with their own narrowed allowlist.
|
||||||
func SecurityHeadersDefaults() SecurityHeadersConfig {
|
func SecurityHeadersDefaults() SecurityHeadersConfig {
|
||||||
return SecurityHeadersConfig{
|
return SecurityHeadersConfig{
|
||||||
HSTS: "max-age=31536000; includeSubDomains",
|
HSTS: "max-age=31536000; includeSubDomains",
|
||||||
@@ -59,6 +99,7 @@ func SecurityHeadersDefaults() SecurityHeadersConfig {
|
|||||||
ContentTypeOptions: "nosniff",
|
ContentTypeOptions: "nosniff",
|
||||||
ReferrerPolicy: "no-referrer-when-downgrade",
|
ReferrerPolicy: "no-referrer-when-downgrade",
|
||||||
ContentSecurityPolicy: "default-src 'self'; img-src 'self' data:; style-src 'self' 'unsafe-inline'; script-src 'self'; connect-src 'self'; frame-ancestors 'none'",
|
ContentSecurityPolicy: "default-src 'self'; img-src 'self' data:; style-src 'self' 'unsafe-inline'; script-src 'self'; connect-src 'self'; frame-ancestors 'none'",
|
||||||
|
PermissionsPolicy: "accelerometer=(), camera=(), geolocation=(), microphone=(), payment=(), usb=(), interest-cohort=()",
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -74,7 +115,7 @@ func SecurityHeaders(cfg SecurityHeadersConfig) func(http.Handler) http.Handler
|
|||||||
// Pre-trim each value once; the per-request hot path stays a
|
// Pre-trim each value once; the per-request hot path stays a
|
||||||
// straight set of map writes.
|
// straight set of map writes.
|
||||||
type headerEntry struct{ name, value string }
|
type headerEntry struct{ name, value string }
|
||||||
entries := make([]headerEntry, 0, 5)
|
entries := make([]headerEntry, 0, 6)
|
||||||
add := func(name, value string) {
|
add := func(name, value string) {
|
||||||
v := strings.TrimSpace(value)
|
v := strings.TrimSpace(value)
|
||||||
if v != "" {
|
if v != "" {
|
||||||
@@ -86,6 +127,7 @@ func SecurityHeaders(cfg SecurityHeadersConfig) func(http.Handler) http.Handler
|
|||||||
add("X-Content-Type-Options", cfg.ContentTypeOptions)
|
add("X-Content-Type-Options", cfg.ContentTypeOptions)
|
||||||
add("Referrer-Policy", cfg.ReferrerPolicy)
|
add("Referrer-Policy", cfg.ReferrerPolicy)
|
||||||
add("Content-Security-Policy", cfg.ContentSecurityPolicy)
|
add("Content-Security-Policy", cfg.ContentSecurityPolicy)
|
||||||
|
add("Permissions-Policy", cfg.PermissionsPolicy)
|
||||||
|
|
||||||
return func(next http.Handler) http.Handler {
|
return func(next http.Handler) http.Handler {
|
||||||
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||||
|
|||||||
@@ -25,6 +25,7 @@ func TestSecurityHeaders_DefaultsAllPresent(t *testing.T) {
|
|||||||
"X-Content-Type-Options",
|
"X-Content-Type-Options",
|
||||||
"Referrer-Policy",
|
"Referrer-Policy",
|
||||||
"Content-Security-Policy",
|
"Content-Security-Policy",
|
||||||
|
"Permissions-Policy",
|
||||||
} {
|
} {
|
||||||
if got := rec.Header().Get(h); got == "" {
|
if got := rec.Header().Get(h); got == "" {
|
||||||
t.Errorf("expected header %q to be set, got empty", h)
|
t.Errorf("expected header %q to be set, got empty", h)
|
||||||
@@ -102,3 +103,51 @@ func TestSecurityHeaders_AppliedOnErrorResponses(t *testing.T) {
|
|||||||
t.Errorf("CSP missing on 401 response")
|
t.Errorf("CSP missing on 401 response")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TestSecurityHeaders_PermissionsPolicyDefault pins the literal value
|
||||||
|
// of the default Permissions-Policy header. Acquisition-audit SEC-008
|
||||||
|
// closure (Sprint 2 ACQ, 2026-05-16). The deny-all baseline removes
|
||||||
|
// camera/microphone/geolocation/accelerometer/payment/USB/interest-cohort
|
||||||
|
// attack + fingerprint surfaces — none of which the certctl control
|
||||||
|
// plane needs. A regression here (e.g. someone widening to allow
|
||||||
|
// camera=*) would surface as a failing test.
|
||||||
|
func TestSecurityHeaders_PermissionsPolicyDefault(t *testing.T) {
|
||||||
|
mw := SecurityHeaders(SecurityHeadersDefaults())
|
||||||
|
handler := mw(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
|
||||||
|
w.WriteHeader(http.StatusOK)
|
||||||
|
}))
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/", nil))
|
||||||
|
|
||||||
|
got := rec.Header().Get("Permissions-Policy")
|
||||||
|
if got == "" {
|
||||||
|
t.Fatal("Permissions-Policy missing from default response")
|
||||||
|
}
|
||||||
|
want := "accelerometer=(), camera=(), geolocation=(), microphone=(), payment=(), usb=(), interest-cohort=()"
|
||||||
|
if got != want {
|
||||||
|
t.Errorf("Permissions-Policy default = %q; want %q", got, want)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestSecurityHeaders_PermissionsPolicyOverrideToEmptySuppresses pins
|
||||||
|
// the operator escape hatch: setting Cfg.PermissionsPolicy = "" makes
|
||||||
|
// the middleware omit the header entirely (per the per-field empty-
|
||||||
|
// string suppression contract), without affecting the other defaults.
|
||||||
|
// Acquisition-audit SEC-008 closure (Sprint 2 ACQ, 2026-05-16).
|
||||||
|
func TestSecurityHeaders_PermissionsPolicyOverrideToEmptySuppresses(t *testing.T) {
|
||||||
|
cfg := SecurityHeadersDefaults()
|
||||||
|
cfg.PermissionsPolicy = ""
|
||||||
|
mw := SecurityHeaders(cfg)
|
||||||
|
handler := mw(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
|
||||||
|
w.WriteHeader(http.StatusOK)
|
||||||
|
}))
|
||||||
|
rec := httptest.NewRecorder()
|
||||||
|
handler.ServeHTTP(rec, httptest.NewRequest(http.MethodGet, "/", nil))
|
||||||
|
|
||||||
|
if got := rec.Header().Get("Permissions-Policy"); got != "" {
|
||||||
|
t.Errorf("Permissions-Policy = %q; want empty (operator override-to-empty suppression)", got)
|
||||||
|
}
|
||||||
|
if got := rec.Header().Get("Strict-Transport-Security"); got == "" {
|
||||||
|
t.Errorf("HSTS suppressed too; the empty-string override is per-field")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -7,6 +7,8 @@ import (
|
|||||||
"net/http/httptest"
|
"net/http/httptest"
|
||||||
"strings"
|
"strings"
|
||||||
"testing"
|
"testing"
|
||||||
|
|
||||||
|
"github.com/certctl-io/certctl/internal/validation"
|
||||||
)
|
)
|
||||||
|
|
||||||
// Coverage fill — v2.1.0 release gate Phase 3.
|
// Coverage fill — v2.1.0 release gate Phase 3.
|
||||||
@@ -59,6 +61,54 @@ func TestJWKSStatus_ReturnsSnapshot_AfterAuthRequestPopulatesEntry(t *testing.T)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TestTestDiscovery_RejectsSSRFIssuer_AtEarlyFailRail pins the
|
||||||
|
// SEC-001 closure (Sprint 1, 2026-05-16): TestDiscovery refuses
|
||||||
|
// reserved-address issuers up-front via validateIssuerSSRF, surfacing
|
||||||
|
// a clean "issuer_url failed SSRF policy" error in the result's
|
||||||
|
// Errors slice without ever hitting the dial path. The package-wide
|
||||||
|
// setup_test.go init() swaps validateIssuerSSRF to a no-op so the
|
||||||
|
// other tests can use httptest loopback servers; this test temporarily
|
||||||
|
// restores the production gate (validation.ValidateSafeURL) and
|
||||||
|
// asserts the rejection fires.
|
||||||
|
func TestTestDiscovery_RejectsSSRFIssuer_AtEarlyFailRail(t *testing.T) {
|
||||||
|
prev := validateIssuerSSRF
|
||||||
|
validateIssuerSSRF = validation.ValidateSafeURL
|
||||||
|
defer func() { validateIssuerSSRF = prev }()
|
||||||
|
|
||||||
|
svc := newServiceForUnitTest(t)
|
||||||
|
cases := []struct {
|
||||||
|
name string
|
||||||
|
issuer string
|
||||||
|
}{
|
||||||
|
{"loopback_v4", "https://127.0.0.1/realms/certctl"},
|
||||||
|
{"loopback_v6", "https://[::1]/realms/certctl"},
|
||||||
|
{"cloud_metadata", "https://169.254.169.254/latest/meta-data/"},
|
||||||
|
{"link_local_v4", "https://169.254.10.5/realms/certctl"},
|
||||||
|
{"link_local_v6", "https://[fe80::1]/realms/certctl"},
|
||||||
|
}
|
||||||
|
for _, tc := range cases {
|
||||||
|
t.Run(tc.name, func(t *testing.T) {
|
||||||
|
res, err := svc.TestDiscovery(context.Background(), tc.issuer)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("TestDiscovery (non-fatal): %v", err)
|
||||||
|
}
|
||||||
|
if res == nil {
|
||||||
|
t.Fatalf("expected non-nil result")
|
||||||
|
}
|
||||||
|
if res.DiscoverySucceeded {
|
||||||
|
t.Errorf("expected DiscoverySucceeded=false for SSRF issuer; got true")
|
||||||
|
}
|
||||||
|
if len(res.Errors) == 0 {
|
||||||
|
t.Fatalf("expected non-empty Errors slice")
|
||||||
|
}
|
||||||
|
joined := strings.Join(res.Errors, "|")
|
||||||
|
if !strings.Contains(joined, "SSRF policy") {
|
||||||
|
t.Errorf("expected 'SSRF policy' in errors; got %v", res.Errors)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// TestTestDiscovery_DiscoveryFailure_ReturnsErrorsSlice points
|
// TestTestDiscovery_DiscoveryFailure_ReturnsErrorsSlice points
|
||||||
// TestDiscovery at a URL that doesn't serve a discovery doc; the
|
// TestDiscovery at a URL that doesn't serve a discovery doc; the
|
||||||
// function MUST return res with DiscoverySucceeded=false and a
|
// function MUST return res with DiscoverySucceeded=false and a
|
||||||
|
|||||||
@@ -22,6 +22,7 @@ import (
|
|||||||
"time"
|
"time"
|
||||||
|
|
||||||
authdomain "github.com/certctl-io/certctl/internal/domain/auth"
|
authdomain "github.com/certctl-io/certctl/internal/domain/auth"
|
||||||
|
"github.com/certctl-io/certctl/internal/validation"
|
||||||
)
|
)
|
||||||
|
|
||||||
// OIDCProvider describes a configured OpenID Connect identity provider
|
// OIDCProvider describes a configured OpenID Connect identity provider
|
||||||
@@ -160,6 +161,16 @@ func (p *OIDCProvider) Validate() error {
|
|||||||
if _, err := url.Parse(p.IssuerURL); err != nil {
|
if _, err := url.Parse(p.IssuerURL); err != nil {
|
||||||
return fmt.Errorf("oidc: issuer_url is not a valid URL: %w", err)
|
return fmt.Errorf("oidc: issuer_url is not a valid URL: %w", err)
|
||||||
}
|
}
|
||||||
|
// SEC-001 closure (Sprint 1, 2026-05-16): reject reserved-address
|
||||||
|
// issuers (loopback / RFC 1918 / link-local / cloud metadata) at
|
||||||
|
// provider-creation time. Defense-in-depth alongside
|
||||||
|
// oidc.SafeOIDCContext, which is the authoritative dial-time
|
||||||
|
// re-resolution + reject. The static URL check stops the obvious
|
||||||
|
// case ("https://169.254.169.254/...") before the row is persisted
|
||||||
|
// or the dry-run validator runs.
|
||||||
|
if err := validation.ValidateSafeURL(p.IssuerURL); err != nil {
|
||||||
|
return fmt.Errorf("oidc: issuer_url failed SSRF policy: %w", err)
|
||||||
|
}
|
||||||
if strings.TrimSpace(p.ClientID) == "" {
|
if strings.TrimSpace(p.ClientID) == "" {
|
||||||
return ErrOIDCEmptyClientID
|
return ErrOIDCEmptyClientID
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -82,6 +82,41 @@ func TestOIDCProvider_Validate_RejectsNonHTTPSIssuer(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// SEC-001 closure (Sprint 1, 2026-05-16). The IssuerURL Validate gate
|
||||||
|
// now refuses reserved-address issuers (loopback, RFC 1918,
|
||||||
|
// link-local, IPv6 loopback, IPv6 link-local, cloud metadata) so a
|
||||||
|
// row claiming https://127.0.0.1/... or https://169.254.169.254/...
|
||||||
|
// never makes it to the persistence layer or the runtime discovery
|
||||||
|
// dial. Authoritative dial-time rejection lives in
|
||||||
|
// internal/validation.SafeHTTPDialContext (DNS-rebinding-safe); this
|
||||||
|
// test pins the static URL gate that surfaces the policy violation
|
||||||
|
// with a clean error before any network I/O.
|
||||||
|
func TestOIDCProvider_Validate_RejectsSSRFIssuer(t *testing.T) {
|
||||||
|
cases := []struct {
|
||||||
|
name string
|
||||||
|
issuer string
|
||||||
|
}{
|
||||||
|
{"loopback_v4", "https://127.0.0.1/realms/certctl"},
|
||||||
|
{"loopback_v6", "https://[::1]/realms/certctl"},
|
||||||
|
{"cloud_metadata", "https://169.254.169.254/latest/meta-data/"},
|
||||||
|
{"link_local_v4", "https://169.254.10.5/realms/certctl"},
|
||||||
|
{"link_local_v6", "https://[fe80::1]/realms/certctl"},
|
||||||
|
}
|
||||||
|
for _, tc := range cases {
|
||||||
|
t.Run(tc.name, func(t *testing.T) {
|
||||||
|
p := validProvider()
|
||||||
|
p.IssuerURL = tc.issuer
|
||||||
|
err := p.Validate()
|
||||||
|
if err == nil {
|
||||||
|
t.Fatalf("issuer=%q: Validate returned nil; want SSRF policy rejection", tc.issuer)
|
||||||
|
}
|
||||||
|
if !strings.Contains(err.Error(), "SSRF policy") {
|
||||||
|
t.Errorf("issuer=%q: err=%v; want error mentioning SSRF policy", tc.issuer, err)
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestOIDCProvider_Validate_RejectsEmptyClientID(t *testing.T) {
|
func TestOIDCProvider_Validate_RejectsEmptyClientID(t *testing.T) {
|
||||||
p := validProvider()
|
p := validProvider()
|
||||||
p.ClientID = ""
|
p.ClientID = ""
|
||||||
|
|||||||
@@ -0,0 +1,122 @@
|
|||||||
|
// Copyright 2026 certctl LLC. All rights reserved.
|
||||||
|
// SPDX-License-Identifier: BUSL-1.1
|
||||||
|
|
||||||
|
package oidc
|
||||||
|
|
||||||
|
// SEC-001 closure (Sprint 1, 2026-05-16). Pre-fix, two OIDC discovery
|
||||||
|
// call sites passed the bare request context to gooidc.NewProvider:
|
||||||
|
//
|
||||||
|
// - test_discovery.go:65 (dry-run validator from the GUI)
|
||||||
|
// - service.go:1066 (runtime provider load on first cache miss)
|
||||||
|
//
|
||||||
|
// Acquisition-audit follow-up SEC-020 + SEC-021 (Sprint 1 follow-up,
|
||||||
|
// 2026-05-16) extended the same wrap to two adjacent call sites that
|
||||||
|
// the original SEC-001 sweep missed:
|
||||||
|
//
|
||||||
|
// - service.go::fetchUserinfoGroups (~L948-961, SEC-020 closure) —
|
||||||
|
// the userinfo-fallback path called entry.provider.UserInfo(ctx, ts)
|
||||||
|
// with bare ctx. go-oidc/v3 Provider.UserInfo derives its HTTP
|
||||||
|
// client from the context via getClient(ctx) (oidc.go:61-65);
|
||||||
|
// without an override, the internal doRequest falls through to
|
||||||
|
// http.DefaultClient.
|
||||||
|
// - internal/api/handler/auth_session_oidc_bcl.go::Verify (~L125,
|
||||||
|
// SEC-021 closure) — the back-channel-logout verifier performs a
|
||||||
|
// per-request discovery re-fetch via gooidc.NewProvider(ctx, ...)
|
||||||
|
// with bare ctx; SafeOIDCContext now wraps before the call.
|
||||||
|
//
|
||||||
|
// Context-key shape: gooidc.ClientContext is implemented as
|
||||||
|
// context.WithValue(ctx, oauth2.HTTPClient, client)
|
||||||
|
// (go-oidc v3.18.0 oidc.go:57-59). Both go-oidc's getClient AND
|
||||||
|
// golang.org/x/oauth2's internal.ContextClient read oauth2.HTTPClient,
|
||||||
|
// so the SINGLE SafeOIDCContext wrap covers go-oidc-driven HTTP calls
|
||||||
|
// (Provider.UserInfo / NewProvider discovery / Verifier JWKS) AND
|
||||||
|
// oauth2-driven HTTP calls (Config.TokenSource refresh / Exchange).
|
||||||
|
// No additional context.WithValue(ctx, oauth2.HTTPClient, ...) is
|
||||||
|
// required alongside the wrap.
|
||||||
|
//
|
||||||
|
// gooidc.NewProvider derives its HTTP client from the context via
|
||||||
|
// oidc.ClientContext; with no override it falls through to
|
||||||
|
// http.DefaultClient. The default client has no SSRF guard, so an admin
|
||||||
|
// with `auth.oidc.create` could induce server-side HTTPS egress to
|
||||||
|
// loopback (127.0.0.1, ::1), RFC 1918 (10/8 / 172.16/12 / 192.168/16),
|
||||||
|
// link-local (169.254.169.254 — cloud-instance metadata), and IPv6
|
||||||
|
// link-local (fe80::/10).
|
||||||
|
//
|
||||||
|
// The companion JWKS reachability probe (jwksReachable + jwksProbeClient
|
||||||
|
// in this package) was already routed through SafeHTTPDialContext via
|
||||||
|
// the Bundle 5 R6 closure; the discovery + claims path bypassed that
|
||||||
|
// guard.
|
||||||
|
//
|
||||||
|
// This file adds the symmetric guard for the discovery leg:
|
||||||
|
//
|
||||||
|
// - oidcDiscoveryClient — an *http.Client wrapping a Transport whose
|
||||||
|
// DialContext is SafeHTTPDialContext, sized to the same outbound
|
||||||
|
// budget as jwksProbeClient (oidcOutboundTimeout = 10s).
|
||||||
|
// - SafeOIDCContext(ctx) — returns a context that gooidc.NewProvider
|
||||||
|
// and the resulting Verifier will use for every outbound call.
|
||||||
|
//
|
||||||
|
// The call sites above are rewritten to thread their context through
|
||||||
|
// SafeOIDCContext before go-oidc dials out. The fail-closed posture is
|
||||||
|
// owned by validation.SafeHTTPDialContext — DNS-rebinding-safe by
|
||||||
|
// re-resolving at dial time and rejecting any reserved address that
|
||||||
|
// surfaces in the resolution.
|
||||||
|
//
|
||||||
|
// Defense-in-depth: domain/types.go.Validate also calls
|
||||||
|
// validation.ValidateSafeURL on the persisted IssuerURL at provider-
|
||||||
|
// creation time so reserved-address issuers fail before they ever reach
|
||||||
|
// the cache + dial path.
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"net/http"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
gooidc "github.com/coreos/go-oidc/v3/oidc"
|
||||||
|
|
||||||
|
"github.com/certctl-io/certctl/internal/validation"
|
||||||
|
)
|
||||||
|
|
||||||
|
// oidcDiscoveryClient is the *http.Client gooidc.NewProvider uses for
|
||||||
|
// the discovery doc fetch + the per-Verifier JWKS read it issues
|
||||||
|
// internally on first sig-verify. Routed through SafeHTTPDialContext
|
||||||
|
// so the dial-time guard re-resolves the issuer host and rejects
|
||||||
|
// loopback / link-local / private / cloud-metadata before any HTTP
|
||||||
|
// byte goes out. Mirrors jwksProbeClient (test_discovery.go) so both
|
||||||
|
// outbound paths share an identical SSRF posture.
|
||||||
|
//
|
||||||
|
// Package-level var so the test suite can swap it for an
|
||||||
|
// SSRF-guard-bypassed client when exercising the discovery code path
|
||||||
|
// against httptest.NewServer (which binds to 127.0.0.1 and would
|
||||||
|
// otherwise be refused). Mirrors the webhook/slack/teams test-seam
|
||||||
|
// pattern. Production code never reassigns this var.
|
||||||
|
var oidcDiscoveryClient = &http.Client{
|
||||||
|
Timeout: oidcOutboundTimeout,
|
||||||
|
Transport: &http.Transport{
|
||||||
|
DialContext: validation.SafeHTTPDialContext(oidcOutboundTimeout),
|
||||||
|
MaxIdleConns: 10,
|
||||||
|
IdleConnTimeout: 90 * time.Second,
|
||||||
|
TLSHandshakeTimeout: 10 * time.Second,
|
||||||
|
ExpectContinueTimeout: 1 * time.Second,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
// SafeOIDCContext returns a derived context that carries the SSRF-safe
|
||||||
|
// discovery http.Client. Pass the result to gooidc.NewProvider so that
|
||||||
|
// the discovery doc fetch + the internal JWKS fetch the resulting
|
||||||
|
// Verifier issues both run through SafeHTTPDialContext.
|
||||||
|
//
|
||||||
|
// Callers SHOULD use this wrapper for every gooidc.NewProvider call
|
||||||
|
// site; the package's own callers (service.go runtime load,
|
||||||
|
// test_discovery.go dry-run validator) do this unconditionally.
|
||||||
|
func SafeOIDCContext(ctx context.Context) context.Context {
|
||||||
|
return gooidc.ClientContext(ctx, oidcDiscoveryClient)
|
||||||
|
}
|
||||||
|
|
||||||
|
// validateIssuerSSRF is the package-level seam tests substitute for the
|
||||||
|
// static issuer-URL SSRF gate. Production callers always run through
|
||||||
|
// validation.ValidateSafeURL; tests using httptest.NewServer (which
|
||||||
|
// binds to 127.0.0.1) swap this to a no-op in setup_test.go so the
|
||||||
|
// loopback URL doesn't trip the early-fail rail. Mirrors the
|
||||||
|
// jwksProbeClient / oidcDiscoveryClient test-seam pattern. Production
|
||||||
|
// code MUST NOT reassign this var.
|
||||||
|
var validateIssuerSSRF = validation.ValidateSafeURL
|
||||||
@@ -948,8 +948,19 @@ func (s *Service) fetchUserinfoGroups(
|
|||||||
if entry.provider.UserInfoEndpoint() == "" {
|
if entry.provider.UserInfoEndpoint() == "" {
|
||||||
return nil, fmt.Errorf("oidc: userinfo fallback configured but provider has no userinfo endpoint")
|
return nil, fmt.Errorf("oidc: userinfo fallback configured but provider has no userinfo endpoint")
|
||||||
}
|
}
|
||||||
ts := entry.oauthConfig.TokenSource(ctx, token)
|
// Acquisition-audit SEC-020 closure (Sprint 1 follow-up to SEC-001,
|
||||||
uinfo, err := entry.provider.UserInfo(ctx, ts)
|
// 2026-05-16). Wrap ctx via SafeOIDCContext before TokenSource +
|
||||||
|
// UserInfo so the SSRF guard owned by validation.SafeHTTPDialContext
|
||||||
|
// re-resolves the userinfo endpoint at dial time and refuses reserved
|
||||||
|
// addresses (loopback / link-local / cloud-metadata). The single wrap
|
||||||
|
// covers both legs because gooidc.ClientContext and oauth2.TokenSource
|
||||||
|
// both read the same oauth2.HTTPClient context key (see go-oidc/v3
|
||||||
|
// oidc.go:57-65 and golang.org/x/oauth2 oauth2.go:339-341). Production
|
||||||
|
// provider-load paths in this package already use SafeOIDCContext; the
|
||||||
|
// userinfo fallback was missed in the SEC-001 sweep.
|
||||||
|
safeCtx := SafeOIDCContext(ctx)
|
||||||
|
ts := entry.oauthConfig.TokenSource(safeCtx, token)
|
||||||
|
uinfo, err := entry.provider.UserInfo(safeCtx, ts)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("oidc: userinfo fetch: %w", err)
|
return nil, fmt.Errorf("oidc: userinfo fetch: %w", err)
|
||||||
}
|
}
|
||||||
@@ -1063,7 +1074,14 @@ func (s *Service) getOrLoad(ctx context.Context, providerID string) (*providerEn
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Fetch + cache the discovery doc + JWKS via go-oidc.
|
// Fetch + cache the discovery doc + JWKS via go-oidc.
|
||||||
provider, err := gooidc.NewProvider(ctx, cfgRow.IssuerURL)
|
//
|
||||||
|
// SEC-001 closure (Sprint 1, 2026-05-16): the bare `ctx` is wrapped
|
||||||
|
// in SafeOIDCContext so the discovery fetch + every subsequent
|
||||||
|
// Verifier-issued JWKS fetch run through validation.SafeHTTPDialContext.
|
||||||
|
// Pre-fix this path used http.DefaultClient and could be aimed at
|
||||||
|
// loopback / RFC 1918 / link-local / cloud-metadata addresses via the
|
||||||
|
// admin-supplied issuer URL. See safehttp.go for the full closure note.
|
||||||
|
provider, err := gooidc.NewProvider(SafeOIDCContext(ctx), cfgRow.IssuerURL)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("oidc: discovery fetch failed for %s: %w", providerID, err)
|
return nil, fmt.Errorf("oidc: discovery fetch failed for %s: %w", providerID, err)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -19,11 +19,15 @@ import (
|
|||||||
|
|
||||||
"github.com/go-jose/go-jose/v4"
|
"github.com/go-jose/go-jose/v4"
|
||||||
"github.com/go-jose/go-jose/v4/jwt"
|
"github.com/go-jose/go-jose/v4/jwt"
|
||||||
|
"golang.org/x/oauth2"
|
||||||
|
|
||||||
|
gooidc "github.com/coreos/go-oidc/v3/oidc"
|
||||||
|
|
||||||
oidcdomain "github.com/certctl-io/certctl/internal/auth/oidc/domain"
|
oidcdomain "github.com/certctl-io/certctl/internal/auth/oidc/domain"
|
||||||
userdomain "github.com/certctl-io/certctl/internal/auth/user/domain"
|
userdomain "github.com/certctl-io/certctl/internal/auth/user/domain"
|
||||||
cryptopkg "github.com/certctl-io/certctl/internal/crypto"
|
cryptopkg "github.com/certctl-io/certctl/internal/crypto"
|
||||||
"github.com/certctl-io/certctl/internal/repository"
|
"github.com/certctl-io/certctl/internal/repository"
|
||||||
|
"github.com/certctl-io/certctl/internal/validation"
|
||||||
)
|
)
|
||||||
|
|
||||||
// sha384New returns a SHA-384 hash via crypto/sha512 (Go stdlib).
|
// sha384New returns a SHA-384 hash via crypto/sha512 (Go stdlib).
|
||||||
@@ -392,6 +396,20 @@ func (s *stubUsers) ListAll(_ context.Context, _ string) ([]*userdomain.User, er
|
|||||||
return out, nil
|
return out, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ListDeactivatedBefore satisfies the Sprint 6 COMP-002-RETENTION
|
||||||
|
// interface addition. Stub-side: walk byID and filter on the
|
||||||
|
// DeactivatedAt cursor; OIDC service tests don't care about ordering
|
||||||
|
// stability.
|
||||||
|
func (s *stubUsers) ListDeactivatedBefore(_ context.Context, threshold time.Time) ([]*userdomain.User, error) {
|
||||||
|
var out []*userdomain.User
|
||||||
|
for _, u := range s.byID {
|
||||||
|
if u.DeactivatedAt != nil && u.DeactivatedAt.Before(threshold) {
|
||||||
|
out = append(out, u)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return out, nil
|
||||||
|
}
|
||||||
|
|
||||||
type stubSessions struct {
|
type stubSessions struct {
|
||||||
cookieValue string
|
cookieValue string
|
||||||
csrfToken string
|
csrfToken string
|
||||||
@@ -2386,3 +2404,106 @@ func TestService_UpsertUser_ValidateErrorOnEmptyEmail(t *testing.T) {
|
|||||||
t.Errorf("err = %v; want validate wrap", err)
|
t.Errorf("err = %v; want validate wrap", err)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Acquisition-audit SEC-020 closure (Sprint 1 follow-up to SEC-001,
|
||||||
|
// 2026-05-16). fetchUserinfoGroups previously called
|
||||||
|
// entry.provider.UserInfo(ctx, ts) with the bare request context. go-oidc
|
||||||
|
// /v3's Provider.UserInfo derives its http.Client from ctx via
|
||||||
|
// getClient(ctx) (oidc.go:61-65); without an override the internal
|
||||||
|
// doRequest falls through to http.DefaultClient — an unwrapped client
|
||||||
|
// with no SSRF guard. The fix wraps ctx via SafeOIDCContext so the
|
||||||
|
// dial-time SafeHTTPDialContext guard re-resolves the userinfo
|
||||||
|
// endpoint and rejects reserved-address answers.
|
||||||
|
//
|
||||||
|
// This test exercises the wrap end-to-end:
|
||||||
|
//
|
||||||
|
// 1. Stand up a discovery httptest server (loopback) whose discovery
|
||||||
|
// doc advertises userinfo_endpoint = "http://169.254.169.254/userinfo"
|
||||||
|
// (link-local cloud-metadata range — rejected by
|
||||||
|
// validation.SafeHTTPDialContext.isReservedIPForDial).
|
||||||
|
// 2. Construct the *gooidc.Provider via the test-bypassed
|
||||||
|
// oidcDiscoveryClient (setup_test.go's init() leaves it bypassed for
|
||||||
|
// the package).
|
||||||
|
// 3. Restore the production-shape oidcDiscoveryClient (the one whose
|
||||||
|
// Transport.DialContext is validation.SafeHTTPDialContext) BEFORE
|
||||||
|
// calling fetchUserinfoGroups, so the SafeOIDCContext wrap inside
|
||||||
|
// the function captures the production guard at ctx-wrap time.
|
||||||
|
// 4. Call fetchUserinfoGroups and assert the resulting error wraps the
|
||||||
|
// dial-time reserved-address rejection (substring "refusing to
|
||||||
|
// dial" / "reserved address"), not a generic transport error.
|
||||||
|
//
|
||||||
|
// The test does NOT use t.Parallel() — it mutates the package-level
|
||||||
|
// oidcDiscoveryClient and must run serially against any other test that
|
||||||
|
// reads the same var.
|
||||||
|
func TestFetchUserinfoGroups_SSRF_BlocksReservedAddress(t *testing.T) {
|
||||||
|
// Stand up a loopback discovery server. Discovery doc's
|
||||||
|
// userinfo_endpoint points at the link-local cloud-metadata IP so
|
||||||
|
// the subsequent UserInfo dial trips SafeHTTPDialContext.
|
||||||
|
var discoveryURL string
|
||||||
|
mux := http.NewServeMux()
|
||||||
|
mux.HandleFunc("/.well-known/openid-configuration", func(w http.ResponseWriter, r *http.Request) {
|
||||||
|
doc := map[string]interface{}{
|
||||||
|
"issuer": discoveryURL,
|
||||||
|
"authorization_endpoint": discoveryURL + "/authorize",
|
||||||
|
"token_endpoint": discoveryURL + "/token",
|
||||||
|
"jwks_uri": discoveryURL + "/jwks",
|
||||||
|
"userinfo_endpoint": "http://169.254.169.254/userinfo",
|
||||||
|
"id_token_signing_alg_values_supported": []string{"RS256"},
|
||||||
|
"response_types_supported": []string{"code"},
|
||||||
|
"subject_types_supported": []string{"public"},
|
||||||
|
}
|
||||||
|
w.Header().Set("Content-Type", "application/json")
|
||||||
|
_ = json.NewEncoder(w).Encode(doc)
|
||||||
|
})
|
||||||
|
srv := httptest.NewServer(mux)
|
||||||
|
defer srv.Close()
|
||||||
|
discoveryURL = srv.URL
|
||||||
|
|
||||||
|
// Build the *gooidc.Provider using the test-bypassed discovery
|
||||||
|
// client (setup_test.go init() already swapped oidcDiscoveryClient
|
||||||
|
// to a DefaultTransport-backed client so the httptest loopback URL
|
||||||
|
// resolves cleanly).
|
||||||
|
ctx := context.Background()
|
||||||
|
provider, err := gooidc.NewProvider(SafeOIDCContext(ctx), discoveryURL)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("NewProvider against loopback discovery server: %v", err)
|
||||||
|
}
|
||||||
|
if got := provider.UserInfoEndpoint(); got != "http://169.254.169.254/userinfo" {
|
||||||
|
t.Fatalf("provider.UserInfoEndpoint() = %q; want link-local override", got)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Restore the production-shape SafeHTTPDialContext-backed client
|
||||||
|
// just before the call. SafeOIDCContext inside fetchUserinfoGroups
|
||||||
|
// will pick THIS client up because gooidc.ClientContext reads the
|
||||||
|
// package-level var at wrap time.
|
||||||
|
saved := oidcDiscoveryClient
|
||||||
|
t.Cleanup(func() { oidcDiscoveryClient = saved })
|
||||||
|
oidcDiscoveryClient = &http.Client{
|
||||||
|
Timeout: oidcOutboundTimeout,
|
||||||
|
Transport: &http.Transport{
|
||||||
|
DialContext: validation.SafeHTTPDialContext(oidcOutboundTimeout),
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
entry := &providerEntry{
|
||||||
|
provider: provider,
|
||||||
|
oauthConfig: &oauth2.Config{
|
||||||
|
ClientID: "test-client",
|
||||||
|
ClientSecret: "test-secret",
|
||||||
|
Endpoint: oauth2.Endpoint{TokenURL: discoveryURL + "/token"},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
svc := &Service{}
|
||||||
|
_, err = svc.fetchUserinfoGroups(ctx, entry, &oauth2.Token{AccessToken: "test-access-token"}, "groups")
|
||||||
|
if err == nil {
|
||||||
|
t.Fatal("fetchUserinfoGroups against link-local userinfo endpoint: expected SSRF reject; got nil")
|
||||||
|
}
|
||||||
|
msg := err.Error()
|
||||||
|
// SafeHTTPDialContext emits one of two messages for the literal-IP
|
||||||
|
// case: "refusing to dial reserved address <ip>". Either is the
|
||||||
|
// load-bearing signal we want — a generic connect-refused / EOF
|
||||||
|
// would mean the guard didn't fire.
|
||||||
|
if !strings.Contains(msg, "refusing to dial") && !strings.Contains(msg, "reserved address") {
|
||||||
|
t.Errorf("fetchUserinfoGroups err = %q; want SafeHTTPDialContext reserved-address rejection", msg)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -29,4 +29,14 @@ func init() {
|
|||||||
Timeout: 10 * time.Second,
|
Timeout: 10 * time.Second,
|
||||||
Transport: http.DefaultTransport,
|
Transport: http.DefaultTransport,
|
||||||
}
|
}
|
||||||
|
// SEC-001 closure companion: same SSRF-bypass for the discovery
|
||||||
|
// fetch's http.Client + the static issuer-URL gate. Tests using
|
||||||
|
// httptest.NewServer get a loopback URL; the production
|
||||||
|
// SafeHTTPDialContext + validateIssuerSSRF would reject these.
|
||||||
|
// Production code never reassigns either var.
|
||||||
|
oidcDiscoveryClient = &http.Client{
|
||||||
|
Timeout: 10 * time.Second,
|
||||||
|
Transport: http.DefaultTransport,
|
||||||
|
}
|
||||||
|
validateIssuerSSRF = func(string) error { return nil }
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -58,11 +58,31 @@ type TestDiscoveryResult struct {
|
|||||||
func (s *Service) TestDiscovery(ctx context.Context, issuerURL string) (*TestDiscoveryResult, error) {
|
func (s *Service) TestDiscovery(ctx context.Context, issuerURL string) (*TestDiscoveryResult, error) {
|
||||||
res := &TestDiscoveryResult{}
|
res := &TestDiscoveryResult{}
|
||||||
|
|
||||||
|
// SEC-001 closure (Sprint 1, 2026-05-16): refuse reserved-address
|
||||||
|
// issuers up-front so operators see a clear policy error instead
|
||||||
|
// of the lower-level dial-rejection wrap from SafeHTTPDialContext.
|
||||||
|
// The dial-time guard remains the authoritative DNS-rebinding-safe
|
||||||
|
// defense; this is the early-fail UX rail. Routed through the
|
||||||
|
// validateIssuerSSRF package-level seam so tests using
|
||||||
|
// httptest.NewServer can swap it for a no-op (see setup_test.go).
|
||||||
|
if vErr := validateIssuerSSRF(issuerURL); vErr != nil {
|
||||||
|
res.Errors = append(res.Errors, fmt.Sprintf("issuer_url failed SSRF policy: %v", vErr))
|
||||||
|
return res, nil
|
||||||
|
}
|
||||||
|
|
||||||
// Step 1 — discovery. gooidc.NewProvider fetches
|
// Step 1 — discovery. gooidc.NewProvider fetches
|
||||||
// `<issuer>/.well-known/openid-configuration` and runs the iss
|
// `<issuer>/.well-known/openid-configuration` and runs the iss
|
||||||
// match check internally; on failure it returns a fmt-style
|
// match check internally; on failure it returns a fmt-style
|
||||||
// wrapped error.
|
// wrapped error.
|
||||||
provider, err := gooidc.NewProvider(ctx, issuerURL)
|
//
|
||||||
|
// SEC-001 closure (Sprint 1, 2026-05-16): the bare `ctx` is wrapped
|
||||||
|
// in SafeOIDCContext so the discovery fetch + the resulting
|
||||||
|
// Verifier's internal JWKS fetch both run through a transport
|
||||||
|
// whose DialContext is validation.SafeHTTPDialContext. Pre-fix the
|
||||||
|
// default HTTP client could be aimed at loopback / RFC 1918 /
|
||||||
|
// link-local / cloud-metadata addresses via the admin-supplied
|
||||||
|
// issuer URL. See safehttp.go for the full closure note.
|
||||||
|
provider, err := gooidc.NewProvider(SafeOIDCContext(ctx), issuerURL)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
res.Errors = append(res.Errors, fmt.Sprintf("discovery fetch failed: %v", err))
|
res.Errors = append(res.Errors, fmt.Sprintf("discovery fetch failed: %v", err))
|
||||||
return res, nil // Non-fatal at this layer; the response carries the per-leg failure.
|
return res, nil // Non-fatal at this layer; the response carries the per-leg failure.
|
||||||
|
|||||||
+25
-9
@@ -138,15 +138,13 @@ const (
|
|||||||
// docs/architecture.md "Authenticating-gateway pattern".
|
// docs/architecture.md "Authenticating-gateway pattern".
|
||||||
AuthTypeNone AuthType = "none"
|
AuthTypeNone AuthType = "none"
|
||||||
|
|
||||||
// AuthTypeOIDC (Auth Bundle 2 Phase 0) reserves the literal that the
|
// AuthTypeOIDC drives the OIDC SSO handler chain (Bundle 2 Phase 5+6).
|
||||||
// OIDC handler chain (Bundle 2 Phase 5+6) consumes. Pre-Bundle-2
|
// ARCH-002 closure (Sprint 4, 2026-05-16): the Phase-0 runtime guard
|
||||||
// behavior: the literal is allowed by the validator but the handler
|
// at cmd/server/main.go that refused to boot on this literal has
|
||||||
// chain is not yet wired, so the runtime guard in cmd/server/main.go
|
// been relaxed — every prerequisite (session.NewService,
|
||||||
// surfaces a clear "oidc auth-type configured but Bundle 2 handlers
|
// oidcsvc.NewService, ChainAuthSessionThenBearer, the OIDC handler
|
||||||
// not registered" error rather than silently falling back to api-key
|
// routes) ships, so CERTCTL_AUTH_TYPE=oidc is now a fully-supported
|
||||||
// (the failure mode that drove G-1's jwt-literal removal). Once
|
// production auth mode alongside api-key + none.
|
||||||
// Bundle 2's session middleware + OIDC service ship, the runtime
|
|
||||||
// guard relaxes and CERTCTL_AUTH_TYPE=oidc routes through them.
|
|
||||||
//
|
//
|
||||||
// Note: this is the AUTH-TYPE literal value, NOT the JWT alg literal.
|
// Note: this is the AUTH-TYPE literal value, NOT the JWT alg literal.
|
||||||
// ID tokens are JWTs internally but the auth-type config string is
|
// ID tokens are JWTs internally but the auth-type config string is
|
||||||
@@ -171,6 +169,24 @@ func ValidAuthTypes() []AuthType {
|
|||||||
return []AuthType{AuthTypeAPIKey, AuthTypeNone, AuthTypeOIDC}
|
return []AuthType{AuthTypeAPIKey, AuthTypeNone, AuthTypeOIDC}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// IsRuntimeSupportedAuthType reports whether the cmd/server/main.go
|
||||||
|
// runtime guard accepts this auth-type literal at boot. ARCH-002
|
||||||
|
// closure (Sprint 4, 2026-05-16): post-fix this returns true for
|
||||||
|
// every entry in ValidAuthTypes() — the Bundle-2-Phase-0 stale guard
|
||||||
|
// that exited on AuthTypeOIDC has been relaxed, since the full
|
||||||
|
// session middleware + OIDC handler chain ships. The helper exists
|
||||||
|
// as a single source of truth so the test suite can pin the
|
||||||
|
// invariant `ValidAuthTypes ⊆ runtime-supported` (which protects
|
||||||
|
// against future drift in either direction).
|
||||||
|
func IsRuntimeSupportedAuthType(t AuthType) bool {
|
||||||
|
switch t {
|
||||||
|
case AuthTypeAPIKey, AuthTypeNone, AuthTypeOIDC:
|
||||||
|
return true
|
||||||
|
default:
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// AuthConfig contains authentication configuration.
|
// AuthConfig contains authentication configuration.
|
||||||
type AuthConfig struct {
|
type AuthConfig struct {
|
||||||
// Type sets the authentication mechanism for the REST API.
|
// Type sets the authentication mechanism for the REST API.
|
||||||
|
|||||||
+347
-13
@@ -8,6 +8,7 @@ import (
|
|||||||
"errors"
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
"log/slog"
|
"log/slog"
|
||||||
|
"net/url"
|
||||||
"os"
|
"os"
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
@@ -104,6 +105,110 @@ type Config struct {
|
|||||||
Encryption EncryptionConfig
|
Encryption EncryptionConfig
|
||||||
CloudDiscovery CloudDiscoveryConfig
|
CloudDiscovery CloudDiscoveryConfig
|
||||||
OCSPResponder OCSPResponderConfig
|
OCSPResponder OCSPResponderConfig
|
||||||
|
// AuditChain holds the Sprint 6 COMP-001-HASH chain-verify tick
|
||||||
|
// cadence. Scheduler loop auditChainVerifyLoop reads VerifyInterval;
|
||||||
|
// the metric-side counter is wired separately in cmd/server/main.go.
|
||||||
|
AuditChain AuditChainConfig
|
||||||
|
// UserRetention holds the Sprint 6 COMP-002-RETENTION purge cadence
|
||||||
|
// + window. The scheduler's userRetentionLoop reads Interval; the
|
||||||
|
// UserRetentionService reads RetentionWindow + BatchCap.
|
||||||
|
UserRetention UserRetentionConfig
|
||||||
|
// Network holds outbound-egress policy tunables. Acquisition-audit
|
||||||
|
// SEC-009 + RED-005 closure (Sprint 5 ACQ, 2026-05-16). Today the
|
||||||
|
// only field is BlockRFC1918Outbound; future egress-policy knobs
|
||||||
|
// (per-host allowlists, max-dial-time overrides) go here.
|
||||||
|
Network NetworkConfig
|
||||||
|
// Observability holds the optional OpenTelemetry seed config.
|
||||||
|
// Acquisition-audit DEPL-006 closure (Sprint 6 ACQ, 2026-05-16).
|
||||||
|
// Default OTelEnabled=false — operators opt in via CERTCTL_OTEL_ENABLED=true.
|
||||||
|
Observability ObservabilityConfig
|
||||||
|
}
|
||||||
|
|
||||||
|
// ObservabilityConfig carries the operator-facing switch for the optional
// OpenTelemetry seed (acquisition-audit DEPL-006 closure, Sprint 6 ACQ,
// 2026-05-16). cmd/server/main.go passes it to
// internal/observability.Init at boot.
//
// CERTCTL_OTEL_ENABLED is the only certctl-specific gate. Endpoint,
// headers, protocol, service name and resource attributes all come from
// the standard OTEL_* environment variables, which the OTel SDK's
// resource.WithFromEnv and otlptracehttp.New honor directly. They are
// intentionally not re-implemented here: that avoids the "lying field"
// footgun where an env var exists in code but never reaches its consumer.
type ObservabilityConfig struct {
	// OTelEnabled (CERTCTL_OTEL_ENABLED) gates initialization of the
	// OpenTelemetry tracer provider. The default is false, so operators
	// who do not opt in see no behavior change. When true, the boot path
	// wires up an OTLP/HTTP exporter and registers it as the otel global
	// tracer provider.
	//
	// Sprint 6 stands up the surface only. Per-handler, per-query and
	// per-connector span instrumentation is a v2.3 follow-up; until it
	// lands, enabling the toggle yields process-level resource attributes
	// and (eventually) whatever spans the OTel SDK emits from its own
	// internal paths, but no certctl-domain spans.
	OTelEnabled bool
}
|
||||||
|
|
||||||
|
// NetworkConfig groups certctl's outbound-egress policy settings.
// Acquisition-audit SEC-009 + RED-005 closure (Sprint 5 ACQ, 2026-05-16).
type NetworkConfig struct {
	// BlockRFC1918Outbound (CERTCTL_BLOCK_RFC1918_OUTBOUND) widens the
	// SSRF reserved-IP gate (internal/validation/ssrf.go::IsReservedIP)
	// to also cover the three RFC 1918 ranges: 10.0.0.0/8,
	// 172.16.0.0/12 and 192.168.0.0/16.
	//
	// The default is false, which keeps the certctl threat-model default
	// that RFC1918 is legitimate destination space. Operators on hosted
	// IaaS where RFC1918 is internal trust (Kubernetes service CIDRs
	// that expose the API server inside RFC1918, internal-only
	// monitoring stacks, etc.) opt in by setting the variable to true.
	// The value is applied at boot from cmd/server/main.go via
	// validation.SetBlockRFC1918Outbound.
	//
	// IMPORTANT: turning this on also blocks RFC1918 targets for the
	// certctl network scanner. Operators who scan their own RFC1918
	// space for cert-discovery MUST leave it disabled.
	BlockRFC1918Outbound bool
}
|
||||||
|
|
||||||
|
// AuditChainConfig holds the settings for the scheduler loop that
// verifies the audit_events tamper-evidence chain (Sprint 6
// COMP-001-HASH closure).
//
// Each sweep runs the audit_events_verify_chain() plpgsql function from
// migration 000047 entirely server-side; any detection increments the
// certctl_audit_chain_break_detected_total counter.
type AuditChainConfig struct {
	// VerifyInterval (CERTCTL_AUDIT_CHAIN_VERIFY_INTERVAL) is how often
	// the chain-verify sweep ticks. Default 6h. Operators with very
	// large audit_events tables (millions of rows) may want a longer
	// interval, while operators with stricter detection targets may
	// shorten it — the walk is O(N) plpgsql and finishes in seconds even
	// at the 1M-row mark.
	VerifyInterval time.Duration
}
|
||||||
|
|
||||||
|
// UserRetentionConfig holds the settings for the Sprint 6
// COMP-002-RETENTION user PII purge sweeper. On each tick the
// scheduler's userRetentionLoop visits every user whose deactivated_at
// is older than RetentionWindow and scrubs the PII columns through
// UserRetentionService.DeleteUserPII.
type UserRetentionConfig struct {
	// Interval (CERTCTL_USER_RETENTION_INTERVAL) is the tick cadence of
	// the sweeper. Default 24h.
	Interval time.Duration

	// RetentionWindow (CERTCTL_USER_RETENTION_WINDOW) is how long a
	// row's PII remains in the table after deactivated_at. Default 30
	// days. Operators with strict GDPR / CCPA expectations may shorten
	// it; operators who need forensic recovery latitude may lengthen it.
	RetentionWindow time.Duration

	// BatchCap (CERTCTL_USER_RETENTION_BATCH_CAP) limits how many users
	// one tick processes, which keeps the blast radius predictable.
	// Default 200. A value of 0 disables the cap (test fixtures only).
	BatchCap int
}
|
||||||
|
|
||||||
// OCSPResponderConfig configures the dedicated OCSP-responder cert
|
// OCSPResponderConfig configures the dedicated OCSP-responder cert
|
||||||
@@ -333,7 +438,23 @@ func Load() (*Config, error) {
|
|||||||
AuditFlushTimeoutSeconds: getEnvInt("CERTCTL_AUDIT_FLUSH_TIMEOUT_SECONDS", 30),
|
AuditFlushTimeoutSeconds: getEnvInt("CERTCTL_AUDIT_FLUSH_TIMEOUT_SECONDS", 30),
|
||||||
},
|
},
|
||||||
Database: DatabaseConfig{
|
Database: DatabaseConfig{
|
||||||
URL: getEnv("CERTCTL_DATABASE_URL", "postgres://localhost/certctl"),
|
// DEPL-004 closure (Sprint 3, 2026-05-16). The Helm chart's
|
||||||
|
// _helpers.tpl renders the bundled-Postgres URL with a literal
|
||||||
|
// `$(POSTGRES_PASSWORD)` placeholder (see
|
||||||
|
// deploy/helm/certctl/templates/_helpers.tpl line 133). The
|
||||||
|
// Kubernetes env-substitution `$(VAR)` syntax ONLY expands
|
||||||
|
// when the value is a string literal in `env:` — values
|
||||||
|
// sourced from Secrets (via `valueFrom.secretKeyRef`) are
|
||||||
|
// passed through verbatim with no expansion. Pre-fix the
|
||||||
|
// server received the literal "postgres://user:$(POSTGRES_PASSWORD)@..."
|
||||||
|
// string and tried to dial Postgres with that as the password,
|
||||||
|
// failing with auth error and leaking the placeholder into
|
||||||
|
// error logs. expandDatabaseURL substitutes the placeholder
|
||||||
|
// with os.Getenv("POSTGRES_PASSWORD") when present; external-
|
||||||
|
// Postgres deploys that bake the password directly into the
|
||||||
|
// URL string are unaffected because there is no placeholder
|
||||||
|
// to match.
|
||||||
|
URL: expandDatabaseURL(getEnv("CERTCTL_DATABASE_URL", "postgres://localhost/certctl")),
|
||||||
// Phase 6 SCALE-M1 closure (2026-05-14): bumped default from
|
// Phase 6 SCALE-M1 closure (2026-05-14): bumped default from
|
||||||
// 25 → 50 to relieve pool-saturation pressure on 1K+ agent /
|
// 25 → 50 to relieve pool-saturation pressure on 1K+ agent /
|
||||||
// 10K+ cert fleets. Postgres default max_connections is 100
|
// 10K+ cert fleets. Postgres default max_connections is 100
|
||||||
@@ -351,6 +472,11 @@ func Load() (*Config, error) {
|
|||||||
// Audit fix #9 — per-tick concurrency cap on the renewal/issuance/
|
// Audit fix #9 — per-tick concurrency cap on the renewal/issuance/
|
||||||
// deployment goroutine fan-out. ≤0 → 1 (sequential).
|
// deployment goroutine fan-out. ≤0 → 1 (sequential).
|
||||||
RenewalConcurrency: getEnvInt("CERTCTL_RENEWAL_CONCURRENCY", 25),
|
RenewalConcurrency: getEnvInt("CERTCTL_RENEWAL_CONCURRENCY", 25),
|
||||||
|
// SCALE-001 closure (Sprint 2, 2026-05-16) — per-tick claim cap on
|
||||||
|
// the scheduler's ClaimPendingJobs sweep. Default 1000 keeps the
|
||||||
|
// fan-out busy (≈40× the renewal-concurrency cap) without
|
||||||
|
// page-thrashing on a 100K-job burst. ≤0 → 1000 (fail-safe).
|
||||||
|
JobClaimLimit: getEnvInt("CERTCTL_SCHEDULER_JOB_CLAIM_LIMIT", 1000),
|
||||||
AgentHealthCheckInterval: getEnvDuration("CERTCTL_SCHEDULER_AGENT_HEALTH_CHECK_INTERVAL", 2*time.Minute),
|
AgentHealthCheckInterval: getEnvDuration("CERTCTL_SCHEDULER_AGENT_HEALTH_CHECK_INTERVAL", 2*time.Minute),
|
||||||
NotificationProcessInterval: getEnvDuration("CERTCTL_SCHEDULER_NOTIFICATION_PROCESS_INTERVAL", 1*time.Minute),
|
NotificationProcessInterval: getEnvDuration("CERTCTL_SCHEDULER_NOTIFICATION_PROCESS_INTERVAL", 1*time.Minute),
|
||||||
// I-005: retry sweep for failed notifications. Mirrors RetryInterval
|
// I-005: retry sweep for failed notifications. Mirrors RetryInterval
|
||||||
@@ -397,10 +523,18 @@ func Load() (*Config, error) {
|
|||||||
// NamedKeys is populated from CERTCTL_API_KEYS_NAMED below so Load()
|
// NamedKeys is populated from CERTCTL_API_KEYS_NAMED below so Load()
|
||||||
// can surface parse errors alongside other config errors.
|
// can surface parse errors alongside other config errors.
|
||||||
|
|
||||||
// Bundle-5 / Audit H-007: agent-registration bootstrap secret.
|
// Bundle-5 / Audit H-007 + acquisition-audit RED-003 closure
|
||||||
// Empty (default) = warn-mode pass-through; v2.2.0 will require it.
|
// (Sprint 5 ACQ, 2026-05-16): agent-registration bootstrap
|
||||||
|
// secret. The deny-empty default flipped from false → true
|
||||||
|
// on 2026-05-16. Operators upgrading from v2.1.x can re-
|
||||||
|
// open the warn-mode escape hatch by explicitly setting
|
||||||
|
// CERTCTL_AGENT_BOOTSTRAP_TOKEN_DENY_EMPTY=false (one
|
||||||
|
// upgrade window); see CHANGELOG v2.2.0 for the migration
|
||||||
|
// note. Demo mode (CERTCTL_DEMO_MODE_ACK=true) keeps the
|
||||||
|
// pre-flip warn-mode for the screenshot path — see
|
||||||
|
// Validate() for the override site.
|
||||||
AgentBootstrapToken: getEnv("CERTCTL_AGENT_BOOTSTRAP_TOKEN", ""),
|
AgentBootstrapToken: getEnv("CERTCTL_AGENT_BOOTSTRAP_TOKEN", ""),
|
||||||
AgentBootstrapTokenDenyEmpty: getEnvBool("CERTCTL_AGENT_BOOTSTRAP_TOKEN_DENY_EMPTY", false),
|
AgentBootstrapTokenDenyEmpty: getEnvBool("CERTCTL_AGENT_BOOTSTRAP_TOKEN_DENY_EMPTY", true),
|
||||||
// Bundle 1 Phase 6: one-shot bootstrap token for the
|
// Bundle 1 Phase 6: one-shot bootstrap token for the
|
||||||
// /v1/auth/bootstrap endpoint that mints the first admin
|
// /v1/auth/bootstrap endpoint that mints the first admin
|
||||||
// key. Empty = bootstrap endpoint disabled (default).
|
// key. Empty = bootstrap endpoint disabled (default).
|
||||||
@@ -446,6 +580,13 @@ func Load() (*Config, error) {
|
|||||||
BurstSize: getEnvInt("CERTCTL_RATE_LIMIT_BURST", 100),
|
BurstSize: getEnvInt("CERTCTL_RATE_LIMIT_BURST", 100),
|
||||||
PerUserRPS: getEnvFloat("CERTCTL_RATE_LIMIT_PER_USER_RPS", 0),
|
PerUserRPS: getEnvFloat("CERTCTL_RATE_LIMIT_PER_USER_RPS", 0),
|
||||||
PerUserBurstSize: getEnvInt("CERTCTL_RATE_LIMIT_PER_USER_BURST", 0),
|
PerUserBurstSize: getEnvInt("CERTCTL_RATE_LIMIT_PER_USER_BURST", 0),
|
||||||
|
// SEC-006 closure (Sprint 2, 2026-05-16): bounded unused-bucket
|
||||||
|
// lifetime. 1h chosen to be well above realistic operator IP
|
||||||
|
// churn (returning clients keep their bucket) and well below
|
||||||
|
// the unbounded-leak window the pre-fix code allowed.
|
||||||
|
BucketTTL: getEnvDuration("CERTCTL_RATE_LIMIT_BUCKET_TTL", 1*time.Hour),
|
||||||
|
SlidingWindowBackend: getEnv("CERTCTL_RATE_LIMIT_BACKEND", "memory"),
|
||||||
|
SlidingWindowJanitorInterval: getEnvDuration("CERTCTL_RATE_LIMIT_JANITOR_INTERVAL", 5*time.Minute),
|
||||||
},
|
},
|
||||||
CORS: CORSConfig{
|
CORS: CORSConfig{
|
||||||
AllowedOrigins: getEnvList("CERTCTL_CORS_ORIGINS", nil),
|
AllowedOrigins: getEnvList("CERTCTL_CORS_ORIGINS", nil),
|
||||||
@@ -472,6 +613,12 @@ func Load() (*Config, error) {
|
|||||||
SMTPPassword: getEnv("CERTCTL_SMTP_PASSWORD", ""),
|
SMTPPassword: getEnv("CERTCTL_SMTP_PASSWORD", ""),
|
||||||
SMTPFromAddress: getEnv("CERTCTL_SMTP_FROM_ADDRESS", ""),
|
SMTPFromAddress: getEnv("CERTCTL_SMTP_FROM_ADDRESS", ""),
|
||||||
SMTPUseTLS: getEnvBool("CERTCTL_SMTP_USE_TLS", true),
|
SMTPUseTLS: getEnvBool("CERTCTL_SMTP_USE_TLS", true),
|
||||||
|
// Acquisition-audit DOC-001 closure (Sprint 7 ACQ, 2026-05-16).
|
||||||
|
// Wire the previously-orphan webhook notifier
|
||||||
|
// (internal/connector/notifier/webhook/) into the boot
|
||||||
|
// path. Empty WebhookURL = notifier disabled.
|
||||||
|
WebhookURL: getEnv("CERTCTL_WEBHOOK_URL", ""),
|
||||||
|
WebhookSecret: getEnv("CERTCTL_WEBHOOK_SECRET", ""),
|
||||||
},
|
},
|
||||||
NetworkScan: NetworkScanConfig{
|
NetworkScan: NetworkScanConfig{
|
||||||
Enabled: getEnvBool("CERTCTL_NETWORK_SCAN_ENABLED", false),
|
Enabled: getEnvBool("CERTCTL_NETWORK_SCAN_ENABLED", false),
|
||||||
@@ -672,6 +819,36 @@ func Load() (*Config, error) {
|
|||||||
RotationGrace: getEnvDuration("CERTCTL_OCSP_RESPONDER_ROTATION_GRACE", 7*24*time.Hour),
|
RotationGrace: getEnvDuration("CERTCTL_OCSP_RESPONDER_ROTATION_GRACE", 7*24*time.Hour),
|
||||||
Validity: getEnvDuration("CERTCTL_OCSP_RESPONDER_VALIDITY", 30*24*time.Hour),
|
Validity: getEnvDuration("CERTCTL_OCSP_RESPONDER_VALIDITY", 30*24*time.Hour),
|
||||||
},
|
},
|
||||||
|
AuditChain: AuditChainConfig{
|
||||||
|
VerifyInterval: getEnvDuration("CERTCTL_AUDIT_CHAIN_VERIFY_INTERVAL", 6*time.Hour),
|
||||||
|
},
|
||||||
|
UserRetention: UserRetentionConfig{
|
||||||
|
Interval: getEnvDuration("CERTCTL_USER_RETENTION_INTERVAL", 24*time.Hour),
|
||||||
|
RetentionWindow: getEnvDuration("CERTCTL_USER_RETENTION_WINDOW", 30*24*time.Hour),
|
||||||
|
BatchCap: getEnvInt("CERTCTL_USER_RETENTION_BATCH_CAP", 200),
|
||||||
|
},
|
||||||
|
// Acquisition-audit SEC-009 + RED-005 closure (Sprint 5 ACQ,
|
||||||
|
// 2026-05-16). Default false preserves the existing threat-model
|
||||||
|
// default (RFC1918 is legitimate destination space); operators
|
||||||
|
// on hosted IaaS opt in via CERTCTL_BLOCK_RFC1918_OUTBOUND=true.
|
||||||
|
// Wired into validation.SetBlockRFC1918Outbound at boot from
|
||||||
|
// cmd/server/main.go.
|
||||||
|
Network: NetworkConfig{
|
||||||
|
BlockRFC1918Outbound: getEnvBool("CERTCTL_BLOCK_RFC1918_OUTBOUND", false),
|
||||||
|
},
|
||||||
|
// Acquisition-audit DEPL-006 closure (Sprint 6 ACQ,
|
||||||
|
// 2026-05-16). Optional OpenTelemetry seed. Default OTelEnabled=false
|
||||||
|
// preserves zero-overhead behavior for operators who don't opt
|
||||||
|
// in; the boot path calls observability.Init unconditionally
|
||||||
|
// (observability.Init short-circuits to a no-op shutdown when
|
||||||
|
// disabled). Operators set CERTCTL_OTEL_ENABLED=true plus the
|
||||||
|
// standard OTEL_* env vars (OTEL_EXPORTER_OTLP_ENDPOINT, etc.)
|
||||||
|
// to wire spans to their collector. Per-handler / per-query
|
||||||
|
// instrumentation is a v2.3 roadmap follow-up; this sprint
|
||||||
|
// stands up the surface only.
|
||||||
|
Observability: ObservabilityConfig{
|
||||||
|
OTelEnabled: getEnvBool("CERTCTL_OTEL_ENABLED", false),
|
||||||
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
// Parse CERTCTL_API_KEYS_NAMED for named key authentication (M-002).
|
// Parse CERTCTL_API_KEYS_NAMED for named key authentication (M-002).
|
||||||
@@ -764,6 +941,36 @@ func (c *Config) Validate() error {
|
|||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Phase 13 Sprint 13.3 closure (ARCH-M1): validate
|
||||||
|
// CERTCTL_RATE_LIMIT_BACKEND is one of the two supported values.
|
||||||
|
// Fail-closed on any other input so a typo doesn't silently fall
|
||||||
|
// back to the wrong backend (the operator picked "postgress" and
|
||||||
|
// got memory rate-limits in a 3-replica cluster).
|
||||||
|
switch c.RateLimit.SlidingWindowBackend {
|
||||||
|
case "", "memory", "postgres":
|
||||||
|
// "" is treated as "memory" — test-built Configs (which
|
||||||
|
// construct the struct literal directly without going
|
||||||
|
// through Load()) don't get the default; Load() always
|
||||||
|
// fills "memory". Either path lands the runtime on the
|
||||||
|
// in-memory backend.
|
||||||
|
default:
|
||||||
|
return fmt.Errorf(
|
||||||
|
"invalid CERTCTL_RATE_LIMIT_BACKEND=%q — refuse to start: must be \"memory\" (default, per-process limits; for single-replica deploys) or \"postgres\" (cross-replica-consistent via the rate_limit_buckets table; required for HA deploys). See docs/operator/observability.md.",
|
||||||
|
c.RateLimit.SlidingWindowBackend,
|
||||||
|
)
|
||||||
|
}
|
||||||
|
// Janitor interval lower bound — 1 minute. Below this the sweep
|
||||||
|
// cost outweighs the row-cleanup benefit; above this still
|
||||||
|
// matches the operator's bound (5 minutes default; can be raised
|
||||||
|
// indefinitely).
|
||||||
|
if c.RateLimit.SlidingWindowJanitorInterval > 0 &&
|
||||||
|
c.RateLimit.SlidingWindowJanitorInterval < time.Minute {
|
||||||
|
return fmt.Errorf(
|
||||||
|
"invalid CERTCTL_RATE_LIMIT_JANITOR_INTERVAL=%v — refuse to start: must be ≥ 1 minute (default 5m).",
|
||||||
|
c.RateLimit.SlidingWindowJanitorInterval,
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
// Validate database configuration
|
// Validate database configuration
|
||||||
if c.Database.URL == "" {
|
if c.Database.URL == "" {
|
||||||
return fmt.Errorf("database URL is required")
|
return fmt.Errorf("database URL is required")
|
||||||
@@ -830,15 +1037,21 @@ func (c *Config) Validate() error {
|
|||||||
return fmt.Errorf("auth secret is required for auth type %s", c.Auth.Type)
|
return fmt.Errorf("auth secret is required for auth type %s", c.Auth.Type)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Phase 2 SEC-H1 closure (2026-05-13): the AgentBootstrapTokenDenyEmpty
|
// Phase 2 SEC-H1 closure (2026-05-13) + acquisition-audit RED-003
|
||||||
// staged feature flag. When the operator opts in via
|
// closure (Sprint 5 ACQ, 2026-05-16): the AgentBootstrapTokenDenyEmpty
|
||||||
// CERTCTL_AGENT_BOOTSTRAP_TOKEN_DENY_EMPTY=true AND the bootstrap
|
// fail-closed gate. The flag flipped default from false → true on
|
||||||
// token is empty, Validate() returns a fail-closed error. Default
|
// 2026-05-16; operators upgrading from v2.1.x can reopen the
|
||||||
// flag value is false, preserving the existing v2.1.x warn-mode
|
// warn-mode escape hatch with CERTCTL_AGENT_BOOTSTRAP_TOKEN_DENY_EMPTY=false
|
||||||
// pass-through behavior for backward compatibility. The default-flip
|
// for one upgrade window. CHANGELOG v2.2.0 documents the cutover.
|
||||||
// to true is scheduled for v2.2.0 in WORKSPACE-ROADMAP.md — operators
|
//
|
||||||
// get one upgrade window to set a real token.
|
// Demo-mode override: a screenshot/demo deploy with
|
||||||
if c.Auth.AgentBootstrapTokenDenyEmpty && c.Auth.AgentBootstrapToken == "" {
|
// CERTCTL_DEMO_MODE_ACK=true skips this guard so the demo path
|
||||||
|
// stays one-command-up. The accompanying boot banner WARN in
|
||||||
|
// cmd/server/main.go keeps the posture visible — demo deploys
|
||||||
|
// already log a prominent "DEMO MODE ACTIVE" line at every boot.
|
||||||
|
// Production deploys never set DemoModeAck, so this override
|
||||||
|
// cannot inadvertently re-enable warn-mode in production.
|
||||||
|
if c.Auth.AgentBootstrapTokenDenyEmpty && c.Auth.AgentBootstrapToken == "" && !c.Auth.DemoModeAck {
|
||||||
return fmt.Errorf("phase-2 SEC-H1 fail-closed guard: %w", ErrAgentBootstrapTokenRequired)
|
return fmt.Errorf("phase-2 SEC-H1 fail-closed guard: %w", ErrAgentBootstrapTokenRequired)
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -986,6 +1199,27 @@ func (c *Config) Validate() error {
|
|||||||
if !validKeygenModes[c.Keygen.Mode] {
|
if !validKeygenModes[c.Keygen.Mode] {
|
||||||
return fmt.Errorf("invalid keygen mode: %s (must be 'agent' or 'server')", c.Keygen.Mode)
|
return fmt.Errorf("invalid keygen mode: %s (must be 'agent' or 'server')", c.Keygen.Mode)
|
||||||
}
|
}
|
||||||
|
// ARCH-003 closure (Sprint 4, 2026-05-16). README L12 + L82 say
|
||||||
|
// "private keys stay on your infrastructure" and "never touch the
|
||||||
|
// control plane" as blanket claims. CERTCTL_KEYGEN_MODE=server
|
||||||
|
// breaks both claims — the control plane mints the keys directly,
|
||||||
|
// in process memory, and writes them to the renewal job for
|
||||||
|
// delivery. Pre-fix the server printed a boot WARN and started
|
||||||
|
// anyway, so the blanket claim was silently false in any deploy
|
||||||
|
// where the operator flipped the flag without reading their logs.
|
||||||
|
// Mirror the Phase-2 SEC-H3 DemoModeAck pattern: refuse to boot
|
||||||
|
// in server-keygen mode unless the operator has explicitly
|
||||||
|
// acknowledged the demo posture via CERTCTL_DEMO_MODE_ACK=true.
|
||||||
|
// Bypass for tests that legitimately exercise the server-keygen
|
||||||
|
// path: those construct Config directly without going through
|
||||||
|
// Validate(), so this gate doesn't fire there.
|
||||||
|
if c.Keygen.Mode == "server" && !c.Auth.DemoModeAck {
|
||||||
|
return fmt.Errorf(
|
||||||
|
"CERTCTL_KEYGEN_MODE=server is demo-only — the control plane mints private keys in process memory, " +
|
||||||
|
"breaking the 'keys never touch the control plane' production posture. Set " +
|
||||||
|
"CERTCTL_DEMO_MODE_ACK=true + CERTCTL_DEMO_MODE_ACK_TS=$(date +%%s) to acknowledge, " +
|
||||||
|
"OR set CERTCTL_KEYGEN_MODE=agent (the default) for production")
|
||||||
|
}
|
||||||
|
|
||||||
// SCEP fail-loud startup gate (H-2, CWE-306).
|
// SCEP fail-loud startup gate (H-2, CWE-306).
|
||||||
//
|
//
|
||||||
@@ -1216,9 +1450,78 @@ func (c *Config) Validate() error {
|
|||||||
return fmt.Errorf("awaiting approval timeout must be at least 1 second")
|
return fmt.Errorf("awaiting approval timeout must be at least 1 second")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Acquisition-audit SEC-013 closure (Sprint 2, 2026-05-16).
|
||||||
|
// Post-validate advisory WARN — NOT fail-closed — when
|
||||||
|
// CERTCTL_DATABASE_URL carries sslmode=disable AND the host is
|
||||||
|
// external (not loopback / not a known in-cluster service name).
|
||||||
|
// The compose bridge network legitimately uses sslmode=disable on
|
||||||
|
// the docker-internal hop to postgres:5432; failing closed would
|
||||||
|
// break the production-shaped quickstart. The WARN catches the
|
||||||
|
// real-world landmine: an operator who points CERTCTL_DATABASE_URL
|
||||||
|
// at an RDS / managed-Postgres host outside the bridge network
|
||||||
|
// without flipping sslmode to verify-full.
|
||||||
|
warnExternalSslmodeDisable(c.Database.URL, slog.Default())
|
||||||
|
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// dbHostLocalSafelist is the set of database hosts for which
// sslmode=disable is an acceptable default: loopback addresses plus the
// compose / Helm in-cluster service names. SEC-013 closure (Sprint 2
// ACQ, 2026-05-16).
//
// Lookups are exact matches against the normalized (lowercased,
// trailing-dot-stripped) host. The .svc.cluster.local suffix is NOT an
// entry of this map; warnExternalSslmodeDisable checks it separately
// with a suffix match. Adding entries is an operator-judgment call; keep
// the list tight, because a too-permissive list silences a real landmine
// warning.
var dbHostLocalSafelist = map[string]struct{}{
	"localhost":        {},
	"127.0.0.1":        {},
	"::1":              {},
	"postgres":         {},
	"certctl-postgres": {},
}

// warnExternalSslmodeDisable emits a single slog WARN (same shape as the
// demo-mode WARN in cmd/server/main.go) when rawURL is a postgres:// or
// postgresql:// URL that carries sslmode=disable AND names a host outside
// the local safelist. It is advisory only and never fails the caller.
// SEC-013 closure (Sprint 2 ACQ).
//
// Parameters:
//   - rawURL: the configured database URL; empty or unparsable input is
//     ignored on purpose, because the later sql.Open reports a clearer
//     error than a noisy WARN here would.
//   - logger: destination for the warning; nil falls back to
//     slog.Default().
//
// Host resolution:
//   - The URL authority is used first. When it is empty, the libpq-style
//     `host` query parameter is consulted.
//   - No host at all, or a host beginning with "/" (a Unix-socket
//     directory), is treated as local and produces no warning.
//   - Matching is case-insensitive and ignores one trailing dot, since
//     DNS names are case-insensitive and "localhost." is the FQDN
//     spelling of "localhost".
func warnExternalSslmodeDisable(rawURL string, logger *slog.Logger) {
	if logger == nil {
		logger = slog.Default()
	}
	if rawURL == "" {
		return
	}
	u, err := url.Parse(rawURL)
	if err != nil {
		return
	}
	if u.Scheme != "postgres" && u.Scheme != "postgresql" {
		return
	}
	q := u.Query()
	if q.Get("sslmode") != "disable" {
		return
	}

	host := u.Hostname()
	if host == "" {
		// postgres:///db?host=... — the host may be supplied as a
		// named parameter instead of in the authority.
		host = q.Get("host")
	}
	if host == "" || strings.HasPrefix(host, "/") {
		// Driver default or Unix-domain socket: there is no remote
		// host to warn about.
		return
	}

	normalized := strings.ToLower(strings.TrimSuffix(host, "."))
	if _, ok := dbHostLocalSafelist[normalized]; ok {
		return
	}
	// In-cluster service names ending in .svc.cluster.local are
	// acceptable; the docker-bridge / pod-network hop is treated as
	// trusted by the existing compose + Helm conventions.
	if strings.HasSuffix(normalized, ".svc.cluster.local") {
		return
	}

	logger.Warn("CERTCTL_DATABASE_URL points at a non-local Postgres host with sslmode=disable — Postgres traffic crosses an untrusted network in cleartext. Set sslmode=verify-full and provide a CA bundle. See docs/operator/database-tls.md for the full upgrade procedure. Override env var: CERTCTL_DATABASE_URL (set the URL with sslmode=verify-full + sslrootcert=<ca-path>).",
		"host", host,
		"sslmode", "disable",
	)
}
|
||||||
|
|
||||||
// getEnv reads a string environment variable with the given key and default value.
|
// getEnv reads a string environment variable with the given key and default value.
|
||||||
func getEnv(key, defaultValue string) string {
|
func getEnv(key, defaultValue string) string {
|
||||||
if value := os.Getenv(key); value != "" {
|
if value := os.Getenv(key); value != "" {
|
||||||
@@ -1227,6 +1530,37 @@ func getEnv(key, defaultValue string) string {
|
|||||||
return defaultValue
|
return defaultValue
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// expandDatabaseURL replaces the literal "$(POSTGRES_PASSWORD)"
// placeholder in a database URL with the value of the POSTGRES_PASSWORD
// environment variable. DEPL-004 closure (Sprint 3, 2026-05-16).
//
// Kubernetes only expands `$(VAR)` when the env value is a string
// literal in the Pod spec. Values sourced through
// `valueFrom.secretKeyRef` (which is how the Helm chart wires
// CERTCTL_DATABASE_URL) are NOT expanded, so the literal placeholder
// reaches the application. Doing the expansion in-process lets the
// bundled-Postgres flow work without a per-pod entrypoint shim.
//
// The helper is deliberately conservative:
//   - Only the one well-known token emitted by the chart's
//     `_helpers.tpl` is recognized. A URL without it (for example an
//     external-Postgres URL with the real password baked in) is
//     returned untouched.
//   - When POSTGRES_PASSWORD is unset or empty the input is returned
//     as-is, so the downstream connection failure is unchanged.
//   - For URL-form DSNs (containing "://") the password is
//     percent-encoded before insertion: userinfo escaping ahead of the
//     first "?", query escaping after it. Without this, a password
//     containing characters such as '/', '?', '#', '%', '@' or ':'
//     would corrupt the URL or be decoded to a different value by the
//     driver.
//   - Inputs that are not URL-form (no "://") get the raw value,
//     matching the previous behavior.
//
// The parameter is named rawURL so it does not shadow the net/url
// package.
func expandDatabaseURL(rawURL string) string {
	const placeholder = "$(POSTGRES_PASSWORD)"
	if !strings.Contains(rawURL, placeholder) {
		return rawURL
	}
	pw := os.Getenv("POSTGRES_PASSWORD")
	if pw == "" {
		return rawURL
	}
	if !strings.Contains(rawURL, "://") {
		return strings.ReplaceAll(rawURL, placeholder, pw)
	}

	// Userinfo.String renders "<user>:<escaped password>"; with an empty
	// user the leading ":" is all that has to be trimmed to obtain the
	// password escaped for the userinfo position.
	userinfoPW := strings.TrimPrefix(url.UserPassword("", pw).String(), ":")

	head, query, hasQuery := strings.Cut(rawURL, "?")
	expanded := strings.ReplaceAll(head, placeholder, userinfoPW)
	if hasQuery {
		expanded += "?" + strings.ReplaceAll(query, placeholder, url.QueryEscape(pw))
	}
	return expanded
}
|
||||||
|
|
||||||
// getEnvInt reads an integer environment variable with the given key and default value.
|
// getEnvInt reads an integer environment variable with the given key and default value.
|
||||||
func getEnvInt(key string, defaultValue int) int {
|
func getEnvInt(key string, defaultValue int) int {
|
||||||
if value := os.Getenv(key); value != "" {
|
if value := os.Getenv(key); value != "" {
|
||||||
|
|||||||
@@ -50,6 +50,7 @@ func TestESTConfig_LegacyFlatFields_SynthesizeSingleProfile(t *testing.T) {
|
|||||||
t.Setenv("CERTCTL_DB_URL", "postgres://localhost/certctl?sslmode=disable")
|
t.Setenv("CERTCTL_DB_URL", "postgres://localhost/certctl?sslmode=disable")
|
||||||
t.Setenv("CERTCTL_AUTH_TYPE", "api-key")
|
t.Setenv("CERTCTL_AUTH_TYPE", "api-key")
|
||||||
t.Setenv("CERTCTL_AUTH_SECRET", "test-secret")
|
t.Setenv("CERTCTL_AUTH_SECRET", "test-secret")
|
||||||
|
t.Setenv("CERTCTL_AGENT_BOOTSTRAP_TOKEN", "test-bootstrap-token-placeholder")
|
||||||
srv := validServerConfig(t)
|
srv := validServerConfig(t)
|
||||||
t.Setenv("CERTCTL_SERVER_TLS_CERT_PATH", srv.TLS.CertPath)
|
t.Setenv("CERTCTL_SERVER_TLS_CERT_PATH", srv.TLS.CertPath)
|
||||||
t.Setenv("CERTCTL_SERVER_TLS_KEY_PATH", srv.TLS.KeyPath)
|
t.Setenv("CERTCTL_SERVER_TLS_KEY_PATH", srv.TLS.KeyPath)
|
||||||
@@ -102,6 +103,7 @@ func TestESTConfig_DisabledNoLegacyShim(t *testing.T) {
|
|||||||
t.Setenv("CERTCTL_DB_URL", "postgres://localhost/certctl?sslmode=disable")
|
t.Setenv("CERTCTL_DB_URL", "postgres://localhost/certctl?sslmode=disable")
|
||||||
t.Setenv("CERTCTL_AUTH_TYPE", "api-key")
|
t.Setenv("CERTCTL_AUTH_TYPE", "api-key")
|
||||||
t.Setenv("CERTCTL_AUTH_SECRET", "test-secret")
|
t.Setenv("CERTCTL_AUTH_SECRET", "test-secret")
|
||||||
|
t.Setenv("CERTCTL_AGENT_BOOTSTRAP_TOKEN", "test-bootstrap-token-placeholder")
|
||||||
srv := validServerConfig(t)
|
srv := validServerConfig(t)
|
||||||
t.Setenv("CERTCTL_SERVER_TLS_CERT_PATH", srv.TLS.CertPath)
|
t.Setenv("CERTCTL_SERVER_TLS_CERT_PATH", srv.TLS.CertPath)
|
||||||
t.Setenv("CERTCTL_SERVER_TLS_KEY_PATH", srv.TLS.KeyPath)
|
t.Setenv("CERTCTL_SERVER_TLS_KEY_PATH", srv.TLS.KeyPath)
|
||||||
@@ -152,6 +154,7 @@ func TestESTConfig_MultipleProfiles_LoadFromEnv(t *testing.T) {
|
|||||||
t.Setenv("CERTCTL_DB_URL", "postgres://localhost/certctl?sslmode=disable")
|
t.Setenv("CERTCTL_DB_URL", "postgres://localhost/certctl?sslmode=disable")
|
||||||
t.Setenv("CERTCTL_AUTH_TYPE", "api-key")
|
t.Setenv("CERTCTL_AUTH_TYPE", "api-key")
|
||||||
t.Setenv("CERTCTL_AUTH_SECRET", "test-secret")
|
t.Setenv("CERTCTL_AUTH_SECRET", "test-secret")
|
||||||
|
t.Setenv("CERTCTL_AGENT_BOOTSTRAP_TOKEN", "test-bootstrap-token-placeholder")
|
||||||
srv := validServerConfig(t)
|
srv := validServerConfig(t)
|
||||||
t.Setenv("CERTCTL_SERVER_TLS_CERT_PATH", srv.TLS.CertPath)
|
t.Setenv("CERTCTL_SERVER_TLS_CERT_PATH", srv.TLS.CertPath)
|
||||||
t.Setenv("CERTCTL_SERVER_TLS_KEY_PATH", srv.TLS.KeyPath)
|
t.Setenv("CERTCTL_SERVER_TLS_KEY_PATH", srv.TLS.KeyPath)
|
||||||
@@ -234,6 +237,7 @@ func TestESTConfig_StructuredFormBeatsLegacy(t *testing.T) {
|
|||||||
t.Setenv("CERTCTL_DB_URL", "postgres://localhost/certctl?sslmode=disable")
|
t.Setenv("CERTCTL_DB_URL", "postgres://localhost/certctl?sslmode=disable")
|
||||||
t.Setenv("CERTCTL_AUTH_TYPE", "api-key")
|
t.Setenv("CERTCTL_AUTH_TYPE", "api-key")
|
||||||
t.Setenv("CERTCTL_AUTH_SECRET", "test-secret")
|
t.Setenv("CERTCTL_AUTH_SECRET", "test-secret")
|
||||||
|
t.Setenv("CERTCTL_AGENT_BOOTSTRAP_TOKEN", "test-bootstrap-token-placeholder")
|
||||||
srv := validServerConfig(t)
|
srv := validServerConfig(t)
|
||||||
t.Setenv("CERTCTL_SERVER_TLS_CERT_PATH", srv.TLS.CertPath)
|
t.Setenv("CERTCTL_SERVER_TLS_CERT_PATH", srv.TLS.CertPath)
|
||||||
t.Setenv("CERTCTL_SERVER_TLS_KEY_PATH", srv.TLS.KeyPath)
|
t.Setenv("CERTCTL_SERVER_TLS_KEY_PATH", srv.TLS.KeyPath)
|
||||||
|
|||||||
@@ -68,6 +68,7 @@ func TestSCEPConfig_LegacyFlatFields_SynthesizeSingleProfile(t *testing.T) {
|
|||||||
t.Setenv("CERTCTL_DB_URL", "postgres://localhost/certctl?sslmode=disable")
|
t.Setenv("CERTCTL_DB_URL", "postgres://localhost/certctl?sslmode=disable")
|
||||||
t.Setenv("CERTCTL_AUTH_TYPE", "api-key")
|
t.Setenv("CERTCTL_AUTH_TYPE", "api-key")
|
||||||
t.Setenv("CERTCTL_AUTH_SECRET", "test-secret")
|
t.Setenv("CERTCTL_AUTH_SECRET", "test-secret")
|
||||||
|
t.Setenv("CERTCTL_AGENT_BOOTSTRAP_TOKEN", "test-bootstrap-token-placeholder")
|
||||||
srv := validServerConfig(t)
|
srv := validServerConfig(t)
|
||||||
t.Setenv("CERTCTL_SERVER_TLS_CERT_PATH", srv.TLS.CertPath)
|
t.Setenv("CERTCTL_SERVER_TLS_CERT_PATH", srv.TLS.CertPath)
|
||||||
t.Setenv("CERTCTL_SERVER_TLS_KEY_PATH", srv.TLS.KeyPath)
|
t.Setenv("CERTCTL_SERVER_TLS_KEY_PATH", srv.TLS.KeyPath)
|
||||||
@@ -116,6 +117,7 @@ func TestSCEPConfig_MultipleProfiles_LoadFromEnv(t *testing.T) {
|
|||||||
t.Setenv("CERTCTL_DB_URL", "postgres://localhost/certctl?sslmode=disable")
|
t.Setenv("CERTCTL_DB_URL", "postgres://localhost/certctl?sslmode=disable")
|
||||||
t.Setenv("CERTCTL_AUTH_TYPE", "api-key")
|
t.Setenv("CERTCTL_AUTH_TYPE", "api-key")
|
||||||
t.Setenv("CERTCTL_AUTH_SECRET", "test-secret")
|
t.Setenv("CERTCTL_AUTH_SECRET", "test-secret")
|
||||||
|
t.Setenv("CERTCTL_AGENT_BOOTSTRAP_TOKEN", "test-bootstrap-token-placeholder")
|
||||||
srv := validServerConfig(t)
|
srv := validServerConfig(t)
|
||||||
t.Setenv("CERTCTL_SERVER_TLS_CERT_PATH", srv.TLS.CertPath)
|
t.Setenv("CERTCTL_SERVER_TLS_CERT_PATH", srv.TLS.CertPath)
|
||||||
t.Setenv("CERTCTL_SERVER_TLS_KEY_PATH", srv.TLS.KeyPath)
|
t.Setenv("CERTCTL_SERVER_TLS_KEY_PATH", srv.TLS.KeyPath)
|
||||||
@@ -162,6 +164,7 @@ func TestSCEPConfig_StructuredFormBeatsLegacy(t *testing.T) {
|
|||||||
t.Setenv("CERTCTL_DB_URL", "postgres://localhost/certctl?sslmode=disable")
|
t.Setenv("CERTCTL_DB_URL", "postgres://localhost/certctl?sslmode=disable")
|
||||||
t.Setenv("CERTCTL_AUTH_TYPE", "api-key")
|
t.Setenv("CERTCTL_AUTH_TYPE", "api-key")
|
||||||
t.Setenv("CERTCTL_AUTH_SECRET", "test-secret")
|
t.Setenv("CERTCTL_AUTH_SECRET", "test-secret")
|
||||||
|
t.Setenv("CERTCTL_AGENT_BOOTSTRAP_TOKEN", "test-bootstrap-token-placeholder")
|
||||||
srv := validServerConfig(t)
|
srv := validServerConfig(t)
|
||||||
t.Setenv("CERTCTL_SERVER_TLS_CERT_PATH", srv.TLS.CertPath)
|
t.Setenv("CERTCTL_SERVER_TLS_CERT_PATH", srv.TLS.CertPath)
|
||||||
t.Setenv("CERTCTL_SERVER_TLS_KEY_PATH", srv.TLS.KeyPath)
|
t.Setenv("CERTCTL_SERVER_TLS_KEY_PATH", srv.TLS.KeyPath)
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
package config
|
package config
|
||||||
|
|
||||||
import (
|
import (
|
||||||
|
"bytes"
|
||||||
"crypto/ecdsa"
|
"crypto/ecdsa"
|
||||||
"crypto/elliptic"
|
"crypto/elliptic"
|
||||||
"crypto/rand"
|
"crypto/rand"
|
||||||
@@ -52,6 +53,14 @@ func setMinimalValidEnv(t *testing.T) {
|
|||||||
certPath, keyPath := generateTestTLSPair(t)
|
certPath, keyPath := generateTestTLSPair(t)
|
||||||
t.Setenv("CERTCTL_SERVER_TLS_CERT_PATH", certPath)
|
t.Setenv("CERTCTL_SERVER_TLS_CERT_PATH", certPath)
|
||||||
t.Setenv("CERTCTL_SERVER_TLS_KEY_PATH", keyPath)
|
t.Setenv("CERTCTL_SERVER_TLS_KEY_PATH", keyPath)
|
||||||
|
// Acquisition-audit RED-003 closure (Sprint 5 ACQ, 2026-05-16):
|
||||||
|
// the deny-empty default flipped to true, so Load() now refuses
|
||||||
|
// to start with an empty bootstrap token. Supply a placeholder
|
||||||
|
// so Load()-based tests that don't specifically test the
|
||||||
|
// deny-empty gate continue to pass. Tests that DO exercise the
|
||||||
|
// empty-token gate explicitly override via
|
||||||
|
// t.Setenv("CERTCTL_AGENT_BOOTSTRAP_TOKEN", "") after this helper.
|
||||||
|
t.Setenv("CERTCTL_AGENT_BOOTSTRAP_TOKEN", "test-bootstrap-token-placeholder")
|
||||||
}
|
}
|
||||||
|
|
||||||
// generateTestTLSPair writes an ECDSA P-256 self-signed certificate + private
|
// generateTestTLSPair writes an ECDSA P-256 self-signed certificate + private
|
||||||
@@ -232,6 +241,14 @@ func TestLoad_AllEnvVarsSet(t *testing.T) {
|
|||||||
t.Setenv("CERTCTL_RATE_LIMIT_BURST", "200")
|
t.Setenv("CERTCTL_RATE_LIMIT_BURST", "200")
|
||||||
t.Setenv("CERTCTL_CORS_ORIGINS", "https://a.com,https://b.com")
|
t.Setenv("CERTCTL_CORS_ORIGINS", "https://a.com,https://b.com")
|
||||||
t.Setenv("CERTCTL_KEYGEN_MODE", "server")
|
t.Setenv("CERTCTL_KEYGEN_MODE", "server")
|
||||||
|
// Sprint 4 ARCH-003 made Load()→Validate() refuse to boot in
|
||||||
|
// server-keygen mode without an explicit demo-mode acknowledgement.
|
||||||
|
// This test exercises the "every CERTCTL_* env var set" path, so
|
||||||
|
// it sets KEYGEN_MODE=server — which now requires the demo-ack
|
||||||
|
// pair. Mirror the SEC-H3 demo-ack pattern: ACK=true + fresh TS
|
||||||
|
// within the 24h window.
|
||||||
|
t.Setenv("CERTCTL_DEMO_MODE_ACK", "true")
|
||||||
|
t.Setenv("CERTCTL_DEMO_MODE_ACK_TS", strconv.FormatInt(time.Now().Unix(), 10))
|
||||||
t.Setenv("CERTCTL_LOG_LEVEL", "debug")
|
t.Setenv("CERTCTL_LOG_LEVEL", "debug")
|
||||||
t.Setenv("CERTCTL_LOG_FORMAT", "text")
|
t.Setenv("CERTCTL_LOG_FORMAT", "text")
|
||||||
t.Setenv("CERTCTL_DATABASE_URL", "postgres://user:pass@db:5432/certctl")
|
t.Setenv("CERTCTL_DATABASE_URL", "postgres://user:pass@db:5432/certctl")
|
||||||
@@ -404,9 +421,14 @@ func TestLoad_CommaSeparatedList(t *testing.T) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Phase 2 SEC-H1 (2026-05-13) — AgentBootstrapTokenDenyEmpty staged flag.
|
// Phase 2 SEC-H1 (2026-05-13) introduced the AgentBootstrapTokenDenyEmpty
|
||||||
// When false (default), an empty token is permitted (v2.1.x warn-mode
|
// staged flag with default false. Acquisition-audit RED-003 closure
|
||||||
// pass-through preserved). When true, an empty token fails closed.
|
// (Sprint 5 ACQ, 2026-05-16) flipped the default to true. The test
|
||||||
|
// below preserves the back-compat path (operator explicitly opts back
|
||||||
|
// to the v2.1.x warn-mode pass-through); the new default behavior is
|
||||||
|
// covered by TestLoad_AgentBootstrapTokenDenyEmpty_DefaultIsTrue +
|
||||||
|
// TestValidate_AgentBootstrapTokenDenyEmpty_True_EmptyTokenFailsClosed
|
||||||
|
// further down in this file.
|
||||||
func TestValidate_AgentBootstrapTokenDenyEmpty_DefaultFalse_AllowsEmpty(t *testing.T) {
|
func TestValidate_AgentBootstrapTokenDenyEmpty_DefaultFalse_AllowsEmpty(t *testing.T) {
|
||||||
cfg := &Config{
|
cfg := &Config{
|
||||||
Server: validServerConfig(t),
|
Server: validServerConfig(t),
|
||||||
@@ -1918,3 +1940,411 @@ func TestValidate_Bundle2_CORSConcreteAllowlist_Accepted(t *testing.T) {
|
|||||||
t.Errorf("Validate() returned %v; want nil for concrete CORS allowlist", err)
|
t.Errorf("Validate() returned %v; want nil for concrete CORS allowlist", err)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// =============================================================================
|
||||||
|
// DEPL-004 closure (Sprint 3, 2026-05-16). The Helm chart renders the
|
||||||
|
// bundled-Postgres URL with a literal "$(POSTGRES_PASSWORD)"
|
||||||
|
// placeholder. Kubernetes does NOT expand `$(VAR)` syntax when the env
|
||||||
|
// is sourced from a Secret (valueFrom.secretKeyRef), so the server
|
||||||
|
// receives the placeholder verbatim. expandDatabaseURL substitutes the
|
||||||
|
// token with os.Getenv("POSTGRES_PASSWORD") at Load() time.
|
||||||
|
// =============================================================================
|
||||||
|
|
||||||
|
func TestExpandDatabaseURL_SubstitutesPlaceholder(t *testing.T) {
|
||||||
|
t.Setenv("POSTGRES_PASSWORD", "s3cret!")
|
||||||
|
in := "postgres://certctl:$(POSTGRES_PASSWORD)@db:5432/certctl?sslmode=disable"
|
||||||
|
got := expandDatabaseURL(in)
|
||||||
|
want := "postgres://certctl:s3cret!@db:5432/certctl?sslmode=disable"
|
||||||
|
if got != want {
|
||||||
|
t.Errorf("expandDatabaseURL = %q; want %q", got, want)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestExpandDatabaseURL_NoPlaceholderPassesThrough(t *testing.T) {
|
||||||
|
// External-Postgres deploys bake the password into the URL string
|
||||||
|
// — the helper must not touch URLs that don't carry the placeholder.
|
||||||
|
t.Setenv("POSTGRES_PASSWORD", "ignored")
|
||||||
|
in := "postgres://user:realpw@external:5432/db?sslmode=require"
|
||||||
|
if got := expandDatabaseURL(in); got != in {
|
||||||
|
t.Errorf("expandDatabaseURL on non-placeholder URL = %q; want %q (no-op)", got, in)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestExpandDatabaseURL_PlaceholderButNoEnvLeftAlone(t *testing.T) {
|
||||||
|
// When POSTGRES_PASSWORD is unset, leave the URL alone so the
|
||||||
|
// downstream connection failure is the same as before (misconfig
|
||||||
|
// is the operator's, not our regression).
|
||||||
|
t.Setenv("POSTGRES_PASSWORD", "")
|
||||||
|
in := "postgres://certctl:$(POSTGRES_PASSWORD)@db:5432/certctl?sslmode=disable"
|
||||||
|
if got := expandDatabaseURL(in); got != in {
|
||||||
|
t.Errorf("expandDatabaseURL with no POSTGRES_PASSWORD = %q; want unchanged %q", got, in)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestExpandDatabaseURL_MultipleOccurrences(t *testing.T) {
|
||||||
|
// Defensive: belt-and-suspenders. The chart only emits one
|
||||||
|
// placeholder today but ReplaceAll guards against future drift.
|
||||||
|
t.Setenv("POSTGRES_PASSWORD", "X")
|
||||||
|
in := "$(POSTGRES_PASSWORD)/$(POSTGRES_PASSWORD)"
|
||||||
|
want := "X/X"
|
||||||
|
if got := expandDatabaseURL(in); got != want {
|
||||||
|
t.Errorf("expandDatabaseURL = %q; want %q", got, want)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// =============================================================================
|
||||||
|
// ARCH-002 closure (Sprint 4, 2026-05-16). Auth Bundle 2 Phase 6
|
||||||
|
// shipped the OIDC session middleware + handler chain in code, but
|
||||||
|
// cmd/server/main.go retained a Phase-0 runtime guard that exited
|
||||||
|
// the process when CERTCTL_AUTH_TYPE=oidc. The guard was supposed
|
||||||
|
// to relax once the prerequisites landed; it didn't, and the
|
||||||
|
// README's "Sign in with OIDC SSO" claim was effectively a lie
|
||||||
|
// because the server refused to start with auth=oidc.
|
||||||
|
//
|
||||||
|
// Post-fix the runtime gate is centralised at
|
||||||
|
// config.IsRuntimeSupportedAuthType and accepts every entry in
|
||||||
|
// ValidAuthTypes(). These tests pin the new invariant — the
|
||||||
|
// runtime support set MUST equal the validator's allowed set.
|
||||||
|
// A future regression that flips back to "OIDC not supported"
|
||||||
|
// surfaces here.
|
||||||
|
// =============================================================================
|
||||||
|
|
||||||
|
func TestIsRuntimeSupportedAuthType_AcceptsAllValidEntries(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
for _, at := range ValidAuthTypes() {
|
||||||
|
if !IsRuntimeSupportedAuthType(at) {
|
||||||
|
t.Errorf("IsRuntimeSupportedAuthType(%q) = false; want true (every valid auth type must be runtime-supported)", at)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestIsRuntimeSupportedAuthType_AcceptsOIDC(t *testing.T) {
|
||||||
|
// Explicit ARCH-002 invariant — OIDC must boot cleanly.
|
||||||
|
t.Parallel()
|
||||||
|
if !IsRuntimeSupportedAuthType(AuthTypeOIDC) {
|
||||||
|
t.Fatalf("IsRuntimeSupportedAuthType(oidc) = false; the Bundle-2 stale runtime guard regressed (ARCH-002)")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestIsRuntimeSupportedAuthType_RejectsUnknown(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
for _, bad := range []AuthType{"", "jwt", "saml", "mtls", "API-KEY"} {
|
||||||
|
if IsRuntimeSupportedAuthType(bad) {
|
||||||
|
t.Errorf("IsRuntimeSupportedAuthType(%q) = true; want false (unknown auth types must be rejected)", bad)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// =============================================================================
|
||||||
|
// ARCH-003 closure (Sprint 4, 2026-05-16). README claimed "private
|
||||||
|
// keys stay on your infrastructure" / "never touch the control plane"
|
||||||
|
// as a blanket promise. CERTCTL_KEYGEN_MODE=server breaks both — keys
|
||||||
|
// are minted in the server process and shipped to the renewal job.
|
||||||
|
// Pre-fix the server printed a boot WARN and started anyway, so the
|
||||||
|
// blanket claim was silently false in any deploy that flipped the flag
|
||||||
|
// without reading logs.
|
||||||
|
//
|
||||||
|
// Post-fix Validate() refuses to accept Mode=server unless
|
||||||
|
// CERTCTL_DEMO_MODE_ACK=true is also set (mirroring the SEC-H3
|
||||||
|
// 24-hour ACK pattern). Production deploys must use Mode=agent.
|
||||||
|
// =============================================================================
|
||||||
|
|
||||||
|
func TestValidate_RejectsServerKeygenWithoutDemoAck(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
cfg := &Config{
|
||||||
|
Server: validServerConfig(t),
|
||||||
|
Database: DatabaseConfig{URL: "postgres://localhost/certctl", MaxConnections: 25},
|
||||||
|
Log: LogConfig{Level: "info", Format: "json"},
|
||||||
|
Auth: AuthConfig{Type: "api-key", Secret: "x", DemoModeAck: false},
|
||||||
|
Keygen: KeygenConfig{Mode: "server"},
|
||||||
|
Scheduler: SchedulerConfig{
|
||||||
|
RenewalCheckInterval: 1 * time.Hour,
|
||||||
|
JobProcessorInterval: 30 * time.Second,
|
||||||
|
AgentHealthCheckInterval: 2 * time.Minute,
|
||||||
|
NotificationProcessInterval: 1 * time.Minute,
|
||||||
|
NotificationRetryInterval: 2 * time.Minute,
|
||||||
|
RetryInterval: 5 * time.Minute,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
err := cfg.Validate()
|
||||||
|
if err == nil {
|
||||||
|
t.Fatalf("Validate(KeygenMode=server, DemoAck=false) returned nil; want fail-closed rejection")
|
||||||
|
}
|
||||||
|
if !strings.Contains(err.Error(), "CERTCTL_KEYGEN_MODE=server") {
|
||||||
|
t.Errorf("Validate err = %v; want error citing CERTCTL_KEYGEN_MODE=server", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestValidate_AcceptsServerKeygenWithDemoAck(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
// Operators who explicitly acknowledge the demo posture get to boot
|
||||||
|
// in server-keygen mode. Same pattern SEC-H3 uses for AUTH_TYPE=none.
|
||||||
|
tsRecent := strconv.FormatInt(time.Now().Unix(), 10)
|
||||||
|
cfg := &Config{
|
||||||
|
Server: validServerConfig(t),
|
||||||
|
Database: DatabaseConfig{URL: "postgres://localhost/certctl", MaxConnections: 25},
|
||||||
|
Log: LogConfig{Level: "info", Format: "json"},
|
||||||
|
Auth: AuthConfig{
|
||||||
|
Type: "api-key",
|
||||||
|
Secret: "x",
|
||||||
|
DemoModeAck: true,
|
||||||
|
DemoModeAckTS: tsRecent,
|
||||||
|
},
|
||||||
|
Keygen: KeygenConfig{Mode: "server"},
|
||||||
|
Scheduler: SchedulerConfig{
|
||||||
|
RenewalCheckInterval: 1 * time.Hour,
|
||||||
|
JobProcessorInterval: 30 * time.Second,
|
||||||
|
AgentHealthCheckInterval: 2 * time.Minute,
|
||||||
|
NotificationProcessInterval: 1 * time.Minute,
|
||||||
|
NotificationRetryInterval: 2 * time.Minute,
|
||||||
|
RetryInterval: 5 * time.Minute,
|
||||||
|
JobTimeoutInterval: 10 * time.Minute,
|
||||||
|
AwaitingCSRTimeout: 24 * time.Hour,
|
||||||
|
AwaitingApprovalTimeout: 168 * time.Hour,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
if err := cfg.Validate(); err != nil {
|
||||||
|
t.Errorf("Validate(KeygenMode=server, DemoAck=true, fresh TS) = %v; want nil", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestValidate_AgentKeygenIgnoresDemoAck(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
// The new gate must NOT regress production deploys — agent mode
|
||||||
|
// (the default) boots cleanly without any demo ACK.
|
||||||
|
cfg := &Config{
|
||||||
|
Server: validServerConfig(t),
|
||||||
|
Database: DatabaseConfig{URL: "postgres://localhost/certctl", MaxConnections: 25},
|
||||||
|
Log: LogConfig{Level: "info", Format: "json"},
|
||||||
|
Auth: AuthConfig{Type: "api-key", Secret: "x", DemoModeAck: false},
|
||||||
|
Keygen: KeygenConfig{Mode: "agent"},
|
||||||
|
Scheduler: SchedulerConfig{
|
||||||
|
RenewalCheckInterval: 1 * time.Hour,
|
||||||
|
JobProcessorInterval: 30 * time.Second,
|
||||||
|
AgentHealthCheckInterval: 2 * time.Minute,
|
||||||
|
NotificationProcessInterval: 1 * time.Minute,
|
||||||
|
NotificationRetryInterval: 2 * time.Minute,
|
||||||
|
RetryInterval: 5 * time.Minute,
|
||||||
|
JobTimeoutInterval: 10 * time.Minute,
|
||||||
|
AwaitingCSRTimeout: 24 * time.Hour,
|
||||||
|
AwaitingApprovalTimeout: 168 * time.Hour,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
if err := cfg.Validate(); err != nil {
|
||||||
|
t.Errorf("Validate(KeygenMode=agent, DemoAck=false) = %v; want nil (production default must boot)", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// newBufferLogger builds a slog.Logger backed by a JSON handler that writes
// into an in-memory buffer, and returns both. The handler level is Debug,
// so records at Debug and above are captured. Tests use the buffer to
// assert on WARN emission from warnExternalSslmodeDisable (SEC-013 closure,
// Sprint 2 ACQ).
func newBufferLogger() (*slog.Logger, *bytes.Buffer) {
	sink := new(bytes.Buffer)
	opts := &slog.HandlerOptions{Level: slog.LevelDebug}
	return slog.New(slog.NewJSONHandler(sink, opts)), sink
}
|
||||||
|
|
||||||
|
// TestWarnExternalSslmodeDisable_FiresOnExternalHost asserts an external
|
||||||
|
// host (e.g. RDS) + sslmode=disable produces a WARN. SEC-013 closure
|
||||||
|
// (Sprint 2 ACQ, 2026-05-16). The advisory exists to surface the
|
||||||
|
// real-world landmine: an operator who points CERTCTL_DATABASE_URL at a
|
||||||
|
// managed-Postgres host outside the bridge network without flipping
|
||||||
|
// sslmode to verify-full.
|
||||||
|
func TestWarnExternalSslmodeDisable_FiresOnExternalHost(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
logger, buf := newBufferLogger()
|
||||||
|
warnExternalSslmodeDisable("postgres://certctl:secret@db.internal.example.com:5432/certctl?sslmode=disable", logger)
|
||||||
|
|
||||||
|
out := buf.String()
|
||||||
|
if !strings.Contains(out, `"level":"WARN"`) {
|
||||||
|
t.Fatalf("expected a WARN record, got: %s", out)
|
||||||
|
}
|
||||||
|
if !strings.Contains(out, "db.internal.example.com") {
|
||||||
|
t.Errorf("WARN should include the external host in structured fields; got: %s", out)
|
||||||
|
}
|
||||||
|
if !strings.Contains(out, "sslmode") {
|
||||||
|
t.Errorf("WARN should include the sslmode structured field; got: %s", out)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestWarnExternalSslmodeDisable_QuietForLocalSafelist asserts the
|
||||||
|
// loopback + in-cluster service-name conventions stay silent. These are
|
||||||
|
// the legitimate sslmode=disable callers — compose bridge network
|
||||||
|
// (`postgres` / `certctl-postgres`), localhost dev loops, and K8s
|
||||||
|
// in-cluster service names (`*.svc.cluster.local`). SEC-013 closure.
|
||||||
|
func TestWarnExternalSslmodeDisable_QuietForLocalSafelist(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
silentHosts := []string{
|
||||||
|
"postgres://certctl@localhost:5432/certctl?sslmode=disable",
|
||||||
|
"postgres://certctl@127.0.0.1:5432/certctl?sslmode=disable",
|
||||||
|
"postgres://certctl@[::1]:5432/certctl?sslmode=disable",
|
||||||
|
"postgres://certctl@postgres:5432/certctl?sslmode=disable",
|
||||||
|
"postgres://certctl@certctl-postgres:5432/certctl?sslmode=disable",
|
||||||
|
"postgres://certctl@certctl-postgres.certctl.svc.cluster.local:5432/certctl?sslmode=disable",
|
||||||
|
}
|
||||||
|
for _, url := range silentHosts {
|
||||||
|
url := url
|
||||||
|
t.Run(url, func(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
logger, buf := newBufferLogger()
|
||||||
|
warnExternalSslmodeDisable(url, logger)
|
||||||
|
if buf.Len() != 0 {
|
||||||
|
t.Errorf("expected silence for safelisted host (%s); got: %s", url, buf.String())
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestWarnExternalSslmodeDisable_QuietWithoutDisable asserts that any
|
||||||
|
// sslmode other than `disable` (the production-grade modes) stays
|
||||||
|
// silent even with an external host. SEC-013 closure.
|
||||||
|
func TestWarnExternalSslmodeDisable_QuietWithoutDisable(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
for _, url := range []string{
|
||||||
|
"postgres://certctl@db.internal.example.com:5432/certctl?sslmode=verify-full&sslrootcert=/etc/ssl/ca.pem",
|
||||||
|
"postgres://certctl@db.internal.example.com:5432/certctl?sslmode=require",
|
||||||
|
"postgres://certctl@db.internal.example.com:5432/certctl", // no sslmode at all
|
||||||
|
} {
|
||||||
|
url := url
|
||||||
|
t.Run(url, func(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
logger, buf := newBufferLogger()
|
||||||
|
warnExternalSslmodeDisable(url, logger)
|
||||||
|
if buf.Len() != 0 {
|
||||||
|
t.Errorf("expected silence for non-disable sslmode (%s); got: %s", url, buf.String())
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestWarnExternalSslmodeDisable_QuietOnUnparseableOrEmpty asserts the
|
||||||
|
// helper is permissive on garbage input — downstream sql.Open surfaces
|
||||||
|
// the real parse error; the SEC-013 advisory must not become a noisy
|
||||||
|
// hot path. SEC-013 closure.
|
||||||
|
func TestWarnExternalSslmodeDisable_QuietOnUnparseableOrEmpty(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
for _, url := range []string{
|
||||||
|
"",
|
||||||
|
"not-a-url",
|
||||||
|
"mysql://certctl@db:3306/x?sslmode=disable", // non-postgres scheme
|
||||||
|
} {
|
||||||
|
url := url
|
||||||
|
t.Run(url, func(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
logger, buf := newBufferLogger()
|
||||||
|
warnExternalSslmodeDisable(url, logger)
|
||||||
|
if buf.Len() != 0 {
|
||||||
|
t.Errorf("expected silence for unparseable/non-postgres input (%q); got: %s", url, buf.String())
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// -----------------------------------------------------------------------------
|
||||||
|
// Acquisition-audit Sprint 5 ACQ — RED-003 deny-empty default flip
|
||||||
|
// (2026-05-16). Three new tests pin the new default + the two
|
||||||
|
// override paths (operator opt-back, demo-mode override).
|
||||||
|
// -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
// TestLoad_AgentBootstrapTokenDenyEmpty_DefaultIsTrue pins the post-
|
||||||
|
// 2026-05-16 default. Load() with no CERTCTL_AGENT_BOOTSTRAP_TOKEN_DENY_EMPTY
|
||||||
|
// set must produce a Config whose AuthConfig.AgentBootstrapTokenDenyEmpty
|
||||||
|
// is true. Together with the next test, this proves the default flip
|
||||||
|
// from false → true at the boot path.
|
||||||
|
func TestLoad_AgentBootstrapTokenDenyEmpty_DefaultIsTrue(t *testing.T) {
|
||||||
|
clearCertctlEnv(t)
|
||||||
|
setMinimalValidEnv(t)
|
||||||
|
// Set a real bootstrap token so the deny-empty + empty-token guard
|
||||||
|
// doesn't trip — we're asserting the default flag VALUE here, not
|
||||||
|
// the guard behavior.
|
||||||
|
t.Setenv("CERTCTL_AGENT_BOOTSTRAP_TOKEN", "a-real-32-byte-token-value-here-x")
|
||||||
|
|
||||||
|
cfg, err := Load()
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("Load() = %v; want nil", err)
|
||||||
|
}
|
||||||
|
if !cfg.Auth.AgentBootstrapTokenDenyEmpty {
|
||||||
|
t.Error("Load() default AgentBootstrapTokenDenyEmpty = false; want true (Sprint 5 ACQ flip on 2026-05-16)")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestValidate_DenyEmptyDefault_RefusesWithoutToken pins the new
|
||||||
|
// default's effect: an empty token, with the flag at its
|
||||||
|
// post-2026-05-16 default of true, fails closed at Validate().
|
||||||
|
// Different shape from
|
||||||
|
// TestValidate_AgentBootstrapTokenDenyEmpty_True_EmptyTokenFailsClosed
|
||||||
|
// — that test sets the flag explicitly; this one drives the flag
|
||||||
|
// value from Load() defaults so it tracks any future default flip.
|
||||||
|
func TestValidate_DenyEmptyDefault_RefusesWithoutToken(t *testing.T) {
|
||||||
|
clearCertctlEnv(t)
|
||||||
|
setMinimalValidEnv(t)
|
||||||
|
// setMinimalValidEnv now sets CERTCTL_AGENT_BOOTSTRAP_TOKEN to
|
||||||
|
// a placeholder (post-Sprint-5 ACQ default-flip — most Load()-
|
||||||
|
// based tests need it). Override back to empty here because
|
||||||
|
// THIS test is specifically the empty-token + default-deny-empty
|
||||||
|
// fail-closed assertion.
|
||||||
|
t.Setenv("CERTCTL_AGENT_BOOTSTRAP_TOKEN", "")
|
||||||
|
// CERTCTL_AGENT_BOOTSTRAP_TOKEN_DENY_EMPTY deliberately unset
|
||||||
|
// so the default (true) applies.
|
||||||
|
|
||||||
|
_, err := Load()
|
||||||
|
if err == nil {
|
||||||
|
t.Fatal("Load() = nil; want ErrAgentBootstrapTokenRequired (deny-empty default flipped to true; empty token must fail closed)")
|
||||||
|
}
|
||||||
|
if !errors.Is(err, ErrAgentBootstrapTokenRequired) {
|
||||||
|
t.Errorf("Load() err = %v; want errors.Is to match ErrAgentBootstrapTokenRequired", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestValidate_DenyEmptyExplicitFalse_AllowsEmpty pins the v2.1.x
|
||||||
|
// back-compat path: an operator who explicitly opts out of the new
|
||||||
|
// default (CERTCTL_AGENT_BOOTSTRAP_TOKEN_DENY_EMPTY=false) keeps the
|
||||||
|
// warn-mode pass-through. CHANGELOG v2.2.0 documents this as a
|
||||||
|
// one-upgrade-window escape hatch for operators who haven't generated
|
||||||
|
// a token yet.
|
||||||
|
func TestValidate_DenyEmptyExplicitFalse_AllowsEmpty(t *testing.T) {
|
||||||
|
clearCertctlEnv(t)
|
||||||
|
setMinimalValidEnv(t)
|
||||||
|
t.Setenv("CERTCTL_AGENT_BOOTSTRAP_TOKEN_DENY_EMPTY", "false")
|
||||||
|
// Override setMinimalValidEnv's placeholder so we exercise the
|
||||||
|
// "operator explicit opt-out + empty token" path.
|
||||||
|
t.Setenv("CERTCTL_AGENT_BOOTSTRAP_TOKEN", "")
|
||||||
|
|
||||||
|
cfg, err := Load()
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("Load() = %v; want nil (explicit deny-empty=false allows empty token)", err)
|
||||||
|
}
|
||||||
|
if cfg.Auth.AgentBootstrapTokenDenyEmpty {
|
||||||
|
t.Error("AgentBootstrapTokenDenyEmpty = true; want false (operator explicit opt-out)")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestValidate_DenyEmpty_DemoModeAckOverride_AllowsEmpty pins the
|
||||||
|
// demo-mode escape hatch. A demo deploy with
|
||||||
|
// CERTCTL_DEMO_MODE_ACK=true (plus the SEC-H3 24h-fresh TS) keeps
|
||||||
|
// the warn-mode pass-through even with deny-empty=true. The
|
||||||
|
// accompanying boot banner WARN in cmd/server/main.go keeps the
|
||||||
|
// posture visible to log scrapers — demo deploys already emit a
|
||||||
|
// prominent "DEMO MODE ACTIVE" banner at every boot.
|
||||||
|
func TestValidate_DenyEmpty_DemoModeAckOverride_AllowsEmpty(t *testing.T) {
|
||||||
|
cfg := &Config{
|
||||||
|
Server: validServerConfig(t),
|
||||||
|
Database: DatabaseConfig{URL: "postgres://localhost/certctl", MaxConnections: 25},
|
||||||
|
Log: LogConfig{Level: "info", Format: "json"},
|
||||||
|
Auth: AuthConfig{
|
||||||
|
Type: "none",
|
||||||
|
AgentBootstrapToken: "",
|
||||||
|
AgentBootstrapTokenDenyEmpty: true,
|
||||||
|
DemoModeAck: true,
|
||||||
|
// 24h-fresh TS — SEC-H3 already gates demo-mode boot on
|
||||||
|
// TS freshness; supply a current epoch so we exercise
|
||||||
|
// only the deny-empty-override leg, not the SEC-H3 leg.
|
||||||
|
DemoModeAckTS: strconv.FormatInt(time.Now().Unix(), 10),
|
||||||
|
},
|
||||||
|
Keygen: KeygenConfig{Mode: "agent"},
|
||||||
|
Scheduler: validSchedulerConfig(),
|
||||||
|
}
|
||||||
|
if err := cfg.Validate(); err != nil {
|
||||||
|
t.Fatalf("Validate() = %v; want nil (demo-mode override should allow empty token)", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -83,4 +83,28 @@ type NotifierConfig struct {
|
|||||||
// Default: true. Set to false for plain SMTP (not recommended).
|
// Default: true. Set to false for plain SMTP (not recommended).
|
||||||
// Setting: CERTCTL_SMTP_USE_TLS environment variable.
|
// Setting: CERTCTL_SMTP_USE_TLS environment variable.
|
||||||
SMTPUseTLS bool
|
SMTPUseTLS bool
|
||||||
|
|
||||||
|
// WebhookURL is the HTTP(S) endpoint for the generic webhook
|
||||||
|
// notifier. Acquisition-audit DOC-001 closure (Sprint 7 ACQ,
|
||||||
|
// 2026-05-16). When set, the cmd/server/main.go boot path
|
||||||
|
// constructs an internal/connector/notifier/webhook.Connector
|
||||||
|
// (full SafeHTTPDialContext SSRF guard + ValidateSafeURL pre-
|
||||||
|
// flight + HMAC-SHA256 signing) wrapped in NotifierAdapter so
|
||||||
|
// the simpler service.Notifier (Send + Channel) interface used
|
||||||
|
// by the notification service receives a "webhook" channel
|
||||||
|
// registration. Pre-Sprint-7 the impl existed in the tree but
|
||||||
|
// was unwired — README claimed "6 notifiers" while only 5
|
||||||
|
// were registered. Optional: leave empty to disable.
|
||||||
|
// Setting: CERTCTL_WEBHOOK_URL environment variable.
|
||||||
|
WebhookURL string
|
||||||
|
|
||||||
|
// WebhookSecret is the HMAC-SHA256 shared secret used by the
|
||||||
|
// webhook notifier to sign every outbound HTTP POST in the
|
||||||
|
// X-Webhook-Signature header. The receiver verifies the signature
|
||||||
|
// against the SAME secret before trusting the payload — without
|
||||||
|
// this guard, any host that can reach the operator's webhook
|
||||||
|
// endpoint could spoof certctl notifications. Optional but
|
||||||
|
// strongly recommended; empty disables signing (operator-
|
||||||
|
// acknowledged unsigned mode). Setting: CERTCTL_WEBHOOK_SECRET.
|
||||||
|
WebhookSecret string
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -170,6 +170,26 @@ type SchedulerConfig struct {
|
|||||||
// Setting: CERTCTL_RENEWAL_CONCURRENCY environment variable.
|
// Setting: CERTCTL_RENEWAL_CONCURRENCY environment variable.
|
||||||
RenewalConcurrency int
|
RenewalConcurrency int
|
||||||
|
|
||||||
|
// JobClaimLimit caps the number of Pending rows a single
|
||||||
|
// scheduler tick may claim via repository.JobRepository.ClaimPendingJobs.
|
||||||
|
// Default 1000.
|
||||||
|
//
|
||||||
|
// SCALE-001 closure (Sprint 2, 2026-05-16). Pre-fix the scheduler
|
||||||
|
// invoked ClaimPendingJobs with limit:0, which loads every Pending
|
||||||
|
// row in a single transaction. A 100K-job burst (cert-fleet sweep,
|
||||||
|
// post-outage recovery, etc.) would marshal the full queue into
|
||||||
|
// process memory before boundedFanOut's semaphore could back-
|
||||||
|
// pressure the upstream CAs. Capping the claim per tick keeps
|
||||||
|
// memory bounded; the next tick (JobProcessorInterval=30s default)
|
||||||
|
// picks up the rest.
|
||||||
|
//
|
||||||
|
// Operator-tune: bump for very-large-fleet deploys where 1000
|
||||||
|
// per 30s isn't enough throughput. Values ≤ 0 fall back to 1000
|
||||||
|
// rather than the legacy unlimited semantics — fail-safe.
|
||||||
|
//
|
||||||
|
// Setting: CERTCTL_SCHEDULER_JOB_CLAIM_LIMIT environment variable.
|
||||||
|
JobClaimLimit int
|
||||||
|
|
||||||
// AgentHealthCheckInterval is how often the scheduler checks agent heartbeats.
|
// AgentHealthCheckInterval is how often the scheduler checks agent heartbeats.
|
||||||
// Default: 2 minutes. Minimum: 1 second. Marks agents offline if no recent heartbeat.
|
// Default: 2 minutes. Minimum: 1 second. Marks agents offline if no recent heartbeat.
|
||||||
// Setting: CERTCTL_SCHEDULER_AGENT_HEALTH_CHECK_INTERVAL environment variable.
|
// Setting: CERTCTL_SCHEDULER_AGENT_HEALTH_CHECK_INTERVAL environment variable.
|
||||||
@@ -321,6 +341,57 @@ type RateLimitConfig struct {
|
|||||||
// zero, BurstSize is used. Default: 0 (use BurstSize).
|
// zero, BurstSize is used. Default: 0 (use BurstSize).
|
||||||
// Setting: CERTCTL_RATE_LIMIT_PER_USER_BURST environment variable.
|
// Setting: CERTCTL_RATE_LIMIT_PER_USER_BURST environment variable.
|
||||||
PerUserBurstSize int
|
PerUserBurstSize int
|
||||||
|
|
||||||
|
// BucketTTL bounds the unused-bucket lifetime in the token-bucket
|
||||||
|
// map. Idle buckets older than BucketTTL are reclaimed by a
|
||||||
|
// background sweeper running every (BucketTTL/4). Default 1 hour;
|
||||||
|
// values < 1 minute are clamped up to 1 minute in the limiter
|
||||||
|
// constructor. Set this lower if the server faces high-cardinality
|
||||||
|
// unauthenticated traffic (CGNAT churn, Tor exit lists, scanners)
|
||||||
|
// and the map RSS becomes a concern.
|
||||||
|
// SEC-006 closure (Sprint 2, 2026-05-16).
|
||||||
|
// Setting: CERTCTL_RATE_LIMIT_BUCKET_TTL environment variable.
|
||||||
|
BucketTTL time.Duration
|
||||||
|
|
||||||
|
// SlidingWindowBackend selects which backend implements the
|
||||||
|
// per-key sliding-window-log limiters wired in cmd/server/main.go
|
||||||
|
// (break-glass login, OCSP per-IP, cert-export per-actor, EST
|
||||||
|
// per-principal, EST failed-basic source-IP). Distinct from the
|
||||||
|
// token-bucket fields above — those are middleware RPS limits
|
||||||
|
// applied across every request via the http handler chain; this
|
||||||
|
// field controls the sliding-window-log primitive used by
|
||||||
|
// authenticated-but-shared-credential code paths.
|
||||||
|
//
|
||||||
|
// Valid values:
|
||||||
|
// "memory" — per-process, sync.Mutex-guarded map (historical
|
||||||
|
// default; perfect for single-replica deploys).
|
||||||
|
// "postgres" — cross-replica-consistent via the
|
||||||
|
// rate_limit_buckets table (migration 000046).
|
||||||
|
// SELECT FOR UPDATE arbitrates per-key access
|
||||||
|
// across the cluster. Adds ~2 DB round-trips per
|
||||||
|
// Allow call; acceptable on the gated hot path.
|
||||||
|
//
|
||||||
|
// Default: "memory". HA deploys with server.replicas > 1 should
|
||||||
|
// flip to "postgres" so a 2-replica deployment doesn't effectively
|
||||||
|
// double the per-key cap.
|
||||||
|
//
|
||||||
|
// Phase 13 Sprint 13.2/13.3 closure (architecture diligence audit
|
||||||
|
// ARCH-M1). See docs/operator/observability.md.
|
||||||
|
//
|
||||||
|
// Setting: CERTCTL_RATE_LIMIT_BACKEND environment variable.
|
||||||
|
SlidingWindowBackend string
|
||||||
|
|
||||||
|
// SlidingWindowJanitorInterval is how often the scheduler sweeps
|
||||||
|
// stale rows from rate_limit_buckets. A row is stale when its
|
||||||
|
// updated_at is older than the longest configured window any
|
||||||
|
// caller uses (currently 24h for the EST per-principal limiter).
|
||||||
|
// Default: 5 minutes. Minimum: 1 minute. No-op when
|
||||||
|
// SlidingWindowBackend = "memory" (the in-memory backend's
|
||||||
|
// prune-on-Allow path keeps buckets short-lived without a
|
||||||
|
// separate sweep).
|
||||||
|
//
|
||||||
|
// Setting: CERTCTL_RATE_LIMIT_JANITOR_INTERVAL environment variable.
|
||||||
|
SlidingWindowJanitorInterval time.Duration
|
||||||
}
|
}
|
||||||
|
|
||||||
// CORSConfig contains CORS configuration.
|
// CORSConfig contains CORS configuration.
|
||||||
|
|||||||
@@ -0,0 +1,106 @@
|
|||||||
|
// Copyright 2026 certctl LLC. All rights reserved.
|
||||||
|
// SPDX-License-Identifier: BUSL-1.1
|
||||||
|
|
||||||
|
package webhook
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"crypto/rand"
|
||||||
|
"encoding/hex"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/certctl-io/certctl/internal/connector/notifier"
|
||||||
|
)
|
||||||
|
|
||||||
|
// NotifierAdapter bridges the rich notifier.Connector interface
|
||||||
|
// (SendAlert / SendEvent / ValidateConfig) to the simpler service-
|
||||||
|
// layer service.Notifier interface (Send + Channel) used by the
|
||||||
|
// notification service for per-recipient expiry alerts + threshold
|
||||||
|
// notifications.
|
||||||
|
//
|
||||||
|
// Acquisition-audit DOC-001 closure (Sprint 7 ACQ, 2026-05-16).
|
||||||
|
// Pre-Sprint-7 the webhook notifier was a complete impl with full
|
||||||
|
// SSRF guard + HMAC-SHA256 signing + tests, but it was never wired
|
||||||
|
// in cmd/server/main.go — README claimed "6 notifiers" while only 5
|
||||||
|
// were actually registered. This adapter closes the wire gap so the
|
||||||
|
// "6 notifiers" claim is accurate. Mirrors the
|
||||||
|
// notifyemail.NotifierAdapter pattern.
|
||||||
|
//
|
||||||
|
// Method semantics:
|
||||||
|
//
|
||||||
|
// Send(ctx, recipient, subject, body) — constructs a
|
||||||
|
// notifier.Event with the three fields populated + a fresh
|
||||||
|
// random ID + the current UTC timestamp, then delegates to
|
||||||
|
// the underlying Connector's SendEvent. The webhook payload
|
||||||
|
// the recipient sees is the canonical {id, type, recipient,
|
||||||
|
// subject, body, metadata, created_at} JSON shape — same
|
||||||
|
// shape ValidateConfig probes for.
|
||||||
|
//
|
||||||
|
// Channel() — returns "webhook" so the notification service's
|
||||||
|
// per-channel routing matches the operator's
|
||||||
|
// CERTCTL_WEBHOOK_URL configuration.
|
||||||
|
//
|
||||||
|
// The Connector's per-request HMAC-SHA256 signing + SafeHTTPDialContext
|
||||||
|
// SSRF guard apply transitively — every Send call routes through
|
||||||
|
// SendEvent which routes through postWebhook which applies both
|
||||||
|
// defenses. No defense duplication is needed at the adapter layer.
|
||||||
|
type NotifierAdapter struct {
|
||||||
|
c *Connector
|
||||||
|
}
|
||||||
|
|
||||||
|
// NewNotifierAdapter wraps a fully-configured webhook Connector for
|
||||||
|
// use as a service.Notifier. The Connector MUST be constructed via
|
||||||
|
// webhook.New (production) — newForTest is rejected by Go's package
|
||||||
|
// visibility from outside the webhook package, so production callers
|
||||||
|
// cannot accidentally adapt a permissive-validator connector.
|
||||||
|
func NewNotifierAdapter(c *Connector) *NotifierAdapter {
|
||||||
|
return &NotifierAdapter{c: c}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Channel returns the channel identifier used by the notification
|
||||||
|
// service's per-channel routing map.
|
||||||
|
func (a *NotifierAdapter) Channel() string {
|
||||||
|
return "webhook"
|
||||||
|
}
|
||||||
|
|
||||||
|
// Send delivers a notification by translating the service-layer
|
||||||
|
// {recipient, subject, body} tuple into a notifier.Event and
|
||||||
|
// delegating to the underlying Connector's SendEvent. The Event
|
||||||
|
// carries a fresh 16-hex random ID (NOT a UUID — no extra dep
|
||||||
|
// needed; 128 bits of entropy is enough for de-dup at the receiver
|
||||||
|
// without colliding) and the current UTC time.
|
||||||
|
//
|
||||||
|
// The webhook recipient sees a JSON body like:
|
||||||
|
//
|
||||||
|
// {
|
||||||
|
// "id": "...",
|
||||||
|
// "type": "notification",
|
||||||
|
// "recipient": "<recipient>",
|
||||||
|
// "subject": "<subject>",
|
||||||
|
// "body": "<body>",
|
||||||
|
// "created_at": "<RFC3339>"
|
||||||
|
// }
|
||||||
|
//
|
||||||
|
// signed with HMAC-SHA256 in the X-Webhook-Signature header (when
|
||||||
|
// CERTCTL_WEBHOOK_SECRET is set).
|
||||||
|
func (a *NotifierAdapter) Send(ctx context.Context, recipient string, subject string, body string) error {
|
||||||
|
event := notifier.Event{
|
||||||
|
ID: adapterEventID(),
|
||||||
|
Type: "notification",
|
||||||
|
Recipient: recipient,
|
||||||
|
Subject: subject,
|
||||||
|
Body: body,
|
||||||
|
CreatedAt: time.Now().UTC(),
|
||||||
|
}
|
||||||
|
return a.c.SendEvent(ctx, event)
|
||||||
|
}
|
||||||
|
|
||||||
|
// adapterEventID returns a 32-character lowercase hex string built
// from 16 bytes read from crypto/rand. 128 bits is enough for
// de-duplication at the webhook recipient without adding a UUID
// dependency.
//
// The error from rand.Read is deliberately discarded; if the read
// were to fail, the ID would be derived from whatever the buffer
// holds (all zeros for an untouched buffer).
// NOTE(review): crypto/rand.Read is documented as never returning an
// error on Go 1.24+ — confirm the module's minimum Go version.
func adapterEventID() string {
	raw := make([]byte, 16)
	_, _ = rand.Read(raw)
	return hex.EncodeToString(raw)
}
|
||||||
@@ -13,6 +13,7 @@ import (
|
|||||||
"encoding/json"
|
"encoding/json"
|
||||||
"fmt"
|
"fmt"
|
||||||
"log/slog"
|
"log/slog"
|
||||||
|
"os"
|
||||||
"regexp"
|
"regexp"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
@@ -81,13 +82,37 @@ var (
|
|||||||
)
|
)
|
||||||
|
|
||||||
// New creates a new Kubernetes Secrets target connector.
|
// New creates a new Kubernetes Secrets target connector.
|
||||||
// For now, returns a stub error since we're not pulling in k8s.io dependencies.
|
//
|
||||||
// The real implementation will use k8s.io/client-go to create a real K8s client.
|
// SEC-003-K8S closure (Sprint 4, 2026-05-16). The production
|
||||||
|
// k8s.io/client-go integration is not yet wired — realK8sClient's
|
||||||
|
// CRUD methods at the bottom of this file are stubs that return
|
||||||
|
// "real Kubernetes client not implemented." Pre-fix, New() would
|
||||||
|
// happily return a working-looking Connector wrapping the stub
|
||||||
|
// client; the operator would only see the failure when an actual
|
||||||
|
// deploy fired against a registered target. Now New() refuses to
|
||||||
|
// construct the connector unless CERTCTL_K8SSECRET_PREVIEW_ACK=true
|
||||||
|
// is set, mirroring the SEC-H3 demo-mode ACK pattern. Tests that
|
||||||
|
// need a working connector (with the in-memory mock client) call
|
||||||
|
// NewWithClient — that path is unchanged.
|
||||||
|
//
|
||||||
|
// README qualifies the connector as preview at line 67; the
|
||||||
|
// runtime guard here closes the gap where an operator could
|
||||||
|
// register a k8ssecret target through the GUI / API and silently
|
||||||
|
// land a non-functional deployment path in their fleet.
|
||||||
func New(cfg *Config, logger *slog.Logger) (*Connector, error) {
|
func New(cfg *Config, logger *slog.Logger) (*Connector, error) {
|
||||||
if cfg == nil {
|
if cfg == nil {
|
||||||
return nil, fmt.Errorf("Kubernetes config is required")
|
return nil, fmt.Errorf("Kubernetes config is required")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if os.Getenv("CERTCTL_K8SSECRET_PREVIEW_ACK") != "true" {
|
||||||
|
return nil, fmt.Errorf(
|
||||||
|
"k8ssecret connector is preview-only — the production client-go integration ships in a future bundle. " +
|
||||||
|
"To register a k8ssecret target on this build, set CERTCTL_K8SSECRET_PREVIEW_ACK=true on the server " +
|
||||||
|
"AND understand that the connector's CRUD calls will return \"real Kubernetes client not implemented\" " +
|
||||||
|
"until the integration lands. See README.md `Deploy automatically` line and " +
|
||||||
|
"docs/reference/deployment-model.md for the per-target guarantee matrix")
|
||||||
|
}
|
||||||
|
|
||||||
// Stub real K8s client — the actual implementation will use k8s.io/client-go
|
// Stub real K8s client — the actual implementation will use k8s.io/client-go
|
||||||
// For now, return error to guide users to use the agent with proper kubeconfig
|
// For now, return error to guide users to use the agent with proper kubeconfig
|
||||||
client := &realK8sClient{
|
client := &realK8sClient{
|
||||||
|
|||||||
@@ -644,3 +644,49 @@ func contains(s, substr string) bool {
|
|||||||
}
|
}
|
||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// =============================================================================
|
||||||
|
// SEC-003-K8S closure (Sprint 4, 2026-05-16). The production realK8sClient's
|
||||||
|
// CRUD methods are stubs that return "real Kubernetes client not implemented."
|
||||||
|
// Pre-fix, New() returned a working-looking Connector wrapping the stub; the
|
||||||
|
// operator only saw the failure when a deploy actually fired. Now New()
|
||||||
|
// refuses to construct unless CERTCTL_K8SSECRET_PREVIEW_ACK=true is set,
|
||||||
|
// surfacing the preview-only state at registration time.
|
||||||
|
//
|
||||||
|
// The NewWithClient path used by tests in this package stays unchanged —
|
||||||
|
// it injects a mock client and doesn't gate on the env var.
|
||||||
|
// =============================================================================
|
||||||
|
|
||||||
|
func TestNew_RequiresPreviewACK(t *testing.T) {
|
||||||
|
t.Setenv("CERTCTL_K8SSECRET_PREVIEW_ACK", "")
|
||||||
|
cfg := &Config{Namespace: "default", SecretName: "tls-cert"}
|
||||||
|
conn, err := New(cfg, nil)
|
||||||
|
if err == nil {
|
||||||
|
t.Fatalf("New() without ACK returned (conn=%v, err=nil); want preview-ACK rejection", conn)
|
||||||
|
}
|
||||||
|
if conn != nil {
|
||||||
|
t.Errorf("New() returned non-nil conn on rejection: %v", conn)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestNew_AcceptsWithPreviewACK(t *testing.T) {
|
||||||
|
t.Setenv("CERTCTL_K8SSECRET_PREVIEW_ACK", "true")
|
||||||
|
cfg := &Config{Namespace: "default", SecretName: "tls-cert"}
|
||||||
|
conn, err := New(cfg, nil)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("New() with ACK = %v; want nil error", err)
|
||||||
|
}
|
||||||
|
if conn == nil {
|
||||||
|
t.Fatalf("New() with ACK returned nil connector")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestNew_RejectsNilConfigBeforeACKCheck(t *testing.T) {
|
||||||
|
// Defense-in-depth: the existing nil-config rejection still
|
||||||
|
// fires regardless of the ACK env, so an operator who flipped
|
||||||
|
// the ACK still can't construct with a missing config.
|
||||||
|
t.Setenv("CERTCTL_K8SSECRET_PREVIEW_ACK", "true")
|
||||||
|
if _, err := New(nil, nil); err == nil {
|
||||||
|
t.Fatalf("New(nil, ...) returned nil; want rejection of nil config")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -172,13 +172,20 @@ func (d *FileDriver) Load(ctx context.Context, path string) (Signer, error) {
|
|||||||
return nil, fmt.Errorf("signer.FileDriver.Load: %w", err)
|
return nil, fmt.Errorf("signer.FileDriver.Load: %w", err)
|
||||||
}
|
}
|
||||||
// CWE-22 path-traversal defense — reject paths that escape SafeRoot
|
// CWE-22 path-traversal defense — reject paths that escape SafeRoot
|
||||||
// (when set) OR contain literal ".." segments. The validator is in
|
// (when set) OR contain literal ".." segments. validateSafePath
|
||||||
// the same function as the os.ReadFile sink so CodeQL recognizes
|
// does the structured rejection; the inline assertion below
|
||||||
// the sanitizer in-scope.
|
// re-applies the canonical filepath.Rel + ".." rejection AT THE
|
||||||
|
// SINK so CodeQL's go/path-injection data-flow analyzer sees the
|
||||||
|
// sanitizer in-function (it doesn't reliably trace through
|
||||||
|
// function-call boundaries — Phase 6 commit 586308e shipped only
|
||||||
|
// validateSafePath and CodeQL alert #29 stayed open). Hotfix #13.
|
||||||
safePath, err := d.validateSafePath(path)
|
safePath, err := d.validateSafePath(path)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("signer.FileDriver.Load: %w", err)
|
return nil, fmt.Errorf("signer.FileDriver.Load: %w", err)
|
||||||
}
|
}
|
||||||
|
if err := assertCleanAbsPath(safePath, d.SafeRoot); err != nil {
|
||||||
|
return nil, fmt.Errorf("signer.FileDriver.Load: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
pemBytes, err := os.ReadFile(safePath)
|
pemBytes, err := os.ReadFile(safePath)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -229,13 +236,20 @@ func (d *FileDriver) Generate(ctx context.Context, alg Algorithm) (Signer, strin
|
|||||||
}
|
}
|
||||||
|
|
||||||
// CWE-22 path-traversal defense — reject paths that escape SafeRoot
|
// CWE-22 path-traversal defense — reject paths that escape SafeRoot
|
||||||
// (when set) OR contain literal ".." segments. The validator is in
|
// (when set) OR contain literal ".." segments. validateSafePath
|
||||||
// the same function as the os.WriteFile sink below so CodeQL
|
// does the structured rejection; the inline assertion below
|
||||||
// recognizes the sanitizer in-scope.
|
// re-applies the canonical filepath.Rel + ".." rejection AT THE
|
||||||
|
// SINK so CodeQL's go/path-injection data-flow analyzer sees the
|
||||||
|
// sanitizer in-function (it doesn't reliably trace through
|
||||||
|
// function-call boundaries — Phase 6 commit 586308e shipped only
|
||||||
|
// validateSafePath and CodeQL alert #29 stayed open). Hotfix #13.
|
||||||
safeOut, err := d.validateSafePath(outPath)
|
safeOut, err := d.validateSafePath(outPath)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, "", fmt.Errorf("signer.FileDriver.Generate: %w", err)
|
return nil, "", fmt.Errorf("signer.FileDriver.Generate: %w", err)
|
||||||
}
|
}
|
||||||
|
if err := assertCleanAbsPath(safeOut, d.SafeRoot); err != nil {
|
||||||
|
return nil, "", fmt.Errorf("signer.FileDriver.Generate: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
// Harden the destination directory BEFORE generating the key. If
|
// Harden the destination directory BEFORE generating the key. If
|
||||||
// the directory check fails we bail without touching cryptography.
|
// the directory check fails we bail without touching cryptography.
|
||||||
@@ -306,6 +320,67 @@ func (d *FileDriver) Generate(ctx context.Context, alg Algorithm) (Signer, strin
|
|||||||
return wrapped, safeOut, nil
|
return wrapped, safeOut, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// assertCleanAbsPath re-checks the CWE-22 path-injection invariants
// on a path that is about to be handed to a filesystem sink
// (os.ReadFile / os.WriteFile). It is called from the sink's own
// function, after validateSafePath, so that CodeQL's
// go/path-injection analysis sees a sanitizer in the same function
// scope as the sink.
//
// It returns nil only when all of the following hold:
//
//  1. path is non-empty.
//  2. path is absolute (validateSafePath is expected to have resolved
//     it; a relative path here means something upstream broke the
//     contract).
//  3. path equals filepath.Clean(path) — no trailing or doubled
//     separators, no redundant "./".
//  4. No slash-normalized segment of path is the literal "..".
//  5. If safeRoot is non-empty, filepath.Rel(abs(safeRoot), path) is
//     neither ".." nor prefixed with "../" (using the OS separator),
//     i.e. path is at or below safeRoot.
//
// Any violated invariant yields a descriptive error. Failures from
// filepath.Abs / filepath.Rel are wrapped with %w.
func assertCleanAbsPath(path, safeRoot string) error {
	// Shape checks on the path itself, in order of increasing cost.
	switch {
	case path == "":
		return errors.New("sink path is empty")
	case !filepath.IsAbs(path):
		return fmt.Errorf("sink path %q is not absolute", path)
	case filepath.Clean(path) != path:
		return fmt.Errorf("sink path %q is not Clean'd", path)
	}

	// Reject any literal parent-directory segment.
	segments := strings.Split(filepath.ToSlash(path), "/")
	for i := range segments {
		if segments[i] == ".." {
			return fmt.Errorf("sink path %q contains parent-directory segment", path)
		}
	}

	// Without a SafeRoot there is no containment to enforce.
	if safeRoot == "" {
		return nil
	}

	rootAbs, err := filepath.Abs(filepath.Clean(safeRoot))
	if err != nil {
		return fmt.Errorf("resolve SafeRoot %q: %w", safeRoot, err)
	}
	rel, err := filepath.Rel(rootAbs, path)
	if err != nil {
		return fmt.Errorf("sink path %q vs SafeRoot %q: %w", path, safeRoot, err)
	}

	// filepath.Rel yields ".." or "../..." when path lies outside
	// rootAbs; "." or any other relative suffix is in-bounds.
	escapes := rel == ".." || strings.HasPrefix(rel, ".."+string(filepath.Separator))
	if escapes {
		return fmt.Errorf("sink path %q resolves outside SafeRoot %q", path, safeRoot)
	}
	return nil
}
|
||||||
|
|
||||||
func rsaBitsFor(a Algorithm) int {
|
func rsaBitsFor(a Algorithm) int {
|
||||||
switch a {
|
switch a {
|
||||||
case AlgorithmRSA3072:
|
case AlgorithmRSA3072:
|
||||||
|
|||||||
@@ -9,7 +9,6 @@ import (
|
|||||||
"os"
|
"os"
|
||||||
"os/user"
|
"os/user"
|
||||||
"strconv"
|
"strconv"
|
||||||
"syscall"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
// runningAsRoot reports whether the current process has uid 0.
|
// runningAsRoot reports whether the current process has uid 0.
|
||||||
@@ -198,12 +197,13 @@ func lookupGID(groupname string) (int, error) {
|
|||||||
// unixOwnerFromStat extracts (uid, gid) from a Unix-style FileInfo.
|
// unixOwnerFromStat extracts (uid, gid) from a Unix-style FileInfo.
|
||||||
// On non-Unix platforms or when the underlying stat doesn't expose
|
// On non-Unix platforms or when the underlying stat doesn't expose
|
||||||
// uid/gid, returns ok=false.
|
// uid/gid, returns ok=false.
|
||||||
func unixOwnerFromStat(fi os.FileInfo) (uid int, gid int, ok bool) {
|
//
|
||||||
if fi == nil {
|
// Platform-specific implementations live in:
|
||||||
return -1, -1, false
|
// - ownership_unix.go (//go:build unix — uses *syscall.Stat_t)
|
||||||
}
|
// - ownership_windows.go (//go:build windows — stub returns false)
|
||||||
if sysStat, isUnix := fi.Sys().(*syscall.Stat_t); isUnix {
|
//
|
||||||
return int(sysStat.Uid), int(sysStat.Gid), true
|
// The split exists because syscall.Stat_t is Unix-only — Windows
|
||||||
}
|
// has no equivalent shape, so any production file that names it
|
||||||
return -1, -1, false
|
// fails to compile on GOOS=windows. The cross-platform-build CI
|
||||||
}
|
// matrix caught this at Hotfix #16; the function was originally
|
||||||
|
// in this file pre-split.
|
||||||
|
|||||||
@@ -0,0 +1,33 @@
|
|||||||
|
// Copyright 2026 certctl LLC. All rights reserved.
|
||||||
|
// SPDX-License-Identifier: BUSL-1.1
|
||||||
|
|
||||||
|
//go:build unix
|
||||||
|
|
||||||
|
// Unix-side implementation of unixOwnerFromStat. The `unix` build
|
||||||
|
// constraint (Go 1.19+) covers linux / darwin / freebsd / openbsd /
|
||||||
|
// netbsd / dragonfly / solaris — every GOOS where *syscall.Stat_t
|
||||||
|
// is a valid type assertion target for os.FileInfo.Sys().
|
||||||
|
//
|
||||||
|
// Hotfix #16 (2026-05-14): pre-split, this function lived inline in
|
||||||
|
// ownership.go with an unconditional `syscall.Stat_t` reference. That
|
||||||
|
// failed `GOOS=windows go build` because the type is undefined on
|
||||||
|
// that platform. The split is the standard Go pattern — the same
|
||||||
|
// function name + signature is satisfied by either build of the
|
||||||
|
// package, callers don't know or care which.
|
||||||
|
|
||||||
|
package deploy
|
||||||
|
|
||||||
|
import (
|
||||||
|
"os"
|
||||||
|
"syscall"
|
||||||
|
)
|
||||||
|
|
||||||
|
// unixOwnerFromStat extracts the owning (uid, gid) from fi.
//
// It returns ok=true only when fi is non-nil and fi.Sys() holds a
// *syscall.Stat_t; in every other case it returns (-1, -1, false).
func unixOwnerFromStat(fi os.FileInfo) (uid int, gid int, ok bool) {
	if fi == nil {
		return -1, -1, false
	}
	st, isUnix := fi.Sys().(*syscall.Stat_t)
	if !isUnix {
		return -1, -1, false
	}
	return int(st.Uid), int(st.Gid), true
}
|
||||||
@@ -0,0 +1,35 @@
|
|||||||
|
// Copyright 2026 certctl LLC. All rights reserved.
|
||||||
|
// SPDX-License-Identifier: BUSL-1.1
|
||||||
|
|
||||||
|
//go:build windows
|
||||||
|
|
||||||
|
// Windows stub for unixOwnerFromStat. Windows has no uid/gid concept
|
||||||
|
// the way Unix does — file ownership is expressed via SIDs (Security
|
||||||
|
// Identifiers) and ACLs (Access Control Lists), and os.FileInfo.Sys()
|
||||||
|
// returns *syscall.Win32FileAttributeData which carries no
|
||||||
|
// ownership data the deploy package's existing call sites can use.
|
||||||
|
//
|
||||||
|
// All four callers — applyOwnership at ownership.go:75,
|
||||||
|
// preserveSourceOwner at atomic.go:237, and two test sites — already
|
||||||
|
// handle the ok=false return path by falling back to Plan.Defaults
|
||||||
|
// or the runtime's umask. Returning false here is the correct
|
||||||
|
// platform contract: "no native ownership available on this
|
||||||
|
// platform; use the supplied defaults."
|
||||||
|
//
|
||||||
|
// Hotfix #16 (2026-05-14): created to unblock the
|
||||||
|
// cross-platform-build Windows matrix in CI, which had been
|
||||||
|
// red since the agent's deploy package gained ownership-
|
||||||
|
// preservation semantics. The agent binary still compiles for
|
||||||
|
// Windows; ownership operations on Windows are no-ops (which
|
||||||
|
// matches operator expectations — the certctl-agent's
|
||||||
|
// chown/chmod codepaths gate on `runningAsRoot()` and Windows
|
||||||
|
// runs the agent as a service under a SID that doesn't
|
||||||
|
// translate to a uid anyway).
|
||||||
|
|
||||||
|
package deploy
|
||||||
|
|
||||||
|
import "os"
|
||||||
|
|
||||||
|
// unixOwnerFromStat is the Windows stub: it ignores its argument and
// always reports that no uid/gid is available, returning
// (-1, -1, false).
func unixOwnerFromStat(_ os.FileInfo) (uid int, gid int, ok bool) {
	uid, gid, ok = -1, -1, false
	return uid, gid, ok
}
|
||||||
@@ -825,6 +825,13 @@ func (m *mockAuditRepository) List(ctx context.Context, filter *repository.Audit
|
|||||||
return m.events, nil
|
return m.events, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// VerifyHashChain is the Sprint 6 COMP-001-HASH interface addition.
|
||||||
|
// In-memory mock: report "clean walk over N events"; real chain
|
||||||
|
// semantics are pinned by internal/repository/postgres/audit_chain_test.go.
|
||||||
|
func (m *mockAuditRepository) VerifyHashChain(ctx context.Context) (string, int, int, error) {
|
||||||
|
return "", -1, len(m.events), nil
|
||||||
|
}
|
||||||
|
|
||||||
type mockAgentRepository struct {
|
type mockAgentRepository struct {
|
||||||
agents map[string]*domain.Agent
|
agents map[string]*domain.Agent
|
||||||
}
|
}
|
||||||
@@ -961,6 +968,25 @@ func (m *mockTargetRepository) List(ctx context.Context) ([]*domain.DeploymentTa
|
|||||||
return targets, nil
|
return targets, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ListPaginated mirrors the SQL-side window. SCALE-002 closure (Sprint 2).
|
||||||
|
func (m *mockTargetRepository) ListPaginated(ctx context.Context, limit, offset int) ([]*domain.DeploymentTarget, int64, error) {
|
||||||
|
all, _ := m.List(ctx)
|
||||||
|
if offset < 0 {
|
||||||
|
offset = 0
|
||||||
|
}
|
||||||
|
if offset >= len(all) {
|
||||||
|
return nil, int64(len(all)), nil
|
||||||
|
}
|
||||||
|
if limit <= 0 {
|
||||||
|
return all[offset:], int64(len(all)), nil
|
||||||
|
}
|
||||||
|
end := offset + limit
|
||||||
|
if end > len(all) {
|
||||||
|
end = len(all)
|
||||||
|
}
|
||||||
|
return all[offset:end], int64(len(all)), nil
|
||||||
|
}
|
||||||
|
|
||||||
func (m *mockTargetRepository) Get(ctx context.Context, id string) (*domain.DeploymentTarget, error) {
|
func (m *mockTargetRepository) Get(ctx context.Context, id string) (*domain.DeploymentTarget, error) {
|
||||||
target, ok := m.targets[id]
|
target, ok := m.targets[id]
|
||||||
if !ok {
|
if !ok {
|
||||||
@@ -1233,6 +1259,25 @@ func (m *mockIssuerRepository) List(ctx context.Context) ([]*domain.Issuer, erro
|
|||||||
return issuers, nil
|
return issuers, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ListPaginated mirrors the SQL-side window. SCALE-002 closure (Sprint 2).
|
||||||
|
func (m *mockIssuerRepository) ListPaginated(ctx context.Context, limit, offset int) ([]*domain.Issuer, int64, error) {
|
||||||
|
all, _ := m.List(ctx)
|
||||||
|
if offset < 0 {
|
||||||
|
offset = 0
|
||||||
|
}
|
||||||
|
if offset >= len(all) {
|
||||||
|
return nil, int64(len(all)), nil
|
||||||
|
}
|
||||||
|
if limit <= 0 {
|
||||||
|
return all[offset:], int64(len(all)), nil
|
||||||
|
}
|
||||||
|
end := offset + limit
|
||||||
|
if end > len(all) {
|
||||||
|
end = len(all)
|
||||||
|
}
|
||||||
|
return all[offset:end], int64(len(all)), nil
|
||||||
|
}
|
||||||
|
|
||||||
func (m *mockIssuerRepository) Get(ctx context.Context, id string) (*domain.Issuer, error) {
|
func (m *mockIssuerRepository) Get(ctx context.Context, id string) (*domain.Issuer, error) {
|
||||||
issuer, ok := m.issuers[id]
|
issuer, ok := m.issuers[id]
|
||||||
if !ok {
|
if !ok {
|
||||||
|
|||||||
@@ -0,0 +1,195 @@
|
|||||||
|
// Copyright 2026 certctl LLC. All rights reserved.
|
||||||
|
// SPDX-License-Identifier: BUSL-1.1
|
||||||
|
|
||||||
|
//go:build integration
|
||||||
|
|
||||||
|
package integration
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"database/sql"
|
||||||
|
"errors"
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"runtime"
|
||||||
|
"sync"
|
||||||
|
"sync/atomic"
|
||||||
|
"testing"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
_ "github.com/lib/pq"
|
||||||
|
"github.com/testcontainers/testcontainers-go"
|
||||||
|
"github.com/testcontainers/testcontainers-go/wait"
|
||||||
|
|
||||||
|
"github.com/certctl-io/certctl/internal/ratelimit"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Phase 13 Sprint 13.2 closure (2026-05-14, architecture diligence audit
|
||||||
|
// ARCH-M1) — the falsifiable closure proof for cross-replica rate-limit
|
||||||
|
// consistency.
|
||||||
|
//
|
||||||
|
// Scenario:
|
||||||
|
// - ONE postgres container (representing the shared backend).
|
||||||
|
// - N=3 independent *PostgresSlidingWindowLimiter instances pointing
|
||||||
|
// at it (representing 3 server replicas — each replica's process
|
||||||
|
// has its own constructed limiter, but they all share the same
|
||||||
|
// database state).
|
||||||
|
// - 100 concurrent Allow("test-key") calls spread across the 3
|
||||||
|
// limiters via sync.WaitGroup.
|
||||||
|
// - Assert: exactly 10 succeed + 90 return ErrRateLimited.
|
||||||
|
//
|
||||||
|
// If the postgres backend's SELECT FOR UPDATE serialization weren't
|
||||||
|
// arbitrating across the 3 limiters, more than 10 calls would be
|
||||||
|
// allowed (each replica would independently let through 10/3 ≈ 4
|
||||||
|
// requests, giving ~12-15 successes depending on scheduling). The
|
||||||
|
// hard-pass on exactly-10 is what makes ARCH-M1 closure substantive
|
||||||
|
// rather than wishful.
|
||||||
|
//
|
||||||
|
// Gated by //go:build integration matching the rest of
|
||||||
|
// internal/integration/. Sprint 13.3 promotes this test to a
|
||||||
|
// required CI status check.
|
||||||
|
|
||||||
|
func TestRateLimit_PostgresBackend_CapEnforcedAcrossReplicas(t *testing.T) {
|
||||||
|
const (
|
||||||
|
replicas = 3
|
||||||
|
cap = 10
|
||||||
|
window = 1 * time.Minute
|
||||||
|
concurrentReq = 100
|
||||||
|
key = "test-key"
|
||||||
|
)
|
||||||
|
|
||||||
|
ctx := context.Background()
|
||||||
|
|
||||||
|
// Boot a shared postgres container.
|
||||||
|
container, dsn := startPostgresContainer(ctx, t)
|
||||||
|
t.Cleanup(func() { _ = container.Terminate(context.Background()) })
|
||||||
|
|
||||||
|
// Each "replica" gets its own *sql.DB pool — same database, different
|
||||||
|
// connection pool — matching how N server processes would each open
|
||||||
|
// their own pool to the same control-plane database.
|
||||||
|
dbs := make([]*sql.DB, replicas)
|
||||||
|
for i := 0; i < replicas; i++ {
|
||||||
|
db, err := sql.Open("postgres", dsn)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("open db (replica %d): %v", i, err)
|
||||||
|
}
|
||||||
|
db.SetMaxOpenConns(8)
|
||||||
|
if err := db.Ping(); err != nil {
|
||||||
|
t.Fatalf("ping (replica %d): %v", i, err)
|
||||||
|
}
|
||||||
|
t.Cleanup(func() { db.Close() })
|
||||||
|
dbs[i] = db
|
||||||
|
}
|
||||||
|
|
||||||
|
// Apply the rate_limit_buckets migration via dbs[0]. All replicas
|
||||||
|
// see the same schema since they share the same database.
|
||||||
|
migPath := findMigrationFromHere("000046_rate_limit_buckets.up.sql")
|
||||||
|
body, err := os.ReadFile(migPath)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("read migration: %v", err)
|
||||||
|
}
|
||||||
|
if _, err := dbs[0].ExecContext(ctx, string(body)); err != nil {
|
||||||
|
t.Fatalf("apply migration: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Instantiate one limiter per replica.
|
||||||
|
limiters := make([]*ratelimit.PostgresSlidingWindowLimiter, replicas)
|
||||||
|
for i := 0; i < replicas; i++ {
|
||||||
|
limiters[i] = ratelimit.NewPostgresSlidingWindowLimiter(dbs[i], cap, window)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fire concurrentReq parallel Allow calls, round-robining across the
|
||||||
|
// replicas. Each call uses the SAME key + a SHARED `now` so the
|
||||||
|
// scenario is deterministic. The cross-replica row lock is what
|
||||||
|
// enforces the cap globally.
|
||||||
|
var (
|
||||||
|
allowed int64
|
||||||
|
denied int64
|
||||||
|
wg sync.WaitGroup
|
||||||
|
)
|
||||||
|
now := time.Now()
|
||||||
|
for i := 0; i < concurrentReq; i++ {
|
||||||
|
wg.Add(1)
|
||||||
|
go func(idx int) {
|
||||||
|
defer wg.Done()
|
||||||
|
l := limiters[idx%replicas]
|
||||||
|
err := l.Allow(key, now)
|
||||||
|
if err == nil {
|
||||||
|
atomic.AddInt64(&allowed, 1)
|
||||||
|
} else if errors.Is(err, ratelimit.ErrRateLimited) {
|
||||||
|
atomic.AddInt64(&denied, 1)
|
||||||
|
} else {
|
||||||
|
t.Errorf("unexpected error from Allow: %v", err)
|
||||||
|
}
|
||||||
|
}(i)
|
||||||
|
}
|
||||||
|
wg.Wait()
|
||||||
|
|
||||||
|
gotAllowed := atomic.LoadInt64(&allowed)
|
||||||
|
gotDenied := atomic.LoadInt64(&denied)
|
||||||
|
|
||||||
|
t.Logf("replicas=%d cap=%d concurrent=%d → allowed=%d denied=%d",
|
||||||
|
replicas, cap, concurrentReq, gotAllowed, gotDenied)
|
||||||
|
|
||||||
|
if gotAllowed != int64(cap) {
|
||||||
|
t.Errorf("allowed = %d, want exactly %d (cross-replica row lock should serialize Allow calls so exactly cap succeed)",
|
||||||
|
gotAllowed, cap)
|
||||||
|
}
|
||||||
|
if gotDenied != int64(concurrentReq-cap) {
|
||||||
|
t.Errorf("denied = %d, want %d (concurrentReq - cap)", gotDenied, concurrentReq-cap)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ----------------------------------------------------------------
|
||||||
|
// Local testcontainers harness. Kept in-file because the rest of
|
||||||
|
// internal/integration/ uses HTTP-against-running-server smoke tests
|
||||||
|
// against a docker-compose stack — different shape from ours.
|
||||||
|
// ----------------------------------------------------------------
|
||||||
|
|
||||||
|
func startPostgresContainer(ctx context.Context, t *testing.T) (testcontainers.Container, string) {
|
||||||
|
t.Helper()
|
||||||
|
|
||||||
|
req := testcontainers.ContainerRequest{
|
||||||
|
Image: "postgres:16-alpine",
|
||||||
|
ExposedPorts: []string{"5432/tcp"},
|
||||||
|
Env: map[string]string{
|
||||||
|
"POSTGRES_DB": "certctl_test",
|
||||||
|
"POSTGRES_USER": "certctl",
|
||||||
|
"POSTGRES_PASSWORD": "certctl",
|
||||||
|
},
|
||||||
|
WaitingFor: wait.ForLog("database system is ready to accept connections").WithOccurrence(2),
|
||||||
|
}
|
||||||
|
container, err := testcontainers.GenericContainer(ctx, testcontainers.GenericContainerRequest{
|
||||||
|
ContainerRequest: req,
|
||||||
|
Started: true,
|
||||||
|
})
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("start postgres container: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
host, err := container.Host(ctx)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("container host: %v", err)
|
||||||
|
}
|
||||||
|
port, err := container.MappedPort(ctx, "5432")
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("container port: %v", err)
|
||||||
|
}
|
||||||
|
dsn := fmt.Sprintf("postgres://certctl:certctl@%s:%s/certctl_test?sslmode=disable",
|
||||||
|
host, port.Port())
|
||||||
|
return container, dsn
|
||||||
|
}
|
||||||
|
|
||||||
|
// findMigrationFromHere looks for migrations/<filename> starting in the
// directory of this source file and then in each parent directory, checking
// at most six directory levels. It returns the first path that os.Stat
// accepts, or the empty string when none is found.
func findMigrationFromHere(filename string) string {
	const maxLevels = 6

	_, self, _, _ := runtime.Caller(0)
	base := filepath.Dir(self)
	for level := 0; level < maxLevels; level++ {
		path := filepath.Join(base, "migrations", filename)
		if _, statErr := os.Stat(path); statErr == nil {
			return path
		}
		base = filepath.Dir(base)
	}
	return ""
}
|
||||||
@@ -0,0 +1,150 @@
|
|||||||
|
// Copyright 2026 certctl LLC. All rights reserved.
|
||||||
|
// SPDX-License-Identifier: BUSL-1.1
|
||||||
|
|
||||||
|
// Package observability is the optional OpenTelemetry seed.
|
||||||
|
// Acquisition-audit DEPL-006 closure (Sprint 6 ACQ, 2026-05-16).
|
||||||
|
//
|
||||||
|
// What this package does
|
||||||
|
// ======================
|
||||||
|
//
|
||||||
|
// Init wires up an OTLP/HTTP tracer provider when
|
||||||
|
// CERTCTL_OTEL_ENABLED=true and registers it as the global
|
||||||
|
// otel.SetTracerProvider. The returned shutdown function MUST be
|
||||||
|
// deferred by the caller (typically cmd/server/main.go) so in-
|
||||||
|
// flight spans flush before process exit.
|
||||||
|
//
|
||||||
|
// When CERTCTL_OTEL_ENABLED is unset or false (the default), Init
|
||||||
|
// returns a no-op shutdown and does NOT register a tracer provider.
|
||||||
|
// The global otel.GetTracerProvider() therefore returns the SDK's
|
||||||
|
// noop provider; any spans created by future-instrumented code
|
||||||
|
// paths are silently discarded with no allocation cost. Zero
|
||||||
|
// behavior change for operators who don't opt in.
|
||||||
|
//
|
||||||
|
// What this package does NOT do
|
||||||
|
// =============================
|
||||||
|
//
|
||||||
|
// - No span instrumentation is added anywhere in the certctl code
|
||||||
|
// base by this commit. The DEPL-006 audit finding is closed by
|
||||||
|
// standing up the surface (initializer + config wiring + dep
|
||||||
|
// promotion); per-handler / per-query / per-connector spans are
|
||||||
|
// tracked as a v2.3 roadmap follow-up.
|
||||||
|
//
|
||||||
|
// - The hand-rolled Prometheus exposition handler at
|
||||||
|
// internal/api/handler/metrics.go::GetPrometheusMetrics is
|
||||||
|
// intentionally untouched. OTel is additive — operators with
|
||||||
|
// Prometheus continue to scrape the existing endpoint; operators
|
||||||
|
// with an OTel collector can opt in by setting CERTCTL_OTEL_ENABLED
|
||||||
|
// and OTEL_EXPORTER_OTLP_ENDPOINT.
|
||||||
|
//
|
||||||
|
// Transport choice
|
||||||
|
// ================
|
||||||
|
//
|
||||||
|
// The exporter uses OTLP/HTTP (proto-binary over HTTPS), not OTLP/gRPC.
|
||||||
|
// Both are valid OTel transports and downstream collectors accept
|
||||||
|
// either. OTLP/HTTP is chosen here to keep certctl's dependency
|
||||||
|
// surface narrow — gRPC pulls in google.golang.org/grpc +
|
||||||
|
// google.golang.org/genproto/* which materially expand the binary
|
||||||
|
// size and the supply-chain attack surface for a feature that today
|
||||||
|
// emits zero spans. Operators with a gRPC-only collector can wrap
|
||||||
|
// their collector with an OTel-collector tee or run the
|
||||||
|
// collector's OTLP/HTTP receiver alongside. If gRPC-direct
|
||||||
|
// becomes a real ask, swapping the exporter is a single-import
|
||||||
|
// change.
|
||||||
|
//
|
||||||
|
// Env vars
|
||||||
|
// ========
|
||||||
|
//
|
||||||
|
// CERTCTL_OTEL_ENABLED — gate (default false).
|
||||||
|
// OTEL_EXPORTER_OTLP_ENDPOINT — standard OTel env var; HTTP URL.
|
||||||
|
// Default (per OTel spec):
|
||||||
|
// http://localhost:4318.
|
||||||
|
// OTEL_EXPORTER_OTLP_HEADERS — standard OTel env var; auth
|
||||||
|
// header pairs for the collector.
|
||||||
|
// OTEL_SERVICE_NAME — overrides the default
|
||||||
|
// "certctl-server" resource label.
|
||||||
|
//
|
||||||
|
// All standard OTEL_* env vars the SDK consumes are honored
|
||||||
|
// automatically — this Init does not re-implement them.
|
||||||
|
package observability
|
||||||
|
|
||||||
|
import (
	"context"
	"errors"
	"fmt"

	"go.opentelemetry.io/otel"
	"go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp"
	"go.opentelemetry.io/otel/sdk/resource"
	sdktrace "go.opentelemetry.io/otel/sdk/trace"
	semconv "go.opentelemetry.io/otel/semconv/v1.27.0"
)
|
||||||
|
|
||||||
|
// Config carries the operator-facing settings for the OTel seed. It is
// populated at boot from internal/config/config.go::ObservabilityConfig.
//
// Enabled is the only field: service name, endpoint and headers are
// supplied through the standard OTEL_* environment variables, which Init
// hands to the OTel SDK (resource.WithFromEnv and otlptracehttp.New)
// rather than re-implementing them here.
type Config struct {
	// Enabled switches the subsystem on. When it is false, Init returns
	// a no-op shutdown and registers nothing. Set via
	// CERTCTL_OTEL_ENABLED.
	Enabled bool
}
|
||||||
|
|
||||||
|
// Init initializes OpenTelemetry tracing if cfg.Enabled is true.
|
||||||
|
//
|
||||||
|
// The returned shutdown function flushes the in-flight span batcher
|
||||||
|
// and tears the tracer provider down. The caller MUST defer it
|
||||||
|
// before process exit; without the shutdown, the last batch of
|
||||||
|
// spans is lost.
|
||||||
|
//
|
||||||
|
// When disabled, Init returns a no-op shutdown that always succeeds.
|
||||||
|
// Callers can therefore unconditionally defer the returned function
|
||||||
|
// without branching on cfg.Enabled.
|
||||||
|
//
|
||||||
|
// The OTLP HTTP client created here connects lazily — Init does
|
||||||
|
// NOT block on the collector being reachable. An unreachable
|
||||||
|
// collector surfaces as failed export attempts in the SDK's
|
||||||
|
// internal error log, NOT as a boot-time error. This is intentional:
|
||||||
|
// observability MUST NOT block process startup.
|
||||||
|
func Init(ctx context.Context, cfg Config) (shutdown func(context.Context) error, err error) {
|
||||||
|
if !cfg.Enabled {
|
||||||
|
return noopShutdown, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// resource.WithFromEnv picks up OTEL_RESOURCE_ATTRIBUTES and
|
||||||
|
// OTEL_SERVICE_NAME from the environment — operators override
|
||||||
|
// service.name without code changes. WithProcess adds process.*
|
||||||
|
// attributes (PID, runtime info). The default service.name
|
||||||
|
// "certctl-server" applies only when OTEL_SERVICE_NAME is unset.
|
||||||
|
res, err := resource.New(ctx,
|
||||||
|
resource.WithAttributes(semconv.ServiceName("certctl-server")),
|
||||||
|
resource.WithFromEnv(),
|
||||||
|
resource.WithProcess(),
|
||||||
|
)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("observability: resource.New: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// otlptracehttp.New honors the standard OTel env vars:
|
||||||
|
// OTEL_EXPORTER_OTLP_ENDPOINT, OTEL_EXPORTER_OTLP_HEADERS,
|
||||||
|
// OTEL_EXPORTER_OTLP_INSECURE, OTEL_EXPORTER_OTLP_TIMEOUT,
|
||||||
|
// OTEL_EXPORTER_OTLP_PROTOCOL. The HTTP client connects lazily;
|
||||||
|
// New returns nil error even if the collector is unreachable.
|
||||||
|
exporter, err := otlptracehttp.New(ctx)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("observability: otlptracehttp.New: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
tp := sdktrace.NewTracerProvider(
|
||||||
|
sdktrace.WithResource(res),
|
||||||
|
sdktrace.WithBatcher(exporter),
|
||||||
|
)
|
||||||
|
otel.SetTracerProvider(tp)
|
||||||
|
|
||||||
|
return tp.Shutdown, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// noopShutdown is what Init hands back when tracing is disabled. It
// ignores its context and always returns nil. It is declared once at
// package level so every disabled Init call returns the same function
// value.
var noopShutdown = func(_ context.Context) error {
	return nil
}
|
||||||
@@ -0,0 +1,110 @@
|
|||||||
|
// Copyright 2026 certctl LLC. All rights reserved.
|
||||||
|
// SPDX-License-Identifier: BUSL-1.1
|
||||||
|
|
||||||
|
package observability
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"testing"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"go.opentelemetry.io/otel"
|
||||||
|
sdktrace "go.opentelemetry.io/otel/sdk/trace"
|
||||||
|
)
|
||||||
|
|
||||||
|
// TestInit_Disabled_NoOp pins the disabled-mode contract: Init with
|
||||||
|
// Enabled=false returns a non-nil shutdown that succeeds and does
|
||||||
|
// NOT register a real tracer provider. Acquisition-audit DEPL-006
|
||||||
|
// closure (Sprint 6 ACQ, 2026-05-16).
|
||||||
|
func TestInit_Disabled_NoOp(t *testing.T) {
|
||||||
|
// Capture the global tracer provider before Init so we can assert
|
||||||
|
// it didn't change.
|
||||||
|
before := otel.GetTracerProvider()
|
||||||
|
|
||||||
|
shutdown, err := Init(context.Background(), Config{Enabled: false})
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("Init(Enabled=false) = %v; want nil", err)
|
||||||
|
}
|
||||||
|
if shutdown == nil {
|
||||||
|
t.Fatal("Init(Enabled=false) returned nil shutdown; want a no-op closure")
|
||||||
|
}
|
||||||
|
if got := otel.GetTracerProvider(); got != before {
|
||||||
|
t.Errorf("disabled Init mutated the global tracer provider; before=%T after=%T", before, got)
|
||||||
|
}
|
||||||
|
|
||||||
|
// shutdown must succeed cleanly (no panic, no error, no hang).
|
||||||
|
sctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
|
||||||
|
defer cancel()
|
||||||
|
if err := shutdown(sctx); err != nil {
|
||||||
|
t.Errorf("noop shutdown returned %v; want nil", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestInit_Enabled_RegistersTracerProvider pins the enabled-mode
|
||||||
|
// contract: Init with Enabled=true returns a real shutdown and
|
||||||
|
// installs an SDK-backed tracer provider as the otel global. The
|
||||||
|
// OTLP exporter connects lazily so this test does NOT require a
|
||||||
|
// reachable collector — Init returns nil error even when no
|
||||||
|
// collector is running, and the shutdown drains gracefully.
|
||||||
|
// Acquisition-audit DEPL-006 closure (Sprint 6 ACQ, 2026-05-16).
|
||||||
|
func TestInit_Enabled_RegistersTracerProvider(t *testing.T) {
|
||||||
|
// Point the exporter at a localhost dead-end so the test never
|
||||||
|
// flakes against a real collector. Insecure mode skips the TLS
|
||||||
|
// handshake — otherwise the gRPC client would block on TLS even
|
||||||
|
// for the lazy connect path.
|
||||||
|
t.Setenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://127.0.0.1:1") // unreachable port
|
||||||
|
t.Setenv("OTEL_EXPORTER_OTLP_INSECURE", "true")
|
||||||
|
|
||||||
|
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||||
|
defer cancel()
|
||||||
|
|
||||||
|
// Snapshot + restore the global tracer provider so this test
|
||||||
|
// doesn't leak into other tests' state.
|
||||||
|
before := otel.GetTracerProvider()
|
||||||
|
t.Cleanup(func() { otel.SetTracerProvider(before) })
|
||||||
|
|
||||||
|
shutdown, err := Init(ctx, Config{Enabled: true})
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("Init(Enabled=true) = %v; want nil", err)
|
||||||
|
}
|
||||||
|
defer func() {
|
||||||
|
sctx, scancel := context.WithTimeout(context.Background(), 2*time.Second)
|
||||||
|
defer scancel()
|
||||||
|
if err := shutdown(sctx); err != nil {
|
||||||
|
// Shutdown may fail if the lazy gRPC connect ultimately
|
||||||
|
// times out against the dead-end endpoint. That's a
|
||||||
|
// noisy-but-non-fatal outcome — the surface is wired
|
||||||
|
// correctly, only the destination is intentionally
|
||||||
|
// unreachable in this test.
|
||||||
|
t.Logf("shutdown returned %v (expected for unreachable endpoint)", err)
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
|
got := otel.GetTracerProvider()
|
||||||
|
if _, ok := got.(*sdktrace.TracerProvider); !ok {
|
||||||
|
t.Errorf("enabled Init did not install an SDK tracer provider; got %T", got)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestInit_Enabled_RespectsOTEL_SERVICE_NAME pins that the standard
|
||||||
|
// OTEL_SERVICE_NAME env var overrides the certctl-server default —
|
||||||
|
// flowing through resource.WithFromEnv. No certctl-specific
|
||||||
|
// CERTCTL_OTEL_SERVICE_NAME env var exists; the OTel SDK's
|
||||||
|
// existing env-var surface is the only override path.
|
||||||
|
func TestInit_Enabled_RespectsOTEL_SERVICE_NAME(t *testing.T) {
|
||||||
|
t.Setenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://127.0.0.1:1")
|
||||||
|
t.Setenv("OTEL_EXPORTER_OTLP_INSECURE", "true")
|
||||||
|
t.Setenv("OTEL_SERVICE_NAME", "certctl-override-test")
|
||||||
|
|
||||||
|
ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
|
||||||
|
defer cancel()
|
||||||
|
|
||||||
|
before := otel.GetTracerProvider()
|
||||||
|
t.Cleanup(func() { otel.SetTracerProvider(before) })
|
||||||
|
|
||||||
|
shutdown, err := Init(ctx, Config{Enabled: true})
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("Init = %v; want nil", err)
|
||||||
|
}
|
||||||
|
defer shutdown(context.Background())
|
||||||
|
}
|
||||||
@@ -0,0 +1,412 @@
|
|||||||
|
// Copyright 2026 certctl LLC. All rights reserved.
|
||||||
|
// SPDX-License-Identifier: BUSL-1.1
|
||||||
|
|
||||||
|
package ratelimit_test
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"database/sql"
|
||||||
|
"errors"
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"runtime"
|
||||||
|
"strings"
|
||||||
|
"sync"
|
||||||
|
"testing"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
_ "github.com/lib/pq"
|
||||||
|
"github.com/testcontainers/testcontainers-go"
|
||||||
|
"github.com/testcontainers/testcontainers-go/wait"
|
||||||
|
|
||||||
|
"github.com/certctl-io/certctl/internal/ratelimit"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Phase 13 Sprint 13.2 closure (2026-05-14, architecture diligence audit
|
||||||
|
// ARCH-M1): backend-equivalence test suite. Runs the same scenario
|
||||||
|
// surface against both backends (in-memory + postgres) via the shared
|
||||||
|
// Limiter interface — if the postgres backend's caller-visible
|
||||||
|
// semantics drift from the memory backend's, this file fails first.
|
||||||
|
//
|
||||||
|
// Mirrors the white-box test names in sliding_window_test.go: every
|
||||||
|
// public-surface behavior pinned there (cap, expiry, disabled bypass,
|
||||||
|
// empty-key short-circuit, concurrency) gets re-pinned here for the
|
||||||
|
// postgres backend.
|
||||||
|
//
|
||||||
|
// Postgres tests skip under -short (matches the pattern in
|
||||||
|
// internal/repository/postgres/testutil_test.go); CI's
|
||||||
|
// `go test -race -short -count=1 ./...` exercises only the memory
|
||||||
|
// half. The integration job runs the full suite.
|
||||||
|
|
||||||
|
// ----------------------------------------------------------------
|
||||||
|
// Backend-equivalence helpers
|
||||||
|
// ----------------------------------------------------------------
|
||||||
|
|
||||||
|
// limiterFactory builds a fresh Limiter for one test case.
|
||||||
|
// Memory backends discard `db`; postgres backends use it.
|
||||||
|
type limiterFactory func(t *testing.T, db *sql.DB, maxN int, window time.Duration) ratelimit.Limiter
|
||||||
|
|
||||||
|
func memoryFactory(t *testing.T, _ *sql.DB, maxN int, window time.Duration) ratelimit.Limiter {
|
||||||
|
t.Helper()
|
||||||
|
// Map cap of 10_000 — large enough that none of the equivalence
|
||||||
|
// scenarios trip the LRU-eviction branch (the eviction branch is
|
||||||
|
// memory-specific; postgres has no equivalent so it's not part of
|
||||||
|
// the cross-backend contract).
|
||||||
|
return ratelimit.NewSlidingWindowLimiter(maxN, window, 10_000)
|
||||||
|
}
|
||||||
|
|
||||||
|
func postgresFactory(t *testing.T, db *sql.DB, maxN int, window time.Duration) ratelimit.Limiter {
|
||||||
|
t.Helper()
|
||||||
|
if db == nil {
|
||||||
|
t.Fatal("postgresFactory requires a non-nil *sql.DB")
|
||||||
|
}
|
||||||
|
return ratelimit.NewPostgresSlidingWindowLimiter(db, maxN, window)
|
||||||
|
}
|
||||||
|
|
||||||
|
// ----------------------------------------------------------------
|
||||||
|
// Per-backend test entry points
|
||||||
|
// ----------------------------------------------------------------
|
||||||
|
|
||||||
|
func TestSlidingWindowLimiter_Equivalence_Memory(t *testing.T) {
|
||||||
|
t.Run("AllowsUpToCap", func(t *testing.T) { caseAllowsUpToCap(t, memoryFactory, nil) })
|
||||||
|
t.Run("DistinctKeysIndependent", func(t *testing.T) { caseDistinctKeysIndependent(t, memoryFactory, nil) })
|
||||||
|
t.Run("WindowExpiry", func(t *testing.T) { caseWindowExpiry(t, memoryFactory, nil) })
|
||||||
|
t.Run("DisabledBypass", func(t *testing.T) { caseDisabledBypass(t, memoryFactory, nil) })
|
||||||
|
t.Run("NegativeCapDisabled", func(t *testing.T) { caseNegativeCapDisabled(t, memoryFactory, nil) })
|
||||||
|
t.Run("EmptyKeyShortCircuits", func(t *testing.T) { caseEmptyKeyShortCircuits(t, memoryFactory, nil) })
|
||||||
|
t.Run("ConcurrentRaceFree", func(t *testing.T) {
|
||||||
|
if testing.Short() {
|
||||||
|
t.Skip("race-style test under -short")
|
||||||
|
}
|
||||||
|
caseConcurrentRaceFree(t, memoryFactory, nil)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestSlidingWindowLimiter_Equivalence_Postgres(t *testing.T) {
|
||||||
|
if testing.Short() {
|
||||||
|
t.Skip("postgres equivalence tests require testcontainers; skipped under -short")
|
||||||
|
}
|
||||||
|
tdb := setupTestDB(t)
|
||||||
|
defer tdb.teardown(t)
|
||||||
|
|
||||||
|
t.Run("AllowsUpToCap", func(t *testing.T) {
|
||||||
|
db := tdb.freshSchema(t, "AllowsUpToCap")
|
||||||
|
caseAllowsUpToCap(t, postgresFactory, db)
|
||||||
|
})
|
||||||
|
t.Run("DistinctKeysIndependent", func(t *testing.T) {
|
||||||
|
db := tdb.freshSchema(t, "DistinctKeysIndependent")
|
||||||
|
caseDistinctKeysIndependent(t, postgresFactory, db)
|
||||||
|
})
|
||||||
|
t.Run("WindowExpiry", func(t *testing.T) {
|
||||||
|
db := tdb.freshSchema(t, "WindowExpiry")
|
||||||
|
caseWindowExpiry(t, postgresFactory, db)
|
||||||
|
})
|
||||||
|
t.Run("DisabledBypass", func(t *testing.T) {
|
||||||
|
db := tdb.freshSchema(t, "DisabledBypass")
|
||||||
|
caseDisabledBypass(t, postgresFactory, db)
|
||||||
|
})
|
||||||
|
t.Run("NegativeCapDisabled", func(t *testing.T) {
|
||||||
|
db := tdb.freshSchema(t, "NegativeCapDisabled")
|
||||||
|
caseNegativeCapDisabled(t, postgresFactory, db)
|
||||||
|
})
|
||||||
|
t.Run("EmptyKeyShortCircuits", func(t *testing.T) {
|
||||||
|
db := tdb.freshSchema(t, "EmptyKeyShortCircuits")
|
||||||
|
caseEmptyKeyShortCircuits(t, postgresFactory, db)
|
||||||
|
})
|
||||||
|
t.Run("ConcurrentRaceFree", func(t *testing.T) {
|
||||||
|
db := tdb.freshSchema(t, "ConcurrentRaceFree")
|
||||||
|
caseConcurrentRaceFree(t, postgresFactory, db)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
// ----------------------------------------------------------------
|
||||||
|
// Backend-agnostic test cases (one per behavior pinned in
|
||||||
|
// sliding_window_test.go's public-surface tests)
|
||||||
|
// ----------------------------------------------------------------
|
||||||
|
|
||||||
|
func caseAllowsUpToCap(t *testing.T, mk limiterFactory, db *sql.DB) {
|
||||||
|
l := mk(t, db, 3, 24*time.Hour)
|
||||||
|
now := time.Now()
|
||||||
|
for i := 0; i < 3; i++ {
|
||||||
|
if err := l.Allow("k", now.Add(time.Duration(i)*time.Minute)); err != nil {
|
||||||
|
t.Fatalf("call %d should be allowed: %v", i+1, err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if err := l.Allow("k", now.Add(4*time.Minute)); !errors.Is(err, ratelimit.ErrRateLimited) {
|
||||||
|
t.Fatalf("4th call should be rate-limited; got %v", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func caseDistinctKeysIndependent(t *testing.T, mk limiterFactory, db *sql.DB) {
|
||||||
|
l := mk(t, db, 1, 24*time.Hour)
|
||||||
|
now := time.Now()
|
||||||
|
|
||||||
|
if err := l.Allow("k-1", now); err != nil {
|
||||||
|
t.Fatalf("first allow: %v", err)
|
||||||
|
}
|
||||||
|
if err := l.Allow("k-2", now); err != nil {
|
||||||
|
t.Fatalf("different key must have its own bucket: %v", err)
|
||||||
|
}
|
||||||
|
if err := l.Allow("k-1", now.Add(1*time.Second)); !errors.Is(err, ratelimit.ErrRateLimited) {
|
||||||
|
t.Fatalf("repeat key should be limited; got %v", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func caseWindowExpiry(t *testing.T, mk limiterFactory, db *sql.DB) {
|
||||||
|
l := mk(t, db, 2, 1*time.Hour)
|
||||||
|
now := time.Now()
|
||||||
|
|
||||||
|
if err := l.Allow("k", now); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
if err := l.Allow("k", now.Add(30*time.Minute)); err != nil {
|
||||||
|
t.Fatal(err)
|
||||||
|
}
|
||||||
|
// Inside window — limited.
|
||||||
|
if err := l.Allow("k", now.Add(45*time.Minute)); !errors.Is(err, ratelimit.ErrRateLimited) {
|
||||||
|
t.Fatalf("inside-window 3rd call should be limited: %v", err)
|
||||||
|
}
|
||||||
|
// Past window — slots reopen.
|
||||||
|
if err := l.Allow("k", now.Add(2*time.Hour)); err != nil {
|
||||||
|
t.Fatalf("past-window call should be allowed (window reset): %v", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func caseDisabledBypass(t *testing.T, mk limiterFactory, db *sql.DB) {
|
||||||
|
l := mk(t, db, 0, 24*time.Hour) // maxN=0 → disabled
|
||||||
|
type disablable interface {
|
||||||
|
Disabled() bool
|
||||||
|
}
|
||||||
|
if d, ok := l.(disablable); ok && !d.Disabled() {
|
||||||
|
t.Fatal("limiter with maxN=0 must report Disabled()=true")
|
||||||
|
}
|
||||||
|
now := time.Now()
|
||||||
|
for i := 0; i < 100; i++ {
|
||||||
|
if err := l.Allow("k", now); err != nil {
|
||||||
|
t.Fatalf("disabled limiter must allow everything: %v", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func caseNegativeCapDisabled(t *testing.T, mk limiterFactory, db *sql.DB) {
|
||||||
|
l := mk(t, db, -1, 24*time.Hour)
|
||||||
|
type disablable interface {
|
||||||
|
Disabled() bool
|
||||||
|
}
|
||||||
|
if d, ok := l.(disablable); ok && !d.Disabled() {
|
||||||
|
t.Fatal("negative maxN must produce a disabled limiter")
|
||||||
|
}
|
||||||
|
now := time.Now()
|
||||||
|
if err := l.Allow("k", now); err != nil {
|
||||||
|
t.Fatalf("disabled limiter must allow: %v", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func caseEmptyKeyShortCircuits(t *testing.T, mk limiterFactory, db *sql.DB) {
|
||||||
|
// Empty key is the caller's defense-in-depth case — caller's
|
||||||
|
// validation upstream should reject empty-key events first. Limiter
|
||||||
|
// must not build a single shared bucket keyed by empty-key — that
|
||||||
|
// would be a chokepoint for every empty-key event.
|
||||||
|
l := mk(t, db, 1, 24*time.Hour)
|
||||||
|
now := time.Now()
|
||||||
|
for i := 0; i < 50; i++ {
|
||||||
|
if err := l.Allow("", now); err != nil {
|
||||||
|
t.Fatalf("empty key must short-circuit (call %d): %v", i, err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func caseConcurrentRaceFree(t *testing.T, mk limiterFactory, db *sql.DB) {
|
||||||
|
l := mk(t, db, 50, 24*time.Hour)
|
||||||
|
var wg sync.WaitGroup
|
||||||
|
for g := 0; g < 20; g++ {
|
||||||
|
wg.Add(1)
|
||||||
|
go func(id int) {
|
||||||
|
defer wg.Done()
|
||||||
|
now := time.Now()
|
||||||
|
key := fmt.Sprintf("k-%d", id)
|
||||||
|
for i := 0; i < 30; i++ {
|
||||||
|
_ = l.Allow(key, now)
|
||||||
|
}
|
||||||
|
}(g)
|
||||||
|
}
|
||||||
|
wg.Wait()
|
||||||
|
}
|
||||||
|
|
||||||
|
// ----------------------------------------------------------------
|
||||||
|
// Postgres-only testcontainers harness — mirrors
|
||||||
|
// internal/repository/postgres/testutil_test.go's setupTestDB +
|
||||||
|
// freshSchema pattern.
|
||||||
|
// ----------------------------------------------------------------
|
||||||
|
|
||||||
|
type testDB struct {
|
||||||
|
db *sql.DB
|
||||||
|
container testcontainers.Container
|
||||||
|
}
|
||||||
|
|
||||||
|
func setupTestDB(t *testing.T) *testDB {
|
||||||
|
t.Helper()
|
||||||
|
ctx := context.Background()
|
||||||
|
|
||||||
|
req := testcontainers.ContainerRequest{
|
||||||
|
Image: "postgres:16-alpine",
|
||||||
|
ExposedPorts: []string{"5432/tcp"},
|
||||||
|
Env: map[string]string{
|
||||||
|
"POSTGRES_DB": "certctl_test",
|
||||||
|
"POSTGRES_USER": "certctl",
|
||||||
|
"POSTGRES_PASSWORD": "certctl",
|
||||||
|
},
|
||||||
|
WaitingFor: wait.ForLog("database system is ready to accept connections").WithOccurrence(2),
|
||||||
|
}
|
||||||
|
container, err := testcontainers.GenericContainer(ctx, testcontainers.GenericContainerRequest{
|
||||||
|
ContainerRequest: req,
|
||||||
|
Started: true,
|
||||||
|
})
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("start postgres container: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
host, err := container.Host(ctx)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("container host: %v", err)
|
||||||
|
}
|
||||||
|
port, err := container.MappedPort(ctx, "5432")
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("container port: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
connStr := fmt.Sprintf("postgres://certctl:certctl@%s:%s/certctl_test?sslmode=disable", host, port.Port())
|
||||||
|
db, err := sql.Open("postgres", connStr)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("open db: %v", err)
|
||||||
|
}
|
||||||
|
// Pool size > 1 so the multi-goroutine concurrency case can hold
|
||||||
|
// multiple connections simultaneously; the row-lock arbitrates.
|
||||||
|
db.SetMaxOpenConns(8)
|
||||||
|
|
||||||
|
if err := db.Ping(); err != nil {
|
||||||
|
t.Fatalf("ping: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
return &testDB{db: db, container: container}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (tdb *testDB) teardown(t *testing.T) {
|
||||||
|
t.Helper()
|
||||||
|
if tdb.db != nil {
|
||||||
|
tdb.db.Close()
|
||||||
|
}
|
||||||
|
if tdb.container != nil {
|
||||||
|
_ = tdb.container.Terminate(context.Background())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// freshSchema creates an isolated schema per test case + runs the
|
||||||
|
// rate_limit_buckets migration inside it. Returns a *sql.DB whose
|
||||||
|
// search_path is scoped to the new schema.
|
||||||
|
//
|
||||||
|
// Note: this helper takes a sub-test label (caller-supplied) so the
|
||||||
|
// schema name is deterministic-per-case + stable across runs. The
|
||||||
|
// canonical postgres testutil uses t.Name() but we're inside Run-
|
||||||
|
// nested subtests where t.Name() includes "/" — flatten it.
|
||||||
|
func (tdb *testDB) freshSchema(t *testing.T, label string) *sql.DB {
|
||||||
|
t.Helper()
|
||||||
|
schema := sanitizeSchemaName(label + "_" + t.Name())
|
||||||
|
ctx := context.Background()
|
||||||
|
|
||||||
|
// One connection-scoped session so SET search_path persists.
|
||||||
|
conn, err := tdb.db.Conn(ctx)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("acquire conn: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if _, err := conn.ExecContext(ctx, fmt.Sprintf("CREATE SCHEMA IF NOT EXISTS %s", schema)); err != nil {
|
||||||
|
t.Fatalf("create schema: %v", err)
|
||||||
|
}
|
||||||
|
if _, err := conn.ExecContext(ctx, fmt.Sprintf("SET search_path TO %s, public", schema)); err != nil {
|
||||||
|
t.Fatalf("set search_path: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Run the rate_limit_buckets migration in this schema. The migration
|
||||||
|
// is the only one that introduces our table; other migrations don't
|
||||||
|
// matter for limiter behavior.
|
||||||
|
migPath := findMigration("000046_rate_limit_buckets.up.sql")
|
||||||
|
body, err := os.ReadFile(migPath)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("read migration: %v", err)
|
||||||
|
}
|
||||||
|
if _, err := conn.ExecContext(ctx, string(body)); err != nil {
|
||||||
|
t.Fatalf("apply migration: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
t.Cleanup(func() {
|
||||||
|
conn.ExecContext(context.Background(), fmt.Sprintf("DROP SCHEMA IF EXISTS %s CASCADE", schema))
|
||||||
|
conn.Close()
|
||||||
|
})
|
||||||
|
|
||||||
|
// Wrap the single connection in a *sql.DB-like by returning a fresh
|
||||||
|
// pool that goes through the same search_path. Simpler: just return
|
||||||
|
// the underlying *sql.DB and SET search_path session-wide by re-
|
||||||
|
// running the SET on every checkout. The cleanest move is to use
|
||||||
|
// the per-connection helper: return a *sql.DB that's actually a
|
||||||
|
// "limited to N=1 connection with search_path pinned" handle.
|
||||||
|
//
|
||||||
|
// Workaround the easy way: build a fresh *sql.DB whose dsn embeds
|
||||||
|
// search_path as a connection-time setting, so every connection
|
||||||
|
// auto-applies it.
|
||||||
|
dsn := connDSNWithSearchPath(tdb, schema)
|
||||||
|
scoped, err := sql.Open("postgres", dsn)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("open scoped db: %v", err)
|
||||||
|
}
|
||||||
|
scoped.SetMaxOpenConns(8)
|
||||||
|
t.Cleanup(func() { scoped.Close() })
|
||||||
|
|
||||||
|
// Sanity: row exists / table exists.
|
||||||
|
if _, err := scoped.ExecContext(ctx, "SELECT 1 FROM rate_limit_buckets LIMIT 1"); err != nil && !strings.Contains(err.Error(), "no rows") {
|
||||||
|
// Empty table is fine; only a missing-table error matters.
|
||||||
|
// "no rows" never fires here (we used Exec not Query).
|
||||||
|
t.Fatalf("smoke select: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
return scoped
|
||||||
|
}
|
||||||
|
|
||||||
|
func connDSNWithSearchPath(tdb *testDB, schema string) string {
|
||||||
|
// Derive the DSN by introspection of the container's host/port.
|
||||||
|
// Couldn't pre-store because freshSchema can be called many times.
|
||||||
|
ctx := context.Background()
|
||||||
|
host, _ := tdb.container.Host(ctx)
|
||||||
|
port, _ := tdb.container.MappedPort(ctx, "5432")
|
||||||
|
return fmt.Sprintf(
|
||||||
|
"postgres://certctl:certctl@%s:%s/certctl_test?sslmode=disable&search_path=%s,public",
|
||||||
|
host, port.Port(), schema,
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
// sanitizeSchemaName turns an arbitrary label (typically built from
// t.Name()) into a schema name that is safe to interpolate unquoted into DDL.
//
// ASCII letters are lower-cased; digits and "_" are kept; every other rune
// (path separators, spaces, punctuation, quotes, non-ASCII, invalid UTF-8)
// becomes "_". The sanitized body is capped at 50 bytes and prefixed with
// "test_rl_", so the result always starts with a letter and is at most 58
// bytes long. Because the body is pure ASCII after mapping, the byte-level
// truncation can never split a multi-byte character.
func sanitizeSchemaName(name string) string {
	const maxBodyLen = 50

	name = strings.Map(func(r rune) rune {
		switch {
		case r >= 'a' && r <= 'z', r >= '0' && r <= '9', r == '_':
			return r
		case r >= 'A' && r <= 'Z':
			return r + ('a' - 'A')
		default:
			return '_'
		}
	}, name)

	if len(name) > maxBodyLen {
		name = name[:maxBodyLen]
	}
	return "test_rl_" + name
}
|
||||||
|
|
||||||
|
// findMigration locates filename inside a "migrations" directory by starting
// at the directory of this source file and walking upward. Six directories
// are probed in total (the starting one and five ancestors); the first
// candidate that os.Stat accepts is returned. An empty string means no
// candidate was found.
func findMigration(filename string) string {
	const maxLevels = 6

	_, self, _, _ := runtime.Caller(0)
	for depth, base := 0, filepath.Dir(self); depth < maxLevels; depth, base = depth+1, filepath.Dir(base) {
		path := filepath.Join(base, "migrations", filename)
		if _, statErr := os.Stat(path); statErr != nil {
			continue
		}
		return path
	}
	return ""
}
|
||||||
@@ -0,0 +1,65 @@
|
|||||||
|
// Copyright 2026 certctl LLC. All rights reserved.
|
||||||
|
// SPDX-License-Identifier: BUSL-1.1
|
||||||
|
|
||||||
|
package ratelimit
|
||||||
|
|
||||||
|
import (
|
||||||
|
"database/sql"
|
||||||
|
"fmt"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Phase 13 Sprint 13.3 (2026-05-14, architecture diligence audit
|
||||||
|
// ARCH-M1): the backend-selector factory. Wires every
|
||||||
|
// `ratelimit.NewSlidingWindowLimiter(...)` call site in
|
||||||
|
// cmd/server/main.go through here so the operator-chosen backend
|
||||||
|
// (CERTCTL_RATE_LIMIT_BACKEND={memory,postgres}) gates the limiter
|
||||||
|
// type without each call site replicating the switch.
|
||||||
|
//
|
||||||
|
// Caller-visible behavior contract: NewLimiter(backend="memory", ...)
|
||||||
|
// returns a *SlidingWindowLimiter identical to a direct
|
||||||
|
// NewSlidingWindowLimiter call. NewLimiter(backend="postgres", ...)
|
||||||
|
// returns a *PostgresSlidingWindowLimiter with the same Allow(key, now)
|
||||||
|
// signature + the same ErrRateLimited sentinel + the same maxN<=0
|
||||||
|
// disabled semantics. Sprint 13.3's "no signature change" rule is
|
||||||
|
// what makes the swap drop-in.
|
||||||
|
//
|
||||||
|
// The mapCap argument is the in-memory backend's per-instance
|
||||||
|
// key-cap (LRU-evicted under pressure). Postgres backend has no
|
||||||
|
// equivalent — the table grows until the scheduler janitor sweeps
|
||||||
|
// stale rows; mapCap is accepted + ignored for that backend so the
|
||||||
|
// factory signature stays drop-in identical to NewSlidingWindowLimiter.
|
||||||
|
|
||||||
|
// NewLimiter returns a Limiter backed by either the in-memory
|
||||||
|
// SlidingWindowLimiter (backend="memory") or the
|
||||||
|
// PostgresSlidingWindowLimiter (backend="postgres").
|
||||||
|
//
|
||||||
|
// `backend` is validated by config.Validate() at startup; any other
|
||||||
|
// value here panics — config validation is the SoT, this is just
|
||||||
|
// defensive in case the call site somehow bypasses startup
|
||||||
|
// validation.
|
||||||
|
//
|
||||||
|
// `db` is required when backend="postgres" and ignored when
|
||||||
|
// backend="memory". The factory does not nil-check db for the
|
||||||
|
// memory branch because requiring a meaningful db handle for the
|
||||||
|
// memory path would couple every limiter call site to the database
|
||||||
|
// pool unnecessarily.
|
||||||
|
//
|
||||||
|
// `maxN <= 0` disables the limiter (both backends honor the
|
||||||
|
// opt-out — all Allow calls return nil).
|
||||||
|
func NewLimiter(backend string, db *sql.DB, maxN int, window time.Duration, mapCap int) Limiter {
|
||||||
|
switch backend {
|
||||||
|
case "memory":
|
||||||
|
return NewSlidingWindowLimiter(maxN, window, mapCap)
|
||||||
|
case "postgres":
|
||||||
|
if db == nil {
|
||||||
|
panic("ratelimit.NewLimiter: backend=postgres requires a non-nil *sql.DB (config.Validate should have caught this earlier)")
|
||||||
|
}
|
||||||
|
return NewPostgresSlidingWindowLimiter(db, maxN, window)
|
||||||
|
default:
|
||||||
|
// Defensive — config.Validate() rejects anything else at
|
||||||
|
// startup. Reaching this branch implies a coding error in a
|
||||||
|
// future call site that bypasses validation.
|
||||||
|
panic(fmt.Sprintf("ratelimit.NewLimiter: unknown backend %q (must be memory or postgres)", backend))
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,54 @@
|
|||||||
|
// Copyright 2026 certctl LLC. All rights reserved.
|
||||||
|
// SPDX-License-Identifier: BUSL-1.1
|
||||||
|
|
||||||
|
package ratelimit
|
||||||
|
|
||||||
|
import "time"
|
||||||
|
|
||||||
|
// Limiter is the rate-limit primitive every caller in cmd/server +
|
||||||
|
// internal/api/handler + internal/service consumes. Two backends
|
||||||
|
// satisfy this interface:
|
||||||
|
//
|
||||||
|
// - SlidingWindowLimiter (in-memory; the historical default;
|
||||||
|
// declared in sliding_window.go).
|
||||||
|
// - PostgresSlidingWindowLimiter (cross-replica-consistent;
|
||||||
|
// declared in postgres_sliding_window.go; introduced in Phase 13
|
||||||
|
// Sprint 13.2 for the ARCH-M1 substantive close).
|
||||||
|
//
|
||||||
|
// Sprint 13.3 (next) wires every call site through the operator-
|
||||||
|
// chosen backend via the CERTCTL_RATELIMIT_BACKEND={memory,postgres}
|
||||||
|
// env var. Until then, both backends compile + tests for both pass,
|
||||||
|
// but the production call sites still construct SlidingWindowLimiter
|
||||||
|
// directly.
|
||||||
|
//
|
||||||
|
// Sprint 13.2 signature note: the prompt template specified
|
||||||
|
// `Allow(key string) error`, but the actual repo signature has been
|
||||||
|
// `Allow(key string, now time.Time) error` since the EST RFC 7030
|
||||||
|
// hardening master bundle Phase 4.1 — the `now` parameter is what
|
||||||
|
// makes the memory limiter testable against synthetic time. The
|
||||||
|
// interface matches the actual signature so the existing
|
||||||
|
// SlidingWindowLimiter satisfies Limiter without a method-set change.
|
||||||
|
//
|
||||||
|
// Per CLAUDE.md "the repo is truth" principle, code grounded against
|
||||||
|
// the live signature (not the prompt's draft).
|
||||||
|
type Limiter interface {
	// Allow registers one request for key at the instant now.
	//
	// It returns ErrRateLimited when the configured cap has already been
	// reached inside the configured window, and nil otherwise.
	//
	// An empty key is accepted without being counted (returns nil): a shared
	// bucket for the empty key would throttle every empty-key event
	// together, so rejecting such events is left to the caller's own
	// validation.
	//
	// A disabled limiter (maxN <= 0) returns nil for every call.
	Allow(key string, now time.Time) error
}
|
||||||
|
|
||||||
|
// Compile-time interface satisfaction checks. Drift in either
|
||||||
|
// backend's Allow signature fails the build at this file before any
|
||||||
|
// caller breaks.
|
||||||
|
var (
|
||||||
|
_ Limiter = (*SlidingWindowLimiter)(nil)
|
||||||
|
_ Limiter = (*PostgresSlidingWindowLimiter)(nil)
|
||||||
|
)
|
||||||
@@ -0,0 +1,71 @@
|
|||||||
|
// Copyright 2026 certctl LLC. All rights reserved.
|
||||||
|
// SPDX-License-Identifier: BUSL-1.1
|
||||||
|
|
||||||
|
package ratelimit
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"database/sql"
|
||||||
|
"fmt"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Phase 13 Sprint 13.3 closure (2026-05-14, architecture diligence audit
|
||||||
|
// ARCH-M1): the scheduler-invoked janitor for the postgres-backed
|
||||||
|
// rate-limit bucket table. Sweeps rows whose updated_at is older than
|
||||||
|
// the longest configured window any caller uses — these rows can
|
||||||
|
// never be at-cap (every timestamp inside has aged past the window),
|
||||||
|
// so dropping them entirely is safe.
|
||||||
|
//
|
||||||
|
// The in-memory backend's prune-on-Allow path keeps buckets short-
|
||||||
|
// lived without a separate sweep; this file is postgres-only.
|
||||||
|
|
||||||
|
// PostgresGC is the janitor for the rate_limit_buckets table. It holds the
// database handle to sweep and the age threshold beyond which a row is
// considered stale.
type PostgresGC struct {
	// db is the pool the DELETE is issued on.
	db *sql.DB
	// maxWindow is the staleness threshold; rows not updated within this
	// duration are removed. Values <= 0 turn the sweep off.
	maxWindow time.Duration
}

// NewPostgresGC returns a janitor that removes rows whose updated_at lies
// more than maxWindow in the past. Callers should pass the longest window
// used by any limiter sharing the table, so that no row still inside a live
// window is removed.
//
// A maxWindow <= 0 disables the sweep: GarbageCollect becomes a no-op.
func NewPostgresGC(db *sql.DB, maxWindow time.Duration) *PostgresGC {
	gc := new(PostgresGC)
	gc.db = db
	gc.maxWindow = maxWindow
	return gc
}

// GarbageCollect deletes every rate_limit_buckets row whose updated_at is
// earlier than time.Now() minus maxWindow, using a single DELETE statement.
//
// It returns the number of rows removed. An error from the DELETE is wrapped
// and returned. If the driver cannot report the affected-row count, the call
// still succeeds and reports 0, because the delete has already run.
// When the sweep is disabled (maxWindow <= 0) it returns 0, nil without
// touching the database.
func (g *PostgresGC) GarbageCollect(ctx context.Context) (int64, error) {
	if g.maxWindow <= 0 {
		return 0, nil
	}

	const sweepSQL = `
		DELETE FROM rate_limit_buckets
		WHERE updated_at < $1
	`
	threshold := time.Now().Add(-g.maxWindow)

	result, execErr := g.db.ExecContext(ctx, sweepSQL, threshold)
	if execErr != nil {
		return 0, fmt.Errorf("ratelimit-gc: delete stale buckets: %w", execErr)
	}

	deleted, countErr := result.RowsAffected()
	if countErr != nil {
		// Deliberately not an error: the rows are gone, only the count is
		// unavailable.
		return 0, nil
	}
	return deleted, nil
}
|
||||||
@@ -0,0 +1,228 @@
|
|||||||
|
// Copyright 2026 certctl LLC. All rights reserved.
|
||||||
|
// SPDX-License-Identifier: BUSL-1.1
|
||||||
|
|
||||||
|
package ratelimit
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"database/sql"
|
||||||
|
"errors"
|
||||||
|
"fmt"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/lib/pq"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Phase 13 Sprint 13.2 closure (2026-05-14, architecture diligence audit
|
||||||
|
// ARCH-M1): the cross-replica-consistent rate-limit backend. Same
|
||||||
|
// algorithm as SlidingWindowLimiter (prune-on-Allow sliding-window log)
|
||||||
|
// but the state lives in postgres so N replicas see the same per-key
|
||||||
|
// bucket. Replaces the per-process in-memory limit when the operator
|
||||||
|
// sets CERTCTL_RATELIMIT_BACKEND=postgres (wired in Sprint 13.3).
|
||||||
|
//
|
||||||
|
// Algorithm
|
||||||
|
// =========
|
||||||
|
// Each Allow call runs a single BEGIN/COMMIT transaction:
|
||||||
|
//
|
||||||
|
// 1. INSERT ... ON CONFLICT (bucket_key) DO NOTHING — ensure the
|
||||||
|
// row exists so the SELECT FOR UPDATE below has something to lock.
|
||||||
|
// 2. SELECT timestamps FROM rate_limit_buckets WHERE bucket_key=$1
|
||||||
|
// FOR UPDATE — acquire the per-key row lock for the rest of the
|
||||||
|
// transaction.
|
||||||
|
// 3. Prune timestamps older than (now - window) in Go (reusing the
|
||||||
|
// unexported pruneOlderThan helper shared with SlidingWindowLimiter
|
||||||
|
// — single source of truth for the prune semantics).
|
||||||
|
// 4. If cardinality(pruned) >= maxN: persist the pruned state without
|
||||||
|
// appending, COMMIT, return ErrRateLimited.
|
||||||
|
// 5. Else: append `now`, persist, COMMIT, return nil.
|
||||||
|
//
|
||||||
|
// SELECT FOR UPDATE serializes Allow calls for the same key across
|
||||||
|
// replicas: replicas A and B firing simultaneous Allow("k") never
|
||||||
|
// race because Postgres' row-lock arbitrates. This is the entire
|
||||||
|
// reason for the close — the memory backend's sync.Mutex only
|
||||||
|
// arbitrates within a process; pg's row lock arbitrates the cluster.
|
||||||
|
//
|
||||||
|
// Why a transaction (not a single CTE)
|
||||||
|
// ====================================
|
||||||
|
// A "compute everything in one SQL statement" approach using
|
||||||
|
// INSERT ... ON CONFLICT DO UPDATE SET timestamps = CASE WHEN ... is
|
||||||
|
// possible but the conditional logic to gate the append on the
|
||||||
|
// pruned-cardinality requires nested CTEs whose check-then-act
|
||||||
|
// semantics are hard to read + harder to convince yourself are
|
||||||
|
// race-free across all isolation levels. The explicit transaction
|
||||||
|
// version above is correct under READ COMMITTED (Postgres' default),
|
||||||
|
// matches the memory backend's read-decide-write shape line-for-line,
|
||||||
|
// and shares the same prune helper. Two extra round-trips per Allow
|
||||||
|
// vs one is acceptable for the rate-limit hot path — the operation
|
||||||
|
// is gated anyway.
|
||||||
|
//
|
||||||
|
// Sprint 13.3 will wire the scheduler janitor loop that GCs rows
|
||||||
|
// whose updated_at is older than the longest configured window; the
|
||||||
|
// migration ships the supporting btree index on updated_at.
|
||||||
|
|
||||||
|
// PostgresSlidingWindowLimiter is the Limiter backend that keeps its
// per-key state in the rate_limit_buckets table rather than in process
// memory.
//
// Build instances with NewPostgresSlidingWindowLimiter; the zero value has
// no database handle and must not be used.
//
// Allow may be called concurrently: each call locks the key's row with
// SELECT ... FOR UPDATE inside its own transaction, so calls for the same
// key are serialized by the database.
type PostgresSlidingWindowLimiter struct {
	// db is the pool every Allow transaction runs on.
	db *sql.DB
	// maxN is the per-key cap inside one window.
	maxN int
	// window is the sliding-window length.
	window time.Duration
	// disabled is true when maxN <= 0; Allow then always returns nil.
	disabled bool
}
|
||||||
|
|
||||||
|
// NewPostgresSlidingWindowLimiter returns a limiter with the given
|
||||||
|
// per-key cap + window. maxN <= 0 disables the limiter (all Allow
|
||||||
|
// calls return nil); matches the memory backend's opt-out semantics
|
||||||
|
// for test harnesses + sketchpad deploys.
|
||||||
|
//
|
||||||
|
// Window defaults to 24h when zero, mirroring SlidingWindowLimiter.
|
||||||
|
//
|
||||||
|
// The db argument is required + must outlive the limiter. Construction
|
||||||
|
// itself does NOT touch the database — DDL is owned by migration
|
||||||
|
// 000046_rate_limit_buckets.up.sql which runs at boot via
|
||||||
|
// cmd/server's RunMigrations path.
|
||||||
|
func NewPostgresSlidingWindowLimiter(db *sql.DB, maxN int, window time.Duration) *PostgresSlidingWindowLimiter {
|
||||||
|
if window <= 0 {
|
||||||
|
window = 24 * time.Hour
|
||||||
|
}
|
||||||
|
disabled := maxN <= 0
|
||||||
|
return &PostgresSlidingWindowLimiter{
|
||||||
|
db: db,
|
||||||
|
maxN: maxN,
|
||||||
|
window: window,
|
||||||
|
disabled: disabled,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Allow records a request at the given (key, now) and returns
|
||||||
|
// ErrRateLimited if the configured cap is exceeded inside the
|
||||||
|
// configured window. Matches SlidingWindowLimiter.Allow byte-for-byte
|
||||||
|
// in caller-visible semantics so Sprint 13.3's backend-selector swap
|
||||||
|
// is signature-clean.
|
||||||
|
//
|
||||||
|
// The `now` argument is the timestamp the call is "happening at".
|
||||||
|
// Used as the prune cutoff (entries older than now-window are dropped)
|
||||||
|
// and as the new appended entry. Tests pass synthetic `now` values
|
||||||
|
// to exercise window-expiry deterministically; production call sites
|
||||||
|
// pass time.Now() (matching how SlidingWindowLimiter is invoked
|
||||||
|
// today — see internal/api/handler/{est,export,certificates,
|
||||||
|
// auth_breakglass}.go).
|
||||||
|
//
|
||||||
|
// Empty `key` short-circuits to nil (matches the memory backend's
|
||||||
|
// chokepoint-avoidance contract).
|
||||||
|
func (l *PostgresSlidingWindowLimiter) Allow(key string, now time.Time) error {
|
||||||
|
if l.disabled {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
if key == "" {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
ctx := context.Background()
|
||||||
|
tx, err := l.db.BeginTx(ctx, &sql.TxOptions{Isolation: sql.LevelReadCommitted})
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("ratelimit: begin tx: %w", err)
|
||||||
|
}
|
||||||
|
defer func() {
|
||||||
|
// Rollback is a no-op once the tx is committed; safe to defer
|
||||||
|
// unconditionally for the error paths.
|
||||||
|
_ = tx.Rollback()
|
||||||
|
}()
|
||||||
|
|
||||||
|
// Step 1: ensure the row exists so SELECT FOR UPDATE has something
|
||||||
|
// to lock. ON CONFLICT DO NOTHING is a no-op when the row already
|
||||||
|
// exists.
|
||||||
|
if _, err := tx.ExecContext(ctx, `
|
||||||
|
INSERT INTO rate_limit_buckets (bucket_key, timestamps, updated_at)
|
||||||
|
VALUES ($1, '{}', $2)
|
||||||
|
ON CONFLICT (bucket_key) DO NOTHING
|
||||||
|
`, key, now); err != nil {
|
||||||
|
return fmt.Errorf("ratelimit: ensure row: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Step 2: lock the row + read current state. lib/pq cannot scan a
|
||||||
|
// TIMESTAMPTZ[] column back into []time.Time directly: time.Time
|
||||||
|
// does not implement sql.Scanner, and pq.GenericArray's per-element
|
||||||
|
// scan path calls Scan() (not database/sql's convertAssign), so the
|
||||||
|
// inner Scan fails with
|
||||||
|
// "pq: scanning to time.Time is not implemented; only sql.Scanner".
|
||||||
|
// Workaround: ask Postgres to format each timestamp as a canonical
|
||||||
|
// ISO 8601 UTC string via to_char(... AT TIME ZONE 'UTC', ...), read
|
||||||
|
// the column as text[] via pq.StringArray (well-supported), and
|
||||||
|
// parse Go-side. The to_char format is fully deterministic (6-digit
|
||||||
|
// microseconds, "T" separator, "Z" suffix) regardless of the
|
||||||
|
// session's DateStyle / TimeZone settings.
|
||||||
|
const pgTimestampLayout = "2006-01-02T15:04:05.000000Z"
|
||||||
|
var tsStrings pq.StringArray
|
||||||
|
if err := tx.QueryRowContext(ctx, `
|
||||||
|
SELECT COALESCE(
|
||||||
|
ARRAY(
|
||||||
|
SELECT to_char(t AT TIME ZONE 'UTC', 'YYYY-MM-DD"T"HH24:MI:SS.US"Z"')
|
||||||
|
FROM unnest(timestamps) AS t
|
||||||
|
),
|
||||||
|
ARRAY[]::text[]
|
||||||
|
)
|
||||||
|
FROM rate_limit_buckets
|
||||||
|
WHERE bucket_key = $1
|
||||||
|
FOR UPDATE
|
||||||
|
`, key).Scan(&tsStrings); err != nil {
|
||||||
|
// Shouldn't happen — step 1 ensured the row exists. Treat
|
||||||
|
// the sql.ErrNoRows path as a no-op (be conservative; never
|
||||||
|
// over-limit on transient DB weirdness).
|
||||||
|
if errors.Is(err, sql.ErrNoRows) {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
return fmt.Errorf("ratelimit: select-for-update: %w", err)
|
||||||
|
}
|
||||||
|
ts := make([]time.Time, 0, len(tsStrings))
|
||||||
|
for _, s := range tsStrings {
|
||||||
|
parsed, err := time.Parse(pgTimestampLayout, s)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("ratelimit: parse stored timestamp %q: %w", s, err)
|
||||||
|
}
|
||||||
|
ts = append(ts, parsed.UTC())
|
||||||
|
}
|
||||||
|
|
||||||
|
// Step 3: prune in Go via the shared helper. Same prune semantics
|
||||||
|
// as SlidingWindowLimiter — single source of truth.
|
||||||
|
cutoff := now.Add(-l.window)
|
||||||
|
pruned := pruneOlderThan(ts, cutoff)
|
||||||
|
|
||||||
|
// Step 4: decide.
|
||||||
|
rateLimited := len(pruned) >= l.maxN
|
||||||
|
if !rateLimited {
|
||||||
|
pruned = append(pruned, now)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Step 5: persist.
|
||||||
|
if _, err := tx.ExecContext(ctx, `
|
||||||
|
UPDATE rate_limit_buckets
|
||||||
|
SET timestamps = $2, updated_at = $3
|
||||||
|
WHERE bucket_key = $1
|
||||||
|
`, key, pq.Array(pruned), now); err != nil {
|
||||||
|
return fmt.Errorf("ratelimit: update: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := tx.Commit(); err != nil {
|
||||||
|
return fmt.Errorf("ratelimit: commit: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if rateLimited {
|
||||||
|
return ErrRateLimited
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Disabled reports whether the limiter is in opt-out mode (maxN <= 0).
|
||||||
|
// Mirrors SlidingWindowLimiter.Disabled() so handler-side gating +
|
||||||
|
// admin-endpoint observability can ask the same question of either
|
||||||
|
// backend.
|
||||||
|
func (l *PostgresSlidingWindowLimiter) Disabled() bool {
|
||||||
|
return l.disabled
|
||||||
|
}
|
||||||
@@ -210,6 +210,11 @@ type OCSPResponderRepository interface {
|
|||||||
type IssuerRepository interface {
|
type IssuerRepository interface {
|
||||||
// List returns all issuers, optionally filtered.
|
// List returns all issuers, optionally filtered.
|
||||||
List(ctx context.Context) ([]*domain.Issuer, error)
|
List(ctx context.Context) ([]*domain.Issuer, error)
|
||||||
|
// ListPaginated returns a window of issuers (sorted by created_at DESC)
|
||||||
|
// plus the total row count. SCALE-002 closure (Sprint 2, 2026-05-16) —
|
||||||
|
// pushes pagination into the SQL layer so admin pages don't marshal
|
||||||
|
// the full table per request.
|
||||||
|
ListPaginated(ctx context.Context, limit, offset int) ([]*domain.Issuer, int64, error)
|
||||||
// Get retrieves an issuer by ID.
|
// Get retrieves an issuer by ID.
|
||||||
Get(ctx context.Context, id string) (*domain.Issuer, error)
|
Get(ctx context.Context, id string) (*domain.Issuer, error)
|
||||||
// Create stores a new issuer.
|
// Create stores a new issuer.
|
||||||
@@ -227,6 +232,10 @@ type IssuerRepository interface {
|
|||||||
type TargetRepository interface {
|
type TargetRepository interface {
|
||||||
// List returns all targets, optionally filtered.
|
// List returns all targets, optionally filtered.
|
||||||
List(ctx context.Context) ([]*domain.DeploymentTarget, error)
|
List(ctx context.Context) ([]*domain.DeploymentTarget, error)
|
||||||
|
// ListPaginated returns a window of deployment targets (sorted by
|
||||||
|
// created_at DESC) plus the total row count. SCALE-002 closure
|
||||||
|
// (Sprint 2, 2026-05-16).
|
||||||
|
ListPaginated(ctx context.Context, limit, offset int) ([]*domain.DeploymentTarget, int64, error)
|
||||||
// Get retrieves a target by ID.
|
// Get retrieves a target by ID.
|
||||||
Get(ctx context.Context, id string) (*domain.DeploymentTarget, error)
|
Get(ctx context.Context, id string) (*domain.DeploymentTarget, error)
|
||||||
// Create stores a new target.
|
// Create stores a new target.
|
||||||
@@ -490,6 +499,21 @@ type AuditRepository interface {
|
|||||||
CreateWithTx(ctx context.Context, q Querier, event *domain.AuditEvent) error
|
CreateWithTx(ctx context.Context, q Querier, event *domain.AuditEvent) error
|
||||||
// List returns audit events matching the filter criteria.
|
// List returns audit events matching the filter criteria.
|
||||||
List(ctx context.Context, filter *AuditFilter) ([]*domain.AuditEvent, error)
|
List(ctx context.Context, filter *AuditFilter) ([]*domain.AuditEvent, error)
|
||||||
|
// VerifyHashChain walks the per-row hash chain end-to-end (migration
|
||||||
|
// 000047 closure of Sprint 6 COMP-001-HASH) and returns the first
|
||||||
|
// break it finds. brokenAtID == "" + brokenAtPos == -1 means the
|
||||||
|
// chain validated; rowCount is the number of rows walked.
|
||||||
|
//
|
||||||
|
// Tamper-evidence layer that complements migration 000018's WORM
|
||||||
|
// trigger: WORM blocks the app role from UPDATE / DELETE, but a
|
||||||
|
// compliance superuser bypasses that trigger by design (retention
|
||||||
|
// purges, breach-recovery). Without the hash chain, such a role
|
||||||
|
// could rewrite history without detection. The scheduler's
|
||||||
|
// auditChainVerifyLoop calls this every
|
||||||
|
// CERTCTL_AUDIT_CHAIN_VERIFY_INTERVAL tick + increments the
|
||||||
|
// certctl_audit_chain_break_detected counter on a non-empty
|
||||||
|
// brokenAtID return.
|
||||||
|
VerifyHashChain(ctx context.Context) (brokenAtID string, brokenAtPos int, rowCount int, err error)
|
||||||
}
|
}
|
||||||
|
|
||||||
// NotificationRepository defines operations for managing notifications.
|
// NotificationRepository defines operations for managing notifications.
|
||||||
@@ -550,6 +574,9 @@ type NotificationRepository interface {
|
|||||||
type TeamRepository interface {
|
type TeamRepository interface {
|
||||||
// List returns all teams.
|
// List returns all teams.
|
||||||
List(ctx context.Context) ([]*domain.Team, error)
|
List(ctx context.Context) ([]*domain.Team, error)
|
||||||
|
// ListPaginated returns a window of teams (sorted by created_at DESC)
|
||||||
|
// plus the total row count. SCALE-002 closure (Sprint 2, 2026-05-16).
|
||||||
|
ListPaginated(ctx context.Context, limit, offset int) ([]*domain.Team, int64, error)
|
||||||
// Get retrieves a team by ID.
|
// Get retrieves a team by ID.
|
||||||
Get(ctx context.Context, id string) (*domain.Team, error)
|
Get(ctx context.Context, id string) (*domain.Team, error)
|
||||||
// Create stores a new team.
|
// Create stores a new team.
|
||||||
@@ -578,6 +605,9 @@ type CertificateProfileRepository interface {
|
|||||||
type AgentGroupRepository interface {
|
type AgentGroupRepository interface {
|
||||||
// List returns all agent groups.
|
// List returns all agent groups.
|
||||||
List(ctx context.Context) ([]*domain.AgentGroup, error)
|
List(ctx context.Context) ([]*domain.AgentGroup, error)
|
||||||
|
// ListPaginated returns a window of agent groups (sorted by name)
|
||||||
|
// plus the total row count. SCALE-002 closure (Sprint 2, 2026-05-16).
|
||||||
|
ListPaginated(ctx context.Context, limit, offset int) ([]*domain.AgentGroup, int64, error)
|
||||||
// Get retrieves an agent group by ID.
|
// Get retrieves an agent group by ID.
|
||||||
Get(ctx context.Context, id string) (*domain.AgentGroup, error)
|
Get(ctx context.Context, id string) (*domain.AgentGroup, error)
|
||||||
// Create stores a new agent group.
|
// Create stores a new agent group.
|
||||||
|
|||||||
@@ -44,6 +44,40 @@ func (r *AgentGroupRepository) List(ctx context.Context) ([]*domain.AgentGroup,
|
|||||||
return groups, rows.Err()
|
return groups, rows.Err()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ListPaginated returns a slice of agent groups bounded by limit/offset
|
||||||
|
// plus the total count. SCALE-002 closure (Sprint 2, 2026-05-16).
|
||||||
|
func (r *AgentGroupRepository) ListPaginated(ctx context.Context, limit, offset int) ([]*domain.AgentGroup, int64, error) {
|
||||||
|
if limit <= 0 {
|
||||||
|
limit = 50
|
||||||
|
}
|
||||||
|
if offset < 0 {
|
||||||
|
offset = 0
|
||||||
|
}
|
||||||
|
var total int64
|
||||||
|
if err := r.db.QueryRowContext(ctx, `SELECT COUNT(*) FROM agent_groups`).Scan(&total); err != nil {
|
||||||
|
return nil, 0, fmt.Errorf("failed to count agent groups: %w", err)
|
||||||
|
}
|
||||||
|
rows, err := r.db.QueryContext(ctx,
|
||||||
|
`SELECT id, name, description, match_os, match_architecture, match_ip_cidr, match_version, enabled, created_at, updated_at
|
||||||
|
FROM agent_groups ORDER BY name LIMIT $1 OFFSET $2`, limit, offset)
|
||||||
|
if err != nil {
|
||||||
|
return nil, 0, fmt.Errorf("failed to query agent groups: %w", err)
|
||||||
|
}
|
||||||
|
defer rows.Close()
|
||||||
|
var groups []*domain.AgentGroup
|
||||||
|
for rows.Next() {
|
||||||
|
g, err := scanAgentGroup(rows)
|
||||||
|
if err != nil {
|
||||||
|
return nil, 0, err
|
||||||
|
}
|
||||||
|
groups = append(groups, g)
|
||||||
|
}
|
||||||
|
if err := rows.Err(); err != nil {
|
||||||
|
return nil, 0, err
|
||||||
|
}
|
||||||
|
return groups, total, nil
|
||||||
|
}
|
||||||
|
|
||||||
// Get retrieves an agent group by ID.
|
// Get retrieves an agent group by ID.
|
||||||
func (r *AgentGroupRepository) Get(ctx context.Context, id string) (*domain.AgentGroup, error) {
|
func (r *AgentGroupRepository) Get(ctx context.Context, id string) (*domain.AgentGroup, error) {
|
||||||
row := r.db.QueryRowContext(ctx,
|
row := r.db.QueryRowContext(ctx,
|
||||||
|
|||||||
@@ -166,3 +166,40 @@ func (r *AuditRepository) List(ctx context.Context, filter *repository.AuditFilt
|
|||||||
|
|
||||||
return events, nil
|
return events, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// VerifyHashChain calls the migration 000047 audit_events_verify_chain()
|
||||||
|
// stored function and returns its three OUT parameters. This is the
|
||||||
|
// Sprint 6 COMP-001-HASH tamper-evidence verifier — the scheduler's
|
||||||
|
// auditChainVerifyLoop invokes it every CERTCTL_AUDIT_CHAIN_VERIFY_INTERVAL
|
||||||
|
// tick and emits the certctl_audit_chain_break_detected counter on any
|
||||||
|
// non-empty brokenAtID.
|
||||||
|
//
|
||||||
|
// The chain walk happens entirely server-side (plpgsql, STABLE). For an
|
||||||
|
// audit_events table with N rows the cost is O(N) per call; we expect
|
||||||
|
// modest fleets (single-digit-millions of events) so the per-tick cost
|
||||||
|
// is bounded. Operators with very large audit tables can lengthen the
|
||||||
|
// interval — the metric is sticky once incremented, so even an hourly
|
||||||
|
// walk is enough lead time to surface tampering for human investigation.
|
||||||
|
func (r *AuditRepository) VerifyHashChain(ctx context.Context) (brokenAtID string, brokenAtPos int, rowCount int, err error) {
|
||||||
|
var (
|
||||||
|
brokenID sql.NullString
|
||||||
|
pos sql.NullInt32
|
||||||
|
total sql.NullInt32
|
||||||
|
)
|
||||||
|
row := r.db.QueryRowContext(ctx, `SELECT first_break_id, first_break_pos, row_count FROM audit_events_verify_chain()`)
|
||||||
|
if err := row.Scan(&brokenID, &pos, &total); err != nil {
|
||||||
|
return "", -1, 0, fmt.Errorf("audit_events_verify_chain: %w", err)
|
||||||
|
}
|
||||||
|
if brokenID.Valid {
|
||||||
|
brokenAtID = brokenID.String
|
||||||
|
}
|
||||||
|
if pos.Valid {
|
||||||
|
brokenAtPos = int(pos.Int32)
|
||||||
|
} else {
|
||||||
|
brokenAtPos = -1
|
||||||
|
}
|
||||||
|
if total.Valid {
|
||||||
|
rowCount = int(total.Int32)
|
||||||
|
}
|
||||||
|
return brokenAtID, brokenAtPos, rowCount, nil
|
||||||
|
}
|
||||||
|
|||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user